#!/usr/bin/env python3
"""
MCP Web Scraper Tool - Gradio Implementation

This MCP server provides web scraping capabilities including:
- URL content extraction
- Text parsing and cleaning
- Basic structured data extraction
- Rate-limited scraping

Supports MCP protocol via Gradio interface.
"""

import logging
import re
from urllib.parse import urlparse

import gradio as gr
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Scraping dependencies are optional: without them the tool degrades to a
# mock result instead of failing at import time.
try:
    import requests
    from bs4 import BeautifulSoup

    SCRAPING_AVAILABLE = True
except ImportError:
    SCRAPING_AVAILABLE = False
    logger.warning("Web scraping dependencies not available")

# Tunables hoisted out of the method bodies so they are easy to adjust.
REQUEST_TIMEOUT_SECONDS = 10   # per-request fetch timeout
PREVIEW_MAX_CHARS = 1500       # truncation limit for the content preview
MIN_CONTENT_LINE_LEN = 10      # lines shorter than this are dropped as noise


class WebScraper:
    """Web scraping service with content extraction."""

    def __init__(self):
        """Initialize the web scraper.

        Creates a persistent ``requests.Session`` (connection reuse) with a
        descriptive User-Agent when the scraping dependencies are installed;
        otherwise ``self.session`` is ``None`` and callers fall back to a
        mock result.
        """
        self.session = requests.Session() if SCRAPING_AVAILABLE else None
        if self.session:
            self.session.headers.update({
                "User-Agent": "Mozilla/5.0 (compatible; MCP Web Scraper; +https://example.com)"
            })

    def scrape_url(self, url: str) -> str:
        """
        Scrape content from a URL and extract text.

        Args:
            url: URL to scrape

        Returns:
            Extracted and cleaned text content, formatted as a human-readable
            report. Errors are returned as strings rather than raised so the
            Gradio/MCP layer always gets displayable text.
        """
        try:
            if not SCRAPING_AVAILABLE:
                return self._mock_scrape_result(url)

            # Validate URL: require both a scheme and a host.
            parsed = urlparse(url)
            if not parsed.scheme or not parsed.netloc:
                return "Error: Invalid URL format"

            # Fetch content
            response = self.session.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()

            # Parse HTML
            soup = BeautifulSoup(response.content, "html.parser")

            # Remove script and style elements so their text doesn't leak
            # into the extracted content.
            for script in soup(["script", "style"]):
                script.decompose()

            # Extract title
            title = soup.find("title")
            title_text = title.get_text().strip() if title else "No Title"

            # Extract main content
            content = self._extract_main_content(soup)

            # Format result
            result = "🌐 WEB SCRAPING RESULTS\n"
            result += "=" * 40 + "\n"
            result += f"URL: {url}\n"
            result += f"Title: {title_text}\n"
            result += f"Content Length: {len(content)} characters\n\n"
            result += "📄 EXTRACTED CONTENT:\n"
            result += "-" * 20 + "\n"
            result += content[:PREVIEW_MAX_CHARS] + ("..." if len(content) > PREVIEW_MAX_CHARS else "")

            logger.info("Successfully scraped %s: %d characters", url, len(content))
            return result

        except requests.exceptions.RequestException as e:
            # Network-level failures (DNS, timeout, HTTP status, bad scheme).
            error_msg = f"Error fetching URL: {e!s}"
            logger.error(error_msg)
            return error_msg
        except Exception as e:
            # Parsing or any other unexpected failure; still return text.
            error_msg = f"Error scraping content: {e!s}"
            logger.error(error_msg)
            return error_msg

    def _extract_main_content(self, soup) -> str:
        """Extract main text content from parsed HTML.

        Prefers semantic containers (``<main>``, ``<article>``, or a div
        whose class mentions content/main/body); falls back to the full
        page text. Very short lines are dropped as likely navigation noise.
        """
        main_content = (
            soup.find("main")
            or soup.find("article")
            or soup.find("div", class_=re.compile(r"content|main|body"))
        )

        if main_content:
            text = main_content.get_text()
        else:
            # Fall back to body content
            text = soup.get_text()

        # Clean up text: strip each line and keep only substantive ones.
        cleaned_lines = []
        for line in text.split("\n"):
            line = line.strip()
            if line and len(line) > MIN_CONTENT_LINE_LEN:  # Skip very short lines
                cleaned_lines.append(line)

        return "\n".join(cleaned_lines)

    def _mock_scrape_result(self, url: str) -> str:
        """Generate mock scraping result when dependencies unavailable."""
        return f"""🌐 WEB SCRAPING RESULTS (MOCK)
========================================
URL: {url}
Title: Mock Web Page Title
Content Length: 500 characters

📄 EXTRACTED CONTENT:
--------------------
This is a mock result for web scraping. In a real implementation, this would contain the actual text content extracted from the webpage at {url}.

The scraper would parse HTML, extract meaningful text, and present it in a clean, readable format. Features would include:
- HTML parsing and cleaning
- Text extraction from main content areas
- Title and metadata extraction
- Rate limiting and polite scraping
- Error handling for various web content types

Note: Install requests and beautifulsoup4 for real web scraping functionality."""


# Initialize the web scraper (shared module-level instance).
web_scraper = WebScraper()

# FastAPI app for MCP endpoint
app = FastAPI()


@app.post("/gradio_api/mcp/sse")
async def mcp_web_scraper_endpoint(request: dict):
    """MCP endpoint for web scraping.

    Expects ``{"data": ["url_to_scrape"]}`` and returns
    ``{"data": ["<result text>"]}``. Validation problems produce 4xx
    responses; unexpected failures produce a 500 with an error payload.
    """
    try:
        # Extract data from request
        if "data" not in request or not isinstance(request["data"], list):
            raise HTTPException(status_code=400, detail="Invalid request format. Expected: {'data': ['url_to_scrape']}")
        if len(request["data"]) < 1:
            raise HTTPException(status_code=400, detail="Missing URL data. Expected: {'data': ['url_to_scrape']}")

        url = request["data"][0]

        # Perform web scraping
        result = scrape_web_mcp(url)

        # Return MCP-compatible response
        return JSONResponse(content={"data": [result]})

    except HTTPException:
        # BUGFIX: re-raise deliberate 4xx errors; previously the generic
        # handler below swallowed them and reported them as 500s.
        raise
    except Exception as e:
        logger.error("MCP endpoint error: %s", e)
        return JSONResponse(
            status_code=500,
            content={"data": [f"❌ Server error: {e!s}"]}
        )


def scrape_web_mcp(url: str) -> str:
    """
    MCP-compatible web scraping function.

    Args:
        url: URL to scrape (scheme optional; https:// is assumed if missing)

    Returns:
        String with extracted content, or an error message string.
    """
    try:
        if not url.strip():
            return "Error: No URL provided"

        # Add protocol if missing
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        return web_scraper.scrape_url(url)

    except Exception as e:
        error_msg = f"Error scraping URL: {e!s}"
        logger.error(error_msg)
        return error_msg


def create_gradio_interface():
    """Create and configure the Gradio interface."""
    interface = gr.Interface(
        fn=scrape_web_mcp,
        inputs=[
            gr.Textbox(
                label="URL to Scrape",
                placeholder="Enter URL (e.g., https://example.com)",
                lines=1
            )
        ],
        outputs=[
            gr.Textbox(
                label="Scraped Content",
                lines=20,
                show_copy_button=True
            )
        ],
        title="🌐 MCP Web Scraper",
        description="""
    **Web Scraping MCP Server**

    Extract content from web pages:
    - Clean text extraction
    - Title and metadata parsing
    - Rate-limited and polite scraping
    - Error handling for various content types

    Enter any URL to extract its text content.
    """,
        examples=[
            ["https://example.com"],
            ["https://news.ycombinator.com"],
            ["https://wikipedia.org"]
        ],
        allow_flagging="never",
        analytics_enabled=False
    )
    return interface


def main():
    """Main function to run the combined FastAPI + Gradio app."""
    logger.info("Starting MCP Web Scraper Tool...")

    # Create the Gradio interface
    interface = create_gradio_interface()

    # BUGFIX: mount Gradio onto FastAPI with the supported helper.
    # ``interface.app`` does not exist before launch(), so the previous
    # ``app.mount("/", interface.app)`` raised AttributeError at startup.
    combined_app = gr.mount_gradio_app(app, interface, path="/")

    # Launch the combined FastAPI + Gradio app
    logger.info("Launching FastAPI + Gradio interface...")
    uvicorn.run(
        combined_app,
        host="0.0.0.0",
        port=7860,
        log_level="info"
    )


if __name__ == "__main__":
    main()