luciagomez committed
Commit 555f6a1 · verified · 1 Parent(s): cbe72d1

Upload app.py

Files changed (1)
  1. app.py +215 -0
app.py ADDED
@@ -0,0 +1,215 @@
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from markdownify import markdownify as md
import tempfile
import zipfile
import re
from typing import Optional, Set, Tuple
import os
import gradio as gr
from collections import deque


# ===========================================================
# UTILITIES - Recursive crawler
# ===========================================================
def crawl_site_for_links(start_url: str, max_pages: int = 50, max_depth: int = 2) -> Tuple[Set[str], Set[str]]:
    """Crawl the given site (up to max_depth) and return sets of HTML and PDF URLs."""
    visited = set()
    html_links = set()
    pdf_links = set()

    parsed_base = urlparse(start_url)
    domain = parsed_base.netloc

    queue = deque([(start_url, 0)])  # (url, depth)
    session = requests.Session()
    session.headers.update({
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0 Safari/537.36'
    })

    while queue and len(visited) < max_pages:
        current_url, depth = queue.popleft()
        if current_url in visited or depth > max_depth:
            continue

        visited.add(current_url)
        try:
            response = session.get(current_url, timeout=10)
            if "text/html" not in response.headers.get("Content-Type", ""):
                continue

            soup = BeautifulSoup(response.content, "html.parser")
            for a in soup.find_all("a", href=True):
                href = a["href"]
                full_url = urljoin(current_url, href)
                parsed = urlparse(full_url)

                # Stay in the same domain
                if parsed.netloc != domain:
                    continue

                if full_url.lower().endswith(".pdf"):
                    pdf_links.add(full_url)
                elif not href.startswith(("#", "javascript:", "mailto:", "tel:")):
                    html_links.add(full_url)
                    if full_url not in visited and depth + 1 <= max_depth:
                        queue.append((full_url, depth + 1))
        except Exception:
            continue

    return html_links, pdf_links

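# Example usage of the crawler (hypothetical URL, shown for illustration only):
#   html_pages, pdfs = crawl_site_for_links("https://example.com", max_pages=10, max_depth=1)
#   print(f"{len(html_pages)} HTML links, {len(pdfs)} PDF links found")
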
# ===========================================================
# MAIN FUNCTION - Extract text & PDFs into ZIP
# ===========================================================
def extract_all_content_as_zip(url: str, max_links: Optional[int] = None, max_depth: int = 2) -> Tuple[str, Optional[str]]:
    """
    Extract text content and PDFs from all internal links found on a website recursively.
    """
    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Gradio's Number/Slider components may hand over floats; normalise them
        # so the list slice below does not fail.
        max_links = int(max_links) if max_links else None
        max_depth = int(max_depth)

        html_links, pdf_links = crawl_site_for_links(url, max_pages=(max_links or 50), max_depth=max_depth)

        if not html_links and not pdf_links:
            return "No internal links or PDFs found to extract.", None

        # Limit HTML pages if requested
        total_html = len(html_links)
        if max_links is not None:
            html_links = list(html_links)[:max_links]
            limited_message = f" (limited to {max_links} out of {total_html})"
        else:
            limited_message = ""

        with tempfile.NamedTemporaryFile(delete=False, suffix=".zip") as temp_zip:
            zip_path = temp_zip.name

        successful_html = 0
        failed_html = 0
        successful_pdfs = 0
        failed_pdfs = 0

        session = requests.Session()
        session.headers.update({
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        })

        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zip_file:
            # Process HTML pages
            for i, link_url in enumerate(html_links, 1):
                try:
                    resp = session.get(link_url, timeout=10)
                    resp.raise_for_status()
                    soup = BeautifulSoup(resp.content, "html.parser")
                    for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
                        tag.decompose()
                    main_content = (
                        soup.find("main") or
                        soup.find("article") or
                        soup.find("div", class_=re.compile(r"content|main|post|article")) or
                        soup.find("body")
                    )
                    if not main_content:
                        failed_html += 1
                        continue
                    markdown_text = md(str(main_content), heading_style="ATX")
                    markdown_text = re.sub(r"\n{3,}", "\n\n", markdown_text)
                    title = soup.find("title")
                    if title:
                        markdown_text = f"# {title.get_text().strip()}\n\n{markdown_text.strip()}"

                    filename = re.sub(r"[^\w\-_.]", "_", urlparse(link_url).path.strip("/") or f"page_{i}") + ".md"
                    if filename in [".md", "index.md"]:
                        filename = f"page_{i}.md"

                    zip_file.writestr(filename, f"<!-- Source: {link_url} -->\n\n{markdown_text.strip()}")
                    successful_html += 1
                except Exception:
                    failed_html += 1
                    continue

            # Process PDFs
            for j, pdf_url in enumerate(pdf_links, 1):
                try:
                    resp = session.get(pdf_url, timeout=20)
                    resp.raise_for_status()
                    pdf_filename = os.path.basename(urlparse(pdf_url).path)
                    if not pdf_filename.lower().endswith(".pdf"):
                        pdf_filename = f"document_{j}.pdf"
                    zip_file.writestr(f"pdfs/{pdf_filename}", resp.content)
                    successful_pdfs += 1
                except Exception:
                    failed_pdfs += 1
                    continue

        status_message = (
            f"Extracted {successful_html} HTML pages{limited_message} and "
            f"downloaded {successful_pdfs} PDFs successfully."
        )
        if failed_html or failed_pdfs:
            status_message += f" ({failed_html} HTML and {failed_pdfs} PDF downloads failed.)"
        status_message += f" Created ZIP with {successful_html} markdown files and {successful_pdfs} PDFs."

        return status_message, zip_path

    except Exception as e:
        return f"Error: {str(e)}", None

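# Resulting archive layout (see the writestr calls above): one Markdown file per
# extracted HTML page at the ZIP root, each prefixed with a "<!-- Source: ... -->"
# comment, and the downloaded PDFs under the "pdfs/" folder.
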
# ===========================================================
# GRADIO UI
# ===========================================================
def gradio_extract(url, max_links, max_depth):
    message, zip_path = extract_all_content_as_zip(url, max_links, max_depth)
    return message, zip_path


gr_interface = gr.Interface(
    fn=gradio_extract,
    inputs=[
        gr.Textbox(label="Website URL", placeholder="https://example.com"),
        gr.Number(label="Max number of links (optional)", value=50),
        gr.Slider(label="Crawl depth", minimum=1, maximum=3, value=2, step=1)
    ],
    outputs=[
        gr.Textbox(label="Status Message"),
        gr.File(label="Download ZIP")
    ],
    title="Recursive Website Content & PDF Extractor",
    description="Recursively crawls a website to extract text and PDF files from internal pages and bundles them into a ZIP file."
)


# ===========================================================
# MCP SERVER STUB
# ===========================================================
class MCPServerApp:
    def __init__(self):
        self.name = "mcp_content_extractor"

    def launch(self, mcp_server=False):
        if mcp_server:
            print("🚀 MCP server running (stub mode) - ready for agent connections.")
        else:
            print("Launching Gradio UI...")
            gr_interface.launch(server_name="0.0.0.0", server_port=7860)


def create_mcp_interface():
    return MCPServerApp()


# ===========================================================
# ENTRY POINT
# ===========================================================
if __name__ == "__main__":
    app = create_mcp_interface()
    app.launch(mcp_server=False)
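
For reference, a minimal sketch of driving the uploaded extractor from another script rather than through the Gradio UI. It assumes app.py is importable from the working directory; the URL and the output filename are placeholders, not anything defined in this commit.

    # Standalone driver sketch (placeholder URL and output name).
    import shutil
    from app import extract_all_content_as_zip

    status, zip_path = extract_all_content_as_zip("https://example.com", max_links=10, max_depth=1)
    print(status)
    if zip_path:
        # The function leaves the archive in a temp location; copy it somewhere durable.
        shutil.copy(zip_path, "site_export.zip")

Note that MCPServerApp is only a stub: passing mcp_server=True just prints a message. Recent Gradio releases (installed with the gradio[mcp] extra) support an mcp_server=True argument to launch(), which the stub's parameter name mirrors; whether that option is available here depends on the Gradio version installed for this Space.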