Spaces:
Sleeping
Sleeping
| import logging | |
| from pathlib import Path | |
| from typing import Union | |
| from .pdf_parser import PDFParser | |
| from .url_parser import URLParser | |
| logger = logging.getLogger(__name__) | |
class InputProcessor:
    """
    Main entry point for processing different types of inputs.

    Dispatches on an input-type tag: plain text is returned unchanged,
    PDFs are handed to ``PDFParser.parse`` and URLs to ``URLParser.parse``.
    """

    # Inputs whose string form is this long or longer are never probed on
    # the filesystem; they are treated as encoded PDF content instead.
    _MAX_PATH_LENGTH = 256

    @staticmethod
    def _is_existing_path(candidate) -> bool:
        """Return True if *candidate* is short enough and names an existing path."""
        if isinstance(candidate, bytes):
            # Raw PDF bytes are content, never a path.
            return False
        try:
            return (
                len(str(candidate)) < InputProcessor._MAX_PATH_LENGTH
                and Path(candidate).exists()
            )
        except (OSError, TypeError, ValueError):
            # Anything that cannot be interpreted as a path is simply
            # "not a path"; the caller falls back to base64 handling.
            return False

    @staticmethod
    def _parse_pdf_payload(payload) -> str:
        """Write raw or base64-encoded PDF content to a temp file and parse it."""
        import base64
        import os
        import tempfile

        if isinstance(payload, bytes):
            pdf_bytes = payload
        else:
            # Handle potential header "data:application/pdf;base64,"
            if "," in payload:
                payload = payload.split(",")[1]
            # Decode BEFORE creating the temp file so that invalid base64
            # cannot leave an orphaned file behind.
            pdf_bytes = base64.b64decode(payload)

        # delete=False: the file is closed before PDFParser reopens it by
        # name, so removal is done explicitly in the finally block below.
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
        tmp_path = tmp.name
        try:
            with tmp:
                tmp.write(pdf_bytes)
            logger.info(f"Saved base64 PDF to temporary file: {tmp_path}")
            return PDFParser.parse(tmp_path)
        finally:
            # Cleanup temp file; a failed delete is logged, not raised.
            try:
                os.unlink(tmp_path)
            except OSError as e:
                logger.warning(f"Failed to delete temp PDF file: {e}")

    @staticmethod
    def process(input_type: str, input_data: Union[str, bytes]) -> str:
        """
        Process input based on type.

        Args:
            input_type: 'text', 'pdf', or 'url'
            input_data: The actual text, file path, or URL. For 'pdf' this
                may also be raw PDF bytes or a base64 string, optionally
                prefixed with a "data:application/pdf;base64," header.

        Returns:
            Extracted text content ('text' input is returned unchanged).

        Raises:
            ValueError: If ``input_type`` is not supported, or if PDF
                content that is not an existing file path cannot be
                decoded or parsed.
        """
        logger.info(f"Processing input type: {input_type}")

        if input_type == 'text':
            return input_data

        if input_type == 'pdf':
            # An existing file path is parsed directly; errors from the
            # parser propagate unchanged on this route.
            if InputProcessor._is_existing_path(input_data):
                return PDFParser.parse(input_data)

            # Otherwise treat the input as raw bytes / base64 content.
            try:
                return InputProcessor._parse_pdf_payload(input_data)
            except Exception as e:
                logger.error(f"Failed to process PDF input: {e}")
                raise ValueError(f"Invalid PDF input: {e}") from e

        if input_type == 'url':
            return URLParser.parse(input_data)

        raise ValueError(f"Unsupported input type: {input_type}")