Adityahulk
Restoring repo state for deployment
6fc3143
raw
history blame
2.82 kB
import logging
from pathlib import Path
from typing import Union
from .pdf_parser import PDFParser
from .url_parser import URLParser
logger = logging.getLogger(__name__)
class InputProcessor:
    """
    Main entry point for processing different types of inputs.

    Dispatches on a string tag to return plain text as-is, hand a PDF to
    ``PDFParser``, or hand a URL to ``URLParser``.
    """

    @staticmethod
    def process(input_type: str, input_data: Union[str, bytes]) -> str:
        """
        Process input based on type.

        Args:
            input_type: 'text', 'pdf', or 'url'.
            input_data: The actual text, a URL, or for 'pdf' one of:
                a path to an existing file, raw PDF bytes, or a base64
                string (optionally prefixed with a data-URI header such as
                "data:application/pdf;base64,").

        Returns:
            For 'text', ``input_data`` unchanged; otherwise whatever the
            corresponding parser returns (presumably the extracted text).

        Raises:
            ValueError: If ``input_type`` is not supported, or if a
                non-path PDF payload cannot be decoded or parsed.
        """
        logger.info(f"Processing input type: {input_type}")

        if input_type == 'text':
            return input_data

        if input_type == 'pdf':
            # An existing file on disk is parsed directly; errors from the
            # parser propagate unchanged on this path.
            if InputProcessor._is_existing_path(input_data):
                return PDFParser.parse(input_data)
            try:
                return InputProcessor._parse_pdf_payload(input_data)
            except Exception as e:
                logger.error(f"Failed to process PDF input: {e}")
                # Chain the cause so the original traceback is preserved.
                raise ValueError(f"Invalid PDF input: {e}") from e

        if input_type == 'url':
            return URLParser.parse(input_data)

        raise ValueError(f"Unsupported input type: {input_type}")

    @staticmethod
    def _is_existing_path(input_data) -> bool:
        """Return True if ``input_data`` is short and names an existing path."""
        try:
            # Only probe the filesystem if it looks like a path (not too
            # long); a base64 payload is typically far longer than this.
            return len(str(input_data)) < 256 and Path(input_data).exists()
        except (TypeError, ValueError, OSError):
            # bytes / non-path-like objects, embedded NULs, or OS-level
            # failures all mean "not a usable path".
            return False

    @staticmethod
    def _parse_pdf_payload(input_data) -> str:
        """Decode an in-memory PDF payload, parse it via a temp file, clean up."""
        import base64
        import os
        import tempfile

        # Decode BEFORE creating the temp file so that a malformed payload
        # cannot leave an orphaned file behind.
        if isinstance(input_data, bytes):
            pdf_bytes = input_data
        else:
            # Handle potential header "data:application/pdf;base64,"
            if "," in input_data:
                input_data = input_data.split(",")[1]
            pdf_bytes = base64.b64decode(input_data)

        tmp_path = None
        try:
            # delete=False: the file is closed here and handed to the parser
            # by path, then removed explicitly below.
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp_path = tmp.name
                tmp.write(pdf_bytes)
            logger.info(f"Saved base64 PDF to temporary file: {tmp_path}")
            return PDFParser.parse(tmp_path)
        finally:
            # Cleanup runs whether writing or parsing succeeded or failed.
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError as e:
                    logger.warning(f"Failed to delete temp PDF file: {e}")