|
|
import pdfplumber
|
|
|
import fitz
|
|
|
from docx import Document
|
|
|
from typing import IO
|
|
|
|
|
|
from .logging_config import get_logger
|
|
|
|
|
|
logger = get_logger(__name__)
|
|
|
|
|
|
|
|
|
def extract_text_from_pdf(pdf_file: IO) -> str:
|
|
|
"""Try PyMuPDF first, fallback to pdfplumber.
|
|
|
|
|
|
Args:
|
|
|
pdf_file: file-like object opened in binary mode.
|
|
|
|
|
|
Returns:
|
|
|
Extracted text or empty string on failure.
|
|
|
"""
|
|
|
text = ""
|
|
|
|
|
|
|
|
|
try:
|
|
|
file_bytes = pdf_file.read()
|
|
|
pdf_file.seek(0)
|
|
|
with fitz.open(stream=file_bytes, filetype="pdf") as doc:
|
|
|
for page in doc:
|
|
|
page_text = page.get_text("text")
|
|
|
if page_text:
|
|
|
text += page_text + "\n"
|
|
|
if len(text.strip()) > 400:
|
|
|
logger.debug("Extracted PDF via PyMuPDF; length=%d", len(text))
|
|
|
return text.strip()
|
|
|
except Exception as exc:
|
|
|
logger.exception("PyMuPDF extraction failed: %s", exc)
|
|
|
|
|
|
|
|
|
try:
|
|
|
pdf_file.seek(0)
|
|
|
with pdfplumber.open(pdf_file) as pdf:
|
|
|
for page in pdf.pages:
|
|
|
page_text = page.extract_text() or ""
|
|
|
text += page_text + "\n"
|
|
|
logger.debug("Extracted PDF via pdfplumber; length=%d", len(text))
|
|
|
return text.strip()
|
|
|
except Exception as exc:
|
|
|
logger.exception("pdfplumber extraction failed: %s", exc)
|
|
|
return ""
|
|
|
|
|
|
|
|
|
def extract_text_from_docx(docx_file: IO) -> str:
|
|
|
try:
|
|
|
doc = Document(docx_file)
|
|
|
text = "\n".join([p.text for p in doc.paragraphs])
|
|
|
logger.debug("Extracted DOCX; length=%d", len(text))
|
|
|
return text
|
|
|
except Exception as exc:
|
|
|
logger.exception("DOCX extraction failed: %s", exc)
|
|
|
return ""
|
|
|
|
|
|
|
|
|
def extract_text_from_txt(txt_file: IO) -> str:
|
|
|
try:
|
|
|
content = txt_file.read()
|
|
|
if isinstance(content, bytes):
|
|
|
text = content.decode("utf-8", errors="replace")
|
|
|
else:
|
|
|
text = str(content)
|
|
|
logger.debug("Extracted TXT; length=%d", len(text))
|
|
|
return text
|
|
|
except Exception as exc:
|
|
|
logger.exception("TXT extraction failed: %s", exc)
|
|
|
return ""
|
|
|
|