File size: 2,399 Bytes
bac493d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import pdfplumber
import fitz # PyMuPDF
from docx import Document
from typing import IO
from .logging_config import get_logger
# Module-level logger shared by every extractor in this file; created through
# the package's own ``logging_config.get_logger`` helper.
logger = get_logger(__name__)
def extract_text_from_pdf(pdf_file: IO, min_chars: int = 400) -> str:
    """Extract text from a PDF, trying PyMuPDF first and pdfplumber second.

    The PyMuPDF result is returned immediately when its stripped length
    exceeds ``min_chars``. Otherwise pdfplumber is run on the same stream and
    the longer of the two results is returned. The two results are kept in
    separate variables so a short PyMuPDF extraction is neither duplicated
    into the pdfplumber output nor lost when pdfplumber fails.

    Args:
        pdf_file: seekable file-like object opened in binary mode.
        min_chars: stripped-length threshold above which the PyMuPDF result
            is accepted without consulting pdfplumber.

    Returns:
        Extracted text with surrounding whitespace stripped, or an empty
        string when both extractors fail or find no text.
    """
    primary = ""

    # First: PyMuPDF
    try:
        file_bytes = pdf_file.read()
        pdf_file.seek(0)  # reset pointer so other libs can read
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            page_texts = [page.get_text("text") for page in doc]
        # Pages without text are skipped; the rest are newline-separated.
        primary = "\n".join(chunk for chunk in page_texts if chunk).strip()
        if len(primary) > min_chars:  # PyMuPDF extracted enough
            logger.debug("Extracted PDF via PyMuPDF; length=%d", len(primary))
            return primary
    except Exception as exc:  # pragma: no cover - depends on external files
        logger.exception("PyMuPDF extraction failed: %s", exc)

    # Fallback: pdfplumber
    fallback = ""
    try:
        pdf_file.seek(0)
        with pdfplumber.open(pdf_file) as pdf:
            fallback = "\n".join(
                page.extract_text() or "" for page in pdf.pages
            ).strip()
        logger.debug("Extracted PDF via pdfplumber; length=%d", len(fallback))
    except Exception as exc:  # pragma: no cover
        logger.exception("pdfplumber extraction failed: %s", exc)

    # Keep whichever extractor recovered more text; on a tie the fallback
    # wins because it was only consulted when PyMuPDF's result was too short.
    return fallback if len(fallback) >= len(primary) else primary
def extract_text_from_docx(docx_file: IO) -> str:
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Args:
        docx_file: file-like object handed straight to ``Document``.

    Returns:
        The text of every paragraph joined with newlines, or an empty string
        if anything raises while loading or reading the document (the error
        is logged with its traceback).
    """
    try:
        document = Document(docx_file)
        joined = "\n".join(paragraph.text for paragraph in document.paragraphs)
        logger.debug("Extracted DOCX; length=%d", len(joined))
    except Exception as exc:
        logger.exception("DOCX extraction failed: %s", exc)
        return ""
    return joined
def extract_text_from_txt(txt_file: IO) -> str:
    """Read a plain-text file and return its content as ``str``.

    Args:
        txt_file: file-like object opened in binary or text mode.

    Returns:
        The file content. Binary payloads are decoded as UTF-8 with
        undecodable sequences replaced, and a leading byte-order mark is
        removed. An empty string is returned if reading or decoding raises
        (the error is logged with its traceback).
    """
    try:
        content = txt_file.read()
        if isinstance(content, (bytes, bytearray)):
            # bytearray is decoded too; str() on it would return its repr.
            text = content.decode("utf-8", errors="replace")
        else:
            text = str(content)
        # A BOM is an encoding signature, not content: drop it so the text
        # does not start with an invisible U+FEFF character.
        if text.startswith("\ufeff"):
            text = text[1:]
        logger.debug("Extracted TXT; length=%d", len(text))
        return text
    except Exception as exc:
        logger.exception("TXT extraction failed: %s", exc)
        return ""
|