skeegap / utils /extract_text.py
asoyewole's picture
upload prv missing utils/
bac493d verified
import pdfplumber
import fitz # PyMuPDF
from docx import Document
from typing import IO
from .logging_config import get_logger
# Module-level logger used by the extraction helpers below.
logger = get_logger(__name__)
def extract_text_from_pdf(pdf_file: IO) -> str:
    """Extract text from a PDF, trying PyMuPDF first and pdfplumber second.

    PyMuPDF's result is returned immediately when it yields more than 400
    characters of stripped text. Otherwise pdfplumber is tried as well and
    the longer of the two results is returned, so a short-but-valid PyMuPDF
    extraction is neither duplicated into the fallback output nor discarded
    when pdfplumber fails.

    Args:
        pdf_file: Seekable file-like object opened in binary mode.

    Returns:
        The extracted text with surrounding whitespace stripped, or an empty
        string when both extractors fail or find no text.
    """
    # First: PyMuPDF
    primary_text = ""
    try:
        file_bytes = pdf_file.read()
        pdf_file.seek(0)  # reset pointer so other libs can read
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            page_texts = [page.get_text("text") for page in doc]
        # Skip pages that produced no text, one newline between the rest.
        primary_text = "\n".join(chunk for chunk in page_texts if chunk).strip()
        if len(primary_text) > 400:  # PyMuPDF extracted enough
            logger.debug(
                "Extracted PDF via PyMuPDF; length=%d", len(primary_text)
            )
            return primary_text
    except Exception as exc:  # pragma: no cover - depends on external files
        logger.exception("PyMuPDF extraction failed: %s", exc)

    # Fallback: pdfplumber. Accumulate separately so the partial PyMuPDF text
    # is not prepended to (and duplicated in) the pdfplumber output.
    fallback_text = ""
    try:
        pdf_file.seek(0)
        with pdfplumber.open(pdf_file) as pdf:
            fallback_text = "\n".join(
                page.extract_text() or "" for page in pdf.pages
            ).strip()
        logger.debug(
            "Extracted PDF via pdfplumber; length=%d", len(fallback_text)
        )
    except Exception as exc:  # pragma: no cover
        logger.exception("pdfplumber extraction failed: %s", exc)

    # Keep whichever extractor recovered more text; ties favour PyMuPDF and
    # two failures yield "".
    return max(primary_text, fallback_text, key=len)
def extract_text_from_docx(docx_file: IO) -> str:
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Args:
        docx_file: Object handed straight to ``Document``; presumably a
            path or a binary file-like object (confirm against callers).

    Returns:
        The text of every paragraph joined with newlines, or an empty string
        if loading or reading the document raises.
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        joined = "\n".join(paragraph.text for paragraph in paragraphs)
        logger.debug("Extracted DOCX; length=%d", len(joined))
        return joined
    except Exception as exc:
        # Best-effort extraction: log the traceback and report "no text".
        logger.exception("DOCX extraction failed: %s", exc)
        return ""
def extract_text_from_txt(txt_file: IO) -> str:
    """Read a plain-text file object and return its content as ``str``.

    Binary content is decoded as UTF-8 with undecodable bytes replaced by
    U+FFFD. A leading UTF-8 byte-order mark is dropped so it does not leak
    into the returned text as a stray ``\\ufeff`` character.

    Args:
        txt_file: File-like object opened in binary or text mode.

    Returns:
        The file content as text, or an empty string if reading fails.
    """
    try:
        content = txt_file.read()
        if isinstance(content, (bytes, bytearray)):
            # "utf-8-sig" decodes exactly like "utf-8" but strips a leading
            # BOM when one is present.
            text = content.decode("utf-8-sig", errors="replace")
        else:
            text = str(content)
        logger.debug("Extracted TXT; length=%d", len(text))
        return text
    except Exception as exc:
        # Best-effort extraction: log the traceback and report "no text".
        logger.exception("TXT extraction failed: %s", exc)
        return ""