File size: 2,399 Bytes
bac493d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import pdfplumber
import fitz  # PyMuPDF
from docx import Document
from typing import IO

from .logging_config import get_logger

logger = get_logger(__name__)


def extract_text_from_pdf(pdf_file: IO, *, min_chars: int = 400) -> str:
    """Extract text from a PDF, trying PyMuPDF first and pdfplumber second.

    PyMuPDF's result is returned immediately when its stripped length exceeds
    ``min_chars``. Otherwise pdfplumber is run as well and the longer of the
    two results is returned. The two results are kept separate, so a short
    PyMuPDF extraction is neither duplicated into the pdfplumber output nor
    thrown away when pdfplumber fails.

    Args:
        pdf_file: file-like object opened in binary mode; it is read with
            ``read()`` and rewound with ``seek(0)``.
        min_chars: number of stripped characters the PyMuPDF result must
            exceed to be accepted without trying pdfplumber.

    Returns:
        Extracted text with surrounding whitespace stripped, or an empty
        string when neither library produced any text.
    """
    # First: PyMuPDF
    primary = ""
    try:
        file_bytes = pdf_file.read()
        pdf_file.seek(0)  # reset pointer so other libs can read
        with fitz.open(stream=file_bytes, filetype="pdf") as doc:
            page_texts = (page.get_text("text") for page in doc)
            # Pages with no text are skipped; the rest are newline-separated.
            primary = "\n".join(t for t in page_texts if t).strip()
        if len(primary) > min_chars:  # PyMuPDF extracted enough
            logger.debug("Extracted PDF via PyMuPDF; length=%d", len(primary))
            return primary
    except Exception as exc:  # pragma: no cover - depends on external files
        logger.exception("PyMuPDF extraction failed: %s", exc)

    # Fallback: pdfplumber, accumulated separately from the PyMuPDF text.
    fallback = ""
    try:
        pdf_file.seek(0)
        with pdfplumber.open(pdf_file) as pdf:
            fallback = "\n".join(
                page.extract_text() or "" for page in pdf.pages
            ).strip()
        logger.debug("Extracted PDF via pdfplumber; length=%d", len(fallback))
    except Exception as exc:  # pragma: no cover
        logger.exception("pdfplumber extraction failed: %s", exc)

    # Prefer the fallback on ties; keep the short PyMuPDF text if it is all
    # that was recovered.
    return fallback if len(fallback) >= len(primary) else primary


def extract_text_from_docx(docx_file: IO) -> str:
    """Return the paragraph text of a DOCX document, one paragraph per line.

    Only the document's ``paragraphs`` collection is read.

    Args:
        docx_file: file-like object handed directly to ``Document``.

    Returns:
        The paragraphs' text joined with newlines, or an empty string if
        loading or reading the document raises (the error is logged).
    """
    try:
        paragraphs = Document(docx_file).paragraphs
        joined = "\n".join(paragraph.text for paragraph in paragraphs)
        logger.debug("Extracted DOCX; length=%d", len(joined))
    except Exception as exc:
        logger.exception("DOCX extraction failed: %s", exc)
        return ""
    return joined


def extract_text_from_txt(txt_file: IO) -> str:
    """Read a plain-text file object and return its content as ``str``.

    Args:
        txt_file: file-like object; its ``read()`` may return either bytes
            or text.

    Returns:
        Bytes content decoded as UTF-8 with undecodable bytes replaced,
        any other content converted with ``str()``, or an empty string if
        reading raises (the error is logged).
    """
    try:
        raw = txt_file.read()
        decoded = (
            raw.decode("utf-8", errors="replace")
            if isinstance(raw, bytes)
            else str(raw)
        )
        logger.debug("Extracted TXT; length=%d", len(decoded))
    except Exception as exc:
        logger.exception("TXT extraction failed: %s", exc)
        return ""
    return decoded