Spaces:

Shami96
/

NHVAS_Quote_Generator

Sleeping

Shami96 committed on Aug 31

Commit

1adc50b

verified ·

1 Parent(s): 3187cdc

Update invoice.py

Files changed (1) hide show

invoice.py CHANGED Viewed

@@ -432,10 +432,37 @@ def read_pdf(path: str) -> str:
         return ""
 def read_docx_plain(path: str) -> str:
-    if not docx_available: return ""
     try:
         d = docx.Document(path)
-        return "\n".join(p.text for p in d.paragraphs)
     except Exception:
         return ""

         return ""
 def read_docx_plain(path: str) -> str:
+    if not docx_available:
+        return ""  # consider logging: "python-docx not installed"
     try:
         d = docx.Document(path)
+        parts = []
+        # Body paragraphs
+        parts += [p.text.strip() for p in d.paragraphs if p.text and p.text.strip()]
+        # Tables: join first two cells as "Label: Value" to satisfy your regex
+        for tbl in d.tables:
+            for row in tbl.rows:
+                cells = [c.text.strip() for c in row.cells]
+                cells = [c for c in cells if c]  # drop empties
+                if not cells:
+                    continue
+                if len(cells) >= 2:
+                    parts.append(f"{cells[0]}: {cells[1]}")
+                else:
+                    parts.append(cells[0])
+        # Headers and footers (often hold contact blocks)
+        for s in d.sections:
+            parts += [p.text.strip() for p in s.header.paragraphs if p.text and p.text.strip()]
+            parts += [p.text.strip() for p in s.footer.paragraphs if p.text and p.text.strip()]
+        # Normalize spacing so ^|\n anchors work
+        text = "\n".join(parts)
+        text = re.sub(r"[ \t]+", " ", text)
+        text = re.sub(r"\r?\n[ \t]*", "\n", text)
+        return text
     except Exception:
         return ""