Spaces:
Sleeping
Sleeping
Update invoice.py
Browse files- invoice.py +29 -2
invoice.py
CHANGED
|
@@ -432,10 +432,37 @@ def read_pdf(path: str) -> str:
|
|
| 432 |
return ""
|
| 433 |
|
| 434 |
def read_docx_plain(path: str) -> str:
|
| 435 |
-
if not docx_available:
|
|
|
|
| 436 |
try:
|
| 437 |
d = docx.Document(path)
|
| 438 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 439 |
except Exception:
|
| 440 |
return ""
|
| 441 |
|
|
|
|
| 432 |
return ""
|
| 433 |
|
| 434 |
def _docx_nonblank_texts(paragraphs):
    """Return the stripped text of every paragraph that is not blank."""
    return [p.text.strip() for p in paragraphs if p.text and p.text.strip()]


def _docx_row_line(row):
    """Render one table row as "Label: Value", a lone value, or None if empty."""
    texts = []
    previous = None
    for cell in row.cells:
        # python-docx yields the same cell object once for every grid column a
        # horizontally merged cell spans; without this check a merged label
        # cell would come out as "Label: Label" and hide the real value.
        if cell is previous:
            continue
        previous = cell
        text = cell.text.strip()
        if text:  # drop empties
            texts.append(text)
    if not texts:
        return None
    if len(texts) >= 2:
        # Only the first two non-empty cells are used, joined as
        # "Label: Value" so the downstream label regexes can match.
        return f"{texts[0]}: {texts[1]}"
    return texts[0]


def read_docx_plain(path: str) -> str:
    """Extract the text of a .docx file as newline-separated plain text.

    The output is assembled in this order: body paragraphs, then one line per
    non-empty table row, then the header and footer paragraphs of each
    section. Blank paragraphs and empty rows are skipped.

    Args:
        path: Location of the .docx file to read.

    Returns:
        The normalised text, or "" when python-docx is unavailable (the
        ``docx_available`` flag is falsy) or the document cannot be read.
        Failures are logged as warnings; no exception is raised.
    """
    import logging  # local import so the block does not rely on a file-level one

    log = logging.getLogger(__name__)

    if not docx_available:
        log.warning("python-docx not installed; cannot read %s", path)
        return ""
    try:
        d = docx.Document(path)

        # Body paragraphs
        parts = _docx_nonblank_texts(d.paragraphs)

        # Tables: one line per row that has any text
        for tbl in d.tables:
            for row in tbl.rows:
                line = _docx_row_line(row)
                if line is not None:
                    parts.append(line)

        # Headers and footers (often hold contact blocks)
        for s in d.sections:
            for part in (s.header, s.footer):
                # A header/footer linked to the previous section has no
                # content of its own; reading it would repeat the earlier
                # section's text once per section.
                if part.is_linked_to_previous:
                    continue
                parts += _docx_nonblank_texts(part.paragraphs)

        # Normalize spacing so ^|\n anchors work
        text = "\n".join(parts)
        text = re.sub(r"[ \t]+", " ", text)
        # Trim blanks on both sides of every line break (a paragraph may carry
        # internal breaks) and fold CRLF into LF.
        text = re.sub(r"[ \t]*\r?\n[ \t]*", "\n", text)
        return text
    except Exception:
        # Deliberate best-effort: callers get "" for unreadable documents, but
        # the cause is recorded instead of being discarded.
        log.warning("Could not extract text from %s", path, exc_info=True)
        return ""
|
| 468 |
|