Shami96 committed on
Commit
1adc50b
·
verified ·
1 Parent(s): 3187cdc

Update invoice.py

Browse files
Files changed (1) hide show
  1. invoice.py +29 -2
invoice.py CHANGED
@@ -432,10 +432,37 @@ def read_pdf(path: str) -> str:
432
  return ""
433
 
434
  def read_docx_plain(path: str) -> str:
435
- if not docx_available: return ""
 
436
  try:
437
  d = docx.Document(path)
438
- return "\n".join(p.text for p in d.paragraphs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
439
  except Exception:
440
  return ""
441
 
 
432
  return ""
433
 
434
  def read_docx_plain(path: str) -> str:
435
+ if not docx_available:
436
+ return "" # consider logging: "python-docx not installed"
437
  try:
438
  d = docx.Document(path)
439
+ parts = []
440
+
441
+ # Body paragraphs
442
+ parts += [p.text.strip() for p in d.paragraphs if p.text and p.text.strip()]
443
+
444
+ # Tables: join first two cells as "Label: Value" to satisfy your regex
445
+ for tbl in d.tables:
446
+ for row in tbl.rows:
447
+ cells = [c.text.strip() for c in row.cells]
448
+ cells = [c for c in cells if c] # drop empties
449
+ if not cells:
450
+ continue
451
+ if len(cells) >= 2:
452
+ parts.append(f"{cells[0]}: {cells[1]}")
453
+ else:
454
+ parts.append(cells[0])
455
+
456
+ # Headers and footers (often hold contact blocks)
457
+ for s in d.sections:
458
+ parts += [p.text.strip() for p in s.header.paragraphs if p.text and p.text.strip()]
459
+ parts += [p.text.strip() for p in s.footer.paragraphs if p.text and p.text.strip()]
460
+
461
+ # Normalize spacing so ^|\n anchors work
462
+ text = "\n".join(parts)
463
+ text = re.sub(r"[ \t]+", " ", text)
464
+ text = re.sub(r"\r?\n[ \t]*", "\n", text)
465
+ return text
466
  except Exception:
467
  return ""
468