Commit 365a20a · 1 parent: dd2978b · ueifu
vector_stores/chroma_db/checklist_examples/create_industry_db.py
CHANGED
Old side of the diff (removed lines only; several are cut off mid-line in this view):

@@ -148,23 +148,23 @@ class ChecklistExamplesVDB:
- """Extract document metadata from checklist"""
- # Extract document type
- "Temperature Log": ["temperature", "
@@ -173,11 +173,13 @@ class ChecklistExamplesVDB:
- # Extract product name
@@ -187,12 +189,13 @@ class ChecklistExamplesVDB:
- # Extract supplier name -
- r"company\s*[:\-]\s*([^\n]{1,40})"
@@ -232,23 +250,31 @@ class ChecklistExamplesVDB:
- r"^([A-Z][^:]+?):\s*(Acceptable\s*/\s*Non-acceptable|To be mentioned|Present\s*/\s*Absent)",
- r"^([A-Z][^(]+?)\s*\(Spec:\s*([^)]+)\)",
- # Format: "Parameter Name
- # Format: "Parameter:
- r"^([A-Z\s]+(?:EVALUATION|
- r"^\*\*([A-Z\s]+)\*\*$"
@@ -257,7 +283,7 @@ class ChecklistExamplesVDB:
- if len(section_name) > 5: # Valid section name
@@ -301,13 +327,13 @@ class ChecklistExamplesVDB:
- """Analyze parameter to determine type, input method, etc."""
- param_details_lower = param_details.lower()
@@ -316,43 +342,51 @@ class ChecklistExamplesVDB:
- if any(keyword in param_details_lower for keyword in ["acceptable
- elif any(keyword in param_details_lower for keyword in ["present
- elif "to be mentioned" in param_details_lower:
- elif any(keyword in param_name_lower for keyword in ["photo", "
- r"\(spec:\s*([^)]+)\)",
- r"tolerance\s*limit[:\s]*([^,\n]+)",
- combined_text = f"{param_name} {param_details}"
@@ -372,35 +406,26 @@ class ChecklistExamplesVDB:
- ]
- for pattern in tolerance_patterns:
- match = re.search(pattern, combined_text, re.IGNORECASE)
- if match:
- tolerance_limits = match.group(1).strip()
- break
- has_remarks = any(keyword in context_text for keyword in ["
- if any(keyword in param_name_lower for keyword in ["foreign", "
- options_list = "Stones, Glass, Metals, Plastic, Wood, Insects/Pests, Hair, Threads"
- parameter_type = "Metal Detection"
- parameter_type = "Traceability"
@@ -414,21 +439,26 @@ class ChecklistExamplesVDB:
- """Classify section based on name"""
- elif any(keyword in section_name_lower for keyword in ["temperature", "thermal", "
@@ -579,19 +618,20 @@ class ChecklistExamplesVDB:
- "product_name": doc_metadata["product_name"]

New side of the diff (context and added lines):
|
| 63 |
document_type TEXT,
|
| 64 |
product_name TEXT,
|
| 65 |
supplier_name TEXT,
|
| 66 |
+
checklist_attributes TEXT, -- Dynamic attributes instead of category
|
| 67 |
total_parameters INTEGER DEFAULT 0,
|
| 68 |
extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
| 69 |
)
|
|
|
|
| 148 |
return hashlib.md5(f.read()).hexdigest()
|
| 149 |
|
| 150 |
def extract_document_metadata(self, pdf_path, text_content):
|
| 151 |
+
"""Extract document metadata from checklist - generic approach"""
|
| 152 |
metadata = {
|
| 153 |
"document_type": "QC Checklist",
|
| 154 |
"product_name": "",
|
| 155 |
"supplier_name": "",
|
| 156 |
+
"checklist_attributes": {} # Dynamic attributes
|
| 157 |
}
|
| 158 |
|
| 159 |
+
# Extract document type generically
|
| 160 |
doc_type_patterns = {
|
| 161 |
"Inspection Record": ["inspection record", "inspection checklist", "quality inspection"],
|
| 162 |
"Pre-Shipment Inspection": ["pre-shipment", "container inspection", "shipment inspection"],
|
| 163 |
"Production Checklist": ["production checklist", "manufacturing checklist", "process checklist"],
|
| 164 |
+
"Temperature Log": ["temperature", "thermal", "cooling log"],
|
| 165 |
"Receiving Inspection": ["receiving", "goods receipt", "incoming inspection"],
|
| 166 |
"Hygiene Checklist": ["hygiene", "sanitation", "cleaning checklist"],
|
| 167 |
+
"Quality Control": ["quality control", "qc checklist", "quality check"]
|
| 168 |
}
|
| 169 |
|
| 170 |
text_lower = text_content.lower()
|
|
|
|
| 173 |
metadata["document_type"] = doc_type
|
| 174 |
break
|
| 175 |
|
| 176 |
+
# Extract product name generically
|
| 177 |
product_patterns = [
|
| 178 |
r"product\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
|
| 179 |
+
r"item\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
|
| 180 |
+
r"material\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
|
| 181 |
r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*[-–]\s*inspection",
|
| 182 |
+
r"inspection\s*of\s*([^\n]{1,40})",
|
| 183 |
r"product:\s*([^\n]{1,40})",
|
| 184 |
]
|
| 185 |
|
|
|
|
| 189 |
metadata["product_name"] = match.group(1).strip()
|
| 190 |
break
|
| 191 |
|
| 192 |
+
# Extract supplier name generically - no specific company bias
|
| 193 |
supplier_patterns = [
|
| 194 |
r"supplier\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
|
| 195 |
+
r"vendor\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
|
| 196 |
r"manufacturer\s*[:\-]\s*([^\n]{1,40})",
|
| 197 |
+
r"company\s*[:\-]\s*([^\n]{1,40})",
|
| 198 |
+
r"produced\s*by\s*[:\-]\s*([^\n]{1,40})"
|
| 199 |
]
|
| 200 |
|
| 201 |
for pattern in supplier_patterns:
|
|
|
|
| 204 |
metadata["supplier_name"] = match.group(1).strip()
|
| 205 |
break
|
| 206 |
|
| 207 |
+
# Extract dynamic attributes
|
| 208 |
+
attributes = {}
|
| 209 |
+
|
| 210 |
+
# Inspection stage/phase
|
| 211 |
+
stage_keywords = {
|
| 212 |
+
"pre-production": ["pre-production", "before production", "initial"],
|
| 213 |
+
"during-production": ["during production", "in-process", "mid-production"],
|
| 214 |
+
"final": ["final inspection", "finished goods", "end product"],
|
| 215 |
+
"incoming": ["incoming", "receiving", "goods receipt"],
|
| 216 |
+
"outgoing": ["outgoing", "dispatch", "shipping"]
|
| 217 |
}
|
| 218 |
|
| 219 |
+
for stage, keywords in stage_keywords.items():
|
| 220 |
if any(keyword in text_lower for keyword in keywords):
|
| 221 |
+
attributes["inspection_stage"] = stage
|
| 222 |
break
|
| 223 |
|
| 224 |
+
# Inspection focus
|
| 225 |
+
if any(word in text_lower for word in ["visual", "appearance", "cosmetic"]):
|
| 226 |
+
attributes["inspection_focus"] = "visual"
|
| 227 |
+
elif any(word in text_lower for word in ["dimension", "measurement", "size"]):
|
| 228 |
+
attributes["inspection_focus"] = "dimensional"
|
| 229 |
+
elif any(word in text_lower for word in ["functional", "performance", "operation"]):
|
| 230 |
+
attributes["inspection_focus"] = "functional"
|
| 231 |
+
elif any(word in text_lower for word in ["safety", "hazard", "risk"]):
|
| 232 |
+
attributes["inspection_focus"] = "safety"
|
| 233 |
+
|
| 234 |
+
# Complexity level based on parameter count (will be updated later)
|
| 235 |
+
attributes["complexity"] = "standard" # Will be updated after parameter extraction
|
| 236 |
+
|
| 237 |
+
metadata["checklist_attributes"] = json.dumps(attributes)
|
| 238 |
+
|
| 239 |
return metadata
|
| 240 |
|
| 241 |
def extract_checklist_parameters(self, text_content):
|
|
|
|
| 250 |
section_order = 0
|
| 251 |
parameter_order = 0
|
| 252 |
|
| 253 |
+
# Generic parameter extraction patterns
|
| 254 |
param_patterns = [
|
| 255 |
# Format: "Parameter Name: Type/Method"
|
| 256 |
+
r"^([A-Z][^:]+?):\s*(Acceptable\s*/\s*Non-acceptable|To be mentioned|Present\s*/\s*Absent|Pass\s*/\s*Fail)",
|
| 257 |
# Format: "Parameter (Spec: value)"
|
| 258 |
+
r"^([A-Z][^(]+?)\s*\((?:Spec|Specification):\s*([^)]+)\)",
|
| 259 |
+
# Format: "Parameter Name: [measurement/value]"
|
| 260 |
+
r"^([A-Z][^:]+?):\s*\[([^\]]+)\]",
|
| 261 |
+
# Format: "Parameter: _____" (blank field)
|
| 262 |
+
r"^([A-Z][^:]+?):\s*_{3,}",
|
| 263 |
+
# Format: "□ Parameter Name"
|
| 264 |
+
r"^[□☐]\s*([A-Z][^:]+?)$",
|
| 265 |
+
# Format: "• Parameter Name"
|
| 266 |
+
r"^[•·]\s*([A-Z][^:]+?)$",
|
| 267 |
+
# Generic: "Parameter Name: [details]"
|
| 268 |
+
r"^([A-Z][^:]+?):\s*(.{0,100})",
|
| 269 |
]
|
| 270 |
|
| 271 |
# Section header patterns
|
| 272 |
section_patterns = [
|
| 273 |
+
r"^([A-Z\s]+(?:EVALUATION|INSPECTION|CHECK|VERIFICATION|ASSESSMENT|CONTROL))\s*$",
|
| 274 |
r"^[0-9]+\.\s*([A-Z][^.]+)$",
|
| 275 |
+
r"^\*\*([A-Z\s]+)\*\*$",
|
| 276 |
+
r"^={3,}\s*([A-Z\s]+)\s*={3,}$",
|
| 277 |
+
r"^-{3,}\s*([A-Z\s]+)\s*-{3,}$"
|
| 278 |
]
|
| 279 |
|
| 280 |
for line_idx, line in enumerate(lines):
|
|
|
|
| 283 |
match = re.match(pattern, line)
|
| 284 |
if match:
|
| 285 |
section_name = match.group(1).strip()
|
| 286 |
+
if len(section_name) > 5 and len(section_name) < 50: # Valid section name
|
| 287 |
current_section = section_name
|
| 288 |
section_order += 1
|
| 289 |
sections.append({
|
|
|
|
| 327 |
return parameters, sections
|
| 328 |
|
| 329 |
def analyze_parameter(self, param_name, param_details, current_line, all_lines, line_idx):
|
| 330 |
+
"""Analyze parameter to determine type, input method, etc. - generic approach"""
|
| 331 |
param_name_lower = param_name.lower()
|
| 332 |
+
param_details_lower = param_details.lower() if param_details else ""
|
| 333 |
context_lines = all_lines[max(0, line_idx-2):min(len(all_lines), line_idx+3)]
|
| 334 |
context_text = " ".join(context_lines).lower()
|
| 335 |
|
| 336 |
+
# Initialize default values
|
| 337 |
parameter_type = "Quality Check"
|
| 338 |
input_method = "Text Input"
|
| 339 |
specifications = ""
|
|
|
|
| 342 |
measurement_units = ""
|
| 343 |
has_remarks = False
|
| 344 |
|
| 345 |
+
# Generic input method determination
|
| 346 |
+
if any(keyword in param_details_lower for keyword in ["acceptable", "non-acceptable", "pass", "fail"]):
|
| 347 |
+
if "/" in param_details_lower:
|
| 348 |
+
input_method = "Dropdown"
|
| 349 |
+
options_list = param_details.replace("/", ", ")
|
| 350 |
+
else:
|
| 351 |
+
input_method = "Toggle"
|
| 352 |
parameter_type = "Quality Assessment"
|
| 353 |
|
| 354 |
+
elif any(keyword in param_details_lower for keyword in ["present", "absent", "yes", "no"]):
|
| 355 |
input_method = "Toggle"
|
| 356 |
+
options_list = param_details.replace("/", ", ")
|
| 357 |
parameter_type = "Presence Check"
|
| 358 |
|
| 359 |
+
elif "to be mentioned" in param_details_lower or "_____" in current_line:
|
| 360 |
+
# Determine based on parameter name
|
| 361 |
+
if any(unit in param_name_lower for unit in ["temperature", "weight", "time", "dimension", "size", "count", "number"]):
|
| 362 |
input_method = "Numeric Input"
|
| 363 |
parameter_type = "Measurement"
|
| 364 |
else:
|
| 365 |
input_method = "Text Input"
|
| 366 |
parameter_type = "Information Entry"
|
| 367 |
|
| 368 |
+
elif any(keyword in param_name_lower for keyword in ["photo", "picture", "image", "visual"]):
|
| 369 |
input_method = "Image Upload"
|
| 370 |
parameter_type = "Visual Documentation"
|
| 371 |
|
| 372 |
+
elif any(keyword in param_name_lower for keyword in ["remark", "comment", "observation", "note"]):
|
| 373 |
input_method = "Remarks"
|
| 374 |
parameter_type = "Detailed Notes"
|
| 375 |
|
| 376 |
+
elif "□" in current_line or "☐" in current_line:
|
| 377 |
+
input_method = "Checklist"
|
| 378 |
+
parameter_type = "Verification Check"
|
| 379 |
+
|
| 380 |
# Specification extraction
|
| 381 |
spec_patterns = [
|
| 382 |
+
r"\((?:spec|specification):\s*([^)]+)\)",
|
| 383 |
+
r"tolerance\s*(?:limit)?[:\s]*([^,\n]+)",
|
| 384 |
r"(\d+\s*[±]\s*\d+\s*[a-zA-Z%°]+)",
|
| 385 |
+
r"([<>≤≥]\s*\d+[^,\n]*)",
|
| 386 |
+
r"(\d+\s*-\s*\d+\s*[a-zA-Z]+)",
|
| 387 |
]
|
| 388 |
|
| 389 |
+
combined_text = f"{param_name} {param_details} {' '.join(context_lines)}"
|
| 390 |
for pattern in spec_patterns:
|
| 391 |
match = re.search(pattern, combined_text, re.IGNORECASE)
|
| 392 |
if match:
|
|
|
|
| 396 |
# Extract measurement units
|
| 397 |
unit_patterns = [
|
| 398 |
r"(\d+\s*[a-zA-Z%°]+)",
|
| 399 |
+
r"(°[CcFf]|g|gram|kg|mm|cm|m|ml|L|minutes?|hours?|seconds?|%|ppm|cfu)",
|
| 400 |
]
|
| 401 |
|
| 402 |
for pattern in unit_patterns:
|
|
|
|
| 406 |
break
|
| 407 |
|
| 408 |
# Check for tolerance limits
|
| 409 |
+
if "±" in combined_text or any(op in combined_text for op in ["<", ">", "≤", "≥"]):
|
| 410 |
+
tolerance_match = re.search(r"([±<>≤≥]\s*\d+(?:\.\d+)?)", combined_text)
|
| 411 |
+
if tolerance_match:
|
| 412 |
+
tolerance_limits = tolerance_match.group(1).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
# Check for remarks requirement
|
| 415 |
+
has_remarks = any(keyword in context_text for keyword in ["remark", "comment", "observation", "corrective action"])
|
| 416 |
|
| 417 |
+
# Generic parameter type classification based on content
|
| 418 |
+
if any(keyword in param_name_lower for keyword in ["contamination", "foreign", "defect", "damage"]):
|
| 419 |
+
parameter_type = "Safety/Quality Check"
|
| 420 |
+
if not options_list and input_method == "Checklist":
|
| 421 |
+
options_list = "None observed, Minor issue, Major issue, Critical"
|
|
|
|
| 422 |
|
| 423 |
+
elif any(keyword in param_name_lower for keyword in ["batch", "lot", "code", "number", "id"]):
|
| 424 |
+
parameter_type = "Traceability"
|
| 425 |
input_method = "Text Input"
|
|
|
|
| 426 |
|
| 427 |
+
elif any(keyword in param_name_lower for keyword in ["signature", "verified", "checked"]):
|
| 428 |
+
parameter_type = "Verification"
|
|
|
|
| 429 |
|
| 430 |
return {
|
| 431 |
"parameter_type": parameter_type,
|
|
|
|
| 439 |
}
|
| 440 |
|
| 441 |
def classify_section_type(self, section_name):
|
| 442 |
+
"""Classify section based on name - generic approach"""
|
| 443 |
section_name_lower = section_name.lower()
|
| 444 |
|
| 445 |
+
# Generic section classification
|
| 446 |
+
if any(keyword in section_name_lower for keyword in ["visual", "appearance", "cosmetic"]):
|
| 447 |
+
return "Visual Assessment"
|
| 448 |
+
elif any(keyword in section_name_lower for keyword in ["measurement", "dimension", "size", "weight"]):
|
| 449 |
return "Physical Measurement"
|
| 450 |
+
elif any(keyword in section_name_lower for keyword in ["temperature", "thermal", "heat", "cold"]):
|
| 451 |
return "Temperature Control"
|
| 452 |
+
elif any(keyword in section_name_lower for keyword in ["package", "packing", "container", "seal"]):
|
| 453 |
return "Packaging Inspection"
|
| 454 |
+
elif any(keyword in section_name_lower for keyword in ["test", "analysis", "laboratory"]):
|
| 455 |
+
return "Testing/Analysis"
|
| 456 |
+
elif any(keyword in section_name_lower for keyword in ["safety", "hazard", "risk", "contamination"]):
|
| 457 |
+
return "Safety Assessment"
|
| 458 |
+
elif any(keyword in section_name_lower for keyword in ["document", "record", "certificate"]):
|
| 459 |
+
return "Documentation"
|
| 460 |
+
elif any(keyword in section_name_lower for keyword in ["final", "overall", "summary"]):
|
| 461 |
+
return "Final Assessment"
|
| 462 |
else:
|
| 463 |
return "General Inspection"
|
| 464 |
|
|
|
|
| 489 |
chunks = text_splitter.split_text(text)
|
| 490 |
documents = []
|
| 491 |
|
| 492 |
+
# Update complexity in metadata based on parameters
|
| 493 |
+
if metadata.get("checklist_attributes"):
|
| 494 |
+
attrs = json.loads(metadata["checklist_attributes"])
|
| 495 |
+
if len(parameters) < 10:
|
| 496 |
+
attrs["complexity"] = "simple"
|
| 497 |
+
elif len(parameters) < 25:
|
| 498 |
+
attrs["complexity"] = "standard"
|
| 499 |
+
else:
|
| 500 |
+
attrs["complexity"] = "comprehensive"
|
| 501 |
+
metadata["checklist_attributes"] = json.dumps(attrs)
|
| 502 |
+
|
| 503 |
for i, chunk in enumerate(chunks):
|
| 504 |
# Enrich metadata with structural information
|
| 505 |
chunk_metadata = metadata.copy()
|
|
|
|
| 512 |
"parameter_types": ", ".join(set([p["parameter_type"] for p in parameters])),
|
| 513 |
"input_methods": ", ".join(set([p["input_method"] for p in parameters])),
|
| 514 |
"section_types": ", ".join(set([s["section_type"] for s in sections]))
|
| 515 |
+
})
|
| 516 |
|
| 517 |
documents.append({
|
| 518 |
"text": chunk,
|
|
|
|
| 548 |
pdf_path = Path(pdf_path)
|
| 549 |
file_hash = self.get_file_hash(pdf_path)
|
| 550 |
filename = pdf_path.name
|
| 551 |
+
|
| 552 |
# Check if already processed
|
| 553 |
if filename in self.manifest["processed_files"]:
|
| 554 |
if self.manifest["processed_files"][filename]["hash"] == file_hash:
|
| 555 |
print(f"Skipping {filename} - already processed")
|
| 556 |
return
|
| 557 |
+
|
| 558 |
print(f"Processing checklist: {filename}...")
|
|
|
|
| 559 |
try:
|
| 560 |
# Load PDF content
|
| 561 |
loader = PyPDFLoader(str(pdf_path))
|
| 562 |
pages = loader.load()
|
| 563 |
+
|
| 564 |
# Combine all pages
|
| 565 |
full_text = ""
|
| 566 |
for i, page in enumerate(pages):
|
| 567 |
full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
|
| 568 |
+
|
| 569 |
# If text is too short, use OCR
|
| 570 |
if len(full_text.strip()) < 100:
|
| 571 |
print(f"Using OCR for {filename}")
|
| 572 |
ocr_text = self.ocr_pdf(pdf_path)
|
| 573 |
if len(ocr_text) > len(full_text):
|
| 574 |
full_text = ocr_text
|
| 575 |
+
|
| 576 |
# Extract document metadata
|
| 577 |
doc_metadata = self.extract_document_metadata(pdf_path, full_text)
|
| 578 |
+
|
| 579 |
# Extract parameters and sections
|
| 580 |
parameters, sections = self.extract_checklist_parameters(full_text)
|
| 581 |
+
|
| 582 |
# Create base metadata for chunks
|
| 583 |
metadata = {
|
| 584 |
"source": filename,
|
| 585 |
"document_type": doc_metadata["document_type"],
|
| 586 |
"product_name": doc_metadata["product_name"],
|
| 587 |
"supplier_name": doc_metadata["supplier_name"],
|
| 588 |
+
"checklist_attributes": doc_metadata["checklist_attributes"],
|
| 589 |
"file_hash": file_hash,
|
| 590 |
"processed_date": datetime.now().isoformat(),
|
| 591 |
+
"domain": "Quality Control" # Generic domain
|
| 592 |
}
|
| 593 |
+
|
| 594 |
# Create chunks
|
| 595 |
documents = self.create_chunks(full_text, metadata, parameters, sections)
|
| 596 |
+
|
| 597 |
# Generate embeddings and store in ChromaDB
|
| 598 |
for i, doc in enumerate(documents):
|
| 599 |
embedding = self.embedder.encode(doc["text"]).tolist()
|
|
|
|
| 600 |
self.collection.add(
|
| 601 |
documents=[doc["text"]],
|
| 602 |
embeddings=[embedding],
|
| 603 |
metadatas=[doc["metadata"]],
|
| 604 |
ids=[f"{file_hash}_{i}"]
|
| 605 |
)
|
| 606 |
+
|
| 607 |
# Store metadata in SQLite
|
| 608 |
self.save_document_metadata(file_hash, filename, doc_metadata, len(parameters))
|
| 609 |
self.save_parameters(file_hash, parameters)
|
| 610 |
self.save_sections(file_hash, sections)
|
| 611 |
+
|
| 612 |
# Update manifest
|
| 613 |
self.manifest["processed_files"][filename] = {
|
| 614 |
"hash": file_hash,
|
|
|
|
| 618 |
"parameters_extracted": len(parameters),
|
| 619 |
"sections_extracted": len(sections),
|
| 620 |
"document_type": doc_metadata["document_type"],
|
| 621 |
+
"product_name": doc_metadata["product_name"],
|
| 622 |
+
"attributes": doc_metadata["checklist_attributes"]
|
| 623 |
}
|
| 624 |
self.save_manifest()
|
| 625 |
+
|
| 626 |
# Log success
|
| 627 |
self.log_processing(filename, file_hash, "SUCCESS", None, len(parameters), len(sections))
|
| 628 |
+
|
| 629 |
print(f"Successfully processed {filename}")
|
| 630 |
print(f" - Document Type: {doc_metadata['document_type']}")
|
| 631 |
print(f" - Product: {doc_metadata['product_name']}")
|
| 632 |
print(f" - Parameters extracted: {len(parameters)}")
|
| 633 |
print(f" - Sections extracted: {len(sections)}")
|
| 634 |
+
|
| 635 |
except Exception as e:
|
| 636 |
error_msg = str(e)
|
| 637 |
print(f"Error processing {filename}: {error_msg}")
|
|
|
|
| 639 |
traceback.print_exc()
|
| 640 |
self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
|
| 641 |
|
| 642 |
+
|
| 643 |
def save_document_metadata(self, file_hash, filename, metadata, total_parameters):
|
| 644 |
+
"""Save document metadata to SQLite"""
|
| 645 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 646 |
+
cursor = conn.cursor()
|
| 647 |
+
|
| 648 |
+
try:
|
| 649 |
+
cursor.execute("""
|
| 650 |
+
INSERT OR REPLACE INTO checklist_documents
|
| 651 |
+
(file_hash, filename, document_type, product_name, supplier_name,
|
| 652 |
+
checklist_attributes, total_parameters)
|
| 653 |
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
| 654 |
+
""", (
|
| 655 |
+
file_hash, filename, metadata["document_type"], metadata["product_name"],
|
| 656 |
+
metadata["supplier_name"], metadata["checklist_attributes"], total_parameters
|
| 657 |
+
))
|
| 658 |
+
conn.commit()
|
| 659 |
+
finally:
|
| 660 |
+
conn.close()
|
| 661 |
+
|
| 662 |
def save_parameters(self, file_hash, parameters):
|
| 663 |
+
"""Save extracted parameters to SQLite"""
|
| 664 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 665 |
+
cursor = conn.cursor()
|
| 666 |
+
|
| 667 |
+
try:
|
| 668 |
+
# Delete existing parameters for this file
|
| 669 |
+
cursor.execute("DELETE FROM checklist_parameters WHERE file_hash = ?", (file_hash,))
|
| 670 |
+
|
| 671 |
+
# Insert new parameters
|
| 672 |
+
for param in parameters:
|
| 673 |
+
cursor.execute("""
|
| 674 |
+
INSERT INTO checklist_parameters
|
| 675 |
+
(file_hash, parameter_name, parameter_type, input_method, specifications,
|
| 676 |
+
options_list, tolerance_limits, measurement_units, section_category,
|
| 677 |
+
parameter_order, has_remarks, is_mandatory)
|
| 678 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 679 |
+
""", (
|
| 680 |
+
file_hash, param["parameter_name"], param["parameter_type"],
|
| 681 |
+
param["input_method"], param["specifications"], param["options_list"],
|
| 682 |
+
param["tolerance_limits"], param["measurement_units"],
|
| 683 |
+
param["section_category"], param["parameter_order"],
|
| 684 |
+
param["has_remarks"], param["is_mandatory"]
|
| 685 |
+
))
|
| 686 |
+
|
| 687 |
+
conn.commit()
|
| 688 |
+
finally:
|
| 689 |
+
conn.close()
|
| 690 |
+
|
| 691 |
def save_sections(self, file_hash, sections):
|
| 692 |
+
"""Save extracted sections to SQLite"""
|
| 693 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 694 |
+
cursor = conn.cursor()
|
| 695 |
+
|
| 696 |
+
try:
|
| 697 |
+
# Delete existing sections for this file
|
| 698 |
+
cursor.execute("DELETE FROM checklist_sections WHERE file_hash = ?", (file_hash,))
|
| 699 |
+
|
| 700 |
+
# Insert new sections
|
| 701 |
+
for section in sections:
|
| 702 |
+
cursor.execute("""
|
| 703 |
+
INSERT INTO checklist_sections
|
| 704 |
+
(file_hash, section_name, section_type, section_order, parameter_count)
|
| 705 |
+
VALUES (?, ?, ?, ?, ?)
|
| 706 |
+
""", (
|
| 707 |
+
file_hash, section["section_name"], section["section_type"],
|
| 708 |
+
section["section_order"], section["parameter_count"]
|
| 709 |
+
))
|
| 710 |
+
|
| 711 |
+
conn.commit()
|
| 712 |
+
finally:
|
| 713 |
+
conn.close()
|
| 714 |
+
|
| 715 |
def log_processing(self, filename, file_hash, status, error_message, parameters_extracted=0, sections_extracted=0):
|
| 716 |
+
"""Log processing status"""
|
| 717 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 718 |
+
cursor = conn.cursor()
|
| 719 |
+
|
| 720 |
+
try:
|
| 721 |
+
cursor.execute("""
|
| 722 |
+
INSERT INTO processing_log
|
| 723 |
+
(filename, file_hash, status, error_message, parameters_extracted, sections_extracted)
|
| 724 |
+
VALUES (?, ?, ?, ?, ?, ?)
|
| 725 |
+
""", (filename, file_hash, status, error_message, parameters_extracted, sections_extracted))
|
| 726 |
+
|
| 727 |
+
conn.commit()
|
| 728 |
+
finally:
|
| 729 |
+
conn.close()
|
| 730 |
+
|
| 731 |
def process_all_pdfs(self):
|
| 732 |
+
"""Process all PDFs in the directory"""
|
| 733 |
+
pdf_files = list(self.pdf_path.glob("*.pdf"))
|
| 734 |
+
|
| 735 |
+
if not pdf_files:
|
| 736 |
+
print(f"No PDF files found in {self.pdf_path}")
|
| 737 |
+
return
|
| 738 |
+
|
| 739 |
+
print(f"Found {len(pdf_files)} checklist PDF files")
|
| 740 |
+
|
| 741 |
+
for pdf_file in pdf_files:
|
| 742 |
+
self.process_pdf(pdf_file)
|
| 743 |
+
|
| 744 |
+
print("\nChecklist processing complete!")
|
| 745 |
+
print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
|
| 746 |
+
|
| 747 |
def get_processing_stats(self):
|
| 748 |
+
"""Get processing statistics"""
|
| 749 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 750 |
+
cursor = conn.cursor()
|
| 751 |
+
|
| 752 |
+
try:
|
| 753 |
+
# Get overall stats
|
| 754 |
+
cursor.execute("""
|
| 755 |
+
SELECT COUNT(*) as total,
|
| 756 |
+
SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
|
| 757 |
+
SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
|
| 758 |
+
SUM(parameters_extracted) as total_parameters,
|
| 759 |
+
SUM(sections_extracted) as total_sections
|
| 760 |
+
FROM processing_log
|
| 761 |
+
""")
|
| 762 |
+
|
| 763 |
+
stats = cursor.fetchone()
|
| 764 |
+
|
| 765 |
+
# Get document type distribution
|
| 766 |
+
cursor.execute("""
|
| 767 |
+
SELECT document_type, COUNT(*) as count
|
| 768 |
+
FROM checklist_documents
|
| 769 |
+
GROUP BY document_type
|
| 770 |
+
ORDER BY count DESC
|
| 771 |
+
""")
|
| 772 |
+
|
| 773 |
+
doc_types = cursor.fetchall()
|
| 774 |
+
|
| 775 |
+
# Get attribute distribution
|
| 776 |
+
cursor.execute("""
|
| 777 |
+
SELECT checklist_attributes, COUNT(*) as count
|
| 778 |
+
FROM checklist_documents
|
| 779 |
+
WHERE checklist_attributes IS NOT NULL
|
| 780 |
+
GROUP BY checklist_attributes
|
| 781 |
+
""")
|
| 782 |
+
|
| 783 |
+
attr_dist = cursor.fetchall()
|
| 784 |
+
|
| 785 |
+
# Parse attributes for summary
|
| 786 |
+
attribute_summary = {}
|
| 787 |
+
for attrs_json, count in attr_dist:
|
| 788 |
+
if attrs_json:
|
| 789 |
+
try:
|
| 790 |
+
attrs = json.loads(attrs_json)
|
| 791 |
+
for key, value in attrs.items():
|
| 792 |
+
if key not in attribute_summary:
|
| 793 |
+
attribute_summary[key] = {}
|
| 794 |
+
if value not in attribute_summary[key]:
|
| 795 |
+
attribute_summary[key][value] = 0
|
| 796 |
+
attribute_summary[key][value] += count
|
| 797 |
+
except:
|
| 798 |
+
pass
|
| 799 |
+
|
| 800 |
+
# Get parameter type distribution
|
| 801 |
+
cursor.execute("""
|
| 802 |
+
SELECT input_method, COUNT(*) as count
|
| 803 |
+
FROM checklist_parameters
|
| 804 |
+
GROUP BY input_method
|
| 805 |
+
ORDER BY count DESC
|
| 806 |
+
""")
|
| 807 |
+
|
| 808 |
+
input_methods = cursor.fetchall()
|
| 809 |
+
|
| 810 |
+
# Get most common parameters
|
| 811 |
+
cursor.execute("""
|
| 812 |
+
SELECT parameter_name, parameter_type, input_method, COUNT(*) as frequency
|
| 813 |
+
FROM checklist_parameters
|
| 814 |
+
GROUP BY parameter_name, parameter_type, input_method
|
| 815 |
+
HAVING frequency > 1
|
| 816 |
+
ORDER BY frequency DESC
|
| 817 |
+
LIMIT 10
|
| 818 |
+
""")
|
| 819 |
+
|
| 820 |
+
common_params = cursor.fetchall()
|
| 821 |
+
|
| 822 |
+
return {
|
| 823 |
+
"total_processed": stats[0],
|
| 824 |
+
"successful": stats[1],
|
| 825 |
+
"errors": stats[2],
|
| 826 |
+
"total_parameters": stats[3],
|
| 827 |
+
"total_sections": stats[4],
|
| 828 |
+
"document_types": dict(doc_types),
|
| 829 |
+
"input_methods": dict(input_methods),
|
| 830 |
+
"attribute_summary": attribute_summary,
|
| 831 |
+
"common_parameters": [
|
| 832 |
+
{
|
| 833 |
+
"name": p[0],
|
| 834 |
+
"type": p[1],
|
| 835 |
+
"input_method": p[2],
|
| 836 |
+
"frequency": p[3]
|
| 837 |
+
} for p in common_params
|
| 838 |
+
]
|
| 839 |
+
}
|
| 840 |
+
finally:
|
| 841 |
+
conn.close()
|
| 842 |
+
|
| 843 |
def get_parameter_patterns(self):
|
| 844 |
+
"""Get common parameter patterns for AI reference"""
|
| 845 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 846 |
+
cursor = conn.cursor()
|
| 847 |
+
|
| 848 |
+
try:
|
| 849 |
+
cursor.execute("""
|
| 850 |
+
SELECT
|
| 851 |
+
parameter_type,
|
| 852 |
+
input_method,
|
| 853 |
+
GROUP_CONCAT(DISTINCT specifications) as common_specs,
|
| 854 |
+
GROUP_CONCAT(DISTINCT options_list) as common_options,
|
| 855 |
+
COUNT(*) as usage_count
|
| 856 |
+
FROM checklist_parameters
|
| 857 |
+
WHERE specifications != '' OR options_list != ''
|
| 858 |
+
GROUP BY parameter_type, input_method
|
| 859 |
+
ORDER BY usage_count DESC
|
| 860 |
+
""")
|
| 861 |
+
|
| 862 |
+
patterns = []
|
| 863 |
+
for row in cursor.fetchall():
|
| 864 |
+
patterns.append({
|
| 865 |
+
"parameter_type": row[0],
|
| 866 |
+
"input_method": row[1],
|
| 867 |
+
"common_specifications": row[2],
|
| 868 |
+
"common_options": row[3],
|
| 869 |
+
"usage_count": row[4]
|
| 870 |
+
})
|
| 871 |
+
|
| 872 |
+
return patterns
|
| 873 |
+
finally:
|
| 874 |
+
conn.close()
|
| 875 |
+
|
| 876 |
def search_similar_checklists(self, product_name, checklist_type="", limit=5):
|
| 877 |
+
"""Search for similar checklists based on product and type"""
|
| 878 |
+
query_text = f"{product_name} {checklist_type} quality control inspection checklist"
|
| 879 |
+
query_embedding = self.embedder.encode(query_text).tolist()
|
| 880 |
+
|
| 881 |
+
try:
|
| 882 |
+
results = self.collection.query(
|
| 883 |
+
query_embeddings=[query_embedding],
|
| 884 |
+
n_results=limit,
|
| 885 |
+
where={"domain": "Quality Control"}
|
| 886 |
+
)
|
| 887 |
+
|
| 888 |
+
similar_checklists = []
|
| 889 |
+
if results['documents'][0]:
|
| 890 |
+
for i, doc in enumerate(results['documents'][0]):
|
| 891 |
+
metadata = results['metadatas'][0][i]
|
| 892 |
+
|
| 893 |
+
# Parse attributes
|
| 894 |
+
attrs = {}
|
| 895 |
+
if metadata.get('checklist_attributes'):
|
| 896 |
+
try:
|
| 897 |
+
attrs = json.loads(metadata['checklist_attributes'])
|
| 898 |
+
except:
|
| 899 |
+
pass
|
| 900 |
+
|
| 901 |
+
similar_checklists.append({
|
| 902 |
+
"document": metadata.get('source', 'Unknown'),
|
| 903 |
+
"product": metadata.get('product_name', 'Unknown'),
|
| 904 |
+
"type": metadata.get('document_type', 'Unknown'),
|
| 905 |
+
"attributes": attrs,
|
| 906 |
+
"parameters": metadata.get('total_parameters', 0),
|
| 907 |
+
"relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0,
|
| 908 |
+
"content_preview": doc[:200] + "..." if len(doc) > 200 else doc
|
| 909 |
+
})
|
| 910 |
+
|
| 911 |
+
return similar_checklists
|
| 912 |
+
except Exception as e:
|
| 913 |
+
print(f"Error searching checklists: {str(e)}")
|
| 914 |
+
return []
|
| 915 |
|
| 916 |
|
| 917 |
def main():
|
| 918 |
+
"""Main function to create/update the checklist examples database"""
|
| 919 |
+
print("Starting Generic Checklist Examples Database Creation...")
|
| 920 |
+
print("Features: No company bias, dynamic attributes, universal patterns")
|
| 921 |
+
|
| 922 |
+
# Initialize database
|
| 923 |
+
db = ChecklistExamplesVDB()
|
| 924 |
+
|
| 925 |
+
# Process all PDFs
|
| 926 |
+
db.process_all_pdfs()
|
| 927 |
+
|
| 928 |
+
# Show processing stats
|
| 929 |
+
print("\n" + "="*60)
|
| 930 |
+
print("PROCESSING STATISTICS (Generic)")
|
| 931 |
+
print("="*60)
|
| 932 |
+
|
| 933 |
+
stats = db.get_processing_stats()
|
| 934 |
+
print(f"Total files processed: {stats['total_processed']}")
|
| 935 |
+
print(f"Successful: {stats['successful']}")
|
| 936 |
+
print(f"Errors: {stats['errors']}")
|
| 937 |
+
print(f"Total parameters extracted: {stats['total_parameters']}")
|
| 938 |
+
print(f"Total sections extracted: {stats['total_sections']}")
|
| 939 |
+
|
| 940 |
+
print("\nDocument Types:")
|
| 941 |
+
for doc_type, count in stats["document_types"].items():
|
| 942 |
+
print(f" - {doc_type}: {count} documents")
|
| 943 |
+
|
| 944 |
+
print("\nDynamic Attributes Found:")
|
| 945 |
+
for attr_type, values in stats["attribute_summary"].items():
|
| 946 |
+
print(f"\n{attr_type}:")
|
| 947 |
+
for value, count in values.items():
|
| 948 |
+
print(f" - {value}: {count} documents")
|
| 949 |
+
|
| 950 |
+
print("\nInput Methods Distribution:")
|
| 951 |
+
for method, count in stats["input_methods"].items():
|
| 952 |
+
print(f" - {method}: {count} parameters")
|
| 953 |
+
|
| 954 |
+
print("\nMost Common Parameters (Generic):")
|
| 955 |
+
for param in stats["common_parameters"]:
|
| 956 |
+
print(f" - {param['name']} ({param['input_method']}) - used {param['frequency']} times")
|
| 957 |
+
|
| 958 |
+
# Show parameter patterns
|
| 959 |
+
print("\n" + "="*60)
|
| 960 |
+
print("PARAMETER PATTERNS DISCOVERED")
|
| 961 |
+
print("="*60)
|
| 962 |
+
|
| 963 |
+
patterns = db.get_parameter_patterns()
|
| 964 |
+
for pattern in patterns[:10]: # Show top 10 patterns
|
| 965 |
+
print(f"\n{pattern['parameter_type']} -> {pattern['input_method']}")
|
| 966 |
+
print(f" Usage: {pattern['usage_count']} times")
|
| 967 |
+
if pattern['common_specifications']:
|
| 968 |
+
specs = pattern['common_specifications'][:100]
|
| 969 |
+
print(f" Common specs: {specs}{'...' if len(pattern['common_specifications']) > 100 else ''}")
|
| 970 |
+
if pattern['common_options']:
|
| 971 |
+
options = pattern['common_options'][:100]
|
| 972 |
+
print(f" Common options: {options}{'...' if len(pattern['common_options']) > 100 else ''}")
|
| 973 |
+
|
| 974 |
+
# Test search functionality
|
| 975 |
+
print("\n" + "="*60)
|
| 976 |
+
print("TESTING SEARCH FUNCTIONALITY")
|
| 977 |
+
print("="*60)
|
| 978 |
+
|
| 979 |
+
test_products = ["Quality Inspection", "Production Check", "Safety Assessment"]
|
| 980 |
+
for product in test_products:
|
| 981 |
+
print(f"\nSearching for '{product}' checklists:")
|
| 982 |
+
similar = db.search_similar_checklists(product, limit=3)
|
| 983 |
+
for i, checklist in enumerate(similar, 1):
|
| 984 |
+
print(f" {i}. {checklist['document']} ({checklist['type']})")
|
| 985 |
+
print(f" Product: {checklist['product']}, Parameters: {checklist['parameters']}")
|
| 986 |
+
if checklist['attributes']:
|
| 987 |
+
print(f" Attributes: {checklist['attributes']}")
|
| 988 |
+
print(f" Relevance: {checklist['relevance_score']:.3f}")
|
| 989 |
|
| 990 |
|
| 991 |
if __name__ == "__main__":
|
| 992 |
+
main()
|
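The checklist script above stores the new dynamic attributes as a JSON string in a single TEXT column (checklist_attributes), written with json.dumps in extract_document_metadata and decoded with json.loads when get_processing_stats aggregates them. Below is a minimal, self-contained sketch of that storage pattern; the three-column table and the sample values are made up for illustration and are not code from the repository.

import json
import sqlite3

# Illustrative sketch (assumed table layout, not the repo's full schema):
# dynamic attributes live in one TEXT column as a JSON blob instead of a fixed category field.
conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE checklist_documents (
        file_hash TEXT PRIMARY KEY,
        filename TEXT,
        checklist_attributes TEXT  -- JSON string, e.g. {"inspection_stage": "incoming"}
    )
""")

attrs = {"inspection_stage": "incoming", "inspection_focus": "visual", "complexity": "simple"}
conn.execute(
    "INSERT OR REPLACE INTO checklist_documents VALUES (?, ?, ?)",
    ("abc123", "receiving_checklist.pdf", json.dumps(attrs)),
)

# Reading back mirrors get_processing_stats(): decode each blob and tally values per key.
summary = {}
for (attrs_json,) in conn.execute("SELECT checklist_attributes FROM checklist_documents"):
    for key, value in json.loads(attrs_json).items():
        summary.setdefault(key, {}).setdefault(value, 0)
        summary[key][value] += 1
print(summary)

Keeping the attributes as an opaque JSON blob lets new attribute keys appear without schema changes, at the cost of not being able to filter on individual keys directly in SQL.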
vector_stores/chroma_db/product_specs/create_product_spec_db.py
CHANGED
|
@@ -48,6 +48,7 @@ class ProductSpecificationVectorDB:
|
|
| 48 |
conn = sqlite3.connect(self.metadata_db_path)
|
| 49 |
cursor = conn.cursor()
|
| 50 |
|
|
|
|
| 51 |
cursor.execute("""
|
| 52 |
CREATE TABLE IF NOT EXISTS product_documents (
|
| 53 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -56,7 +57,7 @@ class ProductSpecificationVectorDB:
|
|
| 56 |
product_name TEXT,
|
| 57 |
brand TEXT,
|
| 58 |
supplier TEXT,
|
| 59 |
-
|
| 60 |
shelf_life TEXT,
|
| 61 |
storage_conditions TEXT,
|
| 62 |
manufacturing_location TEXT,
|
|
@@ -136,12 +137,12 @@ class ProductSpecificationVectorDB:
|
|
| 136 |
return hashlib.md5(f.read()).hexdigest()
|
| 137 |
|
| 138 |
def extract_product_metadata(self, text):
|
| 139 |
-
"""Extract product-specific metadata"""
|
| 140 |
metadata = {
|
| 141 |
"product_name": "",
|
| 142 |
"brand": "",
|
| 143 |
"supplier": "",
|
| 144 |
-
"
|
| 145 |
"shelf_life": "",
|
| 146 |
"storage_conditions": "",
|
| 147 |
"manufacturing_location": "",
|
|
@@ -156,16 +157,16 @@ class ProductSpecificationVectorDB:
|
|
| 156 |
metadata["product_name"] = re.search(r'Product\s*Name[:]*\s*(.+)', line, re.IGNORECASE).group(1).strip()
|
| 157 |
break
|
| 158 |
|
| 159 |
-
# Extract brand
|
| 160 |
brand_patterns = [
|
| 161 |
r'Brand[:]*\s*(.+)',
|
| 162 |
-
r'
|
| 163 |
r'Company[:]*\s*(.+)'
|
| 164 |
]
|
| 165 |
for pattern in brand_patterns:
|
| 166 |
match = re.search(pattern, text, re.IGNORECASE)
|
| 167 |
if match:
|
| 168 |
-
metadata["brand"] = match.group(1).strip()
|
| 169 |
break
|
| 170 |
|
| 171 |
# Extract shelf life
|
|
@@ -192,21 +193,56 @@ class ProductSpecificationVectorDB:
|
|
| 192 |
metadata["storage_conditions"] = match.group(1).strip()
|
| 193 |
break
|
| 194 |
|
| 195 |
-
#
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
}
|
| 203 |
|
| 204 |
text_lower = text.lower()
|
| 205 |
-
for
|
| 206 |
if any(keyword in text_lower for keyword in keywords):
|
| 207 |
-
|
| 208 |
break
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
return metadata
|
| 211 |
|
| 212 |
def extract_parameters(self, text):
|
|
@@ -276,30 +312,30 @@ class ProductSpecificationVectorDB:
|
|
| 276 |
return "Text Input"
|
| 277 |
|
| 278 |
def classify_parameter_category(self, param_name):
|
| 279 |
-
"""Classify parameter into categories"""
|
| 280 |
param_lower = param_name.lower()
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
|
| 298 |
def is_critical_parameter(self, param_name):
|
| 299 |
"""Determine if parameter is critical for safety/quality"""
|
| 300 |
critical_keywords = [
|
| 301 |
"temperature", "microbiological", "pathogen", "salmonella", "listeria",
|
| 302 |
-
"foreign", "contamination", "allergen", "
|
| 303 |
]
|
| 304 |
return any(keyword in param_name.lower() for keyword in critical_keywords)
|
| 305 |
|
|
@@ -310,10 +346,11 @@ class ProductSpecificationVectorDB:
|
|
| 310 |
|
| 311 |
# Look for table headers
|
| 312 |
table_indicators = [
|
| 313 |
-
"
|
| 314 |
-
"
|
| 315 |
-
"
|
| 316 |
-
"
|
|
|
|
| 317 |
]
|
| 318 |
|
| 319 |
in_table = False
|
|
@@ -340,7 +377,7 @@ class ProductSpecificationVectorDB:
|
|
| 340 |
|
| 341 |
if len(param_name) > 3 and param_name not in ["PARAMETERS", "ACCEPTED LIMIT"]:
|
| 342 |
param_type = self.classify_parameter_type(param_name, value, unit)
|
| 343 |
-
category =
|
| 344 |
|
| 345 |
parameters.append({
|
| 346 |
"parameter_name": param_name,
|
|
@@ -389,50 +426,38 @@ class ProductSpecificationVectorDB:
|
|
| 389 |
return nutritional_data
|
| 390 |
|
| 391 |
def extract_compliance_standards(self, text):
|
| 392 |
-
"""Extract compliance standards and certifications"""
|
| 393 |
standards = []
|
| 394 |
|
| 395 |
-
#
|
| 396 |
standard_patterns = [
|
| 397 |
-
r'(
|
| 398 |
-
r'(
|
| 399 |
-
r'(
|
| 400 |
-
r'(
|
| 401 |
-
r'(FDA)',
|
| 402 |
-
r'(SASO\s*Standard)',
|
| 403 |
-
r'(EU\s*Regulation)',
|
| 404 |
-
r'(AOAC)',
|
| 405 |
-
r'(Codex\s*Alimentarius)'
|
| 406 |
]
|
| 407 |
|
| 408 |
for pattern in standard_patterns:
|
| 409 |
matches = re.finditer(pattern, text, re.IGNORECASE)
|
| 410 |
for match in matches:
|
| 411 |
-
|
| 412 |
|
| 413 |
-
#
|
| 414 |
-
if
|
| 415 |
-
standard_name = "
|
| 416 |
-
compliance_type = "
|
| 417 |
-
elif "ISO" in standard_code:
|
| 418 |
-
standard_name = "International Organization for Standardization"
|
| 419 |
-
compliance_type = "International Standard"
|
| 420 |
-
elif "HACCP" in standard_code:
|
| 421 |
-
standard_name = "Hazard Analysis Critical Control Points"
|
| 422 |
-
compliance_type = "Food Safety System"
|
| 423 |
-
elif "HALAL" in standard_code:
|
| 424 |
-
standard_name = "Halal Certification"
|
| 425 |
-
compliance_type = "Religious Compliance"
|
| 426 |
else:
|
| 427 |
-
standard_name =
|
| 428 |
-
compliance_type = "
|
| 429 |
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
| 436 |
|
| 437 |
return standards
|
| 438 |
|
|
@@ -456,8 +481,14 @@ class ProductSpecificationVectorDB:
|
|
| 456 |
|
| 457 |
# Add product context to searchable content
|
| 458 |
searchable_content = f"Product: {metadata.get('product_name', 'Unknown')}\n"
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
documents.append({
|
| 463 |
"text": searchable_content,
|
|
@@ -511,7 +542,7 @@ class ProductSpecificationVectorDB:
|
|
| 511 |
"source": filename,
|
| 512 |
"product_name": product_metadata["product_name"],
|
| 513 |
"brand": product_metadata["brand"],
|
| 514 |
-
"
|
| 515 |
"shelf_life": product_metadata["shelf_life"],
|
| 516 |
"storage_conditions": product_metadata["storage_conditions"],
|
| 517 |
"file_hash": file_hash,
|
|
@@ -546,7 +577,8 @@ class ProductSpecificationVectorDB:
|
|
| 546 |
"processed_date": datetime.now().isoformat(),
|
| 547 |
"product_name": product_metadata["product_name"],
|
| 548 |
"parameters_extracted": len(parameters),
|
| 549 |
-
"compliance_standards": len(compliance_standards)
|
|
|
|
| 550 |
}
|
| 551 |
self.save_manifest()
|
| 552 |
|
|
@@ -568,13 +600,14 @@ class ProductSpecificationVectorDB:
|
|
| 568 |
try:
|
| 569 |
cursor.execute("""
|
| 570 |
INSERT OR REPLACE INTO product_documents
|
| 571 |
-
(file_hash, filename, product_name, brand, supplier,
|
| 572 |
shelf_life, storage_conditions, manufacturing_location, document_type)
|
| 573 |
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 574 |
""", (
|
| 575 |
file_hash, filename,
|
| 576 |
metadata["product_name"], metadata["brand"], metadata["supplier"],
|
| 577 |
-
metadata["
|
|
|
|
| 578 |
metadata["storage_conditions"], metadata["manufacturing_location"],
|
| 579 |
metadata["document_type"]
|
| 580 |
))
|
|
@@ -615,97 +648,162 @@ class ProductSpecificationVectorDB:
             cursor.execute("DELETE FROM nutritional_info WHERE file_hash = ?", (file_hash,))

             for nutrition in nutritional_data:
-                cursor.execute("""
-                    …
             conn.commit()
         finally:
             conn.close()
-
     def save_compliance_standards(self, file_hash, standards):
-        …
     def log_processing(self, filename, file_hash, status, error_message, params_count=0, standards_count=0):
-        …
     def ocr_pdf(self, pdf_path):
-        …
     def process_all_pdfs(self):
-        …


 def main():
-    …


 if __name__ == "__main__":
-    …
         conn = sqlite3.connect(self.metadata_db_path)
         cursor = conn.cursor()

+        # UPDATED: Added product_attributes instead of fixed category
         cursor.execute("""
             CREATE TABLE IF NOT EXISTS product_documents (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 …
                 product_name TEXT,
                 brand TEXT,
                 supplier TEXT,
+                product_attributes TEXT, -- Dynamic attributes instead of category
                 shelf_life TEXT,
                 storage_conditions TEXT,
                 manufacturing_location TEXT,
             return hashlib.md5(f.read()).hexdigest()

     def extract_product_metadata(self, text):
+        """Extract product-specific metadata without forcing categories"""
         metadata = {
             "product_name": "",
             "brand": "",
             "supplier": "",
+            "product_attributes": {},  # Dynamic attributes
             "shelf_life": "",
             "storage_conditions": "",
             "manufacturing_location": "",
             …
                 metadata["product_name"] = re.search(r'Product\s*Name[:]*\s*(.+)', line, re.IGNORECASE).group(1).strip()
                 break

+        # Extract brand (generic)
         brand_patterns = [
             r'Brand[:]*\s*(.+)',
+            r'Manufacturer[:]*\s*(.+)',
             r'Company[:]*\s*(.+)'
         ]
         for pattern in brand_patterns:
             match = re.search(pattern, text, re.IGNORECASE)
             if match:
+                metadata["brand"] = match.group(1).strip()
                 break

         # Extract shelf life
         …
                 metadata["storage_conditions"] = match.group(1).strip()
                 break

+        # UPDATED: Extract dynamic product attributes instead of fixed categories
+        attributes = {}
+
+        # Temperature requirements
+        temp_match = re.search(r'(?:stored?|kept?|maintain(?:ed)?)\s*at\s*([-\d]+\s*[°]?[CF])', text, re.IGNORECASE)
+        if temp_match:
+            attributes["temperature_requirement"] = temp_match.group(1)
+
+        # Processing method
+        processing_keywords = {
+            "frozen": ["frozen", "freeze", "iqf", "individually quick frozen"],
+            "fresh": ["fresh", "chilled", "refrigerated"],
+            "dried": ["dried", "dehydrated", "dry"],
+            "canned": ["canned", "tinned", "preserved"],
+            "fried": ["fried", "deep fried", "oil fried"],
+            "baked": ["baked", "oven", "bakery"],
+            "raw": ["raw", "uncooked", "unprocessed"],
+            "cooked": ["cooked", "pre-cooked", "ready to eat"]
         }

         text_lower = text.lower()
+        for method, keywords in processing_keywords.items():
             if any(keyword in text_lower for keyword in keywords):
+                attributes["processing_method"] = method
                 break

+        # Product form
+        form_keywords = {
+            "powder": ["powder", "powdered"],
+            "liquid": ["liquid", "juice", "syrup"],
+            "solid": ["solid", "whole", "pieces"],
+            "paste": ["paste", "puree"],
+            "granular": ["granular", "granules"]
+        }
+
+        for form, keywords in form_keywords.items():
+            if any(keyword in text_lower for keyword in keywords):
+                attributes["product_form"] = form
+                break
+
+        # Special characteristics
+        if any(word in text_lower for word in ["organic", "natural", "no preservatives"]):
+            attributes["special_characteristics"] = "natural/organic"
+        if any(word in text_lower for word in ["halal", "kosher"]):
+            attributes["certification"] = "religious compliance"
+        if any(word in text_lower for word in ["gluten free", "allergen free"]):
+            attributes["dietary"] = "allergen-free"
+
+        metadata["product_attributes"] = json.dumps(attributes)
+
         return metadata

     def extract_parameters(self, text):
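
For reference, a minimal standalone sketch (not part of the commit) of the kind of JSON blob the dynamic-attribute extraction above ends up storing in product_attributes; the sample text is hypothetical and only the temperature regex is copied from the diff.

    import json
    import re

    sample_text = "IQF frozen shrimp, halal certified, to be stored at -18 °C."

    attributes = {}
    # Same temperature pattern as in extract_product_metadata above
    temp_match = re.search(r'(?:stored?|kept?|maintain(?:ed)?)\s*at\s*([-\d]+\s*[°]?[CF])', sample_text, re.IGNORECASE)
    if temp_match:
        attributes["temperature_requirement"] = temp_match.group(1)
    if "frozen" in sample_text.lower():
        attributes["processing_method"] = "frozen"
    if "halal" in sample_text.lower():
        attributes["certification"] = "religious compliance"

    print(json.dumps(attributes))
    # Roughly: {"temperature_requirement": "-18 \u00b0C", "processing_method": "frozen", "certification": "religious compliance"}
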
         return "Text Input"

     def classify_parameter_category(self, param_name):
+        """Classify parameter into categories dynamically"""
         param_lower = param_name.lower()

+        # Dynamic categorization based on parameter nature
+        if any(word in param_lower for word in ["weight", "size", "dimension", "length", "width"]):
+            return "Physical Measurement"
+        elif any(word in param_lower for word in ["appearance", "color", "texture", "taste", "flavor"]):
+            return "Sensory Attribute"
+        elif any(word in param_lower for word in ["bacteria", "microb", "pathogen", "coli"]):
+            return "Microbiological"
+        elif any(word in param_lower for word in ["moisture", "fat", "protein", "ph", "acid"]):
+            return "Chemical Composition"
+        elif any(word in param_lower for word in ["foreign", "contamination", "hazard"]):
+            return "Safety Parameter"
+        elif any(word in param_lower for word in ["temperature", "thermal"]):
+            return "Temperature Control"
+        else:
+            return "Quality Parameter"

     def is_critical_parameter(self, param_name):
         """Determine if parameter is critical for safety/quality"""
         critical_keywords = [
             "temperature", "microbiological", "pathogen", "salmonella", "listeria",
+            "foreign", "contamination", "allergen", "critical"
         ]
         return any(keyword in param_name.lower() for keyword in critical_keywords)

         # Look for table headers
         table_indicators = [
+            "SPECIFICATIONS",
+            "PARAMETERS",
+            "CHARACTERISTICS",
+            "REQUIREMENTS",
+            "LIMITS"
         ]

         in_table = False
         …
                 if len(param_name) > 3 and param_name not in ["PARAMETERS", "ACCEPTED LIMIT"]:
                     param_type = self.classify_parameter_type(param_name, value, unit)
+                    category = self.classify_parameter_category(param_name)

                     parameters.append({
                         "parameter_name": param_name,
         return nutritional_data

     def extract_compliance_standards(self, text):
+        """Extract compliance standards and certifications generically"""
         standards = []

+        # Generic standard patterns
         standard_patterns = [
+            r'(?:complies?\s*with|as\s*per|according\s*to)\s*([A-Z]+\s*\d+[:/]?\d*)',
+            r'(?:standard|specification)\s*:?\s*([A-Z]+\s*\d+[:/]?\d*)',
+            r'(?:certified|certification)\s*:?\s*([A-Za-z\s]+)',
+            r'([A-Z]{2,}\s*\d+(?::\d+)?)',  # Generic standard format
         ]

         for pattern in standard_patterns:
             matches = re.finditer(pattern, text, re.IGNORECASE)
             for match in matches:
+                standard_ref = match.group(1).strip()

+                # Generic classification
+                if re.match(r'^[A-Z]{2,4}\s*\d+', standard_ref):
+                    standard_name = "Industry Standard"
+                    compliance_type = "Technical Standard"
                 else:
+                    standard_name = standard_ref
+                    compliance_type = "Certification"

+                # Avoid duplicates
+                if not any(s["standard_code"] == standard_ref for s in standards):
+                    standards.append({
+                        "standard_name": standard_name,
+                        "standard_code": standard_ref,
+                        "compliance_type": compliance_type,
+                        "requirements": ""
+                    })

         return standards
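
As a quick illustration (a sketch of expected behaviour, not asserted by the commit), the last and most generic pattern in the list above is the one that picks bare standard codes out of free text:

    import re

    sentence = "The product complies with ISO 22000:2018 and GSO 2055."
    print(re.findall(r'([A-Z]{2,}\s*\d+(?::\d+)?)', sentence))
    # ['ISO 22000:2018', 'GSO 2055']
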

         # Add product context to searchable content
         searchable_content = f"Product: {metadata.get('product_name', 'Unknown')}\n"
+
+        # Add dynamic attributes
+        if metadata.get('product_attributes'):
+            attrs = json.loads(metadata['product_attributes'])
+            if attrs:
+                searchable_content += f"Attributes: {', '.join(f'{k}={v}' for k, v in attrs.items())}\n"
+
+        searchable_content += f"\n{chunk}"

         documents.append({
             "text": searchable_content,
         …
             "source": filename,
             "product_name": product_metadata["product_name"],
             "brand": product_metadata["brand"],
+            "product_attributes": product_metadata["product_attributes"],  # Dynamic attributes
             "shelf_life": product_metadata["shelf_life"],
             "storage_conditions": product_metadata["storage_conditions"],
             "file_hash": file_hash,
         …
             "processed_date": datetime.now().isoformat(),
             "product_name": product_metadata["product_name"],
             "parameters_extracted": len(parameters),
+            "compliance_standards": len(compliance_standards),
+            "attributes": product_metadata["product_attributes"]
         }
         self.save_manifest()
         …
         try:
             cursor.execute("""
                 INSERT OR REPLACE INTO product_documents
+                (file_hash, filename, product_name, brand, supplier, product_attributes,
                 shelf_life, storage_conditions, manufacturing_location, document_type)
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             """, (
                 file_hash, filename,
                 metadata["product_name"], metadata["brand"], metadata["supplier"],
+                metadata["product_attributes"],  # Dynamic attributes as JSON
+                metadata["shelf_life"],
                 metadata["storage_conditions"], metadata["manufacturing_location"],
                 metadata["document_type"]
             ))
             cursor.execute("DELETE FROM nutritional_info WHERE file_hash = ?", (file_hash,))

             for nutrition in nutritional_data:
+                cursor.execute("""INSERT INTO nutritional_info
+                    (file_hash, nutrient_name, value_per_100g, daily_value_percent)
+                    VALUES (?, ?, ?, ?)
+                """, (
+                    file_hash, nutrition["nutrient_name"],
+                    nutrition["value_per_100g"], nutrition["daily_value_percent"]
+                ))
+
             conn.commit()
         finally:
             conn.close()
+
     def save_compliance_standards(self, file_hash, standards):
+        """Save compliance standards to SQLite"""
+        conn = sqlite3.connect(self.metadata_db_path)
+        cursor = conn.cursor()
+
+        try:
+            cursor.execute("DELETE FROM compliance_standards WHERE file_hash = ?", (file_hash,))
+
+            for standard in standards:
+                cursor.execute("""
+                    INSERT INTO compliance_standards
+                    (file_hash, standard_name, standard_code, compliance_type, requirements)
+                    VALUES (?, ?, ?, ?, ?)
+                """, (
+                    file_hash, standard["standard_name"], standard["standard_code"],
+                    standard["compliance_type"], standard["requirements"]
+                ))
+
+            conn.commit()
+        finally:
+            conn.close()
+
     def log_processing(self, filename, file_hash, status, error_message, params_count=0, standards_count=0):
+        """Log processing results"""
+        conn = sqlite3.connect(self.metadata_db_path)
+        cursor = conn.cursor()
+
+        try:
+            cursor.execute("""
+                INSERT INTO processing_log
+                (filename, file_hash, status, error_message, parameters_extracted, compliance_standards_extracted)
+                VALUES (?, ?, ?, ?, ?, ?)
+            """, (filename, file_hash, status, error_message, params_count, standards_count))
+
+            conn.commit()
+        finally:
+            conn.close()
+
     def ocr_pdf(self, pdf_path):
+        """OCR fallback for scanned PDFs"""
+        try:
+            images = pdf2image.convert_from_path(pdf_path)
+            full_text = ""
+
+            for i, image in enumerate(images):
+                text = pytesseract.image_to_string(image)
+                full_text += f"\n--- Page {i+1} ---\n{text}"
+
+            return full_text
+        except Exception as e:
+            print(f"OCR error: {e}")
+            return ""
+
     def process_all_pdfs(self):
+        """Process all product specification PDFs"""
+        pdf_files = list(self.pdf_path.glob("*.pdf"))
+
+        if not pdf_files:
+            print(f"No PDF files found in {self.pdf_path}")
+            return
+
+        print(f"Found {len(pdf_files)} product specification files")
+
+        for pdf_file in pdf_files:
+            self.process_pdf(pdf_file)
+
+        print(f"Product specification VDB creation complete!")
+
+    def get_processing_stats(self):
+        """Get processing statistics"""
+        conn = sqlite3.connect(self.metadata_db_path)
+        cursor = conn.cursor()
+
+        try:
+            # Overall stats
+            cursor.execute("""
+                SELECT COUNT(*) as total,
+                       SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
+                       SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
+                       SUM(parameters_extracted) as total_parameters
+                FROM processing_log
+            """)
+
+            stats = cursor.fetchone()
+
+            # Get attribute distribution
+            cursor.execute("""
+                SELECT product_attributes, COUNT(*) as count
+                FROM product_documents
+                WHERE product_attributes IS NOT NULL
+                GROUP BY product_attributes
+            """)
+
+            attribute_dist = cursor.fetchall()
+
+            # Parse attributes to get summary
+            attribute_summary = {}
+            for attrs_json, count in attribute_dist:
+                if attrs_json:
+                    try:
+                        attrs = json.loads(attrs_json)
+                        for key, value in attrs.items():
+                            if key not in attribute_summary:
+                                attribute_summary[key] = {}
+                            if value not in attribute_summary[key]:
+                                attribute_summary[key][value] = 0
+                            attribute_summary[key][value] += count
+                    except:
+                        pass
+
+            return {
+                "total_processed": stats[0] or 0,
+                "successful": stats[1] or 0,
+                "errors": stats[2] or 0,
+                "total_parameters": stats[3] or 0,
+                "attribute_summary": attribute_summary
+            }
+        finally:
+            conn.close()


 def main():
+    """Main function"""
+    print("Creating Product Specification Vector Database...")
+    print("Features: Dynamic product attributes, no fixed categories")
+
+    db = ProductSpecificationVectorDB()
+    db.process_all_pdfs()
+
+    # Show stats
+    stats = db.get_processing_stats()
+    print(f"\n📊 Processing Statistics:")
+    print(f"Total files: {stats['total_processed']}")
+    print(f"Successful: {stats['successful']}")
+    print(f"Total parameters: {stats['total_parameters']}")
+
+    print(f"\n🏷️ Dynamic Product Attributes Found:")
+    for attr_type, values in stats['attribute_summary'].items():
+        print(f"\n{attr_type}:")
+        for value, count in values.items():
+            print(f"  - {value}: {count} products")
+
+    print("\nProduct Specification VDB ready!")


 if __name__ == "__main__":
+    main()
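
Because product_attributes is stored as a JSON string, downstream code has to decode it before filtering. A small helper along these lines (hypothetical; the table and column names follow the schema in the diff above, and the database path is an assumption) shows one way to query it back out:

    import json
    import sqlite3

    def products_with_attribute(db_path, key, value):
        """Return product names whose stored product_attributes JSON maps key to value."""
        conn = sqlite3.connect(db_path)
        try:
            rows = conn.execute(
                "SELECT product_name, product_attributes FROM product_documents "
                "WHERE product_attributes IS NOT NULL"
            ).fetchall()
        finally:
            conn.close()
        matches = []
        for name, attrs_json in rows:
            try:
                attrs = json.loads(attrs_json)
            except (TypeError, ValueError):
                continue
            if attrs.get(key) == value:
                matches.append(name)
        return matches

    # e.g. products_with_attribute("product_metadata.db", "processing_method", "frozen")
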
vector_stores/chroma_db/regulatory_docs/create_regulatory_db.py
CHANGED
|
@@ -1,578 +1,3 @@
|
|
| 1 |
-
# import os
|
| 2 |
-
# import json
|
| 3 |
-
# import sqlite3
|
| 4 |
-
# from datetime import datetime
|
| 5 |
-
# from pathlib import Path
|
| 6 |
-
# import chromadb
|
| 7 |
-
# from chromadb import Settings
|
| 8 |
-
# from langchain_community.document_loaders import PyPDFLoader
|
| 9 |
-
# from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
-
# from sentence_transformers import SentenceTransformer
|
| 11 |
-
# import pytesseract
|
| 12 |
-
# from PIL import Image
|
| 13 |
-
# import pdf2image
|
| 14 |
-
# import hashlib
|
| 15 |
-
# import re
|
| 16 |
-
|
| 17 |
-
# class RegulatoryGuidelinesDB:
|
| 18 |
-
# def __init__(self, base_path="./vector_stores"):
|
| 19 |
-
# self.base_path = Path(base_path)
|
| 20 |
-
# self.pdf_path = self.base_path / "regulatory_guidelines" / "pdfs"
|
| 21 |
-
# self.chroma_path = self.base_path / "chroma_db" / "regulatory_docs"
|
| 22 |
-
# self.metadata_path = self.chroma_path / "metadata"
|
| 23 |
-
# self.manifest_path = self.metadata_path / "manifest.json"
|
| 24 |
-
# self.metadata_db_path = self.metadata_path / "regulatory_metadata.db"
|
| 25 |
-
|
| 26 |
-
# # Create directories
|
| 27 |
-
# self.pdf_path.mkdir(parents=True, exist_ok=True)
|
| 28 |
-
# self.chroma_path.mkdir(parents=True, exist_ok=True)
|
| 29 |
-
# self.metadata_path.mkdir(parents=True, exist_ok=True)
|
| 30 |
-
|
| 31 |
-
# # Initialize ChromaDB
|
| 32 |
-
# self.client = chromadb.PersistentClient(
|
| 33 |
-
# path=str(self.chroma_path),
|
| 34 |
-
# settings=Settings(anonymized_telemetry=False)
|
| 35 |
-
# )
|
| 36 |
-
|
| 37 |
-
# # Get or create collection
|
| 38 |
-
# self.collection = self.client.get_or_create_collection(
|
| 39 |
-
# name="regulatory_guidelines",
|
| 40 |
-
# metadata={"description": "Regulatory guidelines and standards for QC"}
|
| 41 |
-
# )
|
| 42 |
-
|
| 43 |
-
# # Initialize embedding model
|
| 44 |
-
# self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 45 |
-
|
| 46 |
-
# # Initialize metadata database
|
| 47 |
-
# self.init_metadata_db()
|
| 48 |
-
|
| 49 |
-
# # Load manifest
|
| 50 |
-
# self.manifest = self.load_manifest()
|
| 51 |
-
|
| 52 |
-
# def init_metadata_db(self):
|
| 53 |
-
# """Initialize SQLite database for storing regulatory metadata"""
|
| 54 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 55 |
-
# cursor = conn.cursor()
|
| 56 |
-
|
| 57 |
-
# cursor.execute("""
|
| 58 |
-
# CREATE TABLE IF NOT EXISTS regulatory_documents (
|
| 59 |
-
# id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 60 |
-
# file_hash TEXT UNIQUE NOT NULL,
|
| 61 |
-
# filename TEXT NOT NULL,
|
| 62 |
-
# regulatory_body TEXT,
|
| 63 |
-
# standard_type TEXT,
|
| 64 |
-
# standard_code TEXT,
|
| 65 |
-
# publication_date TEXT,
|
| 66 |
-
# effective_date TEXT,
|
| 67 |
-
# jurisdiction TEXT,
|
| 68 |
-
# industry TEXT,
|
| 69 |
-
# extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
| 70 |
-
# )
|
| 71 |
-
# """)
|
| 72 |
-
|
| 73 |
-
# cursor.execute("""
|
| 74 |
-
# CREATE TABLE IF NOT EXISTS processing_log (
|
| 75 |
-
# id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 76 |
-
# filename TEXT NOT NULL,
|
| 77 |
-
# file_hash TEXT NOT NULL,
|
| 78 |
-
# processed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
| 79 |
-
# status TEXT,
|
| 80 |
-
# error_message TEXT,
|
| 81 |
-
# text_length INTEGER,
|
| 82 |
-
# chunk_count INTEGER
|
| 83 |
-
# )
|
| 84 |
-
# """)
|
| 85 |
-
|
| 86 |
-
# cursor.execute("""
|
| 87 |
-
# CREATE TABLE IF NOT EXISTS key_topics (
|
| 88 |
-
# id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 89 |
-
# file_hash TEXT NOT NULL,
|
| 90 |
-
# topic TEXT NOT NULL,
|
| 91 |
-
# relevance_score REAL,
|
| 92 |
-
# FOREIGN KEY (file_hash) REFERENCES regulatory_documents(file_hash)
|
| 93 |
-
# )
|
| 94 |
-
# """)
|
| 95 |
-
|
| 96 |
-
# conn.commit()
|
| 97 |
-
# conn.close()
|
| 98 |
-
|
| 99 |
-
# def load_manifest(self):
|
| 100 |
-
# """Load processing manifest"""
|
| 101 |
-
# if self.manifest_path.exists():
|
| 102 |
-
# with open(self.manifest_path, 'r') as f:
|
| 103 |
-
# return json.load(f)
|
| 104 |
-
# return {"processed_files": {}, "last_updated": None}
|
| 105 |
-
|
| 106 |
-
# def save_manifest(self):
|
| 107 |
-
# """Save processing manifest"""
|
| 108 |
-
# self.manifest["last_updated"] = datetime.now().isoformat()
|
| 109 |
-
# with open(self.manifest_path, 'w') as f:
|
| 110 |
-
# json.dump(self.manifest, f, indent=2)
|
| 111 |
-
|
| 112 |
-
# def get_file_hash(self, filepath):
|
| 113 |
-
# """Generate hash for file to track changes"""
|
| 114 |
-
# with open(filepath, 'rb') as f:
|
| 115 |
-
# return hashlib.md5(f.read()).hexdigest()
|
| 116 |
-
|
| 117 |
-
# def extract_metadata_from_pdf(self, pdf_path, text_content):
|
| 118 |
-
# """Extract regulatory metadata from PDF"""
|
| 119 |
-
# metadata = {
|
| 120 |
-
# "regulatory_body": "Unknown",
|
| 121 |
-
# "standard_type": "Document",
|
| 122 |
-
# "standard_code": "",
|
| 123 |
-
# "publication_date": "",
|
| 124 |
-
# "effective_date": "",
|
| 125 |
-
# "jurisdiction": "General",
|
| 126 |
-
# "industry": "General"
|
| 127 |
-
# }
|
| 128 |
-
|
| 129 |
-
# # Extract regulatory body
|
| 130 |
-
# regulatory_bodies = {
|
| 131 |
-
# "Dubai Municipality": ["dubai municipality", "dm ", "بلدية دبي", "@dmunicipality", "food safety department"],
|
| 132 |
-
# "HACCP": ["haccp", "hazard analysis"],
|
| 133 |
-
# "ISO": ["iso ", "international organization"],
|
| 134 |
-
# "GSO": ["gso ", "gcc standardization", "gulf standard"],
|
| 135 |
-
# "FDA": ["fda", "food and drug administration"],
|
| 136 |
-
# "ESMA": ["esma", "emirates authority for standardization", "emirates standardisation"],
|
| 137 |
-
# "SASO": ["saso", "saudi standards"],
|
| 138 |
-
# "UAE Ministry": ["uae ministry", "ministry of", "الإمارات العربية المتحدة", "ministry of environment and water"],
|
| 139 |
-
# "Federal Law": ["federal law", "uae law", "united arab emirates law"],
|
| 140 |
-
# "DHA": ["dubai health authority", "dha"],
|
| 141 |
-
# "Ministry of Health": ["ministry of health and prevention", "mohp"]
|
| 142 |
-
# }
|
| 143 |
-
|
| 144 |
-
# text_lower = text_content.lower()
|
| 145 |
-
# for body, patterns in regulatory_bodies.items():
|
| 146 |
-
# if any(pattern in text_lower for pattern in patterns):
|
| 147 |
-
# metadata["regulatory_body"] = body
|
| 148 |
-
# break
|
| 149 |
-
|
| 150 |
-
# # Extract standard code (e.g., ISO 22000, GSO 2055)
|
| 151 |
-
# standard_patterns = [
|
| 152 |
-
# r"(ISO\s*\d+(?::\d+)?)",
|
| 153 |
-
# r"(GSO\s*\d+(?:/\d+)?)",
|
| 154 |
-
# r"(HACCP\s*(?:Rev\s*\d+)?)",
|
| 155 |
-
# r"(DM[-/]\d+)",
|
| 156 |
-
# r"(ESMA\s*\d+)",
|
| 157 |
-
# r"(FDA\s*\d+)",
|
| 158 |
-
# r"(Edition\s*\d+)", # For Dubai Municipality documents
|
| 159 |
-
# r"(Federal Law No\.\s*\d+\s*of\s*\d+)", # For UAE Federal Laws
|
| 160 |
-
# r"(Circular\s*(?:No\.)?\s*\d+)",
|
| 161 |
-
# ]
|
| 162 |
-
|
| 163 |
-
# for pattern in standard_patterns:
|
| 164 |
-
# matches = re.findall(pattern, text_content, re.IGNORECASE)
|
| 165 |
-
# if matches:
|
| 166 |
-
# metadata["standard_code"] = matches[0]
|
| 167 |
-
# break
|
| 168 |
-
|
| 169 |
-
# # Extract publication/effective dates
|
| 170 |
-
# date_patterns = [
|
| 171 |
-
# r"(?:publication|published|issue)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
|
| 172 |
-
# r"(?:effective|validity)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
|
| 173 |
-
# r"(?:date|dated)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
|
| 174 |
-
# ]
|
| 175 |
-
|
| 176 |
-
# for i, pattern in enumerate(date_patterns):
|
| 177 |
-
# matches = re.findall(pattern, text_content, re.IGNORECASE)
|
| 178 |
-
# if matches:
|
| 179 |
-
# if i == 0:
|
| 180 |
-
# metadata["publication_date"] = matches[0]
|
| 181 |
-
# elif i == 1:
|
| 182 |
-
# metadata["effective_date"] = matches[0]
|
| 183 |
-
# else:
|
| 184 |
-
# if not metadata["publication_date"]:
|
| 185 |
-
# metadata["publication_date"] = matches[0]
|
| 186 |
-
|
| 187 |
-
# # Extract jurisdiction
|
| 188 |
-
# jurisdictions = {
|
| 189 |
-
# "UAE": ["uae", "united arab emirates", "الإمارات"],
|
| 190 |
-
# "Dubai": ["dubai", "دبي"],
|
| 191 |
-
# "GCC": ["gcc", "gulf cooperation council", "مجلس التعاون الخليجي"],
|
| 192 |
-
# "International": ["international", "global"],
|
| 193 |
-
# }
|
| 194 |
-
|
| 195 |
-
# for jurisdiction, patterns in jurisdictions.items():
|
| 196 |
-
# if any(pattern in text_lower for pattern in patterns):
|
| 197 |
-
# metadata["jurisdiction"] = jurisdiction
|
| 198 |
-
# break
|
| 199 |
-
|
| 200 |
-
# # Determine industry/domain
|
| 201 |
-
# industry_keywords = {
|
| 202 |
-
# "Food": ["food", "beverage", "nutrition", "edible", "consumption"],
|
| 203 |
-
# "Pharmaceutical": ["pharmaceutical", "drug", "medicine", "pharma"],
|
| 204 |
-
# "Cosmetics": ["cosmetic", "beauty", "personal care"],
|
| 205 |
-
# "Medical Devices": ["medical device", "medical equipment"],
|
| 206 |
-
# "General Manufacturing": ["manufacturing", "production", "industrial"]
|
| 207 |
-
# }
|
| 208 |
-
|
| 209 |
-
# for industry, keywords in industry_keywords.items():
|
| 210 |
-
# if any(keyword in text_lower for keyword in keywords):
|
| 211 |
-
# metadata["industry"] = industry
|
| 212 |
-
# break
|
| 213 |
-
|
| 214 |
-
# # If no industry detected, default to Food (since this is for Swift Check)
|
| 215 |
-
# if not metadata["industry"]:
|
| 216 |
-
# metadata["industry"] = "Food"
|
| 217 |
-
|
| 218 |
-
# # Determine standard type
|
| 219 |
-
# if "haccp" in text_lower:
|
| 220 |
-
# metadata["standard_type"] = "Food Safety Management"
|
| 221 |
-
# elif "iso" in text_lower:
|
| 222 |
-
# metadata["standard_type"] = "International Standard"
|
| 223 |
-
# elif "municipal" in text_lower or "municipality" in text_lower:
|
| 224 |
-
# metadata["standard_type"] = "Local Regulation"
|
| 225 |
-
# elif "ministry" in text_lower:
|
| 226 |
-
# metadata["standard_type"] = "Government Regulation"
|
| 227 |
-
# else:
|
| 228 |
-
# metadata["standard_type"] = "Industry Standard"
|
| 229 |
-
|
| 230 |
-
# return metadata
|
| 231 |
-
|
| 232 |
-
# def ocr_pdf(self, pdf_path):
|
| 233 |
-
# """Use OCR to extract text from PDF"""
|
| 234 |
-
# try:
|
| 235 |
-
# # Convert PDF to images
|
| 236 |
-
# images = pdf2image.convert_from_path(pdf_path)
|
| 237 |
-
# full_text = ""
|
| 238 |
-
|
| 239 |
-
# for i, image in enumerate(images):
|
| 240 |
-
# # Perform OCR
|
| 241 |
-
# text = pytesseract.image_to_string(image)
|
| 242 |
-
# full_text += f"\n--- Page {i+1} ---\n{text}"
|
| 243 |
-
|
| 244 |
-
# return full_text
|
| 245 |
-
# except Exception as e:
|
| 246 |
-
# print(f"OCR error: {e}")
|
| 247 |
-
# return ""
|
| 248 |
-
|
| 249 |
-
# def extract_key_topics(self, text):
|
| 250 |
-
# """Extract key regulatory topics from text"""
|
| 251 |
-
# topics = set()
|
| 252 |
-
|
| 253 |
-
# # Define topic patterns
|
| 254 |
-
# topic_patterns = {
|
| 255 |
-
# "Temperature Control": ["temperature", "cold chain", "frozen", "refrigerated", "cooling"],
|
| 256 |
-
# "Packaging Requirements": ["packaging", "labeling", "package", "container"],
|
| 257 |
-
# "Microbiological Standards": ["microbiological", "bacteria", "pathogen", "contamination"],
|
| 258 |
-
# "Chemical Requirements": ["chemical", "pesticide", "residue", "additive", "preservative"],
|
| 259 |
-
# "Traceability": ["traceability", "track", "batch", "lot number"],
|
| 260 |
-
# "Storage Requirements": ["storage", "warehouse", "shelf life"],
|
| 261 |
-
# "Transportation": ["transport", "distribution", "delivery"],
|
| 262 |
-
# "Documentation": ["documentation", "record", "certificate", "report"],
|
| 263 |
-
# "Testing Requirements": ["testing", "analysis", "laboratory", "sample"],
|
| 264 |
-
# "Hygiene Standards": ["hygiene", "sanitation", "cleaning", "disinfection"],
|
| 265 |
-
# "HACCP Principles": ["haccp", "critical control", "hazard analysis"],
|
| 266 |
-
# "Certification": ["certification", "accreditation", "approval", "license"],
|
| 267 |
-
# "Compliance": ["compliance", "conform", "requirement", "specification"],
|
| 268 |
-
# "Quality Management": ["quality management", "qms", "quality system"],
|
| 269 |
-
# "Risk Assessment": ["risk assessment", "risk analysis", "hazard"],
|
| 270 |
-
# # COVID-19 specific topics
|
| 271 |
-
# "COVID-19 Guidelines": ["covid-19", "coronavirus", "pandemic", "quarantine"],
|
| 272 |
-
# "Social Distancing": ["social distancing", "physical distancing", "2 meters"],
|
| 273 |
-
# "PPE Requirements": ["ppe", "personal protective equipment", "masks", "gloves"],
|
| 274 |
-
# "Employee Health": ["employee health", "health screening", "symptoms"],
|
| 275 |
-
# "Disinfection": ["disinfection", "sanitization", "cleaning and disinfection"],
|
| 276 |
-
# # Food specific topics
|
| 277 |
-
# "Food Safety": ["food safety", "food hygiene", "food handling"],
|
| 278 |
-
# "Halal Requirements": ["halal", "islamic", "sharia"],
|
| 279 |
-
# "Allergen Management": ["allergen", "allergy", "contains", "may contain"],
|
| 280 |
-
# "Import/Export": ["import", "export", "customs", "border"]
|
| 281 |
-
# }
|
| 282 |
-
|
| 283 |
-
# text_lower = text.lower()
|
| 284 |
-
|
| 285 |
-
# for topic, keywords in topic_patterns.items():
|
| 286 |
-
# # Calculate relevance score based on keyword frequency
|
| 287 |
-
# count = sum(1 for keyword in keywords if keyword in text_lower)
|
| 288 |
-
# if count > 0:
|
| 289 |
-
# relevance_score = count / len(keywords)
|
| 290 |
-
# topics.add((topic, relevance_score))
|
| 291 |
-
|
| 292 |
-
# return list(topics)
|
| 293 |
-
|
| 294 |
-
# def create_chunks(self, text, metadata):
|
| 295 |
-
# """Create text chunks for vector storage"""
|
| 296 |
-
# text_splitter = RecursiveCharacterTextSplitter(
|
| 297 |
-
# chunk_size=1500, # Larger chunks for regulatory docs
|
| 298 |
-
# chunk_overlap=300,
|
| 299 |
-
# length_function=len,
|
| 300 |
-
# separators=["\n\n", "\n", ". ", " ", ""]
|
| 301 |
-
# )
|
| 302 |
-
|
| 303 |
-
# chunks = text_splitter.split_text(text)
|
| 304 |
-
# documents = []
|
| 305 |
-
|
| 306 |
-
# for i, chunk in enumerate(chunks):
|
| 307 |
-
# doc_metadata = metadata.copy()
|
| 308 |
-
# doc_metadata["chunk_index"] = i
|
| 309 |
-
# doc_metadata["chunk_size"] = len(chunk)
|
| 310 |
-
# doc_metadata["total_chunks"] = len(chunks)
|
| 311 |
-
# documents.append({
|
| 312 |
-
# "text": chunk,
|
| 313 |
-
# "metadata": doc_metadata
|
| 314 |
-
# })
|
| 315 |
-
|
| 316 |
-
# return documents
|
| 317 |
-
|
| 318 |
-
# def process_pdf(self, pdf_path):
|
| 319 |
-
# """Process a single PDF file"""
|
| 320 |
-
# pdf_path = Path(pdf_path)
|
| 321 |
-
# file_hash = self.get_file_hash(pdf_path)
|
| 322 |
-
# filename = pdf_path.name
|
| 323 |
-
|
| 324 |
-
# # Check if already processed
|
| 325 |
-
# if filename in self.manifest["processed_files"]:
|
| 326 |
-
# if self.manifest["processed_files"][filename]["hash"] == file_hash:
|
| 327 |
-
# print(f"Skipping {filename} - already processed")
|
| 328 |
-
# return
|
| 329 |
-
|
| 330 |
-
# print(f"Processing {filename}...")
|
| 331 |
-
|
| 332 |
-
# try:
|
| 333 |
-
# # Load PDF content
|
| 334 |
-
# loader = PyPDFLoader(str(pdf_path))
|
| 335 |
-
# pages = loader.load()
|
| 336 |
-
|
| 337 |
-
# # Combine all pages
|
| 338 |
-
# full_text = ""
|
| 339 |
-
# for i, page in enumerate(pages):
|
| 340 |
-
# full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
|
| 341 |
-
|
| 342 |
-
# # If text is too short, use OCR
|
| 343 |
-
# if len(full_text.strip()) < 100:
|
| 344 |
-
# print(f"Using OCR for {filename}")
|
| 345 |
-
# ocr_text = self.ocr_pdf(pdf_path)
|
| 346 |
-
# if len(ocr_text) > len(full_text):
|
| 347 |
-
# full_text = ocr_text
|
| 348 |
-
|
| 349 |
-
# # Extract regulatory metadata
|
| 350 |
-
# reg_metadata = self.extract_metadata_from_pdf(pdf_path, full_text)
|
| 351 |
-
|
| 352 |
-
# # Extract key topics
|
| 353 |
-
# topics = self.extract_key_topics(full_text)
|
| 354 |
-
|
| 355 |
-
# # Create base metadata for chunks
|
| 356 |
-
# metadata = {
|
| 357 |
-
# "source": filename,
|
| 358 |
-
# "regulatory_body": reg_metadata["regulatory_body"] or "Unknown",
|
| 359 |
-
# "standard_type": reg_metadata["standard_type"] or "Unknown",
|
| 360 |
-
# "standard_code": reg_metadata["standard_code"] or "",
|
| 361 |
-
# "jurisdiction": reg_metadata["jurisdiction"] or "Unknown",
|
| 362 |
-
# "industry": reg_metadata["industry"] or "General",
|
| 363 |
-
# "publication_date": reg_metadata["publication_date"] or "",
|
| 364 |
-
# "effective_date": reg_metadata["effective_date"] or "",
|
| 365 |
-
# "file_hash": file_hash,
|
| 366 |
-
# "processed_date": datetime.now().isoformat(),
|
| 367 |
-
# "topics": ", ".join([topic[0] for topic in topics]) if topics else ""
|
| 368 |
-
# }
|
| 369 |
-
|
| 370 |
-
# # Create chunks
|
| 371 |
-
# documents = self.create_chunks(full_text, metadata)
|
| 372 |
-
|
| 373 |
-
# # Generate embeddings and store in ChromaDB
|
| 374 |
-
# for i, doc in enumerate(documents):
|
| 375 |
-
# embedding = self.embedder.encode(doc["text"]).tolist()
|
| 376 |
-
|
| 377 |
-
# self.collection.add(
|
| 378 |
-
# documents=[doc["text"]],
|
| 379 |
-
# embeddings=[embedding],
|
| 380 |
-
# metadatas=[doc["metadata"]],
|
| 381 |
-
# ids=[f"{file_hash}_{i}"]
|
| 382 |
-
# )
|
| 383 |
-
|
| 384 |
-
# # Store metadata in SQLite
|
| 385 |
-
# self.save_metadata(file_hash, filename, reg_metadata)
|
| 386 |
-
|
| 387 |
-
# # Store topics
|
| 388 |
-
# self.save_topics(file_hash, topics)
|
| 389 |
-
|
| 390 |
-
# # Update manifest
|
| 391 |
-
# self.manifest["processed_files"][filename] = {
|
| 392 |
-
# "hash": file_hash,
|
| 393 |
-
# "processed_date": datetime.now().isoformat(),
|
| 394 |
-
# "chunks": len(documents),
|
| 395 |
-
# "text_length": len(full_text),
|
| 396 |
-
# "regulatory_body": reg_metadata["regulatory_body"],
|
| 397 |
-
# "standard_code": reg_metadata["standard_code"]
|
| 398 |
-
# }
|
| 399 |
-
# self.save_manifest()
|
| 400 |
-
|
| 401 |
-
# # Log success
|
| 402 |
-
# self.log_processing(filename, file_hash, "SUCCESS", None, len(full_text), len(documents))
|
| 403 |
-
|
| 404 |
-
# print(f"Successfully processed {filename}")
|
| 405 |
-
# print(f" - Regulatory Body: {reg_metadata['regulatory_body']}")
|
| 406 |
-
# print(f" - Standard Code: {reg_metadata['standard_code']}")
|
| 407 |
-
# print(f" - Text chunks: {len(documents)}")
|
| 408 |
-
# print(f" - Topics extracted: {len(topics)}")
|
| 409 |
-
|
| 410 |
-
# except Exception as e:
|
| 411 |
-
# error_msg = str(e)
|
| 412 |
-
# print(f"Error processing {filename}: {error_msg}")
|
| 413 |
-
# import traceback
|
| 414 |
-
# traceback.print_exc()
|
| 415 |
-
# self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
|
| 416 |
-
|
| 417 |
-
# def save_metadata(self, file_hash, filename, metadata):
|
| 418 |
-
# """Save regulatory metadata to SQLite"""
|
| 419 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 420 |
-
# cursor = conn.cursor()
|
| 421 |
-
|
| 422 |
-
# try:
|
| 423 |
-
# cursor.execute("""
|
| 424 |
-
# INSERT OR REPLACE INTO regulatory_documents
|
| 425 |
-
# (file_hash, filename, regulatory_body, standard_type, standard_code,
|
| 426 |
-
# publication_date, effective_date, jurisdiction, industry)
|
| 427 |
-
# VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 428 |
-
# """, (
|
| 429 |
-
# file_hash,
|
| 430 |
-
# filename,
|
| 431 |
-
# metadata["regulatory_body"] or "Unknown",
|
| 432 |
-
# metadata["standard_type"] or "Document",
|
| 433 |
-
# metadata["standard_code"] or "",
|
| 434 |
-
# metadata["publication_date"] or "",
|
| 435 |
-
# metadata["effective_date"] or "",
|
| 436 |
-
# metadata["jurisdiction"] or "General",
|
| 437 |
-
# metadata["industry"] or "General"
|
| 438 |
-
# ))
|
| 439 |
-
|
| 440 |
-
# conn.commit()
|
| 441 |
-
# finally:
|
| 442 |
-
# conn.close()
|
| 443 |
-
|
| 444 |
-
# def save_topics(self, file_hash, topics):
|
| 445 |
-
# """Save extracted topics to SQLite"""
|
| 446 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 447 |
-
# cursor = conn.cursor()
|
| 448 |
-
|
| 449 |
-
# try:
|
| 450 |
-
# # Delete existing topics for this file
|
| 451 |
-
# cursor.execute("DELETE FROM key_topics WHERE file_hash = ?", (file_hash,))
|
| 452 |
-
|
| 453 |
-
# # Insert new topics
|
| 454 |
-
# for topic, relevance_score in topics:
|
| 455 |
-
# cursor.execute("""
|
| 456 |
-
# INSERT INTO key_topics
|
| 457 |
-
# (file_hash, topic, relevance_score)
|
| 458 |
-
# VALUES (?, ?, ?)
|
| 459 |
-
# """, (file_hash, topic, relevance_score))
|
| 460 |
-
|
| 461 |
-
# conn.commit()
|
| 462 |
-
# finally:
|
| 463 |
-
# conn.close()
|
| 464 |
-
|
| 465 |
-
# def log_processing(self, filename, file_hash, status, error_message, text_length=0, chunk_count=0):
|
| 466 |
-
# """Log processing status"""
|
| 467 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 468 |
-
# cursor = conn.cursor()
|
| 469 |
-
|
| 470 |
-
# try:
|
| 471 |
-
# cursor.execute("""
|
| 472 |
-
# INSERT INTO processing_log
|
| 473 |
-
# (filename, file_hash, status, error_message, text_length, chunk_count)
|
| 474 |
-
# VALUES (?, ?, ?, ?, ?, ?)
|
| 475 |
-
# """, (filename, file_hash, status, error_message, text_length, chunk_count))
|
| 476 |
-
|
| 477 |
-
# conn.commit()
|
| 478 |
-
# finally:
|
| 479 |
-
# conn.close()
|
| 480 |
-
|
| 481 |
-
# def process_all_pdfs(self):
|
| 482 |
-
# """Process all PDFs in the directory"""
|
| 483 |
-
# pdf_files = list(self.pdf_path.glob("*.pdf"))
|
| 484 |
-
|
| 485 |
-
# if not pdf_files:
|
| 486 |
-
# print(f"No PDF files found in {self.pdf_path}")
|
| 487 |
-
# return
|
| 488 |
-
|
| 489 |
-
# print(f"Found {len(pdf_files)} PDF files")
|
| 490 |
-
|
| 491 |
-
# for pdf_file in pdf_files:
|
| 492 |
-
# self.process_pdf(pdf_file)
|
| 493 |
-
|
| 494 |
-
# print("\nProcessing complete!")
|
| 495 |
-
# print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
|
| 496 |
-
|
| 497 |
-
# def get_processing_stats(self):
|
| 498 |
-
# """Get processing statistics"""
|
| 499 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 500 |
-
# cursor = conn.cursor()
|
| 501 |
-
|
| 502 |
-
# try:
|
| 503 |
-
# # Get overall stats
|
| 504 |
-
# cursor.execute("""
|
| 505 |
-
# SELECT COUNT(*) as total,
|
| 506 |
-
# SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
|
| 507 |
-
# SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors
|
| 508 |
-
# FROM processing_log
|
| 509 |
-
# """)
|
| 510 |
-
|
| 511 |
-
# stats = cursor.fetchone()
|
| 512 |
-
|
| 513 |
-
# # Get regulatory body distribution
|
| 514 |
-
# cursor.execute("""
|
| 515 |
-
# SELECT regulatory_body, COUNT(*) as count
|
| 516 |
-
# FROM regulatory_documents
|
| 517 |
-
# GROUP BY regulatory_body
|
| 518 |
-
# ORDER BY count DESC
|
| 519 |
-
# """)
|
| 520 |
-
|
| 521 |
-
# body_dist = cursor.fetchall()
|
| 522 |
-
|
| 523 |
-
# # Get top topics
|
| 524 |
-
# cursor.execute("""
|
| 525 |
-
# SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance
|
| 526 |
-
# FROM key_topics
|
| 527 |
-
# GROUP BY topic
|
| 528 |
-
# ORDER BY count DESC
|
| 529 |
-
# LIMIT 10
|
| 530 |
-
# """)
|
| 531 |
-
|
| 532 |
-
# top_topics = cursor.fetchall()
|
| 533 |
-
|
| 534 |
-
# return {
|
| 535 |
-
# "total_processed": stats[0],
|
| 536 |
-
# "successful": stats[1],
|
| 537 |
-
# "errors": stats[2],
|
| 538 |
-
# "regulatory_bodies": dict(body_dist),
|
| 539 |
-
# "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2]} for t in top_topics]
|
| 540 |
-
# }
|
| 541 |
-
# finally:
|
| 542 |
-
# conn.close()
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
# def main():
|
| 546 |
-
# """Main function to create/update the regulatory guidelines database"""
|
| 547 |
-
# print("Starting Regulatory Guidelines Database Creation...")
|
| 548 |
-
|
| 549 |
-
# # Initialize database
|
| 550 |
-
# db = RegulatoryGuidelinesDB()
|
| 551 |
-
|
| 552 |
-
# # Process all PDFs
|
| 553 |
-
# db.process_all_pdfs()
|
| 554 |
-
|
| 555 |
-
# # Show processing stats
|
| 556 |
-
# print("\nProcessing Statistics:")
|
| 557 |
-
# stats = db.get_processing_stats()
|
| 558 |
-
# print(f"Total files processed: {stats['total_processed']}")
|
| 559 |
-
# print(f"Successful: {stats['successful']}")
|
| 560 |
-
# print(f"Errors: {stats['errors']}")
|
| 561 |
-
|
| 562 |
-
# print("\nRegulatory Bodies:")
|
| 563 |
-
# for body, count in stats["regulatory_bodies"].items():
|
| 564 |
-
# print(f" - {body}: {count} documents")
|
| 565 |
-
|
| 566 |
-
# print("\nTop Topics:")
|
| 567 |
-
# for topic_data in stats["top_topics"]:
|
| 568 |
-
# print(f" - {topic_data['topic']}: {topic_data['count']} documents (relevance: {topic_data['relevance']:.2f})")
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
# if __name__ == "__main__":
|
| 572 |
-
# main()
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
import os
|
| 577 |
import json
|
| 578 |
import sqlite3
|
|
@@ -612,7 +37,7 @@ class EnhancedRegulatoryVectorDB:
         # Get or create collection
         self.collection = self.client.get_or_create_collection(
             name="regulatory_guidelines",
-            metadata={"description": "…
         )

         # Initialize embedding model

@@ -625,7 +50,7 @@ class EnhancedRegulatoryVectorDB:
         self.manifest = self.load_manifest()

     def init_metadata_db(self):
-        """Initialize …
         conn = sqlite3.connect(self.metadata_db_path)
         cursor = conn.cursor()

@@ -714,25 +139,27 @@ class EnhancedRegulatoryVectorDB:
         return hashlib.md5(f.read()).hexdigest()

     def extract_sections_and_clauses(self, text_content):
-        """…
         sections = []

-        # …
         section_patterns = [
-            # …
-            r'(\d…
-            # …
-            r'(…
-            # …
-            r'(…
-            # Section format: "Section …
-            r'(Section\s+\d…
-            # …
             r'^(\d+\.\s+)([A-Z][^.\n\r]+)',
-            # …
-            r'…
-            # Prerequisites: "7.1. Management Policy"
-            r'^(\d+\.\d+\.\s+)([A-Z][^.\n\r]+)',
         ]

         lines = text_content.split('\n')
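
For context, an illustrative sketch (not part of the commit) of how the numbered-heading pattern kept in the hunk above splits a clause heading into its number and title:

    import re

    m = re.match(r'^(\d+\.\d+\.\s+)([A-Z][^.\n\r]+)', "7.1. Management Policy")
    if m:
        print(m.group(1).strip(), "->", m.group(2))  # 7.1. -> Management Policy
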
@@ -763,7 +190,7 @@ class EnhancedRegulatoryVectorDB:

             if section_title:  # Only add if we have a meaningful title
                 # Determine section level
-                level = section_num.count('.')

                 # Extract content preview (next few lines)
                 preview_lines = []

@@ -789,7 +216,7 @@ class EnhancedRegulatoryVectorDB:
         return sections

     def extract_enhanced_metadata(self, pdf_path, text_content):
-        """…
         metadata = {
             "regulatory_body": "Unknown",
             "standard_type": "Document",

@@ -803,38 +230,49 @@ class EnhancedRegulatoryVectorDB:

         text_lower = text_content.lower()

-        # …
-        …
             break

-        # …
         standard_patterns = [
-            r"…
-            r"…
-            r"…
-            r"…
-            r"Dubai\s*Municipality\s*[-–]\s*Food\s*Control\s*Section",
-            r"Federal\s*Law\s*No\.\s*\d+\s*of\s*\d+",
-            r"Administrative\s*Order\s*No\.\s*\d+/\d+",
         ]

         for pattern in standard_patterns:

@@ -844,18 +282,18 @@ class EnhancedRegulatoryVectorDB:
             break

         # Document structure detection
-        if …
-            metadata["document_structure"] = "…
         elif re.search(r'\d+\.\d+\s+[A-Z]', text_content):
             metadata["document_structure"] = "numbered_sections"
         else:
             metadata["document_structure"] = "flat"

-        # …
         date_patterns = [
-            r"…
-            r"…
-            r"(\d{1,2}…
             r"(\d{4})"
         ]
@@ -865,80 +303,83 @@ class EnhancedRegulatoryVectorDB:
             metadata["publication_date"] = matches[0]
             break

-        # Industry …
-        …

-        …

         return metadata

     def extract_enhanced_topics(self, text, sections):
-        """…
         topics = []

-        # …
         topic_patterns = {
-            "…
-                "keywords": ["…
-                "section_hints": ["…
             },
-            "…
-                "keywords": ["…
-                "section_hints": ["…
-            },
-            "Critical Control Points": {
-                "keywords": ["critical control point", "ccp", "control points", "decision tree"],
-                "section_hints": ["8.8", "ccp", "critical"]
             },
-            …
             "Corrective Actions": {
-                "keywords": ["corrective action", "…
-                "section_hints": ["…
-            },
-            "Monitoring Systems": {
-                "keywords": ["monitoring", "monitoring system", "continuous monitoring"],
-                "section_hints": ["8.10", "monitoring"]
             },
-            …
             }
         }

@@ -1052,7 +493,7 @@ class EnhancedRegulatoryVectorDB:
         return None

     def process_pdf(self, pdf_path):
-        """…
         pdf_path = Path(pdf_path)
         file_hash = self.get_file_hash(pdf_path)
         filename = pdf_path.name

@@ -1278,102 +719,102 @@ class EnhancedRegulatoryVectorDB:
         for pdf_file in pdf_files:
             self.process_pdf(pdf_file)

-        print(f"\n🎯 Processing complete!…
         print(f"📊 Total files in manifest: {len(self.manifest['processed_files'])}")
-
     def get_enhanced_stats(self):
-        …


 def main():
-    …


 if __name__ == "__main__":
-    …
-    # Initialize enhanced database
-    db = EnhancedRegulatoryVectorDB()
-
-    # Process all PDFs
-    db.process_all_pdfs()
-
-    # Show enhanced processing stats
-    print("\n" + "=" * 80)
-    print("📊 ENHANCED PROCESSING STATISTICS:")
-    print("=" * 80)
-
-    stats = db.get_enhanced_stats()
-    print(f"📄 Total files processed: {stats['total_processed']}")
-    print(f"✅ Successful: {stats['successful']}")
-    print(f"❌ Errors: {stats['errors']}")
-    print(f"📑 Total sections extracted: {stats['total_sections']}")
-
-    print(f"\n🏛️ REGULATORY BODIES:")
-    for body, count, sections in stats["regulatory_bodies"]:
-        print(f"  - {body}: {count} documents ({sections} sections)")
-
-    print(f"\n🎯 TOP TOPICS WITH CLAUSE REFERENCES:")
-    for topic_data in stats["top_topics"]:
-        sections_info = topic_data['sections'][:50] + "..." if len(topic_data['sections']) > 50 else topic_data['sections']
-        print(f"  - {topic_data['topic']}: {topic_data['count']} documents")
-        print(f"    └── Relevance: {topic_data['relevance']:.2f} | Sections: {sections_info}")
-
-    print("\n" + "=" * 80)
-    print("🎉 Enhanced Regulatory VDB Creation Complete!")
-    print("🔍 HACCP clause references are now available for the demo")
-    print("📝 The system can now provide:")
-    print("  - Section-specific guidance (e.g., 'Section 7.8 - Temperature Control')")
-    print("  - Clause references for each parameter")
-    print("  - Regulatory body attribution")
-    print("  - Hierarchical document structure awareness")
-    print("=" * 80)
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import sqlite3
|
|
|
|
| 37 |
# Get or create collection
|
| 38 |
self.collection = self.client.get_or_create_collection(
|
| 39 |
name="regulatory_guidelines",
|
| 40 |
+
metadata={"description": "Regulatory guidelines and standards for quality control"}
|
| 41 |
)
|
| 42 |
|
| 43 |
# Initialize embedding model
|
|
|
|
| 50 |
self.manifest = self.load_manifest()
|
| 51 |
|
| 52 |
def init_metadata_db(self):
|
| 53 |
+
"""Initialize SQLite database for storing regulatory metadata"""
|
| 54 |
conn = sqlite3.connect(self.metadata_db_path)
|
| 55 |
cursor = conn.cursor()
|
| 56 |
|
|
|
|
| 139 |
return hashlib.md5(f.read()).hexdigest()
|
| 140 |
|
| 141 |
def extract_sections_and_clauses(self, text_content):
|
| 142 |
+
"""Generic section and clause extraction for regulatory documents"""
|
| 143 |
sections = []
|
| 144 |
|
| 145 |
+
# Generic section patterns that work for any regulatory document
|
| 146 |
section_patterns = [
|
| 147 |
+
# Numbered sections: "1.2.3 Title"
|
| 148 |
+
r'(\d+(?:\.\d+)*\.?\s+)([A-Z][^.\n\r]+)',
|
| 149 |
+
# Lettered sections: "A.1 Title"
|
| 150 |
+
r'([A-Z]\.\d+\s+)([A-Z][^.\n\r]+)',
|
| 151 |
+
# Article format: "Article 1:"
|
| 152 |
+
r'(Article\s+\d+)[\s:]*([^.\n\r]*)',
|
| 153 |
+
# Section format: "Section 1.2"
|
| 154 |
+
r'(Section\s+\d+(?:\.\d+)*)[\s\-–]*([^.\n\r]*)',
|
| 155 |
+
# Chapter format: "Chapter 1"
|
| 156 |
+
r'(Chapter\s+\d+)[\s:]*([^.\n\r]*)',
|
| 157 |
+
# Part format: "Part I"
|
| 158 |
+
r'(Part\s+[IVX]+)[\s:]*([^.\n\r]*)',
|
| 159 |
+
# Simple numbered: "1. Title"
|
| 160 |
r'^(\d+\.\s+)([A-Z][^.\n\r]+)',
|
| 161 |
+
# Annex format: "Annex 1"
|
| 162 |
+
r'(Annex\s+\d+)[\s:]*([^.\n\r]*)',
|
|
|
|
|
|
|
| 163 |
]
|
| 164 |
|
| 165 |
lines = text_content.split('\n')
|
|
|
|
| 190 |
|
| 191 |
if section_title: # Only add if we have a meaningful title
|
| 192 |
# Determine section level
|
| 193 |
+
level = section_num.count('.')
|
| 194 |
|
| 195 |
# Extract content preview (next few lines)
|
| 196 |
preview_lines = []
|
|
|
|
| 216 |
return sections
|
| 217 |
|
| 218 |
def extract_enhanced_metadata(self, pdf_path, text_content):
|
| 219 |
+
"""Generic metadata extraction without bias toward specific standards"""
|
| 220 |
metadata = {
|
| 221 |
"regulatory_body": "Unknown",
|
| 222 |
"standard_type": "Document",
|
|
|
|
@@ lines 230-278 @@

        text_lower = text_content.lower()

+       # UPDATED: Generic regulatory body detection without prioritization
+       # Extract regulatory body from document content
+       regulatory_indicators = [
+           # International standards
+           (r"iso\s*\d+", "ISO"),
+           (r"iec\s*\d+", "IEC"),
+           (r"codex\s+alimentarius", "Codex Alimentarius"),
+           (r"who\s+guidelines", "WHO"),
+           (r"fao\s+standards", "FAO"),
+
+           # Regional standards
+           (r"european\s+union", "EU"),
+           (r"gcc\s+standard", "GCC"),
+           (r"asean\s+standard", "ASEAN"),
+
+           # National standards
+           (r"uae\s+standard", "UAE National"),
+           (r"saudi\s+standard", "Saudi Arabia"),
+           (r"indian\s+standard", "India"),
+
+           # Generic detection
+           (r"ministry\s+of\s+\w+", "Government Ministry"),
+           (r"department\s+of\s+\w+", "Government Department"),
+           (r"authority\s+for\s+\w+", "Regulatory Authority"),
+
+           # Industry standards
+           (r"haccp", "HACCP System"),
+           (r"gmp", "GMP"),
+           (r"gap", "GAP"),
+       ]

+       # Find regulatory body without bias
+       for pattern, body_name in regulatory_indicators:
+           if re.search(pattern, text_lower):
+               metadata["regulatory_body"] = body_name
                break

+       # Extract standard code generically
        standard_patterns = [
+           r"(?:standard|guideline|regulation)\s*(?:no\.|number)?\s*:?\s*(\w+[-/]\d+)",
+           r"document\s*(?:no\.|number)?\s*:?\s*(\w+[-/]\d+)",
+           r"reference\s*:?\s*(\w+[-/]\d+)",
+           r"(\w{2,10}[-/]\d{2,6})",  # Generic code pattern
        ]

        for pattern in standard_patterns:
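One behaviour worth noting in the detection loop above: the first entry in regulatory_indicators that matches anywhere in the text wins, so list order sets the precedence, not where the term appears in the document. A small self-contained illustration with an invented sentence and a trimmed indicator list:

import re

regulatory_indicators = [
    (r"iso\s*\d+", "ISO"),
    (r"gcc\s+standard", "GCC"),
    (r"uae\s+standard", "UAE National"),
]

text_lower = "this uae standard adopts the sampling plan of iso 2859 and the gcc standard for labelling"
regulatory_body = "Unknown"
for pattern, body_name in regulatory_indicators:
    if re.search(pattern, text_lower):
        regulatory_body = body_name
        break

print(regulatory_body)  # "ISO" — first match in the indicator list, not first mention in the text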
@@ lines 282-299 @@
                break

        # Document structure detection
+       if re.search(r'(?:article|chapter|part|annex)\s+\d+', text_lower):
+           metadata["document_structure"] = "hierarchical"
        elif re.search(r'\d+\.\d+\s+[A-Z]', text_content):
            metadata["document_structure"] = "numbered_sections"
        else:
            metadata["document_structure"] = "flat"

+       # Date extraction
        date_patterns = [
+           r"(?:published|issued|effective|dated?)\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
+           r"(?:version|revision)\s*date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
+           r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
            r"(\d{4})"
        ]
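The date patterns are tried from most to least specific; the bare r"(\d{4})" fallback comes last, presumably because it would also match a four-digit standard number. On an invented header line, the first pattern already captures both explicit dates:

import re

pattern = r"(?:published|issued|effective|dated?)\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})"
header = "Issued: 15/03/2023 – effective 01-04-2023".lower()
print(re.findall(pattern, header))  # ['15/03/2023', '01-04-2023']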
@@ lines 303-385 @@
                metadata["publication_date"] = matches[0]
                break

+       # Industry detection - generic
+       industry_indicators = [
+           (["quality", "control", "inspection", "standard"], "Quality Control"),
+           (["manufacturing", "production", "processing"], "Manufacturing"),
+           (["safety", "health", "hygiene"], "Health & Safety"),
+           (["environment", "sustainable", "green"], "Environmental"),
+           (["trade", "commerce", "export", "import"], "Trade & Commerce"),
+       ]

+       for keywords, industry in industry_indicators:
+           if any(keyword in text_lower for keyword in keywords):
+               metadata["industry"] = industry
+               break
+
+       # Jurisdiction detection - generic
+       if any(country in text_lower for country in ["international", "global", "worldwide"]):
+           metadata["jurisdiction"] = "International"
+       elif re.search(r'(?:national|federal|state)\s+(?:standard|regulation)', text_lower):
+           metadata["jurisdiction"] = "National"
+       else:
+           metadata["jurisdiction"] = "General"

        return metadata

    def extract_enhanced_topics(self, text, sections):
+       """Generic topic extraction without bias toward specific frameworks"""
        topics = []

+       # UPDATED: Generic topic patterns applicable to any standard
        topic_patterns = {
+           "Quality Management": {
+               "keywords": ["quality management", "quality system", "quality control", "quality assurance"],
+               "section_hints": ["quality", "management"]
            },
+           "Documentation Requirements": {
+               "keywords": ["documentation", "records", "record keeping", "documents"],
+               "section_hints": ["document", "record"]
            },
+           "Process Control": {
+               "keywords": ["process control", "process monitoring", "control measures"],
+               "section_hints": ["process", "control"]
            },
+           "Verification and Validation": {
+               "keywords": ["verification", "validation", "audit", "review"],
+               "section_hints": ["verification", "validation"]
            },
+           "Training Requirements": {
+               "keywords": ["training", "competence", "qualification", "education"],
+               "section_hints": ["training", "competence"]
            },
            "Corrective Actions": {
+               "keywords": ["corrective action", "preventive action", "non-conformance"],
+               "section_hints": ["corrective", "action"]
            },
+           "Risk Management": {
+               "keywords": ["risk assessment", "risk management", "hazard", "risk analysis"],
+               "section_hints": ["risk", "hazard"]
            },
+           "Monitoring and Measurement": {
+               "keywords": ["monitoring", "measurement", "testing", "inspection"],
+               "section_hints": ["monitoring", "measurement"]
            },
+           "Compliance Requirements": {
+               "keywords": ["compliance", "regulatory", "legal requirements", "statutory"],
+               "section_hints": ["compliance", "regulatory"]
            },
+           "Continuous Improvement": {
+               "keywords": ["improvement", "continual improvement", "enhancement"],
+               "section_hints": ["improvement", "enhance"]
            },
+           "Resource Management": {
+               "keywords": ["resources", "facilities", "equipment", "infrastructure"],
+               "section_hints": ["resource", "facility"]
            },
+           "Communication": {
+               "keywords": ["communication", "reporting", "notification"],
+               "section_hints": ["communication", "report"]
            }
        }
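The matching logic that consumes topic_patterns falls outside the hunks shown in this diff, so the following is only an assumed sketch of how keywords and section_hints could be combined into topic hits; the helper name and the simple "any match counts" rule are invented for illustration:

def sketch_match_topics(text_lower, section_titles, topic_patterns):
    # Hypothetical helper, not part of the committed file: a topic is counted
    # when any keyword occurs in the text or any hint occurs in a section title.
    hits = []
    for topic, cfg in topic_patterns.items():
        keyword_hit = any(k in text_lower for k in cfg["keywords"])
        hint_hit = any(h in title.lower() for title in section_titles for h in cfg["section_hints"])
        if keyword_hit or hint_hit:
            hits.append(topic)
    return hits

# e.g. sketch_match_topics("records shall be retained for two years", ["4.2 Documentation"], topic_patterns)
# -> ["Documentation Requirements"]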
@@ lines 493-499 @@
        return None

    def process_pdf(self, pdf_path):
+       """Generic PDF processing without standard-specific bias"""
        pdf_path = Path(pdf_path)
        file_hash = self.get_file_hash(pdf_path)
        filename = pdf_path.name
@@ lines 719-820 @@
        for pdf_file in pdf_files:
            self.process_pdf(pdf_file)

+       print(f"\n🎯 Processing complete! Generic regulatory VDB ready.")
        print(f"📊 Total files in manifest: {len(self.manifest['processed_files'])}")
+
    def get_enhanced_stats(self):
+       """Get enhanced processing statistics"""
+       conn = sqlite3.connect(self.metadata_db_path)
+       cursor = conn.cursor()
+
+       try:
+           # Overall stats
+           cursor.execute("""
+               SELECT COUNT(*) as total,
+                      SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
+                      SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
+                      SUM(sections_extracted) as total_sections
+               FROM processing_log
+           """)
+
+           stats = cursor.fetchone()
+
+           # Regulatory body distribution
+           cursor.execute("""
+               SELECT regulatory_body, COUNT(*) as count, SUM(total_sections) as sections
+               FROM regulatory_documents
+               GROUP BY regulatory_body
+               ORDER BY count DESC
+           """)
+
+           body_dist = cursor.fetchall()
+
+           # Top topics with clause references
+           cursor.execute("""
+               SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance,
+                      GROUP_CONCAT(DISTINCT section_reference) as sections
+               FROM key_topics
+               GROUP BY topic
+               ORDER BY count DESC
+               LIMIT 10
+           """)
+
+           top_topics = cursor.fetchall()
+
+           return {
+               "total_processed": stats[0] or 0,
+               "successful": stats[1] or 0,
+               "errors": stats[2] or 0,
+               "total_sections": stats[3] or 0,
+               "regulatory_bodies": [(r[0], r[1], r[2]) for r in body_dist],
+               "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2], "sections": t[3]} for t in top_topics]
+           }
+       finally:
+           conn.close()


def main():
+   """Main function to create/update the generic regulatory guidelines database"""
+   print("🚀 Starting Generic Regulatory Guidelines Database Creation...")
+   print("📋 Features: Unbiased extraction, generic standards support, dynamic classification")
+
+   # Initialize enhanced database
+   db = EnhancedRegulatoryVectorDB()
+
+   # Process all PDFs
+   db.process_all_pdfs()
+
+   # Show enhanced processing stats
+   print("\n" + "=" * 80)
+   print("📊 GENERIC PROCESSING STATISTICS:")
+   print("=" * 80)
+
+   stats = db.get_enhanced_stats()
+   print(f"📄 Total files processed: {stats['total_processed']}")
+   print(f"✅ Successful: {stats['successful']}")
+   print(f"❌ Errors: {stats['errors']}")
+   print(f"📑 Total sections extracted: {stats['total_sections']}")
+
+   print(f"\n🏛️ REGULATORY BODIES (No Bias):")
+   for body, count, sections in stats["regulatory_bodies"]:
+       print(f"  - {body}: {count} documents ({sections} sections)")
+
+   print(f"\n🎯 TOP TOPICS (Generic):")
+   for topic_data in stats["top_topics"]:
+       sections_info = topic_data['sections'][:50] + "..." if len(topic_data['sections']) > 50 else topic_data['sections']
+       print(f"  - {topic_data['topic']}: {topic_data['count']} documents")
+       print(f"    └── Relevance: {topic_data['relevance']:.2f} | Sections: {sections_info}")
+
+   print("\n" + "=" * 80)
+   print("🎉 Generic Regulatory VDB Creation Complete!")
+   print("🔍 All regulatory frameworks are treated equally")
+   print("📝 The system can now provide:")
+   print("   - Unbiased regulatory references")
+   print("   - Generic clause citations")
+   print("   - Dynamic standard recognition")
+   print("   - Equal treatment of all frameworks")
+   print("=" * 80)
+

if __name__ == "__main__":
+   main()
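Once the script has been run, the resulting collection can be queried independently of the class. A hedged usage sketch: the persist path and embedding model name are assumptions (only the collection name appears in this diff), and the model must match whatever the script used to embed the documents.

import chromadb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model; the script's actual choice is not shown in these hunks
client = chromadb.PersistentClient(path="vector_stores/chroma_db/regulatory_guidelines")  # assumed path
collection = client.get_collection("regulatory_guidelines")

query = "temperature monitoring requirements during storage"
embedding = model.encode([query]).tolist()
results = collection.query(query_embeddings=embedding, n_results=3)

for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta.get("regulatory_body", "Unknown"), "|", doc[:80])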