yashgori20 committed · Commit 365a20a · 1 Parent(s): dd2978b
vector_stores/chroma_db/checklist_examples/create_industry_db.py CHANGED
@@ -63,7 +63,7 @@ class ChecklistExamplesVDB:
63
  document_type TEXT,
64
  product_name TEXT,
65
  supplier_name TEXT,
66
- checklist_category TEXT,
67
  total_parameters INTEGER DEFAULT 0,
68
  extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
69
  )
@@ -148,23 +148,23 @@ class ChecklistExamplesVDB:
148
  return hashlib.md5(f.read()).hexdigest()
149
 
150
  def extract_document_metadata(self, pdf_path, text_content):
151
- """Extract document metadata from checklist"""
152
  metadata = {
153
  "document_type": "QC Checklist",
154
  "product_name": "",
155
  "supplier_name": "",
156
- "checklist_category": "General Inspection"
157
  }
158
 
159
- # Extract document type
160
  doc_type_patterns = {
161
  "Inspection Record": ["inspection record", "inspection checklist", "quality inspection"],
162
  "Pre-Shipment Inspection": ["pre-shipment", "container inspection", "shipment inspection"],
163
  "Production Checklist": ["production checklist", "manufacturing checklist", "process checklist"],
164
- "Temperature Log": ["temperature", "chiller", "freezer", "thermal"],
165
  "Receiving Inspection": ["receiving", "goods receipt", "incoming inspection"],
166
  "Hygiene Checklist": ["hygiene", "sanitation", "cleaning checklist"],
167
- "HACCP Record": ["haccp", "critical control", "ccp monitoring"]
168
  }
169
 
170
  text_lower = text_content.lower()
@@ -173,11 +173,13 @@ class ChecklistExamplesVDB:
173
  metadata["document_type"] = doc_type
174
  break
175
 
176
- # Extract product name
177
  product_patterns = [
178
  r"product\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
179
  r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*[-–]\s*inspection",
180
- r"(malabar\s*paratha|green\s*peas|sweet\s*corn|vegetable\s*samosa)",
181
  r"product:\s*([^\n]{1,40})",
182
  ]
183
 
@@ -187,12 +189,13 @@ class ChecklistExamplesVDB:
187
  metadata["product_name"] = match.group(1).strip()
188
  break
189
 
190
- # Extract supplier name - look for "Al Kabeer" or similar
191
  supplier_patterns = [
192
  r"supplier\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
193
- r"(al\s*kabeer|alkabeer)",
194
  r"manufacturer\s*[:\-]\s*([^\n]{1,40})",
195
- r"company\s*[:\-]\s*([^\n]{1,40})"
196
  ]
197
 
198
  for pattern in supplier_patterns:
@@ -201,23 +204,38 @@ class ChecklistExamplesVDB:
201
  metadata["supplier_name"] = match.group(1).strip()
202
  break
203
 
204
- # Determine checklist category
205
- category_keywords = {
206
- "Organoleptic Evaluation": ["organoleptic", "sensory", "taste", "aroma", "texture", "appearance"],
207
- "Physical Parameters": ["weight", "dimension", "size", "diameter", "thickness"],
208
- "Temperature Control": ["temperature", "freezer", "chiller", "thermal", "cooling"],
209
- "Packaging Inspection": ["packaging", "sealing", "printing", "carton"],
210
- "Microbiological Testing": ["microbiological", "bacteria", "pathogen", "contamination"],
211
- "Metal Detection": ["metal screening", "metal detection", "fe:", "non-fe", "ss:"],
212
- "Documentation Check": ["batch code", "shelf life", "expiry", "production date"],
213
- "Foreign Object Check": ["foreign particles", "foreign objects", "contamination"]
214
  }
215
 
216
- for category, keywords in category_keywords.items():
217
  if any(keyword in text_lower for keyword in keywords):
218
- metadata["checklist_category"] = category
219
  break
220
 
221
  return metadata
222
 
223
  def extract_checklist_parameters(self, text_content):
@@ -232,23 +250,31 @@ class ChecklistExamplesVDB:
232
  section_order = 0
233
  parameter_order = 0
234
 
235
- # Parameter extraction patterns
236
  param_patterns = [
237
  # Format: "Parameter Name: Type/Method"
238
- r"^([A-Z][^:]+?):\s*(Acceptable\s*/\s*Non-acceptable|To be mentioned|Present\s*/\s*Absent)",
239
  # Format: "Parameter (Spec: value)"
240
- r"^([A-Z][^(]+?)\s*\(Spec:\s*([^)]+)\)",
241
- # Format: "Parameter Name Acceptable / Non-acceptable"
242
- r"^([A-Z][^A]+?)\s+(Acceptable\s*/\s*Non-acceptable)",
243
- # Format: "Parameter: [details]"
244
- r"^([A-Z][^:]+?):\s*(.{1,100})",
245
  ]
246
 
247
  # Section header patterns
248
  section_patterns = [
249
- r"^([A-Z\s]+(?:EVALUATION|DETAILS|REQUIREMENTS|CONTROL|SCREENING))\s*$",
250
  r"^[0-9]+\.\s*([A-Z][^.]+)$",
251
- r"^\*\*([A-Z\s]+)\*\*$"
252
  ]
253
 
254
  for line_idx, line in enumerate(lines):
@@ -257,7 +283,7 @@ class ChecklistExamplesVDB:
257
  match = re.match(pattern, line)
258
  if match:
259
  section_name = match.group(1).strip()
260
- if len(section_name) > 5: # Valid section name
261
  current_section = section_name
262
  section_order += 1
263
  sections.append({
@@ -301,13 +327,13 @@ class ChecklistExamplesVDB:
301
  return parameters, sections
302
 
303
  def analyze_parameter(self, param_name, param_details, current_line, all_lines, line_idx):
304
- """Analyze parameter to determine type, input method, etc."""
305
  param_name_lower = param_name.lower()
306
- param_details_lower = param_details.lower()
307
  context_lines = all_lines[max(0, line_idx-2):min(len(all_lines), line_idx+3)]
308
  context_text = " ".join(context_lines).lower()
309
 
310
- # Determine parameter type based on name and context
311
  parameter_type = "Quality Check"
312
  input_method = "Text Input"
313
  specifications = ""
@@ -316,43 +342,51 @@ class ChecklistExamplesVDB:
316
  measurement_units = ""
317
  has_remarks = False
318
 
319
- # Input method determination
320
- if any(keyword in param_details_lower for keyword in ["acceptable / non-acceptable", "acceptable/non-acceptable"]):
321
- input_method = "Dropdown"
322
- options_list = "Acceptable, Non-acceptable"
323
  parameter_type = "Quality Assessment"
324
 
325
- elif any(keyword in param_details_lower for keyword in ["present / absent", "present/absent"]):
326
  input_method = "Toggle"
327
- options_list = "Present, Absent"
328
  parameter_type = "Presence Check"
329
 
330
- elif "to be mentioned" in param_details_lower:
331
- if any(unit in param_name_lower for unit in ["temperature", "weight", "time", "diameter", "length"]):
332
  input_method = "Numeric Input"
333
  parameter_type = "Measurement"
334
  else:
335
  input_method = "Text Input"
336
  parameter_type = "Information Entry"
337
 
338
- elif any(keyword in param_name_lower for keyword in ["photo", "attach", "image", "picture"]):
339
  input_method = "Image Upload"
340
  parameter_type = "Visual Documentation"
341
 
342
- elif any(keyword in param_name_lower for keyword in ["remarks", "comment", "observation", "note"]):
343
  input_method = "Remarks"
344
  parameter_type = "Detailed Notes"
345
 
346
  # Specification extraction
347
  spec_patterns = [
348
- r"\(spec:\s*([^)]+)\)",
349
- r"tolerance\s*limit[:\s]*([^,\n]+)",
350
  r"(\d+\s*[±]\s*\d+\s*[a-zA-Z%°]+)",
351
- r"(<\s*\d+[^,\n]*)",
352
- r"(\d+\s*[°][cC])",
353
  ]
354
 
355
- combined_text = f"{param_name} {param_details}"
356
  for pattern in spec_patterns:
357
  match = re.search(pattern, combined_text, re.IGNORECASE)
358
  if match:
@@ -362,7 +396,7 @@ class ChecklistExamplesVDB:
362
  # Extract measurement units
363
  unit_patterns = [
364
  r"(\d+\s*[a-zA-Z%°]+)",
365
- r"(°[cC]|gram|kg|mm|cm|minutes|hours|ppm|cfu)",
366
  ]
367
 
368
  for pattern in unit_patterns:
@@ -372,35 +406,26 @@ class ChecklistExamplesVDB:
372
  break
373
 
374
  # Check for tolerance limits
375
- tolerance_patterns = [
376
- r"tolerance\s*limit[:\s]*([^,\n]+)",
377
- r"(\d+\s*[±]\s*\d+)",
378
- r"([<>]=?\s*\d+)",
379
- ]
380
-
381
- for pattern in tolerance_patterns:
382
- match = re.search(pattern, combined_text, re.IGNORECASE)
383
- if match:
384
- tolerance_limits = match.group(1).strip()
385
- break
386
 
387
  # Check for remarks requirement
388
- has_remarks = any(keyword in context_text for keyword in ["remarks", "corrective action", "comment"])
389
 
390
- # Special handling for specific parameter types
391
- if any(keyword in param_name_lower for keyword in ["foreign", "contamination", "allergen"]):
392
- input_method = "Checklist"
393
- parameter_type = "Safety Check"
394
- if "foreign" in param_name_lower:
395
- options_list = "Stones, Glass, Metals, Plastic, Wood, Insects/Pests, Hair, Threads"
396
 
397
- elif any(keyword in param_name_lower for keyword in ["metal screening", "fe:", "non-fe", "ss:"]):
398
  input_method = "Text Input"
399
- parameter_type = "Metal Detection"
400
 
401
- elif any(keyword in param_name_lower for keyword in ["batch", "code", "date", "shelf life"]):
402
- input_method = "Text Input"
403
- parameter_type = "Traceability"
404
 
405
  return {
406
  "parameter_type": parameter_type,
@@ -414,21 +439,26 @@ class ChecklistExamplesVDB:
414
  }
415
 
416
  def classify_section_type(self, section_name):
417
- """Classify section based on name"""
418
  section_name_lower = section_name.lower()
419
 
420
- if any(keyword in section_name_lower for keyword in ["organoleptic", "sensory", "evaluation"]):
421
- return "Sensory Assessment"
422
- elif any(keyword in section_name_lower for keyword in ["physical", "dimension", "weight"]):
423
  return "Physical Measurement"
424
- elif any(keyword in section_name_lower for keyword in ["temperature", "thermal", "freezer", "chiller"]):
425
  return "Temperature Control"
426
- elif any(keyword in section_name_lower for keyword in ["packaging", "packing", "sealing"]):
427
  return "Packaging Inspection"
428
- elif any(keyword in section_name_lower for keyword in ["metal", "screening", "detection"]):
429
- return "Metal Detection"
430
- elif any(keyword in section_name_lower for keyword in ["microbiological", "bacteria", "pathogen"]):
431
- return "Microbiological Testing"
432
  else:
433
  return "General Inspection"
434
 
@@ -459,6 +489,17 @@ class ChecklistExamplesVDB:
459
  chunks = text_splitter.split_text(text)
460
  documents = []
461
 
462
  for i, chunk in enumerate(chunks):
463
  # Enrich metadata with structural information
464
  chunk_metadata = metadata.copy()
@@ -471,7 +512,7 @@ class ChecklistExamplesVDB:
471
  "parameter_types": ", ".join(set([p["parameter_type"] for p in parameters])),
472
  "input_methods": ", ".join(set([p["input_method"] for p in parameters])),
473
  "section_types": ", ".join(set([s["section_type"] for s in sections]))
474
- })
475
 
476
  documents.append({
477
  "text": chunk,
@@ -507,69 +548,67 @@ class ChecklistExamplesVDB:
507
  pdf_path = Path(pdf_path)
508
  file_hash = self.get_file_hash(pdf_path)
509
  filename = pdf_path.name
510
-
511
  # Check if already processed
512
  if filename in self.manifest["processed_files"]:
513
  if self.manifest["processed_files"][filename]["hash"] == file_hash:
514
  print(f"Skipping {filename} - already processed")
515
  return
516
-
517
  print(f"Processing checklist: {filename}...")
518
-
519
  try:
520
  # Load PDF content
521
  loader = PyPDFLoader(str(pdf_path))
522
  pages = loader.load()
523
-
524
  # Combine all pages
525
  full_text = ""
526
  for i, page in enumerate(pages):
527
  full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
528
-
529
  # If text is too short, use OCR
530
  if len(full_text.strip()) < 100:
531
  print(f"Using OCR for {filename}")
532
  ocr_text = self.ocr_pdf(pdf_path)
533
  if len(ocr_text) > len(full_text):
534
  full_text = ocr_text
535
-
536
  # Extract document metadata
537
  doc_metadata = self.extract_document_metadata(pdf_path, full_text)
538
-
539
  # Extract parameters and sections
540
  parameters, sections = self.extract_checklist_parameters(full_text)
541
-
542
  # Create base metadata for chunks
543
  metadata = {
544
  "source": filename,
545
  "document_type": doc_metadata["document_type"],
546
  "product_name": doc_metadata["product_name"],
547
  "supplier_name": doc_metadata["supplier_name"],
548
- "checklist_category": doc_metadata["checklist_category"],
549
  "file_hash": file_hash,
550
  "processed_date": datetime.now().isoformat(),
551
- "domain": "Food Manufacturing"
552
  }
553
-
554
  # Create chunks
555
  documents = self.create_chunks(full_text, metadata, parameters, sections)
556
-
557
  # Generate embeddings and store in ChromaDB
558
  for i, doc in enumerate(documents):
559
  embedding = self.embedder.encode(doc["text"]).tolist()
560
-
561
  self.collection.add(
562
  documents=[doc["text"]],
563
  embeddings=[embedding],
564
  metadatas=[doc["metadata"]],
565
  ids=[f"{file_hash}_{i}"]
566
  )
567
-
568
  # Store metadata in SQLite
569
  self.save_document_metadata(file_hash, filename, doc_metadata, len(parameters))
570
  self.save_parameters(file_hash, parameters)
571
  self.save_sections(file_hash, sections)
572
-
573
  # Update manifest
574
  self.manifest["processed_files"][filename] = {
575
  "hash": file_hash,
@@ -579,19 +618,20 @@ class ChecklistExamplesVDB:
579
  "parameters_extracted": len(parameters),
580
  "sections_extracted": len(sections),
581
  "document_type": doc_metadata["document_type"],
582
- "product_name": doc_metadata["product_name"]
583
  }
584
  self.save_manifest()
585
-
586
  # Log success
587
  self.log_processing(filename, file_hash, "SUCCESS", None, len(parameters), len(sections))
588
-
589
  print(f"Successfully processed {filename}")
590
  print(f" - Document Type: {doc_metadata['document_type']}")
591
  print(f" - Product: {doc_metadata['product_name']}")
592
  print(f" - Parameters extracted: {len(parameters)}")
593
  print(f" - Sections extracted: {len(sections)}")
594
-
595
  except Exception as e:
596
  error_msg = str(e)
597
  print(f"Error processing {filename}: {error_msg}")
@@ -599,309 +639,354 @@ class ChecklistExamplesVDB:
599
  traceback.print_exc()
600
  self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
601
 
 
602
  def save_document_metadata(self, file_hash, filename, metadata, total_parameters):
603
- """Save document metadata to SQLite"""
604
- conn = sqlite3.connect(self.metadata_db_path)
605
- cursor = conn.cursor()
606
-
607
- try:
608
- cursor.execute("""
609
- INSERT OR REPLACE INTO checklist_documents
610
- (file_hash, filename, document_type, product_name, supplier_name,
611
- checklist_category, total_parameters)
612
- VALUES (?, ?, ?, ?, ?, ?, ?)
613
- """, (
614
- file_hash, filename, metadata["document_type"], metadata["product_name"],
615
- metadata["supplier_name"], metadata["checklist_category"], total_parameters
616
- ))
617
- conn.commit()
618
- finally:
619
- conn.close()
620
-
621
  def save_parameters(self, file_hash, parameters):
622
- """Save extracted parameters to SQLite"""
623
- conn = sqlite3.connect(self.metadata_db_path)
624
- cursor = conn.cursor()
625
-
626
- try:
627
- # Delete existing parameters for this file
628
- cursor.execute("DELETE FROM checklist_parameters WHERE file_hash = ?", (file_hash,))
629
-
630
- # Insert new parameters
631
- for param in parameters:
632
- cursor.execute("""
633
- INSERT INTO checklist_parameters
634
- (file_hash, parameter_name, parameter_type, input_method, specifications,
635
- options_list, tolerance_limits, measurement_units, section_category,
636
- parameter_order, has_remarks, is_mandatory)
637
- VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
638
- """, (
639
- file_hash, param["parameter_name"], param["parameter_type"],
640
- param["input_method"], param["specifications"], param["options_list"],
641
- param["tolerance_limits"], param["measurement_units"],
642
- param["section_category"], param["parameter_order"],
643
- param["has_remarks"], param["is_mandatory"]
644
- ))
645
-
646
- conn.commit()
647
- finally:
648
- conn.close()
649
-
650
  def save_sections(self, file_hash, sections):
651
- """Save extracted sections to SQLite"""
652
- conn = sqlite3.connect(self.metadata_db_path)
653
- cursor = conn.cursor()
654
-
655
- try:
656
- # Delete existing sections for this file
657
- cursor.execute("DELETE FROM checklist_sections WHERE file_hash = ?", (file_hash,))
658
-
659
- # Insert new sections
660
- for section in sections:
661
- cursor.execute("""
662
- INSERT INTO checklist_sections
663
- (file_hash, section_name, section_type, section_order, parameter_count)
664
- VALUES (?, ?, ?, ?, ?)
665
- """, (
666
- file_hash, section["section_name"], section["section_type"],
667
- section["section_order"], section["parameter_count"]
668
- ))
669
-
670
- conn.commit()
671
- finally:
672
- conn.close()
673
-
674
  def log_processing(self, filename, file_hash, status, error_message, parameters_extracted=0, sections_extracted=0):
675
- """Log processing status"""
676
- conn = sqlite3.connect(self.metadata_db_path)
677
- cursor = conn.cursor()
678
-
679
- try:
680
- cursor.execute("""
681
- INSERT INTO processing_log
682
- (filename, file_hash, status, error_message, parameters_extracted, sections_extracted)
683
- VALUES (?, ?, ?, ?, ?, ?)
684
- """, (filename, file_hash, status, error_message, parameters_extracted, sections_extracted))
685
-
686
- conn.commit()
687
- finally:
688
- conn.close()
689
-
690
  def process_all_pdfs(self):
691
- """Process all PDFs in the directory"""
692
- pdf_files = list(self.pdf_path.glob("*.pdf"))
693
-
694
- if not pdf_files:
695
- print(f"No PDF files found in {self.pdf_path}")
696
- return
697
-
698
- print(f"Found {len(pdf_files)} checklist PDF files")
699
-
700
- for pdf_file in pdf_files:
701
- self.process_pdf(pdf_file)
702
-
703
- print("\nChecklist processing complete!")
704
- print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
705
-
706
  def get_processing_stats(self):
707
- """Get processing statistics"""
708
- conn = sqlite3.connect(self.metadata_db_path)
709
- cursor = conn.cursor()
710
-
711
- try:
712
- # Get overall stats
713
- cursor.execute("""
714
- SELECT COUNT(*) as total,
715
- SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
716
- SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
717
- SUM(parameters_extracted) as total_parameters,
718
- SUM(sections_extracted) as total_sections
719
- FROM processing_log
720
- """)
721
-
722
- stats = cursor.fetchone()
723
-
724
- # Get document type distribution
725
- cursor.execute("""
726
- SELECT document_type, COUNT(*) as count
727
- FROM checklist_documents
728
- GROUP BY document_type
729
- ORDER BY count DESC
730
- """)
731
-
732
- doc_types = cursor.fetchall()
733
-
734
- # Get parameter type distribution
735
- cursor.execute("""
736
- SELECT input_method, COUNT(*) as count
737
- FROM checklist_parameters
738
- GROUP BY input_method
739
- ORDER BY count DESC
740
- """)
741
-
742
- input_methods = cursor.fetchall()
743
-
744
- # Get most common parameters
745
- cursor.execute("""
746
- SELECT parameter_name, parameter_type, input_method, COUNT(*) as frequency
747
- FROM checklist_parameters
748
- GROUP BY parameter_name, parameter_type, input_method
749
- HAVING frequency > 1
750
- ORDER BY frequency DESC
751
- LIMIT 10
752
- """)
753
-
754
- common_params = cursor.fetchall()
755
-
756
- return {
757
- "total_processed": stats[0],
758
- "successful": stats[1],
759
- "errors": stats[2],
760
- "total_parameters": stats[3],
761
- "total_sections": stats[4],
762
- "document_types": dict(doc_types),
763
- "input_methods": dict(input_methods),
764
- "common_parameters": [
765
- {
766
- "name": p[0],
767
- "type": p[1],
768
- "input_method": p[2],
769
- "frequency": p[3]
770
- } for p in common_params
771
- ]
772
- }
773
- finally:
774
- conn.close()
775
-
776
  def get_parameter_patterns(self):
777
- """Get common parameter patterns for AI reference"""
778
- conn = sqlite3.connect(self.metadata_db_path)
779
- cursor = conn.cursor()
780
-
781
- try:
782
- cursor.execute("""
783
- SELECT
784
- parameter_type,
785
- input_method,
786
- GROUP_CONCAT(DISTINCT specifications) as common_specs,
787
- GROUP_CONCAT(DISTINCT options_list) as common_options,
788
- COUNT(*) as usage_count
789
- FROM checklist_parameters
790
- WHERE specifications != '' OR options_list != ''
791
- GROUP BY parameter_type, input_method
792
- ORDER BY usage_count DESC
793
- """)
794
-
795
- patterns = []
796
- for row in cursor.fetchall():
797
- patterns.append({
798
- "parameter_type": row[0],
799
- "input_method": row[1],
800
- "common_specifications": row[2],
801
- "common_options": row[3],
802
- "usage_count": row[4]
803
- })
804
-
805
- return patterns
806
- finally:
807
- conn.close()
808
-
809
  def search_similar_checklists(self, product_name, checklist_type="", limit=5):
810
- """Search for similar checklists based on product and type"""
811
- query_text = f"{product_name} {checklist_type} quality control inspection checklist"
812
- query_embedding = self.embedder.encode(query_text).tolist()
813
-
814
- try:
815
- results = self.collection.query(
816
- query_embeddings=[query_embedding],
817
- n_results=limit,
818
- where={"domain": "Food Manufacturing"}
819
- )
820
-
821
- similar_checklists = []
822
- if results['documents'][0]:
823
- for i, doc in enumerate(results['documents'][0]):
824
- metadata = results['metadatas'][0][i]
825
- similar_checklists.append({
826
- "document": metadata.get('source', 'Unknown'),
827
- "product": metadata.get('product_name', 'Unknown'),
828
- "type": metadata.get('document_type', 'Unknown'),
829
- "category": metadata.get('checklist_category', 'Unknown'),
830
- "parameters": metadata.get('total_parameters', 0),
831
- "relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0,
832
- "content_preview": doc[:200] + "..." if len(doc) > 200 else doc
833
- })
834
-
835
- return similar_checklists
836
- except Exception as e:
837
- print(f"Error searching checklists: {str(e)}")
838
- return []
839
 
840
 
841
  def main():
842
- """Main function to create/update the checklist examples database"""
843
- print("Starting Checklist Examples Database Creation...")
844
-
845
- # Initialize database
846
- db = ChecklistExamplesVDB()
847
-
848
- # Process all PDFs
849
- db.process_all_pdfs()
850
-
851
- # Show processing stats
852
- print("\n" + "="*60)
853
- print("PROCESSING STATISTICS")
854
- print("="*60)
855
-
856
- stats = db.get_processing_stats()
857
- print(f"Total files processed: {stats['total_processed']}")
858
- print(f"Successful: {stats['successful']}")
859
- print(f"Errors: {stats['errors']}")
860
- print(f"Total parameters extracted: {stats['total_parameters']}")
861
- print(f"Total sections extracted: {stats['total_sections']}")
862
-
863
- print("\nDocument Types:")
864
- for doc_type, count in stats["document_types"].items():
865
- print(f" - {doc_type}: {count} documents")
866
-
867
- print("\nInput Methods Distribution:")
868
- for method, count in stats["input_methods"].items():
869
- print(f" - {method}: {count} parameters")
870
-
871
- print("\nMost Common Parameters:")
872
- for param in stats["common_parameters"]:
873
- print(f" - {param['name']} ({param['input_method']}) - used {param['frequency']} times")
874
-
875
- # Show parameter patterns
876
- print("\n" + "="*60)
877
- print("PARAMETER PATTERNS DISCOVERED")
878
- print("="*60)
879
-
880
- patterns = db.get_parameter_patterns()
881
- for pattern in patterns[:10]: # Show top 10 patterns
882
- print(f"\n{pattern['parameter_type']} -> {pattern['input_method']}")
883
- print(f" Usage: {pattern['usage_count']} times")
884
- if pattern['common_specifications']:
885
- specs = pattern['common_specifications'][:100]
886
- print(f" Common specs: {specs}{'...' if len(pattern['common_specifications']) > 100 else ''}")
887
- if pattern['common_options']:
888
- options = pattern['common_options'][:100]
889
- print(f" Common options: {options}{'...' if len(pattern['common_options']) > 100 else ''}")
890
-
891
- # Test search functionality
892
- print("\n" + "="*60)
893
- print("TESTING SEARCH FUNCTIONALITY")
894
- print("="*60)
895
-
896
- test_products = ["Malabar Paratha", "Green Peas", "Vegetable Samosa"]
897
- for product in test_products:
898
- print(f"\nSearching for '{product}' checklists:")
899
- similar = db.search_similar_checklists(product, limit=3)
900
- for i, checklist in enumerate(similar, 1):
901
- print(f" {i}. {checklist['document']} ({checklist['type']})")
902
- print(f" Product: {checklist['product']}, Parameters: {checklist['parameters']}")
903
- print(f" Relevance: {checklist['relevance_score']:.3f}")
904
 
905
 
906
  if __name__ == "__main__":
907
- main()
 
63
  document_type TEXT,
64
  product_name TEXT,
65
  supplier_name TEXT,
66
+ checklist_attributes TEXT, -- Dynamic attributes instead of category
67
  total_parameters INTEGER DEFAULT 0,
68
  extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
69
  )
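A minimal sketch of how the new JSON-typed checklist_attributes column can be read back, assuming only the standard sqlite3/json modules and the table created above; the database filename is a placeholder for self.metadata_db_path.

    import json
    import sqlite3

    # Placeholder path; the class uses self.metadata_db_path.
    conn = sqlite3.connect("checklist_metadata.db")
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT filename, checklist_attributes FROM checklist_documents")
        for filename, attrs_json in cursor.fetchall():
            # The column stores a JSON object serialized with json.dumps().
            attrs = json.loads(attrs_json) if attrs_json else {}
            print(filename, attrs.get("inspection_stage"), attrs.get("complexity"))
    finally:
        conn.close()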
 
148
  return hashlib.md5(f.read()).hexdigest()
149
 
150
  def extract_document_metadata(self, pdf_path, text_content):
151
+ """Extract document metadata from checklist - generic approach"""
152
  metadata = {
153
  "document_type": "QC Checklist",
154
  "product_name": "",
155
  "supplier_name": "",
156
+ "checklist_attributes": {} # Dynamic attributes
157
  }
158
 
159
+ # Extract document type generically
160
  doc_type_patterns = {
161
  "Inspection Record": ["inspection record", "inspection checklist", "quality inspection"],
162
  "Pre-Shipment Inspection": ["pre-shipment", "container inspection", "shipment inspection"],
163
  "Production Checklist": ["production checklist", "manufacturing checklist", "process checklist"],
164
+ "Temperature Log": ["temperature", "thermal", "cooling log"],
165
  "Receiving Inspection": ["receiving", "goods receipt", "incoming inspection"],
166
  "Hygiene Checklist": ["hygiene", "sanitation", "cleaning checklist"],
167
+ "Quality Control": ["quality control", "qc checklist", "quality check"]
168
  }
169
 
170
  text_lower = text_content.lower()
 
173
  metadata["document_type"] = doc_type
174
  break
175
 
176
+ # Extract product name generically
177
  product_patterns = [
178
  r"product\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
179
+ r"item\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
180
+ r"material\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
181
  r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*[-–]\s*inspection",
182
+ r"inspection\s*of\s*([^\n]{1,40})",
183
  r"product:\s*([^\n]{1,40})",
184
  ]
185
 
 
189
  metadata["product_name"] = match.group(1).strip()
190
  break
191
 
192
+ # Extract supplier name generically - no specific company bias
193
  supplier_patterns = [
194
  r"supplier\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
195
+ r"vendor\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
196
  r"manufacturer\s*[:\-]\s*([^\n]{1,40})",
197
+ r"company\s*[:\-]\s*([^\n]{1,40})",
198
+ r"produced\s*by\s*[:\-]\s*([^\n]{1,40})"
199
  ]
200
 
201
  for pattern in supplier_patterns:
 
204
  metadata["supplier_name"] = match.group(1).strip()
205
  break
206
 
207
+ # Extract dynamic attributes
208
+ attributes = {}
209
+
210
+ # Inspection stage/phase
211
+ stage_keywords = {
212
+ "pre-production": ["pre-production", "before production", "initial"],
213
+ "during-production": ["during production", "in-process", "mid-production"],
214
+ "final": ["final inspection", "finished goods", "end product"],
215
+ "incoming": ["incoming", "receiving", "goods receipt"],
216
+ "outgoing": ["outgoing", "dispatch", "shipping"]
217
  }
218
 
219
+ for stage, keywords in stage_keywords.items():
220
  if any(keyword in text_lower for keyword in keywords):
221
+ attributes["inspection_stage"] = stage
222
  break
223
 
224
+ # Inspection focus
225
+ if any(word in text_lower for word in ["visual", "appearance", "cosmetic"]):
226
+ attributes["inspection_focus"] = "visual"
227
+ elif any(word in text_lower for word in ["dimension", "measurement", "size"]):
228
+ attributes["inspection_focus"] = "dimensional"
229
+ elif any(word in text_lower for word in ["functional", "performance", "operation"]):
230
+ attributes["inspection_focus"] = "functional"
231
+ elif any(word in text_lower for word in ["safety", "hazard", "risk"]):
232
+ attributes["inspection_focus"] = "safety"
233
+
234
+ # Complexity level based on parameter count (will be updated later)
235
+ attributes["complexity"] = "standard" # Will be updated after parameter extraction
236
+
237
+ metadata["checklist_attributes"] = json.dumps(attributes)
238
+
239
  return metadata
240
 
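Illustrative only: roughly what the reworked extract_document_metadata() serializes into checklist_attributes for an invented sample header, following the stage and focus keyword tables above (complexity is refined later from the parameter count).

    import json

    sample = "Final inspection of frozen vegetables - visual appearance check"
    text_lower = sample.lower()

    attributes = {}
    if any(k in text_lower for k in ["final inspection", "finished goods", "end product"]):
        attributes["inspection_stage"] = "final"
    if any(k in text_lower for k in ["visual", "appearance", "cosmetic"]):
        attributes["inspection_focus"] = "visual"
    attributes["complexity"] = "standard"  # placeholder until parameters are counted

    print(json.dumps(attributes))
    # {"inspection_stage": "final", "inspection_focus": "visual", "complexity": "standard"}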
241
  def extract_checklist_parameters(self, text_content):
 
250
  section_order = 0
251
  parameter_order = 0
252
 
253
+ # Generic parameter extraction patterns
254
  param_patterns = [
255
  # Format: "Parameter Name: Type/Method"
256
+ r"^([A-Z][^:]+?):\s*(Acceptable\s*/\s*Non-acceptable|To be mentioned|Present\s*/\s*Absent|Pass\s*/\s*Fail)",
257
  # Format: "Parameter (Spec: value)"
258
+ r"^([A-Z][^(]+?)\s*\((?:Spec|Specification):\s*([^)]+)\)",
259
+ # Format: "Parameter Name: [measurement/value]"
260
+ r"^([A-Z][^:]+?):\s*\[([^\]]+)\]",
261
+ # Format: "Parameter: _____" (blank field)
262
+ r"^([A-Z][^:]+?):\s*_{3,}",
263
+ # Format: "□ Parameter Name"
264
+ r"^[□☐]\s*([A-Z][^:]+?)$",
265
+ # Format: "• Parameter Name"
266
+ r"^[•·]\s*([A-Z][^:]+?)$",
267
+ # Generic: "Parameter Name: [details]"
268
+ r"^([A-Z][^:]+?):\s*(.{0,100})",
269
  ]
270
 
271
  # Section header patterns
272
  section_patterns = [
273
+ r"^([A-Z\s]+(?:EVALUATION|INSPECTION|CHECK|VERIFICATION|ASSESSMENT|CONTROL))\s*$",
274
  r"^[0-9]+\.\s*([A-Z][^.]+)$",
275
+ r"^\*\*([A-Z\s]+)\*\*$",
276
+ r"^={3,}\s*([A-Z\s]+)\s*={3,}$",
277
+ r"^-{3,}\s*([A-Z\s]+)\s*-{3,}$"
278
  ]
279
 
280
  for line_idx, line in enumerate(lines):
 
283
  match = re.match(pattern, line)
284
  if match:
285
  section_name = match.group(1).strip()
286
+ if len(section_name) > 5 and len(section_name) < 50: # Valid section name
287
  current_section = section_name
288
  section_order += 1
289
  sections.append({
 
327
  return parameters, sections
328
 
329
  def analyze_parameter(self, param_name, param_details, current_line, all_lines, line_idx):
330
+ """Analyze parameter to determine type, input method, etc. - generic approach"""
331
  param_name_lower = param_name.lower()
332
+ param_details_lower = param_details.lower() if param_details else ""
333
  context_lines = all_lines[max(0, line_idx-2):min(len(all_lines), line_idx+3)]
334
  context_text = " ".join(context_lines).lower()
335
 
336
+ # Initialize default values
337
  parameter_type = "Quality Check"
338
  input_method = "Text Input"
339
  specifications = ""
 
342
  measurement_units = ""
343
  has_remarks = False
344
 
345
+ # Generic input method determination
346
+ if any(keyword in param_details_lower for keyword in ["acceptable", "non-acceptable", "pass", "fail"]):
347
+ if "/" in param_details_lower:
348
+ input_method = "Dropdown"
349
+ options_list = param_details.replace("/", ", ")
350
+ else:
351
+ input_method = "Toggle"
352
  parameter_type = "Quality Assessment"
353
 
354
+ elif any(keyword in param_details_lower for keyword in ["present", "absent", "yes", "no"]):
355
  input_method = "Toggle"
356
+ options_list = param_details.replace("/", ", ")
357
  parameter_type = "Presence Check"
358
 
359
+ elif "to be mentioned" in param_details_lower or "_____" in current_line:
360
+ # Determine based on parameter name
361
+ if any(unit in param_name_lower for unit in ["temperature", "weight", "time", "dimension", "size", "count", "number"]):
362
  input_method = "Numeric Input"
363
  parameter_type = "Measurement"
364
  else:
365
  input_method = "Text Input"
366
  parameter_type = "Information Entry"
367
 
368
+ elif any(keyword in param_name_lower for keyword in ["photo", "picture", "image", "visual"]):
369
  input_method = "Image Upload"
370
  parameter_type = "Visual Documentation"
371
 
372
+ elif any(keyword in param_name_lower for keyword in ["remark", "comment", "observation", "note"]):
373
  input_method = "Remarks"
374
  parameter_type = "Detailed Notes"
375
 
376
+ elif "□" in current_line or "☐" in current_line:
377
+ input_method = "Checklist"
378
+ parameter_type = "Verification Check"
379
+
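For orientation, a small standalone trace (sample string invented) of the branch the reworked input-method logic takes for a dropdown-style detail value:

    param_details = "Pass / Fail"
    param_details_lower = param_details.lower()

    if any(k in param_details_lower for k in ["acceptable", "non-acceptable", "pass", "fail"]):
        input_method = "Dropdown" if "/" in param_details_lower else "Toggle"
        options_list = param_details.replace("/", ", ")
        parameter_type = "Quality Assessment"

    print(input_method, parameter_type, repr(options_list))
    # Dropdown Quality Assessment 'Pass ,  Fail'  (replace() keeps the spaces around the original '/')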
380
  # Specification extraction
381
  spec_patterns = [
382
+ r"\((?:spec|specification):\s*([^)]+)\)",
383
+ r"tolerance\s*(?:limit)?[:\s]*([^,\n]+)",
384
  r"(\d+\s*[±]\s*\d+\s*[a-zA-Z%°]+)",
385
+ r"([<>≤≥]\s*\d+[^,\n]*)",
386
+ r"(\d+\s*-\s*\d+\s*[a-zA-Z]+)",
387
  ]
388
 
389
+ combined_text = f"{param_name} {param_details} {' '.join(context_lines)}"
390
  for pattern in spec_patterns:
391
  match = re.search(pattern, combined_text, re.IGNORECASE)
392
  if match:
 
396
  # Extract measurement units
397
  unit_patterns = [
398
  r"(\d+\s*[a-zA-Z%°]+)",
399
+ r"(°[CcFf]|g|gram|kg|mm|cm|m|ml|L|minutes?|hours?|seconds?|%|ppm|cfu)",
400
  ]
401
 
402
  for pattern in unit_patterns:
 
406
  break
407
 
408
  # Check for tolerance limits
409
+ if "±" in combined_text or any(op in combined_text for op in ["<", ">", "≤", "≥"]):
410
+ tolerance_match = re.search(r"([±<>≤≥]\s*\d+(?:\.\d+)?)", combined_text)
411
+ if tolerance_match:
412
+ tolerance_limits = tolerance_match.group(1).strip()
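A quick check (sample strings invented) of the simplified tolerance pattern used just above:

    import re

    tolerance_re = re.compile(r"([±<>≤≥]\s*\d+(?:\.\d+)?)")
    for text in ["Core temperature < 5.0 °C", "Net weight 450 ± 10 g"]:
        m = tolerance_re.search(text)
        print(m.group(1) if m else None)
    # < 5.0
    # ± 10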
413
 
414
  # Check for remarks requirement
415
+ has_remarks = any(keyword in context_text for keyword in ["remark", "comment", "observation", "corrective action"])
416
 
417
+ # Generic parameter type classification based on content
418
+ if any(keyword in param_name_lower for keyword in ["contamination", "foreign", "defect", "damage"]):
419
+ parameter_type = "Safety/Quality Check"
420
+ if not options_list and input_method == "Checklist":
421
+ options_list = "None observed, Minor issue, Major issue, Critical"
422
 
423
+ elif any(keyword in param_name_lower for keyword in ["batch", "lot", "code", "number", "id"]):
424
+ parameter_type = "Traceability"
425
  input_method = "Text Input"
426
 
427
+ elif any(keyword in param_name_lower for keyword in ["signature", "verified", "checked"]):
428
+ parameter_type = "Verification"
429
 
430
  return {
431
  "parameter_type": parameter_type,
 
439
  }
440
 
441
  def classify_section_type(self, section_name):
442
+ """Classify section based on name - generic approach"""
443
  section_name_lower = section_name.lower()
444
 
445
+ # Generic section classification
446
+ if any(keyword in section_name_lower for keyword in ["visual", "appearance", "cosmetic"]):
447
+ return "Visual Assessment"
448
+ elif any(keyword in section_name_lower for keyword in ["measurement", "dimension", "size", "weight"]):
449
  return "Physical Measurement"
450
+ elif any(keyword in section_name_lower for keyword in ["temperature", "thermal", "heat", "cold"]):
451
  return "Temperature Control"
452
+ elif any(keyword in section_name_lower for keyword in ["package", "packing", "container", "seal"]):
453
  return "Packaging Inspection"
454
+ elif any(keyword in section_name_lower for keyword in ["test", "analysis", "laboratory"]):
455
+ return "Testing/Analysis"
456
+ elif any(keyword in section_name_lower for keyword in ["safety", "hazard", "risk", "contamination"]):
457
+ return "Safety Assessment"
458
+ elif any(keyword in section_name_lower for keyword in ["document", "record", "certificate"]):
459
+ return "Documentation"
460
+ elif any(keyword in section_name_lower for keyword in ["final", "overall", "summary"]):
461
+ return "Final Assessment"
462
  else:
463
  return "General Inspection"
464
 
 
489
  chunks = text_splitter.split_text(text)
490
  documents = []
491
 
492
+ # Update complexity in metadata based on parameters
493
+ if metadata.get("checklist_attributes"):
494
+ attrs = json.loads(metadata["checklist_attributes"])
495
+ if len(parameters) < 10:
496
+ attrs["complexity"] = "simple"
497
+ elif len(parameters) < 25:
498
+ attrs["complexity"] = "standard"
499
+ else:
500
+ attrs["complexity"] = "comprehensive"
501
+ metadata["checklist_attributes"] = json.dumps(attrs)
502
+
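A standalone restatement of the complexity bucketing added above, with the thresholds taken straight from the diff (the parameter counts fed in are invented):

    def bucket(n_params: int) -> str:
        if n_params < 10:
            return "simple"
        elif n_params < 25:
            return "standard"
        return "comprehensive"

    print([bucket(n) for n in (4, 18, 40)])   # ['simple', 'standard', 'comprehensive']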
503
  for i, chunk in enumerate(chunks):
504
  # Enrich metadata with structural information
505
  chunk_metadata = metadata.copy()
 
512
  "parameter_types": ", ".join(set([p["parameter_type"] for p in parameters])),
513
  "input_methods": ", ".join(set([p["input_method"] for p in parameters])),
514
  "section_types": ", ".join(set([s["section_type"] for s in sections]))
515
+ })
516
 
517
  documents.append({
518
  "text": chunk,
 
548
  pdf_path = Path(pdf_path)
549
  file_hash = self.get_file_hash(pdf_path)
550
  filename = pdf_path.name
551
+
552
  # Check if already processed
553
  if filename in self.manifest["processed_files"]:
554
  if self.manifest["processed_files"][filename]["hash"] == file_hash:
555
  print(f"Skipping {filename} - already processed")
556
  return
557
+
558
  print(f"Processing checklist: {filename}...")
 
559
  try:
560
  # Load PDF content
561
  loader = PyPDFLoader(str(pdf_path))
562
  pages = loader.load()
563
+
564
  # Combine all pages
565
  full_text = ""
566
  for i, page in enumerate(pages):
567
  full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
568
+
569
  # If text is too short, use OCR
570
  if len(full_text.strip()) < 100:
571
  print(f"Using OCR for {filename}")
572
  ocr_text = self.ocr_pdf(pdf_path)
573
  if len(ocr_text) > len(full_text):
574
  full_text = ocr_text
575
+
576
  # Extract document metadata
577
  doc_metadata = self.extract_document_metadata(pdf_path, full_text)
578
+
579
  # Extract parameters and sections
580
  parameters, sections = self.extract_checklist_parameters(full_text)
581
+
582
  # Create base metadata for chunks
583
  metadata = {
584
  "source": filename,
585
  "document_type": doc_metadata["document_type"],
586
  "product_name": doc_metadata["product_name"],
587
  "supplier_name": doc_metadata["supplier_name"],
588
+ "checklist_attributes": doc_metadata["checklist_attributes"],
589
  "file_hash": file_hash,
590
  "processed_date": datetime.now().isoformat(),
591
+ "domain": "Quality Control" # Generic domain
592
  }
593
+
594
  # Create chunks
595
  documents = self.create_chunks(full_text, metadata, parameters, sections)
596
+
597
  # Generate embeddings and store in ChromaDB
598
  for i, doc in enumerate(documents):
599
  embedding = self.embedder.encode(doc["text"]).tolist()
 
600
  self.collection.add(
601
  documents=[doc["text"]],
602
  embeddings=[embedding],
603
  metadatas=[doc["metadata"]],
604
  ids=[f"{file_hash}_{i}"]
605
  )
606
+
607
  # Store metadata in SQLite
608
  self.save_document_metadata(file_hash, filename, doc_metadata, len(parameters))
609
  self.save_parameters(file_hash, parameters)
610
  self.save_sections(file_hash, sections)
611
+
612
  # Update manifest
613
  self.manifest["processed_files"][filename] = {
614
  "hash": file_hash,
 
618
  "parameters_extracted": len(parameters),
619
  "sections_extracted": len(sections),
620
  "document_type": doc_metadata["document_type"],
621
+ "product_name": doc_metadata["product_name"],
622
+ "attributes": doc_metadata["checklist_attributes"]
623
  }
624
  self.save_manifest()
625
+
626
  # Log success
627
  self.log_processing(filename, file_hash, "SUCCESS", None, len(parameters), len(sections))
628
+
629
  print(f"Successfully processed {filename}")
630
  print(f" - Document Type: {doc_metadata['document_type']}")
631
  print(f" - Product: {doc_metadata['product_name']}")
632
  print(f" - Parameters extracted: {len(parameters)}")
633
  print(f" - Sections extracted: {len(sections)}")
634
+
635
  except Exception as e:
636
  error_msg = str(e)
637
  print(f"Error processing {filename}: {error_msg}")
 
639
  traceback.print_exc()
640
  self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
641
 
642
+
643
  def save_document_metadata(self, file_hash, filename, metadata, total_parameters):
644
+ """Save document metadata to SQLite"""
645
+ conn = sqlite3.connect(self.metadata_db_path)
646
+ cursor = conn.cursor()
647
+
648
+ try:
649
+ cursor.execute("""
650
+ INSERT OR REPLACE INTO checklist_documents
651
+ (file_hash, filename, document_type, product_name, supplier_name,
652
+ checklist_attributes, total_parameters)
653
+ VALUES (?, ?, ?, ?, ?, ?, ?)
654
+ """, (
655
+ file_hash, filename, metadata["document_type"], metadata["product_name"],
656
+ metadata["supplier_name"], metadata["checklist_attributes"], total_parameters
657
+ ))
658
+ conn.commit()
659
+ finally:
660
+ conn.close()
661
+
662
  def save_parameters(self, file_hash, parameters):
663
+ """Save extracted parameters to SQLite"""
664
+ conn = sqlite3.connect(self.metadata_db_path)
665
+ cursor = conn.cursor()
666
+
667
+ try:
668
+ # Delete existing parameters for this file
669
+ cursor.execute("DELETE FROM checklist_parameters WHERE file_hash = ?", (file_hash,))
670
+
671
+ # Insert new parameters
672
+ for param in parameters:
673
+ cursor.execute("""
674
+ INSERT INTO checklist_parameters
675
+ (file_hash, parameter_name, parameter_type, input_method, specifications,
676
+ options_list, tolerance_limits, measurement_units, section_category,
677
+ parameter_order, has_remarks, is_mandatory)
678
+ VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
679
+ """, (
680
+ file_hash, param["parameter_name"], param["parameter_type"],
681
+ param["input_method"], param["specifications"], param["options_list"],
682
+ param["tolerance_limits"], param["measurement_units"],
683
+ param["section_category"], param["parameter_order"],
684
+ param["has_remarks"], param["is_mandatory"]
685
+ ))
686
+
687
+ conn.commit()
688
+ finally:
689
+ conn.close()
690
+
691
  def save_sections(self, file_hash, sections):
692
+ """Save extracted sections to SQLite"""
693
+ conn = sqlite3.connect(self.metadata_db_path)
694
+ cursor = conn.cursor()
695
+
696
+ try:
697
+ # Delete existing sections for this file
698
+ cursor.execute("DELETE FROM checklist_sections WHERE file_hash = ?", (file_hash,))
699
+
700
+ # Insert new sections
701
+ for section in sections:
702
+ cursor.execute("""
703
+ INSERT INTO checklist_sections
704
+ (file_hash, section_name, section_type, section_order, parameter_count)
705
+ VALUES (?, ?, ?, ?, ?)
706
+ """, (
707
+ file_hash, section["section_name"], section["section_type"],
708
+ section["section_order"], section["parameter_count"]
709
+ ))
710
+
711
+ conn.commit()
712
+ finally:
713
+ conn.close()
714
+
715
  def log_processing(self, filename, file_hash, status, error_message, parameters_extracted=0, sections_extracted=0):
716
+ """Log processing status"""
717
+ conn = sqlite3.connect(self.metadata_db_path)
718
+ cursor = conn.cursor()
719
+
720
+ try:
721
+ cursor.execute("""
722
+ INSERT INTO processing_log
723
+ (filename, file_hash, status, error_message, parameters_extracted, sections_extracted)
724
+ VALUES (?, ?, ?, ?, ?, ?)
725
+ """, (filename, file_hash, status, error_message, parameters_extracted, sections_extracted))
726
+
727
+ conn.commit()
728
+ finally:
729
+ conn.close()
730
+
731
  def process_all_pdfs(self):
732
+ """Process all PDFs in the directory"""
733
+ pdf_files = list(self.pdf_path.glob("*.pdf"))
734
+
735
+ if not pdf_files:
736
+ print(f"No PDF files found in {self.pdf_path}")
737
+ return
738
+
739
+ print(f"Found {len(pdf_files)} checklist PDF files")
740
+
741
+ for pdf_file in pdf_files:
742
+ self.process_pdf(pdf_file)
743
+
744
+ print("\nChecklist processing complete!")
745
+ print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
746
+
747
  def get_processing_stats(self):
748
+ """Get processing statistics"""
749
+ conn = sqlite3.connect(self.metadata_db_path)
750
+ cursor = conn.cursor()
751
+
752
+ try:
753
+ # Get overall stats
754
+ cursor.execute("""
755
+ SELECT COUNT(*) as total,
756
+ SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
757
+ SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
758
+ SUM(parameters_extracted) as total_parameters,
759
+ SUM(sections_extracted) as total_sections
760
+ FROM processing_log
761
+ """)
762
+
763
+ stats = cursor.fetchone()
764
+
765
+ # Get document type distribution
766
+ cursor.execute("""
767
+ SELECT document_type, COUNT(*) as count
768
+ FROM checklist_documents
769
+ GROUP BY document_type
770
+ ORDER BY count DESC
771
+ """)
772
+
773
+ doc_types = cursor.fetchall()
774
+
775
+ # Get attribute distribution
776
+ cursor.execute("""
777
+ SELECT checklist_attributes, COUNT(*) as count
778
+ FROM checklist_documents
779
+ WHERE checklist_attributes IS NOT NULL
780
+ GROUP BY checklist_attributes
781
+ """)
782
+
783
+ attr_dist = cursor.fetchall()
784
+
785
+ # Parse attributes for summary
786
+ attribute_summary = {}
787
+ for attrs_json, count in attr_dist:
788
+ if attrs_json:
789
+ try:
790
+ attrs = json.loads(attrs_json)
791
+ for key, value in attrs.items():
792
+ if key not in attribute_summary:
793
+ attribute_summary[key] = {}
794
+ if value not in attribute_summary[key]:
795
+ attribute_summary[key][value] = 0
796
+ attribute_summary[key][value] += count
797
+ except:
798
+ pass
799
+
800
+ # Get parameter type distribution
801
+ cursor.execute("""
802
+ SELECT input_method, COUNT(*) as count
803
+ FROM checklist_parameters
804
+ GROUP BY input_method
805
+ ORDER BY count DESC
806
+ """)
807
+
808
+ input_methods = cursor.fetchall()
809
+
810
+ # Get most common parameters
811
+ cursor.execute("""
812
+ SELECT parameter_name, parameter_type, input_method, COUNT(*) as frequency
813
+ FROM checklist_parameters
814
+ GROUP BY parameter_name, parameter_type, input_method
815
+ HAVING frequency > 1
816
+ ORDER BY frequency DESC
817
+ LIMIT 10
818
+ """)
819
+
820
+ common_params = cursor.fetchall()
821
+
822
+ return {
823
+ "total_processed": stats[0],
824
+ "successful": stats[1],
825
+ "errors": stats[2],
826
+ "total_parameters": stats[3],
827
+ "total_sections": stats[4],
828
+ "document_types": dict(doc_types),
829
+ "input_methods": dict(input_methods),
830
+ "attribute_summary": attribute_summary,
831
+ "common_parameters": [
832
+ {
833
+ "name": p[0],
834
+ "type": p[1],
835
+ "input_method": p[2],
836
+ "frequency": p[3]
837
+ } for p in common_params
838
+ ]
839
+ }
840
+ finally:
841
+ conn.close()
842
+
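A self-contained illustration (sample rows invented) of how get_processing_stats() folds the per-document attribute JSON into the attribute_summary nesting of {attribute: {value: document_count}}:

    import json

    attr_dist = [
        ('{"inspection_stage": "final", "complexity": "standard"}', 3),
        ('{"inspection_stage": "incoming", "complexity": "simple"}', 1),
    ]

    attribute_summary = {}
    for attrs_json, count in attr_dist:
        for key, value in json.loads(attrs_json).items():
            attribute_summary.setdefault(key, {}).setdefault(value, 0)
            attribute_summary[key][value] += count

    print(attribute_summary)
    # {'inspection_stage': {'final': 3, 'incoming': 1}, 'complexity': {'standard': 3, 'simple': 1}}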
843
  def get_parameter_patterns(self):
844
+ """Get common parameter patterns for AI reference"""
845
+ conn = sqlite3.connect(self.metadata_db_path)
846
+ cursor = conn.cursor()
847
+
848
+ try:
849
+ cursor.execute("""
850
+ SELECT
851
+ parameter_type,
852
+ input_method,
853
+ GROUP_CONCAT(DISTINCT specifications) as common_specs,
854
+ GROUP_CONCAT(DISTINCT options_list) as common_options,
855
+ COUNT(*) as usage_count
856
+ FROM checklist_parameters
857
+ WHERE specifications != '' OR options_list != ''
858
+ GROUP BY parameter_type, input_method
859
+ ORDER BY usage_count DESC
860
+ """)
861
+
862
+ patterns = []
863
+ for row in cursor.fetchall():
864
+ patterns.append({
865
+ "parameter_type": row[0],
866
+ "input_method": row[1],
867
+ "common_specifications": row[2],
868
+ "common_options": row[3],
869
+ "usage_count": row[4]
870
+ })
871
+
872
+ return patterns
873
+ finally:
874
+ conn.close()
875
+
876
  def search_similar_checklists(self, product_name, checklist_type="", limit=5):
877
+ """Search for similar checklists based on product and type"""
878
+ query_text = f"{product_name} {checklist_type} quality control inspection checklist"
879
+ query_embedding = self.embedder.encode(query_text).tolist()
880
+
881
+ try:
882
+ results = self.collection.query(
883
+ query_embeddings=[query_embedding],
884
+ n_results=limit,
885
+ where={"domain": "Quality Control"}
886
+ )
887
+
888
+ similar_checklists = []
889
+ if results['documents'][0]:
890
+ for i, doc in enumerate(results['documents'][0]):
891
+ metadata = results['metadatas'][0][i]
892
+
893
+ # Parse attributes
894
+ attrs = {}
895
+ if metadata.get('checklist_attributes'):
896
+ try:
897
+ attrs = json.loads(metadata['checklist_attributes'])
898
+ except:
899
+ pass
900
+
901
+ similar_checklists.append({
902
+ "document": metadata.get('source', 'Unknown'),
903
+ "product": metadata.get('product_name', 'Unknown'),
904
+ "type": metadata.get('document_type', 'Unknown'),
905
+ "attributes": attrs,
906
+ "parameters": metadata.get('total_parameters', 0),
907
+ "relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0,
908
+ "content_preview": doc[:200] + "..." if len(doc) > 200 else doc
909
+ })
910
+
911
+ return similar_checklists
912
+ except Exception as e:
913
+ print(f"Error searching checklists: {str(e)}")
914
+ return []
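A hedged usage sketch of the rebuilt search: the import path is a guess from the file location, the query strings are invented, and only keys actually returned above are printed.

    from create_industry_db import ChecklistExamplesVDB  # assumed import path

    db = ChecklistExamplesVDB()
    hits = db.search_similar_checklists("frozen snack", checklist_type="Receiving Inspection", limit=3)
    for hit in hits:
        print(hit["document"], hit["type"], hit["attributes"], round(hit["relevance_score"], 3))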
915
 
916
 
917
  def main():
918
+ """Main function to create/update the checklist examples database"""
919
+ print("Starting Generic Checklist Examples Database Creation...")
920
+ print("Features: No company bias, dynamic attributes, universal patterns")
921
+
922
+ # Initialize database
923
+ db = ChecklistExamplesVDB()
924
+
925
+ # Process all PDFs
926
+ db.process_all_pdfs()
927
+
928
+ # Show processing stats
929
+ print("\n" + "="*60)
930
+ print("PROCESSING STATISTICS (Generic)")
931
+ print("="*60)
932
+
933
+ stats = db.get_processing_stats()
934
+ print(f"Total files processed: {stats['total_processed']}")
935
+ print(f"Successful: {stats['successful']}")
936
+ print(f"Errors: {stats['errors']}")
937
+ print(f"Total parameters extracted: {stats['total_parameters']}")
938
+ print(f"Total sections extracted: {stats['total_sections']}")
939
+
940
+ print("\nDocument Types:")
941
+ for doc_type, count in stats["document_types"].items():
942
+ print(f" - {doc_type}: {count} documents")
943
+
944
+ print("\nDynamic Attributes Found:")
945
+ for attr_type, values in stats["attribute_summary"].items():
946
+ print(f"\n{attr_type}:")
947
+ for value, count in values.items():
948
+ print(f" - {value}: {count} documents")
949
+
950
+ print("\nInput Methods Distribution:")
951
+ for method, count in stats["input_methods"].items():
952
+ print(f" - {method}: {count} parameters")
953
+
954
+ print("\nMost Common Parameters (Generic):")
955
+ for param in stats["common_parameters"]:
956
+ print(f" - {param['name']} ({param['input_method']}) - used {param['frequency']} times")
957
+
958
+ # Show parameter patterns
959
+ print("\n" + "="*60)
960
+ print("PARAMETER PATTERNS DISCOVERED")
961
+ print("="*60)
962
+
963
+ patterns = db.get_parameter_patterns()
964
+ for pattern in patterns[:10]: # Show top 10 patterns
965
+ print(f"\n{pattern['parameter_type']} -> {pattern['input_method']}")
966
+ print(f" Usage: {pattern['usage_count']} times")
967
+ if pattern['common_specifications']:
968
+ specs = pattern['common_specifications'][:100]
969
+ print(f" Common specs: {specs}{'...' if len(pattern['common_specifications']) > 100 else ''}")
970
+ if pattern['common_options']:
971
+ options = pattern['common_options'][:100]
972
+ print(f" Common options: {options}{'...' if len(pattern['common_options']) > 100 else ''}")
973
+
974
+ # Test search functionality
975
+ print("\n" + "="*60)
976
+ print("TESTING SEARCH FUNCTIONALITY")
977
+ print("="*60)
978
+
979
+ test_products = ["Quality Inspection", "Production Check", "Safety Assessment"]
980
+ for product in test_products:
981
+ print(f"\nSearching for '{product}' checklists:")
982
+ similar = db.search_similar_checklists(product, limit=3)
983
+ for i, checklist in enumerate(similar, 1):
984
+ print(f" {i}. {checklist['document']} ({checklist['type']})")
985
+ print(f" Product: {checklist['product']}, Parameters: {checklist['parameters']}")
986
+ if checklist['attributes']:
987
+ print(f" Attributes: {checklist['attributes']}")
988
+ print(f" Relevance: {checklist['relevance_score']:.3f}")
989
 
990
 
991
  if __name__ == "__main__":
992
+ main()
vector_stores/chroma_db/product_specs/create_product_spec_db.py CHANGED
@@ -48,6 +48,7 @@ class ProductSpecificationVectorDB:
48
  conn = sqlite3.connect(self.metadata_db_path)
49
  cursor = conn.cursor()
50
 
51
  cursor.execute("""
52
  CREATE TABLE IF NOT EXISTS product_documents (
53
  id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -56,7 +57,7 @@ class ProductSpecificationVectorDB:
56
  product_name TEXT,
57
  brand TEXT,
58
  supplier TEXT,
59
- product_category TEXT,
60
  shelf_life TEXT,
61
  storage_conditions TEXT,
62
  manufacturing_location TEXT,
@@ -136,12 +137,12 @@ class ProductSpecificationVectorDB:
136
  return hashlib.md5(f.read()).hexdigest()
137
 
138
  def extract_product_metadata(self, text):
139
- """Extract product-specific metadata"""
140
  metadata = {
141
  "product_name": "",
142
  "brand": "",
143
  "supplier": "",
144
- "product_category": "",
145
  "shelf_life": "",
146
  "storage_conditions": "",
147
  "manufacturing_location": "",
@@ -156,16 +157,16 @@ class ProductSpecificationVectorDB:
156
  metadata["product_name"] = re.search(r'Product\s*Name[:]*\s*(.+)', line, re.IGNORECASE).group(1).strip()
157
  break
158
 
159
- # Extract brand
160
  brand_patterns = [
161
  r'Brand[:]*\s*(.+)',
162
- r'Al\s*Kabeer',
163
  r'Company[:]*\s*(.+)'
164
  ]
165
  for pattern in brand_patterns:
166
  match = re.search(pattern, text, re.IGNORECASE)
167
  if match:
168
- metadata["brand"] = match.group(1).strip() if match.groups() else "Al Kabeer"
169
  break
170
 
171
  # Extract shelf life
@@ -192,21 +193,56 @@ class ProductSpecificationVectorDB:
192
  metadata["storage_conditions"] = match.group(1).strip()
193
  break
194
 
195
- # Determine product category
196
- category_keywords = {
197
- "Frozen Food": ["frozen", "iqf", "freeze"],
198
- "Bakery": ["bread", "paratha", "bakery", "dough"],
199
- "Vegetables": ["vegetable", "corn", "peas", "carrot"],
200
- "Snacks": ["samosa", "snack", "fried"],
201
- "Dairy": ["milk", "cheese", "yogurt", "dairy"]
202
  }
203
 
204
  text_lower = text.lower()
205
- for category, keywords in category_keywords.items():
206
  if any(keyword in text_lower for keyword in keywords):
207
- metadata["product_category"] = category
208
  break
209
 
 
210
  return metadata
211
 
212
  def extract_parameters(self, text):
@@ -276,30 +312,30 @@ class ProductSpecificationVectorDB:
276
  return "Text Input"
277
 
278
  def classify_parameter_category(self, param_name):
279
- """Classify parameter into categories"""
280
  param_lower = param_name.lower()
281
 
282
- categories = {
283
- "Physical": ["weight", "size", "dimension", "length", "width", "diameter"],
284
- "Sensory": ["appearance", "color", "texture", "taste", "flavor", "aroma", "odor"],
285
- "Microbiological": ["plate count", "coli", "salmonella", "listeria", "enterobacteriaceae"],
286
- "Chemical": ["moisture", "fat", "protein", "ph", "acid", "chemical"],
287
- "Safety": ["foreign", "contamination", "allergen", "residue"],
288
- "Temperature": ["temperature", "freezing", "cooling", "heating"],
289
- "Packaging": ["packaging", "labeling", "seal", "material"]
290
- }
291
-
292
- for category, keywords in categories.items():
293
- if any(keyword in param_lower for keyword in keywords):
294
- return category
295
-
296
- return "General"
297
 
298
  def is_critical_parameter(self, param_name):
299
  """Determine if parameter is critical for safety/quality"""
300
  critical_keywords = [
301
  "temperature", "microbiological", "pathogen", "salmonella", "listeria",
302
- "foreign", "contamination", "allergen", "weight", "moisture"
303
  ]
304
  return any(keyword in param_name.lower() for keyword in critical_keywords)
305
 
@@ -310,10 +346,11 @@ class ProductSpecificationVectorDB:
310
 
311
  # Look for table headers
312
  table_indicators = [
313
- "MICROBIOLOGICAL SPECIFICATIONS",
314
- "CHEMICAL SPECIFICATIONS",
315
- "PRODUCT CHARACTERISTICS",
316
- "NUTRITIONAL FACTS"
317
  ]
318
 
319
  in_table = False
@@ -340,7 +377,7 @@ class ProductSpecificationVectorDB:
340
 
341
  if len(param_name) > 3 and param_name not in ["PARAMETERS", "ACCEPTED LIMIT"]:
342
  param_type = self.classify_parameter_type(param_name, value, unit)
343
- category = current_table_type.split()[0] if current_table_type else "General"
344
 
345
  parameters.append({
346
  "parameter_name": param_name,
@@ -389,50 +426,38 @@ class ProductSpecificationVectorDB:
389
  return nutritional_data
390
 
391
  def extract_compliance_standards(self, text):
392
- """Extract compliance standards and certifications"""
393
  standards = []
394
 
395
- # Look for standards references
396
  standard_patterns = [
397
- r'(GSO\s*\d+[:/]\d+)',
398
- r'(ISO\s*\d+(?::\d+)?)',
399
- r'(HACCP)',
400
- r'(HALAL)',
401
- r'(FDA)',
402
- r'(SASO\s*Standard)',
403
- r'(EU\s*Regulation)',
404
- r'(AOAC)',
405
- r'(Codex\s*Alimentarius)'
406
  ]
407
 
408
  for pattern in standard_patterns:
409
  matches = re.finditer(pattern, text, re.IGNORECASE)
410
  for match in matches:
411
- standard_code = match.group(1)
412
 
413
- # Determine standard type
414
- if "GSO" in standard_code:
415
- standard_name = "GCC Standardization Organization"
416
- compliance_type = "Regional Standard"
417
- elif "ISO" in standard_code:
418
- standard_name = "International Organization for Standardization"
419
- compliance_type = "International Standard"
420
- elif "HACCP" in standard_code:
421
- standard_name = "Hazard Analysis Critical Control Points"
422
- compliance_type = "Food Safety System"
423
- elif "HALAL" in standard_code:
424
- standard_name = "Halal Certification"
425
- compliance_type = "Religious Compliance"
426
  else:
427
- standard_name = standard_code
428
- compliance_type = "Regulatory Standard"
429
 
430
- standards.append({
431
- "standard_name": standard_name,
432
- "standard_code": standard_code,
433
- "compliance_type": compliance_type,
434
- "requirements": "" # Could be extracted from context
435
- })
 
 
436
 
437
  return standards
438
 
@@ -456,8 +481,14 @@ class ProductSpecificationVectorDB:
456
 
457
  # Add product context to searchable content
458
  searchable_content = f"Product: {metadata.get('product_name', 'Unknown')}\n"
459
- searchable_content += f"Category: {metadata.get('product_category', 'General')}\n\n"
460
- searchable_content += chunk
 
 
461
 
462
  documents.append({
463
  "text": searchable_content,
@@ -511,7 +542,7 @@ class ProductSpecificationVectorDB:
511
  "source": filename,
512
  "product_name": product_metadata["product_name"],
513
  "brand": product_metadata["brand"],
514
- "product_category": product_metadata["product_category"],
515
  "shelf_life": product_metadata["shelf_life"],
516
  "storage_conditions": product_metadata["storage_conditions"],
517
  "file_hash": file_hash,
@@ -546,7 +577,8 @@ class ProductSpecificationVectorDB:
546
  "processed_date": datetime.now().isoformat(),
547
  "product_name": product_metadata["product_name"],
548
  "parameters_extracted": len(parameters),
549
- "compliance_standards": len(compliance_standards)
 
550
  }
551
  self.save_manifest()
552
 
@@ -568,13 +600,14 @@ class ProductSpecificationVectorDB:
568
  try:
569
  cursor.execute("""
570
  INSERT OR REPLACE INTO product_documents
571
- (file_hash, filename, product_name, brand, supplier, product_category,
572
  shelf_life, storage_conditions, manufacturing_location, document_type)
573
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
574
  """, (
575
  file_hash, filename,
576
  metadata["product_name"], metadata["brand"], metadata["supplier"],
577
- metadata["product_category"], metadata["shelf_life"],
 
578
  metadata["storage_conditions"], metadata["manufacturing_location"],
579
  metadata["document_type"]
580
  ))
@@ -615,97 +648,162 @@ class ProductSpecificationVectorDB:
615
  cursor.execute("DELETE FROM nutritional_info WHERE file_hash = ?", (file_hash,))
616
 
617
  for nutrition in nutritional_data:
618
- cursor.execute("""
619
- INSERT INTO nutritional_info
620
- (file_hash, nutrient_name, value_per_100g, daily_value_percent)
621
- VALUES (?, ?, ?, ?)
622
- """, (
623
- file_hash, nutrition["nutrient_name"],
624
- nutrition["value_per_100g"], nutrition["daily_value_percent"]
625
- ))
626
-
627
  conn.commit()
628
  finally:
629
  conn.close()
630
-
631
  def save_compliance_standards(self, file_hash, standards):
632
- """Save compliance standards to SQLite"""
633
- conn = sqlite3.connect(self.metadata_db_path)
634
- cursor = conn.cursor()
635
-
636
- try:
637
- cursor.execute("DELETE FROM compliance_standards WHERE file_hash = ?", (file_hash,))
638
-
639
- for standard in standards:
640
- cursor.execute("""
641
- INSERT INTO compliance_standards
642
- (file_hash, standard_name, standard_code, compliance_type, requirements)
643
- VALUES (?, ?, ?, ?, ?)
644
- """, (
645
- file_hash, standard["standard_name"], standard["standard_code"],
646
- standard["compliance_type"], standard["requirements"]
647
- ))
648
-
649
- conn.commit()
650
- finally:
651
- conn.close()
652
-
653
  def log_processing(self, filename, file_hash, status, error_message, params_count=0, standards_count=0):
654
- """Log processing results"""
655
- conn = sqlite3.connect(self.metadata_db_path)
656
- cursor = conn.cursor()
657
-
658
- try:
659
- cursor.execute("""
660
- INSERT INTO processing_log
661
- (filename, file_hash, status, error_message, parameters_extracted, compliance_standards_extracted)
662
- VALUES (?, ?, ?, ?, ?, ?)
663
- """, (filename, file_hash, status, error_message, params_count, standards_count))
664
-
665
- conn.commit()
666
- finally:
667
- conn.close()
668
-
669
  def ocr_pdf(self, pdf_path):
670
- """OCR fallback for scanned PDFs"""
671
- try:
672
- images = pdf2image.convert_from_path(pdf_path)
673
- full_text = ""
674
-
675
- for i, image in enumerate(images):
676
- text = pytesseract.image_to_string(image)
677
- full_text += f"\n--- Page {i+1} ---\n{text}"
678
-
679
- return full_text
680
- except Exception as e:
681
- print(f"OCR error: {e}")
682
- return ""
683
-
684
  def process_all_pdfs(self):
685
- """Process all product specification PDFs"""
686
- pdf_files = list(self.pdf_path.glob("*.pdf"))
687
-
688
- if not pdf_files:
689
- print(f"No PDF files found in {self.pdf_path}")
690
- return
691
-
692
- print(f"Found {len(pdf_files)} product specification files")
693
-
694
- for pdf_file in pdf_files:
695
- self.process_pdf(pdf_file)
696
-
697
- print(f"Product specification VDB creation complete!")
 
 
698
 
699
 
700
  def main():
701
- """Main function"""
702
- print("Creating Product Specification Vector Database...")
703
-
704
- db = ProductSpecificationVectorDB()
705
- db.process_all_pdfs()
706
-
707
- print("Product Specification VDB ready!")
 
 
708
 
709
 
710
  if __name__ == "__main__":
711
- main()
 
48
  conn = sqlite3.connect(self.metadata_db_path)
49
  cursor = conn.cursor()
50
 
51
+ # UPDATED: Added product_attributes instead of fixed category
52
  cursor.execute("""
53
  CREATE TABLE IF NOT EXISTS product_documents (
54
  id INTEGER PRIMARY KEY AUTOINCREMENT,
 
57
  product_name TEXT,
58
  brand TEXT,
59
  supplier TEXT,
60
+ product_attributes TEXT, -- Dynamic attributes instead of category
61
  shelf_life TEXT,
62
  storage_conditions TEXT,
63
  manufacturing_location TEXT,
 
137
  return hashlib.md5(f.read()).hexdigest()
138
 
139
  def extract_product_metadata(self, text):
140
+ """Extract product-specific metadata without forcing categories"""
141
  metadata = {
142
  "product_name": "",
143
  "brand": "",
144
  "supplier": "",
145
+ "product_attributes": {}, # Dynamic attributes
146
  "shelf_life": "",
147
  "storage_conditions": "",
148
  "manufacturing_location": "",
 
157
  metadata["product_name"] = re.search(r'Product\s*Name[:]*\s*(.+)', line, re.IGNORECASE).group(1).strip()
158
  break
159
 
160
+ # Extract brand (generic)
161
  brand_patterns = [
162
  r'Brand[:]*\s*(.+)',
163
+ r'Manufacturer[:]*\s*(.+)',
164
  r'Company[:]*\s*(.+)'
165
  ]
166
  for pattern in brand_patterns:
167
  match = re.search(pattern, text, re.IGNORECASE)
168
  if match:
169
+ metadata["brand"] = match.group(1).strip()
170
  break
171
 
172
  # Extract shelf life
 
193
  metadata["storage_conditions"] = match.group(1).strip()
194
  break
195
 
196
+ # UPDATED: Extract dynamic product attributes instead of fixed categories
197
+ attributes = {}
198
+
199
+ # Temperature requirements
200
+ temp_match = re.search(r'(?:stored?|kept?|maintain(?:ed)?)\s*at\s*([-\d]+\s*[°]?[CF])', text, re.IGNORECASE)
201
+ if temp_match:
202
+ attributes["temperature_requirement"] = temp_match.group(1)
203
+
204
+ # Processing method
205
+ processing_keywords = {
206
+ "frozen": ["frozen", "freeze", "iqf", "individually quick frozen"],
207
+ "fresh": ["fresh", "chilled", "refrigerated"],
208
+ "dried": ["dried", "dehydrated", "dry"],
209
+ "canned": ["canned", "tinned", "preserved"],
210
+ "fried": ["fried", "deep fried", "oil fried"],
211
+ "baked": ["baked", "oven", "bakery"],
212
+ "raw": ["raw", "uncooked", "unprocessed"],
213
+ "cooked": ["cooked", "pre-cooked", "ready to eat"]
214
  }
215
 
216
  text_lower = text.lower()
217
+ for method, keywords in processing_keywords.items():
218
  if any(keyword in text_lower for keyword in keywords):
219
+ attributes["processing_method"] = method
220
  break
221
 
222
+ # Product form
223
+ form_keywords = {
224
+ "powder": ["powder", "powdered"],
225
+ "liquid": ["liquid", "juice", "syrup"],
226
+ "solid": ["solid", "whole", "pieces"],
227
+ "paste": ["paste", "puree"],
228
+ "granular": ["granular", "granules"]
229
+ }
230
+
231
+ for form, keywords in form_keywords.items():
232
+ if any(keyword in text_lower for keyword in keywords):
233
+ attributes["product_form"] = form
234
+ break
235
+
236
+ # Special characteristics
237
+ if any(word in text_lower for word in ["organic", "natural", "no preservatives"]):
238
+ attributes["special_characteristics"] = "natural/organic"
239
+ if any(word in text_lower for word in ["halal", "kosher"]):
240
+ attributes["certification"] = "religious compliance"
241
+ if any(word in text_lower for word in ["gluten free", "allergen free"]):
242
+ attributes["dietary"] = "allergen-free"
243
+
244
+ metadata["product_attributes"] = json.dumps(attributes)
245
+
246
  return metadata
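# NOTE: Illustrative sketch only, not part of the diff. It mirrors the keyword
# checks used in extract_product_metadata() on one assumed sample line, to show
# what the dynamic attributes end up looking like.
import json
import re

sample = "IQF Green Peas, stored at -18 °C, individually quick frozen whole pieces"
sample_lower = sample.lower()
attributes = {}

# Temperature requirement, using the same regex as above
temp_match = re.search(r'(?:stored?|kept?|maintain(?:ed)?)\s*at\s*([-\d]+\s*[°]?[CF])',
                       sample, re.IGNORECASE)
if temp_match:
    attributes["temperature_requirement"] = temp_match.group(1)

# Processing method: "iqf" and "frozen" both hit the frozen keyword list
if any(k in sample_lower for k in ["frozen", "freeze", "iqf", "individually quick frozen"]):
    attributes["processing_method"] = "frozen"

# Product form: "whole" and "pieces" hit the solid keyword list
if any(k in sample_lower for k in ["solid", "whole", "pieces"]):
    attributes["product_form"] = "solid"

print(json.dumps(attributes))
# -> {"temperature_requirement": "-18 \u00b0C", "processing_method": "frozen", "product_form": "solid"}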
247
 
248
  def extract_parameters(self, text):
 
312
  return "Text Input"
313
 
314
  def classify_parameter_category(self, param_name):
315
+ """Classify parameter into categories dynamically"""
316
  param_lower = param_name.lower()
317
 
318
+ # Dynamic categorization based on parameter nature
319
+ if any(word in param_lower for word in ["weight", "size", "dimension", "length", "width"]):
320
+ return "Physical Measurement"
321
+ elif any(word in param_lower for word in ["appearance", "color", "texture", "taste", "flavor"]):
322
+ return "Sensory Attribute"
323
+ elif any(word in param_lower for word in ["bacteria", "microb", "pathogen", "coli"]):
324
+ return "Microbiological"
325
+ elif any(word in param_lower for word in ["moisture", "fat", "protein", "ph", "acid"]):
326
+ return "Chemical Composition"
327
+ elif any(word in param_lower for word in ["foreign", "contamination", "hazard"]):
328
+ return "Safety Parameter"
329
+ elif any(word in param_lower for word in ["temperature", "thermal"]):
330
+ return "Temperature Control"
331
+ else:
332
+ return "Quality Parameter"
333
 
334
  def is_critical_parameter(self, param_name):
335
  """Determine if parameter is critical for safety/quality"""
336
  critical_keywords = [
337
  "temperature", "microbiological", "pathogen", "salmonella", "listeria",
338
+ "foreign", "contamination", "allergen", "critical"
339
  ]
340
  return any(keyword in param_name.lower() for keyword in critical_keywords)
341
 
 
346
 
347
  # Look for table headers
348
  table_indicators = [
349
+ "SPECIFICATIONS",
350
+ "PARAMETERS",
351
+ "CHARACTERISTICS",
352
+ "REQUIREMENTS",
353
+ "LIMITS"
354
  ]
355
 
356
  in_table = False
 
377
 
378
  if len(param_name) > 3 and param_name not in ["PARAMETERS", "ACCEPTED LIMIT"]:
379
  param_type = self.classify_parameter_type(param_name, value, unit)
380
+ category = self.classify_parameter_category(param_name)
381
 
382
  parameters.append({
383
  "parameter_name": param_name,
 
426
  return nutritional_data
427
 
428
  def extract_compliance_standards(self, text):
429
+ """Extract compliance standards and certifications generically"""
430
  standards = []
431
 
432
+ # Generic standard patterns
433
  standard_patterns = [
434
+ r'(?:complies?\s*with|as\s*per|according\s*to)\s*([A-Z]+\s*\d+[:/]?\d*)',
435
+ r'(?:standard|specification)\s*:?\s*([A-Z]+\s*\d+[:/]?\d*)',
436
+ r'(?:certified|certification)\s*:?\s*([A-Za-z\s]+)',
437
+ r'([A-Z]{2,}\s*\d+(?::\d+)?)', # Generic standard format
 
 
438
  ]
439
 
440
  for pattern in standard_patterns:
441
  matches = re.finditer(pattern, text, re.IGNORECASE)
442
  for match in matches:
443
+ standard_ref = match.group(1).strip()
444
 
445
+ # Generic classification
446
+ if re.match(r'^[A-Z]{2,4}\s*\d+', standard_ref):
447
+ standard_name = "Industry Standard"
448
+ compliance_type = "Technical Standard"
 
 
449
  else:
450
+ standard_name = standard_ref
451
+ compliance_type = "Certification"
452
 
453
+ # Avoid duplicates
454
+ if not any(s["standard_code"] == standard_ref for s in standards):
455
+ standards.append({
456
+ "standard_name": standard_name,
457
+ "standard_code": standard_ref,
458
+ "compliance_type": compliance_type,
459
+ "requirements": ""
460
+ })
461
 
462
  return standards
463
 
 
481
 
482
  # Add product context to searchable content
483
  searchable_content = f"Product: {metadata.get('product_name', 'Unknown')}\n"
484
+
485
+ # Add dynamic attributes
486
+ if metadata.get('product_attributes'):
487
+ attrs = json.loads(metadata['product_attributes'])
488
+ if attrs:
489
+ searchable_content += f"Attributes: {', '.join(f'{k}={v}' for k, v in attrs.items())}\n"
490
+
491
+ searchable_content += f"\n{chunk}"
492
 
493
  documents.append({
494
  "text": searchable_content,
 
542
  "source": filename,
543
  "product_name": product_metadata["product_name"],
544
  "brand": product_metadata["brand"],
545
+ "product_attributes": product_metadata["product_attributes"], # Dynamic attributes
546
  "shelf_life": product_metadata["shelf_life"],
547
  "storage_conditions": product_metadata["storage_conditions"],
548
  "file_hash": file_hash,
 
577
  "processed_date": datetime.now().isoformat(),
578
  "product_name": product_metadata["product_name"],
579
  "parameters_extracted": len(parameters),
580
+ "compliance_standards": len(compliance_standards),
581
+ "attributes": product_metadata["product_attributes"]
582
  }
583
  self.save_manifest()
584
 
 
600
  try:
601
  cursor.execute("""
602
  INSERT OR REPLACE INTO product_documents
603
+ (file_hash, filename, product_name, brand, supplier, product_attributes,
604
  shelf_life, storage_conditions, manufacturing_location, document_type)
605
  VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
606
  """, (
607
  file_hash, filename,
608
  metadata["product_name"], metadata["brand"], metadata["supplier"],
609
+ metadata["product_attributes"], # Dynamic attributes as JSON
610
+ metadata["shelf_life"],
611
  metadata["storage_conditions"], metadata["manufacturing_location"],
612
  metadata["document_type"]
613
  ))
 
648
  cursor.execute("DELETE FROM nutritional_info WHERE file_hash = ?", (file_hash,))
649
 
650
  for nutrition in nutritional_data:
651
+ cursor.execute("""INSERT INTO nutritional_info
652
+ (file_hash, nutrient_name, value_per_100g, daily_value_percent)
653
+ VALUES (?, ?, ?, ?)
654
+ """, (
655
+ file_hash, nutrition["nutrient_name"],
656
+ nutrition["value_per_100g"], nutrition["daily_value_percent"]
657
+ ))
658
+
 
659
  conn.commit()
660
  finally:
661
  conn.close()
662
+
663
  def save_compliance_standards(self, file_hash, standards):
664
+ """Save compliance standards to SQLite"""
665
+ conn = sqlite3.connect(self.metadata_db_path)
666
+ cursor = conn.cursor()
667
+
668
+ try:
669
+ cursor.execute("DELETE FROM compliance_standards WHERE file_hash = ?", (file_hash,))
670
+
671
+ for standard in standards:
672
+ cursor.execute("""
673
+ INSERT INTO compliance_standards
674
+ (file_hash, standard_name, standard_code, compliance_type, requirements)
675
+ VALUES (?, ?, ?, ?, ?)
676
+ """, (
677
+ file_hash, standard["standard_name"], standard["standard_code"],
678
+ standard["compliance_type"], standard["requirements"]
679
+ ))
680
+
681
+ conn.commit()
682
+ finally:
683
+ conn.close()
684
+
685
  def log_processing(self, filename, file_hash, status, error_message, params_count=0, standards_count=0):
686
+ """Log processing results"""
687
+ conn = sqlite3.connect(self.metadata_db_path)
688
+ cursor = conn.cursor()
689
+
690
+ try:
691
+ cursor.execute("""
692
+ INSERT INTO processing_log
693
+ (filename, file_hash, status, error_message, parameters_extracted, compliance_standards_extracted)
694
+ VALUES (?, ?, ?, ?, ?, ?)
695
+ """, (filename, file_hash, status, error_message, params_count, standards_count))
696
+
697
+ conn.commit()
698
+ finally:
699
+ conn.close()
700
+
701
  def ocr_pdf(self, pdf_path):
702
+ """OCR fallback for scanned PDFs"""
703
+ try:
704
+ images = pdf2image.convert_from_path(pdf_path)
705
+ full_text = ""
706
+
707
+ for i, image in enumerate(images):
708
+ text = pytesseract.image_to_string(image)
709
+ full_text += f"\n--- Page {i+1} ---\n{text}"
710
+
711
+ return full_text
712
+ except Exception as e:
713
+ print(f"OCR error: {e}")
714
+ return ""
715
+
716
  def process_all_pdfs(self):
717
+ """Process all product specification PDFs"""
718
+ pdf_files = list(self.pdf_path.glob("*.pdf"))
719
+
720
+ if not pdf_files:
721
+ print(f"No PDF files found in {self.pdf_path}")
722
+ return
723
+
724
+ print(f"Found {len(pdf_files)} product specification files")
725
+
726
+ for pdf_file in pdf_files:
727
+ self.process_pdf(pdf_file)
728
+
729
+ print(f"Product specification VDB creation complete!")
730
+
731
+ def get_processing_stats(self):
732
+ """Get processing statistics"""
733
+ conn = sqlite3.connect(self.metadata_db_path)
734
+ cursor = conn.cursor()
735
+
736
+ try:
737
+ # Overall stats
738
+ cursor.execute("""
739
+ SELECT COUNT(*) as total,
740
+ SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
741
+ SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
742
+ SUM(parameters_extracted) as total_parameters
743
+ FROM processing_log
744
+ """)
745
+
746
+ stats = cursor.fetchone()
747
+
748
+ # Get attribute distribution
749
+ cursor.execute("""
750
+ SELECT product_attributes, COUNT(*) as count
751
+ FROM product_documents
752
+ WHERE product_attributes IS NOT NULL
753
+ GROUP BY product_attributes
754
+ """)
755
+
756
+ attribute_dist = cursor.fetchall()
757
+
758
+ # Parse attributes to get summary
759
+ attribute_summary = {}
760
+ for attrs_json, count in attribute_dist:
761
+ if attrs_json:
762
+ try:
763
+ attrs = json.loads(attrs_json)
764
+ for key, value in attrs.items():
765
+ if key not in attribute_summary:
766
+ attribute_summary[key] = {}
767
+ if value not in attribute_summary[key]:
768
+ attribute_summary[key][value] = 0
769
+ attribute_summary[key][value] += count
770
+ except:
771
+ pass
772
+
773
+ return {
774
+ "total_processed": stats[0] or 0,
775
+ "successful": stats[1] or 0,
776
+ "errors": stats[2] or 0,
777
+ "total_parameters": stats[3] or 0,
778
+ "attribute_summary": attribute_summary
779
+ }
780
+ finally:
781
+ conn.close()
782
 
783
 
784
  def main():
785
+ """Main function"""
786
+ print("Creating Product Specification Vector Database...")
787
+ print("Features: Dynamic product attributes, no fixed categories")
788
+
789
+ db = ProductSpecificationVectorDB()
790
+ db.process_all_pdfs()
791
+
792
+ # Show stats
793
+ stats = db.get_processing_stats()
794
+ print(f"\n📊 Processing Statistics:")
795
+ print(f"Total files: {stats['total_processed']}")
796
+ print(f"Successful: {stats['successful']}")
797
+ print(f"Total parameters: {stats['total_parameters']}")
798
+
799
+ print(f"\n🏷️ Dynamic Product Attributes Found:")
800
+ for attr_type, values in stats['attribute_summary'].items():
801
+ print(f"\n{attr_type}:")
802
+ for value, count in values.items():
803
+ print(f" - {value}: {count} products")
804
+
805
+ print("\nProduct Specification VDB ready!")
806
 
807
 
808
  if __name__ == "__main__":
809
+ main()
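A minimal sketch, assuming the table and column names from this diff, of how the JSON-encoded product_attributes column could be queried back out of the metadata database; the database path, helper name, and example filter value are illustrative assumptions, not part of the commit.

import json
import sqlite3

# Assumed location of the product-spec metadata DB; adjust to the path the class actually uses.
DB_PATH = "./vector_stores/product_specifications/metadata/product_metadata.db"

def find_products_by_attribute(key, value, db_path=DB_PATH):
    """Return (product_name, attributes) pairs whose stored JSON attributes contain key=value."""
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute(
            "SELECT product_name, product_attributes FROM product_documents "
            "WHERE product_attributes IS NOT NULL"
        )
        matches = []
        for name, attrs_json in cursor.fetchall():
            try:
                attrs = json.loads(attrs_json)
            except (TypeError, json.JSONDecodeError):
                continue  # skip rows whose attribute JSON is missing or malformed
            if attrs.get(key) == value:
                matches.append((name, attrs))
        return matches
    finally:
        conn.close()

# Example: list every product the extractor tagged as frozen.
for name, attrs in find_products_by_attribute("processing_method", "frozen"):
    print(name, attrs)

On SQLite builds that include the JSON1 extension the filter could be pushed into SQL with json_extract, but looping in Python keeps the sketch independent of that.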
vector_stores/chroma_db/regulatory_docs/create_regulatory_db.py CHANGED
@@ -1,578 +1,3 @@
1
- # import os
2
- # import json
3
- # import sqlite3
4
- # from datetime import datetime
5
- # from pathlib import Path
6
- # import chromadb
7
- # from chromadb import Settings
8
- # from langchain_community.document_loaders import PyPDFLoader
9
- # from langchain.text_splitter import RecursiveCharacterTextSplitter
10
- # from sentence_transformers import SentenceTransformer
11
- # import pytesseract
12
- # from PIL import Image
13
- # import pdf2image
14
- # import hashlib
15
- # import re
16
-
17
- # class RegulatoryGuidelinesDB:
18
- # def __init__(self, base_path="./vector_stores"):
19
- # self.base_path = Path(base_path)
20
- # self.pdf_path = self.base_path / "regulatory_guidelines" / "pdfs"
21
- # self.chroma_path = self.base_path / "chroma_db" / "regulatory_docs"
22
- # self.metadata_path = self.chroma_path / "metadata"
23
- # self.manifest_path = self.metadata_path / "manifest.json"
24
- # self.metadata_db_path = self.metadata_path / "regulatory_metadata.db"
25
-
26
- # # Create directories
27
- # self.pdf_path.mkdir(parents=True, exist_ok=True)
28
- # self.chroma_path.mkdir(parents=True, exist_ok=True)
29
- # self.metadata_path.mkdir(parents=True, exist_ok=True)
30
-
31
- # # Initialize ChromaDB
32
- # self.client = chromadb.PersistentClient(
33
- # path=str(self.chroma_path),
34
- # settings=Settings(anonymized_telemetry=False)
35
- # )
36
-
37
- # # Get or create collection
38
- # self.collection = self.client.get_or_create_collection(
39
- # name="regulatory_guidelines",
40
- # metadata={"description": "Regulatory guidelines and standards for QC"}
41
- # )
42
-
43
- # # Initialize embedding model
44
- # self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
45
-
46
- # # Initialize metadata database
47
- # self.init_metadata_db()
48
-
49
- # # Load manifest
50
- # self.manifest = self.load_manifest()
51
-
52
- # def init_metadata_db(self):
53
- # """Initialize SQLite database for storing regulatory metadata"""
54
- # conn = sqlite3.connect(self.metadata_db_path)
55
- # cursor = conn.cursor()
56
-
57
- # cursor.execute("""
58
- # CREATE TABLE IF NOT EXISTS regulatory_documents (
59
- # id INTEGER PRIMARY KEY AUTOINCREMENT,
60
- # file_hash TEXT UNIQUE NOT NULL,
61
- # filename TEXT NOT NULL,
62
- # regulatory_body TEXT,
63
- # standard_type TEXT,
64
- # standard_code TEXT,
65
- # publication_date TEXT,
66
- # effective_date TEXT,
67
- # jurisdiction TEXT,
68
- # industry TEXT,
69
- # extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
70
- # )
71
- # """)
72
-
73
- # cursor.execute("""
74
- # CREATE TABLE IF NOT EXISTS processing_log (
75
- # id INTEGER PRIMARY KEY AUTOINCREMENT,
76
- # filename TEXT NOT NULL,
77
- # file_hash TEXT NOT NULL,
78
- # processed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
79
- # status TEXT,
80
- # error_message TEXT,
81
- # text_length INTEGER,
82
- # chunk_count INTEGER
83
- # )
84
- # """)
85
-
86
- # cursor.execute("""
87
- # CREATE TABLE IF NOT EXISTS key_topics (
88
- # id INTEGER PRIMARY KEY AUTOINCREMENT,
89
- # file_hash TEXT NOT NULL,
90
- # topic TEXT NOT NULL,
91
- # relevance_score REAL,
92
- # FOREIGN KEY (file_hash) REFERENCES regulatory_documents(file_hash)
93
- # )
94
- # """)
95
-
96
- # conn.commit()
97
- # conn.close()
98
-
99
- # def load_manifest(self):
100
- # """Load processing manifest"""
101
- # if self.manifest_path.exists():
102
- # with open(self.manifest_path, 'r') as f:
103
- # return json.load(f)
104
- # return {"processed_files": {}, "last_updated": None}
105
-
106
- # def save_manifest(self):
107
- # """Save processing manifest"""
108
- # self.manifest["last_updated"] = datetime.now().isoformat()
109
- # with open(self.manifest_path, 'w') as f:
110
- # json.dump(self.manifest, f, indent=2)
111
-
112
- # def get_file_hash(self, filepath):
113
- # """Generate hash for file to track changes"""
114
- # with open(filepath, 'rb') as f:
115
- # return hashlib.md5(f.read()).hexdigest()
116
-
117
- # def extract_metadata_from_pdf(self, pdf_path, text_content):
118
- # """Extract regulatory metadata from PDF"""
119
- # metadata = {
120
- # "regulatory_body": "Unknown",
121
- # "standard_type": "Document",
122
- # "standard_code": "",
123
- # "publication_date": "",
124
- # "effective_date": "",
125
- # "jurisdiction": "General",
126
- # "industry": "General"
127
- # }
128
-
129
- # # Extract regulatory body
130
- # regulatory_bodies = {
131
- # "Dubai Municipality": ["dubai municipality", "dm ", "بلدية دبي", "@dmunicipality", "food safety department"],
132
- # "HACCP": ["haccp", "hazard analysis"],
133
- # "ISO": ["iso ", "international organization"],
134
- # "GSO": ["gso ", "gcc standardization", "gulf standard"],
135
- # "FDA": ["fda", "food and drug administration"],
136
- # "ESMA": ["esma", "emirates authority for standardization", "emirates standardisation"],
137
- # "SASO": ["saso", "saudi standards"],
138
- # "UAE Ministry": ["uae ministry", "ministry of", "الإمارات العربية المتحدة", "ministry of environment and water"],
139
- # "Federal Law": ["federal law", "uae law", "united arab emirates law"],
140
- # "DHA": ["dubai health authority", "dha"],
141
- # "Ministry of Health": ["ministry of health and prevention", "mohp"]
142
- # }
143
-
144
- # text_lower = text_content.lower()
145
- # for body, patterns in regulatory_bodies.items():
146
- # if any(pattern in text_lower for pattern in patterns):
147
- # metadata["regulatory_body"] = body
148
- # break
149
-
150
- # # Extract standard code (e.g., ISO 22000, GSO 2055)
151
- # standard_patterns = [
152
- # r"(ISO\s*\d+(?::\d+)?)",
153
- # r"(GSO\s*\d+(?:/\d+)?)",
154
- # r"(HACCP\s*(?:Rev\s*\d+)?)",
155
- # r"(DM[-/]\d+)",
156
- # r"(ESMA\s*\d+)",
157
- # r"(FDA\s*\d+)",
158
- # r"(Edition\s*\d+)", # For Dubai Municipality documents
159
- # r"(Federal Law No\.\s*\d+\s*of\s*\d+)", # For UAE Federal Laws
160
- # r"(Circular\s*(?:No\.)?\s*\d+)",
161
- # ]
162
-
163
- # for pattern in standard_patterns:
164
- # matches = re.findall(pattern, text_content, re.IGNORECASE)
165
- # if matches:
166
- # metadata["standard_code"] = matches[0]
167
- # break
168
-
169
- # # Extract publication/effective dates
170
- # date_patterns = [
171
- # r"(?:publication|published|issue)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
172
- # r"(?:effective|validity)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
173
- # r"(?:date|dated)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
174
- # ]
175
-
176
- # for i, pattern in enumerate(date_patterns):
177
- # matches = re.findall(pattern, text_content, re.IGNORECASE)
178
- # if matches:
179
- # if i == 0:
180
- # metadata["publication_date"] = matches[0]
181
- # elif i == 1:
182
- # metadata["effective_date"] = matches[0]
183
- # else:
184
- # if not metadata["publication_date"]:
185
- # metadata["publication_date"] = matches[0]
186
-
187
- # # Extract jurisdiction
188
- # jurisdictions = {
189
- # "UAE": ["uae", "united arab emirates", "الإمارات"],
190
- # "Dubai": ["dubai", "دبي"],
191
- # "GCC": ["gcc", "gulf cooperation council", "مجلس التعاون الخليجي"],
192
- # "International": ["international", "global"],
193
- # }
194
-
195
- # for jurisdiction, patterns in jurisdictions.items():
196
- # if any(pattern in text_lower for pattern in patterns):
197
- # metadata["jurisdiction"] = jurisdiction
198
- # break
199
-
200
- # # Determine industry/domain
201
- # industry_keywords = {
202
- # "Food": ["food", "beverage", "nutrition", "edible", "consumption"],
203
- # "Pharmaceutical": ["pharmaceutical", "drug", "medicine", "pharma"],
204
- # "Cosmetics": ["cosmetic", "beauty", "personal care"],
205
- # "Medical Devices": ["medical device", "medical equipment"],
206
- # "General Manufacturing": ["manufacturing", "production", "industrial"]
207
- # }
208
-
209
- # for industry, keywords in industry_keywords.items():
210
- # if any(keyword in text_lower for keyword in keywords):
211
- # metadata["industry"] = industry
212
- # break
213
-
214
- # # If no industry detected, default to Food (since this is for Swift Check)
215
- # if not metadata["industry"]:
216
- # metadata["industry"] = "Food"
217
-
218
- # # Determine standard type
219
- # if "haccp" in text_lower:
220
- # metadata["standard_type"] = "Food Safety Management"
221
- # elif "iso" in text_lower:
222
- # metadata["standard_type"] = "International Standard"
223
- # elif "municipal" in text_lower or "municipality" in text_lower:
224
- # metadata["standard_type"] = "Local Regulation"
225
- # elif "ministry" in text_lower:
226
- # metadata["standard_type"] = "Government Regulation"
227
- # else:
228
- # metadata["standard_type"] = "Industry Standard"
229
-
230
- # return metadata
231
-
232
- # def ocr_pdf(self, pdf_path):
233
- # """Use OCR to extract text from PDF"""
234
- # try:
235
- # # Convert PDF to images
236
- # images = pdf2image.convert_from_path(pdf_path)
237
- # full_text = ""
238
-
239
- # for i, image in enumerate(images):
240
- # # Perform OCR
241
- # text = pytesseract.image_to_string(image)
242
- # full_text += f"\n--- Page {i+1} ---\n{text}"
243
-
244
- # return full_text
245
- # except Exception as e:
246
- # print(f"OCR error: {e}")
247
- # return ""
248
-
249
- # def extract_key_topics(self, text):
250
- # """Extract key regulatory topics from text"""
251
- # topics = set()
252
-
253
- # # Define topic patterns
254
- # topic_patterns = {
255
- # "Temperature Control": ["temperature", "cold chain", "frozen", "refrigerated", "cooling"],
256
- # "Packaging Requirements": ["packaging", "labeling", "package", "container"],
257
- # "Microbiological Standards": ["microbiological", "bacteria", "pathogen", "contamination"],
258
- # "Chemical Requirements": ["chemical", "pesticide", "residue", "additive", "preservative"],
259
- # "Traceability": ["traceability", "track", "batch", "lot number"],
260
- # "Storage Requirements": ["storage", "warehouse", "shelf life"],
261
- # "Transportation": ["transport", "distribution", "delivery"],
262
- # "Documentation": ["documentation", "record", "certificate", "report"],
263
- # "Testing Requirements": ["testing", "analysis", "laboratory", "sample"],
264
- # "Hygiene Standards": ["hygiene", "sanitation", "cleaning", "disinfection"],
265
- # "HACCP Principles": ["haccp", "critical control", "hazard analysis"],
266
- # "Certification": ["certification", "accreditation", "approval", "license"],
267
- # "Compliance": ["compliance", "conform", "requirement", "specification"],
268
- # "Quality Management": ["quality management", "qms", "quality system"],
269
- # "Risk Assessment": ["risk assessment", "risk analysis", "hazard"],
270
- # # COVID-19 specific topics
271
- # "COVID-19 Guidelines": ["covid-19", "coronavirus", "pandemic", "quarantine"],
272
- # "Social Distancing": ["social distancing", "physical distancing", "2 meters"],
273
- # "PPE Requirements": ["ppe", "personal protective equipment", "masks", "gloves"],
274
- # "Employee Health": ["employee health", "health screening", "symptoms"],
275
- # "Disinfection": ["disinfection", "sanitization", "cleaning and disinfection"],
276
- # # Food specific topics
277
- # "Food Safety": ["food safety", "food hygiene", "food handling"],
278
- # "Halal Requirements": ["halal", "islamic", "sharia"],
279
- # "Allergen Management": ["allergen", "allergy", "contains", "may contain"],
280
- # "Import/Export": ["import", "export", "customs", "border"]
281
- # }
282
-
283
- # text_lower = text.lower()
284
-
285
- # for topic, keywords in topic_patterns.items():
286
- # # Calculate relevance score based on keyword frequency
287
- # count = sum(1 for keyword in keywords if keyword in text_lower)
288
- # if count > 0:
289
- # relevance_score = count / len(keywords)
290
- # topics.add((topic, relevance_score))
291
-
292
- # return list(topics)
293
-
294
- # def create_chunks(self, text, metadata):
295
- # """Create text chunks for vector storage"""
296
- # text_splitter = RecursiveCharacterTextSplitter(
297
- # chunk_size=1500, # Larger chunks for regulatory docs
298
- # chunk_overlap=300,
299
- # length_function=len,
300
- # separators=["\n\n", "\n", ". ", " ", ""]
301
- # )
302
-
303
- # chunks = text_splitter.split_text(text)
304
- # documents = []
305
-
306
- # for i, chunk in enumerate(chunks):
307
- # doc_metadata = metadata.copy()
308
- # doc_metadata["chunk_index"] = i
309
- # doc_metadata["chunk_size"] = len(chunk)
310
- # doc_metadata["total_chunks"] = len(chunks)
311
- # documents.append({
312
- # "text": chunk,
313
- # "metadata": doc_metadata
314
- # })
315
-
316
- # return documents
317
-
318
- # def process_pdf(self, pdf_path):
319
- # """Process a single PDF file"""
320
- # pdf_path = Path(pdf_path)
321
- # file_hash = self.get_file_hash(pdf_path)
322
- # filename = pdf_path.name
323
-
324
- # # Check if already processed
325
- # if filename in self.manifest["processed_files"]:
326
- # if self.manifest["processed_files"][filename]["hash"] == file_hash:
327
- # print(f"Skipping {filename} - already processed")
328
- # return
329
-
330
- # print(f"Processing {filename}...")
331
-
332
- # try:
333
- # # Load PDF content
334
- # loader = PyPDFLoader(str(pdf_path))
335
- # pages = loader.load()
336
-
337
- # # Combine all pages
338
- # full_text = ""
339
- # for i, page in enumerate(pages):
340
- # full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
341
-
342
- # # If text is too short, use OCR
343
- # if len(full_text.strip()) < 100:
344
- # print(f"Using OCR for {filename}")
345
- # ocr_text = self.ocr_pdf(pdf_path)
346
- # if len(ocr_text) > len(full_text):
347
- # full_text = ocr_text
348
-
349
- # # Extract regulatory metadata
350
- # reg_metadata = self.extract_metadata_from_pdf(pdf_path, full_text)
351
-
352
- # # Extract key topics
353
- # topics = self.extract_key_topics(full_text)
354
-
355
- # # Create base metadata for chunks
356
- # metadata = {
357
- # "source": filename,
358
- # "regulatory_body": reg_metadata["regulatory_body"] or "Unknown",
359
- # "standard_type": reg_metadata["standard_type"] or "Unknown",
360
- # "standard_code": reg_metadata["standard_code"] or "",
361
- # "jurisdiction": reg_metadata["jurisdiction"] or "Unknown",
362
- # "industry": reg_metadata["industry"] or "General",
363
- # "publication_date": reg_metadata["publication_date"] or "",
364
- # "effective_date": reg_metadata["effective_date"] or "",
365
- # "file_hash": file_hash,
366
- # "processed_date": datetime.now().isoformat(),
367
- # "topics": ", ".join([topic[0] for topic in topics]) if topics else ""
368
- # }
369
-
370
- # # Create chunks
371
- # documents = self.create_chunks(full_text, metadata)
372
-
373
- # # Generate embeddings and store in ChromaDB
374
- # for i, doc in enumerate(documents):
375
- # embedding = self.embedder.encode(doc["text"]).tolist()
376
-
377
- # self.collection.add(
378
- # documents=[doc["text"]],
379
- # embeddings=[embedding],
380
- # metadatas=[doc["metadata"]],
381
- # ids=[f"{file_hash}_{i}"]
382
- # )
383
-
384
- # # Store metadata in SQLite
385
- # self.save_metadata(file_hash, filename, reg_metadata)
386
-
387
- # # Store topics
388
- # self.save_topics(file_hash, topics)
389
-
390
- # # Update manifest
391
- # self.manifest["processed_files"][filename] = {
392
- # "hash": file_hash,
393
- # "processed_date": datetime.now().isoformat(),
394
- # "chunks": len(documents),
395
- # "text_length": len(full_text),
396
- # "regulatory_body": reg_metadata["regulatory_body"],
397
- # "standard_code": reg_metadata["standard_code"]
398
- # }
399
- # self.save_manifest()
400
-
401
- # # Log success
402
- # self.log_processing(filename, file_hash, "SUCCESS", None, len(full_text), len(documents))
403
-
404
- # print(f"Successfully processed {filename}")
405
- # print(f" - Regulatory Body: {reg_metadata['regulatory_body']}")
406
- # print(f" - Standard Code: {reg_metadata['standard_code']}")
407
- # print(f" - Text chunks: {len(documents)}")
408
- # print(f" - Topics extracted: {len(topics)}")
409
-
410
- # except Exception as e:
411
- # error_msg = str(e)
412
- # print(f"Error processing {filename}: {error_msg}")
413
- # import traceback
414
- # traceback.print_exc()
415
- # self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
416
-
417
- # def save_metadata(self, file_hash, filename, metadata):
418
- # """Save regulatory metadata to SQLite"""
419
- # conn = sqlite3.connect(self.metadata_db_path)
420
- # cursor = conn.cursor()
421
-
422
- # try:
423
- # cursor.execute("""
424
- # INSERT OR REPLACE INTO regulatory_documents
425
- # (file_hash, filename, regulatory_body, standard_type, standard_code,
426
- # publication_date, effective_date, jurisdiction, industry)
427
- # VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
428
- # """, (
429
- # file_hash,
430
- # filename,
431
- # metadata["regulatory_body"] or "Unknown",
432
- # metadata["standard_type"] or "Document",
433
- # metadata["standard_code"] or "",
434
- # metadata["publication_date"] or "",
435
- # metadata["effective_date"] or "",
436
- # metadata["jurisdiction"] or "General",
437
- # metadata["industry"] or "General"
438
- # ))
439
-
440
- # conn.commit()
441
- # finally:
442
- # conn.close()
443
-
444
- # def save_topics(self, file_hash, topics):
445
- # """Save extracted topics to SQLite"""
446
- # conn = sqlite3.connect(self.metadata_db_path)
447
- # cursor = conn.cursor()
448
-
449
- # try:
450
- # # Delete existing topics for this file
451
- # cursor.execute("DELETE FROM key_topics WHERE file_hash = ?", (file_hash,))
452
-
453
- # # Insert new topics
454
- # for topic, relevance_score in topics:
455
- # cursor.execute("""
456
- # INSERT INTO key_topics
457
- # (file_hash, topic, relevance_score)
458
- # VALUES (?, ?, ?)
459
- # """, (file_hash, topic, relevance_score))
460
-
461
- # conn.commit()
462
- # finally:
463
- # conn.close()
464
-
465
- # def log_processing(self, filename, file_hash, status, error_message, text_length=0, chunk_count=0):
466
- # """Log processing status"""
467
- # conn = sqlite3.connect(self.metadata_db_path)
468
- # cursor = conn.cursor()
469
-
470
- # try:
471
- # cursor.execute("""
472
- # INSERT INTO processing_log
473
- # (filename, file_hash, status, error_message, text_length, chunk_count)
474
- # VALUES (?, ?, ?, ?, ?, ?)
475
- # """, (filename, file_hash, status, error_message, text_length, chunk_count))
476
-
477
- # conn.commit()
478
- # finally:
479
- # conn.close()
480
-
481
- # def process_all_pdfs(self):
482
- # """Process all PDFs in the directory"""
483
- # pdf_files = list(self.pdf_path.glob("*.pdf"))
484
-
485
- # if not pdf_files:
486
- # print(f"No PDF files found in {self.pdf_path}")
487
- # return
488
-
489
- # print(f"Found {len(pdf_files)} PDF files")
490
-
491
- # for pdf_file in pdf_files:
492
- # self.process_pdf(pdf_file)
493
-
494
- # print("\nProcessing complete!")
495
- # print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
496
-
497
- # def get_processing_stats(self):
498
- # """Get processing statistics"""
499
- # conn = sqlite3.connect(self.metadata_db_path)
500
- # cursor = conn.cursor()
501
-
502
- # try:
503
- # # Get overall stats
504
- # cursor.execute("""
505
- # SELECT COUNT(*) as total,
506
- # SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
507
- # SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors
508
- # FROM processing_log
509
- # """)
510
-
511
- # stats = cursor.fetchone()
512
-
513
- # # Get regulatory body distribution
514
- # cursor.execute("""
515
- # SELECT regulatory_body, COUNT(*) as count
516
- # FROM regulatory_documents
517
- # GROUP BY regulatory_body
518
- # ORDER BY count DESC
519
- # """)
520
-
521
- # body_dist = cursor.fetchall()
522
-
523
- # # Get top topics
524
- # cursor.execute("""
525
- # SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance
526
- # FROM key_topics
527
- # GROUP BY topic
528
- # ORDER BY count DESC
529
- # LIMIT 10
530
- # """)
531
-
532
- # top_topics = cursor.fetchall()
533
-
534
- # return {
535
- # "total_processed": stats[0],
536
- # "successful": stats[1],
537
- # "errors": stats[2],
538
- # "regulatory_bodies": dict(body_dist),
539
- # "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2]} for t in top_topics]
540
- # }
541
- # finally:
542
- # conn.close()
543
-
544
-
545
- # def main():
546
- # """Main function to create/update the regulatory guidelines database"""
547
- # print("Starting Regulatory Guidelines Database Creation...")
548
-
549
- # # Initialize database
550
- # db = RegulatoryGuidelinesDB()
551
-
552
- # # Process all PDFs
553
- # db.process_all_pdfs()
554
-
555
- # # Show processing stats
556
- # print("\nProcessing Statistics:")
557
- # stats = db.get_processing_stats()
558
- # print(f"Total files processed: {stats['total_processed']}")
559
- # print(f"Successful: {stats['successful']}")
560
- # print(f"Errors: {stats['errors']}")
561
-
562
- # print("\nRegulatory Bodies:")
563
- # for body, count in stats["regulatory_bodies"].items():
564
- # print(f" - {body}: {count} documents")
565
-
566
- # print("\nTop Topics:")
567
- # for topic_data in stats["top_topics"]:
568
- # print(f" - {topic_data['topic']}: {topic_data['count']} documents (relevance: {topic_data['relevance']:.2f})")
569
-
570
-
571
- # if __name__ == "__main__":
572
- # main()
573
-
574
-
575
-
576
  import os
577
  import json
578
  import sqlite3
@@ -612,7 +37,7 @@ class EnhancedRegulatoryVectorDB:
612
  # Get or create collection
613
  self.collection = self.client.get_or_create_collection(
614
  name="regulatory_guidelines",
615
- metadata={"description": "Enhanced regulatory guidelines with clause references"}
616
  )
617
 
618
  # Initialize embedding model
@@ -625,7 +50,7 @@ class EnhancedRegulatoryVectorDB:
625
  self.manifest = self.load_manifest()
626
 
627
  def init_metadata_db(self):
628
- """Initialize enhanced SQLite database for storing regulatory metadata"""
629
  conn = sqlite3.connect(self.metadata_db_path)
630
  cursor = conn.cursor()
631
 
@@ -714,25 +139,27 @@ class EnhancedRegulatoryVectorDB:
714
  return hashlib.md5(f.read()).hexdigest()
715
 
716
  def extract_sections_and_clauses(self, text_content):
717
- """Enhanced section and clause extraction for regulatory documents"""
718
  sections = []
719
 
720
- # Enhanced section patterns for different document types
721
  section_patterns = [
722
- # HACCP style: "7.8. Temperature Control"
723
- r'(\d+\.\d+\.?\s+)([A-Z][^.\n\r]+)',
724
- # HACCP Principles: "8.1 Assemble HACCP Team"
725
- r'(\d+\.\d+\s+)([A-Z][^.\n\r]+)',
726
- # Principle format: "Principle 1:", "Principle 2"
727
- r'(Principle\s+\d+)[\s:]*([^.\n\r]*)',
728
- # Section format: "Section 7.8 -"
729
- r'(Section\s+\d+\.\d+)[\s\-–]*([^.\n\r]*)',
730
- # Simple numbered: "1. Introduction", "2. Objective"
 
 
 
 
731
  r'^(\d+\.\s+)([A-Z][^.\n\r]+)',
732
- # Subsection: "7.1 Management Policy"
733
- r'^(\d+\.\d+\s+)([A-Z][^.\n\r]+)',
734
- # Prerequisites: "7.1. Management Policy"
735
- r'^(\d+\.\d+\.\s+)([A-Z][^.\n\r]+)',
736
  ]
737
 
738
  lines = text_content.split('\n')
@@ -763,7 +190,7 @@ class EnhancedRegulatoryVectorDB:
763
 
764
  if section_title: # Only add if we have a meaningful title
765
  # Determine section level
766
- level = section_num.count('.') + (1 if section_num.startswith('Principle') else 0)
767
 
768
  # Extract content preview (next few lines)
769
  preview_lines = []
@@ -789,7 +216,7 @@ class EnhancedRegulatoryVectorDB:
789
  return sections
790
 
791
  def extract_enhanced_metadata(self, pdf_path, text_content):
792
- """Enhanced metadata extraction with clause awareness"""
793
  metadata = {
794
  "regulatory_body": "Unknown",
795
  "standard_type": "Document",
@@ -803,38 +230,49 @@ class EnhancedRegulatoryVectorDB:
803
 
804
  text_lower = text_content.lower()
805
 
806
- # Enhanced regulatory body detection
807
- regulatory_bodies = {
808
- "Dubai Municipality": [
809
- "dubai municipality", "dm ", "بلدية دبي", "food control section",
810
- "public health department", "food safety department"
811
- ],
812
- "HACCP": ["haccp", "hazard analysis critical control point"],
813
- "ISO": ["iso ", "international organization for standardization"],
814
- "GSO": ["gso ", "gcc standardization", "gulf standard"],
815
- "FDA": ["fda", "food and drug administration"],
816
- "ESMA": ["esma", "emirates authority for standardization"],
817
- "SASO": ["saso", "saudi standards"],
818
- "UAE Ministry": ["uae ministry", "ministry of environment", "ministry of health"],
819
- "Federal Law": ["federal law", "uae law", "united arab emirates law"],
820
- "DHA": ["dubai health authority", "dha"],
821
- "Codex Alimentarius": ["codex alimentarius", "codex"]
822
- }
 
 
823
 
824
- for body, patterns in regulatory_bodies.items():
825
- if any(pattern in text_lower for pattern in patterns):
826
- metadata["regulatory_body"] = body
 
827
  break
828
 
829
- # Enhanced standard code extraction
830
  standard_patterns = [
831
- r"HACCP\s*Guidelines?\s*for\s*[^.\n\r]*",
832
- r"Guidelines?\s*for\s*Food\s*Manufacturing\s*Premises",
833
- r"ISO\s*\d+(?::\d+)*",
834
- r"GSO\s*\d+(?:/\d+)*",
835
- r"Dubai\s*Municipality\s*[-–]\s*Food\s*Control\s*Section",
836
- r"Federal\s*Law\s*No\.\s*\d+\s*of\s*\d+",
837
- r"Administrative\s*Order\s*No\.\s*\d+/\d+",
838
  ]
839
 
840
  for pattern in standard_patterns:
@@ -844,18 +282,18 @@ class EnhancedRegulatoryVectorDB:
844
  break
845
 
846
  # Document structure detection
847
- if "haccp" in text_lower and ("principle" in text_lower or "prerequisite" in text_lower):
848
- metadata["document_structure"] = "hierarchical_haccp"
849
  elif re.search(r'\d+\.\d+\s+[A-Z]', text_content):
850
  metadata["document_structure"] = "numbered_sections"
851
  else:
852
  metadata["document_structure"] = "flat"
853
 
854
- # Enhanced date extraction
855
  date_patterns = [
856
- r"Issue\s*Date:\s*(\d{1,2}/\d{1,2}/\d{4})",
857
- r"Issued\s*on\s*(\w+\s*\d{4})",
858
- r"(\d{1,2}/\d{1,2}/\d{4})",
859
  r"(\d{4})"
860
  ]
861
 
@@ -865,80 +303,83 @@ class EnhancedRegulatoryVectorDB:
865
  metadata["publication_date"] = matches[0]
866
  break
867
 
868
- # Industry and jurisdiction
869
- if "food" in text_lower or "manufacturing" in text_lower:
870
- metadata["industry"] = "Food Manufacturing"
 
 
871
 
872
- if "dubai" in text_lower:
873
- metadata["jurisdiction"] = "Dubai"
874
- elif "uae" in text_lower or "emirates" in text_lower:
875
- metadata["jurisdiction"] = "UAE"
876
- elif "gcc" in text_lower or "gulf" in text_lower:
877
- metadata["jurisdiction"] = "GCC"
 
 
878
 
879
  return metadata
880
 
881
  def extract_enhanced_topics(self, text, sections):
882
- """Enhanced topic extraction with clause references"""
883
  topics = []
884
 
885
- # Enhanced topic patterns with regulatory focus
886
  topic_patterns = {
887
- "Temperature Control": {
888
- "keywords": ["temperature control", "cold chain", "freezer", "chiller", "5°c", "-18°c", "danger zone"],
889
- "section_hints": ["7.8", "temperature"]
890
  },
891
- "HACCP Principles": {
892
- "keywords": ["haccp principles", "seven principles", "principle 1", "principle 2", "hazard analysis"],
893
- "section_hints": ["8.", "principle"]
894
- },
895
- "Critical Control Points": {
896
- "keywords": ["critical control point", "ccp", "control points", "decision tree"],
897
- "section_hints": ["8.8", "ccp", "critical"]
898
  },
899
- "Prerequisite Programs": {
900
- "keywords": ["prerequisite program", "pre-requisite", "management policy", "premises"],
901
- "section_hints": ["7.", "prerequisite"]
902
  },
903
- "Documentation Requirements": {
904
- "keywords": ["documentation", "record keeping", "records", "monitoring records"],
905
- "section_hints": ["8.13", "record", "documentation"]
906
  },
907
- "Verification Procedures": {
908
- "keywords": ["verification", "audit", "internal audit", "external audit"],
909
- "section_hints": ["8.12", "verification", "audit"]
910
  },
911
  "Corrective Actions": {
912
- "keywords": ["corrective action", "deviation", "non-conformance", "critical limit"],
913
- "section_hints": ["8.11", "corrective"]
914
- },
915
- "Monitoring Systems": {
916
- "keywords": ["monitoring", "monitoring system", "continuous monitoring"],
917
- "section_hints": ["8.10", "monitoring"]
918
  },
919
- "Hazard Analysis": {
920
- "keywords": ["hazard analysis", "biological hazard", "chemical hazard", "physical hazard"],
921
- "section_hints": ["8.7", "hazard"]
922
  },
923
- "Food Safety Management": {
924
- "keywords": ["food safety", "food hygiene", "food protection", "contamination"],
925
- "section_hints": ["food safety", "hygiene"]
926
  },
927
- "Personal Hygiene": {
928
- "keywords": ["personal hygiene", "health card", "hand washing", "cleanliness"],
929
- "section_hints": ["7.5", "personal"]
930
  },
931
- "Pest Control": {
932
- "keywords": ["pest control", "rodent", "insect", "fly control"],
933
- "section_hints": ["7.4", "pest"]
934
  },
935
- "Supplier Approval": {
936
- "keywords": ["supplier approval", "approved supplier", "supplier audit"],
937
- "section_hints": ["7.6", "supplier"]
938
  },
939
- "Product Recall": {
940
- "keywords": ["product recall", "recall program", "traceability"],
941
- "section_hints": ["7.9", "recall"]
942
  }
943
  }
944
 
@@ -1052,7 +493,7 @@ class EnhancedRegulatoryVectorDB:
1052
  return None
1053
 
1054
  def process_pdf(self, pdf_path):
1055
- """Enhanced PDF processing with section and clause extraction"""
1056
  pdf_path = Path(pdf_path)
1057
  file_hash = self.get_file_hash(pdf_path)
1058
  filename = pdf_path.name
@@ -1278,102 +719,102 @@ class EnhancedRegulatoryVectorDB:
1278
  for pdf_file in pdf_files:
1279
  self.process_pdf(pdf_file)
1280
 
1281
- print(f"\n🎯 Processing complete! Enhanced regulatory VDB ready.")
1282
  print(f"📊 Total files in manifest: {len(self.manifest['processed_files'])}")
1283
-
1284
  def get_enhanced_stats(self):
1285
- """Get enhanced processing statistics"""
1286
- conn = sqlite3.connect(self.metadata_db_path)
1287
- cursor = conn.cursor()
1288
-
1289
- try:
1290
- # Overall stats
1291
- cursor.execute("""
1292
- SELECT COUNT(*) as total,
1293
- SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
1294
- SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
1295
- SUM(sections_extracted) as total_sections
1296
- FROM processing_log
1297
- """)
1298
-
1299
- stats = cursor.fetchone()
1300
-
1301
- # Regulatory body distribution
1302
- cursor.execute("""
1303
- SELECT regulatory_body, COUNT(*) as count, SUM(total_sections) as sections
1304
- FROM regulatory_documents
1305
- GROUP BY regulatory_body
1306
- ORDER BY count DESC
1307
- """)
1308
-
1309
- body_dist = cursor.fetchall()
1310
-
1311
- # Top topics with clause references
1312
- cursor.execute("""
1313
- SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance,
1314
- GROUP_CONCAT(DISTINCT section_reference) as sections
1315
- FROM key_topics
1316
- GROUP BY topic
1317
- ORDER BY count DESC
1318
- LIMIT 10
1319
- """)
1320
-
1321
- top_topics = cursor.fetchall()
1322
-
1323
- return {
1324
- "total_processed": stats[0] or 0,
1325
- "successful": stats[1] or 0,
1326
- "errors": stats[2] or 0,
1327
- "total_sections": stats[3] or 0,
1328
- "regulatory_bodies": [(r[0], r[1], r[2]) for r in body_dist],
1329
- "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2], "sections": t[3]} for t in top_topics]
1330
- }
1331
- finally:
1332
- conn.close()
1333
 
1334
 
1335
  def main():
1336
- """Main function to create/update the enhanced regulatory guidelines database"""
1337
- print("🚀 Starting Enhanced Regulatory Guidelines Database Creation...")
1338
- print("📋 Features: Section extraction, clause references, enhanced topic mapping")
1339
-
 
 
 
 
 
 
 
 
1340
 
1341
  if __name__ == "__main__":
1342
- main()
1343
-
1344
- # Initialize enhanced database
1345
- db = EnhancedRegulatoryVectorDB()
1346
-
1347
- # Process all PDFs
1348
- db.process_all_pdfs()
1349
-
1350
- # Show enhanced processing stats
1351
- print("\n" + "=" * 80)
1352
- print("📊 ENHANCED PROCESSING STATISTICS:")
1353
- print("=" * 80)
1354
-
1355
- stats = db.get_enhanced_stats()
1356
- print(f"📄 Total files processed: {stats['total_processed']}")
1357
- print(f"✅ Successful: {stats['successful']}")
1358
- print(f"❌ Errors: {stats['errors']}")
1359
- print(f"📑 Total sections extracted: {stats['total_sections']}")
1360
-
1361
- print(f"\n🏛️ REGULATORY BODIES:")
1362
- for body, count, sections in stats["regulatory_bodies"]:
1363
- print(f" - {body}: {count} documents ({sections} sections)")
1364
-
1365
- print(f"\n🎯 TOP TOPICS WITH CLAUSE REFERENCES:")
1366
- for topic_data in stats["top_topics"]:
1367
- sections_info = topic_data['sections'][:50] + "..." if len(topic_data['sections']) > 50 else topic_data['sections']
1368
- print(f" - {topic_data['topic']}: {topic_data['count']} documents")
1369
- print(f" └── Relevance: {topic_data['relevance']:.2f} | Sections: {sections_info}")
1370
-
1371
- print("\n" + "=" * 80)
1372
- print("🎉 Enhanced Regulatory VDB Creation Complete!")
1373
- print("🔍 HACCP clause references are now available for the demo")
1374
- print("📝 The system can now provide:")
1375
- print(" - Section-specific guidance (e.g., 'Section 7.8 - Temperature Control')")
1376
- print(" - Clause references for each parameter")
1377
- print(" - Regulatory body attribution")
1378
- print(" - Hierarchical document structure awareness")
1379
- print("=" * 80)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
  import json
3
  import sqlite3
 
37
  # Get or create collection
38
  self.collection = self.client.get_or_create_collection(
39
  name="regulatory_guidelines",
40
+ metadata={"description": "Regulatory guidelines and standards for quality control"}
41
  )
42
 
43
  # Initialize embedding model
 
50
  self.manifest = self.load_manifest()
51
 
52
  def init_metadata_db(self):
53
+ """Initialize SQLite database for storing regulatory metadata"""
54
  conn = sqlite3.connect(self.metadata_db_path)
55
  cursor = conn.cursor()
56
 
 
139
  return hashlib.md5(f.read()).hexdigest()
140
 
141
  def extract_sections_and_clauses(self, text_content):
142
+ """Generic section and clause extraction for regulatory documents"""
143
  sections = []
144
 
145
+ # Generic section patterns that work for any regulatory document
146
  section_patterns = [
147
+ # Numbered sections: "1.2.3 Title"
148
+ r'(\d+(?:\.\d+)*\.?\s+)([A-Z][^.\n\r]+)',
149
+ # Lettered sections: "A.1 Title"
150
+ r'([A-Z]\.\d+\s+)([A-Z][^.\n\r]+)',
151
+ # Article format: "Article 1:"
152
+ r'(Article\s+\d+)[\s:]*([^.\n\r]*)',
153
+ # Section format: "Section 1.2"
154
+ r'(Section\s+\d+(?:\.\d+)*)[\s\-–]*([^.\n\r]*)',
155
+ # Chapter format: "Chapter 1"
156
+ r'(Chapter\s+\d+)[\s:]*([^.\n\r]*)',
157
+ # Part format: "Part I"
158
+ r'(Part\s+[IVX]+)[\s:]*([^.\n\r]*)',
159
+ # Simple numbered: "1. Title"
160
  r'^(\d+\.\s+)([A-Z][^.\n\r]+)',
161
+ # Annex format: "Annex 1"
162
+ r'(Annex\s+\d+)[\s:]*([^.\n\r]*)',
 
 
163
  ]
164
 
165
  lines = text_content.split('\n')
 
190
 
191
  if section_title: # Only add if we have a meaningful title
192
  # Determine section level
193
+ level = section_num.count('.')
194
 
195
  # Extract content preview (next few lines)
196
  preview_lines = []
 
216
  return sections
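# NOTE: Illustrative sketch only, not part of the diff. It shows what the generic
# section patterns above capture on a few assumed headings; this standalone loop
# mirrors only the per-line matching step, not the full method.
import re

generic_patterns = [
    r'(\d+(?:\.\d+)*\.?\s+)([A-Z][^.\n\r]+)',   # numbered: "7.8 Temperature Control"
    r'(Article\s+\d+)[\s:]*([^.\n\r]*)',        # article: "Article 12: Labelling"
    r'(Annex\s+\d+)[\s:]*([^.\n\r]*)',          # annex: "Annex 3 Sampling plans"
]

for line in ["7.8 Temperature Control", "Article 12: Labelling", "Annex 3 Sampling plans"]:
    for pattern in generic_patterns:
        match = re.match(pattern, line)
        if match:
            section_num, section_title = match.group(1).strip(), match.group(2).strip()
            print(f"{section_num!r} -> {section_title!r}")
            break
# Expected:
#   '7.8' -> 'Temperature Control'
#   'Article 12' -> 'Labelling'
#   'Annex 3' -> 'Sampling plans'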
217
 
218
  def extract_enhanced_metadata(self, pdf_path, text_content):
219
+ """Generic metadata extraction without bias toward specific standards"""
220
  metadata = {
221
  "regulatory_body": "Unknown",
222
  "standard_type": "Document",
 
230
 
231
  text_lower = text_content.lower()
232
 
233
+ # UPDATED: Generic regulatory body detection without prioritization
234
+ # Extract regulatory body from document content
235
+ regulatory_indicators = [
236
+ # International standards
237
+ (r"iso\s*\d+", "ISO"),
238
+ (r"iec\s*\d+", "IEC"),
239
+ (r"codex\s+alimentarius", "Codex Alimentarius"),
240
+ (r"who\s+guidelines", "WHO"),
241
+ (r"fao\s+standards", "FAO"),
242
+
243
+ # Regional standards
244
+ (r"european\s+union", "EU"),
245
+ (r"gcc\s+standard", "GCC"),
246
+ (r"asean\s+standard", "ASEAN"),
247
+
248
+ # National standards
249
+ (r"uae\s+standard", "UAE National"),
250
+ (r"saudi\s+standard", "Saudi Arabia"),
251
+ (r"indian\s+standard", "India"),
252
+
253
+ # Generic detection
254
+ (r"ministry\s+of\s+\w+", "Government Ministry"),
255
+ (r"department\s+of\s+\w+", "Government Department"),
256
+ (r"authority\s+for\s+\w+", "Regulatory Authority"),
257
+
258
+ # Industry standards
259
+ (r"haccp", "HACCP System"),
260
+ (r"gmp", "GMP"),
261
+ (r"gap", "GAP"),
262
+ ]
263
 
264
+ # Find regulatory body without bias
265
+ for pattern, body_name in regulatory_indicators:
266
+ if re.search(pattern, text_lower):
267
+ metadata["regulatory_body"] = body_name
268
  break
269
 
270
+ # Extract standard code generically
271
  standard_patterns = [
272
+ r"(?:standard|guideline|regulation)\s*(?:no\.|number)?\s*:?\s*(\w+[-/]\d+)",
273
+ r"document\s*(?:no\.|number)?\s*:?\s*(\w+[-/]\d+)",
274
+ r"reference\s*:?\s*(\w+[-/]\d+)",
275
+ r"(\w{2,10}[-/]\d{2,6})", # Generic code pattern
 
 
 
276
  ]
277
 
278
  for pattern in standard_patterns:
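
Because the detection loop stops at the first matching indicator, list order decides the label when a document cites several frameworks. A small standalone sketch with an invented snippet and an abbreviated copy of the table:

import re

regulatory_indicators = [(r"iso\s*\d+", "ISO"), (r"haccp", "HACCP System")]
text_lower = "this plan follows iso 22000 and applies haccp principles"

body = "Unknown"
for pattern, body_name in regulatory_indicators:
    if re.search(pattern, text_lower):
        body = body_name
        break
print(body)  # prints ISO: the earlier entry wins even though both patterns match

Note that very short patterns such as r"gap" also match as substrings (for example inside "singapore"), so word boundaries (r"\bgap\b") may be worth adding.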
 
282   break
283
284   # Document structure detection
285 + if re.search(r'(?:article|chapter|part|annex)\s+\d+', text_lower):
286 + metadata["document_structure"] = "hierarchical"
287   elif re.search(r'\d+\.\d+\s+[A-Z]', text_content):
288   metadata["document_structure"] = "numbered_sections"
289   else:
290   metadata["document_structure"] = "flat"
291
292 + # Date extraction
293   date_patterns = [
294 + r"(?:published|issued|effective|dated?)\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
295 + r"(?:version|revision)\s*date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
296 + r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
297   r"(\d{4})"
298   ]
299
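
A quick sketch (sample string invented, abbreviated copy of the list) showing why the more specific, labelled date patterns are listed first:

import re

date_patterns = [
    r"(?:published|issued|effective|dated?)\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
    r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
    r"(\d{4})",
]
text_lower = "issued: 12/03/2021, supersedes the 2015 edition"
for pattern in date_patterns:
    matches = re.findall(pattern, text_lower)
    if matches:
        print(matches[0])  # prints 12/03/2021, the labelled date rather than the bare year
        break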
 
 
303   metadata["publication_date"] = matches[0]
304   break
305
306 + # Industry detection - generic
307 + industry_indicators = [
308 + (["quality", "control", "inspection", "standard"], "Quality Control"),
309 + (["manufacturing", "production", "processing"], "Manufacturing"),
310 + (["safety", "health", "hygiene"], "Health & Safety"),
311 + (["environment", "sustainable", "green"], "Environmental"),
312 + (["trade", "commerce", "export", "import"], "Trade & Commerce"),
313 + ]
314
315 + for keywords, industry in industry_indicators:
316 + if any(keyword in text_lower for keyword in keywords):
317 + metadata["industry"] = industry
318 + break
319 +
320 + # Jurisdiction detection - generic
321 + if any(country in text_lower for country in ["international", "global", "worldwide"]):
322 + metadata["jurisdiction"] = "International"
323 + elif re.search(r'(?:national|federal|state)\s+(?:standard|regulation)', text_lower):
324 + metadata["jurisdiction"] = "National"
325 + else:
326 + metadata["jurisdiction"] = "General"
327
328   return metadata
329
330   def extract_enhanced_topics(self, text, sections):
331 + """Generic topic extraction without bias toward specific frameworks"""
332   topics = []
333
334 + # UPDATED: Generic topic patterns applicable to any standard
335   topic_patterns = {
336 + "Quality Management": {
337 + "keywords": ["quality management", "quality system", "quality control", "quality assurance"],
338 + "section_hints": ["quality", "management"]
339   },
340 + "Documentation Requirements": {
341 + "keywords": ["documentation", "records", "record keeping", "documents"],
342 + "section_hints": ["document", "record"]
343   },
344 + "Process Control": {
345 + "keywords": ["process control", "process monitoring", "control measures"],
346 + "section_hints": ["process", "control"]
347   },
348 + "Verification and Validation": {
349 + "keywords": ["verification", "validation", "audit", "review"],
350 + "section_hints": ["verification", "validation"]
351   },
352 + "Training Requirements": {
353 + "keywords": ["training", "competence", "qualification", "education"],
354 + "section_hints": ["training", "competence"]
355   },
356   "Corrective Actions": {
357 + "keywords": ["corrective action", "preventive action", "non-conformance"],
358 + "section_hints": ["corrective", "action"]
359   },
360 + "Risk Management": {
361 + "keywords": ["risk assessment", "risk management", "hazard", "risk analysis"],
362 + "section_hints": ["risk", "hazard"]
363   },
364 + "Monitoring and Measurement": {
365 + "keywords": ["monitoring", "measurement", "testing", "inspection"],
366 + "section_hints": ["monitoring", "measurement"]
367   },
368 + "Compliance Requirements": {
369 + "keywords": ["compliance", "regulatory", "legal requirements", "statutory"],
370 + "section_hints": ["compliance", "regulatory"]
371   },
372 + "Continuous Improvement": {
373 + "keywords": ["improvement", "continual improvement", "enhancement"],
374 + "section_hints": ["improvement", "enhance"]
375   },
376 + "Resource Management": {
377 + "keywords": ["resources", "facilities", "equipment", "infrastructure"],
378 + "section_hints": ["resource", "facility"]
379   },
380 + "Communication": {
381 + "keywords": ["communication", "reporting", "notification"],
382 + "section_hints": ["communication", "report"]
383   }
384   }
385
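
The scoring code that consumes topic_patterns sits outside this hunk (lines 386-492 are collapsed), so the following is a rough illustration only: a structure of this shape could be scored with simple keyword and section-hint counts. The helper name and weights below are invented, not the script's actual method:

def score_topic(text_lower, section_titles, spec):
    # Toy relevance: keyword hits in the text plus half-weighted hint hits in section titles.
    keyword_hits = sum(1 for kw in spec["keywords"] if kw in text_lower)
    hint_hits = sum(1 for hint in spec["section_hints"]
                    for title in section_titles if hint in title.lower())
    return keyword_hits + 0.5 * hint_hits

spec = {"keywords": ["risk assessment", "hazard"], "section_hints": ["risk", "hazard"]}
print(score_topic("each hazard requires a documented risk assessment",
                  ["5.1 Hazard identification"], spec))  # 2 keyword hits + 0.5 hint = 2.5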
 
 
493   return None
494
495   def process_pdf(self, pdf_path):
496 + """Generic PDF processing without standard-specific bias"""
497   pdf_path = Path(pdf_path)
498   file_hash = self.get_file_hash(pdf_path)
499   filename = pdf_path.name
 
719   for pdf_file in pdf_files:
720   self.process_pdf(pdf_file)
721
722 + print(f"\n🎯 Processing complete! Generic regulatory VDB ready.")
723   print(f"📊 Total files in manifest: {len(self.manifest['processed_files'])}")
724 +
725   def get_enhanced_stats(self):
726 + """Get enhanced processing statistics"""
727 + conn = sqlite3.connect(self.metadata_db_path)
728 + cursor = conn.cursor()
729 +
730 + try:
731 + # Overall stats
732 + cursor.execute("""
733 + SELECT COUNT(*) as total,
734 + SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
735 + SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
736 + SUM(sections_extracted) as total_sections
737 + FROM processing_log
738 + """)
739 +
740 + stats = cursor.fetchone()
741 +
742 + # Regulatory body distribution
743 + cursor.execute("""
744 + SELECT regulatory_body, COUNT(*) as count, SUM(total_sections) as sections
745 + FROM regulatory_documents
746 + GROUP BY regulatory_body
747 + ORDER BY count DESC
748 + """)
749 +
750 + body_dist = cursor.fetchall()
751 +
752 + # Top topics with clause references
753 + cursor.execute("""
754 + SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance,
755 + GROUP_CONCAT(DISTINCT section_reference) as sections
756 + FROM key_topics
757 + GROUP BY topic
758 + ORDER BY count DESC
759 + LIMIT 10
760 + """)
761 +
762 + top_topics = cursor.fetchall()
763 +
764 + return {
765 + "total_processed": stats[0] or 0,
766 + "successful": stats[1] or 0,
767 + "errors": stats[2] or 0,
768 + "total_sections": stats[3] or 0,
769 + "regulatory_bodies": [(r[0], r[1], r[2]) for r in body_dist],
770 + "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2], "sections": t[3]} for t in top_topics]
771 + }
772 + finally:
773 + conn.close()
774
775
776   def main():
777 + """Main function to create/update the generic regulatory guidelines database"""
778 + print("🚀 Starting Generic Regulatory Guidelines Database Creation...")
779 + print("📋 Features: Unbiased extraction, generic standards support, dynamic classification")
780 +
781 + # Initialize enhanced database
782 + db = EnhancedRegulatoryVectorDB()
783 +
784 + # Process all PDFs
785 + db.process_all_pdfs()
786 +
787 + # Show enhanced processing stats
788 + print("\n" + "=" * 80)
789 + print("📊 GENERIC PROCESSING STATISTICS:")
790 + print("=" * 80)
791 +
792 + stats = db.get_enhanced_stats()
793 + print(f"📄 Total files processed: {stats['total_processed']}")
794 + print(f"✅ Successful: {stats['successful']}")
795 + print(f"❌ Errors: {stats['errors']}")
796 + print(f"📑 Total sections extracted: {stats['total_sections']}")
797 +
798 + print(f"\n🏛️ REGULATORY BODIES (No Bias):")
799 + for body, count, sections in stats["regulatory_bodies"]:
800 + print(f" - {body}: {count} documents ({sections} sections)")
801 +
802 + print(f"\n🎯 TOP TOPICS (Generic):")
803 + for topic_data in stats["top_topics"]:
804 + sections_info = topic_data['sections'][:50] + "..." if len(topic_data['sections']) > 50 else topic_data['sections']
805 + print(f" - {topic_data['topic']}: {topic_data['count']} documents")
806 + print(f" └── Relevance: {topic_data['relevance']:.2f} | Sections: {sections_info}")
807 +
808 + print("\n" + "=" * 80)
809 + print("🎉 Generic Regulatory VDB Creation Complete!")
810 + print("🔍 All regulatory frameworks are treated equally")
811 + print("📝 The system can now provide:")
812 + print(" - Unbiased regulatory references")
813 + print(" - Generic clause citations")
814 + print(" - Dynamic standard recognition")
815 + print(" - Equal treatment of all frameworks")
816 + print("=" * 80)
817 +
818
819   if __name__ == "__main__":
820 + main()
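
For reference, get_enhanced_stats() returns a plain dict that main() then formats; its shape is shown below with illustrative values (not real output):

stats = {
    "total_processed": 4,
    "successful": 4,
    "errors": 0,
    "total_sections": 212,
    "regulatory_bodies": [("ISO", 2, 120), ("Codex Alimentarius", 1, 48)],
    "top_topics": [
        {"topic": "Quality Management", "count": 3, "relevance": 0.82, "sections": "4.1,4.2"},
    ],
}

One caveat: GROUP_CONCAT yields NULL when every section_reference for a topic is NULL, in which case topic_data['sections'] would be None and the slicing in main() would raise; a fallback such as (topic_data['sections'] or "") may be worth adding.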