Commit 365a20a · 1 parent: dd2978b · ueifu
vector_stores/chroma_db/checklist_examples/create_industry_db.py
CHANGED
Old side of the diff (removed lines only; several are cut off mid-line in this view):

@@ -148,23 +148,23 @@ class ChecklistExamplesVDB:
- """Extract document metadata from checklist"""
- # Extract document type
- "Temperature Log": ["temperature", "
@@ -173,11 +173,13 @@ class ChecklistExamplesVDB:
- # Extract product name
@@ -187,12 +189,13 @@ class ChecklistExamplesVDB:
- # Extract supplier name -
- r"company\s*[:\-]\s*([^\n]{1,40})"
@@ -232,23 +250,31 @@ class ChecklistExamplesVDB:
- r"^([A-Z][^:]+?):\s*(Acceptable\s*/\s*Non-acceptable|To be mentioned|Present\s*/\s*Absent)",
- r"^([A-Z][^(]+?)\s*\(Spec:\s*([^)]+)\)",
- # Format: "Parameter Name
- # Format: "Parameter:
- r"^([A-Z\s]+(?:EVALUATION|
- r"^\*\*([A-Z\s]+)\*\*$"
@@ -257,7 +283,7 @@ class ChecklistExamplesVDB:
- if len(section_name) > 5: # Valid section name
@@ -301,13 +327,13 @@ class ChecklistExamplesVDB:
- """Analyze parameter to determine type, input method, etc."""
- param_details_lower = param_details.lower()
@@ -316,43 +342,51 @@ class ChecklistExamplesVDB:
- if any(keyword in param_details_lower for keyword in ["acceptable
- elif any(keyword in param_details_lower for keyword in ["present
- elif "to be mentioned" in param_details_lower:
- elif any(keyword in param_name_lower for keyword in ["photo", "
- r"\(spec:\s*([^)]+)\)",
- r"tolerance\s*limit[:\s]*([^,\n]+)",
- combined_text = f"{param_name} {param_details}"
@@ -372,35 +406,26 @@ class ChecklistExamplesVDB:
- ]
- for pattern in tolerance_patterns:
- match = re.search(pattern, combined_text, re.IGNORECASE)
- if match:
- tolerance_limits = match.group(1).strip()
- break
- has_remarks = any(keyword in context_text for keyword in ["
- if any(keyword in param_name_lower for keyword in ["foreign", "
- options_list = "Stones, Glass, Metals, Plastic, Wood, Insects/Pests, Hair, Threads"
- parameter_type = "Metal Detection"
- parameter_type = "Traceability"
@@ -414,21 +439,26 @@ class ChecklistExamplesVDB:
- """Classify section based on name"""
- elif any(keyword in section_name_lower for keyword in ["temperature", "thermal", "
@@ -579,19 +618,20 @@ class ChecklistExamplesVDB:
- "product_name": doc_metadata["product_name"]

New side of the diff (context and added lines):
|
| 63 |
document_type TEXT,
|
| 64 |
product_name TEXT,
|
| 65 |
supplier_name TEXT,
|
| 66 |
+
checklist_attributes TEXT, -- Dynamic attributes instead of category
|
| 67 |
total_parameters INTEGER DEFAULT 0,
|
| 68 |
extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
| 69 |
)
|
|
|
|
| 148 |
return hashlib.md5(f.read()).hexdigest()
|
| 149 |
|
| 150 |
def extract_document_metadata(self, pdf_path, text_content):
|
| 151 |
+
"""Extract document metadata from checklist - generic approach"""
|
| 152 |
metadata = {
|
| 153 |
"document_type": "QC Checklist",
|
| 154 |
"product_name": "",
|
| 155 |
"supplier_name": "",
|
| 156 |
+
"checklist_attributes": {} # Dynamic attributes
|
| 157 |
}
|
| 158 |
|
| 159 |
+
# Extract document type generically
|
| 160 |
doc_type_patterns = {
|
| 161 |
"Inspection Record": ["inspection record", "inspection checklist", "quality inspection"],
|
| 162 |
"Pre-Shipment Inspection": ["pre-shipment", "container inspection", "shipment inspection"],
|
| 163 |
"Production Checklist": ["production checklist", "manufacturing checklist", "process checklist"],
|
| 164 |
+
"Temperature Log": ["temperature", "thermal", "cooling log"],
|
| 165 |
"Receiving Inspection": ["receiving", "goods receipt", "incoming inspection"],
|
| 166 |
"Hygiene Checklist": ["hygiene", "sanitation", "cleaning checklist"],
|
| 167 |
+
"Quality Control": ["quality control", "qc checklist", "quality check"]
|
| 168 |
}
|
| 169 |
|
| 170 |
text_lower = text_content.lower()
|
|
|
|
| 173 |
metadata["document_type"] = doc_type
|
| 174 |
break
|
| 175 |
|
| 176 |
+
# Extract product name generically
|
| 177 |
product_patterns = [
|
| 178 |
r"product\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
|
| 179 |
+
r"item\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
|
| 180 |
+
r"material\s*(?:name|description)?\s*[:\-]\s*([^\n]{1,50})",
|
| 181 |
r"([A-Z][a-z]+(?:\s+[A-Z][a-z]+)*)\s*[-–]\s*inspection",
|
| 182 |
+
r"inspection\s*of\s*([^\n]{1,40})",
|
| 183 |
r"product:\s*([^\n]{1,40})",
|
| 184 |
]
|
| 185 |
|
|
|
|
| 189 |
metadata["product_name"] = match.group(1).strip()
|
| 190 |
break
|
| 191 |
|
| 192 |
+
# Extract supplier name generically - no specific company bias
|
| 193 |
supplier_patterns = [
|
| 194 |
r"supplier\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
|
| 195 |
+
r"vendor\s*(?:name)?\s*[:\-]\s*([^\n]{1,40})",
|
| 196 |
r"manufacturer\s*[:\-]\s*([^\n]{1,40})",
|
| 197 |
+
r"company\s*[:\-]\s*([^\n]{1,40})",
|
| 198 |
+
r"produced\s*by\s*[:\-]\s*([^\n]{1,40})"
|
| 199 |
]
|
| 200 |
|
| 201 |
for pattern in supplier_patterns:
|
|
|
|
| 204 |
metadata["supplier_name"] = match.group(1).strip()
|
| 205 |
break
|
| 206 |
|
| 207 |
+
# Extract dynamic attributes
|
| 208 |
+
attributes = {}
|
| 209 |
+
|
| 210 |
+
# Inspection stage/phase
|
| 211 |
+
stage_keywords = {
|
| 212 |
+
"pre-production": ["pre-production", "before production", "initial"],
|
| 213 |
+
"during-production": ["during production", "in-process", "mid-production"],
|
| 214 |
+
"final": ["final inspection", "finished goods", "end product"],
|
| 215 |
+
"incoming": ["incoming", "receiving", "goods receipt"],
|
| 216 |
+
"outgoing": ["outgoing", "dispatch", "shipping"]
|
| 217 |
}
|
| 218 |
|
| 219 |
+
for stage, keywords in stage_keywords.items():
|
| 220 |
if any(keyword in text_lower for keyword in keywords):
|
| 221 |
+
attributes["inspection_stage"] = stage
|
| 222 |
break
|
| 223 |
|
| 224 |
+
# Inspection focus
|
| 225 |
+
if any(word in text_lower for word in ["visual", "appearance", "cosmetic"]):
|
| 226 |
+
attributes["inspection_focus"] = "visual"
|
| 227 |
+
elif any(word in text_lower for word in ["dimension", "measurement", "size"]):
|
| 228 |
+
attributes["inspection_focus"] = "dimensional"
|
| 229 |
+
elif any(word in text_lower for word in ["functional", "performance", "operation"]):
|
| 230 |
+
attributes["inspection_focus"] = "functional"
|
| 231 |
+
elif any(word in text_lower for word in ["safety", "hazard", "risk"]):
|
| 232 |
+
attributes["inspection_focus"] = "safety"
|
| 233 |
+
|
| 234 |
+
# Complexity level based on parameter count (will be updated later)
|
| 235 |
+
attributes["complexity"] = "standard" # Will be updated after parameter extraction
|
| 236 |
+
|
| 237 |
+
metadata["checklist_attributes"] = json.dumps(attributes)
|
| 238 |
+
|
| 239 |
return metadata
|
| 240 |
|
| 241 |
def extract_checklist_parameters(self, text_content):
|
|
|
|
| 250 |
section_order = 0
|
| 251 |
parameter_order = 0
|
| 252 |
|
| 253 |
+
# Generic parameter extraction patterns
|
| 254 |
param_patterns = [
|
| 255 |
# Format: "Parameter Name: Type/Method"
|
| 256 |
+
r"^([A-Z][^:]+?):\s*(Acceptable\s*/\s*Non-acceptable|To be mentioned|Present\s*/\s*Absent|Pass\s*/\s*Fail)",
|
| 257 |
# Format: "Parameter (Spec: value)"
|
| 258 |
+
r"^([A-Z][^(]+?)\s*\((?:Spec|Specification):\s*([^)]+)\)",
|
| 259 |
+
# Format: "Parameter Name: [measurement/value]"
|
| 260 |
+
r"^([A-Z][^:]+?):\s*\[([^\]]+)\]",
|
| 261 |
+
# Format: "Parameter: _____" (blank field)
|
| 262 |
+
r"^([A-Z][^:]+?):\s*_{3,}",
|
| 263 |
+
# Format: "□ Parameter Name"
|
| 264 |
+
r"^[□☐]\s*([A-Z][^:]+?)$",
|
| 265 |
+
# Format: "• Parameter Name"
|
| 266 |
+
r"^[•·]\s*([A-Z][^:]+?)$",
|
| 267 |
+
# Generic: "Parameter Name: [details]"
|
| 268 |
+
r"^([A-Z][^:]+?):\s*(.{0,100})",
|
| 269 |
]
|
| 270 |
|
| 271 |
# Section header patterns
|
| 272 |
section_patterns = [
|
| 273 |
+
r"^([A-Z\s]+(?:EVALUATION|INSPECTION|CHECK|VERIFICATION|ASSESSMENT|CONTROL))\s*$",
|
| 274 |
r"^[0-9]+\.\s*([A-Z][^.]+)$",
|
| 275 |
+
r"^\*\*([A-Z\s]+)\*\*$",
|
| 276 |
+
r"^={3,}\s*([A-Z\s]+)\s*={3,}$",
|
| 277 |
+
r"^-{3,}\s*([A-Z\s]+)\s*-{3,}$"
|
| 278 |
]
|
| 279 |
|
| 280 |
for line_idx, line in enumerate(lines):
|
|
|
|
| 283 |
match = re.match(pattern, line)
|
| 284 |
if match:
|
| 285 |
section_name = match.group(1).strip()
|
| 286 |
+
if len(section_name) > 5 and len(section_name) < 50: # Valid section name
|
| 287 |
current_section = section_name
|
| 288 |
section_order += 1
|
| 289 |
sections.append({
|
|
|
|
| 327 |
return parameters, sections
|
| 328 |
|
| 329 |
def analyze_parameter(self, param_name, param_details, current_line, all_lines, line_idx):
|
| 330 |
+
"""Analyze parameter to determine type, input method, etc. - generic approach"""
|
| 331 |
param_name_lower = param_name.lower()
|
| 332 |
+
param_details_lower = param_details.lower() if param_details else ""
|
| 333 |
context_lines = all_lines[max(0, line_idx-2):min(len(all_lines), line_idx+3)]
|
| 334 |
context_text = " ".join(context_lines).lower()
|
| 335 |
|
| 336 |
+
# Initialize default values
|
| 337 |
parameter_type = "Quality Check"
|
| 338 |
input_method = "Text Input"
|
| 339 |
specifications = ""
|
|
|
|
| 342 |
measurement_units = ""
|
| 343 |
has_remarks = False
|
| 344 |
|
| 345 |
+
# Generic input method determination
|
| 346 |
+
if any(keyword in param_details_lower for keyword in ["acceptable", "non-acceptable", "pass", "fail"]):
|
| 347 |
+
if "/" in param_details_lower:
|
| 348 |
+
input_method = "Dropdown"
|
| 349 |
+
options_list = param_details.replace("/", ", ")
|
| 350 |
+
else:
|
| 351 |
+
input_method = "Toggle"
|
| 352 |
parameter_type = "Quality Assessment"
|
| 353 |
|
| 354 |
+
elif any(keyword in param_details_lower for keyword in ["present", "absent", "yes", "no"]):
|
| 355 |
input_method = "Toggle"
|
| 356 |
+
options_list = param_details.replace("/", ", ")
|
| 357 |
parameter_type = "Presence Check"
|
| 358 |
|
| 359 |
+
elif "to be mentioned" in param_details_lower or "_____" in current_line:
|
| 360 |
+
# Determine based on parameter name
|
| 361 |
+
if any(unit in param_name_lower for unit in ["temperature", "weight", "time", "dimension", "size", "count", "number"]):
|
| 362 |
input_method = "Numeric Input"
|
| 363 |
parameter_type = "Measurement"
|
| 364 |
else:
|
| 365 |
input_method = "Text Input"
|
| 366 |
parameter_type = "Information Entry"
|
| 367 |
|
| 368 |
+
elif any(keyword in param_name_lower for keyword in ["photo", "picture", "image", "visual"]):
|
| 369 |
input_method = "Image Upload"
|
| 370 |
parameter_type = "Visual Documentation"
|
| 371 |
|
| 372 |
+
elif any(keyword in param_name_lower for keyword in ["remark", "comment", "observation", "note"]):
|
| 373 |
input_method = "Remarks"
|
| 374 |
parameter_type = "Detailed Notes"
|
| 375 |
|
| 376 |
+
elif "□" in current_line or "☐" in current_line:
|
| 377 |
+
input_method = "Checklist"
|
| 378 |
+
parameter_type = "Verification Check"
|
| 379 |
+
|
| 380 |
# Specification extraction
|
| 381 |
spec_patterns = [
|
| 382 |
+
r"\((?:spec|specification):\s*([^)]+)\)",
|
| 383 |
+
r"tolerance\s*(?:limit)?[:\s]*([^,\n]+)",
|
| 384 |
r"(\d+\s*[±]\s*\d+\s*[a-zA-Z%°]+)",
|
| 385 |
+
r"([<>≤≥]\s*\d+[^,\n]*)",
|
| 386 |
+
r"(\d+\s*-\s*\d+\s*[a-zA-Z]+)",
|
| 387 |
]
|
| 388 |
|
| 389 |
+
combined_text = f"{param_name} {param_details} {' '.join(context_lines)}"
|
| 390 |
for pattern in spec_patterns:
|
| 391 |
match = re.search(pattern, combined_text, re.IGNORECASE)
|
| 392 |
if match:
|
|
|
|
| 396 |
# Extract measurement units
|
| 397 |
unit_patterns = [
|
| 398 |
r"(\d+\s*[a-zA-Z%°]+)",
|
| 399 |
+
r"(°[CcFf]|g|gram|kg|mm|cm|m|ml|L|minutes?|hours?|seconds?|%|ppm|cfu)",
|
| 400 |
]
|
| 401 |
|
| 402 |
for pattern in unit_patterns:
|
|
|
|
| 406 |
break
|
| 407 |
|
| 408 |
# Check for tolerance limits
|
| 409 |
+
if "±" in combined_text or any(op in combined_text for op in ["<", ">", "≤", "≥"]):
|
| 410 |
+
tolerance_match = re.search(r"([±<>≤≥]\s*\d+(?:\.\d+)?)", combined_text)
|
| 411 |
+
if tolerance_match:
|
| 412 |
+
tolerance_limits = tolerance_match.group(1).strip()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 413 |
|
| 414 |
# Check for remarks requirement
|
| 415 |
+
has_remarks = any(keyword in context_text for keyword in ["remark", "comment", "observation", "corrective action"])
|
| 416 |
|
| 417 |
+
# Generic parameter type classification based on content
|
| 418 |
+
if any(keyword in param_name_lower for keyword in ["contamination", "foreign", "defect", "damage"]):
|
| 419 |
+
parameter_type = "Safety/Quality Check"
|
| 420 |
+
if not options_list and input_method == "Checklist":
|
| 421 |
+
options_list = "None observed, Minor issue, Major issue, Critical"
|
|
|
|
| 422 |
|
| 423 |
+
elif any(keyword in param_name_lower for keyword in ["batch", "lot", "code", "number", "id"]):
|
| 424 |
+
parameter_type = "Traceability"
|
| 425 |
input_method = "Text Input"
|
|
|
|
| 426 |
|
| 427 |
+
elif any(keyword in param_name_lower for keyword in ["signature", "verified", "checked"]):
|
| 428 |
+
parameter_type = "Verification"
|
|
|
|
| 429 |
|
| 430 |
return {
|
| 431 |
"parameter_type": parameter_type,
|
|
|
|
| 439 |
}
|
| 440 |
|
| 441 |
def classify_section_type(self, section_name):
|
| 442 |
+
"""Classify section based on name - generic approach"""
|
| 443 |
section_name_lower = section_name.lower()
|
| 444 |
|
| 445 |
+
# Generic section classification
|
| 446 |
+
if any(keyword in section_name_lower for keyword in ["visual", "appearance", "cosmetic"]):
|
| 447 |
+
return "Visual Assessment"
|
| 448 |
+
elif any(keyword in section_name_lower for keyword in ["measurement", "dimension", "size", "weight"]):
|
| 449 |
return "Physical Measurement"
|
| 450 |
+
elif any(keyword in section_name_lower for keyword in ["temperature", "thermal", "heat", "cold"]):
|
| 451 |
return "Temperature Control"
|
| 452 |
+
elif any(keyword in section_name_lower for keyword in ["package", "packing", "container", "seal"]):
|
| 453 |
return "Packaging Inspection"
|
| 454 |
+
elif any(keyword in section_name_lower for keyword in ["test", "analysis", "laboratory"]):
|
| 455 |
+
return "Testing/Analysis"
|
| 456 |
+
elif any(keyword in section_name_lower for keyword in ["safety", "hazard", "risk", "contamination"]):
|
| 457 |
+
return "Safety Assessment"
|
| 458 |
+
elif any(keyword in section_name_lower for keyword in ["document", "record", "certificate"]):
|
| 459 |
+
return "Documentation"
|
| 460 |
+
elif any(keyword in section_name_lower for keyword in ["final", "overall", "summary"]):
|
| 461 |
+
return "Final Assessment"
|
| 462 |
else:
|
| 463 |
return "General Inspection"
|
| 464 |
|
|
|
|
| 489 |
chunks = text_splitter.split_text(text)
|
| 490 |
documents = []
|
| 491 |
|
| 492 |
+
# Update complexity in metadata based on parameters
|
| 493 |
+
if metadata.get("checklist_attributes"):
|
| 494 |
+
attrs = json.loads(metadata["checklist_attributes"])
|
| 495 |
+
if len(parameters) < 10:
|
| 496 |
+
attrs["complexity"] = "simple"
|
| 497 |
+
elif len(parameters) < 25:
|
| 498 |
+
attrs["complexity"] = "standard"
|
| 499 |
+
else:
|
| 500 |
+
attrs["complexity"] = "comprehensive"
|
| 501 |
+
metadata["checklist_attributes"] = json.dumps(attrs)
|
| 502 |
+
|
| 503 |
for i, chunk in enumerate(chunks):
|
| 504 |
# Enrich metadata with structural information
|
| 505 |
chunk_metadata = metadata.copy()
|
|
|
|
| 512 |
"parameter_types": ", ".join(set([p["parameter_type"] for p in parameters])),
|
| 513 |
"input_methods": ", ".join(set([p["input_method"] for p in parameters])),
|
| 514 |
"section_types": ", ".join(set([s["section_type"] for s in sections]))
|
| 515 |
+
})
|
| 516 |
|
| 517 |
documents.append({
|
| 518 |
"text": chunk,
|
|
|
|
| 548 |
pdf_path = Path(pdf_path)
|
| 549 |
file_hash = self.get_file_hash(pdf_path)
|
| 550 |
filename = pdf_path.name
|
| 551 |
+
|
| 552 |
# Check if already processed
|
| 553 |
if filename in self.manifest["processed_files"]:
|
| 554 |
if self.manifest["processed_files"][filename]["hash"] == file_hash:
|
| 555 |
print(f"Skipping {filename} - already processed")
|
| 556 |
return
|
| 557 |
+
|
| 558 |
print(f"Processing checklist: {filename}...")
|
|
|
|
| 559 |
try:
|
| 560 |
# Load PDF content
|
| 561 |
loader = PyPDFLoader(str(pdf_path))
|
| 562 |
pages = loader.load()
|
| 563 |
+
|
| 564 |
# Combine all pages
|
| 565 |
full_text = ""
|
| 566 |
for i, page in enumerate(pages):
|
| 567 |
full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
|
| 568 |
+
|
| 569 |
# If text is too short, use OCR
|
| 570 |
if len(full_text.strip()) < 100:
|
| 571 |
print(f"Using OCR for {filename}")
|
| 572 |
ocr_text = self.ocr_pdf(pdf_path)
|
| 573 |
if len(ocr_text) > len(full_text):
|
| 574 |
full_text = ocr_text
|
| 575 |
+
|
| 576 |
# Extract document metadata
|
| 577 |
doc_metadata = self.extract_document_metadata(pdf_path, full_text)
|
| 578 |
+
|
| 579 |
# Extract parameters and sections
|
| 580 |
parameters, sections = self.extract_checklist_parameters(full_text)
|
| 581 |
+
|
| 582 |
# Create base metadata for chunks
|
| 583 |
metadata = {
|
| 584 |
"source": filename,
|
| 585 |
"document_type": doc_metadata["document_type"],
|
| 586 |
"product_name": doc_metadata["product_name"],
|
| 587 |
"supplier_name": doc_metadata["supplier_name"],
|
| 588 |
+
"checklist_attributes": doc_metadata["checklist_attributes"],
|
| 589 |
"file_hash": file_hash,
|
| 590 |
"processed_date": datetime.now().isoformat(),
|
| 591 |
+
"domain": "Quality Control" # Generic domain
|
| 592 |
}
|
| 593 |
+
|
| 594 |
# Create chunks
|
| 595 |
documents = self.create_chunks(full_text, metadata, parameters, sections)
|
| 596 |
+
|
| 597 |
# Generate embeddings and store in ChromaDB
|
| 598 |
for i, doc in enumerate(documents):
|
| 599 |
embedding = self.embedder.encode(doc["text"]).tolist()
|
|
|
|
| 600 |
self.collection.add(
|
| 601 |
documents=[doc["text"]],
|
| 602 |
embeddings=[embedding],
|
| 603 |
metadatas=[doc["metadata"]],
|
| 604 |
ids=[f"{file_hash}_{i}"]
|
| 605 |
)
|
| 606 |
+
|
| 607 |
# Store metadata in SQLite
|
| 608 |
self.save_document_metadata(file_hash, filename, doc_metadata, len(parameters))
|
| 609 |
self.save_parameters(file_hash, parameters)
|
| 610 |
self.save_sections(file_hash, sections)
|
| 611 |
+
|
| 612 |
# Update manifest
|
| 613 |
self.manifest["processed_files"][filename] = {
|
| 614 |
"hash": file_hash,
|
|
|
|
| 618 |
"parameters_extracted": len(parameters),
|
| 619 |
"sections_extracted": len(sections),
|
| 620 |
"document_type": doc_metadata["document_type"],
|
| 621 |
+
"product_name": doc_metadata["product_name"],
|
| 622 |
+
"attributes": doc_metadata["checklist_attributes"]
|
| 623 |
}
|
| 624 |
self.save_manifest()
|
| 625 |
+
|
| 626 |
# Log success
|
| 627 |
self.log_processing(filename, file_hash, "SUCCESS", None, len(parameters), len(sections))
|
| 628 |
+
|
| 629 |
print(f"Successfully processed {filename}")
|
| 630 |
print(f" - Document Type: {doc_metadata['document_type']}")
|
| 631 |
print(f" - Product: {doc_metadata['product_name']}")
|
| 632 |
print(f" - Parameters extracted: {len(parameters)}")
|
| 633 |
print(f" - Sections extracted: {len(sections)}")
|
| 634 |
+
|
| 635 |
except Exception as e:
|
| 636 |
error_msg = str(e)
|
| 637 |
print(f"Error processing {filename}: {error_msg}")
|
|
|
|
| 639 |
traceback.print_exc()
|
| 640 |
self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
|
| 641 |
|
| 642 |
+
|
| 643 |
def save_document_metadata(self, file_hash, filename, metadata, total_parameters):
|
| 644 |
+
"""Save document metadata to SQLite"""
|
| 645 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 646 |
+
cursor = conn.cursor()
|
| 647 |
+
|
| 648 |
+
try:
|
| 649 |
+
cursor.execute("""
|
| 650 |
+
INSERT OR REPLACE INTO checklist_documents
|
| 651 |
+
(file_hash, filename, document_type, product_name, supplier_name,
|
| 652 |
+
checklist_attributes, total_parameters)
|
| 653 |
+
VALUES (?, ?, ?, ?, ?, ?, ?)
|
| 654 |
+
""", (
|
| 655 |
+
file_hash, filename, metadata["document_type"], metadata["product_name"],
|
| 656 |
+
metadata["supplier_name"], metadata["checklist_attributes"], total_parameters
|
| 657 |
+
))
|
| 658 |
+
conn.commit()
|
| 659 |
+
finally:
|
| 660 |
+
conn.close()
|
| 661 |
+
|
| 662 |
def save_parameters(self, file_hash, parameters):
|
| 663 |
+
"""Save extracted parameters to SQLite"""
|
| 664 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 665 |
+
cursor = conn.cursor()
|
| 666 |
+
|
| 667 |
+
try:
|
| 668 |
+
# Delete existing parameters for this file
|
| 669 |
+
cursor.execute("DELETE FROM checklist_parameters WHERE file_hash = ?", (file_hash,))
|
| 670 |
+
|
| 671 |
+
# Insert new parameters
|
| 672 |
+
for param in parameters:
|
| 673 |
+
cursor.execute("""
|
| 674 |
+
INSERT INTO checklist_parameters
|
| 675 |
+
(file_hash, parameter_name, parameter_type, input_method, specifications,
|
| 676 |
+
options_list, tolerance_limits, measurement_units, section_category,
|
| 677 |
+
parameter_order, has_remarks, is_mandatory)
|
| 678 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 679 |
+
""", (
|
| 680 |
+
file_hash, param["parameter_name"], param["parameter_type"],
|
| 681 |
+
param["input_method"], param["specifications"], param["options_list"],
|
| 682 |
+
param["tolerance_limits"], param["measurement_units"],
|
| 683 |
+
param["section_category"], param["parameter_order"],
|
| 684 |
+
param["has_remarks"], param["is_mandatory"]
|
| 685 |
+
))
|
| 686 |
+
|
| 687 |
+
conn.commit()
|
| 688 |
+
finally:
|
| 689 |
+
conn.close()
|
| 690 |
+
|
| 691 |
def save_sections(self, file_hash, sections):
|
| 692 |
+
"""Save extracted sections to SQLite"""
|
| 693 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 694 |
+
cursor = conn.cursor()
|
| 695 |
+
|
| 696 |
+
try:
|
| 697 |
+
# Delete existing sections for this file
|
| 698 |
+
cursor.execute("DELETE FROM checklist_sections WHERE file_hash = ?", (file_hash,))
|
| 699 |
+
|
| 700 |
+
# Insert new sections
|
| 701 |
+
for section in sections:
|
| 702 |
+
cursor.execute("""
|
| 703 |
+
INSERT INTO checklist_sections
|
| 704 |
+
(file_hash, section_name, section_type, section_order, parameter_count)
|
| 705 |
+
VALUES (?, ?, ?, ?, ?)
|
| 706 |
+
""", (
|
| 707 |
+
file_hash, section["section_name"], section["section_type"],
|
| 708 |
+
section["section_order"], section["parameter_count"]
|
| 709 |
+
))
|
| 710 |
+
|
| 711 |
+
conn.commit()
|
| 712 |
+
finally:
|
| 713 |
+
conn.close()
|
| 714 |
+
|
| 715 |
def log_processing(self, filename, file_hash, status, error_message, parameters_extracted=0, sections_extracted=0):
|
| 716 |
+
"""Log processing status"""
|
| 717 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 718 |
+
cursor = conn.cursor()
|
| 719 |
+
|
| 720 |
+
try:
|
| 721 |
+
cursor.execute("""
|
| 722 |
+
INSERT INTO processing_log
|
| 723 |
+
(filename, file_hash, status, error_message, parameters_extracted, sections_extracted)
|
| 724 |
+
VALUES (?, ?, ?, ?, ?, ?)
|
| 725 |
+
""", (filename, file_hash, status, error_message, parameters_extracted, sections_extracted))
|
| 726 |
+
|
| 727 |
+
conn.commit()
|
| 728 |
+
finally:
|
| 729 |
+
conn.close()
|
| 730 |
+
|
| 731 |
def process_all_pdfs(self):
|
| 732 |
+
"""Process all PDFs in the directory"""
|
| 733 |
+
pdf_files = list(self.pdf_path.glob("*.pdf"))
|
| 734 |
+
|
| 735 |
+
if not pdf_files:
|
| 736 |
+
print(f"No PDF files found in {self.pdf_path}")
|
| 737 |
+
return
|
| 738 |
+
|
| 739 |
+
print(f"Found {len(pdf_files)} checklist PDF files")
|
| 740 |
+
|
| 741 |
+
for pdf_file in pdf_files:
|
| 742 |
+
self.process_pdf(pdf_file)
|
| 743 |
+
|
| 744 |
+
print("\nChecklist processing complete!")
|
| 745 |
+
print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
|
| 746 |
+
|
| 747 |
def get_processing_stats(self):
|
| 748 |
+
"""Get processing statistics"""
|
| 749 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 750 |
+
cursor = conn.cursor()
|
| 751 |
+
|
| 752 |
+
try:
|
| 753 |
+
# Get overall stats
|
| 754 |
+
cursor.execute("""
|
| 755 |
+
SELECT COUNT(*) as total,
|
| 756 |
+
SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
|
| 757 |
+
SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
|
| 758 |
+
SUM(parameters_extracted) as total_parameters,
|
| 759 |
+
SUM(sections_extracted) as total_sections
|
| 760 |
+
FROM processing_log
|
| 761 |
+
""")
|
| 762 |
+
|
| 763 |
+
stats = cursor.fetchone()
|
| 764 |
+
|
| 765 |
+
# Get document type distribution
|
| 766 |
+
cursor.execute("""
|
| 767 |
+
SELECT document_type, COUNT(*) as count
|
| 768 |
+
FROM checklist_documents
|
| 769 |
+
GROUP BY document_type
|
| 770 |
+
ORDER BY count DESC
|
| 771 |
+
""")
|
| 772 |
+
|
| 773 |
+
doc_types = cursor.fetchall()
|
| 774 |
+
|
| 775 |
+
# Get attribute distribution
|
| 776 |
+
cursor.execute("""
|
| 777 |
+
SELECT checklist_attributes, COUNT(*) as count
|
| 778 |
+
FROM checklist_documents
|
| 779 |
+
WHERE checklist_attributes IS NOT NULL
|
| 780 |
+
GROUP BY checklist_attributes
|
| 781 |
+
""")
|
| 782 |
+
|
| 783 |
+
attr_dist = cursor.fetchall()
|
| 784 |
+
|
| 785 |
+
# Parse attributes for summary
|
| 786 |
+
attribute_summary = {}
|
| 787 |
+
for attrs_json, count in attr_dist:
|
| 788 |
+
if attrs_json:
|
| 789 |
+
try:
|
| 790 |
+
attrs = json.loads(attrs_json)
|
| 791 |
+
for key, value in attrs.items():
|
| 792 |
+
if key not in attribute_summary:
|
| 793 |
+
attribute_summary[key] = {}
|
| 794 |
+
if value not in attribute_summary[key]:
|
| 795 |
+
attribute_summary[key][value] = 0
|
| 796 |
+
attribute_summary[key][value] += count
|
| 797 |
+
except:
|
| 798 |
+
pass
|
| 799 |
+
|
| 800 |
+
# Get parameter type distribution
|
| 801 |
+
cursor.execute("""
|
| 802 |
+
SELECT input_method, COUNT(*) as count
|
| 803 |
+
FROM checklist_parameters
|
| 804 |
+
GROUP BY input_method
|
| 805 |
+
ORDER BY count DESC
|
| 806 |
+
""")
|
| 807 |
+
|
| 808 |
+
input_methods = cursor.fetchall()
|
| 809 |
+
|
| 810 |
+
# Get most common parameters
|
| 811 |
+
cursor.execute("""
|
| 812 |
+
SELECT parameter_name, parameter_type, input_method, COUNT(*) as frequency
|
| 813 |
+
FROM checklist_parameters
|
| 814 |
+
GROUP BY parameter_name, parameter_type, input_method
|
| 815 |
+
HAVING frequency > 1
|
| 816 |
+
ORDER BY frequency DESC
|
| 817 |
+
LIMIT 10
|
| 818 |
+
""")
|
| 819 |
+
|
| 820 |
+
common_params = cursor.fetchall()
|
| 821 |
+
|
| 822 |
+
return {
|
| 823 |
+
"total_processed": stats[0],
|
| 824 |
+
"successful": stats[1],
|
| 825 |
+
"errors": stats[2],
|
| 826 |
+
"total_parameters": stats[3],
|
| 827 |
+
"total_sections": stats[4],
|
| 828 |
+
"document_types": dict(doc_types),
|
| 829 |
+
"input_methods": dict(input_methods),
|
| 830 |
+
"attribute_summary": attribute_summary,
|
| 831 |
+
"common_parameters": [
|
| 832 |
+
{
|
| 833 |
+
"name": p[0],
|
| 834 |
+
"type": p[1],
|
| 835 |
+
"input_method": p[2],
|
| 836 |
+
"frequency": p[3]
|
| 837 |
+
} for p in common_params
|
| 838 |
+
]
|
| 839 |
+
}
|
| 840 |
+
finally:
|
| 841 |
+
conn.close()
|
| 842 |
+
|
| 843 |
def get_parameter_patterns(self):
|
| 844 |
+
"""Get common parameter patterns for AI reference"""
|
| 845 |
+
conn = sqlite3.connect(self.metadata_db_path)
|
| 846 |
+
cursor = conn.cursor()
|
| 847 |
+
|
| 848 |
+
try:
|
| 849 |
+
cursor.execute("""
|
| 850 |
+
SELECT
|
| 851 |
+
parameter_type,
|
| 852 |
+
input_method,
|
| 853 |
+
GROUP_CONCAT(DISTINCT specifications) as common_specs,
|
| 854 |
+
GROUP_CONCAT(DISTINCT options_list) as common_options,
|
| 855 |
+
COUNT(*) as usage_count
|
| 856 |
+
FROM checklist_parameters
|
| 857 |
+
WHERE specifications != '' OR options_list != ''
|
| 858 |
+
GROUP BY parameter_type, input_method
|
| 859 |
+
ORDER BY usage_count DESC
|
| 860 |
+
""")
|
| 861 |
+
|
| 862 |
+
patterns = []
|
| 863 |
+
for row in cursor.fetchall():
|
| 864 |
+
patterns.append({
|
| 865 |
+
"parameter_type": row[0],
|
| 866 |
+
"input_method": row[1],
|
| 867 |
+
"common_specifications": row[2],
|
| 868 |
+
"common_options": row[3],
|
| 869 |
+
"usage_count": row[4]
|
| 870 |
+
})
|
| 871 |
+
|
| 872 |
+
return patterns
|
| 873 |
+
finally:
|
| 874 |
+
conn.close()
|
| 875 |
+
|
| 876 |
def search_similar_checklists(self, product_name, checklist_type="", limit=5):
|
| 877 |
+
"""Search for similar checklists based on product and type"""
|
| 878 |
+
query_text = f"{product_name} {checklist_type} quality control inspection checklist"
|
| 879 |
+
query_embedding = self.embedder.encode(query_text).tolist()
|
| 880 |
+
|
| 881 |
+
try:
|
| 882 |
+
results = self.collection.query(
|
| 883 |
+
query_embeddings=[query_embedding],
|
| 884 |
+
n_results=limit,
|
| 885 |
+
where={"domain": "Quality Control"}
|
| 886 |
+
)
|
| 887 |
+
|
| 888 |
+
similar_checklists = []
|
| 889 |
+
if results['documents'][0]:
|
| 890 |
+
for i, doc in enumerate(results['documents'][0]):
|
| 891 |
+
metadata = results['metadatas'][0][i]
|
| 892 |
+
|
| 893 |
+
# Parse attributes
|
| 894 |
+
attrs = {}
|
| 895 |
+
if metadata.get('checklist_attributes'):
|
| 896 |
+
try:
|
| 897 |
+
attrs = json.loads(metadata['checklist_attributes'])
|
| 898 |
+
except:
|
| 899 |
+
pass
|
| 900 |
+
|
| 901 |
+
similar_checklists.append({
|
| 902 |
+
"document": metadata.get('source', 'Unknown'),
|
| 903 |
+
"product": metadata.get('product_name', 'Unknown'),
|
| 904 |
+
"type": metadata.get('document_type', 'Unknown'),
|
| 905 |
+
"attributes": attrs,
|
| 906 |
+
"parameters": metadata.get('total_parameters', 0),
|
| 907 |
+
"relevance_score": 1 - results['distances'][0][i] if 'distances' in results else 0,
|
| 908 |
+
"content_preview": doc[:200] + "..." if len(doc) > 200 else doc
|
| 909 |
+
})
|
| 910 |
+
|
| 911 |
+
return similar_checklists
|
| 912 |
+
except Exception as e:
|
| 913 |
+
print(f"Error searching checklists: {str(e)}")
|
| 914 |
+
return []
|
| 915 |
|
| 916 |
|
| 917 |
def main():
|
| 918 |
+
"""Main function to create/update the checklist examples database"""
|
| 919 |
+
print("Starting Generic Checklist Examples Database Creation...")
|
| 920 |
+
print("Features: No company bias, dynamic attributes, universal patterns")
|
| 921 |
+
|
| 922 |
+
# Initialize database
|
| 923 |
+
db = ChecklistExamplesVDB()
|
| 924 |
+
|
| 925 |
+
# Process all PDFs
|
| 926 |
+
db.process_all_pdfs()
|
| 927 |
+
|
| 928 |
+
# Show processing stats
|
| 929 |
+
print("\n" + "="*60)
|
| 930 |
+
print("PROCESSING STATISTICS (Generic)")
|
| 931 |
+
print("="*60)
|
| 932 |
+
|
| 933 |
+
stats = db.get_processing_stats()
|
| 934 |
+
print(f"Total files processed: {stats['total_processed']}")
|
| 935 |
+
print(f"Successful: {stats['successful']}")
|
| 936 |
+
print(f"Errors: {stats['errors']}")
|
| 937 |
+
print(f"Total parameters extracted: {stats['total_parameters']}")
|
| 938 |
+
print(f"Total sections extracted: {stats['total_sections']}")
|
| 939 |
+
|
| 940 |
+
print("\nDocument Types:")
|
| 941 |
+
for doc_type, count in stats["document_types"].items():
|
| 942 |
+
print(f" - {doc_type}: {count} documents")
|
| 943 |
+
|
| 944 |
+
print("\nDynamic Attributes Found:")
|
| 945 |
+
for attr_type, values in stats["attribute_summary"].items():
|
| 946 |
+
print(f"\n{attr_type}:")
|
| 947 |
+
for value, count in values.items():
|
| 948 |
+
print(f" - {value}: {count} documents")
|
| 949 |
+
|
| 950 |
+
print("\nInput Methods Distribution:")
|
| 951 |
+
for method, count in stats["input_methods"].items():
|
| 952 |
+
print(f" - {method}: {count} parameters")
|
| 953 |
+
|
| 954 |
+
print("\nMost Common Parameters (Generic):")
|
| 955 |
+
for param in stats["common_parameters"]:
|
| 956 |
+
print(f" - {param['name']} ({param['input_method']}) - used {param['frequency']} times")
|
| 957 |
+
|
| 958 |
+
# Show parameter patterns
|
| 959 |
+
print("\n" + "="*60)
|
| 960 |
+
print("PARAMETER PATTERNS DISCOVERED")
|
| 961 |
+
print("="*60)
|
| 962 |
+
|
| 963 |
+
patterns = db.get_parameter_patterns()
|
| 964 |
+
for pattern in patterns[:10]: # Show top 10 patterns
|
| 965 |
+
print(f"\n{pattern['parameter_type']} -> {pattern['input_method']}")
|
| 966 |
+
print(f" Usage: {pattern['usage_count']} times")
|
| 967 |
+
if pattern['common_specifications']:
|
| 968 |
+
specs = pattern['common_specifications'][:100]
|
| 969 |
+
print(f" Common specs: {specs}{'...' if len(pattern['common_specifications']) > 100 else ''}")
|
| 970 |
+
if pattern['common_options']:
|
| 971 |
+
options = pattern['common_options'][:100]
|
| 972 |
+
print(f" Common options: {options}{'...' if len(pattern['common_options']) > 100 else ''}")
|
| 973 |
+
|
| 974 |
+
# Test search functionality
|
| 975 |
+
print("\n" + "="*60)
|
| 976 |
+
print("TESTING SEARCH FUNCTIONALITY")
|
| 977 |
+
print("="*60)
|
| 978 |
+
|
| 979 |
+
test_products = ["Quality Inspection", "Production Check", "Safety Assessment"]
|
| 980 |
+
for product in test_products:
|
| 981 |
+
print(f"\nSearching for '{product}' checklists:")
|
| 982 |
+
similar = db.search_similar_checklists(product, limit=3)
|
| 983 |
+
for i, checklist in enumerate(similar, 1):
|
| 984 |
+
print(f" {i}. {checklist['document']} ({checklist['type']})")
|
| 985 |
+
print(f" Product: {checklist['product']}, Parameters: {checklist['parameters']}")
|
| 986 |
+
if checklist['attributes']:
|
| 987 |
+
print(f" Attributes: {checklist['attributes']}")
|
| 988 |
+
print(f" Relevance: {checklist['relevance_score']:.3f}")
|
| 989 |
|
| 990 |
|
| 991 |
if __name__ == "__main__":
|
| 992 |
+
main()
|
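The checklist script above stores the new dynamic attributes as a JSON string in a single TEXT column (checklist_attributes), written with json.dumps in extract_document_metadata and decoded with json.loads when get_processing_stats aggregates them. Below is a minimal, self-contained sketch of that storage pattern; the three-column table and the sample values are made up for illustration and are not code from the repository.

import json
import sqlite3

# Illustrative sketch (assumed table layout, not the repo's full schema):
# dynamic attributes live in one TEXT column as a JSON blob instead of a fixed category field.
conn = sqlite3.connect(":memory:")
conn.execute("""
    CREATE TABLE checklist_documents (
        file_hash TEXT PRIMARY KEY,
        filename TEXT,
        checklist_attributes TEXT  -- JSON string, e.g. {"inspection_stage": "incoming"}
    )
""")

attrs = {"inspection_stage": "incoming", "inspection_focus": "visual", "complexity": "simple"}
conn.execute(
    "INSERT OR REPLACE INTO checklist_documents VALUES (?, ?, ?)",
    ("abc123", "receiving_checklist.pdf", json.dumps(attrs)),
)

# Reading back mirrors get_processing_stats(): decode each blob and tally values per key.
summary = {}
for (attrs_json,) in conn.execute("SELECT checklist_attributes FROM checklist_documents"):
    for key, value in json.loads(attrs_json).items():
        summary.setdefault(key, {}).setdefault(value, 0)
        summary[key][value] += 1
print(summary)

Keeping the attributes as an opaque JSON blob lets new attribute keys appear without schema changes, at the cost of not being able to filter on individual keys directly in SQL.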
vector_stores/chroma_db/product_specs/create_product_spec_db.py
CHANGED
|
@@ -48,6 +48,7 @@ class ProductSpecificationVectorDB:
|
|
| 48 |
conn = sqlite3.connect(self.metadata_db_path)
|
| 49 |
cursor = conn.cursor()
|
| 50 |
|
|
|
|
| 51 |
cursor.execute("""
|
| 52 |
CREATE TABLE IF NOT EXISTS product_documents (
|
| 53 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
|
@@ -56,7 +57,7 @@ class ProductSpecificationVectorDB:
|
|
| 56 |
product_name TEXT,
|
| 57 |
brand TEXT,
|
| 58 |
supplier TEXT,
|
| 59 |
-
|
| 60 |
shelf_life TEXT,
|
| 61 |
storage_conditions TEXT,
|
| 62 |
manufacturing_location TEXT,
|
|
@@ -136,12 +137,12 @@ class ProductSpecificationVectorDB:
|
|
| 136 |
return hashlib.md5(f.read()).hexdigest()
|
| 137 |
|
| 138 |
def extract_product_metadata(self, text):
|
| 139 |
-
"""Extract product-specific metadata"""
|
| 140 |
metadata = {
|
| 141 |
"product_name": "",
|
| 142 |
"brand": "",
|
| 143 |
"supplier": "",
|
| 144 |
-
"
|
| 145 |
"shelf_life": "",
|
| 146 |
"storage_conditions": "",
|
| 147 |
"manufacturing_location": "",
|
|
@@ -156,16 +157,16 @@ class ProductSpecificationVectorDB:
|
|
| 156 |
metadata["product_name"] = re.search(r'Product\s*Name[:]*\s*(.+)', line, re.IGNORECASE).group(1).strip()
|
| 157 |
break
|
| 158 |
|
| 159 |
-
# Extract brand
|
| 160 |
brand_patterns = [
|
| 161 |
r'Brand[:]*\s*(.+)',
|
| 162 |
-
r'
|
| 163 |
r'Company[:]*\s*(.+)'
|
| 164 |
]
|
| 165 |
for pattern in brand_patterns:
|
| 166 |
match = re.search(pattern, text, re.IGNORECASE)
|
| 167 |
if match:
|
| 168 |
-
metadata["brand"] = match.group(1).strip()
|
| 169 |
break
|
| 170 |
|
| 171 |
# Extract shelf life
|
|
@@ -192,21 +193,56 @@ class ProductSpecificationVectorDB:
|
|
| 192 |
metadata["storage_conditions"] = match.group(1).strip()
|
| 193 |
break
|
| 194 |
|
| 195 |
-
#
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
}
|
| 203 |
|
| 204 |
text_lower = text.lower()
|
| 205 |
-
for
|
| 206 |
if any(keyword in text_lower for keyword in keywords):
|
| 207 |
-
|
| 208 |
break
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
return metadata
|
| 211 |
|
| 212 |
def extract_parameters(self, text):
|
|
@@ -276,30 +312,30 @@ class ProductSpecificationVectorDB:
|
|
| 276 |
return "Text Input"
|
| 277 |
|
| 278 |
def classify_parameter_category(self, param_name):
|
| 279 |
-
"""Classify parameter into categories"""
|
| 280 |
param_lower = param_name.lower()
|
| 281 |
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
|
| 298 |
def is_critical_parameter(self, param_name):
|
| 299 |
"""Determine if parameter is critical for safety/quality"""
|
| 300 |
critical_keywords = [
|
| 301 |
"temperature", "microbiological", "pathogen", "salmonella", "listeria",
|
| 302 |
-
"foreign", "contamination", "allergen", "
|
| 303 |
]
|
| 304 |
return any(keyword in param_name.lower() for keyword in critical_keywords)
|
| 305 |
|
|
@@ -310,10 +346,11 @@ class ProductSpecificationVectorDB:
|
|
| 310 |
|
| 311 |
# Look for table headers
|
| 312 |
table_indicators = [
|
| 313 |
-
"
|
| 314 |
-
"
|
| 315 |
-
"
|
| 316 |
-
"
|
|
|
|
| 317 |
]
|
| 318 |
|
| 319 |
in_table = False
|
|
@@ -340,7 +377,7 @@ class ProductSpecificationVectorDB:
|
|
| 340 |
|
| 341 |
if len(param_name) > 3 and param_name not in ["PARAMETERS", "ACCEPTED LIMIT"]:
|
| 342 |
param_type = self.classify_parameter_type(param_name, value, unit)
|
| 343 |
-
category =
|
| 344 |
|
| 345 |
parameters.append({
|
| 346 |
"parameter_name": param_name,
|
|
@@ -389,50 +426,38 @@ class ProductSpecificationVectorDB:
|
|
| 389 |
return nutritional_data
|
| 390 |
|
| 391 |
def extract_compliance_standards(self, text):
|
| 392 |
-
"""Extract compliance standards and certifications"""
|
| 393 |
standards = []
|
| 394 |
|
| 395 |
-
#
|
| 396 |
standard_patterns = [
|
| 397 |
-
r'(
|
| 398 |
-
r'(
|
| 399 |
-
r'(
|
| 400 |
-
r'(
|
| 401 |
-
r'(FDA)',
|
| 402 |
-
r'(SASO\s*Standard)',
|
| 403 |
-
r'(EU\s*Regulation)',
|
| 404 |
-
r'(AOAC)',
|
| 405 |
-
r'(Codex\s*Alimentarius)'
|
| 406 |
]
|
| 407 |
|
| 408 |
for pattern in standard_patterns:
|
| 409 |
matches = re.finditer(pattern, text, re.IGNORECASE)
|
| 410 |
for match in matches:
|
| 411 |
-
|
| 412 |
|
| 413 |
-
#
|
| 414 |
-
if
|
| 415 |
-
standard_name = "
|
| 416 |
-
compliance_type = "
|
| 417 |
-
elif "ISO" in standard_code:
|
| 418 |
-
standard_name = "International Organization for Standardization"
|
| 419 |
-
compliance_type = "International Standard"
|
| 420 |
-
elif "HACCP" in standard_code:
|
| 421 |
-
standard_name = "Hazard Analysis Critical Control Points"
|
| 422 |
-
compliance_type = "Food Safety System"
|
| 423 |
-
elif "HALAL" in standard_code:
|
| 424 |
-
standard_name = "Halal Certification"
|
| 425 |
-
compliance_type = "Religious Compliance"
|
| 426 |
else:
|
| 427 |
-
standard_name =
|
| 428 |
-
compliance_type = "
|
| 429 |
|
| 430 |
-
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
| 436 |
|
| 437 |
return standards
|
| 438 |
|
|
@@ -456,8 +481,14 @@ class ProductSpecificationVectorDB:
|
|
| 456 |
|
| 457 |
# Add product context to searchable content
|
| 458 |
searchable_content = f"Product: {metadata.get('product_name', 'Unknown')}\n"
|
| 459 |
-
|
| 460 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 461 |
|
| 462 |
documents.append({
|
| 463 |
"text": searchable_content,
|
|
@@ -511,7 +542,7 @@ class ProductSpecificationVectorDB:
|
|
| 511 |
"source": filename,
|
| 512 |
"product_name": product_metadata["product_name"],
|
| 513 |
"brand": product_metadata["brand"],
|
| 514 |
-
"
|
| 515 |
"shelf_life": product_metadata["shelf_life"],
|
| 516 |
"storage_conditions": product_metadata["storage_conditions"],
|
| 517 |
"file_hash": file_hash,
|
|
@@ -546,7 +577,8 @@ class ProductSpecificationVectorDB:
|
|
| 546 |
"processed_date": datetime.now().isoformat(),
|
| 547 |
"product_name": product_metadata["product_name"],
|
| 548 |
"parameters_extracted": len(parameters),
|
| 549 |
-
"compliance_standards": len(compliance_standards)
|
|
|
|
| 550 |
}
|
| 551 |
self.save_manifest()
|
| 552 |
|
|
@@ -568,13 +600,14 @@ class ProductSpecificationVectorDB:
|
|
| 568 |
try:
|
| 569 |
cursor.execute("""
|
| 570 |
INSERT OR REPLACE INTO product_documents
|
| 571 |
-
(file_hash, filename, product_name, brand, supplier,
|
| 572 |
shelf_life, storage_conditions, manufacturing_location, document_type)
|
| 573 |
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 574 |
""", (
|
| 575 |
file_hash, filename,
|
| 576 |
metadata["product_name"], metadata["brand"], metadata["supplier"],
|
| 577 |
-
metadata["
|
|
|
|
| 578 |
metadata["storage_conditions"], metadata["manufacturing_location"],
|
| 579 |
metadata["document_type"]
|
| 580 |
))
|
|
@@ -615,97 +648,162 @@ class ProductSpecificationVectorDB:
             cursor.execute("DELETE FROM nutritional_info WHERE file_hash = ?", (file_hash,))

             for nutrition in nutritional_data:
-                cursor.execute("""
-                    …
             conn.commit()
         finally:
             conn.close()
-
     def save_compliance_standards(self, file_hash, standards):
-        …
     def log_processing(self, filename, file_hash, status, error_message, params_count=0, standards_count=0):
-        …
     def ocr_pdf(self, pdf_path):
-        …
     def process_all_pdfs(self):
-        …


 def main():
-    …


 if __name__ == "__main__":
-    …
         conn = sqlite3.connect(self.metadata_db_path)
         cursor = conn.cursor()

+        # UPDATED: Added product_attributes instead of fixed category
         cursor.execute("""
             CREATE TABLE IF NOT EXISTS product_documents (
                 id INTEGER PRIMARY KEY AUTOINCREMENT,
                 …
                 product_name TEXT,
                 brand TEXT,
                 supplier TEXT,
+                product_attributes TEXT, -- Dynamic attributes instead of category
                 shelf_life TEXT,
                 storage_conditions TEXT,
                 manufacturing_location TEXT,
             return hashlib.md5(f.read()).hexdigest()

     def extract_product_metadata(self, text):
+        """Extract product-specific metadata without forcing categories"""
         metadata = {
             "product_name": "",
             "brand": "",
             "supplier": "",
+            "product_attributes": {},  # Dynamic attributes
             "shelf_life": "",
             "storage_conditions": "",
             "manufacturing_location": "",
             …
                 metadata["product_name"] = re.search(r'Product\s*Name[:]*\s*(.+)', line, re.IGNORECASE).group(1).strip()
                 break

+        # Extract brand (generic)
         brand_patterns = [
             r'Brand[:]*\s*(.+)',
+            r'Manufacturer[:]*\s*(.+)',
             r'Company[:]*\s*(.+)'
         ]
         for pattern in brand_patterns:
             match = re.search(pattern, text, re.IGNORECASE)
             if match:
+                metadata["brand"] = match.group(1).strip()
                 break

         # Extract shelf life
         …
                 metadata["storage_conditions"] = match.group(1).strip()
                 break

+        # UPDATED: Extract dynamic product attributes instead of fixed categories
+        attributes = {}
+
+        # Temperature requirements
+        temp_match = re.search(r'(?:stored?|kept?|maintain(?:ed)?)\s*at\s*([-\d]+\s*[°]?[CF])', text, re.IGNORECASE)
+        if temp_match:
+            attributes["temperature_requirement"] = temp_match.group(1)
+
+        # Processing method
+        processing_keywords = {
+            "frozen": ["frozen", "freeze", "iqf", "individually quick frozen"],
+            "fresh": ["fresh", "chilled", "refrigerated"],
+            "dried": ["dried", "dehydrated", "dry"],
+            "canned": ["canned", "tinned", "preserved"],
+            "fried": ["fried", "deep fried", "oil fried"],
+            "baked": ["baked", "oven", "bakery"],
+            "raw": ["raw", "uncooked", "unprocessed"],
+            "cooked": ["cooked", "pre-cooked", "ready to eat"]
         }

         text_lower = text.lower()
+        for method, keywords in processing_keywords.items():
             if any(keyword in text_lower for keyword in keywords):
+                attributes["processing_method"] = method
                 break

+        # Product form
+        form_keywords = {
+            "powder": ["powder", "powdered"],
+            "liquid": ["liquid", "juice", "syrup"],
+            "solid": ["solid", "whole", "pieces"],
+            "paste": ["paste", "puree"],
+            "granular": ["granular", "granules"]
+        }
+
+        for form, keywords in form_keywords.items():
+            if any(keyword in text_lower for keyword in keywords):
+                attributes["product_form"] = form
+                break
+
+        # Special characteristics
+        if any(word in text_lower for word in ["organic", "natural", "no preservatives"]):
+            attributes["special_characteristics"] = "natural/organic"
+        if any(word in text_lower for word in ["halal", "kosher"]):
+            attributes["certification"] = "religious compliance"
+        if any(word in text_lower for word in ["gluten free", "allergen free"]):
+            attributes["dietary"] = "allergen-free"
+
+        metadata["product_attributes"] = json.dumps(attributes)
+
         return metadata

     def extract_parameters(self, text):
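
For reference, a minimal standalone sketch (not part of the commit) of the kind of JSON blob the dynamic-attribute extraction above ends up storing in product_attributes; the sample text is hypothetical and only the temperature regex is copied from the diff.

    import json
    import re

    sample_text = "IQF frozen shrimp, halal certified, to be stored at -18 °C."

    attributes = {}
    # Same temperature pattern as in extract_product_metadata above
    temp_match = re.search(r'(?:stored?|kept?|maintain(?:ed)?)\s*at\s*([-\d]+\s*[°]?[CF])', sample_text, re.IGNORECASE)
    if temp_match:
        attributes["temperature_requirement"] = temp_match.group(1)
    if "frozen" in sample_text.lower():
        attributes["processing_method"] = "frozen"
    if "halal" in sample_text.lower():
        attributes["certification"] = "religious compliance"

    print(json.dumps(attributes))
    # Roughly: {"temperature_requirement": "-18 \u00b0C", "processing_method": "frozen", "certification": "religious compliance"}
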
         return "Text Input"

     def classify_parameter_category(self, param_name):
+        """Classify parameter into categories dynamically"""
         param_lower = param_name.lower()

+        # Dynamic categorization based on parameter nature
+        if any(word in param_lower for word in ["weight", "size", "dimension", "length", "width"]):
+            return "Physical Measurement"
+        elif any(word in param_lower for word in ["appearance", "color", "texture", "taste", "flavor"]):
+            return "Sensory Attribute"
+        elif any(word in param_lower for word in ["bacteria", "microb", "pathogen", "coli"]):
+            return "Microbiological"
+        elif any(word in param_lower for word in ["moisture", "fat", "protein", "ph", "acid"]):
+            return "Chemical Composition"
+        elif any(word in param_lower for word in ["foreign", "contamination", "hazard"]):
+            return "Safety Parameter"
+        elif any(word in param_lower for word in ["temperature", "thermal"]):
+            return "Temperature Control"
+        else:
+            return "Quality Parameter"

     def is_critical_parameter(self, param_name):
         """Determine if parameter is critical for safety/quality"""
         critical_keywords = [
             "temperature", "microbiological", "pathogen", "salmonella", "listeria",
+            "foreign", "contamination", "allergen", "critical"
         ]
         return any(keyword in param_name.lower() for keyword in critical_keywords)

         # Look for table headers
         table_indicators = [
+            "SPECIFICATIONS",
+            "PARAMETERS",
+            "CHARACTERISTICS",
+            "REQUIREMENTS",
+            "LIMITS"
         ]

         in_table = False
         …
                 if len(param_name) > 3 and param_name not in ["PARAMETERS", "ACCEPTED LIMIT"]:
                     param_type = self.classify_parameter_type(param_name, value, unit)
+                    category = self.classify_parameter_category(param_name)

                     parameters.append({
                         "parameter_name": param_name,
         return nutritional_data

     def extract_compliance_standards(self, text):
+        """Extract compliance standards and certifications generically"""
         standards = []

+        # Generic standard patterns
         standard_patterns = [
+            r'(?:complies?\s*with|as\s*per|according\s*to)\s*([A-Z]+\s*\d+[:/]?\d*)',
+            r'(?:standard|specification)\s*:?\s*([A-Z]+\s*\d+[:/]?\d*)',
+            r'(?:certified|certification)\s*:?\s*([A-Za-z\s]+)',
+            r'([A-Z]{2,}\s*\d+(?::\d+)?)',  # Generic standard format
         ]

         for pattern in standard_patterns:
             matches = re.finditer(pattern, text, re.IGNORECASE)
             for match in matches:
+                standard_ref = match.group(1).strip()

+                # Generic classification
+                if re.match(r'^[A-Z]{2,4}\s*\d+', standard_ref):
+                    standard_name = "Industry Standard"
+                    compliance_type = "Technical Standard"
                 else:
+                    standard_name = standard_ref
+                    compliance_type = "Certification"

+                # Avoid duplicates
+                if not any(s["standard_code"] == standard_ref for s in standards):
+                    standards.append({
+                        "standard_name": standard_name,
+                        "standard_code": standard_ref,
+                        "compliance_type": compliance_type,
+                        "requirements": ""
+                    })

         return standards
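
As a quick illustration (a sketch of expected behaviour, not asserted by the commit), the last and most generic pattern in the list above is the one that picks bare standard codes out of free text:

    import re

    sentence = "The product complies with ISO 22000:2018 and GSO 2055."
    print(re.findall(r'([A-Z]{2,}\s*\d+(?::\d+)?)', sentence))
    # ['ISO 22000:2018', 'GSO 2055']
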

         # Add product context to searchable content
         searchable_content = f"Product: {metadata.get('product_name', 'Unknown')}\n"
+
+        # Add dynamic attributes
+        if metadata.get('product_attributes'):
+            attrs = json.loads(metadata['product_attributes'])
+            if attrs:
+                searchable_content += f"Attributes: {', '.join(f'{k}={v}' for k, v in attrs.items())}\n"
+
+        searchable_content += f"\n{chunk}"

         documents.append({
             "text": searchable_content,
         …
             "source": filename,
             "product_name": product_metadata["product_name"],
             "brand": product_metadata["brand"],
+            "product_attributes": product_metadata["product_attributes"],  # Dynamic attributes
             "shelf_life": product_metadata["shelf_life"],
             "storage_conditions": product_metadata["storage_conditions"],
             "file_hash": file_hash,
         …
             "processed_date": datetime.now().isoformat(),
             "product_name": product_metadata["product_name"],
             "parameters_extracted": len(parameters),
+            "compliance_standards": len(compliance_standards),
+            "attributes": product_metadata["product_attributes"]
         }
         self.save_manifest()
         …
         try:
             cursor.execute("""
                 INSERT OR REPLACE INTO product_documents
+                (file_hash, filename, product_name, brand, supplier, product_attributes,
                 shelf_life, storage_conditions, manufacturing_location, document_type)
                 VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
             """, (
                 file_hash, filename,
                 metadata["product_name"], metadata["brand"], metadata["supplier"],
+                metadata["product_attributes"],  # Dynamic attributes as JSON
+                metadata["shelf_life"],
                 metadata["storage_conditions"], metadata["manufacturing_location"],
                 metadata["document_type"]
             ))
             cursor.execute("DELETE FROM nutritional_info WHERE file_hash = ?", (file_hash,))

             for nutrition in nutritional_data:
+                cursor.execute("""INSERT INTO nutritional_info
+                    (file_hash, nutrient_name, value_per_100g, daily_value_percent)
+                    VALUES (?, ?, ?, ?)
+                """, (
+                    file_hash, nutrition["nutrient_name"],
+                    nutrition["value_per_100g"], nutrition["daily_value_percent"]
+                ))
+
             conn.commit()
         finally:
             conn.close()
+
     def save_compliance_standards(self, file_hash, standards):
+        """Save compliance standards to SQLite"""
+        conn = sqlite3.connect(self.metadata_db_path)
+        cursor = conn.cursor()
+
+        try:
+            cursor.execute("DELETE FROM compliance_standards WHERE file_hash = ?", (file_hash,))
+
+            for standard in standards:
+                cursor.execute("""
+                    INSERT INTO compliance_standards
+                    (file_hash, standard_name, standard_code, compliance_type, requirements)
+                    VALUES (?, ?, ?, ?, ?)
+                """, (
+                    file_hash, standard["standard_name"], standard["standard_code"],
+                    standard["compliance_type"], standard["requirements"]
+                ))
+
+            conn.commit()
+        finally:
+            conn.close()
+
     def log_processing(self, filename, file_hash, status, error_message, params_count=0, standards_count=0):
+        """Log processing results"""
+        conn = sqlite3.connect(self.metadata_db_path)
+        cursor = conn.cursor()
+
+        try:
+            cursor.execute("""
+                INSERT INTO processing_log
+                (filename, file_hash, status, error_message, parameters_extracted, compliance_standards_extracted)
+                VALUES (?, ?, ?, ?, ?, ?)
+            """, (filename, file_hash, status, error_message, params_count, standards_count))
+
+            conn.commit()
+        finally:
+            conn.close()
+
     def ocr_pdf(self, pdf_path):
+        """OCR fallback for scanned PDFs"""
+        try:
+            images = pdf2image.convert_from_path(pdf_path)
+            full_text = ""
+
+            for i, image in enumerate(images):
+                text = pytesseract.image_to_string(image)
+                full_text += f"\n--- Page {i+1} ---\n{text}"
+
+            return full_text
+        except Exception as e:
+            print(f"OCR error: {e}")
+            return ""
+
     def process_all_pdfs(self):
+        """Process all product specification PDFs"""
+        pdf_files = list(self.pdf_path.glob("*.pdf"))
+
+        if not pdf_files:
+            print(f"No PDF files found in {self.pdf_path}")
+            return
+
+        print(f"Found {len(pdf_files)} product specification files")
+
+        for pdf_file in pdf_files:
+            self.process_pdf(pdf_file)
+
+        print(f"Product specification VDB creation complete!")
+
+    def get_processing_stats(self):
+        """Get processing statistics"""
+        conn = sqlite3.connect(self.metadata_db_path)
+        cursor = conn.cursor()
+
+        try:
+            # Overall stats
+            cursor.execute("""
+                SELECT COUNT(*) as total,
+                       SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
+                       SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
+                       SUM(parameters_extracted) as total_parameters
+                FROM processing_log
+            """)
+
+            stats = cursor.fetchone()
+
+            # Get attribute distribution
+            cursor.execute("""
+                SELECT product_attributes, COUNT(*) as count
+                FROM product_documents
+                WHERE product_attributes IS NOT NULL
+                GROUP BY product_attributes
+            """)
+
+            attribute_dist = cursor.fetchall()
+
+            # Parse attributes to get summary
+            attribute_summary = {}
+            for attrs_json, count in attribute_dist:
+                if attrs_json:
+                    try:
+                        attrs = json.loads(attrs_json)
+                        for key, value in attrs.items():
+                            if key not in attribute_summary:
+                                attribute_summary[key] = {}
+                            if value not in attribute_summary[key]:
+                                attribute_summary[key][value] = 0
+                            attribute_summary[key][value] += count
+                    except:
+                        pass
+
+            return {
+                "total_processed": stats[0] or 0,
+                "successful": stats[1] or 0,
+                "errors": stats[2] or 0,
+                "total_parameters": stats[3] or 0,
+                "attribute_summary": attribute_summary
+            }
+        finally:
+            conn.close()


 def main():
+    """Main function"""
+    print("Creating Product Specification Vector Database...")
+    print("Features: Dynamic product attributes, no fixed categories")
+
+    db = ProductSpecificationVectorDB()
+    db.process_all_pdfs()
+
+    # Show stats
+    stats = db.get_processing_stats()
+    print(f"\n📊 Processing Statistics:")
+    print(f"Total files: {stats['total_processed']}")
+    print(f"Successful: {stats['successful']}")
+    print(f"Total parameters: {stats['total_parameters']}")
+
+    print(f"\n🏷️ Dynamic Product Attributes Found:")
+    for attr_type, values in stats['attribute_summary'].items():
+        print(f"\n{attr_type}:")
+        for value, count in values.items():
+            print(f"  - {value}: {count} products")
+
+    print("\nProduct Specification VDB ready!")


 if __name__ == "__main__":
+    main()
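
Because product_attributes is stored as a JSON string, downstream code has to decode it before filtering. A small helper along these lines (hypothetical; the table and column names follow the schema in the diff above, and the database path is an assumption) shows one way to query it back out:

    import json
    import sqlite3

    def products_with_attribute(db_path, key, value):
        """Return product names whose stored product_attributes JSON maps key to value."""
        conn = sqlite3.connect(db_path)
        try:
            rows = conn.execute(
                "SELECT product_name, product_attributes FROM product_documents "
                "WHERE product_attributes IS NOT NULL"
            ).fetchall()
        finally:
            conn.close()
        matches = []
        for name, attrs_json in rows:
            try:
                attrs = json.loads(attrs_json)
            except (TypeError, ValueError):
                continue
            if attrs.get(key) == value:
                matches.append(name)
        return matches

    # e.g. products_with_attribute("product_metadata.db", "processing_method", "frozen")
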
vector_stores/chroma_db/regulatory_docs/create_regulatory_db.py
CHANGED
|
@@ -1,578 +1,3 @@
|
|
| 1 |
-
# import os
|
| 2 |
-
# import json
|
| 3 |
-
# import sqlite3
|
| 4 |
-
# from datetime import datetime
|
| 5 |
-
# from pathlib import Path
|
| 6 |
-
# import chromadb
|
| 7 |
-
# from chromadb import Settings
|
| 8 |
-
# from langchain_community.document_loaders import PyPDFLoader
|
| 9 |
-
# from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 10 |
-
# from sentence_transformers import SentenceTransformer
|
| 11 |
-
# import pytesseract
|
| 12 |
-
# from PIL import Image
|
| 13 |
-
# import pdf2image
|
| 14 |
-
# import hashlib
|
| 15 |
-
# import re
|
| 16 |
-
|
| 17 |
-
# class RegulatoryGuidelinesDB:
|
| 18 |
-
# def __init__(self, base_path="./vector_stores"):
|
| 19 |
-
# self.base_path = Path(base_path)
|
| 20 |
-
# self.pdf_path = self.base_path / "regulatory_guidelines" / "pdfs"
|
| 21 |
-
# self.chroma_path = self.base_path / "chroma_db" / "regulatory_docs"
|
| 22 |
-
# self.metadata_path = self.chroma_path / "metadata"
|
| 23 |
-
# self.manifest_path = self.metadata_path / "manifest.json"
|
| 24 |
-
# self.metadata_db_path = self.metadata_path / "regulatory_metadata.db"
|
| 25 |
-
|
| 26 |
-
# # Create directories
|
| 27 |
-
# self.pdf_path.mkdir(parents=True, exist_ok=True)
|
| 28 |
-
# self.chroma_path.mkdir(parents=True, exist_ok=True)
|
| 29 |
-
# self.metadata_path.mkdir(parents=True, exist_ok=True)
|
| 30 |
-
|
| 31 |
-
# # Initialize ChromaDB
|
| 32 |
-
# self.client = chromadb.PersistentClient(
|
| 33 |
-
# path=str(self.chroma_path),
|
| 34 |
-
# settings=Settings(anonymized_telemetry=False)
|
| 35 |
-
# )
|
| 36 |
-
|
| 37 |
-
# # Get or create collection
|
| 38 |
-
# self.collection = self.client.get_or_create_collection(
|
| 39 |
-
# name="regulatory_guidelines",
|
| 40 |
-
# metadata={"description": "Regulatory guidelines and standards for QC"}
|
| 41 |
-
# )
|
| 42 |
-
|
| 43 |
-
# # Initialize embedding model
|
| 44 |
-
# self.embedder = SentenceTransformer('all-MiniLM-L6-v2')
|
| 45 |
-
|
| 46 |
-
# # Initialize metadata database
|
| 47 |
-
# self.init_metadata_db()
|
| 48 |
-
|
| 49 |
-
# # Load manifest
|
| 50 |
-
# self.manifest = self.load_manifest()
|
| 51 |
-
|
| 52 |
-
# def init_metadata_db(self):
|
| 53 |
-
# """Initialize SQLite database for storing regulatory metadata"""
|
| 54 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 55 |
-
# cursor = conn.cursor()
|
| 56 |
-
|
| 57 |
-
# cursor.execute("""
|
| 58 |
-
# CREATE TABLE IF NOT EXISTS regulatory_documents (
|
| 59 |
-
# id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 60 |
-
# file_hash TEXT UNIQUE NOT NULL,
|
| 61 |
-
# filename TEXT NOT NULL,
|
| 62 |
-
# regulatory_body TEXT,
|
| 63 |
-
# standard_type TEXT,
|
| 64 |
-
# standard_code TEXT,
|
| 65 |
-
# publication_date TEXT,
|
| 66 |
-
# effective_date TEXT,
|
| 67 |
-
# jurisdiction TEXT,
|
| 68 |
-
# industry TEXT,
|
| 69 |
-
# extracted_at DATETIME DEFAULT CURRENT_TIMESTAMP
|
| 70 |
-
# )
|
| 71 |
-
# """)
|
| 72 |
-
|
| 73 |
-
# cursor.execute("""
|
| 74 |
-
# CREATE TABLE IF NOT EXISTS processing_log (
|
| 75 |
-
# id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 76 |
-
# filename TEXT NOT NULL,
|
| 77 |
-
# file_hash TEXT NOT NULL,
|
| 78 |
-
# processed_at DATETIME DEFAULT CURRENT_TIMESTAMP,
|
| 79 |
-
# status TEXT,
|
| 80 |
-
# error_message TEXT,
|
| 81 |
-
# text_length INTEGER,
|
| 82 |
-
# chunk_count INTEGER
|
| 83 |
-
# )
|
| 84 |
-
# """)
|
| 85 |
-
|
| 86 |
-
# cursor.execute("""
|
| 87 |
-
# CREATE TABLE IF NOT EXISTS key_topics (
|
| 88 |
-
# id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 89 |
-
# file_hash TEXT NOT NULL,
|
| 90 |
-
# topic TEXT NOT NULL,
|
| 91 |
-
# relevance_score REAL,
|
| 92 |
-
# FOREIGN KEY (file_hash) REFERENCES regulatory_documents(file_hash)
|
| 93 |
-
# )
|
| 94 |
-
# """)
|
| 95 |
-
|
| 96 |
-
# conn.commit()
|
| 97 |
-
# conn.close()
|
| 98 |
-
|
| 99 |
-
# def load_manifest(self):
|
| 100 |
-
# """Load processing manifest"""
|
| 101 |
-
# if self.manifest_path.exists():
|
| 102 |
-
# with open(self.manifest_path, 'r') as f:
|
| 103 |
-
# return json.load(f)
|
| 104 |
-
# return {"processed_files": {}, "last_updated": None}
|
| 105 |
-
|
| 106 |
-
# def save_manifest(self):
|
| 107 |
-
# """Save processing manifest"""
|
| 108 |
-
# self.manifest["last_updated"] = datetime.now().isoformat()
|
| 109 |
-
# with open(self.manifest_path, 'w') as f:
|
| 110 |
-
# json.dump(self.manifest, f, indent=2)
|
| 111 |
-
|
| 112 |
-
# def get_file_hash(self, filepath):
|
| 113 |
-
# """Generate hash for file to track changes"""
|
| 114 |
-
# with open(filepath, 'rb') as f:
|
| 115 |
-
# return hashlib.md5(f.read()).hexdigest()
|
| 116 |
-
|
| 117 |
-
# def extract_metadata_from_pdf(self, pdf_path, text_content):
|
| 118 |
-
# """Extract regulatory metadata from PDF"""
|
| 119 |
-
# metadata = {
|
| 120 |
-
# "regulatory_body": "Unknown",
|
| 121 |
-
# "standard_type": "Document",
|
| 122 |
-
# "standard_code": "",
|
| 123 |
-
# "publication_date": "",
|
| 124 |
-
# "effective_date": "",
|
| 125 |
-
# "jurisdiction": "General",
|
| 126 |
-
# "industry": "General"
|
| 127 |
-
# }
|
| 128 |
-
|
| 129 |
-
# # Extract regulatory body
|
| 130 |
-
# regulatory_bodies = {
|
| 131 |
-
# "Dubai Municipality": ["dubai municipality", "dm ", "بلدية دبي", "@dmunicipality", "food safety department"],
|
| 132 |
-
# "HACCP": ["haccp", "hazard analysis"],
|
| 133 |
-
# "ISO": ["iso ", "international organization"],
|
| 134 |
-
# "GSO": ["gso ", "gcc standardization", "gulf standard"],
|
| 135 |
-
# "FDA": ["fda", "food and drug administration"],
|
| 136 |
-
# "ESMA": ["esma", "emirates authority for standardization", "emirates standardisation"],
|
| 137 |
-
# "SASO": ["saso", "saudi standards"],
|
| 138 |
-
# "UAE Ministry": ["uae ministry", "ministry of", "الإمارات العربية المتحدة", "ministry of environment and water"],
|
| 139 |
-
# "Federal Law": ["federal law", "uae law", "united arab emirates law"],
|
| 140 |
-
# "DHA": ["dubai health authority", "dha"],
|
| 141 |
-
# "Ministry of Health": ["ministry of health and prevention", "mohp"]
|
| 142 |
-
# }
|
| 143 |
-
|
| 144 |
-
# text_lower = text_content.lower()
|
| 145 |
-
# for body, patterns in regulatory_bodies.items():
|
| 146 |
-
# if any(pattern in text_lower for pattern in patterns):
|
| 147 |
-
# metadata["regulatory_body"] = body
|
| 148 |
-
# break
|
| 149 |
-
|
| 150 |
-
# # Extract standard code (e.g., ISO 22000, GSO 2055)
|
| 151 |
-
# standard_patterns = [
|
| 152 |
-
# r"(ISO\s*\d+(?::\d+)?)",
|
| 153 |
-
# r"(GSO\s*\d+(?:/\d+)?)",
|
| 154 |
-
# r"(HACCP\s*(?:Rev\s*\d+)?)",
|
| 155 |
-
# r"(DM[-/]\d+)",
|
| 156 |
-
# r"(ESMA\s*\d+)",
|
| 157 |
-
# r"(FDA\s*\d+)",
|
| 158 |
-
# r"(Edition\s*\d+)", # For Dubai Municipality documents
|
| 159 |
-
# r"(Federal Law No\.\s*\d+\s*of\s*\d+)", # For UAE Federal Laws
|
| 160 |
-
# r"(Circular\s*(?:No\.)?\s*\d+)",
|
| 161 |
-
# ]
|
| 162 |
-
|
| 163 |
-
# for pattern in standard_patterns:
|
| 164 |
-
# matches = re.findall(pattern, text_content, re.IGNORECASE)
|
| 165 |
-
# if matches:
|
| 166 |
-
# metadata["standard_code"] = matches[0]
|
| 167 |
-
# break
|
| 168 |
-
|
| 169 |
-
# # Extract publication/effective dates
|
| 170 |
-
# date_patterns = [
|
| 171 |
-
# r"(?:publication|published|issue)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
|
| 172 |
-
# r"(?:effective|validity)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
|
| 173 |
-
# r"(?:date|dated)[\s:]*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
|
| 174 |
-
# ]
|
| 175 |
-
|
| 176 |
-
# for i, pattern in enumerate(date_patterns):
|
| 177 |
-
# matches = re.findall(pattern, text_content, re.IGNORECASE)
|
| 178 |
-
# if matches:
|
| 179 |
-
# if i == 0:
|
| 180 |
-
# metadata["publication_date"] = matches[0]
|
| 181 |
-
# elif i == 1:
|
| 182 |
-
# metadata["effective_date"] = matches[0]
|
| 183 |
-
# else:
|
| 184 |
-
# if not metadata["publication_date"]:
|
| 185 |
-
# metadata["publication_date"] = matches[0]
|
| 186 |
-
|
| 187 |
-
# # Extract jurisdiction
|
| 188 |
-
# jurisdictions = {
|
| 189 |
-
# "UAE": ["uae", "united arab emirates", "الإمارات"],
|
| 190 |
-
# "Dubai": ["dubai", "دبي"],
|
| 191 |
-
# "GCC": ["gcc", "gulf cooperation council", "مجلس التعاون الخليجي"],
|
| 192 |
-
# "International": ["international", "global"],
|
| 193 |
-
# }
|
| 194 |
-
|
| 195 |
-
# for jurisdiction, patterns in jurisdictions.items():
|
| 196 |
-
# if any(pattern in text_lower for pattern in patterns):
|
| 197 |
-
# metadata["jurisdiction"] = jurisdiction
|
| 198 |
-
# break
|
| 199 |
-
|
| 200 |
-
# # Determine industry/domain
|
| 201 |
-
# industry_keywords = {
|
| 202 |
-
# "Food": ["food", "beverage", "nutrition", "edible", "consumption"],
|
| 203 |
-
# "Pharmaceutical": ["pharmaceutical", "drug", "medicine", "pharma"],
|
| 204 |
-
# "Cosmetics": ["cosmetic", "beauty", "personal care"],
|
| 205 |
-
# "Medical Devices": ["medical device", "medical equipment"],
|
| 206 |
-
# "General Manufacturing": ["manufacturing", "production", "industrial"]
|
| 207 |
-
# }
|
| 208 |
-
|
| 209 |
-
# for industry, keywords in industry_keywords.items():
|
| 210 |
-
# if any(keyword in text_lower for keyword in keywords):
|
| 211 |
-
# metadata["industry"] = industry
|
| 212 |
-
# break
|
| 213 |
-
|
| 214 |
-
# # If no industry detected, default to Food (since this is for Swift Check)
|
| 215 |
-
# if not metadata["industry"]:
|
| 216 |
-
# metadata["industry"] = "Food"
|
| 217 |
-
|
| 218 |
-
# # Determine standard type
|
| 219 |
-
# if "haccp" in text_lower:
|
| 220 |
-
# metadata["standard_type"] = "Food Safety Management"
|
| 221 |
-
# elif "iso" in text_lower:
|
| 222 |
-
# metadata["standard_type"] = "International Standard"
|
| 223 |
-
# elif "municipal" in text_lower or "municipality" in text_lower:
|
| 224 |
-
# metadata["standard_type"] = "Local Regulation"
|
| 225 |
-
# elif "ministry" in text_lower:
|
| 226 |
-
# metadata["standard_type"] = "Government Regulation"
|
| 227 |
-
# else:
|
| 228 |
-
# metadata["standard_type"] = "Industry Standard"
|
| 229 |
-
|
| 230 |
-
# return metadata
|
| 231 |
-
|
| 232 |
-
# def ocr_pdf(self, pdf_path):
|
| 233 |
-
# """Use OCR to extract text from PDF"""
|
| 234 |
-
# try:
|
| 235 |
-
# # Convert PDF to images
|
| 236 |
-
# images = pdf2image.convert_from_path(pdf_path)
|
| 237 |
-
# full_text = ""
|
| 238 |
-
|
| 239 |
-
# for i, image in enumerate(images):
|
| 240 |
-
# # Perform OCR
|
| 241 |
-
# text = pytesseract.image_to_string(image)
|
| 242 |
-
# full_text += f"\n--- Page {i+1} ---\n{text}"
|
| 243 |
-
|
| 244 |
-
# return full_text
|
| 245 |
-
# except Exception as e:
|
| 246 |
-
# print(f"OCR error: {e}")
|
| 247 |
-
# return ""
|
| 248 |
-
|
| 249 |
-
# def extract_key_topics(self, text):
|
| 250 |
-
# """Extract key regulatory topics from text"""
|
| 251 |
-
# topics = set()
|
| 252 |
-
|
| 253 |
-
# # Define topic patterns
|
| 254 |
-
# topic_patterns = {
|
| 255 |
-
# "Temperature Control": ["temperature", "cold chain", "frozen", "refrigerated", "cooling"],
|
| 256 |
-
# "Packaging Requirements": ["packaging", "labeling", "package", "container"],
|
| 257 |
-
# "Microbiological Standards": ["microbiological", "bacteria", "pathogen", "contamination"],
|
| 258 |
-
# "Chemical Requirements": ["chemical", "pesticide", "residue", "additive", "preservative"],
|
| 259 |
-
# "Traceability": ["traceability", "track", "batch", "lot number"],
|
| 260 |
-
# "Storage Requirements": ["storage", "warehouse", "shelf life"],
|
| 261 |
-
# "Transportation": ["transport", "distribution", "delivery"],
|
| 262 |
-
# "Documentation": ["documentation", "record", "certificate", "report"],
|
| 263 |
-
# "Testing Requirements": ["testing", "analysis", "laboratory", "sample"],
|
| 264 |
-
# "Hygiene Standards": ["hygiene", "sanitation", "cleaning", "disinfection"],
|
| 265 |
-
# "HACCP Principles": ["haccp", "critical control", "hazard analysis"],
|
| 266 |
-
# "Certification": ["certification", "accreditation", "approval", "license"],
|
| 267 |
-
# "Compliance": ["compliance", "conform", "requirement", "specification"],
|
| 268 |
-
# "Quality Management": ["quality management", "qms", "quality system"],
|
| 269 |
-
# "Risk Assessment": ["risk assessment", "risk analysis", "hazard"],
|
| 270 |
-
# # COVID-19 specific topics
|
| 271 |
-
# "COVID-19 Guidelines": ["covid-19", "coronavirus", "pandemic", "quarantine"],
|
| 272 |
-
# "Social Distancing": ["social distancing", "physical distancing", "2 meters"],
|
| 273 |
-
# "PPE Requirements": ["ppe", "personal protective equipment", "masks", "gloves"],
|
| 274 |
-
# "Employee Health": ["employee health", "health screening", "symptoms"],
|
| 275 |
-
# "Disinfection": ["disinfection", "sanitization", "cleaning and disinfection"],
|
| 276 |
-
# # Food specific topics
|
| 277 |
-
# "Food Safety": ["food safety", "food hygiene", "food handling"],
|
| 278 |
-
# "Halal Requirements": ["halal", "islamic", "sharia"],
|
| 279 |
-
# "Allergen Management": ["allergen", "allergy", "contains", "may contain"],
|
| 280 |
-
# "Import/Export": ["import", "export", "customs", "border"]
|
| 281 |
-
# }
|
| 282 |
-
|
| 283 |
-
# text_lower = text.lower()
|
| 284 |
-
|
| 285 |
-
# for topic, keywords in topic_patterns.items():
|
| 286 |
-
# # Calculate relevance score based on keyword frequency
|
| 287 |
-
# count = sum(1 for keyword in keywords if keyword in text_lower)
|
| 288 |
-
# if count > 0:
|
| 289 |
-
# relevance_score = count / len(keywords)
|
| 290 |
-
# topics.add((topic, relevance_score))
|
| 291 |
-
|
| 292 |
-
# return list(topics)
|
| 293 |
-
|
| 294 |
-
# def create_chunks(self, text, metadata):
|
| 295 |
-
# """Create text chunks for vector storage"""
|
| 296 |
-
# text_splitter = RecursiveCharacterTextSplitter(
|
| 297 |
-
# chunk_size=1500, # Larger chunks for regulatory docs
|
| 298 |
-
# chunk_overlap=300,
|
| 299 |
-
# length_function=len,
|
| 300 |
-
# separators=["\n\n", "\n", ". ", " ", ""]
|
| 301 |
-
# )
|
| 302 |
-
|
| 303 |
-
# chunks = text_splitter.split_text(text)
|
| 304 |
-
# documents = []
|
| 305 |
-
|
| 306 |
-
# for i, chunk in enumerate(chunks):
|
| 307 |
-
# doc_metadata = metadata.copy()
|
| 308 |
-
# doc_metadata["chunk_index"] = i
|
| 309 |
-
# doc_metadata["chunk_size"] = len(chunk)
|
| 310 |
-
# doc_metadata["total_chunks"] = len(chunks)
|
| 311 |
-
# documents.append({
|
| 312 |
-
# "text": chunk,
|
| 313 |
-
# "metadata": doc_metadata
|
| 314 |
-
# })
|
| 315 |
-
|
| 316 |
-
# return documents
|
| 317 |
-
|
| 318 |
-
# def process_pdf(self, pdf_path):
|
| 319 |
-
# """Process a single PDF file"""
|
| 320 |
-
# pdf_path = Path(pdf_path)
|
| 321 |
-
# file_hash = self.get_file_hash(pdf_path)
|
| 322 |
-
# filename = pdf_path.name
|
| 323 |
-
|
| 324 |
-
# # Check if already processed
|
| 325 |
-
# if filename in self.manifest["processed_files"]:
|
| 326 |
-
# if self.manifest["processed_files"][filename]["hash"] == file_hash:
|
| 327 |
-
# print(f"Skipping {filename} - already processed")
|
| 328 |
-
# return
|
| 329 |
-
|
| 330 |
-
# print(f"Processing {filename}...")
|
| 331 |
-
|
| 332 |
-
# try:
|
| 333 |
-
# # Load PDF content
|
| 334 |
-
# loader = PyPDFLoader(str(pdf_path))
|
| 335 |
-
# pages = loader.load()
|
| 336 |
-
|
| 337 |
-
# # Combine all pages
|
| 338 |
-
# full_text = ""
|
| 339 |
-
# for i, page in enumerate(pages):
|
| 340 |
-
# full_text += f"\n--- Page {i+1} ---\n{page.page_content}"
|
| 341 |
-
|
| 342 |
-
# # If text is too short, use OCR
|
| 343 |
-
# if len(full_text.strip()) < 100:
|
| 344 |
-
# print(f"Using OCR for {filename}")
|
| 345 |
-
# ocr_text = self.ocr_pdf(pdf_path)
|
| 346 |
-
# if len(ocr_text) > len(full_text):
|
| 347 |
-
# full_text = ocr_text
|
| 348 |
-
|
| 349 |
-
# # Extract regulatory metadata
|
| 350 |
-
# reg_metadata = self.extract_metadata_from_pdf(pdf_path, full_text)
|
| 351 |
-
|
| 352 |
-
# # Extract key topics
|
| 353 |
-
# topics = self.extract_key_topics(full_text)
|
| 354 |
-
|
| 355 |
-
# # Create base metadata for chunks
|
| 356 |
-
# metadata = {
|
| 357 |
-
# "source": filename,
|
| 358 |
-
# "regulatory_body": reg_metadata["regulatory_body"] or "Unknown",
|
| 359 |
-
# "standard_type": reg_metadata["standard_type"] or "Unknown",
|
| 360 |
-
# "standard_code": reg_metadata["standard_code"] or "",
|
| 361 |
-
# "jurisdiction": reg_metadata["jurisdiction"] or "Unknown",
|
| 362 |
-
# "industry": reg_metadata["industry"] or "General",
|
| 363 |
-
# "publication_date": reg_metadata["publication_date"] or "",
|
| 364 |
-
# "effective_date": reg_metadata["effective_date"] or "",
|
| 365 |
-
# "file_hash": file_hash,
|
| 366 |
-
# "processed_date": datetime.now().isoformat(),
|
| 367 |
-
# "topics": ", ".join([topic[0] for topic in topics]) if topics else ""
|
| 368 |
-
# }
|
| 369 |
-
|
| 370 |
-
# # Create chunks
|
| 371 |
-
# documents = self.create_chunks(full_text, metadata)
|
| 372 |
-
|
| 373 |
-
# # Generate embeddings and store in ChromaDB
|
| 374 |
-
# for i, doc in enumerate(documents):
|
| 375 |
-
# embedding = self.embedder.encode(doc["text"]).tolist()
|
| 376 |
-
|
| 377 |
-
# self.collection.add(
|
| 378 |
-
# documents=[doc["text"]],
|
| 379 |
-
# embeddings=[embedding],
|
| 380 |
-
# metadatas=[doc["metadata"]],
|
| 381 |
-
# ids=[f"{file_hash}_{i}"]
|
| 382 |
-
# )
|
| 383 |
-
|
| 384 |
-
# # Store metadata in SQLite
|
| 385 |
-
# self.save_metadata(file_hash, filename, reg_metadata)
|
| 386 |
-
|
| 387 |
-
# # Store topics
|
| 388 |
-
# self.save_topics(file_hash, topics)
|
| 389 |
-
|
| 390 |
-
# # Update manifest
|
| 391 |
-
# self.manifest["processed_files"][filename] = {
|
| 392 |
-
# "hash": file_hash,
|
| 393 |
-
# "processed_date": datetime.now().isoformat(),
|
| 394 |
-
# "chunks": len(documents),
|
| 395 |
-
# "text_length": len(full_text),
|
| 396 |
-
# "regulatory_body": reg_metadata["regulatory_body"],
|
| 397 |
-
# "standard_code": reg_metadata["standard_code"]
|
| 398 |
-
# }
|
| 399 |
-
# self.save_manifest()
|
| 400 |
-
|
| 401 |
-
# # Log success
|
| 402 |
-
# self.log_processing(filename, file_hash, "SUCCESS", None, len(full_text), len(documents))
|
| 403 |
-
|
| 404 |
-
# print(f"Successfully processed {filename}")
|
| 405 |
-
# print(f" - Regulatory Body: {reg_metadata['regulatory_body']}")
|
| 406 |
-
# print(f" - Standard Code: {reg_metadata['standard_code']}")
|
| 407 |
-
# print(f" - Text chunks: {len(documents)}")
|
| 408 |
-
# print(f" - Topics extracted: {len(topics)}")
|
| 409 |
-
|
| 410 |
-
# except Exception as e:
|
| 411 |
-
# error_msg = str(e)
|
| 412 |
-
# print(f"Error processing {filename}: {error_msg}")
|
| 413 |
-
# import traceback
|
| 414 |
-
# traceback.print_exc()
|
| 415 |
-
# self.log_processing(filename, file_hash, "ERROR", error_msg, 0, 0)
|
| 416 |
-
|
| 417 |
-
# def save_metadata(self, file_hash, filename, metadata):
|
| 418 |
-
# """Save regulatory metadata to SQLite"""
|
| 419 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 420 |
-
# cursor = conn.cursor()
|
| 421 |
-
|
| 422 |
-
# try:
|
| 423 |
-
# cursor.execute("""
|
| 424 |
-
# INSERT OR REPLACE INTO regulatory_documents
|
| 425 |
-
# (file_hash, filename, regulatory_body, standard_type, standard_code,
|
| 426 |
-
# publication_date, effective_date, jurisdiction, industry)
|
| 427 |
-
# VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 428 |
-
# """, (
|
| 429 |
-
# file_hash,
|
| 430 |
-
# filename,
|
| 431 |
-
# metadata["regulatory_body"] or "Unknown",
|
| 432 |
-
# metadata["standard_type"] or "Document",
|
| 433 |
-
# metadata["standard_code"] or "",
|
| 434 |
-
# metadata["publication_date"] or "",
|
| 435 |
-
# metadata["effective_date"] or "",
|
| 436 |
-
# metadata["jurisdiction"] or "General",
|
| 437 |
-
# metadata["industry"] or "General"
|
| 438 |
-
# ))
|
| 439 |
-
|
| 440 |
-
# conn.commit()
|
| 441 |
-
# finally:
|
| 442 |
-
# conn.close()
|
| 443 |
-
|
| 444 |
-
# def save_topics(self, file_hash, topics):
|
| 445 |
-
# """Save extracted topics to SQLite"""
|
| 446 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 447 |
-
# cursor = conn.cursor()
|
| 448 |
-
|
| 449 |
-
# try:
|
| 450 |
-
# # Delete existing topics for this file
|
| 451 |
-
# cursor.execute("DELETE FROM key_topics WHERE file_hash = ?", (file_hash,))
|
| 452 |
-
|
| 453 |
-
# # Insert new topics
|
| 454 |
-
# for topic, relevance_score in topics:
|
| 455 |
-
# cursor.execute("""
|
| 456 |
-
# INSERT INTO key_topics
|
| 457 |
-
# (file_hash, topic, relevance_score)
|
| 458 |
-
# VALUES (?, ?, ?)
|
| 459 |
-
# """, (file_hash, topic, relevance_score))
|
| 460 |
-
|
| 461 |
-
# conn.commit()
|
| 462 |
-
# finally:
|
| 463 |
-
# conn.close()
|
| 464 |
-
|
| 465 |
-
# def log_processing(self, filename, file_hash, status, error_message, text_length=0, chunk_count=0):
|
| 466 |
-
# """Log processing status"""
|
| 467 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 468 |
-
# cursor = conn.cursor()
|
| 469 |
-
|
| 470 |
-
# try:
|
| 471 |
-
# cursor.execute("""
|
| 472 |
-
# INSERT INTO processing_log
|
| 473 |
-
# (filename, file_hash, status, error_message, text_length, chunk_count)
|
| 474 |
-
# VALUES (?, ?, ?, ?, ?, ?)
|
| 475 |
-
# """, (filename, file_hash, status, error_message, text_length, chunk_count))
|
| 476 |
-
|
| 477 |
-
# conn.commit()
|
| 478 |
-
# finally:
|
| 479 |
-
# conn.close()
|
| 480 |
-
|
| 481 |
-
# def process_all_pdfs(self):
|
| 482 |
-
# """Process all PDFs in the directory"""
|
| 483 |
-
# pdf_files = list(self.pdf_path.glob("*.pdf"))
|
| 484 |
-
|
| 485 |
-
# if not pdf_files:
|
| 486 |
-
# print(f"No PDF files found in {self.pdf_path}")
|
| 487 |
-
# return
|
| 488 |
-
|
| 489 |
-
# print(f"Found {len(pdf_files)} PDF files")
|
| 490 |
-
|
| 491 |
-
# for pdf_file in pdf_files:
|
| 492 |
-
# self.process_pdf(pdf_file)
|
| 493 |
-
|
| 494 |
-
# print("\nProcessing complete!")
|
| 495 |
-
# print(f"Total files in manifest: {len(self.manifest['processed_files'])}")
|
| 496 |
-
|
| 497 |
-
# def get_processing_stats(self):
|
| 498 |
-
# """Get processing statistics"""
|
| 499 |
-
# conn = sqlite3.connect(self.metadata_db_path)
|
| 500 |
-
# cursor = conn.cursor()
|
| 501 |
-
|
| 502 |
-
# try:
|
| 503 |
-
# # Get overall stats
|
| 504 |
-
# cursor.execute("""
|
| 505 |
-
# SELECT COUNT(*) as total,
|
| 506 |
-
# SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
|
| 507 |
-
# SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors
|
| 508 |
-
# FROM processing_log
|
| 509 |
-
# """)
|
| 510 |
-
|
| 511 |
-
# stats = cursor.fetchone()
|
| 512 |
-
|
| 513 |
-
# # Get regulatory body distribution
|
| 514 |
-
# cursor.execute("""
|
| 515 |
-
# SELECT regulatory_body, COUNT(*) as count
|
| 516 |
-
# FROM regulatory_documents
|
| 517 |
-
# GROUP BY regulatory_body
|
| 518 |
-
# ORDER BY count DESC
|
| 519 |
-
# """)
|
| 520 |
-
|
| 521 |
-
# body_dist = cursor.fetchall()
|
| 522 |
-
|
| 523 |
-
# # Get top topics
|
| 524 |
-
# cursor.execute("""
|
| 525 |
-
# SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance
|
| 526 |
-
# FROM key_topics
|
| 527 |
-
# GROUP BY topic
|
| 528 |
-
# ORDER BY count DESC
|
| 529 |
-
# LIMIT 10
|
| 530 |
-
# """)
|
| 531 |
-
|
| 532 |
-
# top_topics = cursor.fetchall()
|
| 533 |
-
|
| 534 |
-
# return {
|
| 535 |
-
# "total_processed": stats[0],
|
| 536 |
-
# "successful": stats[1],
|
| 537 |
-
# "errors": stats[2],
|
| 538 |
-
# "regulatory_bodies": dict(body_dist),
|
| 539 |
-
# "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2]} for t in top_topics]
|
| 540 |
-
# }
|
| 541 |
-
# finally:
|
| 542 |
-
# conn.close()
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
# def main():
|
| 546 |
-
# """Main function to create/update the regulatory guidelines database"""
|
| 547 |
-
# print("Starting Regulatory Guidelines Database Creation...")
|
| 548 |
-
|
| 549 |
-
# # Initialize database
|
| 550 |
-
# db = RegulatoryGuidelinesDB()
|
| 551 |
-
|
| 552 |
-
# # Process all PDFs
|
| 553 |
-
# db.process_all_pdfs()
|
| 554 |
-
|
| 555 |
-
# # Show processing stats
|
| 556 |
-
# print("\nProcessing Statistics:")
|
| 557 |
-
# stats = db.get_processing_stats()
|
| 558 |
-
# print(f"Total files processed: {stats['total_processed']}")
|
| 559 |
-
# print(f"Successful: {stats['successful']}")
|
| 560 |
-
# print(f"Errors: {stats['errors']}")
|
| 561 |
-
|
| 562 |
-
# print("\nRegulatory Bodies:")
|
| 563 |
-
# for body, count in stats["regulatory_bodies"].items():
|
| 564 |
-
# print(f" - {body}: {count} documents")
|
| 565 |
-
|
| 566 |
-
# print("\nTop Topics:")
|
| 567 |
-
# for topic_data in stats["top_topics"]:
|
| 568 |
-
# print(f" - {topic_data['topic']}: {topic_data['count']} documents (relevance: {topic_data['relevance']:.2f})")
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
# if __name__ == "__main__":
|
| 572 |
-
# main()
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
import os
|
| 577 |
import json
|
| 578 |
import sqlite3
|
|
@@ -612,7 +37,7 @@ class EnhancedRegulatoryVectorDB:
         # Get or create collection
         self.collection = self.client.get_or_create_collection(
             name="regulatory_guidelines",
-            metadata={"description": "…
         )

         # Initialize embedding model

@@ -625,7 +50,7 @@ class EnhancedRegulatoryVectorDB:
         self.manifest = self.load_manifest()

     def init_metadata_db(self):
-        """Initialize …
         conn = sqlite3.connect(self.metadata_db_path)
         cursor = conn.cursor()

@@ -714,25 +139,27 @@ class EnhancedRegulatoryVectorDB:
         return hashlib.md5(f.read()).hexdigest()

     def extract_sections_and_clauses(self, text_content):
-        """…
         sections = []

-        # …
         section_patterns = [
-            # …
-            r'(\d…
-            # …
-            r'(…
-            # …
-            r'(…
-            # Section format: "Section …
-            r'(Section\s+\d…
-            # …
             r'^(\d+\.\s+)([A-Z][^.\n\r]+)',
-            # …
-            r'…
-            # Prerequisites: "7.1. Management Policy"
-            r'^(\d+\.\d+\.\s+)([A-Z][^.\n\r]+)',
         ]

         lines = text_content.split('\n')
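
For context, an illustrative sketch (not part of the commit) of how the numbered-heading pattern kept in the hunk above splits a clause heading into its number and title:

    import re

    m = re.match(r'^(\d+\.\d+\.\s+)([A-Z][^.\n\r]+)', "7.1. Management Policy")
    if m:
        print(m.group(1).strip(), "->", m.group(2))  # 7.1. -> Management Policy
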
@@ -763,7 +190,7 @@ class EnhancedRegulatoryVectorDB:

             if section_title:  # Only add if we have a meaningful title
                 # Determine section level
-                level = section_num.count('.')

                 # Extract content preview (next few lines)
                 preview_lines = []

@@ -789,7 +216,7 @@ class EnhancedRegulatoryVectorDB:
         return sections

     def extract_enhanced_metadata(self, pdf_path, text_content):
-        """…
         metadata = {
             "regulatory_body": "Unknown",
             "standard_type": "Document",

@@ -803,38 +230,49 @@ class EnhancedRegulatoryVectorDB:

         text_lower = text_content.lower()

-        # …
-        …
             break

-        # …
         standard_patterns = [
-            r"…
-            r"…
-            r"…
-            r"…
-            r"Dubai\s*Municipality\s*[-–]\s*Food\s*Control\s*Section",
-            r"Federal\s*Law\s*No\.\s*\d+\s*of\s*\d+",
-            r"Administrative\s*Order\s*No\.\s*\d+/\d+",
         ]

         for pattern in standard_patterns:

@@ -844,18 +282,18 @@ class EnhancedRegulatoryVectorDB:
             break

         # Document structure detection
-        if …
-            metadata["document_structure"] = "…
         elif re.search(r'\d+\.\d+\s+[A-Z]', text_content):
             metadata["document_structure"] = "numbered_sections"
         else:
             metadata["document_structure"] = "flat"

-        # …
         date_patterns = [
-            r"…
-            r"…
-            r"(\d{1,2}…
             r"(\d{4})"
         ]
@@ -865,80 +303,83 @@ class EnhancedRegulatoryVectorDB:
             metadata["publication_date"] = matches[0]
             break

-        # Industry …
-        …

-        …

         return metadata

     def extract_enhanced_topics(self, text, sections):
-        """…
         topics = []

-        # …
         topic_patterns = {
-            "…
-                "keywords": ["…
-                "section_hints": ["…
             },
-            "…
-                "keywords": ["…
-                "section_hints": ["…
-            },
-            "Critical Control Points": {
-                "keywords": ["critical control point", "ccp", "control points", "decision tree"],
-                "section_hints": ["8.8", "ccp", "critical"]
             },
-            …
             "Corrective Actions": {
-                "keywords": ["corrective action", "…
-                "section_hints": ["…
-            },
-            "Monitoring Systems": {
-                "keywords": ["monitoring", "monitoring system", "continuous monitoring"],
-                "section_hints": ["8.10", "monitoring"]
             },
-            …
             }
         }

@@ -1052,7 +493,7 @@ class EnhancedRegulatoryVectorDB:
         return None

     def process_pdf(self, pdf_path):
-        """…
         pdf_path = Path(pdf_path)
         file_hash = self.get_file_hash(pdf_path)
         filename = pdf_path.name

@@ -1278,102 +719,102 @@ class EnhancedRegulatoryVectorDB:
         for pdf_file in pdf_files:
             self.process_pdf(pdf_file)

-        print(f"\n🎯 Processing complete!…
         print(f"📊 Total files in manifest: {len(self.manifest['processed_files'])}")
-
     def get_enhanced_stats(self):
-        …


 def main():
-    …


 if __name__ == "__main__":
-    …
-    # Initialize enhanced database
-    db = EnhancedRegulatoryVectorDB()
-
-    # Process all PDFs
-    db.process_all_pdfs()
-
-    # Show enhanced processing stats
-    print("\n" + "=" * 80)
-    print("📊 ENHANCED PROCESSING STATISTICS:")
-    print("=" * 80)
-
-    stats = db.get_enhanced_stats()
-    print(f"📄 Total files processed: {stats['total_processed']}")
-    print(f"✅ Successful: {stats['successful']}")
-    print(f"❌ Errors: {stats['errors']}")
-    print(f"📑 Total sections extracted: {stats['total_sections']}")
-
-    print(f"\n🏛️ REGULATORY BODIES:")
-    for body, count, sections in stats["regulatory_bodies"]:
-        print(f"  - {body}: {count} documents ({sections} sections)")
-
-    print(f"\n🎯 TOP TOPICS WITH CLAUSE REFERENCES:")
-    for topic_data in stats["top_topics"]:
-        sections_info = topic_data['sections'][:50] + "..." if len(topic_data['sections']) > 50 else topic_data['sections']
-        print(f"  - {topic_data['topic']}: {topic_data['count']} documents")
-        print(f"    └── Relevance: {topic_data['relevance']:.2f} | Sections: {sections_info}")
-
-    print("\n" + "=" * 80)
-    print("🎉 Enhanced Regulatory VDB Creation Complete!")
-    print("🔍 HACCP clause references are now available for the demo")
-    print("📝 The system can now provide:")
-    print("  - Section-specific guidance (e.g., 'Section 7.8 - Temperature Control')")
-    print("  - Clause references for each parameter")
-    print("  - Regulatory body attribution")
-    print("  - Hierarchical document structure awareness")
-    print("=" * 80)
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import sqlite3
|
|
|
|
| 37 |
# Get or create collection
|
| 38 |
self.collection = self.client.get_or_create_collection(
|
| 39 |
name="regulatory_guidelines",
|
| 40 |
+
metadata={"description": "Regulatory guidelines and standards for quality control"}
|
| 41 |
)
|
| 42 |
|
| 43 |
# Initialize embedding model
|
|
|
|
| 50 |
self.manifest = self.load_manifest()
|
| 51 |
|
| 52 |
def init_metadata_db(self):
|
| 53 |
+
"""Initialize SQLite database for storing regulatory metadata"""
|
| 54 |
conn = sqlite3.connect(self.metadata_db_path)
|
| 55 |
cursor = conn.cursor()
|
| 56 |
|
|
|
|
| 139 |
return hashlib.md5(f.read()).hexdigest()
|
| 140 |
|
| 141 |
def extract_sections_and_clauses(self, text_content):
|
| 142 |
+
"""Generic section and clause extraction for regulatory documents"""
|
| 143 |
sections = []
|
| 144 |
|
| 145 |
+
# Generic section patterns that work for any regulatory document
|
| 146 |
section_patterns = [
|
| 147 |
+
# Numbered sections: "1.2.3 Title"
|
| 148 |
+
r'(\d+(?:\.\d+)*\.?\s+)([A-Z][^.\n\r]+)',
|
| 149 |
+
# Lettered sections: "A.1 Title"
|
| 150 |
+
r'([A-Z]\.\d+\s+)([A-Z][^.\n\r]+)',
|
| 151 |
+
# Article format: "Article 1:"
|
| 152 |
+
r'(Article\s+\d+)[\s:]*([^.\n\r]*)',
|
| 153 |
+
# Section format: "Section 1.2"
|
| 154 |
+
r'(Section\s+\d+(?:\.\d+)*)[\s\-–]*([^.\n\r]*)',
|
| 155 |
+
# Chapter format: "Chapter 1"
|
| 156 |
+
r'(Chapter\s+\d+)[\s:]*([^.\n\r]*)',
|
| 157 |
+
# Part format: "Part I"
|
| 158 |
+
r'(Part\s+[IVX]+)[\s:]*([^.\n\r]*)',
|
| 159 |
+
# Simple numbered: "1. Title"
|
| 160 |
r'^(\d+\.\s+)([A-Z][^.\n\r]+)',
|
| 161 |
+
# Annex format: "Annex 1"
|
| 162 |
+
r'(Annex\s+\d+)[\s:]*([^.\n\r]*)',
|
|
|
|
|
|
|
| 163 |
]
|
| 164 |
|
| 165 |
lines = text_content.split('\n')
|
|
|
|
| 190 |
|
| 191 |
if section_title: # Only add if we have a meaningful title
|
| 192 |
# Determine section level
|
| 193 |
+
level = section_num.count('.')
|
| 194 |
|
| 195 |
# Extract content preview (next few lines)
|
| 196 |
preview_lines = []
|
|
|
|
| 216 |
return sections
|
| 217 |
|
| 218 |
def extract_enhanced_metadata(self, pdf_path, text_content):
|
| 219 |
+
"""Generic metadata extraction without bias toward specific standards"""
|
| 220 |
metadata = {
|
| 221 |
"regulatory_body": "Unknown",
|
| 222 |
"standard_type": "Document",
|
|
|
|
@@ lines 230-278 @@

        text_lower = text_content.lower()

+       # UPDATED: Generic regulatory body detection without prioritization
+       # Extract regulatory body from document content
+       regulatory_indicators = [
+           # International standards
+           (r"iso\s*\d+", "ISO"),
+           (r"iec\s*\d+", "IEC"),
+           (r"codex\s+alimentarius", "Codex Alimentarius"),
+           (r"who\s+guidelines", "WHO"),
+           (r"fao\s+standards", "FAO"),
+
+           # Regional standards
+           (r"european\s+union", "EU"),
+           (r"gcc\s+standard", "GCC"),
+           (r"asean\s+standard", "ASEAN"),
+
+           # National standards
+           (r"uae\s+standard", "UAE National"),
+           (r"saudi\s+standard", "Saudi Arabia"),
+           (r"indian\s+standard", "India"),
+
+           # Generic detection
+           (r"ministry\s+of\s+\w+", "Government Ministry"),
+           (r"department\s+of\s+\w+", "Government Department"),
+           (r"authority\s+for\s+\w+", "Regulatory Authority"),
+
+           # Industry standards
+           (r"haccp", "HACCP System"),
+           (r"gmp", "GMP"),
+           (r"gap", "GAP"),
+       ]

+       # Find regulatory body without bias
+       for pattern, body_name in regulatory_indicators:
+           if re.search(pattern, text_lower):
+               metadata["regulatory_body"] = body_name
                break

+       # Extract standard code generically
        standard_patterns = [
+           r"(?:standard|guideline|regulation)\s*(?:no\.|number)?\s*:?\s*(\w+[-/]\d+)",
+           r"document\s*(?:no\.|number)?\s*:?\s*(\w+[-/]\d+)",
+           r"reference\s*:?\s*(\w+[-/]\d+)",
+           r"(\w{2,10}[-/]\d{2,6})",  # Generic code pattern
        ]

        for pattern in standard_patterns:
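One behaviour worth noting in the detection loop above: the first entry in regulatory_indicators that matches anywhere in the text wins, so list order sets the precedence, not where the term appears in the document. A small self-contained illustration with an invented sentence and a trimmed indicator list:

import re

regulatory_indicators = [
    (r"iso\s*\d+", "ISO"),
    (r"gcc\s+standard", "GCC"),
    (r"uae\s+standard", "UAE National"),
]

text_lower = "this uae standard adopts the sampling plan of iso 2859 and the gcc standard for labelling"
regulatory_body = "Unknown"
for pattern, body_name in regulatory_indicators:
    if re.search(pattern, text_lower):
        regulatory_body = body_name
        break

print(regulatory_body)  # "ISO" — first match in the indicator list, not first mention in the text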
@@ lines 282-299 @@
                break

        # Document structure detection
+       if re.search(r'(?:article|chapter|part|annex)\s+\d+', text_lower):
+           metadata["document_structure"] = "hierarchical"
        elif re.search(r'\d+\.\d+\s+[A-Z]', text_content):
            metadata["document_structure"] = "numbered_sections"
        else:
            metadata["document_structure"] = "flat"

+       # Date extraction
        date_patterns = [
+           r"(?:published|issued|effective|dated?)\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
+           r"(?:version|revision)\s*date\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
+           r"(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})",
            r"(\d{4})"
        ]
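The date patterns are tried from most to least specific; the bare r"(\d{4})" fallback comes last, presumably because it would also match a four-digit standard number. On an invented header line, the first pattern already captures both explicit dates:

import re

pattern = r"(?:published|issued|effective|dated?)\s*:?\s*(\d{1,2}[-/]\d{1,2}[-/]\d{2,4})"
header = "Issued: 15/03/2023 – effective 01-04-2023".lower()
print(re.findall(pattern, header))  # ['15/03/2023', '01-04-2023']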
@@ lines 303-385 @@
                metadata["publication_date"] = matches[0]
                break

+       # Industry detection - generic
+       industry_indicators = [
+           (["quality", "control", "inspection", "standard"], "Quality Control"),
+           (["manufacturing", "production", "processing"], "Manufacturing"),
+           (["safety", "health", "hygiene"], "Health & Safety"),
+           (["environment", "sustainable", "green"], "Environmental"),
+           (["trade", "commerce", "export", "import"], "Trade & Commerce"),
+       ]

+       for keywords, industry in industry_indicators:
+           if any(keyword in text_lower for keyword in keywords):
+               metadata["industry"] = industry
+               break
+
+       # Jurisdiction detection - generic
+       if any(country in text_lower for country in ["international", "global", "worldwide"]):
+           metadata["jurisdiction"] = "International"
+       elif re.search(r'(?:national|federal|state)\s+(?:standard|regulation)', text_lower):
+           metadata["jurisdiction"] = "National"
+       else:
+           metadata["jurisdiction"] = "General"

        return metadata

    def extract_enhanced_topics(self, text, sections):
+       """Generic topic extraction without bias toward specific frameworks"""
        topics = []

+       # UPDATED: Generic topic patterns applicable to any standard
        topic_patterns = {
+           "Quality Management": {
+               "keywords": ["quality management", "quality system", "quality control", "quality assurance"],
+               "section_hints": ["quality", "management"]
            },
+           "Documentation Requirements": {
+               "keywords": ["documentation", "records", "record keeping", "documents"],
+               "section_hints": ["document", "record"]
            },
+           "Process Control": {
+               "keywords": ["process control", "process monitoring", "control measures"],
+               "section_hints": ["process", "control"]
            },
+           "Verification and Validation": {
+               "keywords": ["verification", "validation", "audit", "review"],
+               "section_hints": ["verification", "validation"]
            },
+           "Training Requirements": {
+               "keywords": ["training", "competence", "qualification", "education"],
+               "section_hints": ["training", "competence"]
            },
            "Corrective Actions": {
+               "keywords": ["corrective action", "preventive action", "non-conformance"],
+               "section_hints": ["corrective", "action"]
            },
+           "Risk Management": {
+               "keywords": ["risk assessment", "risk management", "hazard", "risk analysis"],
+               "section_hints": ["risk", "hazard"]
            },
+           "Monitoring and Measurement": {
+               "keywords": ["monitoring", "measurement", "testing", "inspection"],
+               "section_hints": ["monitoring", "measurement"]
            },
+           "Compliance Requirements": {
+               "keywords": ["compliance", "regulatory", "legal requirements", "statutory"],
+               "section_hints": ["compliance", "regulatory"]
            },
+           "Continuous Improvement": {
+               "keywords": ["improvement", "continual improvement", "enhancement"],
+               "section_hints": ["improvement", "enhance"]
            },
+           "Resource Management": {
+               "keywords": ["resources", "facilities", "equipment", "infrastructure"],
+               "section_hints": ["resource", "facility"]
            },
+           "Communication": {
+               "keywords": ["communication", "reporting", "notification"],
+               "section_hints": ["communication", "report"]
            }
        }
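The matching logic that consumes topic_patterns falls outside the hunks shown in this diff, so the following is only an assumed sketch of how keywords and section_hints could be combined into topic hits; the helper name and the simple "any match counts" rule are invented for illustration:

def sketch_match_topics(text_lower, section_titles, topic_patterns):
    # Hypothetical helper, not part of the committed file: a topic is counted
    # when any keyword occurs in the text or any hint occurs in a section title.
    hits = []
    for topic, cfg in topic_patterns.items():
        keyword_hit = any(k in text_lower for k in cfg["keywords"])
        hint_hit = any(h in title.lower() for title in section_titles for h in cfg["section_hints"])
        if keyword_hit or hint_hit:
            hits.append(topic)
    return hits

# e.g. sketch_match_topics("records shall be retained for two years", ["4.2 Documentation"], topic_patterns)
# -> ["Documentation Requirements"]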
@@ lines 493-499 @@
        return None

    def process_pdf(self, pdf_path):
+       """Generic PDF processing without standard-specific bias"""
        pdf_path = Path(pdf_path)
        file_hash = self.get_file_hash(pdf_path)
        filename = pdf_path.name
@@ lines 719-820 @@
        for pdf_file in pdf_files:
            self.process_pdf(pdf_file)

+       print(f"\n🎯 Processing complete! Generic regulatory VDB ready.")
        print(f"📊 Total files in manifest: {len(self.manifest['processed_files'])}")
+
    def get_enhanced_stats(self):
+       """Get enhanced processing statistics"""
+       conn = sqlite3.connect(self.metadata_db_path)
+       cursor = conn.cursor()
+
+       try:
+           # Overall stats
+           cursor.execute("""
+               SELECT COUNT(*) as total,
+                      SUM(CASE WHEN status = 'SUCCESS' THEN 1 ELSE 0 END) as success,
+                      SUM(CASE WHEN status = 'ERROR' THEN 1 ELSE 0 END) as errors,
+                      SUM(sections_extracted) as total_sections
+               FROM processing_log
+           """)
+
+           stats = cursor.fetchone()
+
+           # Regulatory body distribution
+           cursor.execute("""
+               SELECT regulatory_body, COUNT(*) as count, SUM(total_sections) as sections
+               FROM regulatory_documents
+               GROUP BY regulatory_body
+               ORDER BY count DESC
+           """)
+
+           body_dist = cursor.fetchall()
+
+           # Top topics with clause references
+           cursor.execute("""
+               SELECT topic, COUNT(*) as count, AVG(relevance_score) as avg_relevance,
+                      GROUP_CONCAT(DISTINCT section_reference) as sections
+               FROM key_topics
+               GROUP BY topic
+               ORDER BY count DESC
+               LIMIT 10
+           """)
+
+           top_topics = cursor.fetchall()
+
+           return {
+               "total_processed": stats[0] or 0,
+               "successful": stats[1] or 0,
+               "errors": stats[2] or 0,
+               "total_sections": stats[3] or 0,
+               "regulatory_bodies": [(r[0], r[1], r[2]) for r in body_dist],
+               "top_topics": [{"topic": t[0], "count": t[1], "relevance": t[2], "sections": t[3]} for t in top_topics]
+           }
+       finally:
+           conn.close()


def main():
+   """Main function to create/update the generic regulatory guidelines database"""
+   print("🚀 Starting Generic Regulatory Guidelines Database Creation...")
+   print("📋 Features: Unbiased extraction, generic standards support, dynamic classification")
+
+   # Initialize enhanced database
+   db = EnhancedRegulatoryVectorDB()
+
+   # Process all PDFs
+   db.process_all_pdfs()
+
+   # Show enhanced processing stats
+   print("\n" + "=" * 80)
+   print("📊 GENERIC PROCESSING STATISTICS:")
+   print("=" * 80)
+
+   stats = db.get_enhanced_stats()
+   print(f"📄 Total files processed: {stats['total_processed']}")
+   print(f"✅ Successful: {stats['successful']}")
+   print(f"❌ Errors: {stats['errors']}")
+   print(f"📑 Total sections extracted: {stats['total_sections']}")
+
+   print(f"\n🏛️ REGULATORY BODIES (No Bias):")
+   for body, count, sections in stats["regulatory_bodies"]:
+       print(f"  - {body}: {count} documents ({sections} sections)")
+
+   print(f"\n🎯 TOP TOPICS (Generic):")
+   for topic_data in stats["top_topics"]:
+       sections_info = topic_data['sections'][:50] + "..." if len(topic_data['sections']) > 50 else topic_data['sections']
+       print(f"  - {topic_data['topic']}: {topic_data['count']} documents")
+       print(f"    └── Relevance: {topic_data['relevance']:.2f} | Sections: {sections_info}")
+
+   print("\n" + "=" * 80)
+   print("🎉 Generic Regulatory VDB Creation Complete!")
+   print("🔍 All regulatory frameworks are treated equally")
+   print("📝 The system can now provide:")
+   print("   - Unbiased regulatory references")
+   print("   - Generic clause citations")
+   print("   - Dynamic standard recognition")
+   print("   - Equal treatment of all frameworks")
+   print("=" * 80)
+

if __name__ == "__main__":
+   main()
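Once the script has been run, the resulting collection can be queried independently of the class. A hedged usage sketch: the persist path and embedding model name are assumptions (only the collection name appears in this diff), and the model must match whatever the script used to embed the documents.

import chromadb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumed model; the script's actual choice is not shown in these hunks
client = chromadb.PersistentClient(path="vector_stores/chroma_db/regulatory_guidelines")  # assumed path
collection = client.get_collection("regulatory_guidelines")

query = "temperature monitoring requirements during storage"
embedding = model.encode([query]).tolist()
results = collection.query(query_embeddings=embedding, n_results=3)

for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(meta.get("regulatory_body", "Unknown"), "|", doc[:80])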