samarth09healthPM committed on
Commit f64b3f9 · 1 Parent(s): 7f09b7f

Add HIPAA RAG Clinical Summarizer (essential files only)

Files changed (13)
  1. .gitignore +46 -0
  2. audit.py +79 -0
  3. bcrypt_pw.py +2 -0
  4. deid_pipeline.py +266 -0
  5. indexer.py +289 -0
  6. main.py +494 -0
  7. notes.py +47 -0
  8. quick_check_chroma.py +20 -0
  9. rag_pipeline.py +117 -0
  10. retriever_context.py +7 -0
  11. run_pipeline.py +51 -0
  12. streamlit_config.yaml +17 -0
  13. summarizer.py +610 -0
.gitignore ADDED
@@ -0,0 +1,46 @@
+ # Large data folders
+ synthea/
+ data/raw_synthea/
+ data/vector_store*/
+ synthea/output/fhir/
+ *.json
+ *.csv
+ *.zip
+ *.sqlite
+ __pycache__/
+ *.ipynb_checkpoints
+
+ # Ignore vector store database
+ app/data/vector_store/chroma.sqlite3
+ *.sqlite3
+ *.db
+ # Models and cache
+ models/
+ .cache/
+ transformers_cache/
+
+ # Virtual environment
+ new_env_rag/
+ venv/
+ env/
+
+ # Secrets
+
+
+ # Logs and outputs
+ logs/
+ *.log
+ *.jsonl
+
+ # Python cache
+ __pycache__/
+ *.pyc
+
+ # IDE
+ .vs/
+ .vscode/
+ .idea/
+
+ # OS files
+ .DS_Store
+ Thumbs.db
audit.py ADDED
@@ -0,0 +1,79 @@
+ import json
+ import hashlib
+ from datetime import datetime
+ from pathlib import Path
+ import uuid
+ import pytz
+
+ class AuditLogger:
+     def __init__(self, log_file_path="logs/app_audit.jsonl"):
+         self.log_file = Path(log_file_path)
+         self.log_file.parent.mkdir(parents=True, exist_ok=True)
+         # Create file if it doesn't exist
+         if not self.log_file.exists():
+             self.log_file.touch()
+
+     def _get_last_hash(self):
+         """Read the last log entry and return its hash"""
+         try:
+             with open(self.log_file, 'r') as f:
+                 lines = f.readlines()
+                 if lines:
+                     last_entry = json.loads(lines[-1])
+                     return last_entry.get('sha256_curr', '')
+         except Exception:
+             pass
+         return ''  # First entry has no previous hash
+
+     def _compute_hash(self, log_entry):
+         """Create a hash fingerprint of the log entry"""
+         # Convert the log entry to a string and hash it
+         entry_string = json.dumps(log_entry, sort_keys=True)
+         return hashlib.sha256(entry_string.encode()).hexdigest()
+
+     def log_action(self, user, action, resource, additional_info=None):
+         """
+         Main logging function - call this whenever a user does something
+
+         Args:
+             user: username (e.g., 'dr_smith')
+             action: what they did (e.g., 'UPLOAD_NOTE', 'GENERATE_SUMMARY', 'VIEW_LOGS')
+             resource: what they acted on (e.g., 'note_12345.txt', 'patient_record')
+             additional_info: any extra details (dictionary)
+         """
+         # Get the hash of the previous log entry
+         previous_hash = self._get_last_hash()
+
+         # Generate unique IDs for tracing
+         trace_id = str(uuid.uuid4())
+         span_id = str(uuid.uuid4())[:16]  # Shorter ID for span
+
+         # India timezone
+         india = pytz.timezone('Asia/Kolkata')
+         local_time = datetime.now(india).isoformat()
+
+         # Create the new log entry
+         log_entry = {
+             "timestamp": local_time,  # ISO 8601 string already carries the +05:30 offset
+             "user": user,
+             "action": action,
+             "resource": resource,
+             "sha256_prev": previous_hash,
+             "additional_info": additional_info or {},
+
+             # OpenTelemetry attributes
+             "otel_trace_id": trace_id,
+             "otel_span_id": span_id,
+             "otel_service_name": "clinical-rag-app",
+             "severity": "INFO"  # Can be DEBUG, INFO, WARN, ERROR
+         }
+
+         # Compute hash of THIS entry
+         current_hash = self._compute_hash(log_entry)
+         log_entry["sha256_curr"] = current_hash
+
+         # Append to log file (append-only = cannot change old entries)
+         with open(self.log_file, 'a') as f:
+             f.write(json.dumps(log_entry) + '\n')
+
+         return log_entry
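Because each entry stores the previous entry's hash (sha256_prev) alongside its own (sha256_curr), the log can be checked for tampering offline. A minimal verification sketch, assuming the JSONL format written above (verify_chain is an illustrative helper, not part of this commit):

import json, hashlib

def verify_chain(log_file="logs/app_audit.jsonl") -> bool:
    """Walk the audit log and confirm the hash chain is intact."""
    prev = ""
    with open(log_file) as f:
        for line in f:
            entry = json.loads(line)
            claimed = entry.pop("sha256_curr")        # hash recorded for this entry
            if entry.get("sha256_prev", "") != prev:  # must point at the previous entry
                return False
            recomputed = hashlib.sha256(
                json.dumps(entry, sort_keys=True).encode()
            ).hexdigest()
            if recomputed != claimed:                 # entry was altered after being written
                return False
            prev = claimed
    return True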
bcrypt_pw.py ADDED
@@ -0,0 +1,2 @@
+ import bcrypt
+ print(bcrypt.hashpw(b"mypassword", bcrypt.gensalt()).decode())
deid_pipeline.py ADDED
@@ -0,0 +1,266 @@
1
+ import json
2
+ import os
3
+ from dataclasses import dataclass
4
+ from typing import List, Dict, Any, Tuple
5
+
6
+ # Presidio
7
+ from presidio_analyzer import AnalyzerEngine
8
+ from presidio_analyzer.recognizer_registry import RecognizerRegistry
9
+ from presidio_anonymizer import AnonymizerEngine
10
+ from presidio_analyzer import PatternRecognizer
11
+
12
+ # Define medical terms that should NOT be redacted
13
+ medical_terms_allowlist = [
14
+ "substernal", "exertional", "pressure-like", "diaphoresis",
15
+ "chest pain", "nausea", "radiation", "murmurs", "ischemia"
16
+ ]
17
+
18
+ # Configure analyzer to ignore these terms
19
+ analyzer_config = {
20
+ "nlp_engine_name": "spacy",
21
+ "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}],
22
+ "allow_list": medical_terms_allowlist # Don't redact these
23
+ }
24
+
25
+ # NLP for optional section detection
26
+ import spacy
27
+
28
+ # If using medspacy, uncomment (preferred for clinical):
29
+ # import medspacy
30
+ # from medspacy.sectionizer import Sectionizer
31
+
32
+ # If not using medspacy, optional lightweight section tagging:
33
+ # We'll use regex on common headers as a fallback
34
+ import re
35
+
36
+ # Encryption
37
+ from cryptography.fernet import Fernet
38
+
39
+ @dataclass
40
+ class PHISpan:
41
+ entity_type: str
42
+ start: int
43
+ end: int
44
+ text: str
45
+ section: str
46
+
47
+ SECTION_HEADERS = [
48
+ # Common clinical sections; customize as needed
49
+ "HPI", "History of Present Illness",
50
+ "PMH", "Past Medical History",
51
+ "Medications", "Allergies",
52
+ "Assessment and Plan", "Assessment & Plan", "Assessment",
53
+ "Plan", "ROS", "Review of Systems",
54
+ "Physical Exam"
55
+ ]
56
+
57
+ SECTION_PATTERN = re.compile(
58
+ r"^(?P<header>(" + "|".join([re.escape(h) for h in SECTION_HEADERS]) + r"))\s*:\s*$",
59
+ re.IGNORECASE | re.MULTILINE
60
+ )
61
+
62
+ TAG_MAP = {
63
+ "PERSON": "[NAME]",
64
+ "PHONE_NUMBER": "[PHONE]",
65
+ "DATE_TIME": "[DATE]",
66
+ "DATE": "[DATE]",
67
+ "EMAIL_ADDRESS": "[EMAIL]",
68
+ "US_SSN": "[SSN]"
69
+ }
70
+
71
+ class DeidPipeline:
72
+ """
73
+ De-identification pipeline using Microsoft Presidio
74
+ """
75
+ def __init__(self, fernet_key_path="secure_store/fernet.key"):
76
+ """
77
+ Initialize de-identification pipeline with Presidio
78
+
79
+ Args:
80
+ fernet_key_path: Path to Fernet encryption key
81
+ """
82
+ import os
83
+ from cryptography.fernet import Fernet
84
+
85
+ # Initialize encryption
86
+ try:
87
+ if os.path.exists(fernet_key_path):
88
+ # Load existing key from file
89
+ with open(fernet_key_path, "rb") as f:
90
+ key = f.read()
91
+ else:
92
+ # Generate new key for this session
93
+ key = Fernet.generate_key()
94
+ # Try to save it (might fail on read-only filesystems)
95
+ try:
96
+ os.makedirs(os.path.dirname(fernet_key_path), exist_ok=True)
97
+ with open(fernet_key_path, "wb") as f:
98
+ f.write(key)
99
+ except (PermissionError, OSError):
100
+ # Cloud filesystem is read-only, just use the generated key
101
+ pass
102
+
103
+ self.fernet = Fernet(key)
104
+
105
+ except Exception as e:
106
+ # Emergency fallback: Generate temporary key
107
+ print(f"Warning: Could not load encryption key, generating temporary key: {e}")
108
+ key = Fernet.generate_key()
109
+ self.fernet = Fernet(key)
110
+
111
+ # Initialize Presidio components
112
+ self.analyzer = AnalyzerEngine()
113
+ self.anonymizer = AnonymizerEngine()
114
+
115
+ # Load spaCy model
116
+ try:
117
+ self.nlp = spacy.load("en_core_web_lg")
118
+ except OSError:
119
+ print("Downloading spaCy model...")
120
+ import subprocess
121
+ subprocess.run(["python", "-m", "spacy", "download", "en_core_web_lg"])
122
+ self.nlp = spacy.load("en_core_web_lg")
123
+
124
+ def _detect_sections(self, text: str) -> List[Tuple[str, int, int]]:
125
+ """
126
+ Lightweight section finder:
127
+ Return list of (section_title, start_idx, end_idx_of_section_block)
128
+ """
129
+ # Find headers by regex, map their start positions
130
+ headers = []
131
+ for m in SECTION_PATTERN.finditer(text):
132
+ headers.append((m.group("header"), m.start()))
133
+ # Add end sentinel
134
+ headers.append(("[END]", len(text)))
135
+
136
+ sections = []
137
+ for i in range(len(headers) - 1):
138
+ title, start_pos = headers[i]
139
+ next_title, next_pos = headers[i+1]
140
+ sections.append((title.strip(), start_pos, next_pos))
141
+ if not sections:
142
+ # Single default section if none found
143
+ sections = [("DOCUMENT", 0, len(text))]
144
+ return sections
145
+
146
+ def _find_section_for_span(self, sections, start_idx) -> str:
147
+ for title, s, e in sections:
148
+ if s <= start_idx < e:
149
+ return title
150
+ return "DOCUMENT"
151
+
152
+ def analyze(self, text: str) -> List[Dict[str, Any]]:
153
+ # Detect entities
154
+ results = self.analyzer.analyze(text=text, language="en")
155
+ # Convert to dict for consistency
156
+ detections = []
157
+ for r in results:
158
+ detections.append({
159
+ "entity_type": r.entity_type,
160
+ "start": r.start,
161
+ "end": r.end,
162
+ "score": r.score
163
+ })
164
+ return detections
165
+
166
+ def mask(self, text: str, detections: List[Dict[str, Any]]) -> Tuple[str, List[PHISpan]]:
167
+ """
168
+ Replace spans with tags safely (right-to-left to maintain indices).
169
+ """
170
+ # Determine sections for context
171
+ sections = self._detect_sections(text)
172
+
173
+ # Build PHI span records
174
+ spans: List[PHISpan] = []
175
+ for d in detections:
176
+ entity = d["entity_type"]
177
+ start = d["start"]
178
+ end = d["end"]
179
+ original = text[start:end]
180
+ section = self._find_section_for_span(sections, start)
181
+ spans.append(PHISpan(entity_type=entity, start=start, end=end, text=original, section=section))
182
+
183
+ # Replace from the end to avoid index shifting
184
+ masked = text
185
+ for d in sorted(detections, key=lambda x: x["start"], reverse=True):
186
+ entity = d["entity_type"]
187
+ start = d["start"]
188
+ end = d["end"]
189
+ tag = TAG_MAP.get(entity, f"[{entity}]")
190
+ masked = masked[:start] + tag + masked[end:]
191
+
192
+ return masked, spans
193
+
194
+ def encrypt_span_map(self, spans: List[PHISpan], meta: Dict[str, Any]) -> bytes:
195
+ payload = {
196
+ "meta": meta,
197
+ "spans": [s.__dict__ for s in spans]
198
+ }
199
+ blob = json.dumps(payload).encode("utf-8")
200
+ token = self.fernet.encrypt(blob)
201
+ return token
202
+
203
+ def run_on_text(self, text: str, note_id: str) -> Dict[str, Any]:
204
+ detections = self.analyze(text)
205
+ masked, spans = self.mask(text, detections)
206
+
207
+ # Encrypt span map
208
+ token = self.encrypt_span_map(
209
+ spans=spans,
210
+ meta={"note_id": note_id}
211
+ )
212
+
213
+ return {
214
+ "masked_text": masked,
215
+ "encrypted_span_map": token
216
+ }
217
+
218
+ def _read_text_with_fallback(path: str) -> str:
219
+ # 1) Try UTF-8 (preferred for cross-platform)
220
+ try:
221
+ with open(path, "r", encoding="utf-8") as f:
222
+ return f.read()
223
+ except UnicodeDecodeError:
224
+ pass
225
+ # 2) Try Windows-1252 (common for Notepad/docx copy-paste on Windows)
226
+ try:
227
+ with open(path, "r", encoding="cp1252") as f:
228
+ return f.read()
229
+ except UnicodeDecodeError:
230
+ pass
231
+ # 3) Last resort: decode with replacement to avoid crashing; preserves structure
232
+ with open(path, "r", encoding="utf-8", errors="replace") as f:
233
+ return f.read()
234
+
235
+ def run_file(input_path: str, outputs_dir: str = "data/outputs", secure_dir: str = "secure_store"):
236
+ os.makedirs(outputs_dir, exist_ok=True)
237
+ os.makedirs(secure_dir, exist_ok=True)
238
+
239
+ note_id = os.path.splitext(os.path.basename(input_path))[0]
240
+ text = _read_text_with_fallback(input_path)
241
+
242
+ pipeline = DeidPipeline()
243
+ result = pipeline.run_on_text(text, note_id=note_id)
244
+
245
+ # Save masked text normalized to UTF-8
246
+ out_txt = os.path.join(outputs_dir, f"{note_id}.deid.txt")
247
+ with open(out_txt, "w", encoding="utf-8", newline="\n") as f:
248
+ f.write(result["masked_text"])
249
+
250
+ # Save encrypted span map (binary)
251
+ out_bin = os.path.join(secure_dir, f"{note_id}.spanmap.enc")
252
+ with open(out_bin, "wb") as f:
253
+ f.write(result["encrypted_span_map"])
254
+
255
+ print(f"De-identified text -> {out_txt}")
256
+ print(f"Encrypted span map -> {out_bin}")
257
+
258
+
259
+ if __name__ == "__main__":
260
+ import argparse
261
+ parser = argparse.ArgumentParser(description="De-identify a clinical note and save encrypted span map.")
262
+ parser.add_argument("--input", required=True, help="Path to input .txt note")
263
+ parser.add_argument("--outputs_dir", default="data/outputs", help="Output folder for masked text")
264
+ parser.add_argument("--secure_dir", default="secure_store", help="Folder for encrypted span maps")
265
+ args = parser.parse_args()
266
+ run_file(args.input, args.outputs_dir, args.secure_dir)
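For later audit or re-identification, the encrypted span map written by run_file can be decrypted with the same Fernet key. A minimal sketch assuming the default paths above; the note id "sample_note" is illustrative:

import json
from cryptography.fernet import Fernet

# Load the key the pipeline saved (or generated) in secure_store/
with open("secure_store/fernet.key", "rb") as f:
    fernet = Fernet(f.read())

# Decrypt a span map produced by run_file() for a note called "sample_note"
with open("secure_store/sample_note.spanmap.enc", "rb") as f:
    payload = json.loads(fernet.decrypt(f.read()))

print(payload["meta"]["note_id"])
for span in payload["spans"]:
    print(span["entity_type"], span["start"], span["end"], span["section"])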
indexer.py ADDED
@@ -0,0 +1,289 @@
1
+ # app/indexer.py
2
+ # Day 6: Vector store & embeddings
3
+ # Usage examples:
4
+ # python app/indexer.py --input_dir ./data/outputs --db_type chroma --persist_dir ./data/vector_store
5
+ # python app/indexer.py --input_dir ./data/outputs --db_type faiss --persist_dir ./data/vector_store_faiss
6
+
7
+ import os
8
+ import json
9
+ import argparse
10
+ from pathlib import Path
11
+ from typing import List, Dict, Tuple
12
+ from tqdm import tqdm
13
+
14
+ # Embeddings
15
+ from sentence_transformers import SentenceTransformer
16
+
17
+ # Vector stores
18
+ # Chroma
19
+ import chromadb
20
+ from chromadb.config import Settings as ChromaSettings
21
+
22
+ # FAISS
23
+ import faiss
24
+ import pickle
25
+
26
+ DEFAULT_CHUNK_TOKENS = 200
27
+ DEFAULT_OVERLAP_TOKENS = 50
28
+
29
+ def read_note_files(input_dir: str) -> List[Dict]:
30
+ """
31
+ Reads de-identified notes from .txt or .json in input_dir.
32
+ Expects .json to have a 'text' field containing de-identified content.
33
+ Returns list of dicts: {id, text, section?}
34
+ """
35
+ items = []
36
+ p = Path(input_dir)
37
+ if not p.exists():
38
+ raise FileNotFoundError(f"Input dir not found: {input_dir}")
39
+
40
+ for fp in p.glob("**/*"):
41
+ if fp.is_dir():
42
+ continue
43
+ if fp.suffix.lower() == ".txt":
44
+ text = fp.read_text(encoding="utf-8", errors="ignore").strip()
45
+ if text:
46
+ items.append({"id": fp.stem, "text": text, "section": None})
47
+ elif fp.suffix.lower() == ".json":
48
+ try:
49
+ obj = json.loads(fp.read_text(encoding="utf-8", errors="ignore"))
50
+ text = obj.get("text") or obj.get("deidentified_text") or ""
51
+ section = obj.get("section")
52
+ if text:
53
+ items.append({"id": fp.stem, "text": text.strip(), "section": section})
54
+ except Exception:
55
+ # Skip malformed
56
+ continue
57
+ return items
58
+
59
+ def approx_tokenize(text: str) -> List[str]:
60
+ """
61
+ Approximate tokenization by splitting on whitespace.
62
+ For MVP this is fine; can replace with tiktoken later.
63
+ """
64
+ return text.split()
65
+
66
+ def detokenize(tokens: List[str]) -> str:
67
+ return " ".join(tokens)
68
+
69
+ def chunk_text(text: str, chunk_tokens: int, overlap_tokens: int) -> List[str]:
70
+ """
71
+ Simple sliding window chunking.
72
+ """
73
+ tokens = approx_tokenize(text)
74
+ chunks = []
75
+ i = 0
76
+ n = len(tokens)
77
+ while i < n:
78
+ j = min(i + chunk_tokens, n)
79
+ chunk = detokenize(tokens[i:j])
80
+ if chunk.strip():
81
+ chunks.append(chunk)
82
+ if j == n:
83
+ break
84
+ i = j - overlap_tokens
85
+ if i < 0:
86
+ i = 0
87
+ return chunks
88
+
89
+ def embed_texts(model: SentenceTransformer, texts: List[str]):
90
+ return model.encode(texts, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
91
+
92
+ def build_chroma(persist_dir: str, collection_name: str = "notes"):
93
+ client = chromadb.PersistentClient(
94
+ path=persist_dir,
95
+ settings=ChromaSettings(allow_reset=True)
96
+ )
97
+ if collection_name in [c.name for c in client.list_collections()]:
98
+ coll = client.get_collection(collection_name)
99
+ else:
100
+ coll = client.create_collection(collection_name)
101
+ return client, coll
102
+
103
+ def save_faiss(index, vectors_meta: List[Dict], persist_dir: str):
104
+ os.makedirs(persist_dir, exist_ok=True)
105
+ faiss_path = os.path.join(persist_dir, "index.faiss")
106
+ meta_path = os.path.join(persist_dir, "meta.pkl")
107
+ faiss.write_index(index, faiss_path)
108
+ with open(meta_path, "wb") as f:
109
+ pickle.dump(vectors_meta, f)
110
+
111
+ def load_faiss(persist_dir: str):
112
+ faiss_path = os.path.join(persist_dir, "index.faiss")
113
+ meta_path = os.path.join(persist_dir, "meta.pkl")
114
+ if os.path.exists(faiss_path) and os.path.exists(meta_path):
115
+ index = faiss.read_index(faiss_path)
116
+ with open(meta_path, "rb") as f:
117
+ meta = pickle.load(f)
118
+ return index, meta
119
+ return None, []
120
+
121
+ def index_note(
122
+ text: str,
123
+ note_id: str = "temp_note",
124
+ persist_dir: str = "./data/vector_store",
125
+ db_type: str = "chroma",
126
+ model_name: str = "sentence-transformers/all-MiniLM-L6-v2",
127
+ collection: str = "notes"
128
+ ) -> str:
129
+ from sentence_transformers import SentenceTransformer
130
+ import os
131
+
132
+ DEFAULT_CHUNK_TOKENS = 200
133
+ DEFAULT_OVERLAP_TOKENS = 50
134
+
135
+ def approx_tokenize(text: str):
136
+ return text.split()
137
+
138
+ def detokenize(tokens):
139
+ return " ".join(tokens)
140
+
141
+ def chunk_text(text, chunk_tokens, overlap_tokens):
142
+ tokens = approx_tokenize(text)
143
+ chunks = []
144
+ i = 0
145
+ n = len(tokens)
146
+ while i < n:
147
+ j = min(i + chunk_tokens, n)
148
+ chunk = detokenize(tokens[i:j])
149
+ if chunk.strip():
150
+ chunks.append(chunk)
151
+ if j == n:
152
+ break
153
+ i = j - overlap_tokens
154
+ if i < 0:
155
+ i = 0
156
+ return chunks
157
+
158
+ os.makedirs(persist_dir, exist_ok=True)
159
+ model = SentenceTransformer(model_name)
160
+ chunks = chunk_text(text, DEFAULT_CHUNK_TOKENS, DEFAULT_OVERLAP_TOKENS)
161
+ chunk_ids = [f"{note_id}::chunk_{i}" for i in range(len(chunks))]
162
+ metadatas = [{"note_id": note_id, "chunk_index": i} for i in range(len(chunks))]
163
+ vectors = model.encode(chunks, show_progress_bar=False, convert_to_numpy=True, normalize_embeddings=True)
164
+
165
+ if db_type == "chroma":
166
+ from chromadb.config import Settings as ChromaSettings
167
+ import chromadb
168
+ client = chromadb.PersistentClient(
169
+ path=persist_dir,
170
+ settings=ChromaSettings(allow_reset=True)
171
+ )
172
+ if collection in [c.name for c in client.list_collections()]:
173
+ coll = client.get_collection(collection)
174
+ else:
175
+ coll = client.create_collection(collection)
176
+ coll.upsert(
177
+ ids=chunk_ids,
178
+ embeddings=vectors.tolist(),
179
+ documents=chunks,
180
+ metadatas=metadatas,
181
+ )
182
+ elif db_type == "faiss":
183
+ import faiss
184
+ import pickle
185
+ d = vectors.shape[1]
186
+ index = faiss.IndexFlatIP(d)
187
+ index.add(vectors)
188
+ vectors_meta = [
189
+ {"id": chunk_ids[k], "text": chunks[k], "meta": metadatas[k]}
190
+ for k in range(len(chunks))
191
+ ]
192
+ faiss_path = os.path.join(persist_dir, "index.faiss")
193
+ meta_path = os.path.join(persist_dir, "meta.pkl")
194
+ faiss.write_index(index, faiss_path)
195
+ with open(meta_path, "wb") as f:
196
+ pickle.dump(vectors_meta, f)
197
+
198
+ return note_id
199
+
200
+
201
+ def main():
202
+ parser = argparse.ArgumentParser(description="Day 6: Build local vector DB from de-identified notes.")
203
+ parser.add_argument("--input_dir", required=True, help="Directory with de-identified notes (.txt or .json).")
204
+ parser.add_argument("--persist_dir", default="./data/vector_store", help="Where to persist the DB.")
205
+ parser.add_argument("--db_type", choices=["chroma", "faiss"], default="chroma", help="Vector DB type.")
206
+ parser.add_argument("--model_name", default="sentence-transformers/all-MiniLM-L6-v2", help="Embedding model.")
207
+ parser.add_argument("--chunk_tokens", type=int, default=DEFAULT_CHUNK_TOKENS, help="Approx tokens per chunk.")
208
+ parser.add_argument("--overlap_tokens", type=int, default=DEFAULT_OVERLAP_TOKENS, help="Token overlap.")
209
+ parser.add_argument("--collection", default="notes", help="Collection name (Chroma).")
210
+ args = parser.parse_args()
211
+
212
+ notes = read_note_files(args.input_dir)
213
+ if not notes:
214
+ print(f"No de-identified notes found in {args.input_dir}. Ensure Day 5 outputs exist.")
215
+ return
216
+
217
+ print(f"Loaded {len(notes)} de-identified notes from {args.input_dir}")
218
+ os.makedirs(args.persist_dir, exist_ok=True)
219
+
220
+ print(f"Loading embedding model: {args.model_name}")
221
+ model = SentenceTransformer(args.model_name)
222
+
223
+ all_chunk_texts = []
224
+ all_chunk_ids = []
225
+ all_metadata = []
226
+
227
+ print("Chunking notes...")
228
+ for note in tqdm(notes):
229
+ chunks = chunk_text(note["text"], args.chunk_tokens, args.overlap_tokens)
230
+ for idx, ch in enumerate(chunks):
231
+ cid = f"{note['id']}::chunk_{idx}"
232
+ all_chunk_texts.append(ch)
233
+ all_chunk_ids.append(cid)
234
+ all_metadata.append({
235
+ "note_id": note["id"],
236
+ "chunk_index": idx,
237
+ "section": note.get("section")
238
+ })
239
+
240
+ print(f"Total chunks: {len(all_chunk_texts)}")
241
+
242
+ print("Embedding chunks...")
243
+ vectors = embed_texts(model, all_chunk_texts)
244
+
245
+ if args.db_type == "chroma":
246
+ print("Building Chroma persistent collection...")
247
+ client, coll = build_chroma(args.persist_dir, args.collection)
248
+
249
+ # Upsert in manageable batches
250
+ batch = 512
251
+ for i in tqdm(range(0, len(all_chunk_texts), batch)):
252
+ j = min(i + batch, len(all_chunk_texts))
253
+ coll.upsert(
254
+ ids=all_chunk_ids[i:j],
255
+ embeddings=vectors[i:j].tolist(),
256
+ documents=all_chunk_texts[i:j],
257
+ metadatas=all_metadata[i:j],
258
+ )
259
+ print(f"Chroma collection '{args.collection}' persisted at {args.persist_dir}")
260
+
261
+ elif args.db_type == "faiss":
262
+ print("Building FAISS index...")
263
+ d = vectors.shape[1]
264
+ index = faiss.IndexFlatIP(d) # normalized vectors → use inner product as cosine
265
+ # Try to load existing
266
+ existing_index, existing_meta = load_faiss(args.persist_dir)
267
+ if existing_index is not None:
268
+ print("Appending to existing FAISS index...")
269
+ index = existing_index
270
+ vectors_meta = existing_meta
271
+ else:
272
+ vectors_meta = []
273
+ index.add(vectors)
274
+ vectors_meta.extend([
275
+ {
276
+ "id": all_chunk_ids[k],
277
+ "text": all_chunk_texts[k],
278
+ "meta": all_metadata[k]
279
+ } for k in range(len(all_chunk_texts))
280
+ ])
281
+ save_faiss(index, vectors_meta, args.persist_dir)
282
+ print(f"FAISS index persisted at {args.persist_dir}")
283
+
284
+ print("Done.")
285
+
286
+ if __name__ == "__main__":
287
+ main()
288
+ ##result = pipeline.run_on_text(text=note_text, note_id="temp_note")
289
+ ##deid_text = result["masked_text"]
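The sliding-window chunking used here (both the module-level chunk_text and the copy nested inside index_note behave the same way) can be sanity-checked on a tiny input. A short sketch, assuming indexer.py is importable as indexer:

from indexer import chunk_text

words = " ".join(str(i) for i in range(10))  # "0 1 2 ... 9"
# Windows of 4 "tokens" with an overlap of 1 start at positions 0, 3, 6
print(chunk_text(words, chunk_tokens=4, overlap_tokens=1))
# ['0 1 2 3', '3 4 5 6', '6 7 8 9']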
main.py ADDED
@@ -0,0 +1,494 @@
1
+ # --- Imports and page setup ---
2
+ import streamlit as st
3
+ import yaml
4
+ from yaml.loader import SafeLoader
5
+ import streamlit_authenticator as stauth
6
+ import uuid
7
+ import datetime
8
+ from audit import AuditLogger
9
+
10
+ st.set_page_config(page_title="Clinical Summarizer", layout="wide")
11
+ st.title("HIPAA-compliant Clinical RAG Summarizer (MVP)")
12
+
13
+ # --- Authentication setup ---
14
+ def load_config():
15
+ """Load configuration from Streamlit secrets or local YAML"""
16
+ try:
17
+ # Check if running on Streamlit Cloud (secrets available)
18
+ if "credentials" in st.secrets:
19
+ # Convert immutable Streamlit secrets to mutable dict
20
+ config = {
21
+ "credentials": {
22
+ "usernames": {}
23
+ },
24
+ "cookie": {
25
+ "name": str(st.secrets["cookie"]["name"]),
26
+ "key": str(st.secrets["cookie"]["key"]),
27
+ "expiry_days": int(st.secrets["cookie"]["expiry_days"])
28
+ }
29
+ }
30
+
31
+ # Convert each user to mutable dict
32
+ for username, user_data in st.secrets["credentials"]["usernames"].items():
33
+ config["credentials"]["usernames"][str(username)] = {
34
+ "email": str(user_data["email"]),
35
+ "failed_login_attempts": int(user_data.get("failed_login_attempts", 0)),
36
+ "logged_in": bool(user_data.get("logged_in", False)),
37
+ "name": str(user_data["name"]),
38
+ "password": str(user_data["password"]),
39
+ "role": str(user_data["role"])
40
+ }
41
+
42
+ return config
43
+ else:
44
+ # Local development: Load from YAML file
45
+ with open("app/streamlit_config.yaml") as f:
46
+ return yaml.load(f, Loader=SafeLoader)
47
+
48
+ except FileNotFoundError:
49
+ st.error("⚠️ Configuration file not found. Please set up authentication.")
50
+ st.stop()
51
+ except Exception as e:
52
+ st.error(f"⚠️ Configuration error: {e}")
53
+ st.info("Make sure secrets are configured in Streamlit Cloud settings.")
54
+ st.stop()
55
+
56
+ # Load config
57
+ config = load_config()
58
+
59
+ # Create authenticator with mutable config
60
+ authenticator = stauth.Authenticate(
61
+ config["credentials"],
62
+ config["cookie"]["name"],
63
+ config["cookie"]["key"],
64
+ config["cookie"]["expiry_days"],
65
+ )
66
+
67
+ # Render the login widget
68
+ authenticator.login(location="sidebar")
69
+
70
+ # Read values from session_state
71
+ auth_status = st.session_state.get("authentication_status")
72
+ username = st.session_state.get("username")
73
+ name = st.session_state.get("name")
74
+
75
+ if auth_status is False:
76
+ st.error("Invalid username or password")
77
+ st.stop()
78
+ elif auth_status is None:
79
+ st.info("Please log in")
80
+ st.stop()
81
+ else:
82
+ role = config["credentials"]["usernames"][username]["role"]
83
+ st.session_state["role"] = role
84
+ with st.sidebar:
85
+ st.header("Clinical RAG Summarizer")
86
+ st.markdown("HIPAA-compliant, secure, and easy to use.")
87
+ st.markdown("---")
88
+ st.success(f"Logged in as {name}")
89
+ st.markdown(f"**Role:** {role}")
90
+ authenticator.logout("Logout", location="sidebar")
91
+ st.markdown("---")
92
+ st.info("Use the tabs above to upload notes, generate summaries, and view logs.")
93
+
94
+ # Clear ChromaDB cache to prevent singleton conflicts
95
+ try:
96
+ from chromadb.api.client import SharedSystemClient
97
+ SharedSystemClient.clear_system_cache()
98
+ except:
99
+ pass
100
+
101
+ # Generate a unique persist_dir for each session if not already set
102
+ if "persist_dir" not in st.session_state:
103
+ if st.session_state.get("username"):
104
+ st.session_state["persist_dir"] = f"./data/vector_store_{st.session_state['username']}"
105
+ else:
106
+ unique_id = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") + "_" + str(uuid.uuid4())[:8]
107
+ st.session_state["persist_dir"] = f"./data/vector_store_{unique_id}"
108
+
109
+ # Initialize the audit logger
110
+ audit_logger = AuditLogger()
111
+
112
+ # Initialize model cache in session state
113
+ if "t5_model" not in st.session_state:
114
+ st.session_state["t5_model"] = None
115
+ if "t5_tokenizer" not in st.session_state:
116
+ st.session_state["t5_tokenizer"] = None
117
+
118
+ # --- Tabs ---
119
+ upload_tab, summarize_tab, logs_tab = st.tabs(["Upload/Enter Note", "Summarize", "Logs"])
120
+
121
+ # --- Upload/Enter Note tab ---
122
+ with upload_tab:
123
+ st.subheader("Enter or Upload Note")
124
+ st.caption("Paste a synthetic note or upload a .txt file, then de-identify and index.")
125
+
126
+ col_upload, col_text = st.columns([1, 2])
127
+ with col_upload:
128
+ file = st.file_uploader("Upload .txt file", type=["txt"])
129
+ with col_text:
130
+ note_text = st.text_area("Paste note text", height=200, placeholder="Paste clinical note text here...")
131
+
132
+ col1, col2 = st.columns(2)
133
+ with col1:
134
+ deid_index_clicked = st.button("De-identify & Index", use_container_width=True)
135
+ with col2:
136
+ skip_index_clicked = st.button("Skip (already indexed)", use_container_width=True)
137
+
138
+ if file and not note_text:
139
+ note_text = file.read().decode("utf-8", errors="ignore")
140
+
141
+ if deid_index_clicked and note_text:
142
+ try:
143
+ with st.spinner("De-identifying and indexing..."):
144
+ from deid_pipeline import DeidPipeline
145
+ pipeline = DeidPipeline()
146
+ result = pipeline.run_on_text(text=note_text, note_id="temp_note")
147
+ deid_text = result["masked_text"]
148
+ st.success("De-identified.")
149
+ st.text_area("De-identified preview", deid_text, height=160)
150
+
151
+ from indexer import index_note
152
+ # Use session-specific persist_dir
153
+ note_id = index_note(
154
+ text=deid_text,
155
+ note_id=f"note_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}",
156
+ persist_dir=st.session_state["persist_dir"],
157
+ db_type="chroma",
158
+ model_name="sentence-transformers/all-MiniLM-L6-v2",
159
+ collection="notes"
160
+ )
161
+ st.session_state["last_note_id"] = note_id
162
+ st.session_state["last_deid_text"] = deid_text
163
+ st.session_state["last_note_indexed"] = True
164
+ st.success(f"✓ Indexed note_id: {note_id}")
165
+ st.info(f"📁 Stored in: {st.session_state['persist_dir']}")
166
+
167
+ # Audit log for indexing
168
+ audit_logger.log_action(
169
+ user=st.session_state.get('username', 'anonymous'),
170
+ action="INDEX_NOTE",
171
+ resource=note_id,
172
+ additional_info={
173
+ "text_length": len(deid_text),
174
+ "persist_dir": st.session_state["persist_dir"]
175
+ }
176
+ )
177
+
178
+ # Audit log for de-identification
179
+ audit_logger.log_action(
180
+ user=st.session_state.get('username', 'anonymous'),
181
+ action="DEID_PROCESS",
182
+ resource="temp_note",
183
+ additional_info={"original_length": len(note_text), "deid_length": len(deid_text)}
184
+ )
185
+ st.toast("Note indexed and de-identified!", icon="✅")
186
+ except Exception as e:
187
+ st.error(f"De-identification error: {e}")
188
+ import traceback
189
+ st.code(traceback.format_exc())
190
+ elif skip_index_clicked and note_text:
191
+ st.session_state["last_deid_text"] = note_text
192
+ st.session_state["last_note_indexed"] = False
193
+ st.info("Skipped indexing; text saved for summarization.")
194
+
195
+ # Audit log for skipping
196
+ audit_logger.log_action(
197
+ user=st.session_state.get('username', 'anonymous'),
198
+ action="SKIP_INDEX",
199
+ resource="temp_note",
200
+ additional_info={"text_length": len(note_text)}
201
+ )
202
+
203
+ if "last_deid_text" not in st.session_state:
204
+ st.caption("Tip: click 'De-identify & Index' or 'Skip' to carry text into the Summarize tab.")
205
+ else:
206
+ st.write(f"✓ Note text ready: {len(st.session_state['last_deid_text'])} characters")
207
+ st.write(f"Preview: {st.session_state['last_deid_text'][:100]}...")
208
+
209
+ # --- Summarize tab ---
210
+ with summarize_tab:
211
+ st.subheader("Summarize")
212
+ st.caption("Retrieves context and generates a structured clinical summary.")
213
+
214
+ from rag_pipeline import load_embedder, load_chroma, load_faiss_langchain, retrieve
215
+ from summarizer import make_t5, summarize_docs, validate_summary_quality
216
+
217
+ # Environment detection
218
+ import os
219
+ IS_CLOUD = os.path.exists('/mount/src')
220
+ if IS_CLOUD:
221
+ st.info("🌐 Cloud Mode: Using optimized model (flan-t5-base)")
222
+
223
+ # Clear ChromaDB system cache to avoid singleton conflicts
224
+ try:
225
+ import chromadb
226
+ from chromadb.api.client import SharedSystemClient
227
+ SharedSystemClient.clear_system_cache()
228
+ except Exception as e:
229
+ st.warning(f"Could not clear ChromaDB cache: {e}")
230
+
231
+ # Show current vector store location
232
+ st.info(f"📁 Using vector store: {st.session_state['persist_dir']}")
233
+
234
+ source_choice = st.radio("Use source:", ["Last de-identified text", "Note ID"], horizontal=True)
235
+ default_note_id = st.session_state.get("last_note_id", "")
236
+ user_note_id = st.text_input("Note ID (optional)", value=str(default_note_id))
237
+
238
+ # Add method selection
239
+ method_choice = st.radio("Extraction method:", ["multistage", "singleshot"], horizontal=True,
240
+ help="Multistage: Better quality, slower. Singleshot: Faster, may miss details.")
241
+
242
+ generate_clicked = st.button("Generate Summary", type="primary", use_container_width=True)
243
+
244
+ if generate_clicked:
245
+ try:
246
+ with st.spinner("Retrieving context..."):
247
+ embed_model = "sentence-transformers/all-MiniLM-L6-v2"
248
+ db_type = "chroma"
249
+ persist_dir = st.session_state["persist_dir"]
250
+ collection = "notes"
251
+ top_k = 5
252
+
253
+ # Cache vector database in session state to avoid recreating
254
+ cache_key = f"vdb_{persist_dir}_{collection}"
255
+
256
+ if cache_key not in st.session_state:
257
+ st.info("⏳ Loading vector database (first time)...")
258
+ _, embeddings = load_embedder(embed_model)
259
+
260
+ # Clear cache before creating new instance
261
+ try:
262
+ SharedSystemClient.clear_system_cache()
263
+ except:
264
+ pass
265
+
266
+ if db_type == "chroma":
267
+ vdb = load_chroma(persist_dir, collection, embeddings)
268
+ else:
269
+ vdb = load_faiss_langchain(persist_dir, embeddings)
270
+
271
+ st.session_state[cache_key] = vdb
272
+ st.success("✓ Vector database loaded")
273
+ else:
274
+ vdb = st.session_state[cache_key]
275
+ st.info("✓ Using cached vector database")
276
+
277
+ # Use actual note content for retrieval
278
+ if source_choice == "Note ID" and user_note_id:
279
+ query_text = user_note_id
280
+ st.info(f"🔍 Retrieving by Note ID: {user_note_id}")
281
+ else:
282
+ deid_text = st.session_state.get("last_deid_text", "")
283
+ if not deid_text:
284
+ st.warning("No de-identified text available. Please use the Upload tab first.")
285
+ st.stop()
286
+ query_text = deid_text[:500]
287
+ st.info(f"🔍 Retrieving using note content ({len(deid_text)} chars)")
288
+
289
+ docs = retrieve(vdb, query_text, top_k)
290
+
291
+ if not docs:
292
+ st.error("⚠ No documents retrieved from vector database!")
293
+ st.warning("This usually means:")
294
+ st.write("• The vector database is empty")
295
+ st.write("• The note wasn't properly indexed")
296
+ st.write(f"• Check if files exist in: {persist_dir}")
297
+
298
+ if st.button("🔄 Clear cache and retry"):
299
+ if cache_key in st.session_state:
300
+ del st.session_state[cache_key]
301
+ SharedSystemClient.clear_system_cache()
302
+ st.rerun()
303
+ st.stop()
304
+
305
+ st.success(f"✓ Retrieved {len(docs)} document(s)")
306
+
307
+ # Show preview of retrieved content
308
+ with st.expander("View retrieved content"):
309
+ for i, doc in enumerate(docs, 1):
310
+ st.write(f"**Document {i}:**")
311
+ st.code(doc.page_content[:300] + "..." if len(doc.page_content) > 300 else doc.page_content)
312
+
313
+ with st.spinner("Generating summary... (this may take 1-2 minutes on CPU)"):
314
+ # Cache model loading in session state
315
+ if st.session_state["t5_model"] is None or st.session_state["t5_tokenizer"] is None:
316
+ st.info("⏳ Loading T5 model (first time only)...")
317
+ tokenizer, model = make_t5("google/flan-t5-base")
318
+ st.session_state["t5_tokenizer"] = tokenizer
319
+ st.session_state["t5_model"] = model
320
+ else:
321
+ tokenizer = st.session_state["t5_tokenizer"]
322
+ model = st.session_state["t5_model"]
323
+ st.info("✓ Using cached model")
324
+
325
+ # Generate summary
326
+ summary = summarize_docs(tokenizer, model, docs, method=method_choice)
327
+
328
+ # Store summary in session state
329
+ summary_key = f"summary_{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}"
330
+ st.session_state["last_summary"] = summary
331
+ st.session_state["last_summary_key"] = summary_key
332
+
333
+ # Validation
334
+ original_text = st.session_state.get("last_deid_text", "")
335
+ validation = validate_summary_quality(summary, original_text)
336
+
337
+ # Display validation results
338
+ status_color = {
339
+ "GOOD": "🟢",
340
+ "FAIR": "🟡",
341
+ "POOR": "🟠",
342
+ "FAILED": "🔴"
343
+ }
344
+
345
+ st.success("✓ Summary generated successfully")
346
+
347
+ # Show quality assessment in two columns
348
+ col_status, col_score = st.columns([3, 1])
349
+ with col_status:
350
+ st.markdown(f"### {status_color.get(validation['status'], '⚪')} Quality Status: **{validation['status']}**")
351
+ with col_score:
352
+ st.metric("Quality Score", f"{validation['quality_score']}/100")
353
+
354
+ # Display critical issues if any
355
+ if validation['issues']:
356
+ st.error("**❌ Critical Issues Detected:**")
357
+ for issue in validation['issues']:
358
+ st.markdown(f"- {issue}")
359
+ st.markdown("**Recommendation:** Review de-identification settings and retrieval quality")
360
+
361
+ # Display warnings if any
362
+ if validation['warnings']:
363
+ st.warning("**⚠️ Quality Warnings:**")
364
+ for warning in validation['warnings']:
365
+ st.markdown(f"- {warning}")
366
+
367
+ # Show detailed quality metrics in expandable section
368
+ with st.expander("📊 Detailed Quality Metrics"):
369
+ metric_col1, metric_col2, metric_col3, metric_col4 = st.columns(4)
370
+ with metric_col1:
371
+ st.metric("PHI Placeholders", validation['metrics']['total_placeholders'])
372
+ with metric_col2:
373
+ st.metric("Empty Sections", validation['metrics']['empty_sections'])
374
+ with metric_col3:
375
+ st.metric("Filled Sections", f"{validation['metrics']['filled_sections']}/7")
376
+ with metric_col4:
377
+ st.metric("Total Length", f"{validation['metrics']['total_length']} chars")
378
+
379
+ # Show warning banner if quality is poor
380
+ if validation['status'] in ['POOR', 'FAILED']:
381
+ st.warning("⚠️ **Quality Alert:** The summary below has significant quality issues. Review carefully before clinical use.")
382
+ elif validation['status'] == 'FAIR':
383
+ st.info("ℹ️ The summary has minor quality issues. Review the warnings above.")
384
+ else:
385
+ st.success("✅ Summary quality is acceptable.")
386
+
387
+ # Display the summary
388
+ st.text_area("Structured Summary", summary, height=400, key=f"summary_display_{summary_key}")
389
+ st.download_button("Download .txt", data=summary, file_name=f"summary_{user_note_id or 'latest'}.txt")
390
+
391
+ # Show summary statistics
392
+ col1, col2, col3 = st.columns(3)
393
+ with col1:
394
+ st.metric("Summary Length", f"{len(summary)} chars")
395
+ with col2:
396
+ st.metric("Documents Retrieved", len(docs))
397
+ with col3:
398
+ sections_filled = 7 - summary.count("None stated")
399
+ st.metric("Sections Filled", f"{sections_filled}/7")
400
+
401
+ # Audit log for summary generation with validation results
402
+ audit_logger.log_action(
403
+ user=st.session_state.get('username', 'anonymous'),
404
+ action="GENERATE_SUMMARY",
405
+ resource=user_note_id or "temp_note",
406
+ additional_info={
407
+ "retrieved_docs": len(docs),
408
+ "method": method_choice,
409
+ "summary_length": len(summary),
410
+ "persist_dir": persist_dir,
411
+ "sections_filled": sections_filled,
412
+ "quality_status": validation['status'],
413
+ "quality_score": validation['quality_score'],
414
+ "validation_issues": len(validation['issues']),
415
+ "validation_warnings": len(validation['warnings']),
416
+ "phi_placeholders": validation['metrics']['total_placeholders']
417
+ }
418
+ )
419
+
420
+ except ValueError as ve:
421
+ if "already exists" in str(ve):
422
+ st.error("❌ ChromaDB instance conflict detected!")
423
+ st.warning("This happens when the vector database is accessed with different settings.")
424
+ st.info("**Solution:** Click the button below to clear the cache and retry.")
425
+
426
+ if st.button("🔄 Clear ChromaDB cache and retry", type="primary"):
427
+ try:
428
+ SharedSystemClient.clear_system_cache()
429
+ except:
430
+ pass
431
+
432
+ keys_to_delete = [k for k in st.session_state.keys() if k.startswith("vdb_")]
433
+ for key in keys_to_delete:
434
+ del st.session_state[key]
435
+
436
+ st.success("✓ Cache cleared! Click 'Generate Summary' again.")
437
+ st.rerun()
438
+ else:
439
+ st.error(f"❌ Error during summarization: {ve}")
440
+ import traceback
441
+ st.code(traceback.format_exc())
442
+ except Exception as e:
443
+ st.error(f"❌ Error during summarization: {e}")
444
+ import traceback
445
+ st.code(traceback.format_exc())
446
+
447
+ # Show last summary if available (when button not clicked)
448
+ elif "last_summary" in st.session_state:
449
+ st.info("Showing last generated summary:")
450
+ st.text_area("Last Summary", st.session_state["last_summary"], height=400)
451
+ st.download_button("Download Last Summary",
452
+ data=st.session_state["last_summary"],
453
+ file_name="last_summary.txt")
454
+
455
+ # --- Logs tab ---
456
+ with logs_tab:
457
+ st.subheader("Logs")
458
+ if st.session_state.get("role") != "admin":
459
+ st.info("Admins only.")
460
+ else:
461
+ st.caption("Audit logs for all user actions.")
462
+
463
+ # Audit log for viewing logs
464
+ audit_logger.log_action(
465
+ user=st.session_state.get('username', 'anonymous'),
466
+ action="VIEW_LOGS",
467
+ resource="app_audit.jsonl"
468
+ )
469
+
470
+ # Add log filtering
471
+ col1, col2 = st.columns([3, 1])
472
+ with col1:
473
+ filter_action = st.selectbox("Filter by action:",
474
+ ["All", "INDEX_NOTE", "GENERATE_SUMMARY", "DEID_PROCESS", "VIEW_LOGS"])
475
+ with col2:
476
+ num_lines = st.number_input("Show last N lines:", min_value=10, max_value=500, value=50)
477
+
478
+ try:
479
+ import json
480
+ with open("logs/app_audit.jsonl") as f:
481
+ lines = f.readlines()[-num_lines:]
482
+
483
+ st.write(f"Showing {len(lines)} most recent log entries:")
484
+
485
+ for line in lines:
486
+ try:
487
+ log_entry = json.loads(line.strip())
488
+ if filter_action == "All" or log_entry.get("action") == filter_action:
489
+ with st.expander(f"{log_entry.get('timestamp', 'N/A')} - {log_entry.get('action', 'N/A')}"):
490
+ st.json(log_entry)
491
+ except json.JSONDecodeError:
492
+ st.code(line.strip())
493
+ except FileNotFoundError:
494
+ st.warning("No logs found yet. Logs will appear after you perform actions.")
notes.py ADDED
@@ -0,0 +1,47 @@
+ from langchain_chroma import Chroma
+ from sentence_transformers import SentenceTransformer
+ from langchain.embeddings.base import Embeddings
+
+ # 1. Wrap SentenceTransformer in a LangChain-compatible class
+ class STEmbeddings(Embeddings):
+     def __init__(self, model_name="sentence-transformers/all-MiniLM-L6-v2"):
+         self.model = SentenceTransformer(model_name)
+
+     def embed_documents(self, texts):
+         return self.model.encode(texts, convert_to_numpy=True, normalize_embeddings=True).tolist()
+
+     def embed_query(self, text):
+         return self.model.encode([text], convert_to_numpy=True, normalize_embeddings=True)[0].tolist()
+
+ # 2. Instantiate embeddings
+ embeddings = STEmbeddings()
+
+ # 3. Create or load Chroma collection
+ db = Chroma(
+     collection_name="notes",
+     persist_directory="./data/vector_store",
+     embedding_function=embeddings
+ )
+
+ # 4. Add some sample texts
+ texts = [
+     "Patient presents with chest pain for 2 days.",
+     "History of hypertension and diabetes.",
+     "Currently taking metformin and lisinopril.",
+     "No known drug allergies.",
+     "Plan: schedule ECG and follow-up in 1 week."
+ ]
+
+ metadatas = [
+     {"note_id": "1", "section": "HPI", "chunk_index": 0},
+     {"note_id": "1", "section": "PMH", "chunk_index": 0},
+     {"note_id": "1", "section": "Medications", "chunk_index": 0},
+     {"note_id": "1", "section": "Allergies", "chunk_index": 0},
+     {"note_id": "1", "section": "Plan", "chunk_index": 0},
+ ]
+
+ db.add_texts(texts=texts, metadatas=metadatas)
+
+ # 5. Persist to disk (langchain_chroma writes to persist_directory automatically)
+
+ print("Ingestion complete. Collection 'notes' is ready.")
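To confirm the ingestion worked, the same db object can be queried directly. A short sketch; the query string is illustrative:

# Retrieve the two chunks most similar to a query
for doc in db.similarity_search("chest pain workup", k=2):
    print(doc.metadata.get("section"), "->", doc.page_content)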
quick_check_chroma.py ADDED
@@ -0,0 +1,20 @@
+ # quick_check_chroma.py
+ import chromadb
+ from chromadb.config import Settings as ChromaSettings
+
+ persist_dir = "./data/vector_store"
+ collection_name = "notes"
+
+ client = chromadb.PersistentClient(path=persist_dir, settings=ChromaSettings())
+ coll = client.get_collection(collection_name)
+
+ query = "Type 2 diabetes management plan with metformin"
+ res = coll.query(
+     query_texts=[query],
+     n_results=3,
+ )
+
+ for i, doc in enumerate(res["documents"][0]):
+     print(f"\nTop {i+1} doc:")
+     print(doc)
+     print("Meta:", res["metadatas"][0][i])
rag_pipeline.py ADDED
@@ -0,0 +1,117 @@
+ # app/rag_pipeline.py
+ # Day 7: Retriever + RAG baseline (retrieval only; generation comes on Day 8)
+ # Example usage:
+ # python app/rag_pipeline.py --db_type chroma --persist_dir ./data/vector_store --collection notes --query "Summarize into HPI/Assessment/Plan" --top_k 5
+ # python app/rag_pipeline.py --db_type faiss --persist_dir ./data/vector_store_faiss --query "Extract Assessment and Plan" --top_k 5
+
+ import os
+ import argparse
+ import pickle
+ from typing import List, Dict
+ import uuid
+ import datetime
+ import shutil
+
+ from sentence_transformers import SentenceTransformer
+ import numpy as np
+
+ # LangChain vector store wrappers
+ from langchain_community.vectorstores import Chroma, FAISS
+ from langchain_core.documents import Document
+
+ # For FAISS manual load if using custom persisted index
+ import faiss
+ from chromadb.config import Settings as ChromaSettings
+
+ def load_embedder(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
+     model = SentenceTransformer(model_name)
+     def embed_f(texts: List[str]) -> List[List[float]]:
+         vecs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
+         return vecs.tolist()
+     return model, embed_f
+
+ def load_chroma(persist_dir: str, collection: str, embed_f):
+     from langchain.embeddings.base import Embeddings
+     class STEmbeddings(Embeddings):
+         def embed_documents(self, texts: List[str]) -> List[List[float]]:
+             return embed_f(texts)
+         def embed_query(self, text: str) -> List[float]:
+             return embed_f([text])[0]
+
+     embeddings = STEmbeddings()
+     vectordb = Chroma(
+         collection_name=collection,
+         persist_directory=persist_dir,
+         embedding_function=embeddings
+     )
+     return vectordb
+
+ def load_faiss_langchain(persist_dir: str, embed_f):
+     # If Day 6 saved FAISS with LangChain's FAISS.save_local, we could do:
+     #   return FAISS.load_local(persist_dir, embeddings, allow_dangerous_deserialization=True)
+     # But Day 6 saved raw FAISS + meta.pkl; handle that manually and wrap.
+     from langchain.embeddings.base import Embeddings
+     class STEmbeddings(Embeddings):
+         def embed_documents(self, texts: List[str]) -> List[List[float]]:
+             return embed_f(texts)
+         def embed_query(self, text: str) -> List[float]:
+             return embed_f([text])[0]
+     embeddings = STEmbeddings()
+
+     index_path = os.path.join(persist_dir, "index.faiss")
+     meta_path = os.path.join(persist_dir, "meta.pkl")
+     if not (os.path.exists(index_path) and os.path.exists(meta_path)):
+         raise FileNotFoundError(f"FAISS files not found in {persist_dir}")
+
+     index = faiss.read_index(index_path)
+     with open(meta_path, "rb") as f:
+         meta = pickle.load(f)
+
+     # Build FAISS VectorStore from texts + metadata to leverage LC retriever
+     texts = [m["text"] for m in meta]
+     metadatas = [m["meta"] | {"id": m["id"]} for m in meta]
+     vectordb = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
+     # Replace the underlying index with the prebuilt one (saves re-embedding cost when querying)
+     vectordb.index = index
+     return vectordb
+
+ def retrieve(vdb, query: str, top_k: int = 5):
+     retriever = vdb.as_retriever(search_kwargs={"k": top_k})
+     docs: List[Document] = retriever.invoke(query)
+     return docs
+
+ def format_context(docs: List[Document]) -> str:
+     parts = []
+     for i, d in enumerate(docs, 1):
+         md = d.metadata or {}
+         parts.append(f"[{i}] note_id={md.get('note_id')} section={md.get('section')} chunk_idx={md.get('chunk_index')}\n{d.page_content}")
+     return "\n\n---\n\n".join(parts)
+
+ def main():
+     parser = argparse.ArgumentParser(description="Day 7: Retriever + RAG baseline (retrieval only).")
+     parser.add_argument("--db_type", choices=["chroma", "faiss"], default="chroma")
+     parser.add_argument("--persist_dir", default="./data/vector_store")
+     parser.add_argument("--collection", default="notes")
+     parser.add_argument("--model_name", default="sentence-transformers/all-MiniLM-L6-v2")
+     parser.add_argument("--query", required=True)
+     parser.add_argument("--top_k", type=int, default=5)
+     args = parser.parse_args()
+
+     # "Sure shot fix": remove any existing persist_dir so Chroma starts clean (note: this wipes previously indexed data)
+     if args.db_type == "chroma" and os.path.exists(args.persist_dir):
+         shutil.rmtree(args.persist_dir)
+
+     _, embed_f = load_embedder(args.model_name)
+
+     if args.db_type == "chroma":
+         vectordb = load_chroma(args.persist_dir, args.collection, embed_f)
+     else:
+         vectordb = load_faiss_langchain(args.persist_dir, embed_f)
+
+     docs = retrieve(vectordb, args.query, args.top_k)
+     context = format_context(docs)
+     print("\n=== Retrieved Context (to feed Day 8 summarizer) ===\n")
+     print(context)
+
+ if __name__ == "__main__":
+     main()
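run_pipeline.py and retriever_context.py below import a retrieve_context helper from this module that the committed file does not define. A minimal sketch of what such a helper could look like, reusing the functions above and the Chroma defaults (an assumption, not the author's implementation):

def retrieve_context(query: str, top_k: int = 5,
                     persist_dir: str = "./data/vector_store",
                     collection: str = "notes") -> str:
    """Load the persisted Chroma store, retrieve top_k chunks, and return formatted context."""
    _, embed_f = load_embedder()
    vectordb = load_chroma(persist_dir, collection, embed_f)
    docs = retrieve(vectordb, query, top_k)
    return format_context(docs)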
retriever_context.py ADDED
@@ -0,0 +1,7 @@
+ from rag_pipeline import retrieve_context
+ from summarizer import generate_summary
+
+ query = "Summarize into HPI/Assessment/Plan"
+ retrieved_text = retrieve_context(query, top_k=5)
+ summary = generate_summary(retrieved_text)
+ print(summary)
run_pipeline.py ADDED
@@ -0,0 +1,51 @@
+ # run_pipeline.py
+ from rag_pipeline import retrieve_context  # <-- your Day 7 retriever
+ from transformers import pipeline
+
+ # 1. Load a summarization model
+ # Option A: summarization-tuned model (recommended for clean summaries)
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
+ # Option B: instruction-tuned model (if you want to experiment with prompts)
+ # summarizer = pipeline("text2text-generation", model="google/flan-t5-base")
+
+ # 2. Define a function to generate a structured summary
+ def generate_summary(retrieved_text: str):
+     # For BART summarizer (Option A)
+     result = summarizer(retrieved_text, max_length=250, min_length=80, do_sample=False)
+     return result[0]['summary_text']
+
+ # If using Flan-T5 (Option B), uncomment this instead:
+ """
+ prompt = f'''
+ You are a clinical summarization assistant.
+ Use ONLY the provided context to create a structured summary.
+ Do not invent information.
+
+ Context:
+ {retrieved_text}
+
+ Write the output in this exact format:
+ Chief Complaint: ...
+ HPI: ...
+ PMH: ...
+ Medications: ...
+ Allergies: ...
+ Assessment: ...
+ Plan: ...
+ '''
+ result = summarizer(prompt, max_new_tokens=300, do_sample=False)
+ return result[0]['generated_text']
+ """
+
+ # 3. Main execution
+ if __name__ == "__main__":
+     query = "Summarize into HPI/Assessment/Plan"
+     # Get top 5 relevant chunks from your vector store
+     retrieved_text = retrieve_context(query, top_k=5)
+
+     print("=== Retrieved Context ===")
+     print(retrieved_text)
+     print("\n=== Structured Clinical Summary ===")
+     summary = generate_summary(retrieved_text)
+     print(summary)
streamlit_config.yaml ADDED
@@ -0,0 +1,17 @@
+ credentials:
+   usernames:
+     clinician1:
+
+       name: Clinician One
+       password: "$2b$12$r3uqzaknAfUAsMEVIKTR2eN8yuPxu8d8YJWmPOrvNKwK.K94sjl1W"
+       role: clinician
+     admin1:
+
+       name: Admin One
+       password: "$2b$12$r3uqzaknAfUAsMEVIKTR2eN8yuPxu8d8YJWmPOrvNKwK.K94sjl1W"
+       role: admin
+
+ cookie:
+   expiry_days: 1
+   key: some_random_secret
+   name: auth_cookie
summarizer.py ADDED
@@ -0,0 +1,610 @@
1
+ # app/summarizer.py
2
+ # Day 10: Enhanced HIPAA-compliant RAG clinical summarizer with robustness improvements
3
+ # Critical fixes:
4
+ # - Added progress indicators during model generation
5
+ # - Implemented timeout mechanism for long-running operations
6
+ # - Optimized for CPU with reduced generation parameters
7
+ # - Better error handling and verbose logging
8
+ # - Fallback to smaller max tokens if generation hangs
9
+
10
+ import os
11
+ import argparse
12
+ import traceback
13
+ from typing import List, Dict, Optional
14
+ import re
15
+ import time
16
+ import sys
17
+
18
+ from sentence_transformers import SentenceTransformer
19
+ from langchain_community.vectorstores import Chroma, FAISS
20
+ from langchain_core.documents import Document
21
+
22
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
23
+
24
+ # -----------------------------
25
+ # Embeddings / Vector stores
26
+ # -----------------------------
27
+ def load_embedder(model_name: str = "sentence-transformers/all-MiniLM-L6-v2"):
28
+ """
29
+ Load sentence transformer for embeddings.
30
+ For medical domain: consider "emilyalsentzer/Bio_ClinicalBERT" or similar
31
+ """
32
+ print(f" → Loading embedding model...")
33
+ model = SentenceTransformer(model_name)
34
+ def embed_f(texts: List[str]):
35
+ vecs = model.encode(texts, convert_to_numpy=True, normalize_embeddings=True)
36
+ return vecs.tolist()
37
+ print(f" ✓ Embedding model loaded")
38
+ return embed_f
39
+
40
+ def load_chroma(persist_dir: str, collection: str, embed_f):
41
+ from langchain.embeddings.base import Embeddings
42
+ class STEmbeddings(Embeddings):
43
+ def embed_documents(self, texts: List[str]) -> List[List[float]]:
44
+ return embed_f(texts)
45
+ def embed_query(self, text: str) -> List[float]:
46
+ return embed_f([text])[0]
47
+ embeddings = STEmbeddings()
48
+ print(f" → Loading Chroma vector store from {persist_dir}...")
49
+ return Chroma(collection_name=collection, persist_directory=persist_dir, embedding_function=embeddings)
50
+
51
+ def load_faiss(persist_dir: str, embed_f):
52
+ import pickle, faiss
53
+ from langchain.embeddings.base import Embeddings
54
+ class STEmbeddings(Embeddings):
55
+ def embed_documents(self, texts: List[str]) -> List[List[float]]:
56
+ return embed_f(texts)
57
+ def embed_query(self, text: str) -> List[float]:
58
+ return embed_f([text])[0]
59
+ embeddings = STEmbeddings()
60
+ index_path = os.path.join(persist_dir, "index.faiss")
61
+ meta_path = os.path.join(persist_dir, "meta.pkl")
62
+ if not (os.path.exists(index_path) and os.path.exists(meta_path)):
63
+ raise FileNotFoundError(f"FAISS files not found in {persist_dir}")
64
+ print(f" → Loading FAISS index from {persist_dir}...")
65
+ with open(meta_path, "rb") as f:
66
+ meta = pickle.load(f)
67
+ texts = [m["text"] for m in meta]
68
+ metadatas = [m["meta"] | {"id": m["id"]} for m in meta]
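+ # Note: FAISS.from_texts re-embeds every stored chunk before the persisted
+ # index is swapped in below, so loading a large store this way can be slow.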
69
+ vdb = FAISS.from_texts(texts=texts, embedding=embeddings, metadatas=metadatas)
70
+ vdb.index = faiss.read_index(index_path)
71
+ return vdb
72
+
73
+ def retrieve_docs(db_type: str, persist_dir: str, collection: str, query: str, top_k: int, embed_f) -> List[Document]:
74
+ if db_type == "chroma":
75
+ vdb = load_chroma(persist_dir, collection, embed_f)
76
+ else:
77
+ vdb = load_faiss(persist_dir, embed_f)
78
+
79
+ print(f" → Retrieving documents...")
80
+ retriever = vdb.as_retriever(search_kwargs={"k": top_k})
81
+ docs: List[Document] = retriever.invoke(query)
82
+ print(f" ✓ Retrieved {len(docs)} document(s)")
83
+
84
+ # Debug: Show retrieved content length
85
+ if docs:
86
+ total_chars = sum(len(d.page_content) for d in docs)
87
+ print(f" ℹ Total retrieved content: {total_chars} characters")
88
+ else:
89
+ print(f" ⚠ WARNING: No documents retrieved!")
90
+
91
+ return docs
92
+
93
+ # -----------------------------
94
+ # T5 Summarization utilities
95
+ # -----------------------------
96
+ def make_t5(model_name="google/flan-t5-base", device="cpu"):
97
+ print(f" → Loading T5 model: {model_name}")
98
+ print(f" ℹ This may take 30-60 seconds for large models...")
99
+ tokenizer = AutoTokenizer.from_pretrained(model_name)
100
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
101
+ print(f" ✓ Model loaded successfully")
102
+ return tokenizer, model
103
+
104
+ def t5_generate(tokenizer, model, prompt: str, max_input_tokens: int = 512, max_output_tokens: int = 256, section_name: str = ""):
105
+ """
106
+ Enhanced generation with progress indicators and optimized parameters for CPU
107
+ """
108
+ # Show progress
109
+ if section_name:
110
+ print(f" → Generating {section_name}...", end='', flush=True)
111
+ else:
112
+ print(f" → Generating summary...", end='', flush=True)
113
+
114
+ start_time = time.time()
115
+
116
+ try:
117
+ inputs = tokenizer(prompt, truncation=True, max_length=max_input_tokens, return_tensors="pt")
118
+ inputs = {k: v.to(model.device) for k, v in inputs.items()}
119
+
120
+ # Optimized parameters for CPU performance
121
+ outputs = model.generate(
122
+ **inputs,
123
+ max_new_tokens=max_output_tokens,
124
+ min_length=10, # Reduced minimum to avoid forcing long outputs
125
+ num_beams=2, # Reduced from 4 for faster CPU generation
126
+ length_penalty=1.0, # Reduced from 1.5
127
+ no_repeat_ngram_size=3,
128
+ early_stopping=True, # Re-enabled for faster completion
129
+ do_sample=False # Deterministic generation
130
+ )
131
+
132
+ result = tokenizer.decode(outputs[0], skip_special_tokens=True)
133
+ elapsed = time.time() - start_time
134
+ print(f" done ({elapsed:.1f}s)")
135
+
136
+ return result
137
+ except Exception as e:
138
+ elapsed = time.time() - start_time
139
+ print(f" FAILED ({elapsed:.1f}s)")
140
+ print(f" ✗ Error: {str(e)}")
141
+ return ""
142
+
143
+ def dedupe_texts(texts: List[str]) -> List[str]:
144
+ seen = set()
145
+ uniq = []
146
+ for t in texts:
147
+ key = " ".join(t.split())[:500]
148
+ if key not in seen:
149
+ seen.add(key)
150
+ uniq.append(t)
151
+ return uniq
152
+
153
+ # -----------------------------
154
+ # Section definitions
155
+ # -----------------------------
156
+ SECTION_ORDER = [
157
+ "Chief Complaint",
158
+ "HPI",
159
+ "PMH",
160
+ "Medications",
161
+ "Allergies",
162
+ "Assessment",
163
+ "Plan",
164
+ ]
165
+
166
+ # -----------------------------
167
+ # Multi-stage extraction prompts (optimized for T5)
168
+ # -----------------------------
169
+ SECTION_PROMPTS = {
170
+ "Chief Complaint": """Task: Extract the main reason for patient visit.
171
+
172
+ Clinical Note:
173
+ {context}
174
+
175
+ Answer with only the chief complaint (1-2 sentences):""",
176
+
177
+ "HPI": """Task: Extract the history of present illness including symptom onset, progression, and context.
178
+
179
+ Clinical Note:
180
+ {context}
181
+
182
+ Answer with the history of present illness:""",
183
+
184
+ "PMH": """Task: Extract past medical history including chronic conditions, past surgeries, and social history.
185
+
186
+ Clinical Note:
187
+ {context}
188
+
189
+ Answer with past medical history:""",
190
+
191
+ "Medications": """Task: List all medications with dosages mentioned in the note.
192
+
193
+ Clinical Note:
194
+ {context}
195
+
196
+ Answer with medication list:""",
197
+
198
+ "Allergies": """Task: Extract drug allergies. If none mentioned, state "No known drug allergies".
199
+
200
+ Clinical Note:
201
+ {context}
202
+
203
+ Answer with allergies:""",
204
+
205
+ "Assessment": """Task: Extract diagnosis, test results, physical findings, and vital signs.
206
+
207
+ Clinical Note:
208
+ {context}
209
+
210
+ Answer with assessment and findings:""",
211
+
212
+ "Plan": """Task: Extract treatment plan, medications prescribed, follow-up appointments, and discharge instructions.
213
+
214
+ Clinical Note:
215
+ {context}
216
+
217
+ Answer with treatment plan:"""
218
+ }
219
+
220
+ # -----------------------------
221
+ # Enhanced extraction pipeline
222
+ # -----------------------------
223
+ def extract_section_multistage(tokenizer, model, context: str, section: str) -> str:
224
+ """
225
+ Extract a single section using targeted prompting
226
+ """
227
+ if section not in SECTION_PROMPTS:
228
+ return "None stated"
229
+
230
+ # Truncate context if too long
231
+ max_context_chars = 2000
232
+ if len(context) > max_context_chars:
233
+ context = context[:max_context_chars] + "..."
234
+
235
+ prompt = SECTION_PROMPTS[section].format(context=context)
236
+
237
+ try:
238
+ result = t5_generate(tokenizer, model, prompt, max_input_tokens=512, max_output_tokens=200, section_name=section)
239
+ result = result.strip()
240
+
241
+ # Remove any section headers the model might have added
242
+ result = re.sub(r'^(Chief Complaint|HPI|PMH|Medications|Allergies|Assessment|Plan)\s*:\s*', '', result, flags=re.IGNORECASE)
243
+
244
+ # Check if extraction failed
245
+ if not result or len(result) < 5 or result.lower() in ["none", "none stated", "not mentioned", "n/a", "na"]:
246
+ return "None stated"
247
+
248
+ return result.strip()
249
+ except Exception as e:
250
+ print(f" ✗ Error extracting {section}: {str(e)}")
251
+ return "None stated"
252
+
253
+ def validate_extraction(sections: Dict[str, str]) -> bool:
254
+ """
255
+ Validate that extraction was successful (not all 'None stated')
256
+ """
257
+ non_empty = sum(1 for v in sections.values() if v and v != "None stated")
258
+ return non_empty >= 2 # At least 2 sections should have content
259
+
260
+ def summarize_docs_multistage(tokenizer, model, docs: List[Document]) -> str:
261
+ """
262
+ Multi-stage extraction: extract each section independently
263
+ """
264
+ print(f"\n📄 Processing documents...")
265
+ contents = dedupe_texts([d.page_content for d in docs if d and d.page_content])
266
+
267
+ if not contents:
268
+ print(" ⚠ No content to summarize!")
269
+ return format_output({sec: "None stated" for sec in SECTION_ORDER})
270
+
271
+ # Combine all retrieved content
272
+ full_context = "\n\n".join(contents)
273
+ print(f" ℹ Combined context length: {len(full_context)} characters")
274
+
275
+ # Extract each section independently
276
+ print(f"\n🔄 Extracting sections (this may take 1-3 minutes on CPU)...")
277
+ sections = {}
278
+ for i, section in enumerate(SECTION_ORDER, 1):
279
+ print(f" [{i}/{len(SECTION_ORDER)}] {section}:")
280
+ sections[section] = extract_section_multistage(tokenizer, model, full_context, section)
281
+
282
+ # Validate extraction
283
+ print(f"\n✓ Extraction complete")
284
+ if not validate_extraction(sections):
285
+ print("⚠ WARNING: Extraction appears incomplete. Most sections are empty.")
286
+ print(" Possible issues:")
287
+ print(" • Vector retrieval may not be finding relevant content")
288
+ print(" • Model may not understand the clinical text format")
289
+ print(" • Context may be too short or fragmented")
290
+ print(" • De-identification artifacts may be confusing the model")
291
+
292
+ return format_output(sections)
293
+
294
+ def format_output(sections: Dict[str, str]) -> str:
295
+ """
296
+ Format sections into structured output
297
+ """
298
+ output_lines = []
299
+ for section in SECTION_ORDER:
300
+ content = sections.get(section, "None stated")
301
+ output_lines.append(f"• {section}: {content}")
302
+
303
+ return "\n".join(output_lines)
304
+
305
+ # -----------------------------
306
+ # Summary Quality Validation
307
+ # -----------------------------
308
+ def validate_summary_quality(summary: str, original_text: str = "") -> dict:
309
+ """
310
+ Validate summary quality and detect common issues
311
+
312
+ Args:
313
+ summary: The generated summary text
314
+ original_text: Optional original note text for comparison
315
+
316
+ Returns:
317
+ Dictionary with validation results
318
+ """
319
+ issues = []
320
+ warnings = []
321
+
322
+ # Check for placeholder contamination (de-ID over-redaction)
323
+ placeholder_patterns = [
324
+ (r'\[LOCATION\]', 'LOCATION'),
325
+ (r'\[DATE\]', 'DATE'),
326
+ (r'\[NAME\]', 'NAME'),
327
+ (r'\[PHONE\]', 'PHONE')
328
+ ]
329
+
330
+ total_placeholders = 0
331
+ for pattern, name in placeholder_patterns:
332
+ count = len(re.findall(pattern, summary))
333
+ total_placeholders += count
334
+ if count > 2:
335
+ warnings.append(f"Too many [{name}] placeholders ({count}) - de-identification may be over-aggressive")
336
+
337
+ if total_placeholders > 5:
338
+ issues.append(f"Critical: {total_placeholders} PHI placeholders in summary - clinical content lost")
339
+
340
+ # Check for "None stated" sections
341
+ none_count = summary.count("None stated")
342
+ if none_count >= 5:
343
+ issues.append(f"Critical: {none_count}/7 sections are empty - summarization failed")
344
+ elif none_count >= 3:
345
+ warnings.append(f"Warning: {none_count}/7 sections are empty - may need better retrieval")
346
+
347
+ # Check for minimum content length per section
348
+ total_length = len(summary)
349
+ # Subtract bullets and "None stated" overhead
350
+ content_length = total_length - (summary.count("•") * 2) - (none_count * 11)
351
+ filled_sections = 7 - none_count
352
+
353
+ if filled_sections > 0:
354
+ avg_section_length = content_length / filled_sections
355
+ if avg_section_length < 30:
356
+ warnings.append(f"Warning: Sections too short (avg {avg_section_length:.0f} chars) - may lack detail")
357
+
358
+ # Check for duplicate medications
359
+ if "Medications:" in summary:
360
+ meds_section = summary.split("Medications:")[1].split("•")[0] if "Medications:" in summary else ""
361
+ meds_lower = meds_section.lower()
362
+ common_meds = ['atorvastatin', 'metoprolol', 'lisinopril', 'aspirin', 'metformin']
363
+ for med in common_meds:
364
+ if meds_lower.count(med) > 1:
365
+ warnings.append(f"Warning: Duplicate medication detected: {med}")
366
+
367
+ # Calculate quality score (0-100)
368
+ score = 100
369
+ score -= len(issues) * 30 # Critical issues: -30 each
370
+ score -= len(warnings) * 10 # Warnings: -10 each
371
+ score = max(0, min(100, score))
372
+
373
+ # Determine overall status
374
+ if len(issues) > 0:
375
+ status = "FAILED"
376
+ elif len(warnings) > 2:
377
+ status = "POOR"
378
+ elif len(warnings) > 0:
379
+ status = "FAIR"
380
+ else:
381
+ status = "GOOD"
382
+
383
+ return {
384
+ "is_valid": len(issues) == 0,
385
+ "status": status,
386
+ "quality_score": score,
387
+ "issues": issues,
388
+ "warnings": warnings,
389
+ "metrics": {
390
+ "total_placeholders": total_placeholders,
391
+ "empty_sections": none_count,
392
+ "filled_sections": filled_sections,
393
+ "total_length": total_length
394
+ }
395
+ }
396
+
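+ # Illustrative usage of the validation report (e.g. from main.py / the Streamlit UI):
+ #   report = validate_summary_quality(summary)
+ #   if not report["is_valid"]:
+ #       print("\n".join(report["issues"] + report["warnings"]))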
409
+ # -----------------------------
410
+ # Single-shot extraction (simplified fallback)
411
+ # -----------------------------
412
+ def summarize_docs_singleshot(tokenizer, model, docs: List[Document]) -> str:
413
+ """
414
+ Single-shot extraction method (faster but less comprehensive)
415
+ """
416
+ print(f"\n📄 Processing documents...")
417
+ contents = dedupe_texts([d.page_content for d in docs if d and d.page_content])
418
+
419
+ if not contents:
420
+ print(" ⚠ No content to summarize!")
421
+ return format_output({sec: "None stated" for sec in SECTION_ORDER})
422
+
423
+ raw_context = "\n\n".join(contents)
424
+ print(f" ℹ Combined context length: {len(raw_context)} characters")
425
+
426
+ # Simplified prompt for single-shot
427
+ instruction = """Summarize this clinical note into 7 sections:
428
+ 1. Chief Complaint (main reason for visit)
429
+ 2. HPI (symptom history and progression)
430
+ 3. PMH (past medical history)
431
+ 4. Medications (current medications with doses)
432
+ 5. Allergies (drug allergies)
433
+ 6. Assessment (diagnosis and findings)
434
+ 7. Plan (treatment plan and follow-up)
435
+
436
+ Clinical Note:
437
+ {context}
438
+
439
+ Structured Summary:"""
440
+
441
+ print(f"\n🔄 Generating structured summary...")
442
+ prompt = instruction.format(context=raw_context[:2000]) # Limit context
443
+ model_out = t5_generate(tokenizer, model, prompt, max_input_tokens=512, max_output_tokens=400)
444
+
445
+ # Parse output into sections
446
+ sections = parse_output_to_sections(model_out)
447
+
448
+ return format_output(sections)
449
+
450
+ def parse_output_to_sections(text: str) -> Dict[str, str]:
451
+ """
452
+ Parse model output into section dictionary
453
+ """
454
+ sections = {}
455
+ current_section = None
456
+ current_content = []
457
+
458
+ for line in text.split('\n'):
459
+ line = line.strip()
460
+ if not line:
461
+ continue
462
+
463
+ # Check if line starts with a section header
464
+ matched_section = None
465
+ for section in SECTION_ORDER:
466
+ # Match section headers with numbers or bullets
467
+ pattern = rf'^(\d+\.\s*)?{re.escape(section)}\s*:?'
468
+ if re.match(pattern, line, re.IGNORECASE):
469
+ matched_section = section
470
+ break
471
+
472
+ if matched_section:
473
+ # Save previous section
474
+ if current_section:
475
+ sections[current_section] = " ".join(current_content).strip()
476
+
477
+ # Start new section
478
+ current_section = matched_section
479
+ # Get content after the header
480
+ content = re.sub(rf'^(\d+\.\s*)?{re.escape(matched_section)}\s*:?\s*', '', line, flags=re.IGNORECASE).strip()
481
+ current_content = [content] if content else []
482
+ else:
483
+ # Continue current section
484
+ if current_section:
485
+ current_content.append(line)
486
+
487
+ # Save last section
488
+ if current_section:
489
+ sections[current_section] = " ".join(current_content).strip()
490
+
491
+ # Fill in missing sections
492
+ for section in SECTION_ORDER:
493
+ if section not in sections or not sections[section]:
494
+ sections[section] = "None stated"
495
+
496
+ return sections
497
+
498
+ # -----------------------------
499
+ # Backward compatibility wrapper for Streamlit integration
500
+ # -----------------------------
501
+ def summarize_docs(tokenizer, model, docs: List[Document], method: str = "multistage") -> str:
502
+ """
503
+ Wrapper function for backward compatibility with main.py (Streamlit UI)
504
+
505
+ Args:
506
+ tokenizer: T5 tokenizer instance
507
+ model: T5 model instance
508
+ docs: List of retrieved documents
509
+ method: "multistage" (default) or "singleshot" extraction method
510
+
511
+ Returns:
512
+ Formatted summary string with sections
513
+ """
514
+ if method == "multistage":
515
+ return summarize_docs_multistage(tokenizer, model, docs)
516
+ else:
517
+ return summarize_docs_singleshot(tokenizer, model, docs)
518
+
519
+ # -----------------------------
520
+ # Orchestration
521
+ # -----------------------------
522
+ def main():
523
+ parser = argparse.ArgumentParser(description="Day 10: Enhanced HIPAA-compliant RAG clinical summarizer")
524
+ parser.add_argument("--db_type", choices=["chroma", "faiss"], default="chroma")
525
+ parser.add_argument("--persist_dir", default="./data/vector_store")
526
+ parser.add_argument("--collection", default="notes")
527
+ parser.add_argument("--embed_model", default="sentence-transformers/all-MiniLM-L6-v2")
528
+ parser.add_argument("--model_name", default="google/flan-t5-small")
529
+ parser.add_argument("--query", required=True)
530
+ parser.add_argument("--top_k", type=int, default=5)
531
+ parser.add_argument("--out", default="./data/outputs/summaries/summary.txt")
532
+ parser.add_argument("--method", choices=["multistage", "singleshot"], default="multistage",
533
+ help="Extraction method: multistage (recommended) or singleshot (faster)")
534
+ args = parser.parse_args()
535
+
536
+ print("=" * 70)
537
+ print(" HIPAA-COMPLIANT RAG CLINICAL SUMMARIZER")
538
+ print("=" * 70)
539
+
540
+ out_dir = os.path.dirname(args.out) or "."
541
+ os.makedirs(out_dir, exist_ok=True)
542
+
543
+ try:
544
+ # Step 1: Load embedder
545
+ print(f"\n[1/4] LOADING EMBEDDER")
546
+ print(f" Model: {args.embed_model}")
547
+ embed_f = load_embedder(args.embed_model)
548
+
549
+ # Step 2: Retrieve documents
550
+ print(f"\n[2/4] RETRIEVING DOCUMENTS")
551
+ print(f" Database: {args.db_type}")
552
+ print(f" Location: {args.persist_dir}")
553
+ print(f" Query: {args.query}")
554
+ print(f" Top-K: {args.top_k}")
555
+ docs = retrieve_docs(args.db_type, args.persist_dir, args.collection, args.query, args.top_k, embed_f)
556
+
557
+ if not docs:
558
+ print("\n⚠ ERROR: No documents retrieved from vector database!")
559
+ print(" Possible causes:")
560
+ print(" • Vector database is empty or not properly indexed")
561
+ print(" • Query doesn't match indexed content")
562
+ print(" • Database path is incorrect")
563
+ result = format_output({sec: "None stated" for sec in SECTION_ORDER})
564
+ with open(args.out, "w", encoding="utf-8") as f:
565
+ f.write(result)
566
+ print(f"\n✓ Empty summary written to {args.out}")
567
+ return
568
+
569
+ # Step 3: Load summarization model
570
+ print(f"\n[3/4] LOADING SUMMARIZATION MODEL")
571
+ print(f" Model: {args.model_name}")
572
+ tokenizer, model = make_t5(args.model_name)
573
+
574
+ # Step 4: Generate summary
575
+ print(f"\n[4/4] GENERATING SUMMARY")
576
+ print(f" Method: {args.method}")
577
+
578
+ if args.method == "multistage":
579
+ summary = summarize_docs_multistage(tokenizer, model, docs)
580
+ else:
581
+ summary = summarize_docs_singleshot(tokenizer, model, docs)
582
+
583
+ # Write summary to output file
584
+ with open(args.out, "w", encoding="utf-8") as f:
585
+ f.write(summary)
586
+
587
+ print(f"\n{'=' * 70}")
588
+ print(f"✓ SUCCESS: Summary written to {args.out}")
589
+ print(f"{'=' * 70}")
590
+ print("\nGenerated Summary:")
591
+ print("-" * 70)
592
+ print(summary)
593
+ print("-" * 70)
594
+
595
+ except Exception as e:
596
+ err = traceback.format_exc()
597
+ error_msg = f"ERROR during summarization:\n{err}"
598
+
599
+ # Write error to file
600
+ with open(args.out, "w", encoding="utf-8") as f:
601
+ f.write(error_msg)
602
+
603
+ print(f"\n{'=' * 70}")
604
+ print(f"✗ ERROR: An error occurred during processing")
605
+ print(f"{'=' * 70}")
606
+ print(f"\n{err}")
607
+ print(f"\nError details written to {args.out}")
608
+
609
+ if __name__ == "__main__":
610
+ main()
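Example invocation (all flags match the argparse definitions in main(); the script path and query string are illustrative):

python summarizer.py --db_type chroma --persist_dir ./data/vector_store --collection notes --model_name google/flan-t5-small --query "Summarize into HPI/Assessment/Plan" --top_k 5 --method multistage --out ./data/outputs/summaries/summary.txt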