Spaces:

shreepanicker
/

phi-detection

Running

App Files Community

shree256 committed 6 days ago

Commit

baf7357

1 Parent(s): 2a996fc

Replace BioBERT with Stanford PHI detector model for improved PHI detection; update function and interface accordingly.

Browse files

Files changed (1) hide show

app.py +46 -106

app.py CHANGED Viewed

@@ -1,142 +1,82 @@
-import warnings
 import gradio as gr
 from transformers import pipeline
-# Suppress known warnings
-warnings.filterwarnings("ignore", category=FutureWarning, module="torch.distributed")
-# Load BioBERT-based model for NER/PHI detection
-# Using a model fine-tuned for medical NER or a general NER model
-# You can replace with a specific PHI detection model if available
-MODEL_NAME = "dslim/bert-base-NER"  # General NER model as fallback
-# Alternative models for medical/PHI detection:
-# - "alvaroalon2/biobert_diseases_ner" (if available)
-# - "emilyalsentzer/Bio_ClinicalBERT" (clinical text)
-# - Any BioBERT-based model fine-tuned for NER
-# Initialize the NER pipeline
-try:
-    # Load a token classification model for NER
-    ner_pipeline = pipeline(
-        "token-classification",
-        model=MODEL_NAME,
-        tokenizer=MODEL_NAME,
-        aggregation_strategy="simple",
-        device=-1,  # Use CPU (-1) or GPU (0, 1, etc.)
-    )
-    print(f"Successfully loaded model: {MODEL_NAME}")
-except Exception as e:
-    print(f"Error loading model {MODEL_NAME}: {e}")
-    print("Trying alternative model...")
-    try:
-        # Fallback to a smaller/faster model
-        MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
-        ner_pipeline = pipeline(
-            "token-classification",
-            model=MODEL_NAME,
-            tokenizer=MODEL_NAME,
-            aggregation_strategy="simple",
-            device=-1,
-        )
-        print(f"Successfully loaded fallback model: {MODEL_NAME}")
-    except Exception as e2:
-        print(f"Error loading fallback model: {e2}")
-        ner_pipeline = None
-def detect_phi(text: str) -> str:
     """
-    Detect PHI (Protected Health Information) in the input text using NER model.
-    Note: For production use, replace with a BioBERT model fine-tuned specifically
-    for PHI detection (e.g., models trained on i2b2 or MIMIC datasets).
     Args:
-        text: Input text containing potential PHI
     Returns:
-        Formatted string showing detected entities
     """
     if not text or not text.strip():
-        return "Please enter some text to analyze."
-    if ner_pipeline is None:
-        return "Model not loaded. Please check the model configuration and install required dependencies."
     try:
-        # Run NER on the input text
-        entities = ner_pipeline(text)
-        if not entities:
-            return (
-                "**No entities detected in the text.**\n\nThis could mean:\n- The text doesn't contain recognizable entities\n- The model needs fine-tuning for PHI-specific detection\n\n**Original Text:**\n"
-                + text
-            )
-        # PHI-relevant entity types
-        phi_labels = [
-            "PER",
-            "PERSON",
-            "ORG",
-            "ORGANIZATION",
-            "LOC",
-            "LOCATION",
-            "MISC",
-            "DATE",
-            "TIME",
-        ]
-        # Format the results
-        result = "**Detected PHI Entities:**\n\n"
-        phi_count = 0
-        for entity in entities:
-            entity_text = entity.get("word", "")
-            entity_label = entity.get("entity_group", entity.get("label", "UNKNOWN"))
-            confidence = entity.get("score", 0.0)
-            # Highlight potential PHI entities
-            is_phi = any(phi_label in entity_label.upper() for phi_label in phi_labels)
-            phi_indicator = "🔴 PHI" if is_phi else "⚪"
-            result += (
-                f"{phi_indicator} **{entity_text}** → `{entity_label}` "
-                f"(Confidence: {confidence:.2%})\n"
-            )
-            if is_phi:
-                phi_count += 1
-        result += f"\n**Summary:** {phi_count} potential PHI entity(ies) detected out of {len(entities)} total entities.\n"
-        # Also show the original text
-        result += "\n---\n\n**Original Text:**\n"
-        result += text
-        return result
     except Exception as e:
-        return f"**Error processing text:** {str(e)}\n\nPlease check that the model is properly loaded and try again."
 # Create Gradio interface
 demo = gr.Interface(
     fn=detect_phi,
     inputs=gr.Textbox(
-        label="PHI Text Input",
-        placeholder="Enter text containing potential PHI (e.g., 'Patient John Doe, age 45, was admitted on 2024-01-15. SSN: 123-45-6789')",
-        lines=5,
     ),
     outputs=gr.Markdown(label="PHI Detection Results"),
-    title="BioBERT PHI Detection",
-    description="Enter text containing Protected Health Information (PHI) to detect entities using BioBERT model.",
 )
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
-        share=False,  # Set to True if you want a public link
-        show_error=True,
     )

 import gradio as gr
 from transformers import pipeline
+# Load Stanford PHI detector model
+print("Loading Stanford PHI detector model...")
+phi_detector = pipeline(
+    "token-classification",
+    model="StanfordAIMI/stanford-deidentifier-base",
+    aggregation_strategy="simple",
+    device=-1,  # CPU mode
+)
+print("Model loaded successfully!")
+def detect_phi(text):
     """
+    Detect PHI in text using Stanford's PHI detector
     Args:
+        text: Input text to analyze
     Returns:
+        Formatted string with detected PHI entities
     """
     if not text or not text.strip():
+        return "⚠️ Please enter some text to analyze."
     try:
+        # Get PHI predictions
+        results = phi_detector(text)
+        if not results:
+            return f"✅ **No PHI detected in the text.**\n\n**Original Text:**\n{text}"
+        # Format results
+        output = "## 🔍 PHI Detection Results\n\n"
+        output += f"**Found {len(results)} PHI entity(ies):**\n\n"
+        for idx, entity in enumerate(results, 1):
+            output += f"{idx}. **{entity['word']}**\n"
+            output += f"   - Type: `{entity['entity_group']}`\n"
+            output += f"   - Confidence: {entity['score']:.2%}\n"
+            output += f"   - Position: {entity['start']}-{entity['end']}\n\n"
+        output += "---\n\n**Original Text:**\n"
+        output += text
+        return output
     except Exception as e:
+        return f"❌ **Error:** {str(e)}"
 # Create Gradio interface
 demo = gr.Interface(
     fn=detect_phi,
     inputs=gr.Textbox(
+        label="Enter Text to Analyze",
+        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
+        lines=8,
     ),
     outputs=gr.Markdown(label="PHI Detection Results"),
+    title="🏥 Stanford PHI Detector",
+    description="Detect Protected Health Information (PHI) using Stanford's de-identification model.",
+    examples=[
+        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
+        [
+            "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"
+        ],
+        [
+            "MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."
+        ],
+    ],
+    theme="soft",
 )
 if __name__ == "__main__":
     demo.launch(
         server_name="0.0.0.0",
         server_port=7860,
+        share=False,  # Set to True for public link
     )