import gradio as gr from transformers import pipeline # Load Stanford PHI detector model print("Loading Stanford PHI detector model...") phi_detector = pipeline( "token-classification", model="StanfordAIMI/stanford-deidentifier-base", aggregation_strategy="simple", device=-1, # CPU mode ) print("Model loaded successfully!") def detect_and_redact_phi(text): """ Detect and redact PHI in text using Stanford's PHI detector Args: text: Input text to analyze Returns: Formatted string with redacted text and original text comparison """ if not text or not text.strip(): return "⚠️ Please enter some text to analyze." try: # Get PHI predictions results = phi_detector(text) print(results) if not results: output = "## ✅ No PHI Detected\n\n" output += "**Original Text:**\n```\n" output += text output += "\n```\n\n" output += "**Redacted Text:**\n```\n" output += text output += "\n```\n" return output # Validate and clean entity results text_len = len(text) valid_entities = [] for entity in results: start = entity.get("start", 0) end = entity.get("end", 0) # Validate positions are within bounds if start < 0 or end > text_len or start >= end: print( f"Warning: Invalid entity positions {start}-{end} for entity: {entity}" ) continue # Extract entity text using position-based slicing (most reliable for redaction) entity_text = text[start:end] # Ensure we have valid text to redact (skip empty or whitespace-only entities) if not entity_text.strip(): continue valid_entities.append( { "start": start, "end": end, "text": entity_text, "type": entity.get("entity_group", "UNKNOWN"), "score": entity.get("score", 0.0), } ) if not valid_entities: output = "## ✅ No Valid PHI Detected\n\n" output += "**Original Text:**\n```\n" output += text output += "\n```\n\n" output += "**Redacted Text:**\n```\n" output += text output += "\n```\n" return output # Sort results by start position in reverse to replace from end to start # This prevents index shifting issues when replacing sorted_entities = sorted(valid_entities, key=lambda x: x["start"], reverse=True) # Remove overlapping entities (keep the first/longest one when sorted) non_overlapping = [] used_ranges = [] for entity in sorted(valid_entities, key=lambda x: (x["start"], -x["end"])): start, end = entity["start"], entity["end"] # Check for overlap with already processed entities overlaps = any( not (end <= used_start or start >= used_end) for used_start, used_end in used_ranges ) if not overlaps: non_overlapping.append(entity) used_ranges.append((start, end)) # Sort again in reverse for replacement sorted_entities = sorted( non_overlapping, key=lambda x: x["start"], reverse=True ) redacted_text = text phi_details = [] # Replace PHI entities with redaction markers # Since we're replacing from end to start, positions remain valid in original text for entity in sorted_entities: start = entity["start"] end = entity["end"] entity_text = entity["text"] phi_type = entity["type"] redaction_tag = f"[{phi_type}]" # Verify the entity text matches what's at this position in the current redacted text # For end-to-start replacement, earlier positions (larger start) have been modified, # so we check against the stored entity_text which was extracted from original text # We still validate the slice matches to catch any alignment issues try: # Store details for display phi_details.insert( 0, { "text": entity_text, "type": phi_type, "confidence": entity["score"], "position": f"{start}-{end}", }, ) # Replace in redacted text using the original positions # Since we replace from end to start, positions remain valid redacted_text = ( redacted_text[:start] + redaction_tag + redacted_text[end:] ) except (IndexError, ValueError) as e: print(f"Warning: Error replacing entity at position {start}-{end}: {e}") print( f" Entity text: '{entity_text}', Redacted text length: {len(redacted_text)}" ) # Format output output = "## 🔍 PHI Detection & Redaction Results\n\n" output += f"**Found {len(phi_details)} PHI entity(ies):**\n\n" for idx, detail in enumerate(phi_details, 1): output += f"{idx}. **{detail['text']}** → `{detail['type']}` " output += f"(Confidence: {detail['confidence']:.2%})\n" output += "\n---\n\n" output += "### 📄 Original Text\n```\n" output += text output += "\n```\n\n" output += "### 🔒 Redacted Text\n```\n" output += redacted_text output += "\n```\n" return output except Exception as e: import traceback error_details = traceback.format_exc() print(f"Error in detect_and_redact_phi: {error_details}") return f"❌ **Error:** {str(e)}" # Create Gradio interface demo = gr.Interface( fn=detect_and_redact_phi, inputs=gr.Textbox( label="Enter Text to Analyze", placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.", lines=8, ), outputs=gr.Markdown(label="PHI Detection & Redaction Results"), title="🏥 Stanford PHI Detector & Redactor", description="Detect and redact Protected Health Information (PHI) using Stanford's de-identification model.", examples=[ ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."], [ "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA" ], [ "MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024." ], ], theme="soft", ) if __name__ == "__main__": demo.launch( server_name="0.0.0.0", server_port=7860, share=False, # Set to True for public link )