Spaces:
Running
Running
shree256
Refactor PHI detection logic to validate entity positions, remove overlaps, and improve redaction accuracy. Added error handling and enhanced output formatting for detected entities.
b9efb1f
| import gradio as gr | |
| from transformers import pipeline | |
# Build the Stanford de-identification pipeline once, at import time, so that
# every call to the request handler reuses the same loaded model.
_PHI_PIPELINE_OPTIONS = {
    "model": "StanfordAIMI/stanford-deidentifier-base",
    "aggregation_strategy": "simple",
    "device": -1,  # CPU mode
}

print("Loading Stanford PHI detector model...")
phi_detector = pipeline("token-classification", **_PHI_PIPELINE_OPTIONS)
print("Model loaded successfully!")
def _code_fence(*contents):
    """Return a backtick fence longer than any backtick run in *contents*.

    A fenced Markdown block ends at a line of backticks at least as long as the
    opening fence, so user text containing "```" could otherwise close the
    block early and have its remainder rendered as Markdown.
    """
    longest = 0
    for content in contents:
        run = 0
        for char in content:
            run = run + 1 if char == "`" else 0
            longest = max(longest, run)
    return "`" * max(3, longest + 1)


def _render_comparison(original_heading, original, redacted_heading, redacted):
    """Render the original and redacted texts as two fenced Markdown sections."""
    fence = _code_fence(original, redacted)
    return (
        f"{original_heading}\n{fence}\n{original}\n{fence}\n\n"
        f"{redacted_heading}\n{fence}\n{redacted}\n{fence}\n"
    )


def _valid_entities(text, results):
    """Keep only detections whose span is in bounds and covers non-blank text."""
    text_len = len(text)
    entities = []
    for entity in results:
        start = entity.get("start")
        end = entity.get("end")
        # Offsets may be absent or None; comparing None with an int would
        # raise TypeError and abort the whole request, so skip such entities.
        if start is None or end is None:
            print("Warning: Skipping entity without character offsets")
            continue
        if start < 0 or end > text_len or start >= end:
            # Log positions only: the entity itself contains the PHI text.
            print(f"Warning: Invalid entity positions {start}-{end}")
            continue
        # Slice the input rather than trusting any text carried by the entity.
        span_text = text[start:end]
        if not span_text.strip():
            continue
        entities.append(
            {
                "start": start,
                "end": end,
                "text": span_text,
                "type": entity.get("entity_group", "UNKNOWN"),
                "score": entity.get("score", 0.0),
            }
        )
    return entities


def _drop_overlaps(entities):
    """Return entities in text order, dropping any that overlap a kept one.

    Sorting by (start, -end) puts the longest span first among those sharing a
    start, so that one is kept. Because kept spans never overlap, comparing
    against the end of the most recently kept span is sufficient.
    """
    kept = []
    last_end = 0
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        if entity["start"] >= last_end:
            kept.append(entity)
            last_end = entity["end"]
    return kept


def _redact(text, entities):
    """Replace each span with a ``[TYPE]`` tag.

    *entities* must be non-overlapping and sorted by start position. The result
    is assembled left to right from slices of the original text, so no offset
    is ever applied to an already modified string.
    """
    pieces = []
    cursor = 0
    for entity in entities:
        pieces.append(text[cursor:entity["start"]])
        pieces.append(f"[{entity['type']}]")
        cursor = entity["end"]
    pieces.append(text[cursor:])
    return "".join(pieces)


def detect_and_redact_phi(text) -> str:
    """
    Detect and redact PHI in text using Stanford's PHI detector

    Args:
        text: Input text to analyze

    Returns:
        Markdown string. For blank input, a prompt to enter text. Otherwise a
        report listing each detected entity (text, type, confidence) followed
        by the original text and the redacted text, where every detected span
        is replaced by ``[TYPE]``. If anything raises, an error message is
        returned instead of propagating the exception.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze."

    try:
        # The raw predictions are deliberately not printed: they contain PHI.
        results = phi_detector(text)
        entities = _drop_overlaps(_valid_entities(text, results)) if results else []

        if not entities:
            heading = "## ✅ No Valid PHI Detected" if results else "## ✅ No PHI Detected"
            return f"{heading}\n\n" + _render_comparison(
                "**Original Text:**", text, "**Redacted Text:**", text
            )

        redacted_text = _redact(text, entities)

        lines = [
            "## 🔍 PHI Detection & Redaction Results\n\n",
            f"**Found {len(entities)} PHI entity(ies):**\n\n",
        ]
        for idx, entity in enumerate(entities, 1):
            lines.append(
                f"{idx}. **{entity['text']}** → `{entity['type']}` "
                f"(Confidence: {entity['score']:.2%})\n"
            )
        lines.append("\n---\n\n")
        lines.append(
            _render_comparison(
                "### 📄 Original Text", text, "### 🔒 Redacted Text", redacted_text
            )
        )
        return "".join(lines)

    except Exception as e:
        # UI boundary: report the failure in the output pane instead of
        # crashing the handler, and keep the traceback in the server log.
        import traceback

        print(f"Error in detect_and_redact_phi: {traceback.format_exc()}")
        return f"❌ **Error:** {str(e)}"
# Create Gradio interface: one multi-line textbox in, one Markdown report out,
# with detect_and_redact_phi as the handler and three sample inputs.
demo = gr.Interface(
    fn=detect_and_redact_phi,
    inputs=gr.Textbox(
        label="Enter Text to Analyze",
        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
        lines=8,
    ),
    outputs=gr.Markdown(label="PHI Detection & Redaction Results"),
    title="🏥 Stanford PHI Detector & Redactor",
    description="Detect and redact Protected Health Information (PHI) using Stanford's de-identification model.",
    examples=[
        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
        [
            "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"
        ],
        [
            "MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."
        ],
    ],
    theme="soft",
)
if __name__ == "__main__":
    # Start the web server only when this file is run as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # Set to True for public link
    }
    demo.launch(**launch_options)