# Spaces: Running
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForTokenClassification | |
| from transformers import pipeline | |
| import torch | |
# Checkpoint loaded for token classification.
# NOTE(review): this is the base BioBERT checkpoint, not a model fine-tuned for
# PHI detection, so the predicted labels are presumably not meaningful PHI
# categories -- swap in a fine-tuned PHI/de-identification model for real use.
MODEL_NAME = "dmis-lab/biobert-v1.1"
# Alternative: use a medical NER model if available,
# e.g. "alvaroalon2/biobert_diseases_ner"

# Build the NER pipeline once at import time. Any failure is reported on
# stdout and leaves ``ner_pipeline`` as None; there is no actual fallback
# implementation, so callers must check for None before using it.
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    ner_pipeline = pipeline(
        "token-classification",
        model=MODEL_NAME,
        # Reuse the tokenizer loaded above instead of passing the checkpoint
        # name again, which would load it a second time.
        tokenizer=tokenizer,
        # "simple" merges adjacent tokens of the same label into one entity.
        aggregation_strategy="simple",
    )
except Exception as e:
    print(f"Error loading model: {e}")
    print("Falling back to a simpler approach...")
    ner_pipeline = None
def detect_phi(text: str) -> str:
    """Run the module-level NER pipeline on *text* and format the entities.

    Args:
        text: Input text containing potential PHI (Protected Health
            Information).

    Returns:
        A Markdown string. On success it lists one bullet per detected entity
        (text, label, confidence) followed by the unmodified input text.
        Otherwise it is a short message: a prompt when *text* is empty or
        whitespace-only, a notice when the model failed to load, a notice
        when nothing was detected, or an error description if processing
        raised.
    """
    if not text or not text.strip():
        return "Please enter some text to analyze."

    if ner_pipeline is None:
        return "Model not loaded. Please check the model configuration."

    # This function is the UI boundary: any failure is reported to the user
    # as a message rather than propagated.
    try:
        entities = ner_pipeline(text)
        if not entities:
            return "No entities detected in the text."

        parts = ["**Detected PHI Entities:**\n\n"]
        for entity in entities:
            # Prefer slicing the input by the reported character offsets:
            # the pipeline's "word" is re-decoded from tokens and can differ
            # from what the user typed (spacing around punctuation, "##"
            # pieces). Fall back to "word" when offsets are unavailable.
            start = entity.get("start")
            end = entity.get("end")
            if start is not None and end is not None:
                entity_text = text[start:end]
            else:
                entity_text = entity.get("word", "")

            # Aggregated results carry "entity_group"; non-aggregated ones
            # carry "entity". "label" is kept for backward compatibility.
            entity_label = (
                entity.get("entity_group")
                or entity.get("entity")
                or entity.get("label", "UNKNOWN")
            )
            confidence = entity.get("score", 0.0)
            parts.append(
                f"- **{entity_text}** ({entity_label}) - Confidence: {confidence:.2%}\n"
            )

        # Append the original text verbatim below the entity list.
        parts.append("\n---\n\n**Original Text:**\n")
        parts.append(text)
        return "".join(parts)
    except Exception as e:
        return f"Error processing text: {e}"
# Gradio UI: a multi-line text box feeds detect_phi, whose Markdown output is
# rendered in the results panel. Each example is a one-element row because the
# interface has a single input component.
demo = gr.Interface(
    fn=detect_phi,
    inputs=gr.Textbox(
        label="PHI Text Input",
        placeholder="Enter text containing potential PHI (e.g., 'Patient John Doe, age 45, was admitted on 2024-01-15. SSN: 123-45-6789')",
        lines=5,
    ),
    outputs=gr.Markdown(label="PHI Detection Results"),
    title="BioBERT PHI Detection",
    description="Enter text containing Protected Health Information (PHI) to detect entities using BioBERT model.",
    examples=[
        [
            # The contact field must be a real-looking email address so the
            # example exercises email PHI; example.com is a reserved domain.
            "Patient John Smith, age 52, was admitted to Memorial Hospital on January 15, 2024. Contact: john.smith@example.com"
        ],
        [
            "Dr. Sarah Johnson treated patient ID 12345 at the clinic located at 123 Main St, Boston, MA 02101."
        ],
        [
            "The patient's date of birth is 03/15/1975 and their medical record number is MRN-987654."
        ],
    ],
)

# Start the web server only when run as a script, not when imported.
if __name__ == "__main__":
    demo.launch()