shree256 committed on
Commit
baf7357
·
1 Parent(s): 2a996fc

Replace BioBERT with Stanford PHI detector model for improved PHI detection; update function and interface accordingly.

Browse files
Files changed (1) hide show
  1. app.py +46 -106
app.py CHANGED
@@ -1,142 +1,82 @@
1
- import warnings
2
  import gradio as gr
3
  from transformers import pipeline
4
 
5
- # Suppress known warnings
6
- warnings.filterwarnings("ignore", category=FutureWarning, module="torch.distributed")
7
-
8
-
9
- # Load BioBERT-based model for NER/PHI detection
10
- # Using a model fine-tuned for medical NER or a general NER model
11
- # You can replace with a specific PHI detection model if available
12
- MODEL_NAME = "dslim/bert-base-NER" # General NER model as fallback
13
- # Alternative models for medical/PHI detection:
14
- # - "alvaroalon2/biobert_diseases_ner" (if available)
15
- # - "emilyalsentzer/Bio_ClinicalBERT" (clinical text)
16
- # - Any BioBERT-based model fine-tuned for NER
17
-
18
- # Initialize the NER pipeline
19
- try:
20
- # Load a token classification model for NER
21
- ner_pipeline = pipeline(
22
- "token-classification",
23
- model=MODEL_NAME,
24
- tokenizer=MODEL_NAME,
25
- aggregation_strategy="simple",
26
- device=-1, # Use CPU (-1) or GPU (0, 1, etc.)
27
- )
28
- print(f"Successfully loaded model: {MODEL_NAME}")
29
- except Exception as e:
30
- print(f"Error loading model {MODEL_NAME}: {e}")
31
- print("Trying alternative model...")
32
- try:
33
- # Fallback to a smaller/faster model
34
- MODEL_NAME = "dbmdz/bert-large-cased-finetuned-conll03-english"
35
- ner_pipeline = pipeline(
36
- "token-classification",
37
- model=MODEL_NAME,
38
- tokenizer=MODEL_NAME,
39
- aggregation_strategy="simple",
40
- device=-1,
41
- )
42
- print(f"Successfully loaded fallback model: {MODEL_NAME}")
43
- except Exception as e2:
44
- print(f"Error loading fallback model: {e2}")
45
- ner_pipeline = None
46
 
47
 
48
- def detect_phi(text: str) -> str:
49
  """
50
- Detect PHI (Protected Health Information) in the input text using NER model.
51
-
52
- Note: For production use, replace with a BioBERT model fine-tuned specifically
53
- for PHI detection (e.g., models trained on i2b2 or MIMIC datasets).
54
 
55
  Args:
56
- text: Input text containing potential PHI
57
 
58
  Returns:
59
- Formatted string showing detected entities
60
  """
61
  if not text or not text.strip():
62
- return "Please enter some text to analyze."
63
-
64
- if ner_pipeline is None:
65
- return "Model not loaded. Please check the model configuration and install required dependencies."
66
 
67
  try:
68
- # Run NER on the input text
69
- entities = ner_pipeline(text)
70
-
71
- if not entities:
72
- return (
73
- "**No entities detected in the text.**\n\nThis could mean:\n- The text doesn't contain recognizable entities\n- The model needs fine-tuning for PHI-specific detection\n\n**Original Text:**\n"
74
- + text
75
- )
76
-
77
- # PHI-relevant entity types
78
- phi_labels = [
79
- "PER",
80
- "PERSON",
81
- "ORG",
82
- "ORGANIZATION",
83
- "LOC",
84
- "LOCATION",
85
- "MISC",
86
- "DATE",
87
- "TIME",
88
- ]
89
-
90
- # Format the results
91
- result = "**Detected PHI Entities:**\n\n"
92
-
93
- phi_count = 0
94
- for entity in entities:
95
- entity_text = entity.get("word", "")
96
- entity_label = entity.get("entity_group", entity.get("label", "UNKNOWN"))
97
- confidence = entity.get("score", 0.0)
98
-
99
- # Highlight potential PHI entities
100
- is_phi = any(phi_label in entity_label.upper() for phi_label in phi_labels)
101
- phi_indicator = "🔴 PHI" if is_phi else "⚪"
102
 
103
- result += (
104
- f"{phi_indicator} **{entity_text}** → `{entity_label}` "
105
- f"(Confidence: {confidence:.2%})\n"
106
- )
107
 
108
- if is_phi:
109
- phi_count += 1
 
110
 
111
- result += f"\n**Summary:** {phi_count} potential PHI entity(ies) detected out of {len(entities)} total entities.\n"
 
 
 
 
112
 
113
- # Also show the original text
114
- result += "\n---\n\n**Original Text:**\n"
115
- result += text
116
 
117
- return result
118
 
119
  except Exception as e:
120
- return f"**Error processing text:** {str(e)}\n\nPlease check that the model is properly loaded and try again."
121
 
122
 
123
  # Create Gradio interface
124
  demo = gr.Interface(
125
  fn=detect_phi,
126
  inputs=gr.Textbox(
127
- label="PHI Text Input",
128
- placeholder="Enter text containing potential PHI (e.g., 'Patient John Doe, age 45, was admitted on 2024-01-15. SSN: 123-45-6789')",
129
- lines=5,
130
  ),
131
  outputs=gr.Markdown(label="PHI Detection Results"),
132
- title="BioBERT PHI Detection",
133
- description="Enter text containing Protected Health Information (PHI) to detect entities using BioBERT model.",
 
 
 
 
 
 
 
 
 
 
134
  )
135
 
136
  if __name__ == "__main__":
137
  demo.launch(
138
  server_name="0.0.0.0",
139
  server_port=7860,
140
- share=False, # Set to True if you want a public link
141
- show_error=True,
142
  )
 
 
1
  import gradio as gr
2
  from transformers import pipeline
3
 
4
# Build the Stanford de-identification pipeline once, when the module is
# imported, so every request handled by the app reuses the same instance.
print("Loading Stanford PHI detector model...")
_detector_options = {
    "model": "StanfordAIMI/stanford-deidentifier-base",
    # Group token-level predictions into entity-level results.
    "aggregation_strategy": "simple",
    # CPU mode
    "device": -1,
}
phi_detector = pipeline("token-classification", **_detector_options)
print("Model loaded successfully!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
 
15
def detect_phi(text):
    """
    Detect PHI in text using Stanford's PHI detector.

    Args:
        text: Input text to analyze. Empty or whitespace-only input is
            answered with a prompt and never reaches the model.

    Returns:
        Markdown-formatted string. On success it lists every detected PHI
        entity (text span, type, confidence, character position) followed
        by the original text. Otherwise it is a short status message: a
        prompt for empty input, a "no PHI" notice when the detector returns
        nothing, or the error text if anything raises.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze."

    try:
        # Get PHI predictions
        results = phi_detector(text)

        if not results:
            return f"✅ **No PHI detected in the text.**\n\n**Original Text:**\n{text}"

        # Format results; pieces are collected and joined once at the end
        # instead of growing a string with repeated +=.
        parts = [
            "## 🔍 PHI Detection Results\n\n",
            f"**Found {len(results)} PHI entity(ies):**\n\n",
        ]

        for idx, entity in enumerate(results, 1):
            start = entity.get("start")
            end = entity.get("end")

            # 'word' is text decoded back from tokens, so its casing and
            # spacing may differ from what the user typed. When character
            # offsets are available, show the exact slice of the input.
            if start is not None and end is not None:
                span = text[start:end] or entity["word"]
            else:
                span = entity["word"]

            parts.append(
                f"{idx}. **{span}**\n"
                f" - Type: `{entity['entity_group']}`\n"
                f" - Confidence: {entity['score']:.2%}\n"
                f" - Position: {start}-{end}\n\n"
            )

        parts.append("---\n\n**Original Text:**\n")
        parts.append(text)

        return "".join(parts)

    except Exception as e:
        # UI boundary: report the failure in the output panel rather than
        # letting the exception escape the handler.
        return f"❌ **Error:** {str(e)}"
52
 
53
 
54
  # Create Gradio interface
55
  demo = gr.Interface(
56
  fn=detect_phi,
57
  inputs=gr.Textbox(
58
+ label="Enter Text to Analyze",
59
+ placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
60
+ lines=8,
61
  ),
62
  outputs=gr.Markdown(label="PHI Detection Results"),
63
+ title="πŸ₯ Stanford PHI Detector",
64
+ description="Detect Protected Health Information (PHI) using Stanford's de-identification model.",
65
+ examples=[
66
+ ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
67
+ [
68
+ "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"
69
+ ],
70
+ [
71
+ "MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."
72
+ ],
73
+ ],
74
+ theme="soft",
75
  )
76
 
77
  if __name__ == "__main__":
78
  demo.launch(
79
  server_name="0.0.0.0",
80
  server_port=7860,
81
+ share=False, # Set to True for public link
 
82
  )