Spaces:
Running
Running
File size: 7,144 Bytes
0e8e166 dbe4b72 0e8e166 baf7357 dbe4b72 127dda2 dbe4b72 127dda2 dbe4b72 baf7357 dbe4b72 127dda2 dbe4b72 baf7357 dbe4b72 baf7357 64eec9f b9efb1f baf7357 127dda2 dbe4b72 b9efb1f 127dda2 b9efb1f 127dda2 b9efb1f 127dda2 b9efb1f 127dda2 b9efb1f 127dda2 b9efb1f 127dda2 b9efb1f 64eec9f 127dda2 64eec9f 127dda2 baf7357 127dda2 dbe4b72 baf7357 dbe4b72 b9efb1f baf7357 dbe4b72 127dda2 dbe4b72 baf7357 dbe4b72 127dda2 baf7357 dbe4b72 2a996fc baf7357 2a996fc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 |
import gradio as gr
from transformers import pipeline
# Build the Stanford de-identification pipeline once, at import time, so that
# every request handled by this module reuses the same loaded model.
print("Loading Stanford PHI detector model...")
phi_detector = pipeline(
    task="token-classification",
    model="StanfordAIMI/stanford-deidentifier-base",
    device=-1,  # -1 keeps inference on the CPU
    aggregation_strategy="simple",
)
print("Model loaded successfully!")
def _format_unchanged(heading, text):
    """Render the report used when nothing is redacted (*text* is shown twice)."""
    return (
        f"## ✅ {heading}\n\n"
        "**Original Text:**\n```\n"
        f"{text}"
        "\n```\n\n"
        "**Redacted Text:**\n```\n"
        f"{text}"
        "\n```\n"
    )


def _collect_valid_entities(results, text):
    """Return the predictions whose character span can safely slice *text*."""
    text_len = len(text)
    valid_entities = []
    for entity in results:
        start = entity.get("start")
        end = entity.get("end")
        phi_type = entity.get("entity_group", "UNKNOWN")

        # Without both offsets the entity cannot be located in the text.
        # Comparing None with an int would raise and fail the whole request.
        if start is None or end is None:
            print(f"Warning: Missing entity positions for {phi_type} entity")
            continue

        # Warnings report only span and type: the entity text itself is PHI
        # and must not end up in the server log.
        if start < 0 or end > text_len or start >= end:
            print(
                f"Warning: Invalid entity positions {start}-{end} for {phi_type} entity"
            )
            continue

        # Slice the original text rather than trusting the model's own
        # rendering of the entity, so the redaction matches exactly.
        entity_text = text[start:end]
        if not entity_text.strip():
            continue

        valid_entities.append(
            {
                "start": start,
                "end": end,
                "text": entity_text,
                "type": phi_type,
                "score": entity.get("score", 0.0),
            }
        )
    return valid_entities


def _drop_overlaps(entities):
    """Return non-overlapping entities in ascending start order.

    Among entities starting at the same position the longest one wins; any
    later entity that begins inside an already kept span is discarded.
    """
    kept = []
    covered_until = 0
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        # Kept spans are disjoint and ordered, so comparing against the end
        # of the last kept span is enough to detect an overlap.
        if entity["start"] >= covered_until:
            kept.append(entity)
            covered_until = entity["end"]
    return kept


def _apply_redactions(text, entities):
    """Replace each entity span in *text* with a ``[TYPE]`` marker.

    *entities* must be non-overlapping and sorted by ascending start.
    """
    pieces = []
    cursor = 0
    for entity in entities:
        pieces.append(text[cursor:entity["start"]])
        pieces.append(f"[{entity['type']}]")
        cursor = entity["end"]
    pieces.append(text[cursor:])
    return "".join(pieces)


def detect_and_redact_phi(text):
    """
    Detect and redact PHI in text using Stanford's PHI detector

    Args:
        text: Input text to analyze

    Returns:
        Markdown string listing the detected entities followed by the
        original and the redacted text. A short Markdown message is returned
        instead when the input is blank or when detection raises.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze."

    try:
        # The raw predictions contain PHI, so they are deliberately not logged.
        results = phi_detector(text)
        if not results:
            return _format_unchanged("No PHI Detected", text)

        valid_entities = _collect_valid_entities(results, text)
        if not valid_entities:
            return _format_unchanged("No Valid PHI Detected", text)

        entities = _drop_overlaps(valid_entities)
        redacted_text = _apply_redactions(text, entities)

        # Format output
        report = [
            "## 🔍 PHI Detection & Redaction Results\n\n",
            f"**Found {len(entities)} PHI entity(ies):**\n\n",
        ]
        for idx, entity in enumerate(entities, 1):
            report.append(
                f"{idx}. **{entity['text']}** → `{entity['type']}` "
                f"(Confidence: {entity['score']:.2%})\n"
            )
        report.append("\n---\n\n")
        report.append("### 📄 Original Text\n```\n")
        report.append(text)
        report.append("\n```\n\n")
        report.append("### 🔒 Redacted Text\n```\n")
        report.append(redacted_text)
        report.append("\n```\n")
        return "".join(report)
    except Exception as e:
        # Top-level UI boundary: log the traceback and show a short message
        # instead of letting the request crash.
        import traceback

        error_details = traceback.format_exc()
        print(f"Error in detect_and_redact_phi: {error_details}")
        return f"❌ **Error:** {str(e)}"
# Create Gradio interface: one multi-line text box in, one Markdown report out.
demo = gr.Interface(
    fn=detect_and_redact_phi,
    inputs=gr.Textbox(
        label="Enter Text to Analyze",
        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
        lines=8,
    ),
    outputs=gr.Markdown(label="PHI Detection & Redaction Results"),
    title="🏥 Stanford PHI Detector & Redactor",
    description="Detect and redact Protected Health Information (PHI) using Stanford's de-identification model.",
    # Each example is a one-element list because the interface has a single input.
    examples=[
        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
        [
            "Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"
        ],
        [
            "MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."
        ],
    ],
    theme="soft",
)
if __name__ == "__main__":
    # Start the web server only when this file is executed as a script.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # Set to True for public link
    }
    demo.launch(**launch_options)
|