Spaces:

shreepanicker
/

phi-detection

Running

phi-detection / app.py

shree256

updates with biobert

dbe4b72 6 days ago

3.26 kB

	import gradio as gr
	from transformers import AutoTokenizer, AutoModelForTokenClassification
	from transformers import pipeline
	import torch


	# Hugging Face checkpoint used for entity tagging.
	# NOTE(review): this looks like the base BioBERT checkpoint rather than one
	# fine-tuned for token classification, so the predicted labels are probably
	# not meaningful PHI categories -- confirm, and swap in a fine-tuned PHI/NER
	# model (e.g. "alvaroalon2/biobert_diseases_ner") if so.
	MODEL_NAME = "dmis-lab/biobert-v1.1"

	# Build the NER pipeline once, at import time. Any exception raised while
	# loading is reported on stdout and leaves `ner_pipeline` set to None, which
	# detect_phi() checks before running inference.
	try:
	    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
	    ner_pipeline = pipeline(
	        "token-classification",
	        model=MODEL_NAME,
	        # Reuse the tokenizer loaded above instead of passing the model name
	        # again, which would make the pipeline load it a second time.
	        tokenizer=tokenizer,
	        aggregation_strategy="simple",
	    )
	except Exception as e:
	    print(f"Error loading model: {e}")
	    print("Falling back to a simpler approach...")
	    ner_pipeline = None


	def detect_phi(text: str) -> str:
	    """Run the NER pipeline over *text* and format the detected entities.

	    Args:
	        text: Input text containing potential PHI.

	    Returns:
	        A report with one bullet per detected entity (text, label and
	        confidence) followed by the original text. A short explanatory
	        message is returned instead when the input is empty or blank, the
	        model is not loaded, nothing is detected, or inference raises.
	    """
	    if not text or not text.strip():
	        return "Please enter some text to analyze."

	    if ner_pipeline is None:
	        return "Model not loaded. Please check the model configuration."

	    try:
	        entities = ner_pipeline(text)

	        if not entities:
	            return "No entities detected in the text."

	        bullet_lines = []
	        for entity in entities:
	            entity_text = entity.get("word", "")
	            # Aggregated pipeline output carries "entity_group"; per-token
	            # (non-aggregated) output carries "entity". "label" is kept as
	            # a last fallback for backward compatibility.
	            entity_label = next(
	                (
	                    entity[key]
	                    for key in ("entity_group", "entity", "label")
	                    if key in entity
	                ),
	                "UNKNOWN",
	            )
	            confidence = entity.get("score", 0.0)
	            bullet_lines.append(
	                f"- {entity_text} ({entity_label}) - Confidence: {confidence:.2%}\n"
	            )

	        # Echo the input unchanged below the entity list (no highlighting).
	        return (
	            "Detected PHI Entities:\n\n"
	            + "".join(bullet_lines)
	            + "\n---\n\nOriginal Text:\n"
	            + text
	        )

	    except Exception as e:
	        # UI boundary: show the failure to the user rather than crashing.
	        return f"Error processing text: {e}"


	# Gradio UI: a multi-line textbox feeds detect_phi and the returned report
	# is rendered as Markdown. The examples are clickable sample inputs.
	demo = gr.Interface(
	    fn=detect_phi,
	    inputs=gr.Textbox(
	        label="PHI Text Input",
	        placeholder="Enter text containing potential PHI (e.g., 'Patient John Doe, age 45, was admitted on 2024-01-15. SSN: 123-45-6789')",
	        lines=5,
	    ),
	    outputs=gr.Markdown(label="PHI Detection Results"),
	    title="BioBERT PHI Detection",
	    description="Enter text containing Protected Health Information (PHI) to detect entities using BioBERT model.",
	    examples=[
	        [
	            # The contact address had been replaced by an email-obfuscation
	            # placeholder; a fictitious address on the reserved example.com
	            # domain restores an email for the detector to find.
	            "Patient John Smith, age 52, was admitted to Memorial Hospital on January 15, 2024. Contact: john.smith@example.com"
	        ],
	        [
	            "Dr. Sarah Johnson treated patient ID 12345 at the clinic located at 123 Main St, Boston, MA 02101."
	        ],
	        [
	            "The patient's date of birth is 03/15/1975 and their medical record number is MRN-987654."
	        ],
	    ],
	)

	# Start the web server only when the file is run as a script.
	if __name__ == "__main__":
	    demo.launch()