Spaces:

shreepanicker
/

phi-detection

Running

File size: 7,144 Bytes

0e8e166
dbe4b72
0e8e166
baf7357
 
 
 
 
 
 
 
 
dbe4b72
 
127dda2
dbe4b72
127dda2
dbe4b72
 
baf7357
dbe4b72
 
127dda2
dbe4b72
 
baf7357
dbe4b72
 
baf7357
 
64eec9f
b9efb1f
 
baf7357
127dda2
 
 
 
 
 
 
 
dbe4b72
b9efb1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127dda2
b9efb1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127dda2
b9efb1f
127dda2
 
b9efb1f
 
127dda2
 
b9efb1f
 
127dda2
 
b9efb1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127dda2
 
 
b9efb1f
64eec9f
127dda2
 
 
64eec9f
127dda2
 
baf7357
127dda2
 
 
 
dbe4b72
baf7357
dbe4b72
 
b9efb1f
 
 
 
baf7357
dbe4b72
 
 
 
127dda2
dbe4b72
baf7357
 
 
dbe4b72
127dda2
 
 
baf7357
 
 
 
 
 
 
 
 
 
dbe4b72
 
 
2a996fc
 
 
baf7357
2a996fc

import gradio as gr
from transformers import pipeline

# Build the Stanford de-identification pipeline once, at import time, and
# keep it in a module-level name for the request handler to call.
print("Loading Stanford PHI detector model...")
phi_detector = pipeline(
    task="token-classification",
    model="StanfordAIMI/stanford-deidentifier-base",
    device=-1,  # CPU mode
    # "simple" makes the pipeline return grouped entities rather than
    # one prediction per token.
    aggregation_strategy="simple",
)
print("Model loaded successfully!")


def _format_no_phi_report(heading, text):
    """Build the Markdown report shown when nothing is redacted.

    The original and "redacted" sections both show ``text`` unchanged.
    """
    return (
        f"## ✅ {heading}\n\n"
        "**Original Text:**\n```\n"
        f"{text}"
        "\n```\n\n"
        "**Redacted Text:**\n```\n"
        f"{text}"
        "\n```\n"
    )


def _collect_valid_entities(text, results):
    """Keep only the predictions whose character span is usable on ``text``.

    A prediction is dropped when its offsets are missing/``None``, fall
    outside ``text``, describe an empty span, or cover only whitespace.
    """
    text_len = len(text)
    valid_entities = []

    for entity in results:
        start = entity.get("start")
        end = entity.get("end")

        # Offsets may be absent or None; treat both the same way as an
        # out-of-range span instead of letting the comparison raise.
        if (
            start is None
            or end is None
            or start < 0
            or end > text_len
            or start >= end
        ):
            # Log positions only: the entity payload itself contains PHI.
            print(f"Warning: Skipping entity with invalid positions {start}-{end}")
            continue

        # Slice the original text rather than trusting the model's "word"
        # field, so the displayed text is exactly what gets replaced.
        entity_text = text[start:end]
        if not entity_text.strip():
            continue

        valid_entities.append(
            {
                "start": start,
                "end": end,
                "text": entity_text,
                "type": entity.get("entity_group", "UNKNOWN"),
                "score": entity.get("score", 0.0),
            }
        )

    return valid_entities


def _drop_overlapping(entities):
    """Return entities in ascending start order with overlaps removed.

    Among entities sharing a start, the longest one is considered first;
    any entity that begins before the previously kept entity ends is dropped.
    """
    kept = []
    last_end = 0  # starts are validated as >= 0, so 0 never rejects the first
    for entity in sorted(entities, key=lambda e: (e["start"], -e["end"])):
        # Kept spans are disjoint and ordered, so comparing against the
        # most recent end is equivalent to checking every kept span.
        if entity["start"] >= last_end:
            kept.append(entity)
            last_end = entity["end"]
    return kept


def _apply_redactions(text, entities):
    """Replace each entity span in ``text`` with a ``[TYPE]`` tag.

    ``entities`` must be non-overlapping and sorted by ascending start.
    """
    pieces = []
    cursor = 0
    for entity in entities:
        pieces.append(text[cursor : entity["start"]])
        pieces.append(f"[{entity['type']}]")
        cursor = entity["end"]
    pieces.append(text[cursor:])
    return "".join(pieces)


def detect_and_redact_phi(text):
    """
    Detect and redact PHI in text using Stanford's PHI detector

    Args:
        text: Input text to analyze

    Returns:
        Markdown-formatted string listing the detected entities and showing
        the original text next to the redacted text. Empty or
        whitespace-only input returns a prompt to enter text; any exception
        raised while processing is reported as an error string instead of
        propagating.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze."

    try:
        # Raw predictions are deliberately not printed: they contain the
        # very PHI this function exists to remove.
        results = phi_detector(text)

        if not results:
            return _format_no_phi_report("No PHI Detected", text)

        entities = _drop_overlapping(_collect_valid_entities(text, results))

        if not entities:
            return _format_no_phi_report("No Valid PHI Detected", text)

        redacted_text = _apply_redactions(text, entities)

        # Format output
        report = [
            "## 🔍 PHI Detection & Redaction Results\n\n",
            f"**Found {len(entities)} PHI entity(ies):**\n\n",
        ]
        for idx, entity in enumerate(entities, 1):
            report.append(
                f"{idx}. **{entity['text']}** → `{entity['type']}` "
                f"(Confidence: {entity['score']:.2%})\n"
            )
        report.append("\n---\n\n")
        report.append("### 📄 Original Text\n```\n")
        report.append(text)
        report.append("\n```\n\n")
        report.append("### 🔒 Redacted Text\n```\n")
        report.append(redacted_text)
        report.append("\n```\n")

        return "".join(report)

    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and
        # show a short message rather than letting the request fail.
        import traceback

        error_details = traceback.format_exc()
        print(f"Error in detect_and_redact_phi: {error_details}")
        return f"❌ **Error:** {str(e)}"


# Gradio UI definition: one multi-line text input wired to
# detect_and_redact_phi, with the returned Markdown rendered as the output.
demo = gr.Interface(
    title="🏥 Stanford PHI Detector & Redactor",
    description="Detect and redact Protected Health Information (PHI) using Stanford's de-identification model.",
    theme="soft",
    fn=detect_and_redact_phi,
    inputs=gr.Textbox(
        lines=8,
        label="Enter Text to Analyze",
        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
    ),
    outputs=gr.Markdown(label="PHI Detection & Redaction Results"),
    # Sample inputs offered in the UI; each inner list is one example row.
    examples=[
        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
        ["Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"],
        ["MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."],
    ],
)

if __name__ == "__main__":
    # Start the web server only when this file is executed directly.
    demo.launch(
        share=False,  # Set to True for public link
        server_port=7860,
        server_name="0.0.0.0",
    )