import gradio as gr
from transformers import pipeline

# Build the Stanford de-identification pipeline once, at import time, so every
# request handled below reuses the same loaded model.
print("Loading Stanford PHI detector model...")
phi_detector = pipeline(
    task="token-classification",
    model="StanfordAIMI/stanford-deidentifier-base",
    device=-1,  # CPU mode
    aggregation_strategy="simple",
)
print("Model loaded successfully!")


def _fenced(content):
    """Return *content* wrapped in a Markdown code fence it cannot terminate.

    The fence is three backticks unless *content* itself contains a run of
    three or more backticks; in that case the fence is made one backtick
    longer than the longest run so user text cannot close the block early.
    """
    longest_run = current_run = 0
    for char in content:
        current_run = current_run + 1 if char == "`" else 0
        longest_run = max(longest_run, current_run)
    fence = "`" * max(3, longest_run + 1)
    return f"{fence}\n{content}\n{fence}\n"


def _unchanged_report(heading, text):
    """Build the report shown when nothing is redacted (text shown twice)."""
    return "".join(
        [
            f"## ✅ {heading}\n\n",
            "**Original Text:**\n",
            _fenced(text),
            "\n",
            "**Redacted Text:**\n",
            _fenced(text),
        ]
    )


def _select_entities(text, results):
    """Validate raw detections and return non-overlapping ones, ordered by start.

    Detections without character offsets, with out-of-range offsets, or that
    cover only whitespace are dropped. When detections overlap, the one that
    starts first wins (the longest one when several start at the same index).
    """
    text_len = len(text)
    candidates = []

    for entity in results:
        start = entity.get("start")
        end = entity.get("end")
        phi_type = entity.get("entity_group", "UNKNOWN")

        # Offsets may be absent or None; without them we cannot redact safely.
        if start is None or end is None:
            print(f"Warning: Skipping {phi_type} entity without character offsets")
            continue

        # Log only type and positions: the entity payload itself is PHI.
        if start < 0 or end > text_len or start >= end:
            print(
                f"Warning: Invalid entity positions {start}-{end} "
                f"for entity type: {phi_type}"
            )
            continue

        # Slice the original text rather than trusting the model's own string.
        entity_text = text[start:end]
        if not entity_text.strip():
            continue

        candidates.append(
            {
                "start": start,
                "end": end,
                "text": entity_text,
                "type": phi_type,
                "score": entity.get("score", 0.0),
            }
        )

    # Sorted by start (longest first on ties), so an entity overlaps an
    # already-kept one exactly when it starts before the last kept end.
    candidates.sort(key=lambda item: (item["start"], -item["end"]))
    selected = []
    last_end = 0
    for item in candidates:
        if item["start"] >= last_end:
            selected.append(item)
            last_end = item["end"]
    return selected


def detect_and_redact_phi(text, *, detector=None):
    """
    Detect and redact PHI in text using Stanford's PHI detector

    Each detected span is replaced by a ``[TYPE]`` tag. Raw detections are
    never printed, so PHI does not end up in the process logs.

    Args:
        text: Input text to analyze
        detector: Optional callable taking the text and returning a list of
            entity dicts with ``start``, ``end``, ``entity_group`` and
            ``score`` keys. Defaults to the module-level ``phi_detector``.

    Returns:
        Markdown string listing the detected entities followed by the
        original and redacted text; a warning string for empty input; or an
        error string if detection fails.
    """
    if not text or not text.strip():
        return "⚠️ Please enter some text to analyze."

    try:
        if detector is None:
            detector = phi_detector

        results = detector(text)
        if not results:
            return _unchanged_report("No PHI Detected", text)

        entities = _select_entities(text, results)
        if not entities:
            return _unchanged_report("No Valid PHI Detected", text)

        # Entities are ordered and non-overlapping, so the redacted text can
        # be assembled in one forward pass over the original string.
        pieces = []
        cursor = 0
        for item in entities:
            pieces.append(text[cursor : item["start"]])
            pieces.append(f"[{item['type']}]")
            cursor = item["end"]
        pieces.append(text[cursor:])
        redacted_text = "".join(pieces)

        entity_lines = [
            f"{idx}. **{item['text']}** → `{item['type']}` "
            f"(Confidence: {item['score']:.2%})\n"
            for idx, item in enumerate(entities, 1)
        ]

        return "".join(
            [
                "## 🔍 PHI Detection & Redaction Results\n\n",
                f"**Found {len(entities)} PHI entity(ies):**\n\n",
                *entity_lines,
                "\n---\n\n",
                "### 📄 Original Text\n",
                _fenced(text),
                "\n",
                "### 🔒 Redacted Text\n",
                _fenced(redacted_text),
            ]
        )

    except Exception as e:
        # Top-level boundary for the UI callback: log the traceback and show
        # a short message instead of crashing the request.
        import traceback

        print(f"Error in detect_and_redact_phi: {traceback.format_exc()}")
        return f"❌ **Error:** {str(e)}"


# Gradio UI: one multi-line textbox in, one Markdown report out, with the
# redaction function above as the callback.
demo = gr.Interface(
    fn=detect_and_redact_phi,
    inputs=gr.Textbox(
        lines=8,
        label="Enter Text to Analyze",
        placeholder="Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024.",
    ),
    outputs=gr.Markdown(label="PHI Detection & Redaction Results"),
    theme="soft",
    title="🏥 Stanford PHI Detector & Redactor",
    description="Detect and redact Protected Health Information (PHI) using Stanford's de-identification model.",
    # Each example is a one-element row matching the single input component.
    examples=[
        ["Patient John Doe, SSN: 123-45-6789, visited on 01/15/2024."],
        ["Jane Smith, DOB: 03/22/1980, Phone: (555) 123-4567, Address: 123 Main St, Boston, MA"],
        ["MRN: 98765432. Dr. Anderson saw the patient at Massachusetts General Hospital on December 15, 2024."],
    ],
)

if __name__ == "__main__":
    # Start the web server only when run as a script, not on import.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,  # Set to True for public link
    }
    demo.launch(**launch_options)