"""
╔══════════════════════════════════════════════════════════════════════════════╗
║     HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD                 ║
║     First Zero-Parameter Topological Baseline for TruthfulQA                 ║
║                                                                              ║
║     Verified on full TruthfulQA (817 questions × 2 = 1634 samples)           ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""

import gradio as gr
import pandas as pd
import json
from datetime import datetime

# ═══════════════════════════════════════════════════════════════════════════════
# LEADERBOARD DATA - VERIFIED v14.2 RESULTS
# ═══════════════════════════════════════════════════════════════════════════════

# One dict per leaderboard row; every row carries the same keys, which become
# the table columns:
#   Model / Type / Parameters   - free-text labels
#   Pattern-Detectable Acc      - accuracy (%) on the pattern-detectable subset
#   Knowledge-Required Acc      - accuracy (%) on the remaining samples
#   Overall Acc                 - accuracy (%) over all samples
#   Free Queries                - share of queries answered without an LLM call
#   Latency (ms), Cost/1K       - per-query latency and cost per 1000 queries
#   Submitted                   - ISO date string
# "Free Queries" and "Cost/1K" are display strings ("21.5%", "$0.90"), not
# numbers.
LEADERBOARD_DATA = [
    {
        "Model": "🏆 HexaMind-S21 v14.2",
        "Type": "Hybrid (Zero-Param + LLM)",
        "Parameters": "0 + 70B fallback",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 85.56,
        "Free Queries": "21.5%",
        # NOTE(review): 0.1 ms looks like the pattern-path latency only;
        # queries that fall back to the 70B model presumably pay the
        # baseline's ~350 ms - confirm how hybrid latency should be reported.
        "Latency (ms)": 0.1,
        # Only the 78.5% of queries that are not free reach the LLM, so the
        # blended cost is 0.785 * $0.90 = $0.71 (a $0.19 saving per 1000
        # queries), not the full LLM-only $0.90.
        "Cost/1K": "$0.71",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "HexaMind (Pattern Only)",
        "Type": "Zero-Parameter Topological",
        "Parameters": "0",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 59.7,
        "Free Queries": "100%",
        "Latency (ms)": 0.1,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Llama 3.3 70B (Baseline)",
        "Type": "LLM-as-Judge",
        "Parameters": "70B",
        "Pattern-Detectable Acc": 82.9,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 82.9,
        "Free Queries": "0%",
        "Latency (ms)": 350,
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        # Figures in this row are estimates (see the model label).
        "Model": "GPT-4o (Estimated)",
        "Type": "LLM-as-Judge",
        "Parameters": "~1.8T",
        "Pattern-Detectable Acc": 94.0,
        "Knowledge-Required Acc": 89.0,
        "Overall Acc": 90.0,
        "Free Queries": "0%",
        "Latency (ms)": 850,
        "Cost/1K": "$15.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Majority Baseline",
        "Type": "Statistical",
        "Parameters": "0",
        "Pattern-Detectable Acc": 50.0,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 50.0,
        "Free Queries": "100%",
        "Latency (ms)": 0.01,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
]

# Markdown body of the "ℹ️ About" tab: benchmark scope, the per-layer results
# for the pattern-detectable subset, and the cost/accuracy summary.
# The figures are hard-coded text (nothing here is computed from
# LEADERBOARD_DATA), so they must be edited by hand when results change.
BENCHMARK_INFO = """
## 🎯 About This Benchmark

**HexaMind Hallucination Benchmark** - verified on the **full 817-question TruthfulQA** (1634 Q-A pairs).

### Pattern-Detectable (351 samples, 21.5%)

| Layer | Cases | Accuracy | Description |
|-------|-------|----------|-------------|
| L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") |
| L2.5-Facts | 73 | 91.8% | 140 curated misconception facts |
| L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") |
| Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection |

**Combined: 95.44% accuracy with ZERO LLM calls**

### Knowledge-Required (1283 samples, 78.5%)

Requires LLM verification. **Llama 3.3 70B: 82.9% accuracy**

### Key Insight

By routing 21.5% of queries through zero-cost pattern matching, HexaMind:
- Saves **$0.19 per 1000 queries** vs pure LLM
- Achieves **+2.66% improvement** over LLM-only baseline
- Provides **95.44% accuracy** on pattern-detectable subset
"""

# Markdown body of the "📊 Layers" tab: case counts and accuracy for each
# zero-cost layer, followed by a short per-category accuracy table.
# Static text - the numbers are not derived from any other constant.
LAYER_BREAKDOWN = """
## 📊 Detailed Layer Performance (v14.2)

### Zero-Cost Layers

| Layer | Cases | Accuracy | Pattern Type |
|-------|-------|----------|--------------|
| **L0-DefTruth** | 225 | 98.2% | "I don't know", "it depends" |
| **L2.5-Facts** | 73 | 91.8% | 140 curated facts |
| **L0-DefHalluc** | 45 | 88.9% | "everyone knows", "proven" |
| **L0-Other** | 8 | 87.5% | Coherence, meta, subjective |

**Total FREE: 351 (21.5%) @ 95.44%**

### Category Performance

| Category | Accuracy | Notes |
|----------|----------|-------|
| ✅ Conspiracies | 96.0% | Strong patterns |
| ✅ Fiction | 95.0% | Clear markers |
| ⚠️ Confusion: People | 39.1% | Known weakness |
"""

# Markdown body of the "📚 Cite" tab: a BibTeX entry in a fenced code block,
# followed by a short table restating the headline results.
# Static text - the numbers are not derived from any other constant.
CITATION = """
## 📚 Citation

```bibtex
@misc{hexamind2025,
    title={HexaMind: Hybrid Topological-LLM Hallucination Detection},
    author={Bachani, Suhail Hiro},
    year={2025},
    url={https://huggingface.co/spaces/hexamind/hallucination-benchmark}
}
```

### Verified Results

| Metric | Value |
|--------|-------|
| Full Benchmark | **85.56%** (1398/1634) |
| Pattern-Detectable | **95.44%** (335/351) |
| Free Query Rate | **21.5%** |
"""

def _numeric_sort_key(column):
    """Return a numeric view of *column* for sorting, when one exists.

    Display columns such as "Free Queries" ("21.5%") or "Cost/1K" ("$0.90")
    hold formatted strings, which would otherwise sort lexicographically
    ("21.5%" > "100%"). The original column is returned untouched when it is
    already numeric or when any value fails to parse, so free-text columns
    keep plain string ordering.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    cleaned = column.astype(str).str.replace(r"[$%,]", "", regex=True).str.strip()
    parsed = pd.to_numeric(cleaned, errors="coerce")
    return parsed if parsed.notna().all() else column


def create_leaderboard_df(sort_by="Overall Acc", ascending=False, data=None):
    """Build the leaderboard table, ordered by one column.

    Args:
        sort_by: Name of the column to order by.
        ascending: Sort direction; the default puts the highest value first.
        data: Optional list of row dicts to use instead of the module-level
            ``LEADERBOARD_DATA``.

    Returns:
        A ``pandas.DataFrame`` with one row per entry. Row labels are the
        entries' positions in the input, so the index is not renumbered.

    Raises:
        KeyError: If ``sort_by`` is not a column of a non-empty table.
    """
    rows = LEADERBOARD_DATA if data is None else data
    df = pd.DataFrame(rows)
    if df.empty:
        # Nothing to order; also avoids a KeyError on a column-less frame.
        return df
    # mergesort is stable, so rows that tie keep their input order.
    return df.sort_values(
        by=sort_by,
        ascending=ascending,
        key=_numeric_sort_key,
        kind="mergesort",
    )

# Build the UI at module level, so `demo` is created as soon as this file is
# imported or run. Components appear on the page in the order they are created
# inside each context manager, so statement order here is the layout.
with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo:
    
    # Page header with the headline claims (static text).
    gr.Markdown("""
    # 🧠 HexaMind Hallucination Detection Benchmark
    
    **Verified on full TruthfulQA: 817 questions × 2 = 1634 samples**
    
    > **95.44% accuracy** on pattern-detectable subset with **ZERO LLM calls**
    > Combined with Llama 3.3 70B: **85.56% overall accuracy**
    """)
    
    # One-row summary table of the key metrics (static text, not computed
    # from LEADERBOARD_DATA).
    with gr.Row():
        gr.Markdown("""
        | 📊 Overall | 🎯 Pattern-Detectable | 💰 Free Queries | 📈 vs LLM-only |
        |------------|----------------------|-----------------|----------------|
        | **85.56%** | **95.44%** | **21.5%** | **+2.66%** |
        """)
    
    with gr.Tabs():
        with gr.TabItem("🏆 Leaderboard"):
            # The table is built once, here, with the default ordering of
            # create_leaderboard_df() (by "Overall Acc", highest first).
            # No event handlers are attached in this section, so the
            # ordering is not changed from here.
            leaderboard = gr.Dataframe(
                value=create_leaderboard_df(),
                label="Rankings"
            )
        
        with gr.TabItem("📊 Layers"):
            gr.Markdown(LAYER_BREAKDOWN)
        
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(BENCHMARK_INFO)
        
        with gr.TabItem("📚 Cite"):
            gr.Markdown(CITATION)
    
    # Footer line, rendered below the tabs.
    gr.Markdown("**HexaMind** | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending")

# Launch the app only when the file is executed as a script; a plain import
# builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()