# S21MIND / app.py
# s21mind's picture
# Update app.py
# 22ac1c8 verified
"""
╔══════════════════════════════════════════════════════════════════════════════╗
║           HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD           ║
║           First Zero-Parameter Topological Baseline for TruthfulQA           ║
║                                                                              ║
║        Verified on full TruthfulQA (817 questions × 2 = 1634 samples)        ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""
import gradio as gr
import pandas as pd
import json
from datetime import datetime
# ═══════════════════════════════════════════════════════════════════════════════
# LEADERBOARD DATA - VERIFIED v14.2 RESULTS
# ═══════════════════════════════════════════════════════════════════════════════
# One dict per leaderboard row; every row carries the same keys, which become
# the DataFrame columns shown in the "Leaderboard" tab.
# Accuracy and latency values are numeric so they sort correctly; "Free Queries"
# and "Cost/1K" are pre-formatted display strings.
LEADERBOARD_DATA = [
    {
        "Model": "🏆 HexaMind-S21 v14.2",
        "Type": "Hybrid (Zero-Param + LLM)",
        "Parameters": "0 + 70B fallback",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 85.56,
        "Free Queries": "21.5%",
        "Latency (ms)": 0.1,
        # NOTE(review): BENCHMARK_INFO claims a $0.19/1K saving versus the
        # pure-LLM baseline, yet this row lists the same $0.90 as that
        # baseline - confirm which figure is intended.
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "HexaMind (Pattern Only)",
        "Type": "Zero-Parameter Topological",
        "Parameters": "0",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 59.7,
        "Free Queries": "100%",
        "Latency (ms)": 0.1,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Llama 3.3 70B (Baseline)",
        "Type": "LLM-as-Judge",
        "Parameters": "70B",
        "Pattern-Detectable Acc": 82.9,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 82.9,
        "Free Queries": "0%",
        "Latency (ms)": 350,
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "GPT-4o (Estimated)",
        "Type": "LLM-as-Judge",
        "Parameters": "~1.8T",
        "Pattern-Detectable Acc": 94.0,
        "Knowledge-Required Acc": 89.0,
        "Overall Acc": 90.0,
        "Free Queries": "0%",
        "Latency (ms)": 850,
        "Cost/1K": "$15.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Majority Baseline",
        "Type": "Statistical",
        "Parameters": "0",
        "Pattern-Detectable Acc": 50.0,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 50.0,
        "Free Queries": "100%",
        "Latency (ms)": 0.01,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
]
# Markdown shown in the "About" tab. Blank lines separate headings, tables and
# paragraphs so that the line following a table is not parsed as another row.
BENCHMARK_INFO = """
## 🎯 About This Benchmark

**HexaMind Hallucination Benchmark** - verified on the **full 817-question TruthfulQA** (1634 Q-A pairs).

### Pattern-Detectable (351 samples, 21.5%)

| Layer | Cases | Accuracy | Description |
|-------|-------|----------|-------------|
| L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") |
| L2.5-Facts | 73 | 91.8% | 140 curated misconception facts |
| L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") |
| Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection |

**Combined: 95.44% accuracy with ZERO LLM calls**

### Knowledge-Required (1283 samples, 78.5%)

Requires LLM verification. **Llama 3.3 70B: 82.9% accuracy**

### Key Insight

By routing 21.5% of queries through zero-cost pattern matching, HexaMind:

- Saves **$0.19 per 1000 queries** vs pure LLM
- Achieves **+2.66% improvement** over LLM-only baseline
- Provides **95.44% accuracy** on pattern-detectable subset
"""
# Markdown shown in the "Layers" tab: per-layer and per-category accuracy.
# Blank lines keep the summary line and headings out of the preceding tables.
LAYER_BREAKDOWN = """
## 📊 Detailed Layer Performance (v14.2)

### Zero-Cost Layers

| Layer | Cases | Accuracy | Pattern Type |
|-------|-------|----------|--------------|
| **L0-DefTruth** | 225 | 98.2% | "I don't know", "it depends" |
| **L2.5-Facts** | 73 | 91.8% | 140 curated facts |
| **L0-DefHalluc** | 45 | 88.9% | "everyone knows", "proven" |
| **L0-Other** | 8 | 87.5% | Coherence, meta, subjective |

**Total FREE: 351 (21.5%) @ 95.44%**

### Category Performance

| Category | Accuracy | Notes |
|----------|----------|-------|
| ✅ Conspiracies | 96.0% | Strong patterns |
| ✅ Fiction | 95.0% | Clear markers |
| ⚠️ Confusion: People | 39.1% | Known weakness |
"""
# Markdown shown in the "Cite" tab: a BibTeX entry plus the headline results.
CITATION = """
## 📚 Citation

```bibtex
@misc{hexamind2025,
  title={HexaMind: Hybrid Topological-LLM Hallucination Detection},
  author={Bachani, Suhail Hiro},
  year={2025},
  url={https://huggingface.co/spaces/hexamind/hallucination-benchmark}
}
```

### Verified Results

| Metric | Value |
|--------|-------|
| Full Benchmark | **85.56%** (1398/1634) |
| Pattern-Detectable | **95.44%** (335/351) |
| Free Query Rate | **21.5%** |
"""
def _numeric_sort_key(column):
    """Return a sort key for *column* that orders formatted numbers by value.

    Display columns such as "21.5%" or "$15.00" are strings, so a plain sort
    compares them character by character ("100%" < "21.5%"). When every cell
    parses as a number after dropping ``$``, ``%``, ``,`` and whitespace, the
    parsed values are used as the key; otherwise the column is returned
    unchanged and sorts as before.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    stripped = column.astype(str).str.replace(r"[$%,\s]", "", regex=True)
    parsed = pd.to_numeric(stripped, errors="coerce")
    # Fall back to the raw column if even one cell is not a plain number
    # (e.g. "~1.8T" or a model name), so free-text columns keep text ordering.
    return parsed if parsed.notna().all() else column


def create_leaderboard_df(sort_by="Overall Acc", ascending=False, data=None) -> pd.DataFrame:
    """Build the leaderboard table, sorted by the requested column.

    Args:
        sort_by: Column name (or list of names) to sort on.
        ascending: Sort direction; the default puts the highest value first.
        data: Optional list of row dicts. Defaults to ``LEADERBOARD_DATA``.

    Returns:
        A DataFrame with one row per entry. Rows that tie on the sort column
        keep their input order. An empty input yields an empty DataFrame.

    Raises:
        KeyError: If ``sort_by`` is not a column of a non-empty table.
    """
    records = LEADERBOARD_DATA if data is None else data
    df = pd.DataFrame(records)
    if df.empty:
        # Nothing to sort, and there are no columns to look ``sort_by`` up in.
        return df
    # mergesort is the stable algorithm, so tied rows do not get shuffled.
    return df.sort_values(
        by=sort_by,
        ascending=ascending,
        key=_numeric_sort_key,
        kind="mergesort",
    )
# Build the Gradio UI: a headline summary, a one-row strip of key metrics, then
# one tab per content section, and a footer with project links.
# NOTE(review): the indented Markdown literals below rely on Gradio dedenting
# Markdown values before rendering - confirm for the pinned Gradio version.
with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧠 HexaMind Hallucination Detection Benchmark
    **Verified on full TruthfulQA: 817 questions × 2 = 1634 samples**
    > **95.44% accuracy** on pattern-detectable subset with **ZERO LLM calls**
    > Combined with Llama 3.3 70B: **85.56% overall accuracy**
    """)

    with gr.Row():
        gr.Markdown("""
        | 📊 Overall | 🎯 Pattern-Detectable | 💰 Free Queries | 📈 vs LLM-only |
        |------------|----------------------|-----------------|----------------|
        | **85.56%** | **95.44%** | **21.5%** | **+2.66%** |
        """)

    with gr.Tabs():
        with gr.TabItem("🏆 Leaderboard"):
            # Default sort: highest "Overall Acc" first.
            leaderboard = gr.Dataframe(
                value=create_leaderboard_df(),
                label="Rankings"
            )
        with gr.TabItem("📊 Layers"):
            gr.Markdown(LAYER_BREAKDOWN)
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(BENCHMARK_INFO)
        with gr.TabItem("📚 Cite"):
            gr.Markdown(CITATION)

    gr.Markdown("**HexaMind** | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending")
# Start the web server only when run as a script, so importing this module
# (e.g. to reuse the leaderboard data) does not launch the app.
if __name__ == "__main__":
    demo.launch()