|
|
""" |
|
|
╔══════════════════════════════════════════════════════════════════════════════╗
║  HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD                    ║
║  First Zero-Parameter Topological Baseline for TruthfulQA                    ║
║                                                                              ║
║  Verified on full TruthfulQA (817 questions × 2 = 1634 samples)              ║
╚══════════════════════════════════════════════════════════════════════════════╝
|
|
""" |
|
|
|
|
|
import gradio as gr |
|
|
import pandas as pd |
|
|
import json |
|
|
from datetime import datetime |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Static leaderboard rows; create_leaderboard_df() turns this list into a
# DataFrame. Every row uses the same keys, in the same order, so the resulting
# table is rectangular and its column order follows the first row.
# The "... Acc" columns are floats (percent values); "Free Queries" and
# "Cost/1K" are pre-formatted display strings; "Submitted" is YYYY-MM-DD text.
# NOTE(review): the emoji prefix of the first model name was mojibake in the
# source; the trophy glyph is a reconstruction - confirm the intended one.
# NOTE(review): the hybrid row shows the same Cost/1K ("$0.90") and a 0.1 ms
# latency, while the About text claims a $0.19/1K saving over the LLM-only
# baseline - confirm which figures are intended before publishing.
LEADERBOARD_DATA = [
    {
        "Model": "🏆 HexaMind-S21 v14.2",
        "Type": "Hybrid (Zero-Param + LLM)",
        "Parameters": "0 + 70B fallback",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 85.56,
        "Free Queries": "21.5%",
        "Latency (ms)": 0.1,
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "HexaMind (Pattern Only)",
        "Type": "Zero-Parameter Topological",
        "Parameters": "0",
        "Pattern-Detectable Acc": 95.44,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 59.7,
        "Free Queries": "100%",
        "Latency (ms)": 0.1,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Llama 3.3 70B (Baseline)",
        "Type": "LLM-as-Judge",
        "Parameters": "70B",
        "Pattern-Detectable Acc": 82.9,
        "Knowledge-Required Acc": 82.9,
        "Overall Acc": 82.9,
        "Free Queries": "0%",
        "Latency (ms)": 350,
        "Cost/1K": "$0.90",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "GPT-4o (Estimated)",
        "Type": "LLM-as-Judge",
        "Parameters": "~1.8T",
        "Pattern-Detectable Acc": 94.0,
        "Knowledge-Required Acc": 89.0,
        "Overall Acc": 90.0,
        "Free Queries": "0%",
        "Latency (ms)": 850,
        "Cost/1K": "$15.00",
        "Submitted": "2025-12-03",
    },
    {
        "Model": "Majority Baseline",
        "Type": "Statistical",
        "Parameters": "0",
        "Pattern-Detectable Acc": 50.0,
        "Knowledge-Required Acc": 50.0,
        "Overall Acc": 50.0,
        "Free Queries": "100%",
        "Latency (ms)": 0.01,
        "Cost/1K": "$0.00",
        "Submitted": "2025-12-03",
    },
]
|
|
|
|
|
# Markdown body of the "About" tab: benchmark scope, per-layer results for the
# pattern-detectable subset, and the headline cost/accuracy takeaways.
# Table rows must stay on consecutive lines (no blank line between them) for
# the markdown table to render.
BENCHMARK_INFO = """
## 🎯 About This Benchmark

**HexaMind Hallucination Benchmark** - verified on the **full 817-question TruthfulQA** (1634 Q-A pairs).

### Pattern-Detectable (351 samples, 21.5%)

| Layer | Cases | Accuracy | Description |
|-------|-------|----------|-------------|
| L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") |
| L2.5-Facts | 73 | 91.8% | 140 curated misconception facts |
| L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") |
| Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection |

**Combined: 95.44% accuracy with ZERO LLM calls**

### Knowledge-Required (1283 samples, 78.5%)

Requires LLM verification. **Llama 3.3 70B: 82.9% accuracy**

### Key Insight

By routing 21.5% of queries through zero-cost pattern matching, HexaMind:
- Saves **$0.19 per 1000 queries** vs pure LLM
- Achieves **+2.66% improvement** over LLM-only baseline
- Provides **95.44% accuracy** on pattern-detectable subset
"""
|
|
|
|
|
# Markdown body of the "Layers" tab: per-layer case counts/accuracy for the
# zero-cost layers, followed by a short per-category accuracy table.
# NOTE(review): the heading emoji was unrecoverable mojibake in the source; the
# bar-chart glyph is a reconstruction - confirm. The check-mark and warning
# glyphs in the category table were recoverable from the garbled bytes.
LAYER_BREAKDOWN = """
## 📊 Detailed Layer Performance (v14.2)

### Zero-Cost Layers

| Layer | Cases | Accuracy | Pattern Type |
|-------|-------|----------|--------------|
| **L0-DefTruth** | 225 | 98.2% | "I don't know", "it depends" |
| **L2.5-Facts** | 73 | 91.8% | 140 curated facts |
| **L0-DefHalluc** | 45 | 88.9% | "everyone knows", "proven" |
| **L0-Other** | 8 | 87.5% | Coherence, meta, subjective |

**Total FREE: 351 (21.5%) @ 95.44%**

### Category Performance

| Category | Accuracy | Notes |
|----------|----------|-------|
| ✅ Conspiracies | 96.0% | Strong patterns |
| ✅ Fiction | 95.0% | Clear markers |
| ⚠️ Confusion: People | 39.1% | Known weakness |
"""
|
|
|
|
|
# Markdown body of the "Cite" tab: a BibTeX entry in a fenced code block plus a
# small table restating the headline verified results.
# NOTE(review): the heading emoji was unrecoverable mojibake in the source; the
# memo glyph is a reconstruction - confirm the intended one.
CITATION = """
## 📝 Citation

```bibtex
@misc{hexamind2025,
title={HexaMind: Hybrid Topological-LLM Hallucination Detection},
author={Bachani, Suhail Hiro},
year={2025},
url={https://huggingface.co/spaces/hexamind/hallucination-benchmark}
}
```

### Verified Results

| Metric | Value |
|--------|-------|
| Full Benchmark | **85.56%** (1398/1634) |
| Pattern-Detectable | **95.44%** (335/351) |
| Free Query Rate | **21.5%** |
"""
|
|
|
|
|
def create_leaderboard_df(sort_by="Overall Acc", ascending=False):
    """Return the leaderboard rows as a DataFrame ordered by one column.

    Args:
        sort_by: Column label passed to ``DataFrame.sort_values`` as ``by``.
        ascending: Sort direction passed through unchanged; the default
            ``False`` places the largest ``sort_by`` value first.

    Returns:
        A ``pandas.DataFrame`` built from ``LEADERBOARD_DATA`` and sorted as
        requested. The row index is not reset after sorting.
    """
    table = pd.DataFrame(LEADERBOARD_DATA)
    return table.sort_values(by=sort_by, ascending=ascending)
|
|
|
|
|
# Assemble the Gradio UI. `demo` is the Blocks app started by the entry point
# at the bottom of this block.
# Markdown literals are indented along with the code; gr.Markdown applies
# inspect.cleandoc to its value, so the common leading whitespace is removed
# before rendering.
# NOTE(review): several emoji here were unrecoverable mojibake in the source
# (the Overall / vs LLM-only headers and the Leaderboard / Layers / Cite tab
# labels); the glyphs used are reconstructions - confirm the intended ones.
with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo:

    # Page header with the headline claims.
    gr.Markdown("""
    # 🧠 HexaMind Hallucination Detection Benchmark

    **Verified on full TruthfulQA: 817 questions × 2 = 1634 samples**

    > **95.44% accuracy** on pattern-detectable subset with **ZERO LLM calls**
    > Combined with Llama 3.3 70B: **85.56% overall accuracy**
    """)

    # One-row summary table of the key metrics.
    with gr.Row():
        gr.Markdown("""
        | 🏆 Overall | 🎯 Pattern-Detectable | 💰 Free Queries | 📈 vs LLM-only |
        |------------|----------------------|-----------------|----------------|
        | **85.56%** | **95.44%** | **21.5%** | **+2.66%** |
        """)

    with gr.Tabs():
        with gr.TabItem("🏆 Leaderboard"):
            # Table is computed once, when the UI is built, with the default
            # sort (by "Overall Acc", descending).
            leaderboard = gr.Dataframe(
                value=create_leaderboard_df(),
                label="Rankings",
            )

        with gr.TabItem("📊 Layers"):
            gr.Markdown(LAYER_BREAKDOWN)

        with gr.TabItem("ℹ️ About"):
            gr.Markdown(BENCHMARK_INFO)

        with gr.TabItem("📝 Cite"):
            gr.Markdown(CITATION)

    # Footer.
    gr.Markdown("**HexaMind** | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending")


if __name__ == "__main__":
    demo.launch()
|
|
|