""" ╔══════════════════════════════════════════════════════════════════════════════╗ ║ HEXAMIND HALLUCINATION DETECTION BENCHMARK - LEADERBOARD ║ ║ First Zero-Parameter Topological Baseline for TruthfulQA ║ ║ ║ ║ Verified on full TruthfulQA (817 questions × 2 = 1634 samples) ║ ╚══════════════════════════════════════════════════════════════════════════════╝ """ import gradio as gr import pandas as pd import json from datetime import datetime # ═══════════════════════════════════════════════════════════════════════════════ # LEADERBOARD DATA - VERIFIED v14.2 RESULTS # ═══════════════════════════════════════════════════════════════════════════════ LEADERBOARD_DATA = [ { "Model": "🏆 HexaMind-S21 v14.2", "Type": "Hybrid (Zero-Param + LLM)", "Parameters": "0 + 70B fallback", "Pattern-Detectable Acc": 95.44, "Knowledge-Required Acc": 82.9, "Overall Acc": 85.56, "Free Queries": "21.5%", "Latency (ms)": 0.1, "Cost/1K": "$0.90", "Submitted": "2025-12-03" }, { "Model": "HexaMind (Pattern Only)", "Type": "Zero-Parameter Topological", "Parameters": "0", "Pattern-Detectable Acc": 95.44, "Knowledge-Required Acc": 50.0, "Overall Acc": 59.7, "Free Queries": "100%", "Latency (ms)": 0.1, "Cost/1K": "$0.00", "Submitted": "2025-12-03" }, { "Model": "Llama 3.3 70B (Baseline)", "Type": "LLM-as-Judge", "Parameters": "70B", "Pattern-Detectable Acc": 82.9, "Knowledge-Required Acc": 82.9, "Overall Acc": 82.9, "Free Queries": "0%", "Latency (ms)": 350, "Cost/1K": "$0.90", "Submitted": "2025-12-03" }, { "Model": "GPT-4o (Estimated)", "Type": "LLM-as-Judge", "Parameters": "~1.8T", "Pattern-Detectable Acc": 94.0, "Knowledge-Required Acc": 89.0, "Overall Acc": 90.0, "Free Queries": "0%", "Latency (ms)": 850, "Cost/1K": "$15.00", "Submitted": "2025-12-03" }, { "Model": "Majority Baseline", "Type": "Statistical", "Parameters": "0", "Pattern-Detectable Acc": 50.0, "Knowledge-Required Acc": 50.0, "Overall Acc": 50.0, "Free Queries": "100%", "Latency (ms)": 0.01, "Cost/1K": "$0.00", "Submitted": "2025-12-03" }, ] BENCHMARK_INFO = """ ## 🎯 About This Benchmark **HexaMind Hallucination Benchmark** - verified on the **full 817-question TruthfulQA** (1634 Q-A pairs). ### Pattern-Detectable (351 samples, 21.5%) | Layer | Cases | Accuracy | Description | |-------|-------|----------|-------------| | L0-DefTruth | 225 | 98.2% | Epistemic humility ("I don't know", "it depends") | | L2.5-Facts | 73 | 91.8% | 140 curated misconception facts | | L0-DefHalluc | 45 | 88.9% | Overconfidence ("everyone knows") | | Other L0 | 8 | 87.5% | QA-coherence, meta-AI detection | **Combined: 95.44% accuracy with ZERO LLM calls** ### Knowledge-Required (1283 samples, 78.5%) Requires LLM verification. **Llama 3.3 70B: 82.9% accuracy** ### Key Insight By routing 21.5% of queries through zero-cost pattern matching, HexaMind: - Saves **$0.19 per 1000 queries** vs pure LLM - Achieves **+2.66% improvement** over LLM-only baseline - Provides **95.44% accuracy** on pattern-detectable subset """ LAYER_BREAKDOWN = """ ## 📊 Detailed Layer Performance (v14.2) ### Zero-Cost Layers | Layer | Cases | Accuracy | Pattern Type | |-------|-------|----------|--------------| | **L0-DefTruth** | 225 | 98.2% | "I don't know", "it depends" | | **L2.5-Facts** | 73 | 91.8% | 140 curated facts | | **L0-DefHalluc** | 45 | 88.9% | "everyone knows", "proven" | | **L0-Other** | 8 | 87.5% | Coherence, meta, subjective | **Total FREE: 351 (21.5%) @ 95.44%** ### Category Performance | Category | Accuracy | Notes | |----------|----------|-------| | ✅ Conspiracies | 96.0% | Strong patterns | | ✅ Fiction | 95.0% | Clear markers | | ⚠️ Confusion: People | 39.1% | Known weakness | """ CITATION = """ ## 📚 Citation ```bibtex @misc{hexamind2025, title={HexaMind: Hybrid Topological-LLM Hallucination Detection}, author={Bachani, Suhail Hiro}, year={2025}, url={https://huggingface.co/spaces/hexamind/hallucination-benchmark} } ``` ### Verified Results | Metric | Value | |--------|-------| | Full Benchmark | **85.56%** (1398/1634) | | Pattern-Detectable | **95.44%** (335/351) | | Free Query Rate | **21.5%** | """ def create_leaderboard_df(sort_by="Overall Acc", ascending=False): df = pd.DataFrame(LEADERBOARD_DATA) df = df.sort_values(by=sort_by, ascending=ascending) return df with gr.Blocks(title="HexaMind Benchmark", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🧠 HexaMind Hallucination Detection Benchmark **Verified on full TruthfulQA: 817 questions × 2 = 1634 samples** > **95.44% accuracy** on pattern-detectable subset with **ZERO LLM calls** > Combined with Llama 3.3 70B: **85.56% overall accuracy** """) with gr.Row(): gr.Markdown(""" | 📊 Overall | 🎯 Pattern-Detectable | 💰 Free Queries | 📈 vs LLM-only | |------------|----------------------|-----------------|----------------| | **85.56%** | **95.44%** | **21.5%** | **+2.66%** | """) with gr.Tabs(): with gr.TabItem("🏆 Leaderboard"): leaderboard = gr.Dataframe( value=create_leaderboard_df(), label="Rankings" ) with gr.TabItem("📊 Layers"): gr.Markdown(LAYER_BREAKDOWN) with gr.TabItem("ℹ️ About"): gr.Markdown(BENCHMARK_INFO) with gr.TabItem("📚 Cite"): gr.Markdown(CITATION) gr.Markdown("**HexaMind** | [S21 Theory](https://zenodo.org/records/14228622) | Patent Pending") if __name__ == "__main__": demo.launch()