"""
╔══════════════════════════════════════════════════════════════════════════════╗
β•‘ HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT β•‘
β•‘ Evaluate your model on Pattern-Detectable vs Knowledge-Required splits β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
Usage:
from hexamind_benchmark import HexaMindBenchmark
benchmark = HexaMindBenchmark()
results = benchmark.evaluate(your_model_function)
"""
import json
import os
from typing import Callable, Dict, List, Optional
from dataclasses import dataclass
import time
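
# Expected data layout (inferred from evaluate() below; any extra fields are
# ignored): data/pattern_detectable.json and data/knowledge_required.json each
# contain a JSON list of records shaped like this illustrative example:
#
#   {
#       "question": "What happens if you break a mirror?",
#       "answer": "You will have seven years of bad luck.",
#       "ground_truth": 0
#   }
#
# ground_truth == 1 marks a trustworthy answer; any other value is scored as a
# hallucination.
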
@dataclass
class EvaluationResult:
"""Results from benchmark evaluation"""
pattern_detectable_accuracy: float
knowledge_required_accuracy: float
overall_accuracy: float
pattern_detectable_samples: int
knowledge_required_samples: int
total_samples: int
avg_latency_ms: float
def to_dict(self) -> Dict:
return {
"pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2),
"knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2),
"overall_accuracy": round(self.overall_accuracy, 2),
"pattern_detectable_samples": self.pattern_detectable_samples,
"knowledge_required_samples": self.knowledge_required_samples,
"total_samples": self.total_samples,
"avg_latency_ms": round(self.avg_latency_ms, 2)
}
def __str__(self) -> str:
return f"""
╔══════════════════════════════════════════════════════════════╗
β•‘ HEXAMIND BENCHMARK EVALUATION RESULTS β•‘
╠══════════════════════════════════════════════════════════════╣
β•‘ Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}% (n={self.pattern_detectable_samples:3d}) β•‘
β•‘ Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}% (n={self.knowledge_required_samples:3d}) β•‘
β•‘ ──────────────────────────────────────────────────────── β•‘
β•‘ Overall Accuracy: {self.overall_accuracy:6.2f}% (n={self.total_samples:3d}) β•‘
β•‘ Average Latency: {self.avg_latency_ms:6.2f} ms β•‘
β•šβ•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•β•
"""
class HexaMindBenchmark:
"""
HexaMind Hallucination Detection Benchmark
Evaluates models on two splits:
1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
2. Knowledge-Required: Questions requiring factual verification
"""
    def __init__(self, data_dir: str = "data"):
        self.data_dir = data_dir
        self.pattern_detectable = self._load_split("pattern_detectable.json")
        self.knowledge_required = self._load_split("knowledge_required.json")

    def _load_split(self, filename: str) -> List[Dict]:
        """Load a benchmark split from JSON"""
        filepath = os.path.join(self.data_dir, filename)
        if os.path.exists(filepath):
            with open(filepath, 'r') as f:
                return json.load(f)
        else:
            print(f"Warning: {filepath} not found. Using empty list.")
            return []
    def evaluate(
        self,
        model_fn: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> EvaluationResult:
        """
        Evaluate a model on the benchmark.

        Args:
            model_fn: Function that takes (question, answer) and returns
                True if the answer is trustworthy, False if it is a hallucination
            split: "all", "pattern_detectable", or "knowledge_required"
            verbose: Print progress

        Returns:
            EvaluationResult with accuracy metrics
        """
        # Select splits to evaluate
        if split == "all":
            pattern_samples = self.pattern_detectable
            knowledge_samples = self.knowledge_required
        elif split == "pattern_detectable":
            pattern_samples = self.pattern_detectable
            knowledge_samples = []
        elif split == "knowledge_required":
            pattern_samples = []
            knowledge_samples = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")
        # Evaluate pattern-detectable split
        pattern_correct = 0
        pattern_total = 0
        latencies = []
        if pattern_samples:
            if verbose:
                print("Evaluating Pattern-Detectable split...")
            for i, sample in enumerate(pattern_samples):
                start = time.time()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.time() - start) * 1000)
                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    pattern_correct += 1
                pattern_total += 1
                if verbose and (i + 1) % 50 == 0:
                    print(f"  {i + 1}/{len(pattern_samples)}...")

        # Evaluate knowledge-required split
        knowledge_correct = 0
        knowledge_total = 0
        if knowledge_samples:
            if verbose:
                print("Evaluating Knowledge-Required split...")
            for i, sample in enumerate(knowledge_samples):
                start = time.time()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.time() - start) * 1000)
                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    knowledge_correct += 1
                knowledge_total += 1
                if verbose and (i + 1) % 100 == 0:
                    print(f"  {i + 1}/{len(knowledge_samples)}...")

        # Compute metrics
        pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0
        total_correct = pattern_correct + knowledge_correct
        total_samples = pattern_total + knowledge_total
        overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0
        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        result = EvaluationResult(
            pattern_detectable_accuracy=pattern_acc,
            knowledge_required_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_detectable_samples=pattern_total,
            knowledge_required_samples=knowledge_total,
            total_samples=total_samples,
            avg_latency_ms=avg_latency
        )
        if verbose:
            print(result)
        return result
    def create_submission(
        self,
        result: EvaluationResult,
        model_name: str,
        model_type: str,
        parameters: str,
        cost_per_1k: str = "Unknown",
        contact: str = "",
        paper_link: str = ""
    ) -> Dict:
        """Create a submission JSON for the leaderboard"""
        from datetime import datetime
        submission = {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
            "knowledge_required_accuracy": result.knowledge_required_accuracy,
            "overall_accuracy": result.overall_accuracy,
            "latency_ms": result.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_link": paper_link
        }
        return submission

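# ═══════════════════════════════════════════════════════════════════════════════
# EXAMPLE: PLUGGING IN YOUR OWN MODEL
# ═══════════════════════════════════════════════════════════════════════════════
# A minimal sketch of how a custom detector would be evaluated. `my_detector` is
# a hypothetical placeholder, not part of this benchmark; any callable mapping
# (question, answer) -> bool (True = trustworthy) can be passed to evaluate().
#
#   def my_detector(question: str, answer: str) -> bool:
#       # Call your own model / API here and threshold its hallucination score.
#       return len(answer.strip()) > 0
#
#   benchmark = HexaMindBenchmark(data_dir="data")
#   result = benchmark.evaluate(my_detector, split="all")
#   submission = benchmark.create_submission(
#       result, model_name="MyModel", model_type="LLM Judge", parameters="7B"
#   )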
# ═══════════════════════════════════════════════════════════════════════════════
# EXAMPLE: HEXAMIND BASELINE
# ═══════════════════════════════════════════════════════════════════════════════
class HexaMindBaseline:
"""
Zero-parameter topological hallucination detector.
Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory.
"""
# Definitive truth patterns
TRUTH_PATTERNS = [
r"i don'?t know",
r"it depends",
r"there is no( scientific)? evidence",
r"that's a myth",
r"this is a common misconception",
r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
]
# Definitive hallucination patterns
HALLUC_PATTERNS = [
r"^yes[,.]? ",
r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
r"seven years of bad luck",
r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
r"studies (have )?show(n|s)? that",
r"everyone knows (that )?",
r"it'?s (a )?(well[- ])?known fact",
]
def __init__(self):
import re
self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS]
self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS]
    def predict(self, question: str, answer: str) -> bool:
        """
        Returns True if answer appears trustworthy, False if likely hallucination.
        """
        # Check for truth patterns
        for pattern in self.truth_re:
            if pattern.search(answer):
                return True
        # Check for hallucination patterns
        for pattern in self.halluc_re:
            if pattern.search(answer):
                return False
        # Default: uncertain, assume trustworthy
        return True
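
# Quick illustration of the baseline's decision rule on hand-written pairs
# (hypothetical examples, not drawn from the benchmark data):
#
#   baseline = HexaMindBaseline()
#   baseline.predict("Does cracking your knuckles cause arthritis?",
#                    "No, there is no scientific evidence that it does.")  # True  (truth pattern)
#   baseline.predict("What happens if you break a mirror?",
#                    "You will have seven years of bad luck.")             # False (hallucination pattern)
#   baseline.predict("What is the capital of Australia?",
#                    "Canberra.")                                          # True  (no match; default)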
# ═══════════════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════════════
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation")
    parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)")
    parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)")
    parser.add_argument("--output", default=None, help="Output JSON file for submission")
    args = parser.parse_args()

    # Load benchmark
    benchmark = HexaMindBenchmark()

    # Select model
    if args.model == "hexamind":
        model = HexaMindBaseline()
        model_fn = model.predict
        model_name = "HexaMind-S21"
        model_type = "Zero-Parameter Topological"
        params = "0"
    elif args.model == "random":
        import random
        model_fn = lambda q, a: random.random() > 0.5
        model_name = "Random Baseline"
        model_type = "Statistical"
        params = "0"
    else:
        print(f"Unknown model: {args.model}")
        exit(1)

    # Evaluate
    result = benchmark.evaluate(model_fn, split=args.split)

    # Save submission if requested
    if args.output:
        submission = benchmark.create_submission(
            result, model_name, model_type, params
        )
        with open(args.output, 'w') as f:
            json.dump(submission, f, indent=2)
        print(f"Submission saved to {args.output}")