"""
HexaMind Hallucination Benchmark - Evaluation Framework
========================================================
This module provides the evaluation infrastructure for the HexaMind
Hallucination Benchmark. It does NOT include the HexaMind detector itself,
which is available under a commercial license.
Usage:
from benchmark import HexaMindBenchmark
benchmark = HexaMindBenchmark()
results = benchmark.evaluate(your_detector_function)
"""
import json
import os
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional
@dataclass
class BenchmarkResults:
"""Results from benchmark evaluation"""
pattern_accuracy: float
knowledge_accuracy: float
overall_accuracy: float
pattern_samples: int
knowledge_samples: int
total_samples: int
avg_latency_ms: float
def to_dict(self) -> Dict:
return {
"pattern_detectable_accuracy": round(self.pattern_accuracy, 2),
"knowledge_required_accuracy": round(self.knowledge_accuracy, 2),
"overall_accuracy": round(self.overall_accuracy, 2),
"pattern_samples": self.pattern_samples,
"knowledge_samples": self.knowledge_samples,
"total_samples": self.total_samples,
"avg_latency_ms": round(self.avg_latency_ms, 2)
}
def __repr__(self):
return f"""
══════════════════════════════════════════════════════════════
HEXAMIND BENCHMARK RESULTS
══════════════════════════════════════════════════════════════
Pattern-Detectable: {self.pattern_accuracy:5.1f}% (n={self.pattern_samples})
Knowledge-Required: {self.knowledge_accuracy:5.1f}% (n={self.knowledge_samples})
──────────────────────────────────────────────────────────
Overall: {self.overall_accuracy:5.1f}% (n={self.total_samples})
Avg Latency: {self.avg_latency_ms:5.2f} ms
══════════════════════════════════════════════════════════════
"""
class HexaMindBenchmark:
"""
Evaluation framework for the HexaMind Hallucination Benchmark.
The benchmark splits TruthfulQA into:
    - Pattern-Detectable: questions that can be judged from linguistic/surface cues alone
    - Knowledge-Required: questions that need external factual verification
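    Each sample is a dict with "question", "answer", and "ground_truth" keys,
    where ground_truth == 1 marks a trustworthy answer.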
Example:
benchmark = HexaMindBenchmark()
def my_detector(question, answer):
# Return True if trustworthy, False if hallucination
return some_logic(question, answer)
results = benchmark.evaluate(my_detector)
print(results)
"""
def __init__(self, data_dir: str = "data"):
"""
Initialize benchmark with data directory.
Args:
data_dir: Path to directory containing JSON split files
"""
self.data_dir = data_dir
self._pattern_data = None
self._knowledge_data = None
@property
def pattern_detectable(self) -> List[Dict]:
"""Load pattern-detectable split lazily"""
if self._pattern_data is None:
self._pattern_data = self._load_json("pattern_detectable.json")
return self._pattern_data
@property
def knowledge_required(self) -> List[Dict]:
"""Load knowledge-required split lazily"""
if self._knowledge_data is None:
self._knowledge_data = self._load_json("knowledge_required.json")
return self._knowledge_data
def _load_json(self, filename: str) -> List[Dict]:
"""Load a JSON file from data directory"""
path = os.path.join(self.data_dir, filename)
if not os.path.exists(path):
raise FileNotFoundError(
f"Data file not found: {path}\n"
f"Please ensure you have downloaded the benchmark data."
)
with open(path, 'r', encoding='utf-8') as f:
return json.load(f)
def evaluate(
self,
detector: Callable[[str, str], bool],
split: str = "all",
verbose: bool = True
) -> BenchmarkResults:
"""
Evaluate a hallucination detector on the benchmark.
Args:
detector: Function(question, answer) -> bool
Returns True if answer is trustworthy
Returns False if answer is a hallucination
split: Which split to evaluate
"all" - both splits
"pattern" - pattern-detectable only
"knowledge" - knowledge-required only
verbose: Print progress during evaluation
Returns:
BenchmarkResults with accuracy metrics
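        Example:
            # a trivial always-trust detector, restricted to a single split
            results = benchmark.evaluate(lambda q, a: True, split="knowledge")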
"""
# Select data based on split
if split == "all":
pattern_data = self.pattern_detectable
knowledge_data = self.knowledge_required
elif split in ("pattern", "pattern_detectable"):
pattern_data = self.pattern_detectable
knowledge_data = []
elif split in ("knowledge", "knowledge_required"):
pattern_data = []
knowledge_data = self.knowledge_required
else:
raise ValueError(f"Unknown split: {split}")
latencies = []
# Evaluate pattern-detectable
pattern_correct = 0
if pattern_data and verbose:
print(f"Evaluating pattern-detectable ({len(pattern_data)} samples)...")
for i, sample in enumerate(pattern_data):
start = time.perf_counter()
prediction = detector(sample["question"], sample["answer"])
latencies.append((time.perf_counter() - start) * 1000)
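            # ground_truth == 1 marks a trustworthy answer, so the detector should return True for it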
expected = sample["ground_truth"] == 1
if prediction == expected:
pattern_correct += 1
if verbose and (i + 1) % 25 == 0:
print(f" Progress: {i + 1}/{len(pattern_data)}")
# Evaluate knowledge-required
knowledge_correct = 0
if knowledge_data and verbose:
print(f"Evaluating knowledge-required ({len(knowledge_data)} samples)...")
for i, sample in enumerate(knowledge_data):
start = time.perf_counter()
prediction = detector(sample["question"], sample["answer"])
latencies.append((time.perf_counter() - start) * 1000)
expected = sample["ground_truth"] == 1
if prediction == expected:
knowledge_correct += 1
if verbose and (i + 1) % 200 == 0:
print(f" Progress: {i + 1}/{len(knowledge_data)}")
# Compute metrics
pattern_n = len(pattern_data)
knowledge_n = len(knowledge_data)
total_n = pattern_n + knowledge_n
pattern_acc = (pattern_correct / pattern_n * 100) if pattern_n > 0 else 0
knowledge_acc = (knowledge_correct / knowledge_n * 100) if knowledge_n > 0 else 0
overall_acc = ((pattern_correct + knowledge_correct) / total_n * 100) if total_n > 0 else 0
avg_latency = sum(latencies) / len(latencies) if latencies else 0
results = BenchmarkResults(
pattern_accuracy=pattern_acc,
knowledge_accuracy=knowledge_acc,
overall_accuracy=overall_acc,
pattern_samples=pattern_n,
knowledge_samples=knowledge_n,
total_samples=total_n,
avg_latency_ms=avg_latency
)
if verbose:
print(results)
return results
def create_submission(
self,
results: BenchmarkResults,
model_name: str,
model_type: str,
parameters: str,
contact: str = "",
paper_url: str = "",
cost_per_1k: str = "Unknown"
) -> Dict:
"""
Create a submission JSON for the leaderboard.
Args:
results: BenchmarkResults from evaluate()
model_name: Name of your model
model_type: Category (LLM-as-Judge, Classifier, Zero-Parameter, etc.)
parameters: Parameter count (e.g., "7B", "0", "70B")
contact: Email for questions
paper_url: Link to paper/preprint (optional)
cost_per_1k: API cost per 1000 evaluations (optional)
Returns:
Dict ready to save as JSON submission
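        Example:
            # illustrative values; save the dict with json.dump, as the CLI below does
            submission = benchmark.create_submission(
                results, model_name="MyDetector", model_type="Classifier", parameters="7B"
            )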
"""
from datetime import datetime
return {
"model_name": model_name,
"model_type": model_type,
"parameters": parameters,
"pattern_detectable_accuracy": results.pattern_accuracy,
"knowledge_required_accuracy": results.knowledge_accuracy,
"overall_accuracy": results.overall_accuracy,
"latency_ms": results.avg_latency_ms,
"cost_per_1k": cost_per_1k,
"submission_date": datetime.now().strftime("%Y-%m-%d"),
"contact": contact,
"paper_url": paper_url
}
# ═══════════════════════════════════════════════════════════════════════════════
# EXAMPLE BASELINES (for reference)
# ═══════════════════════════════════════════════════════════════════════════════
def random_baseline(question: str, answer: str) -> bool:
"""Random baseline - 50% expected accuracy"""
import random
return random.random() > 0.5
def always_trust_baseline(question: str, answer: str) -> bool:
"""Always returns True - accuracy = % of truthful samples"""
return True
def always_reject_baseline(question: str, answer: str) -> bool:
"""Always returns False - accuracy = % of hallucination samples"""
return False
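# Illustrative extra baseline (a hedged sketch, not part of the official benchmark
# and not wired into the CLI below): it treats answers that use sweeping absolute
# language as likely hallucinations. The term list is an arbitrary assumption
# chosen purely for demonstration.
def absolute_language_baseline(question: str, answer: str) -> bool:
    """Flags answers with sweeping absolute terms as likely hallucinations."""
    absolutes = ("always", "never", "guaranteed", "everyone knows")
    return not any(term in answer.lower() for term in absolutes)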
# ═══════════════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════════════
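# Example invocations:
#   python benchmark.py --baseline random --split all
#   python benchmark.py --baseline always_trust --split pattern --data-dir data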
if __name__ == "__main__":
import argparse
parser = argparse.ArgumentParser(
description="HexaMind Hallucination Benchmark Evaluation"
)
parser.add_argument(
"--baseline",
choices=["random", "always_trust", "always_reject"],
default="random",
help="Baseline to evaluate"
)
parser.add_argument(
"--split",
choices=["all", "pattern", "knowledge"],
default="all",
help="Which split to evaluate"
)
parser.add_argument(
"--data-dir",
default="data",
help="Path to data directory"
)
args = parser.parse_args()
# Select baseline
baselines = {
"random": random_baseline,
"always_trust": always_trust_baseline,
"always_reject": always_reject_baseline
}
detector = baselines[args.baseline]
# Run evaluation
benchmark = HexaMindBenchmark(data_dir=args.data_dir)
results = benchmark.evaluate(detector, split=args.split)
# Save results
submission = benchmark.create_submission(
results,
model_name=f"{args.baseline}_baseline",
model_type="Statistical Baseline",
parameters="0"
)
output_file = f"submission_{args.baseline}.json"
with open(output_file, 'w') as f:
json.dump(submission, f, indent=2)
print(f"\nSubmission saved to {output_file}")