"""
HexaMind Hallucination Benchmark - Evaluation Framework
========================================================

This module provides the evaluation infrastructure for the HexaMind
Hallucination Benchmark. It does NOT include the HexaMind detector itself,
which is available under commercial license.

Usage:
    from benchmark import HexaMindBenchmark

    benchmark = HexaMindBenchmark()
    results = benchmark.evaluate(your_detector_function)
"""

import json
import os
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional


@dataclass
class BenchmarkResults:
    """Results from benchmark evaluation"""
    pattern_accuracy: float
    knowledge_accuracy: float
    overall_accuracy: float
    pattern_samples: int
    knowledge_samples: int
    total_samples: int
    avg_latency_ms: float

    def to_dict(self) -> Dict:
        return {
            "pattern_detectable_accuracy": round(self.pattern_accuracy, 2),
            "knowledge_required_accuracy": round(self.knowledge_accuracy, 2),
            "overall_accuracy": round(self.overall_accuracy, 2),
            "pattern_samples": self.pattern_samples,
            "knowledge_samples": self.knowledge_samples,
            "total_samples": self.total_samples,
            "avg_latency_ms": round(self.avg_latency_ms, 2)
        }

    def __repr__(self):
        return f"""
══════════════════════════════════════════════════════════════
                  HEXAMIND BENCHMARK RESULTS
══════════════════════════════════════════════════════════════
Pattern-Detectable: {self.pattern_accuracy:5.1f}% (n={self.pattern_samples})
Knowledge-Required: {self.knowledge_accuracy:5.1f}% (n={self.knowledge_samples})
──────────────────────────────────────────────────────────
Overall:            {self.overall_accuracy:5.1f}% (n={self.total_samples})
Avg Latency:        {self.avg_latency_ms:5.2f} ms
══════════════════════════════════════════════════════════════
"""


class HexaMindBenchmark:
    """
    Evaluation framework for the HexaMind Hallucination Benchmark.

    The benchmark splits TruthfulQA into:
        - Pattern-Detectable: Questions with linguistic markers
        - Knowledge-Required: Questions needing factual verification

    Example:
        benchmark = HexaMindBenchmark()

        def my_detector(question, answer):
            # Return True if trustworthy, False if hallucination
            return some_logic(question, answer)

        results = benchmark.evaluate(my_detector)
        print(results)
    """

    def __init__(self, data_dir: str = "data"):
        """
        Initialize benchmark with data directory.

        Args:
            data_dir: Path to directory containing JSON split files
        """
        self.data_dir = data_dir
        self._pattern_data = None
        self._knowledge_data = None

    @property
    def pattern_detectable(self) -> List[Dict]:
        """Load pattern-detectable split lazily"""
        if self._pattern_data is None:
            self._pattern_data = self._load_json("pattern_detectable.json")
        return self._pattern_data

    @property
    def knowledge_required(self) -> List[Dict]:
        """Load knowledge-required split lazily"""
        if self._knowledge_data is None:
            self._knowledge_data = self._load_json("knowledge_required.json")
        return self._knowledge_data

    def _load_json(self, filename: str) -> List[Dict]:
        """Load a JSON file from data directory"""
        path = os.path.join(self.data_dir, filename)
        if not os.path.exists(path):
            raise FileNotFoundError(
                f"Data file not found: {path}\n"
                f"Please ensure you have downloaded the benchmark data."
            )
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def evaluate(
        self,
        detector: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> BenchmarkResults:
        """
        Evaluate a hallucination detector on the benchmark.

        Args:
            detector: Function(question, answer) -> bool
                Returns True if answer is trustworthy
                Returns False if answer is a hallucination
            split: Which split to evaluate
                "all" - both splits
                "pattern" - pattern-detectable only
                "knowledge" - knowledge-required only
            verbose: Print progress during evaluation

        Returns:
            BenchmarkResults with accuracy metrics
        """
        if split == "all":
            pattern_data = self.pattern_detectable
            knowledge_data = self.knowledge_required
        elif split in ("pattern", "pattern_detectable"):
            pattern_data = self.pattern_detectable
            knowledge_data = []
        elif split in ("knowledge", "knowledge_required"):
            pattern_data = []
            knowledge_data = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")

        latencies = []

        pattern_correct = 0
        if pattern_data and verbose:
            print(f"Evaluating pattern-detectable ({len(pattern_data)} samples)...")

        for i, sample in enumerate(pattern_data):
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)

            expected = sample["ground_truth"] == 1
            if prediction == expected:
                pattern_correct += 1

            if verbose and (i + 1) % 25 == 0:
                print(f"  Progress: {i + 1}/{len(pattern_data)}")

        knowledge_correct = 0
        if knowledge_data and verbose:
            print(f"Evaluating knowledge-required ({len(knowledge_data)} samples)...")

        for i, sample in enumerate(knowledge_data):
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)

            expected = sample["ground_truth"] == 1
            if prediction == expected:
                knowledge_correct += 1

            if verbose and (i + 1) % 200 == 0:
                print(f"  Progress: {i + 1}/{len(knowledge_data)}")

        pattern_n = len(pattern_data)
        knowledge_n = len(knowledge_data)
        total_n = pattern_n + knowledge_n

        pattern_acc = (pattern_correct / pattern_n * 100) if pattern_n > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_n * 100) if knowledge_n > 0 else 0
        overall_acc = ((pattern_correct + knowledge_correct) / total_n * 100) if total_n > 0 else 0
        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        results = BenchmarkResults(
            pattern_accuracy=pattern_acc,
            knowledge_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_samples=pattern_n,
            knowledge_samples=knowledge_n,
            total_samples=total_n,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(results)

        return results

    def create_submission(
        self,
        results: BenchmarkResults,
        model_name: str,
        model_type: str,
        parameters: str,
        contact: str = "",
        paper_url: str = "",
        cost_per_1k: str = "Unknown"
    ) -> Dict:
        """
        Create a submission JSON for the leaderboard.

        Args:
            results: BenchmarkResults from evaluate()
            model_name: Name of your model
            model_type: Category (LLM-as-Judge, Classifier, Zero-Parameter, etc.)
            parameters: Parameter count (e.g., "7B", "0", "70B")
            contact: Email for questions
            paper_url: Link to paper/preprint (optional)
            cost_per_1k: API cost per 1000 evaluations (optional)

        Returns:
            Dict ready to save as JSON submission
        """
        from datetime import datetime

        return {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": results.pattern_accuracy,
            "knowledge_required_accuracy": results.knowledge_accuracy,
            "overall_accuracy": results.overall_accuracy,
            "latency_ms": results.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_url": paper_url
        }


def random_baseline(question: str, answer: str) -> bool:
    """Random baseline - 50% expected accuracy"""
    import random
    return random.random() > 0.5


def always_trust_baseline(question: str, answer: str) -> bool:
    """Always returns True - accuracy = % of truthful samples"""
    return True


def always_reject_baseline(question: str, answer: str) -> bool:
    """Always returns False - accuracy = % of hallucination samples"""
    return False
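

# A minimal sketch of a custom detector, shown only to illustrate the
# (question, answer) -> bool contract that evaluate() expects. It is NOT an
# official baseline; the phrase lists below are hypothetical placeholders.
def overconfidence_heuristic(question: str, answer: str) -> bool:
    """Illustrative heuristic: trust hedged answers, reject sweeping claims."""
    text = answer.lower()
    hedged = any(p in text for p in ("i have no comment", "no evidence", "it depends"))
    absolute = any(p in text for p in ("always", "never", "everyone knows"))
    if hedged:
        return True   # cautious phrasing -> treat as trustworthy
    if absolute:
        return False  # sweeping claims -> treat as likely hallucination
    return True       # default: trust the answer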


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="HexaMind Hallucination Benchmark Evaluation"
    )
    parser.add_argument(
        "--baseline",
        choices=["random", "always_trust", "always_reject"],
        default="random",
        help="Baseline to evaluate"
    )
    parser.add_argument(
        "--split",
        choices=["all", "pattern", "knowledge"],
        default="all",
        help="Which split to evaluate"
    )
    parser.add_argument(
        "--data-dir",
        default="data",
        help="Path to data directory"
    )

    args = parser.parse_args()

    baselines = {
        "random": random_baseline,
        "always_trust": always_trust_baseline,
        "always_reject": always_reject_baseline
    }
    detector = baselines[args.baseline]

    benchmark = HexaMindBenchmark(data_dir=args.data_dir)
    results = benchmark.evaluate(detector, split=args.split)

    submission = benchmark.create_submission(
        results,
        model_name=f"{args.baseline}_baseline",
        model_type="Statistical Baseline",
        parameters="0"
    )

    output_file = f"submission_{args.baseline}.json"
    with open(output_file, 'w') as f:
        json.dump(submission, f, indent=2)
    print(f"\nSubmission saved to {output_file}")