""" HexaMind Hallucination Benchmark - Evaluation Framework ======================================================== This module provides the evaluation infrastructure for the HexaMind Hallucination Benchmark. It does NOT include the HexaMind detector itself, which is available under commercial license. Usage: from benchmark import HexaMindBenchmark benchmark = HexaMindBenchmark() results = benchmark.evaluate(your_detector_function) """ import json import os import time from dataclasses import dataclass from typing import Callable, Dict, List, Optional @dataclass class BenchmarkResults: """Results from benchmark evaluation""" pattern_accuracy: float knowledge_accuracy: float overall_accuracy: float pattern_samples: int knowledge_samples: int total_samples: int avg_latency_ms: float def to_dict(self) -> Dict: return { "pattern_detectable_accuracy": round(self.pattern_accuracy, 2), "knowledge_required_accuracy": round(self.knowledge_accuracy, 2), "overall_accuracy": round(self.overall_accuracy, 2), "pattern_samples": self.pattern_samples, "knowledge_samples": self.knowledge_samples, "total_samples": self.total_samples, "avg_latency_ms": round(self.avg_latency_ms, 2) } def __repr__(self): return f""" ══════════════════════════════════════════════════════════════ HEXAMIND BENCHMARK RESULTS ══════════════════════════════════════════════════════════════ Pattern-Detectable: {self.pattern_accuracy:5.1f}% (n={self.pattern_samples}) Knowledge-Required: {self.knowledge_accuracy:5.1f}% (n={self.knowledge_samples}) ────────────────────────────────────────────────────────── Overall: {self.overall_accuracy:5.1f}% (n={self.total_samples}) Avg Latency: {self.avg_latency_ms:5.2f} ms ══════════════════════════════════════════════════════════════ """ class HexaMindBenchmark: """ Evaluation framework for the HexaMind Hallucination Benchmark. The benchmark splits TruthfulQA into: - Pattern-Detectable: Questions with linguistic markers - Knowledge-Required: Questions needing factual verification Example: benchmark = HexaMindBenchmark() def my_detector(question, answer): # Return True if trustworthy, False if hallucination return some_logic(question, answer) results = benchmark.evaluate(my_detector) print(results) """ def __init__(self, data_dir: str = "data"): """ Initialize benchmark with data directory. Args: data_dir: Path to directory containing JSON split files """ self.data_dir = data_dir self._pattern_data = None self._knowledge_data = None @property def pattern_detectable(self) -> List[Dict]: """Load pattern-detectable split lazily""" if self._pattern_data is None: self._pattern_data = self._load_json("pattern_detectable.json") return self._pattern_data @property def knowledge_required(self) -> List[Dict]: """Load knowledge-required split lazily""" if self._knowledge_data is None: self._knowledge_data = self._load_json("knowledge_required.json") return self._knowledge_data def _load_json(self, filename: str) -> List[Dict]: """Load a JSON file from data directory""" path = os.path.join(self.data_dir, filename) if not os.path.exists(path): raise FileNotFoundError( f"Data file not found: {path}\n" f"Please ensure you have downloaded the benchmark data." ) with open(path, 'r', encoding='utf-8') as f: return json.load(f) def evaluate( self, detector: Callable[[str, str], bool], split: str = "all", verbose: bool = True ) -> BenchmarkResults: """ Evaluate a hallucination detector on the benchmark. 
class HexaMindBenchmark:
    """
    Evaluation framework for the HexaMind Hallucination Benchmark.

    The benchmark splits TruthfulQA into:
      - Pattern-Detectable: Questions with linguistic markers
      - Knowledge-Required: Questions needing factual verification

    Example:
        benchmark = HexaMindBenchmark()

        def my_detector(question, answer):
            # Return True if trustworthy, False if hallucination
            return some_logic(question, answer)

        results = benchmark.evaluate(my_detector)
        print(results)
    """

    def __init__(self, data_dir: str = "data"):
        """
        Initialize benchmark with data directory.

        Args:
            data_dir: Path to directory containing JSON split files
        """
        self.data_dir = data_dir
        self._pattern_data = None
        self._knowledge_data = None

    @property
    def pattern_detectable(self) -> List[Dict]:
        """Load pattern-detectable split lazily"""
        if self._pattern_data is None:
            self._pattern_data = self._load_json("pattern_detectable.json")
        return self._pattern_data

    @property
    def knowledge_required(self) -> List[Dict]:
        """Load knowledge-required split lazily"""
        if self._knowledge_data is None:
            self._knowledge_data = self._load_json("knowledge_required.json")
        return self._knowledge_data

    def _load_json(self, filename: str) -> List[Dict]:
        """Load a JSON file from the data directory"""
        path = os.path.join(self.data_dir, filename)
        if not os.path.exists(path):
            raise FileNotFoundError(
                f"Data file not found: {path}\n"
                f"Please ensure you have downloaded the benchmark data."
            )
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def evaluate(
        self,
        detector: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> BenchmarkResults:
        """
        Evaluate a hallucination detector on the benchmark.

        Args:
            detector: Function(question, answer) -> bool.
                Returns True if the answer is trustworthy,
                False if the answer is a hallucination.
            split: Which split to evaluate:
                "all"       - both splits
                "pattern"   - pattern-detectable only
                "knowledge" - knowledge-required only
            verbose: Print progress during evaluation

        Returns:
            BenchmarkResults with accuracy metrics
        """
        # Select data based on split
        if split == "all":
            pattern_data = self.pattern_detectable
            knowledge_data = self.knowledge_required
        elif split in ("pattern", "pattern_detectable"):
            pattern_data = self.pattern_detectable
            knowledge_data = []
        elif split in ("knowledge", "knowledge_required"):
            pattern_data = []
            knowledge_data = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")

        latencies = []

        # Evaluate pattern-detectable
        pattern_correct = 0
        if pattern_data and verbose:
            print(f"Evaluating pattern-detectable ({len(pattern_data)} samples)...")
        for i, sample in enumerate(pattern_data):
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)

            expected = sample["ground_truth"] == 1
            if prediction == expected:
                pattern_correct += 1

            if verbose and (i + 1) % 25 == 0:
                print(f"  Progress: {i + 1}/{len(pattern_data)}")

        # Evaluate knowledge-required
        knowledge_correct = 0
        if knowledge_data and verbose:
            print(f"Evaluating knowledge-required ({len(knowledge_data)} samples)...")
        for i, sample in enumerate(knowledge_data):
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)

            expected = sample["ground_truth"] == 1
            if prediction == expected:
                knowledge_correct += 1

            if verbose and (i + 1) % 200 == 0:
                print(f"  Progress: {i + 1}/{len(knowledge_data)}")

        # Compute metrics
        pattern_n = len(pattern_data)
        knowledge_n = len(knowledge_data)
        total_n = pattern_n + knowledge_n

        pattern_acc = (pattern_correct / pattern_n * 100) if pattern_n > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_n * 100) if knowledge_n > 0 else 0
        overall_acc = ((pattern_correct + knowledge_correct) / total_n * 100) if total_n > 0 else 0
        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        results = BenchmarkResults(
            pattern_accuracy=pattern_acc,
            knowledge_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_samples=pattern_n,
            knowledge_samples=knowledge_n,
            total_samples=total_n,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(results)

        return results

    def create_submission(
        self,
        results: BenchmarkResults,
        model_name: str,
        model_type: str,
        parameters: str,
        contact: str = "",
        paper_url: str = "",
        cost_per_1k: str = "Unknown"
    ) -> Dict:
        """
        Create a submission JSON for the leaderboard.

        Args:
            results: BenchmarkResults from evaluate()
            model_name: Name of your model
            model_type: Category (LLM-as-Judge, Classifier, Zero-Parameter, etc.)
            parameters: Parameter count (e.g., "7B", "0", "70B")
            contact: Email for questions
            paper_url: Link to paper/preprint (optional)
            cost_per_1k: API cost per 1000 evaluations (optional)

        Returns:
            Dict ready to save as a JSON submission
        """
        from datetime import datetime

        return {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": results.pattern_accuracy,
            "knowledge_required_accuracy": results.knowledge_accuracy,
            "overall_accuracy": results.overall_accuracy,
            "latency_ms": results.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_url": paper_url
        }


# ═══════════════════════════════════════════════════════════════════════════════
# EXAMPLE BASELINES (for reference)
# ═══════════════════════════════════════════════════════════════════════════════

def random_baseline(question: str, answer: str) -> bool:
    """Random baseline - 50% expected accuracy"""
    import random
    return random.random() > 0.5


def always_trust_baseline(question: str, answer: str) -> bool:
    """Always returns True - accuracy = % of truthful samples"""
    return True


def always_reject_baseline(question: str, answer: str) -> bool:
    """Always returns False - accuracy = % of hallucination samples"""
    return False

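# ---------------------------------------------------------------------------
# Illustrative custom detector (sketch only). This is NOT the commercial
# HexaMind detector; it only shows how an arbitrary heuristic plugs into the
# detector(question, answer) -> bool interface expected by
# HexaMindBenchmark.evaluate(). The marker list is an assumption made for
# illustration and the function is not wired into the CLI baselines below.
# ---------------------------------------------------------------------------

def example_heuristic_detector(question: str, answer: str) -> bool:
    """Flag answers that lean on sweeping absolute claims as hallucinations."""
    absolute_markers = ("always", "never", "guaranteed", "100%", "impossible")
    text = answer.lower()
    # Trustworthy (True) unless the answer contains absolute language.
    return not any(marker in text for marker in absolute_markers)
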
# ═══════════════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="HexaMind Hallucination Benchmark Evaluation"
    )
    parser.add_argument(
        "--baseline",
        choices=["random", "always_trust", "always_reject"],
        default="random",
        help="Baseline to evaluate"
    )
    parser.add_argument(
        "--split",
        choices=["all", "pattern", "knowledge"],
        default="all",
        help="Which split to evaluate"
    )
    parser.add_argument(
        "--data-dir",
        default="data",
        help="Path to data directory"
    )
    args = parser.parse_args()

    # Select baseline
    baselines = {
        "random": random_baseline,
        "always_trust": always_trust_baseline,
        "always_reject": always_reject_baseline
    }
    detector = baselines[args.baseline]

    # Run evaluation
    benchmark = HexaMindBenchmark(data_dir=args.data_dir)
    results = benchmark.evaluate(detector, split=args.split)

    # Save results
    submission = benchmark.create_submission(
        results,
        model_name=f"{args.baseline}_baseline",
        model_type="Statistical Baseline",
        parameters="0"
    )

    output_file = f"submission_{args.baseline}.json"
    with open(output_file, 'w') as f:
        json.dump(submission, f, indent=2)
    print(f"\nSubmission saved to {output_file}")
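
# ---------------------------------------------------------------------------
# Programmatic usage sketch (kept as comments so nothing runs on import).
# Assumes the illustrative example_heuristic_detector defined above and the
# default "data" directory; names and paths are placeholders to adjust.
#
#     benchmark = HexaMindBenchmark(data_dir="data")
#     results = benchmark.evaluate(example_heuristic_detector)
#     submission = benchmark.create_submission(
#         results,
#         model_name="my_heuristic_v0",
#         model_type="Zero-Parameter",
#         parameters="0",
#     )
#     with open("submission_my_heuristic_v0.json", "w") as f:
#         json.dump(submission, f, indent=2)
# ---------------------------------------------------------------------------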