""" ╔══════════════════════════════════════════════════════════════════════════════╗ ║ HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT ║ ║ Evaluate your model on Pattern-Detectable vs Knowledge-Required splits ║ ╚══════════════════════════════════════════════════════════════════════════════╝ Usage: from hexamind_benchmark import HexaMindBenchmark benchmark = HexaMindBenchmark() results = benchmark.evaluate(your_model_function) """ import json import os from typing import Callable, Dict, List, Optional from dataclasses import dataclass import time @dataclass class EvaluationResult: """Results from benchmark evaluation""" pattern_detectable_accuracy: float knowledge_required_accuracy: float overall_accuracy: float pattern_detectable_samples: int knowledge_required_samples: int total_samples: int avg_latency_ms: float def to_dict(self) -> Dict: return { "pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2), "knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2), "overall_accuracy": round(self.overall_accuracy, 2), "pattern_detectable_samples": self.pattern_detectable_samples, "knowledge_required_samples": self.knowledge_required_samples, "total_samples": self.total_samples, "avg_latency_ms": round(self.avg_latency_ms, 2) } def __str__(self) -> str: return f""" ╔══════════════════════════════════════════════════════════════╗ ║ HEXAMIND BENCHMARK EVALUATION RESULTS ║ ╠══════════════════════════════════════════════════════════════╣ ║ Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}% (n={self.pattern_detectable_samples:3d}) ║ ║ Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}% (n={self.knowledge_required_samples:3d}) ║ ║ ──────────────────────────────────────────────────────── ║ ║ Overall Accuracy: {self.overall_accuracy:6.2f}% (n={self.total_samples:3d}) ║ ║ Average Latency: {self.avg_latency_ms:6.2f} ms ║ ╚══════════════════════════════════════════════════════════════╝ """ class HexaMindBenchmark: """ HexaMind Hallucination Detection Benchmark Evaluates models on two splits: 1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations 2. Knowledge-Required: Questions requiring factual verification """ def __init__(self, data_dir: str = "data"): self.data_dir = data_dir self.pattern_detectable = self._load_split("pattern_detectable.json") self.knowledge_required = self._load_split("knowledge_required.json") def _load_split(self, filename: str) -> List[Dict]: """Load a benchmark split from JSON""" filepath = os.path.join(self.data_dir, filename) if os.path.exists(filepath): with open(filepath, 'r') as f: return json.load(f) else: print(f"Warning: {filepath} not found. Using empty list.") return [] def evaluate( self, model_fn: Callable[[str, str], bool], split: str = "all", verbose: bool = True ) -> EvaluationResult: """ Evaluate a model on the benchmark. Args: model_fn: Function that takes (question, answer) and returns: True if answer is trustworthy, False if hallucination split: "all", "pattern_detectable", or "knowledge_required" verbose: Print progress Returns: EvaluationResult with accuracy metrics """ # Select splits to evaluate if split == "all": pattern_samples = self.pattern_detectable knowledge_samples = self.knowledge_required elif split == "pattern_detectable": pattern_samples = self.pattern_detectable knowledge_samples = [] elif split == "knowledge_required": pattern_samples = [] knowledge_samples = self.knowledge_required else: raise ValueError(f"Unknown split: {split}") # Evaluate pattern-detectable pattern_correct = 0 pattern_total = 0 latencies = [] if pattern_samples: if verbose: print("Evaluating Pattern-Detectable split...") for i, sample in enumerate(pattern_samples): start = time.time() prediction = model_fn(sample["question"], sample["answer"]) latencies.append((time.time() - start) * 1000) expected = sample["ground_truth"] == 1 if prediction == expected: pattern_correct += 1 pattern_total += 1 if verbose and (i + 1) % 50 == 0: print(f" {i + 1}/{len(pattern_samples)}...") # Evaluate knowledge-required knowledge_correct = 0 knowledge_total = 0 if knowledge_samples: if verbose: print("Evaluating Knowledge-Required split...") for i, sample in enumerate(knowledge_samples): start = time.time() prediction = model_fn(sample["question"], sample["answer"]) latencies.append((time.time() - start) * 1000) expected = sample["ground_truth"] == 1 if prediction == expected: knowledge_correct += 1 knowledge_total += 1 if verbose and (i + 1) % 100 == 0: print(f" {i + 1}/{len(knowledge_samples)}...") # Compute metrics pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0 knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0 total_correct = pattern_correct + knowledge_correct total_samples = pattern_total + knowledge_total overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0 avg_latency = sum(latencies) / len(latencies) if latencies else 0 result = EvaluationResult( pattern_detectable_accuracy=pattern_acc, knowledge_required_accuracy=knowledge_acc, overall_accuracy=overall_acc, pattern_detectable_samples=pattern_total, knowledge_required_samples=knowledge_total, total_samples=total_samples, avg_latency_ms=avg_latency ) if verbose: print(result) return result def create_submission( self, result: EvaluationResult, model_name: str, model_type: str, parameters: str, cost_per_1k: str = "Unknown", contact: str = "", paper_link: str = "" ) -> Dict: """Create a submission JSON for the leaderboard""" from datetime import datetime submission = { "model_name": model_name, "model_type": model_type, "parameters": parameters, "pattern_detectable_accuracy": result.pattern_detectable_accuracy, "knowledge_required_accuracy": result.knowledge_required_accuracy, "overall_accuracy": result.overall_accuracy, "latency_ms": result.avg_latency_ms, "cost_per_1k": cost_per_1k, "submission_date": datetime.now().strftime("%Y-%m-%d"), "contact": contact, "paper_link": paper_link } return submission # ═══════════════════════════════════════════════════════════════════════════════ # EXAMPLE: HEXAMIND BASELINE # ═══════════════════════════════════════════════════════════════════════════════ class HexaMindBaseline: """ Zero-parameter topological hallucination detector. Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory. """ # Definitive truth patterns TRUTH_PATTERNS = [ r"i don'?t know", r"it depends", r"there is no( scientific)? evidence", r"that's a myth", r"this is a common misconception", r"nothing (in particular |really |unusual )?(happens|occurs|will happen)", r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)", ] # Definitive hallucination patterns HALLUC_PATTERNS = [ r"^yes[,.]? ", r"will (bring|cause|give|attract) (you )?(good |bad )?luck", r"seven years of bad luck", r"(ancient|eastern|chinese) wisdom (says|teaches|tells)", r"studies (have )?show(n|s)? that", r"everyone knows (that )?", r"it'?s (a )?(well[- ])?known fact", ] def __init__(self): import re self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS] self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS] def predict(self, question: str, answer: str) -> bool: """ Returns True if answer appears trustworthy, False if likely hallucination. """ # Check for truth patterns for pattern in self.truth_re: if pattern.search(answer): return True # Check for hallucination patterns for pattern in self.halluc_re: if pattern.search(answer): return False # Default: uncertain, assume trustworthy return True # ═══════════════════════════════════════════════════════════════════════════════ # CLI # ═══════════════════════════════════════════════════════════════════════════════ if __name__ == "__main__": import argparse parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation") parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)") parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)") parser.add_argument("--output", default=None, help="Output JSON file for submission") args = parser.parse_args() # Load benchmark benchmark = HexaMindBenchmark() # Select model if args.model == "hexamind": model = HexaMindBaseline() model_fn = model.predict model_name = "HexaMind-S21" model_type = "Zero-Parameter Topological" params = "0" elif args.model == "random": import random model_fn = lambda q, a: random.random() > 0.5 model_name = "Random Baseline" model_type = "Statistical" params = "0" else: print(f"Unknown model: {args.model}") exit(1) # Evaluate result = benchmark.evaluate(model_fn, split=args.split) # Save submission if requested if args.output: submission = benchmark.create_submission( result, model_name, model_type, params ) with open(args.output, 'w') as f: json.dump(submission, f, indent=2) print(f"Submission saved to {args.output}")