"""
╔════════════════════════════════════════════════════════════════════════════╗
║            HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT             ║
║   Evaluate your model on Pattern-Detectable vs Knowledge-Required splits    ║
╚════════════════════════════════════════════════════════════════════════════╝

Usage:
    from hexamind_benchmark import HexaMindBenchmark

    benchmark = HexaMindBenchmark()
    results = benchmark.evaluate(your_model_function)
"""

import json
import os
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional

@dataclass
class EvaluationResult:
    """Results from a benchmark evaluation."""
    pattern_detectable_accuracy: float
    knowledge_required_accuracy: float
    overall_accuracy: float
    pattern_detectable_samples: int
    knowledge_required_samples: int
    total_samples: int
    avg_latency_ms: float

    def to_dict(self) -> Dict:
        return {
            "pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2),
            "knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2),
            "overall_accuracy": round(self.overall_accuracy, 2),
            "pattern_detectable_samples": self.pattern_detectable_samples,
            "knowledge_required_samples": self.knowledge_required_samples,
            "total_samples": self.total_samples,
            "avg_latency_ms": round(self.avg_latency_ms, 2)
        }

    def __str__(self) -> str:
        return f"""
╔════════════════════════════════════════════════════════════════╗
║            HEXAMIND BENCHMARK EVALUATION RESULTS               ║
╠════════════════════════════════════════════════════════════════╣
║  Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}%  (n={self.pattern_detectable_samples:3d})          ║
║  Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}%  (n={self.knowledge_required_samples:3d})          ║
║  ────────────────────────────────────────────────────────────  ║
║  Overall Accuracy:            {self.overall_accuracy:6.2f}%  (n={self.total_samples:3d})          ║
║  Average Latency:             {self.avg_latency_ms:6.2f} ms                      ║
╚════════════════════════════════════════════════════════════════╝
"""
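

# Illustrative use of EvaluationResult.to_dict() (a sketch; the filename results.json and
# my_model_fn are hypothetical, chosen only for the example):
#
#   result = HexaMindBenchmark().evaluate(my_model_fn)
#   with open("results.json", "w") as f:
#       json.dump(result.to_dict(), f, indent=2)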


class HexaMindBenchmark:
    """
    HexaMind Hallucination Detection Benchmark

    Evaluates models on two splits:
    1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
    2. Knowledge-Required: Questions requiring factual verification
    """

    def __init__(self, data_dir: str = "data"):
        self.data_dir = data_dir
        self.pattern_detectable = self._load_split("pattern_detectable.json")
        self.knowledge_required = self._load_split("knowledge_required.json")

    def _load_split(self, filename: str) -> List[Dict]:
        """Load a benchmark split from JSON."""
        filepath = os.path.join(self.data_dir, filename)
        if os.path.exists(filepath):
            with open(filepath, 'r') as f:
                return json.load(f)
        else:
            print(f"Warning: {filepath} not found. Using empty list.")
            return []
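
    # Note on the expected split-file format (inferred from the fields read in evaluate();
    # not an official schema): each JSON file is assumed to hold a list of objects with
    # "question", "answer", and "ground_truth" keys, where ground_truth == 1 marks a
    # trustworthy answer and 0 marks a hallucination. A hypothetical entry:
    #   {"question": "Does cracking your knuckles cause arthritis?",
    #    "answer": "No, it does not cause arthritis.",
    #    "ground_truth": 1}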

    def evaluate(
        self,
        model_fn: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> EvaluationResult:
        """
        Evaluate a model on the benchmark.

        Args:
            model_fn: Function that takes (question, answer) and returns
                True if the answer is trustworthy, False if it is a hallucination.
            split: "all", "pattern_detectable", or "knowledge_required"
            verbose: Print progress

        Returns:
            EvaluationResult with accuracy metrics
        """
        if split == "all":
            pattern_samples = self.pattern_detectable
            knowledge_samples = self.knowledge_required
        elif split == "pattern_detectable":
            pattern_samples = self.pattern_detectable
            knowledge_samples = []
        elif split == "knowledge_required":
            pattern_samples = []
            knowledge_samples = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")

        # Pattern-Detectable split
        pattern_correct = 0
        pattern_total = 0
        latencies = []

        if pattern_samples:
            if verbose:
                print("Evaluating Pattern-Detectable split...")
            for i, sample in enumerate(pattern_samples):
                start = time.time()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.time() - start) * 1000)

                # ground_truth == 1 marks a trustworthy answer.
                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    pattern_correct += 1
                pattern_total += 1

                if verbose and (i + 1) % 50 == 0:
                    print(f"  {i + 1}/{len(pattern_samples)}...")

        # Knowledge-Required split
        knowledge_correct = 0
        knowledge_total = 0

        if knowledge_samples:
            if verbose:
                print("Evaluating Knowledge-Required split...")
            for i, sample in enumerate(knowledge_samples):
                start = time.time()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.time() - start) * 1000)

                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    knowledge_correct += 1
                knowledge_total += 1

                if verbose and (i + 1) % 100 == 0:
                    print(f"  {i + 1}/{len(knowledge_samples)}...")

        # Aggregate metrics, guarding against empty splits.
        pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0

        total_correct = pattern_correct + knowledge_correct
        total_samples = pattern_total + knowledge_total
        overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0

        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        result = EvaluationResult(
            pattern_detectable_accuracy=pattern_acc,
            knowledge_required_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_detectable_samples=pattern_total,
            knowledge_required_samples=knowledge_total,
            total_samples=total_samples,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(result)

        return result

    def create_submission(
        self,
        result: EvaluationResult,
        model_name: str,
        model_type: str,
        parameters: str,
        cost_per_1k: str = "Unknown",
        contact: str = "",
        paper_link: str = ""
    ) -> Dict:
        """Create a submission JSON for the leaderboard."""
        from datetime import datetime

        submission = {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
            "knowledge_required_accuracy": result.knowledge_required_accuracy,
            "overall_accuracy": result.overall_accuracy,
            "latency_ms": result.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_link": paper_link
        }

        return submission
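

# Illustrative adapter (a sketch, not part of the benchmark API): any scoring model can be
# plugged into HexaMindBenchmark.evaluate() by wrapping it in a callable that maps
# (question, answer) -> bool, where True means "trustworthy" and False means "hallucination".
# The name make_model_fn, its score_fn parameter, and the 0.5 threshold are illustrative
# assumptions, not names defined elsewhere in this script.
def make_model_fn(score_fn: Callable[[str, str], float],
                  threshold: float = 0.5) -> Callable[[str, str], bool]:
    """Wrap a function that returns P(answer is trustworthy) into the benchmark interface."""
    def model_fn(question: str, answer: str) -> bool:
        # Treat scores at or above the threshold as "trustworthy".
        return score_fn(question, answer) >= threshold
    return model_fn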


class HexaMindBaseline:
    """
    Zero-parameter topological hallucination detector.
    Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory.
    """

    # Phrases that typically signal a hedged or myth-correcting (trustworthy) answer.
    TRUTH_PATTERNS = [
        r"i don'?t know",
        r"it depends",
        r"there is no( scientific)? evidence",
        r"that's a myth",
        r"this is a common misconception",
        r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
        r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
    ]

    # Phrases that typically signal a superstition-affirming or overconfident (hallucinated) answer.
    HALLUC_PATTERNS = [
        r"^yes[,.]? ",
        r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
        r"seven years of bad luck",
        r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
        r"studies (have )?show(n|s)? that",
        r"everyone knows (that )?",
        r"it'?s (a )?(well[- ])?known fact",
    ]

    def __init__(self):
        import re
        self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS]
        self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS]

    def predict(self, question: str, answer: str) -> bool:
        """
        Return True if the answer appears trustworthy, False if it is likely a hallucination.
        """
        # Truth-signalling phrasing takes precedence over hallucination phrasing.
        for pattern in self.truth_re:
            if pattern.search(answer):
                return True

        for pattern in self.halluc_re:
            if pattern.search(answer):
                return False

        # No pattern matched: default to treating the answer as trustworthy.
        return True
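

# Quick illustration of the baseline's behaviour (the question/answer strings below are
# hypothetical examples, not items from the benchmark splits):
#
#   baseline = HexaMindBaseline()
#   baseline.predict("What happens if you break a mirror?",
#                    "You will have seven years of bad luck.")            # -> False
#   baseline.predict("What happens if you break a mirror?",
#                    "Nothing in particular happens; that's a myth.")     # -> True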


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation")
    parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)")
    parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)")
    parser.add_argument("--output", default=None, help="Output JSON file for submission")

    args = parser.parse_args()

    benchmark = HexaMindBenchmark()

    if args.model == "hexamind":
        model = HexaMindBaseline()
        model_fn = model.predict
        model_name = "HexaMind-S21"
        model_type = "Zero-Parameter Topological"
        params = "0"
    elif args.model == "random":
        import random
        model_fn = lambda q, a: random.random() > 0.5
        model_name = "Random Baseline"
        model_type = "Statistical"
        params = "0"
    else:
        print(f"Unknown model: {args.model}")
        raise SystemExit(1)

    result = benchmark.evaluate(model_fn, split=args.split)

    if args.output:
        submission = benchmark.create_submission(
            result, model_name, model_type, params
        )
        with open(args.output, 'w') as f:
            json.dump(submission, f, indent=2)
        print(f"Submission saved to {args.output}")
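
# Example CLI usage (the module filename hexamind_benchmark.py and submission.json are assumed):
#   python hexamind_benchmark.py --model hexamind --split all --output submission.json
#   python hexamind_benchmark.py --model random --split knowledge_required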