"""
╔════════════════════════════════════════════════════════════════════════════╗
║            HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT             ║
║   Evaluate your model on Pattern-Detectable vs Knowledge-Required splits    ║
╚════════════════════════════════════════════════════════════════════════════╝

Usage:
    from hexamind_benchmark import HexaMindBenchmark

    benchmark = HexaMindBenchmark()
    results = benchmark.evaluate(your_model_function)
"""

import json
import os
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional

@dataclass
class EvaluationResult:
    """Results from a benchmark evaluation."""
    pattern_detectable_accuracy: float
    knowledge_required_accuracy: float
    overall_accuracy: float
    pattern_detectable_samples: int
    knowledge_required_samples: int
    total_samples: int
    avg_latency_ms: float

    def to_dict(self) -> Dict:
        return {
            "pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2),
            "knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2),
            "overall_accuracy": round(self.overall_accuracy, 2),
            "pattern_detectable_samples": self.pattern_detectable_samples,
            "knowledge_required_samples": self.knowledge_required_samples,
            "total_samples": self.total_samples,
            "avg_latency_ms": round(self.avg_latency_ms, 2)
        }

    def __str__(self) -> str:
        return f"""
╔════════════════════════════════════════════════════════════════╗
║            HEXAMIND BENCHMARK EVALUATION RESULTS               ║
╠════════════════════════════════════════════════════════════════╣
║  Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}%  (n={self.pattern_detectable_samples:3d})          ║
║  Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}%  (n={self.knowledge_required_samples:3d})          ║
║  ────────────────────────────────────────────────────────────  ║
║  Overall Accuracy:            {self.overall_accuracy:6.2f}%  (n={self.total_samples:3d})          ║
║  Average Latency:             {self.avg_latency_ms:6.2f} ms                      ║
╚════════════════════════════════════════════════════════════════╝
"""
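

# Illustrative use of EvaluationResult.to_dict() (a sketch; the filename results.json and
# my_model_fn are hypothetical, chosen only for the example):
#
#   result = HexaMindBenchmark().evaluate(my_model_fn)
#   with open("results.json", "w") as f:
#       json.dump(result.to_dict(), f, indent=2)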


class HexaMindBenchmark:
    """
    HexaMind Hallucination Detection Benchmark

    Evaluates models on two splits:
    1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
    2. Knowledge-Required: Questions requiring factual verification
    """

    def __init__(self, data_dir: str = "data"):
        self.data_dir = data_dir
        self.pattern_detectable = self._load_split("pattern_detectable.json")
        self.knowledge_required = self._load_split("knowledge_required.json")

    def _load_split(self, filename: str) -> List[Dict]:
        """Load a benchmark split from JSON."""
        filepath = os.path.join(self.data_dir, filename)
        if os.path.exists(filepath):
            with open(filepath, 'r') as f:
                return json.load(f)
        else:
            print(f"Warning: {filepath} not found. Using empty list.")
            return []
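
    # Note on the expected split-file format (inferred from the fields read in evaluate();
    # not an official schema): each JSON file is assumed to hold a list of objects with
    # "question", "answer", and "ground_truth" keys, where ground_truth == 1 marks a
    # trustworthy answer and 0 marks a hallucination. A hypothetical entry:
    #   {"question": "Does cracking your knuckles cause arthritis?",
    #    "answer": "No, it does not cause arthritis.",
    #    "ground_truth": 1}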

    def evaluate(
        self,
        model_fn: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> EvaluationResult:
        """
        Evaluate a model on the benchmark.

        Args:
            model_fn: Function that takes (question, answer) and returns
                True if the answer is trustworthy, False if it is a hallucination.
            split: "all", "pattern_detectable", or "knowledge_required"
            verbose: Print progress

        Returns:
            EvaluationResult with accuracy metrics
        """
        if split == "all":
            pattern_samples = self.pattern_detectable
            knowledge_samples = self.knowledge_required
        elif split == "pattern_detectable":
            pattern_samples = self.pattern_detectable
            knowledge_samples = []
        elif split == "knowledge_required":
            pattern_samples = []
            knowledge_samples = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")

        # Pattern-Detectable split
        pattern_correct = 0
        pattern_total = 0
        latencies = []

        if pattern_samples:
            if verbose:
                print("Evaluating Pattern-Detectable split...")
            for i, sample in enumerate(pattern_samples):
                start = time.time()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.time() - start) * 1000)

                # ground_truth == 1 marks a trustworthy answer.
                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    pattern_correct += 1
                pattern_total += 1

                if verbose and (i + 1) % 50 == 0:
                    print(f"  {i + 1}/{len(pattern_samples)}...")

        # Knowledge-Required split
        knowledge_correct = 0
        knowledge_total = 0

        if knowledge_samples:
            if verbose:
                print("Evaluating Knowledge-Required split...")
            for i, sample in enumerate(knowledge_samples):
                start = time.time()
                prediction = model_fn(sample["question"], sample["answer"])
                latencies.append((time.time() - start) * 1000)

                expected = sample["ground_truth"] == 1
                if prediction == expected:
                    knowledge_correct += 1
                knowledge_total += 1

                if verbose and (i + 1) % 100 == 0:
                    print(f"  {i + 1}/{len(knowledge_samples)}...")

        # Aggregate metrics, guarding against empty splits.
        pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0

        total_correct = pattern_correct + knowledge_correct
        total_samples = pattern_total + knowledge_total
        overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0

        avg_latency = sum(latencies) / len(latencies) if latencies else 0

        result = EvaluationResult(
            pattern_detectable_accuracy=pattern_acc,
            knowledge_required_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_detectable_samples=pattern_total,
            knowledge_required_samples=knowledge_total,
            total_samples=total_samples,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(result)

        return result

    def create_submission(
        self,
        result: EvaluationResult,
        model_name: str,
        model_type: str,
        parameters: str,
        cost_per_1k: str = "Unknown",
        contact: str = "",
        paper_link: str = ""
    ) -> Dict:
        """Create a submission JSON for the leaderboard."""
        from datetime import datetime

        submission = {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
            "knowledge_required_accuracy": result.knowledge_required_accuracy,
            "overall_accuracy": result.overall_accuracy,
            "latency_ms": result.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_link": paper_link
        }

        return submission
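

# Illustrative adapter (a sketch, not part of the benchmark API): any scoring model can be
# plugged into HexaMindBenchmark.evaluate() by wrapping it in a callable that maps
# (question, answer) -> bool, where True means "trustworthy" and False means "hallucination".
# The name make_model_fn, its score_fn parameter, and the 0.5 threshold are illustrative
# assumptions, not names defined elsewhere in this script.
def make_model_fn(score_fn: Callable[[str, str], float],
                  threshold: float = 0.5) -> Callable[[str, str], bool]:
    """Wrap a function that returns P(answer is trustworthy) into the benchmark interface."""
    def model_fn(question: str, answer: str) -> bool:
        # Treat scores at or above the threshold as "trustworthy".
        return score_fn(question, answer) >= threshold
    return model_fn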


class HexaMindBaseline:
    """
    Zero-parameter topological hallucination detector.
    Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory.
    """

    # Phrases that typically signal a hedged or myth-correcting (trustworthy) answer.
    TRUTH_PATTERNS = [
        r"i don'?t know",
        r"it depends",
        r"there is no( scientific)? evidence",
        r"that's a myth",
        r"this is a common misconception",
        r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
        r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
    ]

    # Phrases that typically signal a superstition-affirming or overconfident (hallucinated) answer.
    HALLUC_PATTERNS = [
        r"^yes[,.]? ",
        r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
        r"seven years of bad luck",
        r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
        r"studies (have )?show(n|s)? that",
        r"everyone knows (that )?",
        r"it'?s (a )?(well[- ])?known fact",
    ]

    def __init__(self):
        import re
        self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS]
        self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS]

    def predict(self, question: str, answer: str) -> bool:
        """
        Return True if the answer appears trustworthy, False if it is likely a hallucination.
        """
        # Truth-signalling phrasing takes precedence over hallucination phrasing.
        for pattern in self.truth_re:
            if pattern.search(answer):
                return True

        for pattern in self.halluc_re:
            if pattern.search(answer):
                return False

        # No pattern matched: default to treating the answer as trustworthy.
        return True
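

# Quick illustration of the baseline's behaviour (the question/answer strings below are
# hypothetical examples, not items from the benchmark splits):
#
#   baseline = HexaMindBaseline()
#   baseline.predict("What happens if you break a mirror?",
#                    "You will have seven years of bad luck.")            # -> False
#   baseline.predict("What happens if you break a mirror?",
#                    "Nothing in particular happens; that's a myth.")     # -> True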


if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation")
    parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)")
    parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)")
    parser.add_argument("--output", default=None, help="Output JSON file for submission")

    args = parser.parse_args()

    benchmark = HexaMindBenchmark()

    if args.model == "hexamind":
        model = HexaMindBaseline()
        model_fn = model.predict
        model_name = "HexaMind-S21"
        model_type = "Zero-Parameter Topological"
        params = "0"
    elif args.model == "random":
        import random
        model_fn = lambda q, a: random.random() > 0.5
        model_name = "Random Baseline"
        model_type = "Statistical"
        params = "0"
    else:
        print(f"Unknown model: {args.model}")
        raise SystemExit(1)

    result = benchmark.evaluate(model_fn, split=args.split)

    if args.output:
        submission = benchmark.create_submission(
            result, model_name, model_type, params
        )
        with open(args.output, 'w') as f:
            json.dump(submission, f, indent=2)
        print(f"Submission saved to {args.output}")
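
# Example CLI usage (the module filename hexamind_benchmark.py and submission.json are assumed):
#   python hexamind_benchmark.py --model hexamind --split all --output submission.json
#   python hexamind_benchmark.py --model random --split knowledge_required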