Spaces:

s21mind
/

S21MIND

Running

File size: 12,922 Bytes

ca0826b

"""
╔══════════════════════════════════════════════════════════════════════════════╗
║     HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT                     ║
║     Evaluate your model on Pattern-Detectable vs Knowledge-Required splits   ║
╚══════════════════════════════════════════════════════════════════════════════╝

Usage:
    from hexamind_benchmark import HexaMindBenchmark
    
    benchmark = HexaMindBenchmark()
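    # your_model_function(question, answer) must return True if the answer
    # is trustworthy and False if it is a hallucination.
    results = benchmark.evaluate(your_model_function)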
"""

import json
import os
from typing import Callable, Dict, List, Optional
from dataclasses import dataclass
import time

@dataclass
class EvaluationResult:
    """Metrics collected from a single benchmark run.

    The three ``*_accuracy`` fields are rendered with a ``%`` sign, the
    ``*_samples`` fields are per-split and overall sample counts, and
    ``avg_latency_ms`` is rendered in milliseconds.
    """
    pattern_detectable_accuracy: float
    knowledge_required_accuracy: float
    overall_accuracy: float
    pattern_detectable_samples: int
    knowledge_required_samples: int
    total_samples: int
    avg_latency_ms: float

    def to_dict(self) -> Dict:
        """Return the metrics as a plain dict; float metrics are rounded to 2 places."""
        payload = {}
        # Insertion order is kept deliberately: accuracies, counts, then latency.
        payload["pattern_detectable_accuracy"] = round(self.pattern_detectable_accuracy, 2)
        payload["knowledge_required_accuracy"] = round(self.knowledge_required_accuracy, 2)
        payload["overall_accuracy"] = round(self.overall_accuracy, 2)
        payload["pattern_detectable_samples"] = self.pattern_detectable_samples
        payload["knowledge_required_samples"] = self.knowledge_required_samples
        payload["total_samples"] = self.total_samples
        payload["avg_latency_ms"] = round(self.avg_latency_ms, 2)
        return payload

    def __str__(self) -> str:
        """Render the metrics as a boxed table, with a leading and trailing newline."""
        pd_acc = self.pattern_detectable_accuracy
        pd_n = self.pattern_detectable_samples
        kr_acc = self.knowledge_required_accuracy
        kr_n = self.knowledge_required_samples
        all_acc = self.overall_accuracy
        all_n = self.total_samples
        latency = self.avg_latency_ms

        # The empty first and last rows produce the newline that opens and
        # closes the rendered table once the rows are joined.
        rows = [
            "",
            "╔══════════════════════════════════════════════════════════════╗",
            "║          HEXAMIND BENCHMARK EVALUATION RESULTS               ║",
            "╠══════════════════════════════════════════════════════════════╣",
            f"║  Pattern-Detectable Accuracy:  {pd_acc:6.2f}%  (n={pd_n:3d})    ║",
            f"║  Knowledge-Required Accuracy:  {kr_acc:6.2f}%  (n={kr_n:3d})    ║",
            "║  ────────────────────────────────────────────────────────    ║",
            f"║  Overall Accuracy:             {all_acc:6.2f}%  (n={all_n:3d})    ║",
            f"║  Average Latency:              {latency:6.2f} ms                  ║",
            "╚══════════════════════════════════════════════════════════════╝",
            "",
        ]
        return "\n".join(rows)


class HexaMindBenchmark:
    """
    HexaMind Hallucination Detection Benchmark

    Evaluates models on two splits:
    1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
    2. Knowledge-Required: Questions requiring factual verification

    Each split is read from a JSON file under ``data_dir``. Samples are
    accessed through their ``"question"``, ``"answer"`` and ``"ground_truth"``
    keys; ``ground_truth == 1`` is treated as "the answer is trustworthy".
    """

    # Values accepted by the ``split`` argument of ``evaluate``.
    _VALID_SPLITS = ("all", "pattern_detectable", "knowledge_required")

    def __init__(self, data_dir: str = "data"):
        """Load both splits from ``data_dir``; a missing file yields an empty split."""
        self.data_dir = data_dir
        self.pattern_detectable = self._load_split("pattern_detectable.json")
        self.knowledge_required = self._load_split("knowledge_required.json")

    def _load_split(self, filename: str) -> List[Dict]:
        """Load a benchmark split from JSON.

        Args:
            filename: File name of the split, relative to ``self.data_dir``.

        Returns:
            The parsed JSON content, or an empty list (after printing a
            warning) when the file does not exist.
        """
        filepath = os.path.join(self.data_dir, filename)
        if not os.path.exists(filepath):
            print(f"Warning: {filepath} not found. Using empty list.")
            return []
        # Read as UTF-8 explicitly so parsing does not depend on the
        # platform's locale encoding.
        with open(filepath, "r", encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _score_split(
        samples: List[Dict],
        model_fn: Callable[[str, str], bool],
        label: str,
        progress_every: int,
        verbose: bool,
    ) -> tuple:
        """Run ``model_fn`` over one split and tally correct predictions.

        Args:
            samples: Samples of the split; may be empty.
            model_fn: Callable invoked as ``model_fn(question, answer)``.
            label: Human-readable split name used in progress output.
            progress_every: Print a progress line every this many samples.
            verbose: Whether to print progress at all.

        Returns:
            ``(correct, total, latencies)`` where ``latencies`` holds one
            per-call duration in milliseconds for every sample.
        """
        if not samples:
            return 0, 0, []

        if verbose:
            print(f"Evaluating {label} split...")

        correct = 0
        latencies = []
        total = len(samples)
        for done, sample in enumerate(samples, start=1):
            # perf_counter is monotonic and high-resolution, so short calls
            # are timed reliably even if the system clock is adjusted.
            start = time.perf_counter()
            prediction = model_fn(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)

            expected = sample["ground_truth"] == 1
            if prediction == expected:
                correct += 1

            if verbose and done % progress_every == 0:
                print(f"  {done}/{total}...")

        return correct, total, latencies

    @staticmethod
    def _percent(correct: int, total: int) -> float:
        """Return ``correct / total`` as a percentage, or 0.0 when ``total`` is 0."""
        return (correct / total * 100) if total > 0 else 0.0

    def evaluate(
        self,
        model_fn: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> EvaluationResult:
        """
        Evaluate a model on the benchmark.

        Args:
            model_fn: Function that takes (question, answer) and returns:
                      True if answer is trustworthy, False if hallucination
            split: "all", "pattern_detectable", or "knowledge_required"
            verbose: Print progress

        Returns:
            EvaluationResult with accuracy metrics. A split that was not
            evaluated (or is empty) reports 0.0 accuracy and 0 samples.

        Raises:
            ValueError: If ``split`` is not one of the accepted values.
        """
        # Validate up front so no model calls are made for a bad split name.
        if split not in self._VALID_SPLITS:
            raise ValueError(f"Unknown split: {split}")

        pattern_samples = [] if split == "knowledge_required" else self.pattern_detectable
        knowledge_samples = [] if split == "pattern_detectable" else self.knowledge_required

        pattern_correct, pattern_total, pattern_latencies = self._score_split(
            pattern_samples, model_fn, "Pattern-Detectable", 50, verbose
        )
        knowledge_correct, knowledge_total, knowledge_latencies = self._score_split(
            knowledge_samples, model_fn, "Knowledge-Required", 100, verbose
        )

        # Compute metrics
        total_correct = pattern_correct + knowledge_correct
        total_samples = pattern_total + knowledge_total
        latencies = pattern_latencies + knowledge_latencies
        avg_latency = sum(latencies) / len(latencies) if latencies else 0.0

        result = EvaluationResult(
            pattern_detectable_accuracy=self._percent(pattern_correct, pattern_total),
            knowledge_required_accuracy=self._percent(knowledge_correct, knowledge_total),
            overall_accuracy=self._percent(total_correct, total_samples),
            pattern_detectable_samples=pattern_total,
            knowledge_required_samples=knowledge_total,
            total_samples=total_samples,
            avg_latency_ms=avg_latency
        )

        if verbose:
            print(result)

        return result

    def create_submission(
        self,
        result: EvaluationResult,
        model_name: str,
        model_type: str,
        parameters: str,
        cost_per_1k: str = "Unknown",
        contact: str = "",
        paper_link: str = ""
    ) -> Dict:
        """Create a submission JSON for the leaderboard.

        Args:
            result: Metrics returned by ``evaluate``; values are copied unrounded.
            model_name: Value stored under ``"model_name"``.
            model_type: Value stored under ``"model_type"``.
            parameters: Value stored under ``"parameters"``.
            cost_per_1k: Value stored under ``"cost_per_1k"``.
            contact: Value stored under ``"contact"``.
            paper_link: Value stored under ``"paper_link"``.

        Returns:
            A dict with the fields above plus ``"submission_date"`` set to
            the current local date formatted as ``YYYY-MM-DD``.
        """
        from datetime import datetime

        return {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
            "knowledge_required_accuracy": result.knowledge_required_accuracy,
            "overall_accuracy": result.overall_accuracy,
            "latency_ms": result.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_link": paper_link
        }


# ═══════════════════════════════════════════════════════════════════════════════
# EXAMPLE: HEXAMIND BASELINE
# ═══════════════════════════════════════════════════════════════════════════════

class HexaMindBaseline:
    """
    Rule-based hallucination detector with no learned parameters.

    An answer is classified purely by searching its text against two fixed
    lists of case-insensitive regular expressions.
    """

    # Phrases that mark an answer as trustworthy
    TRUTH_PATTERNS = [
        r"i don'?t know",
        r"it depends",
        r"there is no( scientific)? evidence",
        r"that's a myth",
        r"this is a common misconception",
        r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
        r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
    ]

    # Phrases that mark an answer as a likely hallucination
    HALLUC_PATTERNS = [
        r"^yes[,.]? ",
        r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
        r"seven years of bad luck",
        r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
        r"studies (have )?show(n|s)? that",
        r"everyone knows (that )?",
        r"it'?s (a )?(well[- ])?known fact",
    ]

    def __init__(self):
        """Compile both pattern lists once so ``predict`` can reuse them."""
        import re

        def _compile_all(sources):
            return [re.compile(src, re.IGNORECASE) for src in sources]

        self.truth_re = _compile_all(self.TRUTH_PATTERNS)
        self.halluc_re = _compile_all(self.HALLUC_PATTERNS)

    def predict(self, question: str, answer: str) -> bool:
        """
        Classify ``answer``: True means trustworthy, False means likely hallucination.

        Only ``answer`` is inspected; ``question`` is accepted but not used.
        """
        # Truth patterns are consulted first, so they win when both kinds match.
        if any(rx.search(answer) for rx in self.truth_re):
            return True

        # With no truth match, any hallucination pattern flips the verdict;
        # an answer matching neither list is assumed trustworthy.
        flagged = any(rx.search(answer) for rx in self.halluc_re)
        return not flagged


# ═══════════════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    # Command-line entry point: evaluate one of the built-in models and
    # optionally write a leaderboard submission file.
    import argparse

    parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation")
    parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)")
    parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)")
    parser.add_argument("--output", default=None, help="Output JSON file for submission")

    args = parser.parse_args()

    # Load benchmark
    benchmark = HexaMindBenchmark()

    # Select model
    if args.model == "hexamind":
        model = HexaMindBaseline()
        model_fn = model.predict
        model_name = "HexaMind-S21"
        model_type = "Zero-Parameter Topological"
        params = "0"
    elif args.model == "random":
        import random

        def model_fn(question, answer):
            """Coin-flip baseline; both arguments are ignored."""
            return random.random() > 0.5

        model_name = "Random Baseline"
        model_type = "Statistical"
        params = "0"
    else:
        # The bare `exit` helper is installed by the `site` module for
        # interactive sessions and is not guaranteed to exist in every run.
        # SystemExit with a message prints it to stderr and exits with
        # status 1.
        raise SystemExit(f"Unknown model: {args.model}")

    # Evaluate
    result = benchmark.evaluate(model_fn, split=args.split)

    # Save submission if requested
    if args.output:
        submission = benchmark.create_submission(
            result, model_name, model_type, params
        )
        # Write UTF-8 explicitly so the file does not depend on the locale.
        with open(args.output, "w", encoding="utf-8") as f:
            json.dump(submission, f, indent=2)
        print(f"Submission saved to {args.output}")