Spaces:

s21mind
/

S21MIND

Running

File size: 12,417 Bytes

d98b6f9

"""
HexaMind Hallucination Benchmark - Evaluation Framework
========================================================

This module provides the evaluation infrastructure for the HexaMind 
Hallucination Benchmark. It does NOT include the HexaMind detector itself,
which is available under commercial license.

Usage:
    from benchmark import HexaMindBenchmark
    
    benchmark = HexaMindBenchmark()
    results = benchmark.evaluate(your_detector_function)
"""

import json
import os
import time
from dataclasses import dataclass
from typing import Callable, Dict, List, Optional


@dataclass
class BenchmarkResults:
    """Results from benchmark evaluation"""
    pattern_accuracy: float
    knowledge_accuracy: float
    overall_accuracy: float
    pattern_samples: int
    knowledge_samples: int
    total_samples: int
    avg_latency_ms: float
    
    def to_dict(self) -> Dict:
        return {
            "pattern_detectable_accuracy": round(self.pattern_accuracy, 2),
            "knowledge_required_accuracy": round(self.knowledge_accuracy, 2),
            "overall_accuracy": round(self.overall_accuracy, 2),
            "pattern_samples": self.pattern_samples,
            "knowledge_samples": self.knowledge_samples,
            "total_samples": self.total_samples,
            "avg_latency_ms": round(self.avg_latency_ms, 2)
        }
    
    def __repr__(self):
        return f"""
══════════════════════════════════════════════════════════════
           HEXAMIND BENCHMARK RESULTS
══════════════════════════════════════════════════════════════
  Pattern-Detectable:  {self.pattern_accuracy:5.1f}%  (n={self.pattern_samples})
  Knowledge-Required:  {self.knowledge_accuracy:5.1f}%  (n={self.knowledge_samples})
  ──────────────────────────────────────────────────────────
  Overall:             {self.overall_accuracy:5.1f}%  (n={self.total_samples})
  Avg Latency:         {self.avg_latency_ms:5.2f} ms
══════════════════════════════════════════════════════════════
"""


class HexaMindBenchmark:
    """
    Evaluation framework for the HexaMind Hallucination Benchmark.
    
    The benchmark splits TruthfulQA into:
    - Pattern-Detectable: Questions with linguistic markers
    - Knowledge-Required: Questions needing factual verification
    
    Example:
        benchmark = HexaMindBenchmark()
        
        def my_detector(question, answer):
            # Return True if trustworthy, False if hallucination
            return some_logic(question, answer)
        
        results = benchmark.evaluate(my_detector)
        print(results)
    """
    
    def __init__(self, data_dir: str = "data"):
        """
        Initialize benchmark with data directory.
        
        Args:
            data_dir: Path to directory containing JSON split files
        """
        self.data_dir = data_dir
        self._pattern_data = None
        self._knowledge_data = None
    
    @property
    def pattern_detectable(self) -> List[Dict]:
        """Load pattern-detectable split lazily"""
        if self._pattern_data is None:
            self._pattern_data = self._load_json("pattern_detectable.json")
        return self._pattern_data
    
    @property
    def knowledge_required(self) -> List[Dict]:
        """Load knowledge-required split lazily"""
        if self._knowledge_data is None:
            self._knowledge_data = self._load_json("knowledge_required.json")
        return self._knowledge_data
    
    def _load_json(self, filename: str) -> List[Dict]:
        """Load a JSON file from data directory"""
        path = os.path.join(self.data_dir, filename)
        if not os.path.exists(path):
            raise FileNotFoundError(
                f"Data file not found: {path}\n"
                f"Please ensure you have downloaded the benchmark data."
            )
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)
    
    def evaluate(
        self,
        detector: Callable[[str, str], bool],
        split: str = "all",
        verbose: bool = True
    ) -> BenchmarkResults:
        """
        Evaluate a hallucination detector on the benchmark.
        
        Args:
            detector: Function(question, answer) -> bool
                      Returns True if answer is trustworthy
                      Returns False if answer is a hallucination
            split: Which split to evaluate
                   "all" - both splits
                   "pattern" - pattern-detectable only
                   "knowledge" - knowledge-required only
            verbose: Print progress during evaluation
            
        Returns:
            BenchmarkResults with accuracy metrics
        """
        # Select data based on split
        if split == "all":
            pattern_data = self.pattern_detectable
            knowledge_data = self.knowledge_required
        elif split in ("pattern", "pattern_detectable"):
            pattern_data = self.pattern_detectable
            knowledge_data = []
        elif split in ("knowledge", "knowledge_required"):
            pattern_data = []
            knowledge_data = self.knowledge_required
        else:
            raise ValueError(f"Unknown split: {split}")
        
        latencies = []
        
        # Evaluate pattern-detectable
        pattern_correct = 0
        if pattern_data and verbose:
            print(f"Evaluating pattern-detectable ({len(pattern_data)} samples)...")
        
        for i, sample in enumerate(pattern_data):
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)
            
            expected = sample["ground_truth"] == 1
            if prediction == expected:
                pattern_correct += 1
            
            if verbose and (i + 1) % 25 == 0:
                print(f"  Progress: {i + 1}/{len(pattern_data)}")
        
        # Evaluate knowledge-required
        knowledge_correct = 0
        if knowledge_data and verbose:
            print(f"Evaluating knowledge-required ({len(knowledge_data)} samples)...")
        
        for i, sample in enumerate(knowledge_data):
            start = time.perf_counter()
            prediction = detector(sample["question"], sample["answer"])
            latencies.append((time.perf_counter() - start) * 1000)
            
            expected = sample["ground_truth"] == 1
            if prediction == expected:
                knowledge_correct += 1
            
            if verbose and (i + 1) % 200 == 0:
                print(f"  Progress: {i + 1}/{len(knowledge_data)}")
        
        # Compute metrics
        pattern_n = len(pattern_data)
        knowledge_n = len(knowledge_data)
        total_n = pattern_n + knowledge_n
        
        pattern_acc = (pattern_correct / pattern_n * 100) if pattern_n > 0 else 0
        knowledge_acc = (knowledge_correct / knowledge_n * 100) if knowledge_n > 0 else 0
        overall_acc = ((pattern_correct + knowledge_correct) / total_n * 100) if total_n > 0 else 0
        avg_latency = sum(latencies) / len(latencies) if latencies else 0
        
        results = BenchmarkResults(
            pattern_accuracy=pattern_acc,
            knowledge_accuracy=knowledge_acc,
            overall_accuracy=overall_acc,
            pattern_samples=pattern_n,
            knowledge_samples=knowledge_n,
            total_samples=total_n,
            avg_latency_ms=avg_latency
        )
        
        if verbose:
            print(results)
        
        return results
    
    def create_submission(
        self,
        results: BenchmarkResults,
        model_name: str,
        model_type: str,
        parameters: str,
        contact: str = "",
        paper_url: str = "",
        cost_per_1k: str = "Unknown"
    ) -> Dict:
        """
        Create a submission JSON for the leaderboard.
        
        Args:
            results: BenchmarkResults from evaluate()
            model_name: Name of your model
            model_type: Category (LLM-as-Judge, Classifier, Zero-Parameter, etc.)
            parameters: Parameter count (e.g., "7B", "0", "70B")
            contact: Email for questions
            paper_url: Link to paper/preprint (optional)
            cost_per_1k: API cost per 1000 evaluations (optional)
            
        Returns:
            Dict ready to save as JSON submission
        """
        from datetime import datetime
        
        return {
            "model_name": model_name,
            "model_type": model_type,
            "parameters": parameters,
            "pattern_detectable_accuracy": results.pattern_accuracy,
            "knowledge_required_accuracy": results.knowledge_accuracy,
            "overall_accuracy": results.overall_accuracy,
            "latency_ms": results.avg_latency_ms,
            "cost_per_1k": cost_per_1k,
            "submission_date": datetime.now().strftime("%Y-%m-%d"),
            "contact": contact,
            "paper_url": paper_url
        }


# ═══════════════════════════════════════════════════════════════════════════════
# EXAMPLE BASELINES (for reference)
# ═══════════════════════════════════════════════════════════════════════════════

def random_baseline(question: str, answer: str) -> bool:
    """Random baseline - 50% expected accuracy"""
    import random
    return random.random() > 0.5


def always_trust_baseline(question: str, answer: str) -> bool:
    """Always returns True - accuracy = % of truthful samples"""
    return True


def always_reject_baseline(question: str, answer: str) -> bool:
    """Always returns False - accuracy = % of hallucination samples"""
    return False


# ═══════════════════════════════════════════════════════════════════════════════
# CLI
# ═══════════════════════════════════════════════════════════════════════════════

if __name__ == "__main__":
    import argparse
    
    parser = argparse.ArgumentParser(
        description="HexaMind Hallucination Benchmark Evaluation"
    )
    parser.add_argument(
        "--baseline", 
        choices=["random", "always_trust", "always_reject"],
        default="random",
        help="Baseline to evaluate"
    )
    parser.add_argument(
        "--split",
        choices=["all", "pattern", "knowledge"],
        default="all",
        help="Which split to evaluate"
    )
    parser.add_argument(
        "--data-dir",
        default="data",
        help="Path to data directory"
    )
    
    args = parser.parse_args()
    
    # Select baseline
    baselines = {
        "random": random_baseline,
        "always_trust": always_trust_baseline,
        "always_reject": always_reject_baseline
    }
    detector = baselines[args.baseline]
    
    # Run evaluation
    benchmark = HexaMindBenchmark(data_dir=args.data_dir)
    results = benchmark.evaluate(detector, split=args.split)
    
    # Save results
    submission = benchmark.create_submission(
        results,
        model_name=f"{args.baseline}_baseline",
        model_type="Statistical Baseline",
        parameters="0"
    )
    
    output_file = f"submission_{args.baseline}.json"
    with open(output_file, 'w') as f:
        json.dump(submission, f, indent=2)
    print(f"\nSubmission saved to {output_file}")