s21mind committed
Commit ca0826b · verified · 1 Parent(s): 77fdbf9

Upload hexamind_benchmark.py

Files changed (1)
  1. hexamind_benchmark.py +307 -0
hexamind_benchmark.py ADDED
@@ -0,0 +1,307 @@
+"""
+╔══════════════════════════════════════════════════════════════════════════════╗
+║  HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT                          ║
+║  Evaluate your model on Pattern-Detectable vs Knowledge-Required splits        ║
+╚══════════════════════════════════════════════════════════════════════════════╝
+
+Usage:
+    from hexamind_benchmark import HexaMindBenchmark
+
+    benchmark = HexaMindBenchmark()
+    results = benchmark.evaluate(your_model_function)
+"""
+
+import json
+import os
+from typing import Callable, Dict, List, Optional
+from dataclasses import dataclass
+import time
+
+@dataclass
+class EvaluationResult:
+    """Results from benchmark evaluation"""
+    pattern_detectable_accuracy: float
+    knowledge_required_accuracy: float
+    overall_accuracy: float
+    pattern_detectable_samples: int
+    knowledge_required_samples: int
+    total_samples: int
+    avg_latency_ms: float
+
+    def to_dict(self) -> Dict:
+        return {
+            "pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2),
+            "knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2),
+            "overall_accuracy": round(self.overall_accuracy, 2),
+            "pattern_detectable_samples": self.pattern_detectable_samples,
+            "knowledge_required_samples": self.knowledge_required_samples,
+            "total_samples": self.total_samples,
+            "avg_latency_ms": round(self.avg_latency_ms, 2)
+        }
+
+    def __str__(self) -> str:
+        return f"""
+╔══════════════════════════════════════════════════════════════╗
+║            HEXAMIND BENCHMARK EVALUATION RESULTS             ║
+╠══════════════════════════════════════════════════════════════╣
+║  Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}%  (n={self.pattern_detectable_samples:3d})               ║
+║  Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}%  (n={self.knowledge_required_samples:3d})               ║
+║  ────────────────────────────────────────────────────────    ║
+║  Overall Accuracy:            {self.overall_accuracy:6.2f}%  (n={self.total_samples:3d})               ║
+║  Average Latency:             {self.avg_latency_ms:6.2f} ms                      ║
+╚══════════════════════════════════════════════════════════════╝
+"""
+
+
+class HexaMindBenchmark:
+    """
+    HexaMind Hallucination Detection Benchmark
+
+    Evaluates models on two splits:
+    1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
+    2. Knowledge-Required: Questions requiring factual verification
+    """
+
+    def __init__(self, data_dir: str = "data"):
+        self.data_dir = data_dir
+        self.pattern_detectable = self._load_split("pattern_detectable.json")
+        self.knowledge_required = self._load_split("knowledge_required.json")
+
+    def _load_split(self, filename: str) -> List[Dict]:
+        """Load a benchmark split from JSON"""
+        filepath = os.path.join(self.data_dir, filename)
+        if os.path.exists(filepath):
+            with open(filepath, 'r') as f:
+                return json.load(f)
+        else:
+            print(f"Warning: {filepath} not found. Using empty list.")
+            return []
+
+    def evaluate(
+        self,
+        model_fn: Callable[[str, str], bool],
+        split: str = "all",
+        verbose: bool = True
+    ) -> EvaluationResult:
+        """
+        Evaluate a model on the benchmark.
+
+        Args:
+            model_fn: Function that takes (question, answer) and returns:
+                      True if answer is trustworthy, False if hallucination
+            split: "all", "pattern_detectable", or "knowledge_required"
+            verbose: Print progress
+
+        Returns:
+            EvaluationResult with accuracy metrics
+        """
+
+        # Select splits to evaluate
+        if split == "all":
+            pattern_samples = self.pattern_detectable
+            knowledge_samples = self.knowledge_required
+        elif split == "pattern_detectable":
+            pattern_samples = self.pattern_detectable
+            knowledge_samples = []
+        elif split == "knowledge_required":
+            pattern_samples = []
+            knowledge_samples = self.knowledge_required
+        else:
+            raise ValueError(f"Unknown split: {split}")
+
+        # Evaluate pattern-detectable
+        pattern_correct = 0
+        pattern_total = 0
+        latencies = []
+
+        if pattern_samples:
+            if verbose:
+                print("Evaluating Pattern-Detectable split...")
+            for i, sample in enumerate(pattern_samples):
+                start = time.time()
+                prediction = model_fn(sample["question"], sample["answer"])
+                latencies.append((time.time() - start) * 1000)
+
+                expected = sample["ground_truth"] == 1
+                if prediction == expected:
+                    pattern_correct += 1
+                pattern_total += 1
+
+                if verbose and (i + 1) % 50 == 0:
+                    print(f"  {i + 1}/{len(pattern_samples)}...")
+
+        # Evaluate knowledge-required
+        knowledge_correct = 0
+        knowledge_total = 0
+
+        if knowledge_samples:
+            if verbose:
+                print("Evaluating Knowledge-Required split...")
+            for i, sample in enumerate(knowledge_samples):
+                start = time.time()
+                prediction = model_fn(sample["question"], sample["answer"])
+                latencies.append((time.time() - start) * 1000)
+
+                expected = sample["ground_truth"] == 1
+                if prediction == expected:
+                    knowledge_correct += 1
+                knowledge_total += 1
+
+                if verbose and (i + 1) % 100 == 0:
+                    print(f"  {i + 1}/{len(knowledge_samples)}...")
+
+        # Compute metrics
+        pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0
+        knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0
+
+        total_correct = pattern_correct + knowledge_correct
+        total_samples = pattern_total + knowledge_total
+        overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0
+
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0
+
+        result = EvaluationResult(
+            pattern_detectable_accuracy=pattern_acc,
+            knowledge_required_accuracy=knowledge_acc,
+            overall_accuracy=overall_acc,
+            pattern_detectable_samples=pattern_total,
+            knowledge_required_samples=knowledge_total,
+            total_samples=total_samples,
+            avg_latency_ms=avg_latency
+        )
+
+        if verbose:
+            print(result)
+
+        return result
+
+    def create_submission(
+        self,
+        result: EvaluationResult,
+        model_name: str,
+        model_type: str,
+        parameters: str,
+        cost_per_1k: str = "Unknown",
+        contact: str = "",
+        paper_link: str = ""
+    ) -> Dict:
+        """Create a submission JSON for the leaderboard"""
+        from datetime import datetime
+
+        submission = {
+            "model_name": model_name,
+            "model_type": model_type,
+            "parameters": parameters,
+            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
+            "knowledge_required_accuracy": result.knowledge_required_accuracy,
+            "overall_accuracy": result.overall_accuracy,
+            "latency_ms": result.avg_latency_ms,
+            "cost_per_1k": cost_per_1k,
+            "submission_date": datetime.now().strftime("%Y-%m-%d"),
+            "contact": contact,
+            "paper_link": paper_link
+        }
+
+        return submission
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# EXAMPLE: HEXAMIND BASELINE
+# ═══════════════════════════════════════════════════════════════════════════════
+
+class HexaMindBaseline:
+    """
+    Zero-parameter topological hallucination detector.
+    Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory.
+    """
+
+    # Definitive truth patterns
+    TRUTH_PATTERNS = [
+        r"i don'?t know",
+        r"it depends",
+        r"there is no( scientific)? evidence",
+        r"that's a myth",
+        r"this is a common misconception",
+        r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
+        r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
+    ]
+
+    # Definitive hallucination patterns
+    HALLUC_PATTERNS = [
+        r"^yes[,.]? ",
+        r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
+        r"seven years of bad luck",
+        r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
+        r"studies (have )?show(n|s)? that",
+        r"everyone knows (that )?",
+        r"it'?s (a )?(well[- ])?known fact",
+    ]
+
+    def __init__(self):
+        import re
+        self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS]
+        self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS]
+
+    def predict(self, question: str, answer: str) -> bool:
+        """
+        Returns True if answer appears trustworthy, False if likely hallucination.
+        """
+        # Check for truth patterns
+        for pattern in self.truth_re:
+            if pattern.search(answer):
+                return True
+
+        # Check for hallucination patterns
+        for pattern in self.halluc_re:
+            if pattern.search(answer):
+                return False
+
+        # Default: uncertain, assume trustworthy
+        return True
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# CLI
+# ═══════════════════════════════════════════════════════════════════════════════
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation")
+    parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)")
+    parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)")
+    parser.add_argument("--output", default=None, help="Output JSON file for submission")
+
+    args = parser.parse_args()
+
+    # Load benchmark
+    benchmark = HexaMindBenchmark()
+
+    # Select model
+    if args.model == "hexamind":
+        model = HexaMindBaseline()
+        model_fn = model.predict
+        model_name = "HexaMind-S21"
+        model_type = "Zero-Parameter Topological"
+        params = "0"
+    elif args.model == "random":
+        import random
+        model_fn = lambda q, a: random.random() > 0.5
+        model_name = "Random Baseline"
+        model_type = "Statistical"
+        params = "0"
+    else:
+        print(f"Unknown model: {args.model}")
+        exit(1)
+
+    # Evaluate
+    result = benchmark.evaluate(model_fn, split=args.split)
+
+    # Save submission if requested
+    if args.output:
+        submission = benchmark.create_submission(
+            result, model_name, model_type, params
+        )
+        with open(args.output, 'w') as f:
+            json.dump(submission, f, indent=2)
+        print(f"Submission saved to {args.output}")
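
For reference, a minimal usage sketch of the uploaded API follows. The function my_detector, the metadata strings, and the output filename are hypothetical placeholders; any callable matching the (question, answer) -> bool contract documented in evaluate() can be substituted, and the default data/ directory is assumed to contain the two split files.

# Minimal usage sketch; my_detector and the metadata below are hypothetical placeholders.
import json

from hexamind_benchmark import HexaMindBenchmark


def my_detector(question: str, answer: str) -> bool:
    # Toy heuristic: flag confident "Yes, ..." answers as hallucinations, trust the rest.
    return not answer.lower().startswith("yes")


benchmark = HexaMindBenchmark(data_dir="data")
result = benchmark.evaluate(my_detector, split="all")

submission = benchmark.create_submission(
    result,
    model_name="MyDetector-v0",
    model_type="Heuristic",
    parameters="0",
)
with open("submission.json", "w") as f:
    json.dump(submission, f, indent=2)

The same pattern works for an API-backed judge: only the body of my_detector changes, and the latency reported by evaluate() will then include the round-trip time.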