s21mind committed
Commit ca0826b · verified · 1 Parent(s): 77fdbf9

Upload hexamind_benchmark.py

Files changed (1)
  1. hexamind_benchmark.py +307 -0
hexamind_benchmark.py ADDED
@@ -0,0 +1,307 @@
+"""
+╔══════════════════════════════════════════════════════════════════════════════╗
+║  HEXAMIND HALLUCINATION BENCHMARK - EVALUATION SCRIPT                          ║
+║  Evaluate your model on Pattern-Detectable vs Knowledge-Required splits        ║
+╚══════════════════════════════════════════════════════════════════════════════╝
+
+Usage:
+    from hexamind_benchmark import HexaMindBenchmark
+
+    benchmark = HexaMindBenchmark()
+    results = benchmark.evaluate(your_model_function)
+"""
+
+import json
+import os
+from typing import Callable, Dict, List, Optional
+from dataclasses import dataclass
+import time
+
+@dataclass
+class EvaluationResult:
+    """Results from benchmark evaluation"""
+    pattern_detectable_accuracy: float
+    knowledge_required_accuracy: float
+    overall_accuracy: float
+    pattern_detectable_samples: int
+    knowledge_required_samples: int
+    total_samples: int
+    avg_latency_ms: float
+
+    def to_dict(self) -> Dict:
+        return {
+            "pattern_detectable_accuracy": round(self.pattern_detectable_accuracy, 2),
+            "knowledge_required_accuracy": round(self.knowledge_required_accuracy, 2),
+            "overall_accuracy": round(self.overall_accuracy, 2),
+            "pattern_detectable_samples": self.pattern_detectable_samples,
+            "knowledge_required_samples": self.knowledge_required_samples,
+            "total_samples": self.total_samples,
+            "avg_latency_ms": round(self.avg_latency_ms, 2)
+        }
+
+    def __str__(self) -> str:
+        return f"""
+╔══════════════════════════════════════════════════════════════╗
+║            HEXAMIND BENCHMARK EVALUATION RESULTS             ║
+╠══════════════════════════════════════════════════════════════╣
+║  Pattern-Detectable Accuracy: {self.pattern_detectable_accuracy:6.2f}%  (n={self.pattern_detectable_samples:3d})               ║
+║  Knowledge-Required Accuracy: {self.knowledge_required_accuracy:6.2f}%  (n={self.knowledge_required_samples:3d})               ║
+║  ────────────────────────────────────────────────────────    ║
+║  Overall Accuracy:            {self.overall_accuracy:6.2f}%  (n={self.total_samples:3d})               ║
+║  Average Latency:             {self.avg_latency_ms:6.2f} ms                      ║
+╚══════════════════════════════════════════════════════════════╝
+"""
+
+
+class HexaMindBenchmark:
+    """
+    HexaMind Hallucination Detection Benchmark
+
+    Evaluates models on two splits:
+    1. Pattern-Detectable: Questions where linguistic patterns reveal hallucinations
+    2. Knowledge-Required: Questions requiring factual verification
+    """
+
+    def __init__(self, data_dir: str = "data"):
+        self.data_dir = data_dir
+        self.pattern_detectable = self._load_split("pattern_detectable.json")
+        self.knowledge_required = self._load_split("knowledge_required.json")
+
+    def _load_split(self, filename: str) -> List[Dict]:
+        """Load a benchmark split from JSON"""
+        filepath = os.path.join(self.data_dir, filename)
+        if os.path.exists(filepath):
+            with open(filepath, 'r') as f:
+                return json.load(f)
+        else:
+            print(f"Warning: {filepath} not found. Using empty list.")
+            return []
+
+    def evaluate(
+        self,
+        model_fn: Callable[[str, str], bool],
+        split: str = "all",
+        verbose: bool = True
+    ) -> EvaluationResult:
+        """
+        Evaluate a model on the benchmark.
+
+        Args:
+            model_fn: Function that takes (question, answer) and returns:
+                      True if answer is trustworthy, False if hallucination
+            split: "all", "pattern_detectable", or "knowledge_required"
+            verbose: Print progress
+
+        Returns:
+            EvaluationResult with accuracy metrics
+        """
+
+        # Select splits to evaluate
+        if split == "all":
+            pattern_samples = self.pattern_detectable
+            knowledge_samples = self.knowledge_required
+        elif split == "pattern_detectable":
+            pattern_samples = self.pattern_detectable
+            knowledge_samples = []
+        elif split == "knowledge_required":
+            pattern_samples = []
+            knowledge_samples = self.knowledge_required
+        else:
+            raise ValueError(f"Unknown split: {split}")
+
+        # Evaluate pattern-detectable
+        pattern_correct = 0
+        pattern_total = 0
+        latencies = []
+
+        if pattern_samples:
+            if verbose:
+                print("Evaluating Pattern-Detectable split...")
+            for i, sample in enumerate(pattern_samples):
+                start = time.time()
+                prediction = model_fn(sample["question"], sample["answer"])
+                latencies.append((time.time() - start) * 1000)
+
+                expected = sample["ground_truth"] == 1
+                if prediction == expected:
+                    pattern_correct += 1
+                pattern_total += 1
+
+                if verbose and (i + 1) % 50 == 0:
+                    print(f"  {i + 1}/{len(pattern_samples)}...")
+
+        # Evaluate knowledge-required
+        knowledge_correct = 0
+        knowledge_total = 0
+
+        if knowledge_samples:
+            if verbose:
+                print("Evaluating Knowledge-Required split...")
+            for i, sample in enumerate(knowledge_samples):
+                start = time.time()
+                prediction = model_fn(sample["question"], sample["answer"])
+                latencies.append((time.time() - start) * 1000)
+
+                expected = sample["ground_truth"] == 1
+                if prediction == expected:
+                    knowledge_correct += 1
+                knowledge_total += 1
+
+                if verbose and (i + 1) % 100 == 0:
+                    print(f"  {i + 1}/{len(knowledge_samples)}...")
+
+        # Compute metrics
+        pattern_acc = (pattern_correct / pattern_total * 100) if pattern_total > 0 else 0
+        knowledge_acc = (knowledge_correct / knowledge_total * 100) if knowledge_total > 0 else 0
+
+        total_correct = pattern_correct + knowledge_correct
+        total_samples = pattern_total + knowledge_total
+        overall_acc = (total_correct / total_samples * 100) if total_samples > 0 else 0
+
+        avg_latency = sum(latencies) / len(latencies) if latencies else 0
+
+        result = EvaluationResult(
+            pattern_detectable_accuracy=pattern_acc,
+            knowledge_required_accuracy=knowledge_acc,
+            overall_accuracy=overall_acc,
+            pattern_detectable_samples=pattern_total,
+            knowledge_required_samples=knowledge_total,
+            total_samples=total_samples,
+            avg_latency_ms=avg_latency
+        )
+
+        if verbose:
+            print(result)
+
+        return result
+
+    def create_submission(
+        self,
+        result: EvaluationResult,
+        model_name: str,
+        model_type: str,
+        parameters: str,
+        cost_per_1k: str = "Unknown",
+        contact: str = "",
+        paper_link: str = ""
+    ) -> Dict:
+        """Create a submission JSON for the leaderboard"""
+        from datetime import datetime
+
+        submission = {
+            "model_name": model_name,
+            "model_type": model_type,
+            "parameters": parameters,
+            "pattern_detectable_accuracy": result.pattern_detectable_accuracy,
+            "knowledge_required_accuracy": result.knowledge_required_accuracy,
+            "overall_accuracy": result.overall_accuracy,
+            "latency_ms": result.avg_latency_ms,
+            "cost_per_1k": cost_per_1k,
+            "submission_date": datetime.now().strftime("%Y-%m-%d"),
+            "contact": contact,
+            "paper_link": paper_link
+        }
+
+        return submission
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# EXAMPLE: HEXAMIND BASELINE
+# ═══════════════════════════════════════════════════════════════════════════════
+
+class HexaMindBaseline:
+    """
+    Zero-parameter topological hallucination detector.
+    Uses I Ching pattern matching grounded in S21 Vacuum Manifold Theory.
+    """
+
+    # Definitive truth patterns
+    TRUTH_PATTERNS = [
+        r"i don'?t know",
+        r"it depends",
+        r"there is no( scientific)? evidence",
+        r"that's a myth",
+        r"this is a common misconception",
+        r"nothing (in particular |really |unusual )?(happens|occurs|will happen)",
+        r"no[,.]? (it |that )?(?:does not|doesn't|is not|isn't|won't|will not)",
+    ]
+
+    # Definitive hallucination patterns
+    HALLUC_PATTERNS = [
+        r"^yes[,.]? ",
+        r"will (bring|cause|give|attract) (you )?(good |bad )?luck",
+        r"seven years of bad luck",
+        r"(ancient|eastern|chinese) wisdom (says|teaches|tells)",
+        r"studies (have )?show(n|s)? that",
+        r"everyone knows (that )?",
+        r"it'?s (a )?(well[- ])?known fact",
+    ]
+
+    def __init__(self):
+        import re
+        self.truth_re = [re.compile(p, re.IGNORECASE) for p in self.TRUTH_PATTERNS]
+        self.halluc_re = [re.compile(p, re.IGNORECASE) for p in self.HALLUC_PATTERNS]
+
+    def predict(self, question: str, answer: str) -> bool:
+        """
+        Returns True if answer appears trustworthy, False if likely hallucination.
+        """
+        # Check for truth patterns
+        for pattern in self.truth_re:
+            if pattern.search(answer):
+                return True
+
+        # Check for hallucination patterns
+        for pattern in self.halluc_re:
+            if pattern.search(answer):
+                return False
+
+        # Default: uncertain, assume trustworthy
+        return True
+
+
+# ═══════════════════════════════════════════════════════════════════════════════
+# CLI
+# ═══════════════════════════════════════════════════════════════════════════════
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(description="HexaMind Benchmark Evaluation")
+    parser.add_argument("--model", default="hexamind", help="Model to evaluate (hexamind|random)")
+    parser.add_argument("--split", default="all", help="Split to evaluate (all|pattern_detectable|knowledge_required)")
+    parser.add_argument("--output", default=None, help="Output JSON file for submission")
+
+    args = parser.parse_args()
+
+    # Load benchmark
+    benchmark = HexaMindBenchmark()
+
+    # Select model
+    if args.model == "hexamind":
+        model = HexaMindBaseline()
+        model_fn = model.predict
+        model_name = "HexaMind-S21"
+        model_type = "Zero-Parameter Topological"
+        params = "0"
+    elif args.model == "random":
+        import random
+        model_fn = lambda q, a: random.random() > 0.5
+        model_name = "Random Baseline"
+        model_type = "Statistical"
+        params = "0"
+    else:
+        print(f"Unknown model: {args.model}")
+        exit(1)
+
+    # Evaluate
+    result = benchmark.evaluate(model_fn, split=args.split)
+
+    # Save submission if requested
+    if args.output:
+        submission = benchmark.create_submission(
+            result, model_name, model_type, params
+        )
+        with open(args.output, 'w') as f:
+            json.dump(submission, f, indent=2)
+        print(f"Submission saved to {args.output}")
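
For reference, a minimal usage sketch of the uploaded API follows. The function my_detector, the metadata strings, and the output filename are hypothetical placeholders; any callable matching the (question, answer) -> bool contract documented in evaluate() can be substituted, and the default data/ directory is assumed to contain the two split files.

# Minimal usage sketch; my_detector and the metadata below are hypothetical placeholders.
import json

from hexamind_benchmark import HexaMindBenchmark


def my_detector(question: str, answer: str) -> bool:
    # Toy heuristic: flag confident "Yes, ..." answers as hallucinations, trust the rest.
    return not answer.lower().startswith("yes")


benchmark = HexaMindBenchmark(data_dir="data")
result = benchmark.evaluate(my_detector, split="all")

submission = benchmark.create_submission(
    result,
    model_name="MyDetector-v0",
    model_type="Heuristic",
    parameters="0",
)
with open("submission.json", "w") as f:
    json.dump(submission, f, indent=2)

The same pattern works for an API-backed judge: only the body of my_detector changes, and the latency reported by evaluate() will then include the round-trip time.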