Spaces:
Sleeping
Sleeping
Zen0
committed on
Commit
·
ac9d3f4
1
Parent(s):
b124e72
Add debug logging and improve answer extraction
Browse files
Issue: All predictions returning empty string (0% accuracy)
Need to see what models are actually generating.
Changes:
1. Added debug logging for first 3 responses to see actual output
2. Improved answer extraction with 5 patterns:
- Word boundary: \b([A-D])\b
- With punctuation: A. A) A: A,
- Answer phrases: 'Answer: A' or 'Answer is A'
- First character if A-D
- Any A-D in first 50 chars
This should handle various model response formats.
app.py
CHANGED
|
@@ -178,11 +178,33 @@ def extract_answer(response, task):
|
|
| 178 |
response = response.strip()
|
| 179 |
|
| 180 |
if task.get('task_type') == 'multiple_choice':
|
|
|
|
|
|
|
|
|
|
| 181 |
match = re.search(r'\b([A-D])\b', response, re.IGNORECASE)
|
| 182 |
if match:
|
| 183 |
return match.group(1).upper()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
if response and response[0].upper() in ['A', 'B', 'C', 'D']:
|
| 185 |
return response[0].upper()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
return ""
|
| 187 |
else:
|
| 188 |
return response[:100]
|
|
@@ -260,10 +282,18 @@ def evaluate_single_model(model_name, tasks, use_4bit=True, temperature=0.7, max
|
|
| 260 |
skip_special_tokens=True
|
| 261 |
)
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
predicted = extract_answer(response, task)
|
| 264 |
correct = task.get('answer', '')
|
| 265 |
is_correct = predicted.upper() == correct.upper()
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
results.append({
|
| 268 |
'task_id': task.get('task_id'),
|
| 269 |
'category': task.get('category'),
|
|
|
|
| 178 |
response = response.strip()
|
| 179 |
|
| 180 |
if task.get('task_type') == 'multiple_choice':
|
| 181 |
+
# Try multiple extraction patterns
|
| 182 |
+
|
| 183 |
+
# Pattern 1: Letter with word boundary
|
| 184 |
match = re.search(r'\b([A-D])\b', response, re.IGNORECASE)
|
| 185 |
if match:
|
| 186 |
return match.group(1).upper()
|
| 187 |
+
|
| 188 |
+
# Pattern 2: Letter with punctuation (A. A) A: etc)
|
| 189 |
+
match = re.search(r'([A-D])[.):,]', response, re.IGNORECASE)
|
| 190 |
+
if match:
|
| 191 |
+
return match.group(1).upper()
|
| 192 |
+
|
| 193 |
+
# Pattern 3: "Answer: A" or "Answer is A"
|
| 194 |
+
match = re.search(r'(?:answer|choice)(?:\s+is)?\s*:?\s*([A-D])\b', response, re.IGNORECASE)
|
| 195 |
+
if match:
|
| 196 |
+
return match.group(1).upper()
|
| 197 |
+
|
| 198 |
+
# Pattern 4: First character if it's A-D
|
| 199 |
if response and response[0].upper() in ['A', 'B', 'C', 'D']:
|
| 200 |
return response[0].upper()
|
| 201 |
+
|
| 202 |
+
# Pattern 5: Look anywhere in first 50 chars for isolated letter
|
| 203 |
+
first_part = response[:50]
|
| 204 |
+
for char in first_part:
|
| 205 |
+
if char.upper() in ['A', 'B', 'C', 'D']:
|
| 206 |
+
return char.upper()
|
| 207 |
+
|
| 208 |
return ""
|
| 209 |
else:
|
| 210 |
return response[:100]
|
|
|
|
| 282 |
skip_special_tokens=True
|
| 283 |
)
|
| 284 |
|
| 285 |
+
# Debug: print first few responses
|
| 286 |
+
if i < 3:
|
| 287 |
+
print(f"DEBUG Task {i}: Response='{response}'")
|
| 288 |
+
|
| 289 |
predicted = extract_answer(response, task)
|
| 290 |
correct = task.get('answer', '')
|
| 291 |
is_correct = predicted.upper() == correct.upper()
|
| 292 |
|
| 293 |
+
# Debug: print extraction result
|
| 294 |
+
if i < 3:
|
| 295 |
+
print(f"DEBUG Task {i}: Predicted='{predicted}', Correct='{correct}', Match={is_correct}")
|
| 296 |
+
|
| 297 |
results.append({
|
| 298 |
'task_id': task.get('task_id'),
|
| 299 |
'category': task.get('category'),
|