Zen0 committed on
Commit
ac9d3f4
·
1 Parent(s): b124e72

Add debug logging and improve answer extraction

Browse files

Issue: All predictions are returning an empty string (0% accuracy).
We need to see what the models are actually generating.

Changes:
1. Added debug logging for first 3 responses to see actual output
2. Improved answer extraction with 5 patterns:
- Word boundary: \b([A-D])\b
- With punctuation: A. A) A: A,
- Answer phrases: 'Answer: A' or 'Answer is A'
- First character of the response, if it is A-D (case-insensitive)
- First A-D letter (case-insensitive) found anywhere in the first 50 characters

This should handle various model response formats.

Files changed (1) hide show
  1. app.py +30 -0
app.py CHANGED
@@ -178,11 +178,33 @@ def extract_answer(response, task):
178
  response = response.strip()
179
 
180
  if task.get('task_type') == 'multiple_choice':
 
 
 
181
  match = re.search(r'\b([A-D])\b', response, re.IGNORECASE)
182
  if match:
183
  return match.group(1).upper()
 
 
 
 
 
 
 
 
 
 
 
 
184
  if response and response[0].upper() in ['A', 'B', 'C', 'D']:
185
  return response[0].upper()
 
 
 
 
 
 
 
186
  return ""
187
  else:
188
  return response[:100]
@@ -260,10 +282,18 @@ def evaluate_single_model(model_name, tasks, use_4bit=True, temperature=0.7, max
260
  skip_special_tokens=True
261
  )
262
 
 
 
 
 
263
  predicted = extract_answer(response, task)
264
  correct = task.get('answer', '')
265
  is_correct = predicted.upper() == correct.upper()
266
 
 
 
 
 
267
  results.append({
268
  'task_id': task.get('task_id'),
269
  'category': task.get('category'),
 
178
  response = response.strip()
179
 
180
  if task.get('task_type') == 'multiple_choice':
181
+ # Try multiple extraction patterns
182
+
183
+ # Pattern 1: Letter with word boundary
184
  match = re.search(r'\b([A-D])\b', response, re.IGNORECASE)
185
  if match:
186
  return match.group(1).upper()
187
+
188
+ # Pattern 2: Letter with punctuation (A. A) A: etc)
189
+ match = re.search(r'([A-D])[.):,]', response, re.IGNORECASE)
190
+ if match:
191
+ return match.group(1).upper()
192
+
193
+ # Pattern 3: "Answer: A" or "Answer is A"
194
+ match = re.search(r'(?:answer|choice)(?:\s+is)?\s*:?\s*([A-D])\b', response, re.IGNORECASE)
195
+ if match:
196
+ return match.group(1).upper()
197
+
198
+ # Pattern 4: First character if it's A-D
199
  if response and response[0].upper() in ['A', 'B', 'C', 'D']:
200
  return response[0].upper()
201
+
202
+ # Pattern 5: Look anywhere in first 50 chars for isolated letter
203
+ first_part = response[:50]
204
+ for char in first_part:
205
+ if char.upper() in ['A', 'B', 'C', 'D']:
206
+ return char.upper()
207
+
208
  return ""
209
  else:
210
  return response[:100]
 
282
  skip_special_tokens=True
283
  )
284
 
285
+ # Debug: print first few responses
286
+ if i < 3:
287
+ print(f"DEBUG Task {i}: Response='{response}'")
288
+
289
  predicted = extract_answer(response, task)
290
  correct = task.get('answer', '')
291
  is_correct = predicted.upper() == correct.upper()
292
 
293
+ # Debug: print extraction result
294
+ if i < 3:
295
+ print(f"DEBUG Task {i}: Predicted='{predicted}', Correct='{correct}', Match={is_correct}")
296
+
297
  results.append({
298
  'task_id': task.get('task_id'),
299
  'category': task.get('category'),