brycemeetkai committed on
Commit
a540a5c
·
verified ·
1 Parent(s): 00d42e1

Mirror evals/ from 4ac99d72af66

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. evals/README.md +0 -2
  2. evals/albanian/README.md +136 -0
  3. evals/albanian/albanian.yaml +21 -0
  4. evals/{igbo/classification/igbo_classification.yaml → albanian/classification/albanian_classification.yaml} +2 -2
  5. evals/{igbo/classification/igbo_sib200.yaml → albanian/classification/albanian_sib200.yaml} +10 -3
  6. evals/{yoruba → albanian}/classification/utils.py +3 -6
  7. evals/{igbo → albanian}/mcq/_default_mcq_yaml +2 -1
  8. evals/{igbo/mcq/igbo_belebele.yaml → albanian/mcq/albanian_belebele.yaml} +3 -3
  9. evals/albanian/mcq/albanian_global_mmlu.yaml +11 -0
  10. evals/{igbo/mcq/igbo_mcq.yaml → albanian/mcq/albanian_mcq.yaml} +4 -4
  11. evals/{yoruba → albanian}/mcq/utils.py +39 -49
  12. evals/albanian/open_generation/_default_open_generation_yaml +18 -0
  13. evals/albanian/open_generation/albanian_aya.yaml +14 -0
  14. evals/albanian/open_generation/albanian_open_generation.yaml +17 -0
  15. evals/albanian/open_generation/albanian_polywrite.yaml +15 -0
  16. evals/albanian/open_generation/utils.py +172 -0
  17. evals/albanian/summarization/_default_summarization_yaml +18 -0
  18. evals/albanian/summarization/albanian_massivesumm_long.yaml +14 -0
  19. evals/albanian/summarization/albanian_massivesumm_short.yaml +9 -0
  20. evals/albanian/summarization/albanian_summarization.yaml +11 -0
  21. evals/albanian/summarization/utils.py +111 -0
  22. evals/arabic/classification/arabic_sib200.yaml +7 -0
  23. evals/arabic/qa/arabic_qa.yaml +3 -0
  24. evals/cost_core.py +0 -1
  25. evals/english/english.yaml +0 -1
  26. evals/eval_config.toml +34 -38
  27. evals/f1_utils.py +162 -17
  28. evals/french/classification/french_sib200.yaml +7 -0
  29. evals/french/qa/french_qa.yaml +3 -0
  30. evals/hausa/classification/hausa_sib200.yaml +7 -0
  31. evals/hausa/hausa.yaml +0 -1
  32. evals/hausa/nli/hausa_afrixnli.yaml +7 -0
  33. evals/hausa/nli/utils.py +2 -0
  34. evals/hausa/qa/hausa_qa.yaml +3 -0
  35. evals/hausa/sentiment/utils.py +0 -26
  36. evals/igbo/afrimgsm/igbo_afrimgsm.yaml +0 -28
  37. evals/igbo/igbo.yaml +0 -10
  38. evals/igbo/mcq/igbo_afrimmlu.yaml +0 -9
  39. evals/igbo/nli/utils.py +0 -26
  40. evals/igbo/qa/utils.py +0 -61
  41. evals/igbo/sentiment/igbo_sentiment.yaml +0 -9
  42. evals/igbo/sentiment/utils.py +0 -26
  43. evals/portuguese/README.md +131 -0
  44. evals/{igbo/nli/igbo_afrixnli.yaml → portuguese/classification/_default_classification_yaml} +1 -8
  45. evals/portuguese/classification/portuguese_classification.yaml +12 -0
  46. evals/portuguese/classification/portuguese_hate_speech.yaml +17 -0
  47. evals/portuguese/classification/portuguese_hatebr.yaml +17 -0
  48. evals/portuguese/classification/portuguese_tweetsentbr.yaml +16 -0
  49. evals/portuguese/classification/utils.py +112 -0
  50. evals/{swahili/afrimgsm/swahili_afrimgsm.yaml → portuguese/mcq/_default_mcq_yaml} +4 -11
evals/README.md CHANGED
@@ -111,8 +111,6 @@ name = "my_task_group"
111
  | French | SIB-200, Belebele, MGSM | `french_classification`, `french_mcq`, `french_math` |
112
  | Arabic | SIB-200, Belebele | `arabic_classification`, `arabic_mcq` |
113
  | Hausa | SIB-200, AfriMMLU, Belebele | `hausa_classification`, `hausa_mcq` |
114
- | Yoruba | SIB-200, AfriMMLU, Belebele | `yoruba_classification`, `yoruba_mcq` |
115
- | Igbo | SIB-200, AfriMMLU, Belebele | `igbo_classification`, `igbo_mcq` |
116
 
117
  ## Adding a New Language/Benchmark
118
 
 
111
  | French | SIB-200, Belebele, MGSM | `french_classification`, `french_mcq`, `french_math` |
112
  | Arabic | SIB-200, Belebele | `arabic_classification`, `arabic_mcq` |
113
  | Hausa | SIB-200, AfriMMLU, Belebele | `hausa_classification`, `hausa_mcq` |
 
 
114
 
115
  ## Adding a New Language/Benchmark
116
 
evals/albanian/README.md ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Albanian – lm-eval Tasks
2
+
3
+ Albanian (Tosk, `als_Latn` / macro `sq`) evaluation suite for the
4
+ `lm-evaluation-harness` framework.
5
+
6
+ ## Overview
7
+
8
+ ### Custom Tasks (require `--include_path`)
9
+
10
+ | # | Task Name | Category | Dataset (HuggingFace) | Metric |
11
+ | --- | ---------------------------- | --------------- | --------------------------------------------------------------------------------------- | ------------------ |
12
+ | 1 | `albanian_sib200` | Classification | `Davlan/sib200` (`als_Latn`) | f1_macro |
13
+ | 2 | `albanian_belebele` | MCQ | `facebook/belebele` (`als_Latn`) | f1_macro |
14
+ | 3 | `albanian_global_mmlu` | MCQ | `CohereLabs/Global-MMLU-Lite` (`sq`, v2) | f1_macro |
15
+ | 4 | `albanian_massivesumm_short` | Summarization | `MaLA-LM/MassiveSumm_short` (filtered `language=sqi`) | rouge_l |
16
+ | 5 | `albanian_massivesumm_long` | Summarization | `MaLA-LM/MassiveSumm_long` (filtered `language=sqi`) | rouge_l |
17
+ | 6 | `albanian_aya` | Open generation | `CohereLabs/aya_evaluation_suite` (`dolly_machine_translated`, filtered `language=sqi`) | llm_judge_score |
18
+ | 7 | `albanian_polywrite` | Open generation | `MaLA-LM/PolyWrite` (filtered `lang_script=sqi_Latn`) | open_quality_score |
19
+
20
+ #### Subgroups
21
+
22
+ | Group | Tasks |
23
+ | -------------------------- | ----------------------------------- |
24
+ | `albanian_classification` | sib200 |
25
+ | `albanian_mcq` | belebele, global_mmlu |
26
+ | `albanian_summarization` | massivesumm_short, massivesumm_long |
27
+ | `albanian_open_generation` | aya, polywrite |
28
+
29
+ ## Setup
30
+
31
+ ```bash
32
+ pip install lm-eval
33
+ ```
34
+
35
+ ## Running Tasks
36
+
37
+ All commands must be run from the `multilingual_bench/` directory:
38
+
39
+ ```bash
40
+ cd /path/to/functionary_internal/evaluation/multilingual_bench
41
+ ```
42
+
43
+ ### Run the Entire Albanian Suite (all 7 tasks)
44
+
45
+ ```bash
46
+ OPENAI_API_KEY="$OPENROUTER_API_KEY" \
47
+ lm_eval \
48
+ --include_path lm_eval_tasks \
49
+ --tasks albanian \
50
+ --model local-chat-completions \
51
+ --model_args model=openai/gpt-5-mini,base_url=https://openrouter.ai/api/v1/chat/completions,num_concurrent=5 \
52
+ --apply_chat_template \
53
+ --num_fewshot 0 \
54
+ --log_samples \
55
+ --output_path output/albanian_results
56
+ ```
57
+
58
+ ### Run via the project runner
59
+
60
+ ```bash
61
+ cd lm_eval_tasks
62
+ export OPENROUTER_API_KEY="sk-or-..."
63
+ python run_eval.py --models gpt-5-mini --tasks albanian
64
+ ```
65
+
66
+ ### Run a Single Category
67
+
68
+ ```bash
69
+ lm_eval --include_path lm_eval_tasks --tasks albanian_classification ...
70
+ lm_eval --include_path lm_eval_tasks --tasks albanian_mcq ...
71
+ lm_eval --include_path lm_eval_tasks --tasks albanian_summarization ...
72
+ lm_eval --include_path lm_eval_tasks --tasks albanian_open_generation ...
73
+ ```
74
+
75
+ ### Run a Single Task
76
+
77
+ ```bash
78
+ lm_eval --include_path lm_eval_tasks --tasks albanian_sib200 ...
79
+ lm_eval --include_path lm_eval_tasks --tasks albanian_belebele ...
80
+ lm_eval --include_path lm_eval_tasks --tasks albanian_global_mmlu ...
81
+ lm_eval --include_path lm_eval_tasks --tasks albanian_massivesumm_short ...
82
+ lm_eval --include_path lm_eval_tasks --tasks albanian_massivesumm_long ...
83
+ lm_eval --include_path lm_eval_tasks --tasks albanian_aya ...
84
+ lm_eval --include_path lm_eval_tasks --tasks albanian_polywrite ...
85
+ ```
86
+
87
+ ## Output
88
+
89
+ With `--log_samples`, the output directory contains:
90
+
91
+ - `results.json` – aggregate scores per task
92
+ - `samples_<task_name>.jsonl` – per-example model outputs for debugging
93
+
94
+ ## Dataset Sources
95
+
96
+ | Dataset | Source | Config | Notes |
97
+ | ----------------- | --------------------------------- | -------------------------------------------------- | -------------------------------------------------------------------- |
98
+ | SIB-200 | `Davlan/sib200` | `als_Latn` | text + ClassLabel `category` (7 topics) |
99
+ | Belebele | `facebook/belebele` | `als_Latn` | flores_passage + question + 4 mc_answers, `correct_answer_num` 1-4 |
100
+ | Global-MMLU-Lite | `CohereLabs/Global-MMLU-Lite` | `sq` | question + `option_a..d` + `answer` letter (400 samples, CS+CA) |
101
+ | MassiveSumm short | `MaLA-LM/MassiveSumm_short` | — (filter `language=sqi`) | `text`, `summary`, `language`; gated |
102
+ | MassiveSumm long | `MaLA-LM/MassiveSumm_long` | — (filter `language=sqi`) | same schema; longer articles |
103
+ | Aya Eval | `CohereLabs/aya_evaluation_suite` | `dolly_machine_translated` (filter `language=sqi`) | `inputs`, `targets`, `language`, `script` |
104
+ | PolyWrite | `MaLA-LM/PolyWrite` | — (filter `lang_script=sqi_Latn`) | `prompt_translated`, `category`, `lang_script` (no reference answer) |
105
+
106
+ ### Gated datasets
107
+
108
+ Several upstream datasets are gated on Hugging Face. Accept the terms (once) and export an HF token before running:
109
+
110
+ - Aya Eval: <https://huggingface.co/datasets/CohereLabs/aya_evaluation_suite>
111
+ - MassiveSumm short / long: <https://huggingface.co/datasets/MaLA-LM/MassiveSumm_short> and <https://huggingface.co/datasets/MaLA-LM/MassiveSumm_long>
112
+
113
+ ```bash
114
+ export HF_TOKEN="hf_..."
115
+ huggingface-cli login # one-time, optional if HF_TOKEN is exported
116
+ ```
117
+
118
+ ### LLM-judge tasks
119
+
120
+ `albanian_aya` and `albanian_polywrite` use an LLM judge (default `openai/gpt-5-mini` via OpenRouter) for scoring; this consumes additional API credits per sample. Override the defaults with these environment variables:
121
+
122
+ ```bash
123
+ export JUDGE_MODEL="openai/gpt-5-mini"
124
+ export JUDGE_BASE_URL="https://openrouter.ai/api/v1"
125
+ export JUDGE_CONCURRENCY=32
126
+ ```
127
+
128
+ ### Tasks not (yet) included
129
+
130
+ GlotEval also lists IN22, AmericasNLP, MAFAND, XLSum, MMMLU, NTEU, MMHB, BenchMAX (Math/Science/Rule-based), TICO-19, NTREX-128, Taxi-1500, PBC, MaLA, and UD-UPOS. These are intentionally **not** included because:
131
+
132
+ - No Albanian coverage upstream: AmericasNLP, IN22, MAFAND, XLSum, MMMLU, NTEU, MMHB, BenchMAX (all subsets).
133
+ - No HuggingFace dataset (shipped only as local files in GlotEval): TICO-19, NTREX-128, Taxi-1500.
134
+ - Not meaningful via chat-completions: PBC, MaLA (intrinsic NLL), UD-UPOS / WikiANN-style token-level tagging.
135
+
136
+ Add any of these later if you ship the local data and want a corresponding task config.
evals/albanian/albanian.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Albanian – top-level benchmark group
2
+ # Usage:
3
+ # cd multilingual_bench
4
+ # lm_eval --include_path lm_eval_tasks \
5
+ # --tasks albanian \
6
+ # --model local-chat-completions \
7
+ # --model_args model=your-model,base_url=http://your-endpoint/v1/chat/completions \
8
+ # --num_fewshot 0
9
+ #
10
+ # Metrics:
11
+ # classification & mcq → f1_macro (per sub-group)
12
+ # summarization → rouge_l (per sub-group)
13
+ # open_generation → llm_judge_score / open_quality_score (per sub-group)
14
+ group: albanian
15
+ task:
16
+ - albanian_classification
17
+ - albanian_mcq
18
+ - albanian_summarization
19
+ - albanian_open_generation
20
+ metadata:
21
+ version: 1.0
evals/{igbo/classification/igbo_classification.yaml → albanian/classification/albanian_classification.yaml} RENAMED
@@ -1,7 +1,7 @@
1
  # Topic Classification subgroup (SIB-200)
2
- group: igbo_classification
3
  task:
4
- - igbo_sib200
5
  aggregate_metric_list:
6
  - metric: f1_macro
7
  aggregation: mean
 
1
  # Topic Classification subgroup (SIB-200)
2
+ group: albanian_classification
3
  task:
4
+ - albanian_sib200
5
  aggregate_metric_list:
6
  - metric: f1_macro
7
  aggregation: mean
evals/{igbo/classification/igbo_sib200.yaml → albanian/classification/albanian_sib200.yaml} RENAMED
@@ -1,7 +1,7 @@
1
- task: igbo_sib200
2
  task_alias: sib200
3
  dataset_path: Davlan/sib200
4
- dataset_name: ibo_Latn
5
  test_split: test
6
  output_type: generate_until
7
  generation_kwargs:
@@ -10,8 +10,15 @@ generation_kwargs:
10
  until:
11
  - "<|endoftext|>"
12
  process_docs: !function utils.process_sib200_docs
13
- doc_to_text: "You are a topic classification system.\nChoose the single best label for the following Igbo text.\n\nAllowed labels: {{labels_str}}\n\nInstruction: Reply with ONE label only from the allowed labels. Do not write anything else.\n\nText:\n{{text}}\n\nLabel:"
14
  doc_to_target: "{{target}}"
 
 
 
 
 
 
 
15
  process_results: !function utils.process_results
16
  metric_list:
17
  - metric: f1_macro
 
1
+ task: albanian_sib200
2
  task_alias: sib200
3
  dataset_path: Davlan/sib200
4
+ dataset_name: als_Latn
5
  test_split: test
6
  output_type: generate_until
7
  generation_kwargs:
 
10
  until:
11
  - "<|endoftext|>"
12
  process_docs: !function utils.process_sib200_docs
13
+ doc_to_text: "Ti je një sistem klasifikimi temash.\nZgjidh një etiketë vetme e përshkruan mirë tekstin e mëposhtëm.\n\nEtiketat e lejuara: {{labels_str}}\n\nUdhëzim: Përgjigju vetëm me NJË etiketë nga lista e lejuar. Mos shkruaj asnjë fjalë tjetër.\n\nTeksti:\n{{text}}\n\nEtiketa:"
14
  doc_to_target: "{{target}}"
15
+ filter_list:
16
+ - name: "get_label"
17
+ filter:
18
+ - function: "strip_think_recover"
19
+ - function: "regex_label_set"
20
+ labels_field: "labels_str"
21
+ - function: "take_first"
22
  process_results: !function utils.process_results
23
  metric_list:
24
  - metric: f1_macro
evals/{yoruba → albanian}/classification/utils.py RENAMED
@@ -1,10 +1,10 @@
1
- """Utility helpers for Yoruba classification tasks (SIB-200).
2
 
3
- Uses macro-averaged F1 scoring (matching Swahili pattern).
4
  """
5
 
6
  import os as _os, sys as _sys # noqa: E401
7
- _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..","..",)))
8
 
9
 
10
  import datasets
@@ -40,9 +40,6 @@ def process_sib200_docs(dataset: datasets.Dataset) -> datasets.Dataset:
40
  return dataset.map(_process)
41
 
42
 
43
- # -- Result processing ----------------------------------------------------
44
-
45
-
46
  def process_results(doc, results):
47
  """Return (pred, gold) tuple for macro-F1 aggregation."""
48
  return process_results_f1(doc, results)
 
1
+ """Utility helpers for Albanian classification tasks (SIB-200).
2
 
3
+ Uses macro-averaged F1 scoring (matching the Swahili / Urdu pattern).
4
  """
5
 
6
  import os as _os, sys as _sys # noqa: E401
7
+ _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..", "..",)))
8
 
9
 
10
  import datasets
 
40
  return dataset.map(_process)
41
 
42
 
 
 
 
43
  def process_results(doc, results):
44
  """Return (pred, gold) tuple for macro-F1 aggregation."""
45
  return process_results_f1(doc, results)
evals/{igbo → albanian}/mcq/_default_mcq_yaml RENAMED
@@ -1,4 +1,4 @@
1
- # Shared config for Igbo MCQ tasks (generative A/B/C/D).
2
  output_type: generate_until
3
  generation_kwargs:
4
  do_sample: false
@@ -8,6 +8,7 @@ generation_kwargs:
8
  filter_list:
9
  - name: "get_answer"
10
  filter:
 
11
  - function: "regex"
12
  regex_pattern: "([ABCD])"
13
  group_select: 0
 
1
+ # Shared config for Albanian MCQ tasks (generative A/B/C/D).
2
  output_type: generate_until
3
  generation_kwargs:
4
  do_sample: false
 
8
  filter_list:
9
  - name: "get_answer"
10
  filter:
11
+ - function: "strip_think_recover"
12
  - function: "regex"
13
  regex_pattern: "([ABCD])"
14
  group_select: 0
evals/{igbo/mcq/igbo_belebele.yaml → albanian/mcq/albanian_belebele.yaml} RENAMED
@@ -1,9 +1,9 @@
1
- task: igbo_belebele
2
  task_alias: belebele
3
  dataset_path: facebook/belebele
4
- dataset_name: ibo_Latn
5
  test_split: test
6
  include: _default_mcq_yaml
7
  process_docs: !function utils.process_belebele_docs
8
- doc_to_text: "P: {{flores_passage}}\nQ: {{question}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nInstruction: Reply with EXACTLY one letter: A, B, C, or D. No other text.\nAnswer:"
9
  doc_to_target: "{{gold_letter}}"
 
1
+ task: albanian_belebele
2
  task_alias: belebele
3
  dataset_path: facebook/belebele
4
+ dataset_name: als_Latn
5
  test_split: test
6
  include: _default_mcq_yaml
7
  process_docs: !function utils.process_belebele_docs
8
+ doc_to_text: "P: {{flores_passage}}\nQ: {{question}}\nA: {{mc_answer1}}\nB: {{mc_answer2}}\nC: {{mc_answer3}}\nD: {{mc_answer4}}\nUdhëzim: Përgjigju me VETËM një shkronjë: A, B, C ose D. Asnjë tekst tjetër.\nPërgjigja:"
9
  doc_to_target: "{{gold_letter}}"
evals/albanian/mcq/albanian_global_mmlu.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: albanian_global_mmlu
2
+ task_alias: global_mmlu_lite
3
+ # Global-MMLU-Lite v2 added Albanian (sq); the full Global-MMLU dataset
4
+ # does not currently ship Albanian. 400 samples total (200 CS + 200 CA).
5
+ dataset_path: CohereLabs/Global-MMLU-Lite
6
+ dataset_name: sq
7
+ test_split: test
8
+ include: _default_mcq_yaml
9
+ process_docs: !function utils.process_global_mmlu_docs
10
+ doc_to_text: "Ti je një AI me njohuri të gjera që u përgjigjet pyetjeve me zgjedhje të shumëfishta për lëndën '{{subject_field}}'.\n\nPyetja:\n{{question}}\n\nMundësitë:\nA: {{choice_a}}\nB: {{choice_b}}\nC: {{choice_c}}\nD: {{choice_d}}\n\nUdhëzim: Përgjigju me VETËM një shkronjë: A, B, C ose D. Asnjë tekst tjetër.\n\nPërgjigja:"
11
+ doc_to_target: "{{gold_letter}}"
evals/{igbo/mcq/igbo_mcq.yaml → albanian/mcq/albanian_mcq.yaml} RENAMED
@@ -1,8 +1,8 @@
1
- # Multiple-Choice QA subgroup (AfriMMLU + Belebele)
2
- group: igbo_mcq
3
  task:
4
- - igbo_afrimmlu
5
- - igbo_belebele
6
  aggregate_metric_list:
7
  - metric: f1_macro
8
  aggregation: mean
 
1
+ # Multiple-Choice QA subgroup (Belebele + Global-MMLU)
2
+ group: albanian_mcq
3
  task:
4
+ - albanian_belebele
5
+ - albanian_global_mmlu
6
  aggregate_metric_list:
7
  - metric: f1_macro
8
  aggregation: mean
evals/{yoruba → albanian}/mcq/utils.py RENAMED
@@ -1,7 +1,7 @@
1
- """Utility helpers for Yoruba MCQ tasks (AfriMMLU, Belebele)."""
2
 
3
  import os as _os, sys as _sys # noqa: E401
4
- _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..","..",)))
5
 
6
 
7
  import ast
@@ -18,40 +18,48 @@ def _safe_str(x):
18
  return "" if x is None else str(x)
19
 
20
 
21
- # -- AfriMMLU --------------------------------------------------------------
22
 
23
 
24
- def _normalize_choices(doc):
25
- """Robustly extract 4 choices from AfriMMLU documents.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
- The HF dataset uses several different field layouts depending on the
28
- upload revision; this mirrors the logic in ``run_mcq.py``.
 
 
 
 
 
29
  """
30
- # Try mc_answer1-4
31
- mc = [
32
- _safe_str(doc.get("mc_answer1", "")).strip(),
33
- _safe_str(doc.get("mc_answer2", "")).strip(),
34
- _safe_str(doc.get("mc_answer3", "")).strip(),
35
- _safe_str(doc.get("mc_answer4", "")).strip(),
36
- ]
37
- if all(mc):
38
- return mc
39
-
40
- # Try A/B/C/D keys
41
  if all(k in doc for k in CHOICE_LETTERS):
42
  return [_safe_str(doc[k]).strip() for k in CHOICE_LETTERS]
43
 
44
- # Try 'choices' field
45
  choices = doc.get("choices")
46
-
47
  if isinstance(choices, list) and len(choices) >= 4:
48
  return [_safe_str(x).strip() for x in choices[:4]]
49
-
50
  if isinstance(choices, dict):
51
  upper = {str(k).upper(): k for k in choices}
52
  if all(k in upper for k in CHOICE_LETTERS):
53
  return [_safe_str(choices[upper[k]]).strip() for k in CHOICE_LETTERS]
54
-
55
  if isinstance(choices, str):
56
  for parser in (json.loads, ast.literal_eval):
57
  try:
@@ -64,60 +72,42 @@ def _normalize_choices(doc):
64
  return ["", "", "", ""]
65
 
66
 
67
- def _gold_letter(doc):
68
- """Resolve the gold answer to a letter (A/B/C/D)."""
69
  answer = doc.get("answer")
70
 
71
- # Direct letter
72
  if isinstance(answer, str) and answer.strip().upper() in set(CHOICE_LETTERS):
73
  return answer.strip().upper()
74
 
75
- # Integer index (0-based)
76
  try:
77
  idx = int(answer)
78
  if 0 <= idx <= 3:
79
  return CHOICE_LETTERS[idx]
 
 
80
  except (ValueError, TypeError):
81
  pass
82
 
83
  return ""
84
 
85
 
86
- def process_afrimmlu_docs(dataset: datasets.Dataset) -> datasets.Dataset:
87
- """Normalise AfriMMLU fields for the YAML template."""
88
 
89
  def _process(doc):
90
- choices = _normalize_choices(doc)
91
  doc["choice_a"] = choices[0]
92
  doc["choice_b"] = choices[1]
93
  doc["choice_c"] = choices[2]
94
  doc["choice_d"] = choices[3]
95
- doc["gold_letter"] = _gold_letter(doc)
96
  doc["subject_field"] = doc.get("subject", "unknown")
97
  return doc
98
 
99
  return dataset.map(_process)
100
 
101
 
102
- # -- Belebele --------------------------------------------------------------
103
-
104
-
105
- def process_belebele_docs(dataset: datasets.Dataset) -> datasets.Dataset:
106
- """Resolve correct_answer_num (1-4) -> gold_letter (A-D)."""
107
-
108
- def _process(doc):
109
- num = doc.get("correct_answer_num")
110
- try:
111
- n = int(str(num).strip())
112
- doc["gold_letter"] = CHOICE_LETTERS[n - 1] if 1 <= n <= 4 else ""
113
- except (ValueError, TypeError):
114
- doc["gold_letter"] = ""
115
- return doc
116
-
117
- return dataset.map(_process)
118
-
119
-
120
- # -- Result processing ----------------------------------------------------
121
 
122
 
123
  def process_results(doc, results):
 
1
+ """Utility helpers for Albanian MCQ tasks (Belebele, Global-MMLU)."""
2
 
3
  import os as _os, sys as _sys # noqa: E401
4
+ _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..", "..",)))
5
 
6
 
7
  import ast
 
18
  return "" if x is None else str(x)
19
 
20
 
21
+ # ── Belebele ──────────────────────────────────────────────────────────
22
 
23
 
24
+ def process_belebele_docs(dataset: datasets.Dataset) -> datasets.Dataset:
25
+ """Resolve correct_answer_num (1-4) -> gold_letter (A-D)."""
26
+
27
+ def _process(doc):
28
+ num = doc.get("correct_answer_num")
29
+ try:
30
+ n = int(str(num).strip())
31
+ doc["gold_letter"] = CHOICE_LETTERS[n - 1] if 1 <= n <= 4 else ""
32
+ except (ValueError, TypeError):
33
+ doc["gold_letter"] = ""
34
+ return doc
35
+
36
+ return dataset.map(_process)
37
+
38
+
39
+ # ── Global-MMLU ───────────────────────────────────────────────────────
40
 
41
+
42
+ def _global_mmlu_choices(doc):
43
+ """Robustly extract 4 choices from a Global-MMLU document.
44
+
45
+ Global-MMLU exposes the four options under ``option_a``..``option_d``
46
+ keys (lower-case). Older revisions used ``A``..``D`` or a ``choices``
47
+ list, which we accept as fallbacks.
48
  """
49
+ lower_keys = ["option_a", "option_b", "option_c", "option_d"]
50
+ if all(k in doc for k in lower_keys):
51
+ return [_safe_str(doc[k]).strip() for k in lower_keys]
52
+
 
 
 
 
 
 
 
53
  if all(k in doc for k in CHOICE_LETTERS):
54
  return [_safe_str(doc[k]).strip() for k in CHOICE_LETTERS]
55
 
 
56
  choices = doc.get("choices")
 
57
  if isinstance(choices, list) and len(choices) >= 4:
58
  return [_safe_str(x).strip() for x in choices[:4]]
 
59
  if isinstance(choices, dict):
60
  upper = {str(k).upper(): k for k in choices}
61
  if all(k in upper for k in CHOICE_LETTERS):
62
  return [_safe_str(choices[upper[k]]).strip() for k in CHOICE_LETTERS]
 
63
  if isinstance(choices, str):
64
  for parser in (json.loads, ast.literal_eval):
65
  try:
 
72
  return ["", "", "", ""]
73
 
74
 
75
+ def _global_mmlu_gold_letter(doc):
76
+ """Resolve the Global-MMLU gold answer to a letter (A/B/C/D)."""
77
  answer = doc.get("answer")
78
 
 
79
  if isinstance(answer, str) and answer.strip().upper() in set(CHOICE_LETTERS):
80
  return answer.strip().upper()
81
 
 
82
  try:
83
  idx = int(answer)
84
  if 0 <= idx <= 3:
85
  return CHOICE_LETTERS[idx]
86
+ if 1 <= idx <= 4:
87
+ return CHOICE_LETTERS[idx - 1]
88
  except (ValueError, TypeError):
89
  pass
90
 
91
  return ""
92
 
93
 
94
+ def process_global_mmlu_docs(dataset: datasets.Dataset) -> datasets.Dataset:
95
+ """Normalise Global-MMLU fields for the YAML template."""
96
 
97
  def _process(doc):
98
+ choices = _global_mmlu_choices(doc)
99
  doc["choice_a"] = choices[0]
100
  doc["choice_b"] = choices[1]
101
  doc["choice_c"] = choices[2]
102
  doc["choice_d"] = choices[3]
103
+ doc["gold_letter"] = _global_mmlu_gold_letter(doc)
104
  doc["subject_field"] = doc.get("subject", "unknown")
105
  return doc
106
 
107
  return dataset.map(_process)
108
 
109
 
110
+ # ── Result processing ────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
 
113
  def process_results(doc, results):
evals/albanian/open_generation/_default_open_generation_yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Shared config for Albanian open-ended generation tasks (Aya, PolyWrite).
2
+ # Both datasets are highly multilingual single-table dumps. Filtering to
3
+ # Albanian rows is done in process_docs.
4
+ #
5
+ # Required env vars:
6
+ # OPENAI_API_KEY – OpenRouter / OpenAI key
7
+ # Optional env vars:
8
+ # JUDGE_MODEL – judge model name (default: openai/gpt-5-mini)
9
+ # JUDGE_BASE_URL – judge API endpoint (default: https://openrouter.ai/api/v1)
10
+ # JUDGE_CONCURRENCY – parallel judge calls (default: 32)
11
+ output_type: generate_until
12
+ generation_kwargs:
13
+ do_sample: false
14
+ max_gen_toks: 8192
15
+ until:
16
+ - "<|endoftext|>"
17
+ metadata:
18
+ version: 1.0
evals/albanian/open_generation/albanian_aya.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: albanian_aya
2
+ task_alias: aya
3
+ dataset_path: CohereLabs/aya_evaluation_suite
4
+ dataset_name: dolly_machine_translated
5
+ test_split: test
6
+ include: _default_open_generation_yaml
7
+ process_docs: !function utils.process_aya_docs
8
+ doc_to_text: "{{inputs}}"
9
+ doc_to_target: "{{targets}}"
10
+ process_results: !function utils.process_results_aya
11
+ metric_list:
12
+ - metric: llm_judge_score
13
+ aggregation: !function utils.llm_judge_agg
14
+ higher_is_better: true
evals/albanian/open_generation/albanian_open_generation.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Open-ended generation subgroup (Aya + PolyWrite)
2
+ # Note: Aya uses reference-based llm_judge_score; PolyWrite uses the
3
+ # reference-free open_quality_score (1-5 rubric, normalised to 0-1).
4
+ # Both metrics are on the same 0-1 scale so their mean is meaningful.
5
+ group: albanian_open_generation
6
+ task:
7
+ - albanian_aya
8
+ - albanian_polywrite
9
+ aggregate_metric_list:
10
+ - metric: llm_judge_score
11
+ aggregation: mean
12
+ weight_by_size: true
13
+ - metric: open_quality_score
14
+ aggregation: mean
15
+ weight_by_size: true
16
+ metadata:
17
+ version: 1.0
evals/albanian/open_generation/albanian_polywrite.yaml ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: albanian_polywrite
2
+ task_alias: polywrite
3
+ dataset_path: MaLA-LM/PolyWrite
4
+ test_split: train
5
+ include: _default_open_generation_yaml
6
+ process_docs: !function utils.process_polywrite_docs
7
+ doc_to_text: "{{prompt}}"
8
+ # PolyWrite has no reference answer; doc_to_target is a placeholder so
9
+ # lm-eval is happy. The reference-free judge ignores it.
10
+ doc_to_target: ""
11
+ process_results: !function utils.process_results_polywrite
12
+ metric_list:
13
+ - metric: open_quality_score
14
+ aggregation: mean
15
+ higher_is_better: true
evals/albanian/open_generation/utils.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility helpers for Albanian open-ended generation tasks.
2
+
3
+ Two datasets are supported:
4
+
5
+ * **Aya Evaluation Suite** (``CohereLabs/aya_evaluation_suite``,
6
+ config ``dolly_machine_translated``) – has a translated reference
7
+ ``targets`` for every prompt, so we can use the standard reference-based
8
+ judge from ``judge_utils``.
9
+ * **PolyWrite** (``MaLA-LM/PolyWrite``) – open-ended creative writing
10
+ prompts with **no** reference answer. We use a separate "open quality"
11
+ judge that scores 1-5 against an Albanian-fluency + relevance rubric.
12
+
13
+ Both datasets are highly multilingual single-table dumps; we filter to
14
+ Albanian inside ``process_docs``.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import json
20
+ import os as _os
21
+ import sys as _sys
22
+ import time
23
+
24
+ _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..", "..",)))
25
+
26
+
27
+ import datasets
28
+
29
+ from judge_utils import ( # noqa: F401 (re-exported for yaml use)
30
+ llm_judge_agg,
31
+ strip_think_tags,
32
+ _get_judge_client,
33
+ )
34
+
35
+
36
+ # ── Albanian filtering ───────────────────────────────────────────────
37
+
38
+
39
+ _ALBANIAN_LANG_CODES = {"sqi", "als", "aln"}
40
+ _ALBANIAN_LANG_SCRIPTS = {"sqi_Latn", "als_Latn", "aln_Latn"}
41
+
42
+
43
+ def process_aya_docs(dataset: datasets.Dataset) -> datasets.Dataset:
44
+ """Filter Aya rows to Albanian and project the columns we need."""
45
+
46
+ def _is_albanian(row):
47
+ return str(row.get("language", "")).lower() in _ALBANIAN_LANG_CODES
48
+
49
+ filtered = dataset.filter(_is_albanian)
50
+
51
+ def _project(row):
52
+ return {
53
+ "inputs": (row.get("inputs") or "").strip(),
54
+ "targets": (row.get("targets") or "").strip(),
55
+ }
56
+
57
+ cols_to_drop = [c for c in filtered.column_names if c not in ("inputs", "targets")]
58
+ return filtered.map(_project, remove_columns=cols_to_drop)
59
+
60
+
61
+ def process_polywrite_docs(dataset: datasets.Dataset) -> datasets.Dataset:
62
+ """Filter PolyWrite rows to Albanian and surface the translated prompt."""
63
+
64
+ def _is_albanian(row):
65
+ return str(row.get("lang_script", "")) in _ALBANIAN_LANG_SCRIPTS
66
+
67
+ filtered = dataset.filter(_is_albanian)
68
+
69
+ def _project(row):
70
+ return {
71
+ "prompt": (row.get("prompt_translated") or "").strip(),
72
+ "category": (row.get("category") or "").strip(),
73
+ "name": (row.get("name") or "").strip(),
74
+ }
75
+
76
+ cols_to_drop = [c for c in filtered.column_names if c not in ("prompt", "category", "name")]
77
+ return filtered.map(_project, remove_columns=cols_to_drop)
78
+
79
+
80
+ # ── Aya: reference-based judge ──────────────────────────────────────
81
+
82
+
83
+ def process_results_aya(doc, results):
84
+ """Return (question, gold, pred, raw_response) for the standard llm_judge."""
85
+ raw_response = results[0].strip() if results and results[0] else ""
86
+ pred = strip_think_tags(raw_response)
87
+ question = str(doc.get("inputs", ""))
88
+ gold = str(doc.get("targets", ""))
89
+ return {"llm_judge_score": (question, gold, pred, raw_response)}
90
+
91
+
92
+ # ── PolyWrite: reference-free open-quality judge ────────────────────
93
+
94
+
95
+ _OPEN_QUALITY_PROMPT = """\
96
+ You are an expert evaluator of open-ended writing in Albanian (gjuha shqipe).
97
+
98
+ You will be given a writing PROMPT in Albanian and a MODEL_ANSWER produced \
99
+ by a language model. The prompt has NO reference answer; judge the answer \
100
+ on its own merits.
101
+
102
+ Score the answer from 1 (very poor) to 5 (excellent) on the following \
103
+ combined rubric:
104
+
105
+ 1. **Relevance** – does the answer address the prompt and stay on-topic?
106
+ 2. **Fluency** – is the writing in fluent, grammatical Albanian (Tosk \
107
+ ``sqi`` or ``als`` accepted)?
108
+ 3. **Coherence** – is the answer well-structured and internally consistent?
109
+ 4. **Quality** – is the content interesting / useful / creative as the \
110
+ prompt requests?
111
+
112
+ Calibration:
113
+ - 5: excellent on all four axes.
114
+ - 4: good, minor issues.
115
+ - 3: acceptable but noticeably flawed (e.g. partly off-topic, awkward Albanian).
116
+ - 2: poor (off-topic, broken Albanian, very short, or generic boilerplate).
117
+ - 1: unusable (refusal, wrong language, gibberish, empty).
118
+
119
+ Respond ONLY with a single compact JSON object with exactly these keys:
120
+ - "score": integer 1-5
121
+ - "justification": one short sentence (Albanian or English).
122
+
123
+ PROMPT:
124
+ {prompt}
125
+
126
+ MODEL_ANSWER:
127
+ {pred}"""
128
+
129
+
130
+ def _call_open_quality_judge(prompt: str, pred: str, max_retries: int = 3) -> float:
131
+ client = _get_judge_client()
132
+ judge_model = _os.getenv("JUDGE_MODEL", "openai/gpt-5-mini")
133
+ judge_text = _OPEN_QUALITY_PROMPT.format(prompt=prompt.strip(), pred=pred.strip())
134
+
135
+ for attempt in range(max_retries):
136
+ try:
137
+ resp = client.chat.completions.create(
138
+ model=judge_model,
139
+ temperature=0,
140
+ messages=[{"role": "user", "content": judge_text}],
141
+ response_format={"type": "json_object"},
142
+ )
143
+ data = json.loads(resp.choices[0].message.content)
144
+ raw = data.get("score", 0)
145
+ try:
146
+ score_int = int(raw)
147
+ except (TypeError, ValueError):
148
+ score_int = 0
149
+ score_int = max(1, min(5, score_int)) if score_int else 0
150
+ return (score_int - 1) / 4.0 if score_int else 0.0
151
+ except Exception as e:
152
+ if attempt < max_retries - 1:
153
+ time.sleep(2 ** attempt)
154
+ else:
155
+ print(f"[open-judge] Failed after {max_retries} retries: {e}")
156
+ return 0.0
157
+ return 0.0
158
+
159
+
160
+ def process_results_polywrite(doc, results):
161
+ """Score each sample with the open-quality judge and return a per-sample float.
162
+
163
+ Returning a numeric score (instead of a tuple consumed by a custom aggregator)
164
+ means the score is written to every JSONL sample row by lm-eval, so downstream
165
+ tools (e.g. the dashboard) can recompute the average from samples alone.
166
+ Aggregation is plain ``mean`` in the YAML.
167
+ """
168
+ raw_response = results[0].strip() if results and results[0] else ""
169
+ pred = strip_think_tags(raw_response)
170
+ prompt = str(doc.get("prompt", ""))
171
+ score = _call_open_quality_judge(prompt, pred)
172
+ return {"open_quality_score": score}
evals/albanian/summarization/_default_summarization_yaml ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Shared config for Albanian summarization tasks (MassiveSumm).
2
+ # The Albanian-only filter is applied in process_docs (the upstream
3
+ # datasets are highly multilingual single-table dumps, no per-language
4
+ # config). Scoring is sentence-level ROUGE-L F1.
5
+ output_type: generate_until
6
+ test_split: train
7
+ generation_kwargs:
8
+ do_sample: false
9
+ max_gen_toks: 8192
10
+ until:
11
+ - "<|endoftext|>"
12
+ process_results: !function utils.process_results
13
+ metric_list:
14
+ - metric: rouge_l
15
+ aggregation: !function utils.rouge_l_agg
16
+ higher_is_better: true
17
+ metadata:
18
+ version: 1.0
evals/albanian/summarization/albanian_massivesumm_long.yaml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: albanian_massivesumm_long
2
+ task_alias: massivesumm_long
3
+ # Gated dataset: accept terms at
4
+ # https://huggingface.co/datasets/MaLA-LM/MassiveSumm_long and export HF_TOKEN.
5
+ dataset_path: MaLA-LM/MassiveSumm_long
6
+ include: _default_summarization_yaml
7
+ process_docs: !function utils.process_docs
8
+ generation_kwargs:
9
+ do_sample: false
10
+ max_gen_toks: 8192
11
+ until:
12
+ - "<|endoftext|>"
13
+ doc_to_text: "Ti je një sistem përmbledhjeje lajmesh.\nPërmblidh artikullin e mëposhtëm në një paragraf të shkurtër (3-5 fjali) në gjuhën shqipe. Mos shto komente.\n\nArtikulli:\n{{text}}\n\nPërmbledhja:"
14
+ doc_to_target: "{{summary}}"
evals/albanian/summarization/albanian_massivesumm_short.yaml ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ task: albanian_massivesumm_short
2
+ task_alias: massivesumm_short
3
+ # Gated dataset: accept terms at
4
+ # https://huggingface.co/datasets/MaLA-LM/MassiveSumm_short and export HF_TOKEN.
5
+ dataset_path: MaLA-LM/MassiveSumm_short
6
+ include: _default_summarization_yaml
7
+ process_docs: !function utils.process_docs
8
+ doc_to_text: "Ti je një sistem përmbledhjeje lajmesh.\nPërmblidh artikullin e mëposhtëm në një ose dy fjali të shkurtra në gjuhën shqipe. Mos shto komente.\n\nArtikulli:\n{{text}}\n\nPërmbledhja:"
9
+ doc_to_target: "{{summary}}"
evals/albanian/summarization/albanian_summarization.yaml ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Summarization subgroup (MassiveSumm short + long)
2
+ group: albanian_summarization
3
+ task:
4
+ - albanian_massivesumm_short
5
+ - albanian_massivesumm_long
6
+ aggregate_metric_list:
7
+ - metric: rouge_l
8
+ aggregation: mean
9
+ weight_by_size: true
10
+ metadata:
11
+ version: 1.0
evals/albanian/summarization/utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility helpers for Albanian summarization tasks (MassiveSumm short/long).
2
+
3
+ Both MassiveSumm subsets are highly multilingual single-table datasets
4
+ (one ``train`` split, no language configs). We filter to Albanian rows
5
+ inside ``process_docs``. The HF dataset is **gated** — accept the terms
6
+ on the dataset page once and export ``HF_TOKEN`` before running.
7
+
8
+ Scoring uses a self-contained token-level ROUGE-L F1 (LCS-based, no stemming)
9
+ implemented in this module; the ``rouge_score`` package is not required.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import re
15
+ import string
16
+
17
+ import datasets
18
+
19
+
20
+ _ALBANIAN_LANG_CODES = {"sqi", "als", "aln"}
21
+
22
+
23
+ def _strip_think_tags(text: str) -> str:
24
+ """Strip <think>...</think> reasoning wrapper (e.g. Qwen thinking models)."""
25
+ if "</think>" in text:
26
+ return text.split("</think>")[-1].strip()
27
+ return text
28
+
29
+
30
+ def _filter_albanian(dataset: datasets.Dataset) -> datasets.Dataset:
31
+ """Keep rows whose ``language`` field is one of Albanian variants."""
32
+ if "language" not in dataset.column_names:
33
+ return dataset
34
+ return dataset.filter(lambda row: str(row.get("language", "")).lower() in _ALBANIAN_LANG_CODES)
35
+
36
+
37
+ def _normalise_doc(doc):
38
+ """Project the columns we actually need."""
39
+ text = (doc.get("text") or "").strip()
40
+ summary = (doc.get("summary") or "").strip()
41
+ title = (doc.get("title") or "").strip()
42
+ return {
43
+ "text": text,
44
+ "summary": summary,
45
+ "title": title,
46
+ }
47
+
48
+
49
def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:
    """Filter to Albanian rows and reduce each row to text / summary / title.

    Every column other than ``text``, ``summary`` and ``title`` is dropped,
    and the three kept fields are cleaned by ``_normalise_doc``.
    """
    albanian_rows = _filter_albanian(dataset)
    kept = ("text", "summary", "title")
    surplus = [col for col in albanian_rows.column_names if col not in kept]
    return albanian_rows.map(_normalise_doc, remove_columns=surplus)
54
+
55
+
56
+ # ── ROUGE-L scoring ──────────────────────────────────────────────────
57
+
58
+
59
+ _PUNCT_TABLE = str.maketrans("", "", string.punctuation)
60
+ _WHITESPACE_RE = re.compile(r"\s+")
61
+
62
+
63
+ def _normalise(text: str) -> str:
64
+ text = text.translate(_PUNCT_TABLE)
65
+ text = _WHITESPACE_RE.sub(" ", text)
66
+ return text.strip().lower()
67
+
68
+
69
+ def _lcs_length(a, b):
70
+ m, n = len(a), len(b)
71
+ if m == 0 or n == 0:
72
+ return 0
73
+ dp = [0] * (n + 1)
74
+ for i in range(1, m + 1):
75
+ prev = 0
76
+ for j in range(1, n + 1):
77
+ tmp = dp[j]
78
+ if a[i - 1] == b[j - 1]:
79
+ dp[j] = prev + 1
80
+ else:
81
+ dp[j] = max(dp[j], dp[j - 1])
82
+ prev = tmp
83
+ return dp[n]
84
+
85
+
86
def _rouge_l_f1(pred: str, gold: str) -> float:
    """Return the ROUGE-L F1 between ``pred`` and ``gold`` (no stemming).

    Both strings are normalised and split on whitespace, and each is treated
    as a single token sequence. Precision is LCS / prediction length, recall
    is LCS / reference length. The score is 0.0 when either side has no
    tokens or the sequences share no common subsequence.
    """
    hyp_tokens = _normalise(pred).split()
    ref_tokens = _normalise(gold).split()
    if not (hyp_tokens and ref_tokens):
        return 0.0
    overlap = _lcs_length(hyp_tokens, ref_tokens)
    if not overlap:
        return 0.0
    precision, recall = overlap / len(hyp_tokens), overlap / len(ref_tokens)
    return 2 * precision * recall / (precision + recall)
98
+
99
+
100
def process_results(doc, results):
    """Pair the reference summary with the cleaned model output.

    Returns ``{"rouge_l": (gold, pred)}``; the actual ROUGE-L computation is
    deferred to ``rouge_l_agg``. A missing or empty generation becomes the
    empty string, and so does a missing reference summary.
    """
    first = results[0] if results else None
    raw_response = first.strip() if first else ""
    prediction = _strip_think_tags(raw_response)
    reference = (doc.get("summary") or "").strip()
    return {"rouge_l": (reference, prediction)}
105
+
106
+
107
def rouge_l_agg(items):
    """Average the ROUGE-L F1 over ``(gold, pred)`` pairs.

    ``items`` holds the tuples emitted by ``process_results``. Returns 0.0
    for an empty collection.
    """
    if not items:
        return 0.0
    total = sum(_rouge_l_f1(pred, gold) for gold, pred in items)
    return total / len(items)
evals/arabic/classification/arabic_sib200.yaml CHANGED
@@ -12,6 +12,13 @@ generation_kwargs:
12
  process_docs: !function utils.process_sib200_docs
13
  doc_to_text: "أنت نظام تصنيف مواضيع.\nاختر التصنيف الأنسب للنص التالي.\n\nالتصنيفات المسموحة: {{labels_str}}\n\nالتعليمات: أجب بتصنيف واحد فقط من التصنيفات المسموحة. لا تكتب أي شيء آخر.\n\nالنص:\n{{text}}\n\nالتصنيف:"
14
  doc_to_target: "{{target}}"
 
 
 
 
 
 
 
15
  process_results: !function utils.process_results
16
  metric_list:
17
  - metric: f1_macro
 
12
  process_docs: !function utils.process_sib200_docs
13
  doc_to_text: "أنت نظام تصنيف مواضيع.\nاختر التصنيف الأنسب للنص التالي.\n\nالتصنيفات المسموحة: {{labels_str}}\n\nالتعليمات: أجب بتصنيف واحد فقط من التصنيفات المسموحة. لا تكتب أي شيء آخر.\n\nالنص:\n{{text}}\n\nالتصنيف:"
14
  doc_to_target: "{{target}}"
15
+ filter_list:
16
+ - name: "get_label"
17
+ filter:
18
+ - function: "strip_think_recover"
19
+ - function: "regex_label_set"
20
+ labels_field: "labels_str"
21
+ - function: "take_first"
22
  process_results: !function utils.process_results
23
  metric_list:
24
  - metric: f1_macro
evals/arabic/qa/arabic_qa.yaml CHANGED
@@ -2,6 +2,9 @@ group: arabic_qa
2
  task:
3
  - arabic_tydiqa
4
  aggregate_metric_list:
 
 
 
5
  - metric: f1
6
  aggregation: mean
7
  weight_by_size: true
 
2
  task:
3
  - arabic_tydiqa
4
  aggregate_metric_list:
5
+ - metric: exact_match
6
+ aggregation: mean
7
+ weight_by_size: true
8
  - metric: f1
9
  aggregation: mean
10
  weight_by_size: true
evals/cost_core.py CHANGED
@@ -53,7 +53,6 @@ _OUTPUT_HEURISTIC_BY_KEYWORD = [
53
  ("afrimgsm", 1024),
54
  ("mgsm", 1024),
55
  ("gsm8k", 1024),
56
- ("ifeval", 512),
57
  # Specific QA tasks first (some end in "qa" as a substring).
58
  ("tydiqa", 256),
59
  ("aquas", 384),
 
53
  ("afrimgsm", 1024),
54
  ("mgsm", 1024),
55
  ("gsm8k", 1024),
 
56
  # Specific QA tasks first (some end in "qa" as a substring).
57
  ("tydiqa", 256),
58
  ("aquas", 384),
evals/english/english.yaml CHANGED
@@ -2,7 +2,6 @@ group: english
2
  task:
3
  - english_mcq
4
  - english_math
5
- - ifeval
6
  - english_mmlu_pro
7
  metadata:
8
  version: 1.0
 
2
  task:
3
  - english_mcq
4
  - english_math
 
5
  - english_mmlu_pro
6
  metadata:
7
  version: 1.0
evals/eval_config.toml CHANGED
@@ -141,9 +141,6 @@ name = "english_mcq"
141
  [[tasks]]
142
  name = "english_math"
143
 
144
- [[tasks]]
145
- name = "ifeval"
146
-
147
  [[tasks]]
148
  name = "english_mmlu_pro"
149
 
@@ -153,95 +150,94 @@ name = "english_mmlu_pro"
153
  [[tasks]]
154
  name = "spanish_xquad_es"
155
 
156
- [[tasks]]
157
- name = "spanish_mcq"
158
-
159
  # Swahili
160
  [[tasks]]
161
  name = "swahili_classification"
162
 
 
163
  [[tasks]]
164
- name = "swahili_afrimgsm"
165
 
166
  [[tasks]]
167
- name = "swahili_nli"
168
 
169
- # French
170
  [[tasks]]
171
- name = "french_classification"
172
 
173
  [[tasks]]
174
- name = "french_mcq"
175
 
 
176
  [[tasks]]
177
- name = "french_math"
178
 
179
  [[tasks]]
180
- name = "french_qa"
181
 
182
- # Arabic
183
  [[tasks]]
184
- name = "arabic_classification"
185
 
 
186
  [[tasks]]
187
- name = "arabic_mcq"
188
 
189
  [[tasks]]
190
- name = "arabic_qa"
191
 
192
- # Hausa
193
  [[tasks]]
194
- name = "hausa_classification"
195
 
196
  [[tasks]]
197
- name = "hausa_mcq"
198
 
199
  [[tasks]]
200
- name = "hausa_afrimgsm"
201
 
 
202
  [[tasks]]
203
- name = "hausa_nli"
204
 
205
  [[tasks]]
206
- name = "hausa_qa"
207
 
208
  [[tasks]]
209
- name = "hausa_sentiment"
210
 
211
- # Yoruba
212
  [[tasks]]
213
- name = "yoruba_classification"
214
 
215
  [[tasks]]
216
- name = "yoruba_mcq"
217
 
218
  [[tasks]]
219
- name = "yoruba_afrimgsm"
220
 
221
  [[tasks]]
222
- name = "yoruba_nli"
223
 
 
224
  [[tasks]]
225
- name = "yoruba_qa"
226
 
227
  [[tasks]]
228
- name = "yoruba_sentiment"
229
 
230
- # Igbo
231
  [[tasks]]
232
- name = "igbo_classification"
233
 
 
234
  [[tasks]]
235
- name = "igbo_mcq"
236
 
237
  [[tasks]]
238
- name = "igbo_afrimgsm"
239
 
240
  [[tasks]]
241
- name = "igbo_nli"
242
 
243
  [[tasks]]
244
- name = "igbo_qa"
245
 
246
  [[tasks]]
247
- name = "igbo_sentiment"
 
141
  [[tasks]]
142
  name = "english_math"
143
 
 
 
 
144
  [[tasks]]
145
  name = "english_mmlu_pro"
146
 
 
150
  [[tasks]]
151
  name = "spanish_xquad_es"
152
 
 
 
 
153
  # Swahili
154
  [[tasks]]
155
  name = "swahili_classification"
156
 
157
+ # Albanian
158
  [[tasks]]
159
+ name = "albanian_classification"
160
 
161
  [[tasks]]
162
+ name = "albanian_mcq"
163
 
 
164
  [[tasks]]
165
+ name = "albanian_summarization"
166
 
167
  [[tasks]]
168
+ name = "albanian_open_generation"
169
 
170
+ # Portuguese
171
  [[tasks]]
172
+ name = "portuguese_mcq"
173
 
174
  [[tasks]]
175
+ name = "portuguese_classification"
176
 
 
177
  [[tasks]]
178
+ name = "portuguese_nli"
179
 
180
+ # Ukrainian
181
  [[tasks]]
182
+ name = "ukrainian_classification"
183
 
184
  [[tasks]]
185
+ name = "ukrainian_mcq"
186
 
 
187
  [[tasks]]
188
+ name = "ukrainian_qa"
189
 
190
  [[tasks]]
191
+ name = "ukrainian_summarization"
192
 
193
  [[tasks]]
194
+ name = "ukrainian_open_generation"
195
 
196
+ # Urdu
197
  [[tasks]]
198
+ name = "urdu_claim"
199
 
200
  [[tasks]]
201
+ name = "urdu_classification"
202
 
203
  [[tasks]]
204
+ name = "urdu_qa"
205
 
206
+ # French
207
  [[tasks]]
208
+ name = "french_classification"
209
 
210
  [[tasks]]
211
+ name = "french_mcq"
212
 
213
  [[tasks]]
214
+ name = "french_math"
215
 
216
  [[tasks]]
217
+ name = "french_qa"
218
 
219
+ # Arabic
220
  [[tasks]]
221
+ name = "arabic_classification"
222
 
223
  [[tasks]]
224
+ name = "arabic_mcq"
225
 
 
226
  [[tasks]]
227
+ name = "arabic_qa"
228
 
229
+ # Hausa
230
  [[tasks]]
231
+ name = "hausa_classification"
232
 
233
  [[tasks]]
234
+ name = "hausa_mcq"
235
 
236
  [[tasks]]
237
+ name = "hausa_afrimgsm"
238
 
239
  [[tasks]]
240
+ name = "hausa_nli"
241
 
242
  [[tasks]]
243
+ name = "hausa_qa"
evals/f1_utils.py CHANGED
@@ -3,9 +3,25 @@
3
  Provides the macro-averaged F1 aggregation and a common process_results
4
  helper used across all language-specific classification utils modules.
5
 
6
- Registers ``regex_last``: like lm_eval's ``regex`` filter, but picks a match
7
- from ``findall`` using ``group_select``; default ``group_select=-1`` is the
8
- **last** match (needed when CoT/reasoning mentions labels before the answer).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  """
10
 
11
  import re
@@ -21,6 +37,41 @@ def _strip_think_tags(text: str) -> str:
21
  return text
22
 
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  def macro_f1_agg(items):
25
  """Compute macro-averaged F1 over all class labels.
26
 
@@ -93,29 +144,123 @@ class RegexLastFilter(Filter):
93
  return list(map(filter_set, resps))
94
 
95
 
96
- def _normalize_label(s: str) -> str:
97
- """Light normalization for classification labels: lowercase, strip
98
- surrounding whitespace and trailing punctuation (e.g. ``entailment.``).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  """
100
- if not s:
101
- return ""
102
- cleaned = s.strip().lower()
103
- # Strip trailing punctuation commonly emitted by chat models
104
- cleaned = cleaned.rstrip(".,;:!?\"'`")
105
- return cleaned.strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
 
107
 
108
  def process_results_f1(doc, results, *, gold_key="target"):
109
  """Return ``(pred, gold)`` for macro-F1 aggregation.
110
 
111
- ``pred`` is the label after stripping think wrappers and light
112
- normalization (lowercasing, trailing punctuation). Full raw generation
113
- is logged as ``resps`` / ``reasoning_content`` when using ``run_eval.py``.
114
 
115
  Most tasks use ``gold_key="target"``; override for tasks that store
116
  the gold label under a different field name.
117
  """
118
  raw_response = results[0].strip() if results[0] else ""
119
- pred = _normalize_label(_strip_think_tags(raw_response))
120
- gold = _normalize_label(doc.get(gold_key, ""))
121
  return {"f1_macro": (pred, gold)}
 
3
  Provides the macro-averaged F1 aggregation and a common process_results
4
  helper used across all language-specific classification utils modules.
5
 
6
+ Registers:
7
+
8
+ - ``regex_last``: like lm_eval's ``regex`` filter, but picks a match from
9
+ ``findall`` using ``group_select``; default ``group_select=-1`` is the
10
+ **last** match (needed when CoT/reasoning mentions labels before the answer).
11
+ - ``strip_think_recover``: drop ``<think>…</think>`` so
12
+ downstream ``regex`` sees only the final answer channel; if that tail is empty
13
+ (e.g. stop at ``\\n\\n`` before content), fall back to the last non-empty line
14
+ of the reasoning block (see ``run_eval.py`` merge format).
15
+ - ``regex_label_set``: pick the last occurrence of any allowed label from a
16
+ per-doc field (e.g. ``labels_str`` for SIB-200, ``intents_str`` for InjongoIntent).
17
+ Robust to channel-marker leak (e.g. a leaked ``<|channel|>`` header before the
18
+ answer), models that say "the answer is X", and substring collisions
19
+ (``science/technology`` vs ``science``) -- labels are matched longest-first.
20
+ - ``strip_channel_header``: drop a Harmony-style channel-marker prefix
21
+ (``<channel|>`` / ``<|channel|>`` and optional trailing ``<|message|>``) from
22
+ the start of the response. Useful for open-text generation tasks
23
+ (summarization / QA / open generation) where the actual answer is correct
24
+ but the chat template leaks tokens at the start. No-op when no marker found.
25
  """
26
 
27
  import re
 
37
  return text
38
 
39
 
40
@register_filter("strip_think_recover")
class StripThinkRecoverFilter(Filter):
    """Reduce each response to its answer text, dropping the think wrapper.

    Later filters (such as a letter-matching ``regex``) would otherwise match
    inside the reasoning block when reasoning and content are merged into one
    string. For every response:

    - the text after ``</think>`` is kept when it is non-empty;
    - if that tail is empty but a ``</think>`` tag is present, the last
      non-empty line of the reasoning is used instead (typical when
      generation stopped before any answer content);
    - non-string responses are treated as the empty string.
    """

    def __init__(self) -> None:
        pass

    @staticmethod
    def _answer_text(resp):
        """Return the answer portion of one response (see class docstring)."""
        text = resp if isinstance(resp, str) else ""
        answer = _strip_think_tags(text)
        if answer or "</think>" not in text:
            return answer
        # Empty answer tail: recover from the reasoning before the first
        # closing tag, minus anything up to and including the opening tag.
        thought = text.partition("</think>")[0]
        _, opening_tag, after_open = thought.partition("<think>")
        if opening_tag:
            thought = after_open
        for line in reversed(thought.strip().splitlines()):
            candidate = line.strip()
            if candidate:
                return candidate
        return ""

    def apply(self, resps, docs):
        return [[self._answer_text(resp) for resp in inst] for inst in resps]
73
+
74
+
75
  def macro_f1_agg(items):
76
  """Compute macro-averaged F1 over all class labels.
77
 
 
144
  return list(map(filter_set, resps))
145
 
146
 
147
@register_filter("regex_label_set")
class RegexLabelSetFilter(Filter):
    """Pick the LAST occurrence of any allowed label from the response.

    The allowed labels are read from a per-doc field (default ``labels_str``,
    e.g. ``"entertainment, geography, ..., science/technology, ..."``) and
    split on ``separator``. Labels are matched **longest-first** so that a
    label such as ``science/technology`` wins over a label that is a
    substring of it, such as ``science``. Matching is case-sensitive and
    does not require word boundaries.

    Taking the last match means that text before the answer (a leaked
    channel header, or phrasing like "The answer is X") does not prevent the
    trailing label from being found. Chain ``strip_think_recover`` first
    when the response may contain a ``<think>...</think>`` block.

    If the doc field is missing or empty, or no label occurs in the
    response, ``fallback`` (default ``"[invalid]"``) is returned instead.
    """

    def __init__(
        self,
        labels_field: str = "labels_str",
        separator: str = ",",
        fallback: str = "[invalid]",
    ) -> None:
        self.labels_field = labels_field
        self.separator = separator
        self.fallback = fallback

    def _compile_labels(self, raw_labels: str):
        """Build the alternation regex for one label string, or ``None``.

        Labels are de-duplicated and ordered longest-first; equal-length
        labels are ordered alphabetically so the pattern is deterministic.
        """
        labels = {lbl.strip() for lbl in raw_labels.split(self.separator)}
        labels.discard("")
        if not labels:
            return None
        ordered = sorted(labels, key=lambda lbl: (-len(lbl), lbl))
        return re.compile("(" + "|".join(re.escape(lbl) for lbl in ordered) + ")")

    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
        # Docs of one task usually share the same label string, so compile
        # each distinct string once per call instead of once per doc.
        compiled = {}
        out: list[list[str]] = []
        for resp_set, doc in zip(resps, docs):
            raw_labels = str((doc or {}).get(self.labels_field, "") or "")
            if raw_labels not in compiled:
                compiled[raw_labels] = self._compile_labels(raw_labels)
            pattern = compiled[raw_labels]

            filtered: list[str] = []
            for resp in resp_set:
                if pattern is None:
                    filtered.append(self.fallback)
                    continue
                text = resp if isinstance(resp, str) else ""
                matches = pattern.findall(text)
                filtered.append(matches[-1].strip() if matches else self.fallback)
            out.append(filtered)
        return out
204
+
205
+
206
@register_filter("strip_channel_header")
class StripChannelHeaderFilter(Filter):
    """Remove a leaked channel/message header from the start of a response.

    Chat-template tokens such as ``<|channel|>final<|message|>`` -- or
    partial fragments such as ``s.<channel|>`` -- can leak into the assistant
    content. On open-text tasks (summarization / QA / open generation) that
    prefix lowers every text metric even when the answer after it is fine.

    The filter looks, anchored at the start of the response, for at most
    ``max_prefix_chars`` (default 80) characters followed by a channel
    marker, optionally followed by a message marker, and deletes everything
    matched including trailing whitespace. Responses with no marker near the
    start are returned unchanged; non-string responses become ``""``.

    Chain it after ``strip_think_recover`` so it sees only the answer tail.
    """

    def __init__(self, max_prefix_chars: int = 80) -> None:
        self.max_prefix_chars = int(max_prefix_chars)
        # Pattern pieces, in order:
        #   ^.{0,N}?          lazily skip up to N leading characters
        #                     (DOTALL, so newlines count as characters)
        #   <\|?channel\|?>   channel marker, each "|" optional
        #   (?:[^<]{0,40}<\|?message\|?>)?
        #                     optional message marker, allowing up to 40
        #                     non-"<" characters (e.g. "final") before it
        #   \s*               whitespace following the header
        self._pattern = re.compile(
            r"^.{0,%d}?<\|?channel\|?>(?:[^<]{0,40}<\|?message\|?>)?\s*"
            % self.max_prefix_chars,
            re.DOTALL,
        )

    def apply(self, resps: list[list[str]], docs: list[dict]) -> list[list[str]]:
        return [
            [
                self._pattern.sub("", resp if isinstance(resp, str) else "", count=1)
                for resp in resp_set
            ]
            for resp_set in resps
        ]
252
 
253
 
254
def process_results_f1(doc, results, *, gold_key="target"):
    """Return ``(pred, gold)`` for macro-F1 aggregation.

    ``pred`` is the first entry of ``results`` with surrounding whitespace
    and any think wrapper removed. A missing or empty generation yields
    ``""`` so the row counts as wrong rather than raising.

    ``gold`` is ``doc[gold_key]`` converted to a string and trimmed; a
    missing or ``None`` value becomes ``""``. Most tasks use
    ``gold_key="target"``; override for tasks that store the gold label
    under a different field name.
    """
    first = results[0] if results else None
    raw_response = first.strip() if first else ""
    pred = _strip_think_tags(raw_response)
    gold_value = doc.get(gold_key)
    # Coerce instead of calling .strip() directly: gold labels may be stored
    # as integers or be absent, which would otherwise raise AttributeError.
    gold = "" if gold_value is None else str(gold_value).strip()
    return {"f1_macro": (pred, gold)}
evals/french/classification/french_sib200.yaml CHANGED
@@ -12,6 +12,13 @@ generation_kwargs:
12
  process_docs: !function utils.process_sib200_docs
13
  doc_to_text: "Vous etes un systeme de classification de sujets.\nChoisissez la meilleure etiquette pour le texte suivant.\n\nEtiquettes autorisees: {{labels_str}}\n\nInstruction: Repondez avec UNE SEULE etiquette parmi les etiquettes autorisees. N'ecrivez rien d'autre.\n\nTexte:\n{{text}}\n\nEtiquette:"
14
  doc_to_target: "{{target}}"
 
 
 
 
 
 
 
15
  process_results: !function utils.process_results
16
  metric_list:
17
  - metric: f1_macro
 
12
  process_docs: !function utils.process_sib200_docs
13
  doc_to_text: "Vous etes un systeme de classification de sujets.\nChoisissez la meilleure etiquette pour le texte suivant.\n\nEtiquettes autorisees: {{labels_str}}\n\nInstruction: Repondez avec UNE SEULE etiquette parmi les etiquettes autorisees. N'ecrivez rien d'autre.\n\nTexte:\n{{text}}\n\nEtiquette:"
14
  doc_to_target: "{{target}}"
15
+ filter_list:
16
+ - name: "get_label"
17
+ filter:
18
+ - function: "strip_think_recover"
19
+ - function: "regex_label_set"
20
+ labels_field: "labels_str"
21
+ - function: "take_first"
22
  process_results: !function utils.process_results
23
  metric_list:
24
  - metric: f1_macro
evals/french/qa/french_qa.yaml CHANGED
@@ -2,6 +2,9 @@ group: french_qa
2
  task:
3
  - french_fquad
4
  aggregate_metric_list:
 
 
 
5
  - metric: f1
6
  aggregation: mean
7
  weight_by_size: true
 
2
  task:
3
  - french_fquad
4
  aggregate_metric_list:
5
+ - metric: exact_match
6
+ aggregation: mean
7
+ weight_by_size: true
8
  - metric: f1
9
  aggregation: mean
10
  weight_by_size: true
evals/hausa/classification/hausa_sib200.yaml CHANGED
@@ -12,6 +12,13 @@ generation_kwargs:
12
  process_docs: !function utils.process_sib200_docs
13
  doc_to_text: "You are a topic classification system.\nChoose the single best label for the following Hausa text.\n\nAllowed labels: {{labels_str}}\n\nInstruction: Reply with ONE label only from the allowed labels. Do not write anything else.\n\nText:\n{{text}}\n\nLabel:"
14
  doc_to_target: "{{target}}"
 
 
 
 
 
 
 
15
  process_results: !function utils.process_results
16
  metric_list:
17
  - metric: f1_macro
 
12
  process_docs: !function utils.process_sib200_docs
13
  doc_to_text: "You are a topic classification system.\nChoose the single best label for the following Hausa text.\n\nAllowed labels: {{labels_str}}\n\nInstruction: Reply with ONE label only from the allowed labels. Do not write anything else.\n\nText:\n{{text}}\n\nLabel:"
14
  doc_to_target: "{{target}}"
15
+ filter_list:
16
+ - name: "get_label"
17
+ filter:
18
+ - function: "strip_think_recover"
19
+ - function: "regex_label_set"
20
+ labels_field: "labels_str"
21
+ - function: "take_first"
22
  process_results: !function utils.process_results
23
  metric_list:
24
  - metric: f1_macro
evals/hausa/hausa.yaml CHANGED
@@ -5,6 +5,5 @@ task:
5
  - hausa_afrimgsm
6
  - hausa_nli
7
  - hausa_qa
8
- - hausa_sentiment
9
  metadata:
10
  version: 1.0
 
5
  - hausa_afrimgsm
6
  - hausa_nli
7
  - hausa_qa
 
8
  metadata:
9
  version: 1.0
evals/hausa/nli/hausa_afrixnli.yaml CHANGED
@@ -12,6 +12,13 @@ generation_kwargs:
12
  process_docs: !function utils.process_afrixnli_docs
13
  doc_to_text: "Premise: {{premise}}\nHypothesis: {{hypothesis}}\n\nDoes the hypothesis follow from the premise?\nAllowed answers: entailment, neutral, contradiction\n\nInstruction: Reply with ONE word only from the allowed answers.\n\nAnswer:"
14
  doc_to_target: "{{target}}"
 
 
 
 
 
 
 
15
  process_results: !function utils.process_results
16
  metric_list:
17
  - metric: f1_macro
 
12
  process_docs: !function utils.process_afrixnli_docs
13
  doc_to_text: "Premise: {{premise}}\nHypothesis: {{hypothesis}}\n\nDoes the hypothesis follow from the premise?\nAllowed answers: entailment, neutral, contradiction\n\nInstruction: Reply with ONE word only from the allowed answers.\n\nAnswer:"
14
  doc_to_target: "{{target}}"
15
+ filter_list:
16
+ - name: "get_label"
17
+ filter:
18
+ - function: "strip_think_recover"
19
+ - function: "regex_label_set"
20
+ labels_field: "labels_str"
21
+ - function: "take_first"
22
  process_results: !function utils.process_results
23
  metric_list:
24
  - metric: f1_macro
evals/hausa/nli/utils.py CHANGED
@@ -6,6 +6,7 @@ import datasets
6
  from f1_utils import macro_f1_agg, process_results_f1 # noqa: F401
7
 
8
  LABELS = ["entailment", "neutral", "contradiction"]
 
9
 
10
 
11
  def process_afrixnli_docs(dataset: datasets.Dataset) -> datasets.Dataset:
@@ -18,6 +19,7 @@ def process_afrixnli_docs(dataset: datasets.Dataset) -> datasets.Dataset:
18
  doc["target"] = LABELS[lbl]
19
  else:
20
  doc["target"] = str(lbl)
 
21
  return doc
22
  return dataset.map(_process)
23
 
 
6
  from f1_utils import macro_f1_agg, process_results_f1 # noqa: F401
7
 
8
  LABELS = ["entailment", "neutral", "contradiction"]
9
+ LABELS_STR = ", ".join(LABELS)
10
 
11
 
12
  def process_afrixnli_docs(dataset: datasets.Dataset) -> datasets.Dataset:
 
19
  doc["target"] = LABELS[lbl]
20
  else:
21
  doc["target"] = str(lbl)
22
+ doc["labels_str"] = LABELS_STR
23
  return doc
24
  return dataset.map(_process)
25
 
evals/hausa/qa/hausa_qa.yaml CHANGED
@@ -2,6 +2,9 @@ group: hausa_qa
2
  task:
3
  - hausa_afriqa
4
  aggregate_metric_list:
 
 
 
5
  - metric: f1
6
  aggregation: mean
7
  weight_by_size: true
 
2
  task:
3
  - hausa_afriqa
4
  aggregate_metric_list:
5
+ - metric: exact_match
6
+ aggregation: mean
7
+ weight_by_size: true
8
  - metric: f1
9
  aggregation: mean
10
  weight_by_size: true
evals/hausa/sentiment/utils.py DELETED
@@ -1,26 +0,0 @@
1
- """Sentiment utils."""
2
- import os as _os, sys as _sys # noqa: E401
3
- _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..", "..")))
4
-
5
- import datasets
6
- from f1_utils import macro_f1_agg, process_results_f1 # noqa: F401
7
-
8
- LABELS = ["positive", "negative", "neutral"]
9
-
10
-
11
- def process_naijasenti_docs(dataset: datasets.Dataset) -> datasets.Dataset:
12
- feat = dataset.features.get("label")
13
- def _process(doc):
14
- lbl = doc.get("label")
15
- if isinstance(lbl, int) and feat is not None and hasattr(feat, "names") and feat.names:
16
- doc["target"] = feat.names[lbl]
17
- elif isinstance(lbl, int) and 0 <= lbl < len(LABELS):
18
- doc["target"] = LABELS[lbl]
19
- else:
20
- doc["target"] = str(lbl).lower()
21
- return doc
22
- return dataset.map(_process)
23
-
24
-
25
- def process_results(doc, results):
26
- return process_results_f1(doc, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/igbo/afrimgsm/igbo_afrimgsm.yaml DELETED
@@ -1,28 +0,0 @@
1
- task: igbo_afrimgsm
2
- task_alias: afrimgsm
3
- dataset_path: masakhane/afrimgsm
4
- dataset_name: ibo
5
- test_split: test
6
- output_type: generate_until
7
- generation_kwargs:
8
- do_sample: false
9
- max_gen_toks: 8192
10
- until:
11
- - "<|endoftext|>"
12
- doc_to_text: "Question: {{question}}\nAnswer:"
13
- doc_to_target: "{{answer_number|string}}"
14
- filter_list:
15
- - name: "get_answer"
16
- filter:
17
- - function: "regex"
18
- regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
19
- group_select: -1
20
- - function: "take_first"
21
- metric_list:
22
- - metric: exact_match
23
- aggregation: mean
24
- higher_is_better: true
25
- ignore_case: true
26
- ignore_punctuation: true
27
- metadata:
28
- version: 1.0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/igbo/igbo.yaml DELETED
@@ -1,10 +0,0 @@
1
- group: igbo
2
- task:
3
- - igbo_classification
4
- - igbo_mcq
5
- - igbo_afrimgsm
6
- - igbo_nli
7
- - igbo_qa
8
- - igbo_sentiment
9
- metadata:
10
- version: 1.0
 
 
 
 
 
 
 
 
 
 
 
evals/igbo/mcq/igbo_afrimmlu.yaml DELETED
@@ -1,9 +0,0 @@
1
- task: igbo_afrimmlu
2
- task_alias: afrimmlu
3
- dataset_path: masakhane/afrimmlu
4
- dataset_name: ibo
5
- test_split: test
6
- include: _default_mcq_yaml
7
- process_docs: !function utils.process_afrimmlu_docs
8
- doc_to_text: "You are a highly knowledgeable AI that answers multiple-choice questions about '{{subject_field}}'.\n\nQuestion:\n{{question}}\n\nChoices:\nA: {{choice_a}}\nB: {{choice_b}}\nC: {{choice_c}}\nD: {{choice_d}}\n\nInstruction: Reply with EXACTLY one letter: A, B, C, or D. No other text.\n\nAnswer:"
9
- doc_to_target: "{{gold_letter}}"
 
 
 
 
 
 
 
 
 
 
evals/igbo/nli/utils.py DELETED
@@ -1,26 +0,0 @@
1
- """NLI utils."""
2
- import os as _os, sys as _sys # noqa: E401
3
- _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..", "..")))
4
-
5
- import datasets
6
- from f1_utils import macro_f1_agg, process_results_f1 # noqa: F401
7
-
8
- LABELS = ["entailment", "neutral", "contradiction"]
9
-
10
-
11
- def process_afrixnli_docs(dataset: datasets.Dataset) -> datasets.Dataset:
12
- feat = dataset.features.get("label")
13
- def _process(doc):
14
- lbl = doc.get("label")
15
- if isinstance(lbl, int) and feat is not None and hasattr(feat, "names") and feat.names:
16
- doc["target"] = feat.names[lbl]
17
- elif isinstance(lbl, int) and 0 <= lbl < len(LABELS):
18
- doc["target"] = LABELS[lbl]
19
- else:
20
- doc["target"] = str(lbl)
21
- return doc
22
- return dataset.map(_process)
23
-
24
-
25
- def process_results(doc, results):
26
- return process_results_f1(doc, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/igbo/qa/utils.py DELETED
@@ -1,61 +0,0 @@
1
- """QA utils."""
2
- import re
3
- import string
4
- import unicodedata
5
- import json
6
-
7
- import datasets
8
-
9
-
10
- def _get_gold_answers(doc):
11
- """AfriQA stores answers in a nested structure that may be a dict or stringified list."""
12
- ap = doc.get("answer_pivot") or doc.get("answers") or {}
13
- if isinstance(ap, str):
14
- try:
15
- ap = json.loads(ap)
16
- except Exception:
17
- return [ap]
18
- if isinstance(ap, dict):
19
- a = ap.get("answers") or ap.get("text") or []
20
- if isinstance(a, str):
21
- try:
22
- a = json.loads(a)
23
- except Exception:
24
- return [a]
25
- return a if isinstance(a, list) else [str(a)]
26
- if isinstance(ap, list):
27
- return [str(x) for x in ap]
28
- return [str(ap)]
29
-
30
-
31
- def _normalize(s: str) -> str:
32
- s = unicodedata.normalize("NFKC", s).lower()
33
- s = "".join(c for c in s if c not in string.punctuation)
34
- s = " ".join(s.split())
35
- return s
36
-
37
-
38
- def _f1(pred: str, gold: str) -> float:
39
- pred_toks = _normalize(pred).split()
40
- gold_toks = _normalize(gold).split()
41
- if not pred_toks or not gold_toks:
42
- return float(pred_toks == gold_toks)
43
- common = set(pred_toks) & set(gold_toks)
44
- num_same = sum(min(pred_toks.count(t), gold_toks.count(t)) for t in common)
45
- if num_same == 0:
46
- return 0.0
47
- p = num_same / len(pred_toks)
48
- r = num_same / len(gold_toks)
49
- return 2 * p * r / (p + r)
50
-
51
-
52
- def process_results_qa(doc, results):
53
- pred = results[0].strip() if results[0] else ""
54
- if "</think>" in pred:
55
- pred = pred.split("</think>")[-1].strip()
56
- golds = _get_gold_answers(doc)
57
- if not golds:
58
- return {"exact_match": 0.0, "f1": 0.0}
59
- em = max(1.0 if _normalize(pred) == _normalize(g) else 0.0 for g in golds)
60
- f1 = max(_f1(pred, g) for g in golds)
61
- return {"exact_match": em, "f1": f1}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/igbo/sentiment/igbo_sentiment.yaml DELETED
@@ -1,9 +0,0 @@
1
- group: igbo_sentiment
2
- task:
3
- - igbo_naijasenti
4
- aggregate_metric_list:
5
- - metric: f1_macro
6
- aggregation: mean
7
- weight_by_size: true
8
- metadata:
9
- version: 1.0
 
 
 
 
 
 
 
 
 
 
evals/igbo/sentiment/utils.py DELETED
@@ -1,26 +0,0 @@
1
- """Sentiment utils."""
2
- import os as _os, sys as _sys # noqa: E401
3
- _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..", "..")))
4
-
5
- import datasets
6
- from f1_utils import macro_f1_agg, process_results_f1 # noqa: F401
7
-
8
- LABELS = ["positive", "negative", "neutral"]
9
-
10
-
11
- def process_naijasenti_docs(dataset: datasets.Dataset) -> datasets.Dataset:
12
- feat = dataset.features.get("label")
13
- def _process(doc):
14
- lbl = doc.get("label")
15
- if isinstance(lbl, int) and feat is not None and hasattr(feat, "names") and feat.names:
16
- doc["target"] = feat.names[lbl]
17
- elif isinstance(lbl, int) and 0 <= lbl < len(LABELS):
18
- doc["target"] = LABELS[lbl]
19
- else:
20
- doc["target"] = str(lbl).lower()
21
- return doc
22
- return dataset.map(_process)
23
-
24
-
25
- def process_results(doc, results):
26
- return process_results_f1(doc, results)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
evals/portuguese/README.md ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Portuguese – lm-eval Tasks
2
+
3
+ Portuguese (PT-BR) evaluation suite for the `lm-evaluation-harness` framework.
4
+
5
+ ## Overview
6
+
7
+ | # | Task Name | Category | Dataset (HuggingFace) | Metric |
8
+ | --- | ------------------------ | -------------- | ----------------------------------------------- | ----------- |
9
+ | 1 | `portuguese_enem` | MCQ | `eduagarcia/enem_challenge` | exact_match |
10
+ | 2 | `portuguese_bluex` | MCQ | `eduagarcia-temp/BLUEX_without_images` | exact_match |
11
+ | 3 | `portuguese_oab_exams` | MCQ | `eduagarcia/oab_exams` | exact_match |
12
+ | 4 | `portuguese_hatebr` | Classification | `eduagarcia/portuguese_benchmark` (HateBR) | f1_macro |
13
+ | 5 | `portuguese_hate_speech` | Classification | `eduagarcia/portuguese_benchmark` (Hate Speech) | f1_macro |
14
+ | 6 | `portuguese_tweetsentbr` | Classification | `eduagarcia/tweetsentbr_fewshot` | f1_macro |
15
+ | 7 | `portuguese_assin2_rte` | NLI | `assin2` | f1_macro |
16
+ | 8 | `portuguese_faquad_nli` | NLI | `ruanchaves/faquad-nli` | f1_macro |
17
+ | 9 | `portuguese_assin2_sts` | NLI | `assin2` | pearson |
18
+
19
+ ### Subgroups
20
+
21
+ | Group | Tasks |
22
+ | --------------------------- | ---------------------------------- |
23
+ | `portuguese_mcq` | enem, bluex, oab_exams |
24
+ | `portuguese_classification` | hatebr, hate_speech, tweetsentbr |
25
+ | `portuguese_nli` | assin2_rte, faquad_nli, assin2_sts |
26
+
27
+ ## Setup
28
+
29
+ ```bash
30
+ pip install lm-eval
31
+ ```
32
+
33
+ ## Running Tasks
34
+
35
+ First, `cd` into the `lm_eval_tasks` directory and set the include path:
36
+
37
+ ```bash
38
+ cd functionary_internal/evaluation/multilingual_bench/lm_eval_tasks
39
+ export INCLUDE_PATH="$(pwd)"
40
+ ```
41
+
42
+ ### Run the Entire Portuguese Suite (all 9 tasks)
43
+
44
+ ```bash
45
+ OPENAI_API_KEY="your-key" \
46
+ lm_eval \
47
+ --include_path $INCLUDE_PATH \
48
+ --tasks portuguese \
49
+ --model local-chat-completions \
50
+ --model_args model=your-model,base_url=https://openrouter.ai/api/v1/chat/completions,num_concurrent=5 \
51
+ --apply_chat_template \
52
+ --num_fewshot 0 \
53
+ --log_samples \
54
+ --output_path output/portuguese_results \
55
+ --gen_kwargs '{"temperature":0.6,"top_p":0.95,"provider":{"order":["alibaba"]}}'
56
+ ```
57
+
58
+ ### Run a Single Category
59
+
60
+ ```bash
61
+ # Multiple-choice exams (ENEM + BLUEX + OAB)
62
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_mcq ...
63
+
64
+ # Classification (hate speech, sentiment)
65
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_classification ...
66
+
67
+ # Natural Language Inference (ASSIN2 RTE + FaQuAD NLI + ASSIN2 STS)
68
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_nli ...
69
+ ```
70
+
71
+ ### Run a Single Task
72
+
73
+ ```bash
74
+ # ENEM exam
75
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_enem ...
76
+
77
+ # BLUEX vestibular
78
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_bluex ...
79
+
80
+ # OAB bar exam
81
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_oab_exams ...
82
+
83
+ # Offensive-language detection (HateBR)
84
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_hatebr ...
85
+
86
+ # Sentiment analysis
87
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_tweetsentbr ...
88
+
89
+ # Textual entailment
90
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_assin2_rte ...
91
+ ```
92
+
93
+ ### Run with a Local HuggingFace Model
94
+
95
+ ```bash
96
+ lm_eval \
97
+ --include_path $INCLUDE_PATH \
98
+ --tasks portuguese \
99
+ --model hf \
100
+ --model_args pretrained=your-org/your-model \
101
+ --num_fewshot 0 \
102
+ --log_samples \
103
+ --output_path output/portuguese_results
104
+ ```
105
+
106
+ ### Mix and Match
107
+
108
+ ```bash
109
+ # Run ENEM + ASSIN2 RTE only
110
+ lm_eval --include_path $INCLUDE_PATH --tasks portuguese_enem,portuguese_assin2_rte ...
111
+ ```
112
+
113
+ ## Output
114
+
115
+ With `--log_samples`, the output directory contains:
116
+
117
+ - `results.json` – aggregate scores per task
118
+ - `samples_<task_name>.jsonl` – per-example model outputs for debugging
119
+
120
+ ## Dataset Sources
121
+
122
+ | Dataset | Source | Config | Fields |
123
+ | ---------------------- | -------------------------------------- | ------------------------------- | ----------------------------------------------------------- |
124
+ | ENEM | `eduagarcia/enem_challenge` | — | question, choices, answerKey |
125
+ | BLUEX | `eduagarcia-temp/BLUEX_without_images` | — | question, choices, answerKey |
126
+ | OAB Exams | `eduagarcia/oab_exams` | — | question, choices, answerKey |
127
+ | HateBR | `eduagarcia/portuguese_benchmark` | `HateBR_offensive_binary` | sentence, label |
128
+ | Portuguese Hate Speech | `eduagarcia/portuguese_benchmark` | `Portuguese_Hate_Speech_binary` | sentence, label |
129
+ | TweetSentBR | `eduagarcia/tweetsentbr_fewshot` | — | sentence, label |
130
+ | ASSIN2 | `assin2` | — | premise, hypothesis, entailment_judgment, relatedness_score |
131
+ | FaQuAD-NLI | `ruanchaves/faquad-nli` | — | question, answer, label |
evals/{igbo/nli/igbo_afrixnli.yaml → portuguese/classification/_default_classification_yaml} RENAMED
@@ -1,17 +1,10 @@
1
- task: igbo_afrixnli
2
- task_alias: afrixnli
3
- dataset_path: masakhane/afrixnli
4
- dataset_name: ibo
5
- test_split: test
6
  output_type: generate_until
7
  generation_kwargs:
8
  do_sample: false
9
  max_gen_toks: 8192
10
  until:
11
  - "<|endoftext|>"
12
- process_docs: !function utils.process_afrixnli_docs
13
- doc_to_text: "Premise: {{premise}}\nHypothesis: {{hypothesis}}\n\nDoes the hypothesis follow from the premise?\nAllowed answers: entailment, neutral, contradiction\n\nInstruction: Reply with ONE word only from the allowed answers.\n\nAnswer:"
14
- doc_to_target: "{{target}}"
15
  process_results: !function utils.process_results
16
  metric_list:
17
  - metric: f1_macro
 
1
+ # Shared config for Portuguese classification tasks (generative).
 
 
 
 
2
  output_type: generate_until
3
  generation_kwargs:
4
  do_sample: false
5
  max_gen_toks: 8192
6
  until:
7
  - "<|endoftext|>"
 
 
 
8
  process_results: !function utils.process_results
9
  metric_list:
10
  - metric: f1_macro
evals/portuguese/classification/portuguese_classification.yaml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Classification subgroup (hate speech, sentiment)
2
+ group: portuguese_classification
3
+ task:
4
+ - portuguese_hatebr
5
+ - portuguese_hate_speech
6
+ - portuguese_tweetsentbr
7
+ aggregate_metric_list:
8
+ - metric: f1_macro
9
+ aggregation: mean
10
+ weight_by_size: true
11
+ metadata:
12
+ version: 1.0
evals/portuguese/classification/portuguese_hate_speech.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: portuguese_hate_speech
2
+ task_alias: portuguese_hate_speech
3
+ dataset_path: eduagarcia/portuguese_benchmark
4
+ dataset_name: Portuguese_Hate_Speech_binary
5
+ test_split: test
6
+ include: _default_classification_yaml
7
+ process_docs: !function utils.process_binary_docs
8
+ doc_to_text: "Classifique se o texto a seguir contém discurso de ódio ou não. Responda apenas com \"Sim\" ou \"Não\".\n\nTexto: {{sentence}}\nPergunta: O texto contém discurso de ódio?\nResposta:"
9
+ doc_to_target: "{{target}}"
10
+ filter_list:
11
+ - name: "get_label"
12
+ filter:
13
+ - function: "strip_think_recover"
14
+ - function: "regex_last"
15
+ regex_pattern: "(Não|Sim)"
16
+ group_select: -1
17
+ - function: "take_first"
evals/portuguese/classification/portuguese_hatebr.yaml ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: portuguese_hatebr
2
+ task_alias: hatebr_offensive
3
+ dataset_path: eduagarcia/portuguese_benchmark
4
+ dataset_name: HateBR_offensive_binary
5
+ test_split: test
6
+ include: _default_classification_yaml
7
+ process_docs: !function utils.process_binary_docs
8
+ doc_to_text: "Classifique se o texto a seguir é ofensivo ou não. Responda apenas com \"Sim\" ou \"Não\".\n\nTexto: {{sentence}}\nPergunta: O texto é ofensivo?\nResposta:"
9
+ doc_to_target: "{{target}}"
10
+ filter_list:
11
+ - name: "get_label"
12
+ filter:
13
+ - function: "strip_think_recover"
14
+ - function: "regex_last"
15
+ regex_pattern: "(Não|Sim)"
16
+ group_select: -1
17
+ - function: "take_first"
evals/portuguese/classification/portuguese_tweetsentbr.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ task: portuguese_tweetsentbr
2
+ task_alias: tweetsentbr
3
+ dataset_path: eduagarcia/tweetsentbr_fewshot
4
+ test_split: test
5
+ include: _default_classification_yaml
6
+ process_docs: !function utils.process_sentiment_docs
7
+ doc_to_text: "Classifique o sentimento do texto a seguir. Responda apenas com \"Positivo\", \"Neutro\" ou \"Negativo\".\n\nTexto: {{sentence}}\nPergunta: O sentimento do texto é Positivo, Neutro ou Negativo?\nResposta:"
8
+ doc_to_target: "{{target}}"
9
+ filter_list:
10
+ - name: "get_label"
11
+ filter:
12
+ - function: "strip_think_recover"
13
+ - function: "regex_last"
14
+ regex_pattern: "(Negativo|Neutro|Positivo)"
15
+ group_select: -1
16
+ - function: "take_first"
evals/portuguese/classification/utils.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility helpers for Portuguese classification tasks (generative mode).
2
+
3
+ Each process_docs function adds a ``target`` field with the expected
4
+ Portuguese label. process_results + macro_f1_agg compute macro-averaged F1.
5
+ """
6
+
7
+ import os as _os, sys as _sys # noqa: E401
8
+ _sys.path.insert(0, _os.path.normpath(_os.path.join(_os.path.dirname(__file__), "..","..",)))
9
+
10
+
11
+ from f1_utils import macro_f1_agg, process_results_f1 # noqa: F401
12
+
13
+
14
+ # ── Label mappings (English → Portuguese) ────────────────────
15
+
16
+ EMOTION_LABEL_MAP = {
17
+ "Admiration": "Admiração",
18
+ "Amusement": "Diversão",
19
+ "Anger": "Raiva",
20
+ "Annoyance": "Aborrecimento",
21
+ "Approval": "Aprovação",
22
+ "Compassion": "Compaixão",
23
+ "Confusion": "Confusão",
24
+ "Curiosity": "Curiosidade",
25
+ "Desire": "Desejo",
26
+ "Disappointment": "Decepção",
27
+ "Disapproval": "Desaprovação",
28
+ "Disgust": "Nojo",
29
+ "Embarrassment": "Vergonha",
30
+ "Envy": "Inveja",
31
+ "Excitement": "Entusiasmo",
32
+ "Fear": "Medo",
33
+ "Gratitude": "Gratidão",
34
+ "Grief": "Luto",
35
+ "Joy": "Alegria",
36
+ "Longing": "Saudade",
37
+ "Love": "Amor",
38
+ "Nervousness": "Nervosismo",
39
+ "Optimism": "Otimismo",
40
+ "Pride": "Orgulho",
41
+ "Relief": "Alívio",
42
+ "Remorse": "Remorso",
43
+ "Sadness": "Tristeza",
44
+ "Surprise": "Surpresa",
45
+ }
46
+
47
+ SENTIMENT_LABEL_MAP = {
48
+ "Positive": "Positivo",
49
+ "Negative": "Negativo",
50
+ "Neutral": "Neutro",
51
+ }
52
+
53
+
54
+ # ── Document pre-processing ─────────────────────────────────────────
55
+
56
+
57
+ def process_binary_docs(dataset):
58
+ """Map 0/1 label → Não/Sim (for hatebr, portuguese_hate_speech)."""
59
+
60
+ def _map(doc):
61
+ doc["target"] = "Sim" if doc["label"] == 1 else "Não"
62
+ return doc
63
+
64
+ return dataset.map(_map)
65
+
66
+
67
+ def process_sentiment_docs(dataset):
68
+ """Map Positive/Negative/Neutral → Positivo/Negativo/Neutro (tweetsentbr)."""
69
+
70
+ def _map(doc):
71
+ doc["target"] = SENTIMENT_LABEL_MAP.get(doc["label"], "Neutro")
72
+ return doc
73
+
74
+ return dataset.map(_map)
75
+
76
+
77
+ def process_sparrow_sentiment_docs(dataset):
78
+ """Map sparrow sentiment labels → Portuguese."""
79
+
80
+ def _map(doc):
81
+ doc["target"] = SENTIMENT_LABEL_MAP.get(doc["label"], "Neutro")
82
+ return doc
83
+
84
+ return dataset.map(_map)
85
+
86
+
87
+ def process_sparrow_emotion_docs(dataset):
88
+ """Map English emotion labels → Portuguese."""
89
+
90
+ def _map(doc):
91
+ doc["target"] = EMOTION_LABEL_MAP.get(doc["label"], doc["label"])
92
+ return doc
93
+
94
+ return dataset.map(_map)
95
+
96
+
97
+ def process_sparrow_hate_docs(dataset):
98
+ """Map Hate/NotHate → Sim/Não."""
99
+
100
+ def _map(doc):
101
+ doc["target"] = "Sim" if doc["label"] == "Hate" else "Não"
102
+ return doc
103
+
104
+ return dataset.map(_map)
105
+
106
+
107
+ # ── Result processing ────────────────────────────────────────────────
108
+
109
+
110
+ def process_results(doc, results):
111
+ """Return (pred, gold) tuple for macro-F1 aggregation."""
112
+ return process_results_f1(doc, results)
evals/{swahili/afrimgsm/swahili_afrimgsm.yaml → portuguese/mcq/_default_mcq_yaml} RENAMED
@@ -1,28 +1,21 @@
1
- task: swahili_afrimgsm
2
- task_alias: afrimgsm
3
- dataset_path: masakhane/afrimgsm
4
- dataset_name: swa
5
- test_split: test
6
  output_type: generate_until
7
  generation_kwargs:
8
  do_sample: false
9
  max_gen_toks: 8192
10
  until:
11
  - "<|endoftext|>"
12
- doc_to_text: "Swali: {{question}}\nJibu:"
13
- doc_to_target: "{{answer_number|string}}"
14
  filter_list:
15
  - name: "get_answer"
16
  filter:
 
17
  - function: "regex"
18
- regex_pattern: "(-?[$0-9.,]{2,})|(-?[0-9]+)"
19
- group_select: -1
20
  - function: "take_first"
21
  metric_list:
22
  - metric: exact_match
23
  aggregation: mean
24
  higher_is_better: true
25
- ignore_case: true
26
- ignore_punctuation: true
27
  metadata:
28
  version: 1.0
 
1
+ # Shared config for Portuguese MCQ tasks (generative A/B/C/D/E).
 
 
 
 
2
  output_type: generate_until
3
  generation_kwargs:
4
  do_sample: false
5
  max_gen_toks: 8192
6
  until:
7
  - "<|endoftext|>"
 
 
8
  filter_list:
9
  - name: "get_answer"
10
  filter:
11
+ - function: "strip_think_recover"
12
  - function: "regex"
13
+ regex_pattern: "([ABCDE])"
14
+ group_select: 0
15
  - function: "take_first"
16
  metric_list:
17
  - metric: exact_match
18
  aggregation: mean
19
  higher_is_better: true
 
 
20
  metadata:
21
  version: 1.0