Update app.py
Browse files
app.py
CHANGED
|
@@ -164,7 +164,7 @@ SUMMARISATION = Task(name="summarisation", metric="bertscore")
|
|
| 164 |
KNOWLEDGE = Task(name="knowledge", metric="mcc")
|
| 165 |
REASONING = Task(name="reasoning", metric="mcc")
|
| 166 |
GRAMMAR = Task(name="grammar", metric="mcc")
|
| 167 |
-
|
| 168 |
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
|
| 169 |
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc")
|
| 170 |
ALL_TASKS = [obj for obj in globals().values() if isinstance(obj, Task)]
|
|
@@ -203,13 +203,13 @@ DATASETS = [
|
|
| 203 |
Dataset(name="scala-de", language=GERMAN, task=GRAMMAR),
|
| 204 |
Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR),
|
| 205 |
Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR),
|
| 206 |
-
Dataset(name="scandiqa-da", language=DANISH, task=
|
| 207 |
-
Dataset(name="norquad", language=NORWEGIAN, task=
|
| 208 |
-
Dataset(name="scandiqa-sv", language=SWEDISH, task=
|
| 209 |
-
Dataset(name="nqii", language=ICELANDIC, task=
|
| 210 |
-
Dataset(name="germanquad", language=GERMAN, task=
|
| 211 |
-
Dataset(name="squad", language=ENGLISH, task=
|
| 212 |
-
Dataset(name="squad-nl", language=DUTCH, task=
|
| 213 |
Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION),
|
| 214 |
Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION),
|
| 215 |
Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION),
|
|
@@ -671,11 +671,6 @@ def fetch_results() -> dict[Language, pd.DataFrame]:
|
|
| 671 |
for record in records:
|
| 672 |
model_name = record["model"]
|
| 673 |
|
| 674 |
-
# Manual fix for OpenAI models: Only keep the validation split results
|
| 675 |
-
if "gpt-3.5" in model_name or "gpt-4" in model_name:
|
| 676 |
-
if not record.get("validation_split", False):
|
| 677 |
-
continue
|
| 678 |
-
|
| 679 |
dataset_name = record["dataset"]
|
| 680 |
if dataset_name in possible_dataset_names:
|
| 681 |
dataset = next(
|
|
|
|
| 164 |
KNOWLEDGE = Task(name="knowledge", metric="mcc")
|
| 165 |
REASONING = Task(name="reasoning", metric="mcc")
|
| 166 |
GRAMMAR = Task(name="grammar", metric="mcc")
|
| 167 |
+
READING_COMPREHENSION = Task(name="reading comprehension", metric="em")
|
| 168 |
TEXT_CLASSIFICATION = Task(name="text classification", metric="mcc")
|
| 169 |
INFORMATION_EXTRACTION = Task(name="information extraction", metric="micro_f1_no_misc")
|
| 170 |
ALL_TASKS = [obj for obj in globals().values() if isinstance(obj, Task)]
|
|
|
|
| 203 |
Dataset(name="scala-de", language=GERMAN, task=GRAMMAR),
|
| 204 |
Dataset(name="scala-nl", language=DUTCH, task=GRAMMAR),
|
| 205 |
Dataset(name="scala-en", language=ENGLISH, task=GRAMMAR),
|
| 206 |
+
Dataset(name="scandiqa-da", language=DANISH, task=READING_COMPREHENSION),
|
| 207 |
+
Dataset(name="norquad", language=NORWEGIAN, task=READING_COMPREHENSION),
|
| 208 |
+
Dataset(name="scandiqa-sv", language=SWEDISH, task=READING_COMPREHENSION),
|
| 209 |
+
Dataset(name="nqii", language=ICELANDIC, task=READING_COMPREHENSION),
|
| 210 |
+
Dataset(name="germanquad", language=GERMAN, task=READING_COMPREHENSION),
|
| 211 |
+
Dataset(name="squad", language=ENGLISH, task=READING_COMPREHENSION),
|
| 212 |
+
Dataset(name="squad-nl", language=DUTCH, task=READING_COMPREHENSION),
|
| 213 |
Dataset(name="nordjylland-news", language=DANISH, task=SUMMARISATION),
|
| 214 |
Dataset(name="mlsum", language=GERMAN, task=SUMMARISATION),
|
| 215 |
Dataset(name="rrn", language=ICELANDIC, task=SUMMARISATION),
|
|
|
|
| 671 |
for record in records:
|
| 672 |
model_name = record["model"]
|
| 673 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 674 |
dataset_name = record["dataset"]
|
| 675 |
if dataset_name in possible_dataset_names:
|
| 676 |
dataset = next(
|