Spaces:
Running
Running
Commit
·
678bdbb
1
Parent(s):
eea50e2
Updating metrics
Browse files
src/components/filters.py
CHANGED
|
@@ -20,6 +20,10 @@ def initialize_session_state(df):
|
|
| 20 |
if 'selected_tasks' not in st.session_state:
|
| 21 |
# Select all tasks by default, excluding Model Type
|
| 22 |
st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
if 'selected_model_types' not in st.session_state:
|
| 25 |
# Ensure all model types are selected by default
|
|
|
|
| 20 |
if 'selected_tasks' not in st.session_state:
|
| 21 |
# Select all tasks by default, excluding Model Type
|
| 22 |
st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
|
| 23 |
+
else:
|
| 24 |
+
# Make sure selected_tasks only includes actual tasks from the dataframe
|
| 25 |
+
valid_tasks = [col for col in df.columns if col not in ['Model Type']]
|
| 26 |
+
st.session_state.selected_tasks = [task for task in st.session_state.selected_tasks if task in valid_tasks]
|
| 27 |
|
| 28 |
if 'selected_model_types' not in st.session_state:
|
| 29 |
# Ensure all model types are selected by default
|
src/data/metrics/absolute_improvement_to_baseline.json
CHANGED
|
@@ -6,7 +6,8 @@
|
|
| 6 |
"MLAB (o3-mini)": 0.9,
|
| 7 |
"MLAB (gpt-4o)": 0.9,
|
| 8 |
"MLAB (llama3-1-405b-instruct)": 1.5,
|
| 9 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
|
|
|
|
| 10 |
},
|
| 11 |
"llm-merging": {
|
| 12 |
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
|
|
@@ -15,60 +16,57 @@
|
|
| 15 |
"MLAB (gemini-exp-1206)": 3.4,
|
| 16 |
"MLAB (o3-mini)": -0.7,
|
| 17 |
"MLAB (gpt-4o)": 1.4,
|
| 18 |
-
"MLAB (llama3-1-405b-instruct)": -0.7
|
| 19 |
-
|
| 20 |
-
"meta-learning": {
|
| 21 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
|
| 22 |
-
"Top Human in Competition": 304.5,
|
| 23 |
-
"MLAB (claude-3-5-sonnet-v2)": 5.4,
|
| 24 |
-
"MLAB (gemini-exp-1206)": 5.4,
|
| 25 |
-
"MLAB (o3-mini)": -14.9,
|
| 26 |
-
"MLAB (gpt-4o)": 5.4,
|
| 27 |
-
"MLAB (llama3-1-405b-instruct)": 5.4
|
| 28 |
},
|
| 29 |
"product-recommendation": {
|
| 30 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
|
| 31 |
-
"Top Human in Competition": 412.6,
|
| 32 |
"MLAB (claude-3-5-sonnet-v2)": 12.3,
|
|
|
|
| 33 |
"MLAB (gemini-exp-1206)": 0.6,
|
| 34 |
"MLAB (o3-mini)": 0.6,
|
| 35 |
"MLAB (gpt-4o)": 2.6,
|
| 36 |
-
"MLAB (llama3-1-405b-instruct)": -0.0
|
|
|
|
|
|
|
| 37 |
},
|
| 38 |
"weather_forcast": {
|
| 39 |
"CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
|
| 40 |
-
"Top Human in Competition":
|
|
|
|
| 41 |
"MLAB (claude-3-5-sonnet-v2)": 31.0,
|
| 42 |
"MLAB (gemini-exp-1206)": 91.4,
|
| 43 |
"MLAB (o3-mini)": 53.3,
|
| 44 |
"MLAB (gpt-4o)": 100.8,
|
| 45 |
"MLAB (llama3-1-405b-instruct)": 66.7
|
| 46 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
"machine_unlearning": {
|
| 48 |
-
"
|
| 49 |
"Top Human in Competition": 61.9,
|
|
|
|
| 50 |
"MLAB (claude-3-5-sonnet-v2)": -58.6,
|
| 51 |
"MLAB (gemini-exp-1206)": 3.5,
|
| 52 |
"MLAB (o3-mini)": 2.2,
|
| 53 |
"MLAB (gpt-4o)": -11.1,
|
| 54 |
"MLAB (llama3-1-405b-instruct)": 3.8
|
| 55 |
},
|
| 56 |
-
"erasing_invisible_watermarks": {
|
| 57 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 80.3,
|
| 58 |
-
"Top Human in Competition": 95.6,
|
| 59 |
-
"MLAB (claude-3-5-sonnet-v2)": 83.7,
|
| 60 |
-
"MLAB (gemini-exp-1206)": 93.3,
|
| 61 |
-
"MLAB (o3-mini)": 79.8,
|
| 62 |
-
"MLAB (gpt-4o)": 79.8,
|
| 63 |
-
"MLAB (llama3-1-405b-instruct)": 79.8
|
| 64 |
-
},
|
| 65 |
"backdoor-trigger-recovery": {
|
| 66 |
-
"
|
| 67 |
"Top Human in Competition": 621.3,
|
|
|
|
| 68 |
"MLAB (claude-3-5-sonnet-v2)": 247.9,
|
| 69 |
"MLAB (gemini-exp-1206)": 80.4,
|
| 70 |
"MLAB (o3-mini)": 38.8,
|
| 71 |
-
"MLAB (
|
| 72 |
-
"MLAB (
|
| 73 |
}
|
| 74 |
}
|
|
|
|
| 6 |
"MLAB (o3-mini)": 0.9,
|
| 7 |
"MLAB (gpt-4o)": 0.9,
|
| 8 |
"MLAB (llama3-1-405b-instruct)": 1.5,
|
| 9 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.0,
|
| 10 |
+
"Human Idea + MLAB (gpt-4o)": 1.5
|
| 11 |
},
|
| 12 |
"llm-merging": {
|
| 13 |
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
|
|
|
|
| 16 |
"MLAB (gemini-exp-1206)": 3.4,
|
| 17 |
"MLAB (o3-mini)": -0.7,
|
| 18 |
"MLAB (gpt-4o)": 1.4,
|
| 19 |
+
"MLAB (llama3-1-405b-instruct)": -0.7,
|
| 20 |
+
"Human Idea + MLAB (gpt-4o)": -0.7
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
},
|
| 22 |
"product-recommendation": {
|
|
|
|
|
|
|
| 23 |
"MLAB (claude-3-5-sonnet-v2)": 12.3,
|
| 24 |
+
"Top Human in Competition": 412.6,
|
| 25 |
"MLAB (gemini-exp-1206)": 0.6,
|
| 26 |
"MLAB (o3-mini)": 0.6,
|
| 27 |
"MLAB (gpt-4o)": 2.6,
|
| 28 |
+
"MLAB (llama3-1-405b-instruct)": -0.0,
|
| 29 |
+
"Human Idea + MLAB (gpt-4o)": 8.9,
|
| 30 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.6
|
| 31 |
},
|
| 32 |
"weather_forcast": {
|
| 33 |
"CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
|
| 34 |
+
"Top Human in Competition": 212.0,
|
| 35 |
+
"Human Idea + MLAB (gpt-4o)": 26.1,
|
| 36 |
"MLAB (claude-3-5-sonnet-v2)": 31.0,
|
| 37 |
"MLAB (gemini-exp-1206)": 91.4,
|
| 38 |
"MLAB (o3-mini)": 53.3,
|
| 39 |
"MLAB (gpt-4o)": 100.8,
|
| 40 |
"MLAB (llama3-1-405b-instruct)": 66.7
|
| 41 |
},
|
| 42 |
+
"meta-learning": {
|
| 43 |
+
"MLAB (claude-3-5-sonnet-v2)": -14.9,
|
| 44 |
+
"Top Human in Competition": 304.5,
|
| 45 |
+
"MLAB (gemini-exp-1206)": -3.2,
|
| 46 |
+
"MLAB (o3-mini)": -14.9,
|
| 47 |
+
"MLAB (gpt-4o)": -14.9,
|
| 48 |
+
"MLAB (llama3-1-405b-instruct)": -14.9,
|
| 49 |
+
"Human Idea + MLAB (gpt-4o)": -14.9,
|
| 50 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -14.9
|
| 51 |
+
},
|
| 52 |
"machine_unlearning": {
|
| 53 |
+
"Human Idea + MLAB (gpt-4o)": 4.2,
|
| 54 |
"Top Human in Competition": 61.9,
|
| 55 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 7.3,
|
| 56 |
"MLAB (claude-3-5-sonnet-v2)": -58.6,
|
| 57 |
"MLAB (gemini-exp-1206)": 3.5,
|
| 58 |
"MLAB (o3-mini)": 2.2,
|
| 59 |
"MLAB (gpt-4o)": -11.1,
|
| 60 |
"MLAB (llama3-1-405b-instruct)": 3.8
|
| 61 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
"backdoor-trigger-recovery": {
|
| 63 |
+
"MLAB (gpt-4o)": 74.0,
|
| 64 |
"Top Human in Competition": 621.3,
|
| 65 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
|
| 66 |
"MLAB (claude-3-5-sonnet-v2)": 247.9,
|
| 67 |
"MLAB (gemini-exp-1206)": 80.4,
|
| 68 |
"MLAB (o3-mini)": 38.8,
|
| 69 |
+
"MLAB (llama3-1-405b-instruct)": 71.7,
|
| 70 |
+
"Human Idea + MLAB (gpt-4o)": 54.5
|
| 71 |
}
|
| 72 |
}
|
src/data/metrics/relative_improvement_to_human.json
CHANGED
|
@@ -6,7 +6,8 @@
|
|
| 6 |
"MLAB (o3-mini)": 0.3,
|
| 7 |
"MLAB (gpt-4o)": 0.3,
|
| 8 |
"MLAB (llama3-1-405b-instruct)": 0.5,
|
| 9 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
|
|
|
|
| 10 |
},
|
| 11 |
"llm-merging": {
|
| 12 |
"CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
|
|
@@ -15,60 +16,57 @@
|
|
| 15 |
"MLAB (gemini-exp-1206)": 5.0,
|
| 16 |
"MLAB (o3-mini)": -1.0,
|
| 17 |
"MLAB (gpt-4o)": 2.0,
|
| 18 |
-
"MLAB (llama3-1-405b-instruct)": -1.0
|
| 19 |
-
|
| 20 |
-
"meta-learning": {
|
| 21 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
|
| 22 |
-
"Top Human in Competition": 100.0,
|
| 23 |
-
"MLAB (claude-3-5-sonnet-v2)": 1.8,
|
| 24 |
-
"MLAB (gemini-exp-1206)": 1.8,
|
| 25 |
-
"MLAB (o3-mini)": -4.9,
|
| 26 |
-
"MLAB (gpt-4o)": 1.8,
|
| 27 |
-
"MLAB (llama3-1-405b-instruct)": 1.8
|
| 28 |
},
|
| 29 |
"product-recommendation": {
|
| 30 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
|
| 31 |
-
"Top Human in Competition": 100.0,
|
| 32 |
"MLAB (claude-3-5-sonnet-v2)": 3.0,
|
|
|
|
| 33 |
"MLAB (gemini-exp-1206)": 0.1,
|
| 34 |
"MLAB (o3-mini)": 0.1,
|
| 35 |
"MLAB (gpt-4o)": 0.6,
|
| 36 |
-
"MLAB (llama3-1-405b-instruct)": -0.0
|
|
|
|
|
|
|
| 37 |
},
|
| 38 |
"weather_forcast": {
|
| 39 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
| 40 |
"Top Human in Competition": 100.0,
|
| 41 |
-
"MLAB (
|
| 42 |
-
"MLAB (
|
| 43 |
-
"MLAB (
|
| 44 |
-
"MLAB (
|
| 45 |
-
"MLAB (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
},
|
| 47 |
"machine_unlearning": {
|
| 48 |
-
"
|
| 49 |
"Top Human in Competition": 100.0,
|
|
|
|
| 50 |
"MLAB (claude-3-5-sonnet-v2)": -94.7,
|
| 51 |
"MLAB (gemini-exp-1206)": 5.6,
|
| 52 |
"MLAB (o3-mini)": 3.6,
|
| 53 |
"MLAB (gpt-4o)": -18.0,
|
| 54 |
"MLAB (llama3-1-405b-instruct)": 6.2
|
| 55 |
},
|
| 56 |
-
"erasing_invisible_watermarks": {
|
| 57 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
|
| 58 |
-
"Top Human in Competition": 100.0,
|
| 59 |
-
"MLAB (claude-3-5-sonnet-v2)": 87.6,
|
| 60 |
-
"MLAB (gemini-exp-1206)": 97.5,
|
| 61 |
-
"MLAB (o3-mini)": 83.4,
|
| 62 |
-
"MLAB (gpt-4o)": 83.4,
|
| 63 |
-
"MLAB (llama3-1-405b-instruct)": 83.4
|
| 64 |
-
},
|
| 65 |
"backdoor-trigger-recovery": {
|
| 66 |
-
"
|
| 67 |
"Top Human in Competition": 100.0,
|
|
|
|
| 68 |
"MLAB (claude-3-5-sonnet-v2)": 39.9,
|
| 69 |
"MLAB (gemini-exp-1206)": 12.9,
|
| 70 |
"MLAB (o3-mini)": 6.2,
|
| 71 |
-
"MLAB (
|
| 72 |
-
"MLAB (
|
| 73 |
}
|
| 74 |
}
|
|
|
|
| 6 |
"MLAB (o3-mini)": 0.3,
|
| 7 |
"MLAB (gpt-4o)": 0.3,
|
| 8 |
"MLAB (llama3-1-405b-instruct)": 0.5,
|
| 9 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.4,
|
| 10 |
+
"Human Idea + MLAB (gpt-4o)": 0.5
|
| 11 |
},
|
| 12 |
"llm-merging": {
|
| 13 |
"CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
|
|
|
|
| 16 |
"MLAB (gemini-exp-1206)": 5.0,
|
| 17 |
"MLAB (o3-mini)": -1.0,
|
| 18 |
"MLAB (gpt-4o)": 2.0,
|
| 19 |
+
"MLAB (llama3-1-405b-instruct)": -1.0,
|
| 20 |
+
"Human Idea + MLAB (gpt-4o)": -1.0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
},
|
| 22 |
"product-recommendation": {
|
|
|
|
|
|
|
| 23 |
"MLAB (claude-3-5-sonnet-v2)": 3.0,
|
| 24 |
+
"Top Human in Competition": 100.0,
|
| 25 |
"MLAB (gemini-exp-1206)": 0.1,
|
| 26 |
"MLAB (o3-mini)": 0.1,
|
| 27 |
"MLAB (gpt-4o)": 0.6,
|
| 28 |
+
"MLAB (llama3-1-405b-instruct)": -0.0,
|
| 29 |
+
"Human Idea + MLAB (gpt-4o)": 2.2,
|
| 30 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.1
|
| 31 |
},
|
| 32 |
"weather_forcast": {
|
| 33 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 39.4,
|
| 34 |
"Top Human in Competition": 100.0,
|
| 35 |
+
"Human Idea + MLAB (gpt-4o)": 12.3,
|
| 36 |
+
"MLAB (claude-3-5-sonnet-v2)": 14.6,
|
| 37 |
+
"MLAB (gemini-exp-1206)": 43.1,
|
| 38 |
+
"MLAB (o3-mini)": 25.1,
|
| 39 |
+
"MLAB (gpt-4o)": 47.5,
|
| 40 |
+
"MLAB (llama3-1-405b-instruct)": 31.5
|
| 41 |
+
},
|
| 42 |
+
"meta-learning": {
|
| 43 |
+
"MLAB (claude-3-5-sonnet-v2)": -4.9,
|
| 44 |
+
"Top Human in Competition": 100.0,
|
| 45 |
+
"MLAB (gemini-exp-1206)": -1.1,
|
| 46 |
+
"MLAB (o3-mini)": -4.9,
|
| 47 |
+
"MLAB (gpt-4o)": -4.9,
|
| 48 |
+
"MLAB (llama3-1-405b-instruct)": -4.9,
|
| 49 |
+
"Human Idea + MLAB (gpt-4o)": -4.9,
|
| 50 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -4.9
|
| 51 |
},
|
| 52 |
"machine_unlearning": {
|
| 53 |
+
"Human Idea + MLAB (gpt-4o)": 6.8,
|
| 54 |
"Top Human in Competition": 100.0,
|
| 55 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 11.8,
|
| 56 |
"MLAB (claude-3-5-sonnet-v2)": -94.7,
|
| 57 |
"MLAB (gemini-exp-1206)": 5.6,
|
| 58 |
"MLAB (o3-mini)": 3.6,
|
| 59 |
"MLAB (gpt-4o)": -18.0,
|
| 60 |
"MLAB (llama3-1-405b-instruct)": 6.2
|
| 61 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
"backdoor-trigger-recovery": {
|
| 63 |
+
"MLAB (gpt-4o)": 11.9,
|
| 64 |
"Top Human in Competition": 100.0,
|
| 65 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 4.0,
|
| 66 |
"MLAB (claude-3-5-sonnet-v2)": 39.9,
|
| 67 |
"MLAB (gemini-exp-1206)": 12.9,
|
| 68 |
"MLAB (o3-mini)": 6.2,
|
| 69 |
+
"MLAB (llama3-1-405b-instruct)": 11.5,
|
| 70 |
+
"Human Idea + MLAB (gpt-4o)": 8.8
|
| 71 |
}
|
| 72 |
}
|
src/utils/config.py
CHANGED
|
@@ -82,7 +82,8 @@ model_categories = {
|
|
| 82 |
"MLAB (llama3-1-405b-instruct)": "Open Weights",
|
| 83 |
"CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
|
| 84 |
"Human": "Human",
|
| 85 |
-
"Top Human in Competition": "Human"
|
|
|
|
| 86 |
# More models would be added here as needed
|
| 87 |
}
|
| 88 |
|
|
|
|
| 82 |
"MLAB (llama3-1-405b-instruct)": "Open Weights",
|
| 83 |
"CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
|
| 84 |
"Human": "Human",
|
| 85 |
+
"Top Human in Competition": "Human",
|
| 86 |
+
"Human Idea + MLAB (gpt-4o)": "Closed Source"
|
| 87 |
# More models would be added here as needed
|
| 88 |
}
|
| 89 |
|
src/utils/task_mapping.py
CHANGED
|
@@ -8,6 +8,7 @@ task_display_names = {
|
|
| 8 |
"Llm Merging": "LLM Merging",
|
| 9 |
"Meta Learning": "Meta Learning",
|
| 10 |
"Product Recommendation": "Next Product Recommendation",
|
|
|
|
| 11 |
"Machine Unlearning": "Machine Unlearning",
|
| 12 |
"Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
|
| 13 |
}
|
|
|
|
| 8 |
"Llm Merging": "LLM Merging",
|
| 9 |
"Meta Learning": "Meta Learning",
|
| 10 |
"Product Recommendation": "Next Product Recommendation",
|
| 11 |
+
"Weather Forcast": "Rainfall Prediction",
|
| 12 |
"Machine Unlearning": "Machine Unlearning",
|
| 13 |
"Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
|
| 14 |
}
|