Spaces:
Running
Running
Commit
·
cf2253a
1
Parent(s):
72d60b0
Updating metrics
Browse files
src/data/metrics/absolute_improvement_to_baseline.json
CHANGED
|
@@ -1,56 +1,74 @@
|
|
| 1 |
{
|
| 2 |
"perception_temporal_action_loc": {
|
| 3 |
-
"MLAB (claude-3-5-sonnet-v2)": 2.
|
| 4 |
-
"Top Human in Competition": 284.
|
| 5 |
-
"MLAB (gemini-exp-1206)": -1.
|
| 6 |
-
"MLAB (o3-mini)": 0.
|
| 7 |
-
"MLAB (gpt-4o)": 0.
|
| 8 |
-
"MLAB (llama3-1-405b-instruct)": 1.
|
| 9 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0
|
| 10 |
},
|
| 11 |
"llm-merging": {
|
| 12 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.
|
| 13 |
-
"Top Human in Competition": 68.
|
| 14 |
-
"MLAB (claude-3-5-sonnet-v2)": 3.
|
| 15 |
-
"MLAB (gemini-exp-1206)": 3.
|
| 16 |
-
"MLAB (o3-mini)": -0.
|
| 17 |
-
"MLAB (gpt-4o)": 1.
|
| 18 |
-
"MLAB (llama3-1-405b-instruct)": -0.
|
| 19 |
},
|
| 20 |
"meta-learning": {
|
| 21 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 5.
|
| 22 |
-
"Top Human in Competition": 304.
|
| 23 |
-
"MLAB (claude-3-5-sonnet-v2)": 5.
|
| 24 |
-
"MLAB (gemini-exp-1206)": 5.
|
| 25 |
-
"MLAB (o3-mini)": -14.
|
| 26 |
-
"MLAB (gpt-4o)": 5.
|
| 27 |
-
"MLAB (llama3-1-405b-instruct)": 5.
|
| 28 |
},
|
| 29 |
"product-recommendation": {
|
| 30 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
| 31 |
-
"Top Human in Competition": 412.
|
| 32 |
-
"MLAB (claude-3-5-sonnet-v2)": 12.
|
| 33 |
-
"MLAB (gemini-exp-1206)": 0.
|
| 34 |
-
"MLAB (o3-mini)": 0.
|
| 35 |
-
"MLAB (gpt-4o)": 2.
|
| 36 |
-
"MLAB (llama3-1-405b-instruct)": -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
},
|
| 38 |
"machine_unlearning": {
|
| 39 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
| 40 |
-
"Top Human in Competition": 61.
|
| 41 |
-
"MLAB (claude-3-5-sonnet-v2)": -58.
|
| 42 |
-
"MLAB (gemini-exp-1206)": 3.
|
| 43 |
-
"MLAB (o3-mini)": 2.
|
| 44 |
-
"MLAB (gpt-4o)": -11.
|
| 45 |
-
"MLAB (llama3-1-405b-instruct)": 3.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
},
|
| 47 |
"backdoor-trigger-recovery": {
|
| 48 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
| 49 |
-
"Top Human in Competition": 621.
|
| 50 |
-
"MLAB (claude-3-5-sonnet-v2)": 247.
|
| 51 |
-
"MLAB (gemini-exp-1206)": 80.
|
| 52 |
-
"MLAB (o3-mini)": 38.
|
| 53 |
-
"MLAB (gpt-4o)": 64.
|
| 54 |
-
"MLAB (llama3-1-405b-instruct)": 71.
|
| 55 |
}
|
| 56 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"perception_temporal_action_loc": {
|
| 3 |
+
"MLAB (claude-3-5-sonnet-v2)": 2.2,
|
| 4 |
+
"Top Human in Competition": 284.6,
|
| 5 |
+
"MLAB (gemini-exp-1206)": -1.3,
|
| 6 |
+
"MLAB (o3-mini)": 0.9,
|
| 7 |
+
"MLAB (gpt-4o)": 0.9,
|
| 8 |
+
"MLAB (llama3-1-405b-instruct)": 1.5,
|
| 9 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
|
| 10 |
},
|
| 11 |
"llm-merging": {
|
| 12 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
|
| 13 |
+
"Top Human in Competition": 68.2,
|
| 14 |
+
"MLAB (claude-3-5-sonnet-v2)": 3.4,
|
| 15 |
+
"MLAB (gemini-exp-1206)": 3.4,
|
| 16 |
+
"MLAB (o3-mini)": -0.7,
|
| 17 |
+
"MLAB (gpt-4o)": 1.4,
|
| 18 |
+
"MLAB (llama3-1-405b-instruct)": -0.7
|
| 19 |
},
|
| 20 |
"meta-learning": {
|
| 21 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
|
| 22 |
+
"Top Human in Competition": 304.5,
|
| 23 |
+
"MLAB (claude-3-5-sonnet-v2)": 5.4,
|
| 24 |
+
"MLAB (gemini-exp-1206)": 5.4,
|
| 25 |
+
"MLAB (o3-mini)": -14.9,
|
| 26 |
+
"MLAB (gpt-4o)": 5.4,
|
| 27 |
+
"MLAB (llama3-1-405b-instruct)": 5.4
|
| 28 |
},
|
| 29 |
"product-recommendation": {
|
| 30 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
|
| 31 |
+
"Top Human in Competition": 412.6,
|
| 32 |
+
"MLAB (claude-3-5-sonnet-v2)": 12.3,
|
| 33 |
+
"MLAB (gemini-exp-1206)": 0.6,
|
| 34 |
+
"MLAB (o3-mini)": 0.6,
|
| 35 |
+
"MLAB (gpt-4o)": 2.6,
|
| 36 |
+
"MLAB (llama3-1-405b-instruct)": -0.0
|
| 37 |
+
},
|
| 38 |
+
"weather_forcast": {
|
| 39 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
|
| 40 |
+
"Top Human in Competition": 399.4,
|
| 41 |
+
"MLAB (claude-3-5-sonnet-v2)": 31.0,
|
| 42 |
+
"MLAB (gemini-exp-1206)": 91.4,
|
| 43 |
+
"MLAB (o3-mini)": 53.3,
|
| 44 |
+
"MLAB (gpt-4o)": 100.8,
|
| 45 |
+
"MLAB (llama3-1-405b-instruct)": 66.7
|
| 46 |
},
|
| 47 |
"machine_unlearning": {
|
| 48 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 8.8,
|
| 49 |
+
"Top Human in Competition": 61.9,
|
| 50 |
+
"MLAB (claude-3-5-sonnet-v2)": -58.6,
|
| 51 |
+
"MLAB (gemini-exp-1206)": 3.5,
|
| 52 |
+
"MLAB (o3-mini)": 2.2,
|
| 53 |
+
"MLAB (gpt-4o)": -11.1,
|
| 54 |
+
"MLAB (llama3-1-405b-instruct)": 3.8
|
| 55 |
+
},
|
| 56 |
+
"erasing_invisible_watermarks": {
|
| 57 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -0.0,
|
| 58 |
+
"Top Human in Competition": -95.6,
|
| 59 |
+
"MLAB (claude-3-5-sonnet-v2)": -0.0,
|
| 60 |
+
"MLAB (gemini-exp-1206)": -0.0,
|
| 61 |
+
"MLAB (o3-mini)": -0.0,
|
| 62 |
+
"MLAB (gpt-4o)": 0.5,
|
| 63 |
+
"MLAB (llama3-1-405b-instruct)": -0.0
|
| 64 |
},
|
| 65 |
"backdoor-trigger-recovery": {
|
| 66 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 85.0,
|
| 67 |
+
"Top Human in Competition": 621.3,
|
| 68 |
+
"MLAB (claude-3-5-sonnet-v2)": 247.9,
|
| 69 |
+
"MLAB (gemini-exp-1206)": 80.4,
|
| 70 |
+
"MLAB (o3-mini)": 38.8,
|
| 71 |
+
"MLAB (gpt-4o)": 64.5,
|
| 72 |
+
"MLAB (llama3-1-405b-instruct)": 71.7
|
| 73 |
}
|
| 74 |
}
|
src/data/metrics/relative_improvement_to_human.json
CHANGED
|
@@ -1,56 +1,74 @@
|
|
| 1 |
{
|
| 2 |
"perception_temporal_action_loc": {
|
| 3 |
-
"MLAB (claude-3-5-sonnet-v2)": 0.
|
| 4 |
"Top Human in Competition": 100.0,
|
| 5 |
-
"MLAB (gemini-exp-1206)": -0.
|
| 6 |
-
"MLAB (o3-mini)": 0.
|
| 7 |
-
"MLAB (gpt-4o)": 0.
|
| 8 |
-
"MLAB (llama3-1-405b-instruct)": 0.
|
| 9 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.
|
| 10 |
},
|
| 11 |
"llm-merging": {
|
| 12 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": -0
|
| 13 |
"Top Human in Competition": 100.0,
|
| 14 |
-
"MLAB (claude-3-5-sonnet-v2)":
|
| 15 |
-
"MLAB (gemini-exp-1206)":
|
| 16 |
-
"MLAB (o3-mini)": -0
|
| 17 |
-
"MLAB (gpt-4o)":
|
| 18 |
-
"MLAB (llama3-1-405b-instruct)": -0
|
| 19 |
},
|
| 20 |
"meta-learning": {
|
| 21 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.
|
| 22 |
"Top Human in Competition": 100.0,
|
| 23 |
-
"MLAB (claude-3-5-sonnet-v2)": 1.
|
| 24 |
-
"MLAB (gemini-exp-1206)": 1.
|
| 25 |
-
"MLAB (o3-mini)": -4.
|
| 26 |
-
"MLAB (gpt-4o)": 1.
|
| 27 |
-
"MLAB (llama3-1-405b-instruct)": 1.
|
| 28 |
},
|
| 29 |
"product-recommendation": {
|
| 30 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.
|
| 31 |
"Top Human in Competition": 100.0,
|
| 32 |
-
"MLAB (claude-3-5-sonnet-v2)":
|
| 33 |
-
"MLAB (gemini-exp-1206)": 0.
|
| 34 |
-
"MLAB (o3-mini)": 0.
|
| 35 |
-
"MLAB (gpt-4o)": 0.
|
| 36 |
-
"MLAB (llama3-1-405b-instruct)": -
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
},
|
| 38 |
"machine_unlearning": {
|
| 39 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
"Top Human in Competition": 100.0,
|
| 41 |
-
"MLAB (claude-3-5-sonnet-v2)":
|
| 42 |
-
"MLAB (gemini-exp-1206)": 5
|
| 43 |
-
"MLAB (o3-mini)":
|
| 44 |
-
"MLAB (gpt-4o)":
|
| 45 |
-
"MLAB (llama3-1-405b-instruct)":
|
| 46 |
},
|
| 47 |
"backdoor-trigger-recovery": {
|
| 48 |
-
"CoI-Agent (o1) + MLAB (gpt-4o)":
|
| 49 |
"Top Human in Competition": 100.0,
|
| 50 |
-
"MLAB (claude-3-5-sonnet-v2)": 39.
|
| 51 |
-
"MLAB (gemini-exp-1206)": 12.
|
| 52 |
-
"MLAB (o3-mini)": 6.
|
| 53 |
-
"MLAB (gpt-4o)": 10.
|
| 54 |
-
"MLAB (llama3-1-405b-instruct)": 11.
|
| 55 |
}
|
| 56 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"perception_temporal_action_loc": {
|
| 3 |
+
"MLAB (claude-3-5-sonnet-v2)": 0.8,
|
| 4 |
"Top Human in Competition": 100.0,
|
| 5 |
+
"MLAB (gemini-exp-1206)": -0.5,
|
| 6 |
+
"MLAB (o3-mini)": 0.3,
|
| 7 |
+
"MLAB (gpt-4o)": 0.3,
|
| 8 |
+
"MLAB (llama3-1-405b-instruct)": 0.5,
|
| 9 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
|
| 10 |
},
|
| 11 |
"llm-merging": {
|
| 12 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
|
| 13 |
"Top Human in Competition": 100.0,
|
| 14 |
+
"MLAB (claude-3-5-sonnet-v2)": 5.0,
|
| 15 |
+
"MLAB (gemini-exp-1206)": 5.0,
|
| 16 |
+
"MLAB (o3-mini)": -1.0,
|
| 17 |
+
"MLAB (gpt-4o)": 2.0,
|
| 18 |
+
"MLAB (llama3-1-405b-instruct)": -1.0
|
| 19 |
},
|
| 20 |
"meta-learning": {
|
| 21 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
|
| 22 |
"Top Human in Competition": 100.0,
|
| 23 |
+
"MLAB (claude-3-5-sonnet-v2)": 1.8,
|
| 24 |
+
"MLAB (gemini-exp-1206)": 1.8,
|
| 25 |
+
"MLAB (o3-mini)": -4.9,
|
| 26 |
+
"MLAB (gpt-4o)": 1.8,
|
| 27 |
+
"MLAB (llama3-1-405b-instruct)": 1.8
|
| 28 |
},
|
| 29 |
"product-recommendation": {
|
| 30 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
|
| 31 |
"Top Human in Competition": 100.0,
|
| 32 |
+
"MLAB (claude-3-5-sonnet-v2)": 3.0,
|
| 33 |
+
"MLAB (gemini-exp-1206)": 0.1,
|
| 34 |
+
"MLAB (o3-mini)": 0.1,
|
| 35 |
+
"MLAB (gpt-4o)": 0.6,
|
| 36 |
+
"MLAB (llama3-1-405b-instruct)": -0.0
|
| 37 |
+
},
|
| 38 |
+
"weather_forcast": {
|
| 39 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 20.9,
|
| 40 |
+
"Top Human in Competition": 100.0,
|
| 41 |
+
"MLAB (claude-3-5-sonnet-v2)": 7.8,
|
| 42 |
+
"MLAB (gemini-exp-1206)": 22.9,
|
| 43 |
+
"MLAB (o3-mini)": 13.3,
|
| 44 |
+
"MLAB (gpt-4o)": 25.2,
|
| 45 |
+
"MLAB (llama3-1-405b-instruct)": 16.7
|
| 46 |
},
|
| 47 |
"machine_unlearning": {
|
| 48 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 14.2,
|
| 49 |
+
"Top Human in Competition": 100.0,
|
| 50 |
+
"MLAB (claude-3-5-sonnet-v2)": -94.7,
|
| 51 |
+
"MLAB (gemini-exp-1206)": 5.6,
|
| 52 |
+
"MLAB (o3-mini)": 3.6,
|
| 53 |
+
"MLAB (gpt-4o)": -18.0,
|
| 54 |
+
"MLAB (llama3-1-405b-instruct)": 6.2
|
| 55 |
+
},
|
| 56 |
+
"erasing_invisible_watermarks": {
|
| 57 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
|
| 58 |
"Top Human in Competition": 100.0,
|
| 59 |
+
"MLAB (claude-3-5-sonnet-v2)": 87.6,
|
| 60 |
+
"MLAB (gemini-exp-1206)": 97.5,
|
| 61 |
+
"MLAB (o3-mini)": 83.4,
|
| 62 |
+
"MLAB (gpt-4o)": 83.4,
|
| 63 |
+
"MLAB (llama3-1-405b-instruct)": 83.4
|
| 64 |
},
|
| 65 |
"backdoor-trigger-recovery": {
|
| 66 |
+
"CoI-Agent (o1) + MLAB (gpt-4o)": 13.7,
|
| 67 |
"Top Human in Competition": 100.0,
|
| 68 |
+
"MLAB (claude-3-5-sonnet-v2)": 39.9,
|
| 69 |
+
"MLAB (gemini-exp-1206)": 12.9,
|
| 70 |
+
"MLAB (o3-mini)": 6.2,
|
| 71 |
+
"MLAB (gpt-4o)": 10.4,
|
| 72 |
+
"MLAB (llama3-1-405b-instruct)": 11.5
|
| 73 |
}
|
| 74 |
}
|