MLRC_Bench

Running

App Files Community

Armeddinosaur committed on Apr 4

Commit

678bdbb

1 Parent(s): eea50e2

Updating metrics

Browse files

Files changed (5) hide show

src/components/filters.py +4 -0
src/data/metrics/absolute_improvement_to_baseline.json +26 -28
src/data/metrics/relative_improvement_to_human.json +31 -33
src/utils/config.py +2 -1
src/utils/task_mapping.py +1 -0

src/components/filters.py CHANGED Viewed

@@ -20,6 +20,10 @@ def initialize_session_state(df):
     if 'selected_tasks' not in st.session_state:
         # Select all tasks by default, excluding Model Type
         st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
     if 'selected_model_types' not in st.session_state:
         # Ensure all model types are selected by default

     if 'selected_tasks' not in st.session_state:
         # Select all tasks by default, excluding Model Type
         st.session_state.selected_tasks = [col for col in df.columns if col not in ['Model Type']]
+    else:
+        # Make sure selected_tasks only includes actual tasks from the dataframe
+        valid_tasks = [col for col in df.columns if col not in ['Model Type']]
+        st.session_state.selected_tasks = [task for task in st.session_state.selected_tasks if task in valid_tasks]
     if 'selected_model_types' not in st.session_state:
         # Ensure all model types are selected by default

src/data/metrics/absolute_improvement_to_baseline.json CHANGED Viewed

@@ -6,7 +6,8 @@
     "MLAB (o3-mini)": 0.9,
     "MLAB (gpt-4o)": 0.9,
     "MLAB (llama3-1-405b-instruct)": 1.5,
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0
   },
   "llm-merging": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
@@ -15,60 +16,57 @@
     "MLAB (gemini-exp-1206)": 3.4,
     "MLAB (o3-mini)": -0.7,
     "MLAB (gpt-4o)": 1.4,
-    "MLAB (llama3-1-405b-instruct)": -0.7
-  },
-  "meta-learning": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 5.4,
-    "Top Human in Competition": 304.5,
-    "MLAB (claude-3-5-sonnet-v2)": 5.4,
-    "MLAB (gemini-exp-1206)": 5.4,
-    "MLAB (o3-mini)": -14.9,
-    "MLAB (gpt-4o)": 5.4,
-    "MLAB (llama3-1-405b-instruct)": 5.4
   },
   "product-recommendation": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 2.3,
-    "Top Human in Competition": 412.6,
     "MLAB (claude-3-5-sonnet-v2)": 12.3,
     "MLAB (gemini-exp-1206)": 0.6,
     "MLAB (o3-mini)": 0.6,
     "MLAB (gpt-4o)": 2.6,
-    "MLAB (llama3-1-405b-instruct)": -0.0
   },
   "weather_forcast": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
-    "Top Human in Competition": 399.4,
     "MLAB (claude-3-5-sonnet-v2)": 31.0,
     "MLAB (gemini-exp-1206)": 91.4,
     "MLAB (o3-mini)": 53.3,
     "MLAB (gpt-4o)": 100.8,
     "MLAB (llama3-1-405b-instruct)": 66.7
   },
   "machine_unlearning": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 8.8,
     "Top Human in Competition": 61.9,
     "MLAB (claude-3-5-sonnet-v2)": -58.6,
     "MLAB (gemini-exp-1206)": 3.5,
     "MLAB (o3-mini)": 2.2,
     "MLAB (gpt-4o)": -11.1,
     "MLAB (llama3-1-405b-instruct)": 3.8
   },
-  "erasing_invisible_watermarks": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 80.3,
-    "Top Human in Competition": 95.6,
-    "MLAB (claude-3-5-sonnet-v2)": 83.7,
-    "MLAB (gemini-exp-1206)": 93.3,
-    "MLAB (o3-mini)": 79.8,
-    "MLAB (gpt-4o)": 79.8,
-    "MLAB (llama3-1-405b-instruct)": 79.8
-  },
   "backdoor-trigger-recovery": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 85.0,
     "Top Human in Competition": 621.3,
     "MLAB (claude-3-5-sonnet-v2)": 247.9,
     "MLAB (gemini-exp-1206)": 80.4,
     "MLAB (o3-mini)": 38.8,
-    "MLAB (gpt-4o)": 64.5,
-    "MLAB (llama3-1-405b-instruct)": 71.7
   }
 }

     "MLAB (o3-mini)": 0.9,
     "MLAB (gpt-4o)": 0.9,
     "MLAB (llama3-1-405b-instruct)": 1.5,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.0,
+    "Human Idea + MLAB (gpt-4o)": 1.5
   },
   "llm-merging": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": -0.7,
     "MLAB (gemini-exp-1206)": 3.4,
     "MLAB (o3-mini)": -0.7,
     "MLAB (gpt-4o)": 1.4,
+    "MLAB (llama3-1-405b-instruct)": -0.7,
+    "Human Idea + MLAB (gpt-4o)": -0.7
   },
   "product-recommendation": {
     "MLAB (claude-3-5-sonnet-v2)": 12.3,
+    "Top Human in Competition": 412.6,
     "MLAB (gemini-exp-1206)": 0.6,
     "MLAB (o3-mini)": 0.6,
     "MLAB (gpt-4o)": 2.6,
+    "MLAB (llama3-1-405b-instruct)": -0.0,
+    "Human Idea + MLAB (gpt-4o)": 8.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6
   },
   "weather_forcast": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": 83.6,
+    "Top Human in Competition": 212.0,
+    "Human Idea + MLAB (gpt-4o)": 26.1,
     "MLAB (claude-3-5-sonnet-v2)": 31.0,
     "MLAB (gemini-exp-1206)": 91.4,
     "MLAB (o3-mini)": 53.3,
     "MLAB (gpt-4o)": 100.8,
     "MLAB (llama3-1-405b-instruct)": 66.7
   },
+  "meta-learning": {
+    "MLAB (claude-3-5-sonnet-v2)": -14.9,
+    "Top Human in Competition": 304.5,
+    "MLAB (gemini-exp-1206)": -3.2,
+    "MLAB (o3-mini)": -14.9,
+    "MLAB (gpt-4o)": -14.9,
+    "MLAB (llama3-1-405b-instruct)": -14.9,
+    "Human Idea + MLAB (gpt-4o)": -14.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": -14.9
+  },
   "machine_unlearning": {
+    "Human Idea + MLAB (gpt-4o)": 4.2,
     "Top Human in Competition": 61.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 7.3,
     "MLAB (claude-3-5-sonnet-v2)": -58.6,
     "MLAB (gemini-exp-1206)": 3.5,
     "MLAB (o3-mini)": 2.2,
     "MLAB (gpt-4o)": -11.1,
     "MLAB (llama3-1-405b-instruct)": 3.8
   },
   "backdoor-trigger-recovery": {
+    "MLAB (gpt-4o)": 74.0,
     "Top Human in Competition": 621.3,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 24.9,
     "MLAB (claude-3-5-sonnet-v2)": 247.9,
     "MLAB (gemini-exp-1206)": 80.4,
     "MLAB (o3-mini)": 38.8,
+    "MLAB (llama3-1-405b-instruct)": 71.7,
+    "Human Idea + MLAB (gpt-4o)": 54.5
   }
 }

src/data/metrics/relative_improvement_to_human.json CHANGED Viewed

@@ -6,7 +6,8 @@
     "MLAB (o3-mini)": 0.3,
     "MLAB (gpt-4o)": 0.3,
     "MLAB (llama3-1-405b-instruct)": 0.5,
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4
   },
   "llm-merging": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
@@ -15,60 +16,57 @@
     "MLAB (gemini-exp-1206)": 5.0,
     "MLAB (o3-mini)": -1.0,
     "MLAB (gpt-4o)": 2.0,
-    "MLAB (llama3-1-405b-instruct)": -1.0
-  },
-  "meta-learning": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 1.8,
-    "Top Human in Competition": 100.0,
-    "MLAB (claude-3-5-sonnet-v2)": 1.8,
-    "MLAB (gemini-exp-1206)": 1.8,
-    "MLAB (o3-mini)": -4.9,
-    "MLAB (gpt-4o)": 1.8,
-    "MLAB (llama3-1-405b-instruct)": 1.8
   },
   "product-recommendation": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.6,
-    "Top Human in Competition": 100.0,
     "MLAB (claude-3-5-sonnet-v2)": 3.0,
     "MLAB (gemini-exp-1206)": 0.1,
     "MLAB (o3-mini)": 0.1,
     "MLAB (gpt-4o)": 0.6,
-    "MLAB (llama3-1-405b-instruct)": -0.0
   },
   "weather_forcast": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 20.9,
     "Top Human in Competition": 100.0,
-    "MLAB (claude-3-5-sonnet-v2)": 7.8,
-    "MLAB (gemini-exp-1206)": 22.9,
-    "MLAB (o3-mini)": 13.3,
-    "MLAB (gpt-4o)": 25.2,
-    "MLAB (llama3-1-405b-instruct)": 16.7
   },
   "machine_unlearning": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 14.2,
     "Top Human in Competition": 100.0,
     "MLAB (claude-3-5-sonnet-v2)": -94.7,
     "MLAB (gemini-exp-1206)": 5.6,
     "MLAB (o3-mini)": 3.6,
     "MLAB (gpt-4o)": -18.0,
     "MLAB (llama3-1-405b-instruct)": 6.2
   },
-  "erasing_invisible_watermarks": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 84.0,
-    "Top Human in Competition": 100.0,
-    "MLAB (claude-3-5-sonnet-v2)": 87.6,
-    "MLAB (gemini-exp-1206)": 97.5,
-    "MLAB (o3-mini)": 83.4,
-    "MLAB (gpt-4o)": 83.4,
-    "MLAB (llama3-1-405b-instruct)": 83.4
-  },
   "backdoor-trigger-recovery": {
-    "CoI-Agent (o1) + MLAB (gpt-4o)": 13.7,
     "Top Human in Competition": 100.0,
     "MLAB (claude-3-5-sonnet-v2)": 39.9,
     "MLAB (gemini-exp-1206)": 12.9,
     "MLAB (o3-mini)": 6.2,
-    "MLAB (gpt-4o)": 10.4,
-    "MLAB (llama3-1-405b-instruct)": 11.5
   }
 }

     "MLAB (o3-mini)": 0.3,
     "MLAB (gpt-4o)": 0.3,
     "MLAB (llama3-1-405b-instruct)": 0.5,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.4,
+    "Human Idea + MLAB (gpt-4o)": 0.5
   },
   "llm-merging": {
     "CoI-Agent (o1) + MLAB (gpt-4o)": -1.0,
     "MLAB (gemini-exp-1206)": 5.0,
     "MLAB (o3-mini)": -1.0,
     "MLAB (gpt-4o)": 2.0,
+    "MLAB (llama3-1-405b-instruct)": -1.0,
+    "Human Idea + MLAB (gpt-4o)": -1.0
   },
   "product-recommendation": {
     "MLAB (claude-3-5-sonnet-v2)": 3.0,
+    "Top Human in Competition": 100.0,
     "MLAB (gemini-exp-1206)": 0.1,
     "MLAB (o3-mini)": 0.1,
     "MLAB (gpt-4o)": 0.6,
+    "MLAB (llama3-1-405b-instruct)": -0.0,
+    "Human Idea + MLAB (gpt-4o)": 2.2,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 0.1
   },
   "weather_forcast": {
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 39.4,
     "Top Human in Competition": 100.0,
+    "Human Idea + MLAB (gpt-4o)": 12.3,
+    "MLAB (claude-3-5-sonnet-v2)": 14.6,
+    "MLAB (gemini-exp-1206)": 43.1,
+    "MLAB (o3-mini)": 25.1,
+    "MLAB (gpt-4o)": 47.5,
+    "MLAB (llama3-1-405b-instruct)": 31.5
+  },
+  "meta-learning": {
+    "MLAB (claude-3-5-sonnet-v2)": -4.9,
+    "Top Human in Competition": 100.0,
+    "MLAB (gemini-exp-1206)": -1.1,
+    "MLAB (o3-mini)": -4.9,
+    "MLAB (gpt-4o)": -4.9,
+    "MLAB (llama3-1-405b-instruct)": -4.9,
+    "Human Idea + MLAB (gpt-4o)": -4.9,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": -4.9
   },
   "machine_unlearning": {
+    "Human Idea + MLAB (gpt-4o)": 6.8,
     "Top Human in Competition": 100.0,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 11.8,
     "MLAB (claude-3-5-sonnet-v2)": -94.7,
     "MLAB (gemini-exp-1206)": 5.6,
     "MLAB (o3-mini)": 3.6,
     "MLAB (gpt-4o)": -18.0,
     "MLAB (llama3-1-405b-instruct)": 6.2
   },
   "backdoor-trigger-recovery": {
+    "MLAB (gpt-4o)": 11.9,
     "Top Human in Competition": 100.0,
+    "CoI-Agent (o1) + MLAB (gpt-4o)": 4.0,
     "MLAB (claude-3-5-sonnet-v2)": 39.9,
     "MLAB (gemini-exp-1206)": 12.9,
     "MLAB (o3-mini)": 6.2,
+    "MLAB (llama3-1-405b-instruct)": 11.5,
+    "Human Idea + MLAB (gpt-4o)": 8.8
   }
 }

src/utils/config.py CHANGED Viewed

@@ -82,7 +82,8 @@ model_categories = {
     "MLAB (llama3-1-405b-instruct)": "Open Weights",
     "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
     "Human": "Human",
-    "Top Human in Competition": "Human"
     # More models would be added here as needed
 }

     "MLAB (llama3-1-405b-instruct)": "Open Weights",
     "CoI-Agent (o1) + MLAB (gpt-4o)": "Closed Source",
     "Human": "Human",
+    "Top Human in Competition": "Human",
+    "Human Idea + MLAB (gpt-4o)": "Closed Source"
     # More models would be added here as needed
 }

src/utils/task_mapping.py CHANGED Viewed

@@ -8,6 +8,7 @@ task_display_names = {
     "Llm Merging": "LLM Merging",
     "Meta Learning": "Meta Learning",
     "Product Recommendation": "Next Product Recommendation",
     "Machine Unlearning": "Machine Unlearning",
     "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
 }

     "Llm Merging": "LLM Merging",
     "Meta Learning": "Meta Learning",
     "Product Recommendation": "Next Product Recommendation",
+    "Weather Forcast": "Rainfall Prediction",
     "Machine Unlearning": "Machine Unlearning",
     "Backdoor Trigger Recovery": "Backdoor Trigger Recovery"
 }