Spaces:
Running
Running
openhands openhands committed on
Commit ·
70749cd
1
Parent(s): 6d3b657
Show OpenHands on alternate agents page for shared models
Browse files
This commit was created by an AI assistant (OpenHands) on behalf of the user.
Co-authored-by: openhands <openhands@all-hands.dev>
- alternative_agents_page.py +42 -4
alternative_agents_page.py
CHANGED
|
@@ -13,9 +13,15 @@ This page is intentionally a single Overall view (no per-category
|
|
| 13 |
subpages) — the alternative-agents dataset is small (one row per
|
| 14 |
harness × model) and the goal is "show me all the alternatives at a
|
| 15 |
glance", not "drill into Issue Resolution for Codex".
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
"""
|
| 17 |
import matplotlib
|
| 18 |
matplotlib.use('Agg')
|
|
|
|
| 19 |
import gradio as gr
|
| 20 |
|
| 21 |
from simple_data_loader import SimpleLeaderboardViewer
|
|
@@ -30,15 +36,45 @@ ALTERNATIVE_AGENTS_INTRO = """
|
|
| 30 |
<h2>Alternative Agents</h2>
|
| 31 |
<p>
|
| 32 |
Third-party agent harnesses running the OpenHands Index benchmarks.
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
instrumentation and aren't directly comparable
|
|
|
|
| 37 |
</p>
|
| 38 |
</div>
|
| 39 |
"""
|
| 40 |
|
| 41 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 42 |
def build_page():
|
| 43 |
gr.HTML(ALTERNATIVE_AGENTS_INTRO)
|
| 44 |
|
|
@@ -57,6 +93,8 @@ def build_page():
|
|
| 57 |
)
|
| 58 |
return
|
| 59 |
|
|
|
|
|
|
|
| 60 |
create_leaderboard_display(
|
| 61 |
full_df=test_df,
|
| 62 |
tag_map=test_tag_map,
|
|
|
|
| 13 |
subpages) — the alternative-agents dataset is small (one row per
|
| 14 |
harness × model) and the goal is "show me all the alternatives at a
|
| 15 |
glance", not "drill into Issue Resolution for Codex".
|
| 16 |
+
|
| 17 |
+
To make same-model comparisons easier, the page also appends canonical
|
| 18 |
+
OpenHands rows for any language model that appears in the alternative
|
| 19 |
+
agent dataset. The match is exact, so ``Gemini-3-Pro`` and
|
| 20 |
+
``Gemini-3.1-Pro`` remain distinct entries.
|
| 21 |
"""
|
| 22 |
import matplotlib
|
| 23 |
matplotlib.use('Agg')
|
| 24 |
+
import pandas as pd
|
| 25 |
import gradio as gr
|
| 26 |
|
| 27 |
from simple_data_loader import SimpleLeaderboardViewer
|
|
|
|
| 36 |
<h2>Alternative Agents</h2>
|
| 37 |
<p>
|
| 38 |
Third-party agent harnesses running the OpenHands Index benchmarks.
|
| 39 |
+
To make direct comparisons easier, this page also includes the
|
| 40 |
+
canonical OpenHands row whenever the exact same language model appears
|
| 41 |
+
under an alternative harness. Cost and runtime numbers still come from
|
| 42 |
+
each harness's own instrumentation and aren't directly comparable
|
| 43 |
+
across harnesses.
|
| 44 |
</p>
|
| 45 |
</div>
|
| 46 |
"""
|
| 47 |
|
| 48 |
|
| 49 |
+
def _append_openhands_shared_models(
    alternative_df: pd.DataFrame,
    split: str,
) -> pd.DataFrame:
    """Append OpenHands rows for models that also run under an alternative harness.

    Model names are compared exactly after stripping surrounding whitespace,
    so ``Gemini-3-Pro`` and ``Gemini-3.1-Pro`` stay distinct entries.

    Args:
        alternative_df: Leaderboard rows for the alternative agent harnesses.
            Only its "Language Model" column is inspected.
        split: Dataset split name forwarded to ``get_full_leaderboard_data``.

    Returns:
        ``alternative_df`` itself (same object, unmodified) when there is
        nothing to append: the frame is empty, either frame lacks a
        "Language Model" column, no usable model name is present, or no
        OpenHands row shares a model. Otherwise a new frame with the matching
        OpenHands rows concatenated after the alternative rows and a fresh
        integer index.
    """
    model_column = "Language Model"

    if alternative_df.empty or model_column not in alternative_df.columns:
        return alternative_df

    # Collect the names first: it is cheap, and it lets us skip loading the
    # OpenHands data entirely when there is nothing to match against.
    alternative_models = set(
        alternative_df[model_column].dropna().astype(str).str.strip()
    )
    # A blank name identifies no model, so it must never count as "shared".
    alternative_models.discard("")
    if not alternative_models:
        return alternative_df

    # The loader is unpacked as a pair; only the first item (the data frame)
    # is used here.
    openhands_df, _ = get_full_leaderboard_data(
        split,
        agent_filter=SimpleLeaderboardViewer.AGENT_FILTER_OPENHANDS,
    )
    if openhands_df.empty or model_column not in openhands_df.columns:
        return alternative_df

    openhands_models = openhands_df[model_column]
    # Missing values are excluded explicitly: ``astype(str)`` would turn NaN
    # into the literal string "nan", whereas the alternative side drops NaN.
    shared_mask = openhands_models.notna() & (
        openhands_models.astype(str).str.strip().isin(alternative_models)
    )
    openhands_shared_df = openhands_df[shared_mask].copy()
    if openhands_shared_df.empty:
        return alternative_df

    return pd.concat(
        [alternative_df, openhands_shared_df],
        ignore_index=True,
        sort=False,
    )
|
| 76 |
+
|
| 77 |
+
|
| 78 |
def build_page():
|
| 79 |
gr.HTML(ALTERNATIVE_AGENTS_INTRO)
|
| 80 |
|
|
|
|
| 93 |
)
|
| 94 |
return
|
| 95 |
|
| 96 |
+
test_df = _append_openhands_shared_models(test_df, split="test")
|
| 97 |
+
|
| 98 |
create_leaderboard_display(
|
| 99 |
full_df=test_df,
|
| 100 |
tag_map=test_tag_map,
|