leaderboard / benchmark.json
adedomenico's picture
Update benchmark.json
891279c verified
[
{
"provider": "Qwen",
"name": "Qwen3-32B",
"repo": "qwen/qwen3-32b",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 34.78, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 76.14, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 69.51, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 33.77, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 23.1, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "OpenAI",
"name": "GPT-OSS-20B",
"repo": "openai/gpt-oss-20b",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 30.18, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 75.79, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 53.80, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 40.10, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 20.00, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "OpenAI",
"name": "GPT-OSS-120B",
"repo": "openai/gpt-oss-120b",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 36.25, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 78.51, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 60.40, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 44.70, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 28.7, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "ByteDance",
"name": "Seed-OSS-36B",
"repo": "bytedance/seed-oss-36b-instruct",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 37.66, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 75.67, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 56.05, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 57.00, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": null, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "LiquidAI",
"name": "LFM2-2.6B",
"repo": "liquidai/LFM2-2.6B",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 24.2, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 57.90, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 18.05, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 9.08, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 19.73, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "IBM Granite",
"name": "granite-3.3-8b-instruct",
"repo": "ibm-granite/granite-3.3-8b-instruct",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 26.00, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 62.35, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 14.37, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 13.31, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 20.39, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "IBM Granite",
"name": "granite-4.0-h-small",
"repo": "ibm-granite/granite-4.0-h-small",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 32.53, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 72.15, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 32.4, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 17.24, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 21.50, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Microsoft",
"name": "Phi-4-mini-instruct",
"repo": "microsoft/phi-4-mini-instruct",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 24.87, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 45.90, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 14.4, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 6.56, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 20.53, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Swiss AI",
"name": "Apertus-8B-Instruct-2509",
"repo": "swiss-ai/Apertus-8B-Instruct-2509",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 20.38, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 56.40, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 6.03, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 4.25, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 20.53, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Meta",
"name": "Llama-3.3-70B-Instruct",
"repo": "meta/llama-3.3-70b-instruct",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 55.18, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 74.98, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 36.23, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 22.01, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 23.60, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Qwen",
"name": "Qwen3-4B",
"repo": "qwen/qwen3-4b",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 31.65, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 70.50, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 45.62, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 32.00, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 23.10, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Qwen",
"name": "Qwen3-8B",
"repo": "qwen/qwen3-8b",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 31.42, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 73.21, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 49.73, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 36.42, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 20.13, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "DeepSeek",
"name": "DeepSeek-R1-Distill-Qwen-7B",
"repo": "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 21.43, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 69.31, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 24.9, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 12.05, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 19.73, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Meta",
"name": "Llama-3.1-8B-Instruct",
"repo": "meta/llama-3.1-8b-instruct",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 25.27, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 68.03, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 13.56, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 13.42, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 20.27, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Mistral",
"name": "Mistral-Large-123B",
"repo": "mistralai/Mistral-Large-Instruct-2411",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 54.95, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 75.85, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 38.80, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 30.55, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 24.5, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "OpenAI",
"name": "GPT-5",
"repo": "openai/gpt-5",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 67.90, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 82.51, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 70.27, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 80.00, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 27.07, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Grok",
"name": "Grok-4-fast",
"repo": "grok/grok-4-fast",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 60.60, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 79.39, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 62.80, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 78.12, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 26.67, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Claude",
"name": "Claude-Sonnet-4.5",
"repo": "claude/claude-sonnet-4.5",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 65.25, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 80.57, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 62.80, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 67.12, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 27.47, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Google",
"name": "Gemini-2.5-pro",
"repo": "google/gemini-2.5-pro",
"updated_at": "2025-09-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 68.20, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 80.11, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 74.40, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 44.30, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 25.20, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "Google",
"name": "AT&T FT Gemma-3-4B-IT",
"repo": "AT&T/gemma-3-4b-fine-tuned",
"updated_at": "2025-10-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": null, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": null, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": null, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 80.09, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": null, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "NetoAI",
"name": "TSLAM-2B MINI",
"repo": "NetoAI/TSLAM-2B MINI",
"updated_at": "2025-10-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 27, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 62, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 4.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 13.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 19.73, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "NetoAI",
"name": "TSLAM-18B",
"repo": "NetoAI/TSLAM-18B",
"updated_at": "2025-10-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 63.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 72, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 69.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 20.62, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 24.05, "energy_consumed": "", "co2_consumed": ""}
]
},
{
"provider": "NetoAI",
"name": "TSLAM-G3",
"repo": "NetoAI/TSLAM-G3",
"updated_at": "2025-10-16T00:00:00Z",
"scores": [
{"dataset_name": "3GPP-TSG", "metric_type": "raw", "score": 58.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleQna", "metric_type": "raw", "score": 82.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleMath", "metric_type": "raw", "score": 26.5, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleLogs", "metric_type": "raw", "score": 11.25, "energy_consumed": "", "co2_consumed": ""},
{"dataset_name": "TeleYAML", "metric_type": "llm-as-judge", "score": 21.73, "energy_consumed": "", "co2_consumed": ""}
]
}
]