rrr-leaderboard / app.py
pasha
Cleanup and init
e3ed394
raw
history blame
1.14 kB
import streamlit as st
import pandas as pd
# Load CSV file
DATA_FILE = "test_all.csv"
df = pd.read_csv(DATA_FILE)
# Normalize column names
df.columns = df.columns.str.strip()
# Header interface
st.title("MSNP Accuracy Leaderboard")
st.markdown("""
[GitHub Repository](https://github.com/EvilFreelancer/rrr-benchmark)
The table shows the accuracy and performance of the models on the
[rrr-benchmark](https://huggingface.co/datasets/evilfreelancer/rrr-benchmark) dataset.
""")
# Sidebar filtering
with st.sidebar:
st.header("Filters")
model_options = st.multiselect("Select model:", options=sorted(df["model"].dropna().unique()))
# Apply filters
filtered_df = df.copy()
if model_options:
filtered_df = filtered_df[filtered_df["model"].isin(model_options)]
# Column formatting for display
format_dict = {
"accuracy": "{:.2%}".format,
"avg_response_time": "{:.3f}".format,
"avg_token_count": "{:.1f}".format
}
# Display the table sorted by accuracy in descending order
st.dataframe(
filtered_df.sort_values(by="accuracy", ascending=False).reset_index(drop=True).style.format(format_dict)
)