from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard

    # --- Generation (Table 1 in Paper) ---
    # 1. LongText2Video
    longtext_clip = Task("LongText2Video", "clip_score", "LongText2Video / CLIP")
    longtext_dino = Task("LongText2Video", "dino_score", "LongText2Video / DINO")
    longtext_mllm = Task("LongText2Video", "mllm_judge", "LongText2Video / MLLM")

    # 2. Entities2Video
    entities_clip = Task("Entities2Video", "clip_score", "Entities2Video / CLIP")
    entities_dino = Task("Entities2Video", "dino_score", "Entities2Video / DINO")
    entities_mllm = Task("Entities2Video", "mllm_judge", "Entities2Video / MLLM")

    # 3. Video2Video
    v2v_clip = Task("Video2Video", "clip_score", "Video2Video / CLIP")
    v2v_dino = Task("Video2Video", "dino_score", "Video2Video / DINO")
    v2v_mllm = Task("Video2Video", "mllm_judge", "Video2Video / MLLM")

    # --- Long-Video Tasks (Table 2 in Paper) ---
    # 4. Understanding (Table 2a)
    understanding_acc = Task("Understanding", "accuracy", "LongVideo QA / Acc")

    # 5. Editing (Table 2b)
    editing_clip = Task("Editing", "clip_score", "Editing / CLIP")
    editing_dino = Task("Editing", "dino_score", "Editing / DINO")
    editing_mllm = Task("Editing", "mllm_judge", "Editing / MLLM")

    # 6. Segmentation (Table 2c)
    segmentation_j = Task("Segmentation", "j_score", "Segmentation / J")
    segmentation_f = Task("Segmentation", "f_score", "Segmentation / F")
    segmentation_jf = Task("Segmentation", "j_and_f", "Segmentation / J&F")


NUM_FEWSHOT = 0
# ---------------------------------------------------


# Your leaderboard name
TITLE = """

UniVA-Bench Leaderboard

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ UniVA-Bench is an agent-oriented benchmark for unified video intelligence, covering Understanding, Generation, Editing, Segmentation, and agentic probing. We report CLIP / DINO / MLLM preference, segmentation J/F/J&F, and long-video QA accuracy, following the evaluation protocol described in our paper. """ # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = f""" ## How it works This leaderboard reports the performance of UniVA and baseline models on **UniVA-Bench**, following the evaluation protocol described in our paper. All scores are obtained with our internal evaluation pipeline, using the same task definitions, metrics, and settings as in Section 4 and Appendix B. ## Reproducibility Our implementation and evaluation scripts will be released in a public repository in a future update. Before that, please refer to the paper for detailed descriptions of UniVA-Bench, including dataset construction, splits, metrics, and experimental setups. If you need additional information to reproduce these results, feel free to contact the authors. """ # Submission / queue EVALUATION_QUEUE_TEXT = """ Submissions are currently disabled for the UniVA-Bench leaderboard. We will open public submission once the evaluation backend is ready. """ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r"""@misc{liang2025univauniversalvideoagent, title={UniVA: Universal Video Agent towards Open-Source Next-Generation Video Generalist}, author={Zhengyang Liang and Daoan Zhang and Huichi Zhou and Rui Huang and Bobo Li and Yuechen Zhang and Shengqiong Wu and Xiaohan Wang and Jiebo Luo and Lizi Liao and Hao Fei}, year={2025}, eprint={2511.08521}, archivePrefix={arXiv}, primaryClass={cs.CV}, url={https://arxiv.org/abs/2511.08521} }"""