from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard

    # --- Generation (Table 1 in Paper) ---
    # 1. LongText2Video
    longtext_clip = Task("LongText2Video", "clip_score", "LongText2Video / CLIP")
    longtext_dino = Task("LongText2Video", "dino_score", "LongText2Video / DINO")
    longtext_mllm = Task("LongText2Video", "mllm_judge", "LongText2Video / MLLM")

    # 2. Entities2Video
    entities_clip = Task("Entities2Video", "clip_score", "Entities2Video / CLIP")
    entities_dino = Task("Entities2Video", "dino_score", "Entities2Video / DINO")
    entities_mllm = Task("Entities2Video", "mllm_judge", "Entities2Video / MLLM")

    # 3. Video2Video
    v2v_clip = Task("Video2Video", "clip_score", "Video2Video / CLIP")
    v2v_dino = Task("Video2Video", "dino_score", "Video2Video / DINO")
    v2v_mllm = Task("Video2Video", "mllm_judge", "Video2Video / MLLM")

    # --- Long-Video Tasks (Table 2 in Paper) ---
    # 4. Understanding (Table 2a)
    understanding_acc = Task("Understanding", "accuracy", "LongVideo QA / Acc")

    # 5. Editing (Table 2b)
    editing_clip = Task("Editing", "clip_score", "Editing / CLIP")
    editing_dino = Task("Editing", "dino_score", "Editing / DINO")
    editing_mllm = Task("Editing", "mllm_judge", "Editing / MLLM")

    # 6. Segmentation (Table 2c)
    segmentation_j = Task("Segmentation", "j_score", "Segmentation / J")
    segmentation_f = Task("Segmentation", "f_score", "Segmentation / F")
    segmentation_jf = Task("Segmentation", "j_and_f", "Segmentation / J&F")


NUM_FEWSHOT = 0
# ---------------------------------------------------


# Your leaderboard name
TITLE = """

UniVA-Bench Leaderboard

""" # What does your leaderboard evaluate? INTRODUCTION_TEXT = """ UniVA-Bench is an agent-oriented benchmark for unified video intelligence, covering Understanding, Generation, Editing, Segmentation, and agentic probing. We report CLIP / DINO / MLLM preference, segmentation J/F/J&F, and long-video QA accuracy, following the evaluation protocol described in our paper. """ # Which evaluations are you running? how can people reproduce what you have? LLM_BENCHMARKS_TEXT = f""" ## How it works This leaderboard reports the performance of UniVA and baseline models on **UniVA-Bench**, following the evaluation protocol described in our paper. All scores are obtained with our internal evaluation pipeline, using the same task definitions, metrics, and settings as in Section 4 and Appendix B. ## Reproducibility Our implementation and evaluation scripts will be released in a public repository in a future update. Before that, please refer to the paper for detailed descriptions of UniVA-Bench, including dataset construction, splits, metrics, and experimental setups. If you need additional information to reproduce these results, feel free to contact the authors. """ # Submission / queue EVALUATION_QUEUE_TEXT = """ Submissions are currently disabled for the UniVA-Bench leaderboard. We will open public submission once the evaluation backend is ready. """ CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results" CITATION_BUTTON_TEXT = r"""@misc{liang2025univauniversalvideoagent, title={UniVA: Universal Video Agent towards Open-Source Next-Generation Video Generalist}, author={Zhengyang Liang and Daoan Zhang and Huichi Zhou and Rui Huang and Bobo Li and Yuechen Zhang and Shengqiong Wu and Xiaohan Wang and Jiebo Luo and Lizi Liao and Hao Fei}, year={2025}, eprint={2511.08521}, archivePrefix={arXiv}, primaryClass={cs.CV}, url={https://arxiv.org/abs/2511.08521} }"""