NVIDIA · kding1 · Jun 28, 2026
diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.github/scripts/assert_score.py b/evaluation/cosmos3/reasoner/vlmevalkit/.github/scripts/assert_score.py
@@ -0,0 +1,61 @@
+import argparse
+import ast
+import json
+import os
+
+import pandas as pd
+
+
+def validate_scores(dataset_list, assert_score, model_name):
+    """Compare each dataset's score under ``outputs/<model_name>/`` with its baseline.
+
+    ``assert_score`` maps dataset -> model -> expected score; a deviation above 0.01
+    raises ``AssertionError`` (raised explicitly so ``python -O`` cannot skip the check).
+    """
+    for dataset in dataset_list:
+        base_score = assert_score[dataset][model_name]
+        if dataset == "OCRBench_MINI":
+            score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_score.json")
+            with open(score_file, "r") as f:
+                cur_score = json.load(f)["Final Score Norm"]
+        else:
+            score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_acc.csv")
+            df = pd.read_csv(score_file)
+            # For MMBench take the "dev" split row as a scalar (``.values`` would give an array).
+            rows = df[df["split"] == "dev"] if dataset == "MMBench_V11_MINI" else df
+            cur_score = rows["Overall"].iloc[0]
+        # ``not (<=)`` rather than ``>`` so that a NaN score is reported as a failure.
+        if not (abs(cur_score - float(base_score)) <= 0.01):
+            raise AssertionError(f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}")
+        print(f"cur_score is {cur_score}, base_score is {base_score}")
+
+
+def parse_arguments():
+    """Parse ``--dataset``, ``--base_score`` and ``--model-name`` (all required strings)."""
+    cli = argparse.ArgumentParser(description="Validate model scores against csv/json data")
+    # (flag, help) pairs; every option is registered as a required string.
+    options = (
+        ("--dataset", "Space-separated list of datasets"),
+        ("--base_score", "Dictionary string in format {dataset:{model:score}}"),
+        ("--model-name", "Name of the model to validate"),
+    )
+    for flag, help_text in options:
+        cli.add_argument(flag, type=str, required=True, help=help_text)
+    return cli.parse_args()
+
+
+def main():
+    """Parse CLI arguments and validate scores; exit with status 1 on a bad ``--base_score``."""
+    args = parse_arguments()
+    dataset_list = args.dataset.split()
+    try:
+        base_score = ast.literal_eval(args.base_score)
+    except Exception as e:
+        print(f"Parameter parsing error: {str(e)}")
+        # A plain ``return`` would exit 0 and let CI pass without validating anything.
+        raise SystemExit(1) from e
+    validate_scores(dataset_list, base_score, args.model_name)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/lint.yml b/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/lint.yml
@@ -0,0 +1,23 @@
+name: lint  # runs the repository's pre-commit hooks on every push and pull request
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.ref }}  # one group per workflow and ref
+  cancel-in-progress: true  # a newer run cancels the one still in progress
+
+jobs:
+  lint:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4  # v2 targets the retired Node 12 action runtime
+      - name: Set up Python 3.10
+        uses: actions/setup-python@v5  # v2 targets the retired Node 12 action runtime
+        with:
+          python-version: "3.10.15"  # quoted so YAML can never read the version as a number
+      - name: Install pre-commit hook
+        run: |
+          pip install pre-commit
+          pre-commit install
+      - name: Linting
+        run: pre-commit run --all-files
diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/pr-run-test.yml b/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/pr-run-test.yml
@@ -0,0 +1,90 @@
+name: pr_run_test  # evaluates a small model/dataset matrix and checks scores against BASE_SCORE
+
+on:
+  pull_request:
+    branches:
+      - "main"
+    paths-ignore:  # changes limited to these paths do not trigger the workflow
+      - "docs/**"
+      - "**.md"
+  workflow_dispatch:
+  schedule:
+    - cron:  '56 01 * * *'  # every day at 01:56 (GitHub evaluates schedules in UTC)
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}  # per PR, else per ref
+  cancel-in-progress: true
+
+env:  # BASE_SCORE: expected score per dataset and model, handed to assert_score.py via --base_score
+  BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2.5-VL-7B-Instruct":0.76363636,"InternVL3-8B":0.92727273,"llava-onevision-qwen2-0.5b-ov-hf":0.45454545},"MMStar_MINI":{"Qwen2.5-VL-7B-Instruct":0.6133333333333333,"InternVL3-8B":0.7,"llava-onevision-qwen2-0.5b-ov-hf":0.36},"AI2D_MINI":{"Qwen2.5-VL-7B-Instruct":0.7651821862348178,"InternVL3-8B":0.8218623481781376,"llava-onevision-qwen2-0.5b-ov-hf":0.48582995951417},"OCRBench_MINI":{"Qwen2.5-VL-7B-Instruct":15.7,"InternVL3-8B":17.3,"llava-onevision-qwen2-0.5b-ov-hf":5.5}}'
+  HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
+  HF_HUB_OFFLINE: 1
+  CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
+  CONDA_ENV: vlm_pr_test
+  KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
+  KUBEBRAIN_NAMESPACE: ailab-opencompass
+
+jobs:
+  vlm_test:  # one remote evaluation per (dataset group, model) pair, followed by a score check
+    if: ${{!cancelled()}}
+    runs-on: [yidian_cu12_mllm]
+    strategy:
+      fail-fast: false  # a failing combination does not cancel the others
+      matrix:
+        dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI","OCRBench_MINI"]
+        model: ['llava-onevision-qwen2-0.5b-ov-hf', 'InternVL3-8B', 'Qwen2.5-VL-7B-Instruct']
+        include:  # short aliases that go into the rjob job name
+          - model: llava-onevision-qwen2-0.5b-ov-hf
+            model_name: llava
+          - model: Qwen2.5-VL-7B-Instruct
+            model_name: qwen
+          - model: InternVL3-8B
+            model_name: internvl
+          - dataset: MMBench_V11_MINI MMStar_MINI AI2D_MINI
+            dataset_name: mmbench
+          - dataset: OCRBench_MINI
+            dataset_name: ocrbench
+    steps:
+      - name: Clean workdir
+        run: sudo git clean -ffdx
+      - name: clone_repo
+        uses: actions/checkout@v3
+      - name: reinstall vlmeval
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          pip uninstall vlmeval -y
+          pip install .
+          pip install numpy==1.23.0 transformers==4.57.1
+      - name: evaluation_model
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          pip list
+          job_name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}
+          rjob submit --metadata-name="$job_name" --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/LMUData --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1'
+          # Poll every 6 s for up to 1200 tries (2 h); on timeout stop the job and fail the step.
+          for i in {1..1200}; do
+            current_status=$(rjob get "$job_name" | grep -oP 'rjob [^:]+: \K[^ ]+' || true)  # a grep miss must not trip "bash -e"
+            echo "Current status: $current_status"
+            if [[ $current_status == "Succeeded" ]]; then
+              echo "Task succeeded"
+              rjob logs job "$job_name" -n 100
+              exit 0
+            elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
+              echo "Task failed or stopped, fetching logs"
+              rjob logs job "$job_name"
+              exit 1
+            fi
+            sleep 6
+          done
+          rjob stop "$job_name"
+          rjob logs job "$job_name" -n 100
+          echo "Task timeout"
+          exit 1
+      - name: assert_result
+        run: |
+          . ${{env.CONDA_PATH}}/bin/activate
+          conda activate ${{env.CONDA_ENV}}
+          cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs
+          python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score "$BASE_SCORE" --model-name "${{matrix.model}}"