Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import argparse
import ast
import json
import os

import pandas as pd


def validate_scores(dataset_list, assert_score, model_name, tolerance=0.01):
    """Compare each dataset's evaluation score with its expected baseline.

    Result files are read relative to the current working directory:

    * ``OCRBench_MINI``: ``outputs/<model>/<model>_<dataset>_score.json``,
      using the ``"Final Score Norm"`` entry.
    * any other dataset: ``outputs/<model>/<model>_<dataset>_acc.csv``, using
      the ``"Overall"`` column. For ``MMBench_V11_MINI`` the row whose
      ``split`` column equals ``"dev"`` is used; otherwise the first row.

    Args:
        dataset_list: Iterable of dataset names to validate.
        assert_score: Nested mapping ``{dataset: {model_name: expected}}``.
        model_name: Name of the model whose outputs are validated.
        tolerance: Maximum allowed absolute difference between the measured
            and the expected score.

    Raises:
        AssertionError: If a score deviates from its baseline by more than
            ``tolerance``, or if the ``MMBench_V11_MINI`` result does not
            contain exactly one ``"dev"`` row.
        KeyError: If ``assert_score`` has no entry for a dataset/model pair,
            or a result file lacks the expected key/column.
    """
    for dataset in dataset_list:
        base_score = assert_score[dataset][model_name]
        expected = float(base_score)

        if dataset == "OCRBench_MINI":
            score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_score.json")
            with open(score_file, "r", encoding="utf-8") as f:
                cur_score = float(json.load(f)["Final Score Norm"])
        else:
            score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_acc.csv")
            df = pd.read_csv(score_file)
            if dataset == "MMBench_V11_MINI":
                # Select the single "dev" row as a scalar. Comparing the raw
                # ``.values`` array is ambiguous (or vacuous) whenever the
                # number of matching rows is not exactly one.
                dev_scores = df.loc[df["split"] == "dev", "Overall"]
                if len(dev_scores) != 1:
                    raise AssertionError(
                        f"{dataset} on {model_name}: expected exactly one 'dev' row in "
                        f"{score_file}, found {len(dev_scores)}"
                    )
                cur_score = float(dev_scores.iloc[0])
            else:
                cur_score = float(df["Overall"].iloc[0])

        # Raise explicitly instead of using ``assert`` so the check survives
        # ``python -O``. Written as ``not (... <= tol)`` so a NaN score fails.
        if not abs(cur_score - expected) <= tolerance:
            raise AssertionError(
                f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}"
            )
        print(f"cur_score is {cur_score}, base_score is {base_score}")


def parse_arguments():
    """Parse the command-line options of the score validation script.

    All three options are required string values:

    * ``--dataset``: space-separated dataset names.
    * ``--base_score``: dictionary literal ``{dataset: {model: score}}``.
    * ``--model-name``: model to validate (exposed as ``model_name``).

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    cli = argparse.ArgumentParser(description="Validate model scores against csv/json data")

    # (flag, help text) pairs; every option shares the same type/required settings.
    required_options = (
        ("--dataset", "Space-separated list of datasets"),
        ("--base_score", "Dictionary string in format {dataset:{model:score}}"),
        ("--model-name", "Name of the model to validate"),
    )
    for flag, description in required_options:
        cli.add_argument(flag, type=str, required=True, help=description)

    return cli.parse_args()


def main():
    """Parse the CLI arguments and validate the scores they describe.

    ``--dataset`` is split on whitespace into a list of dataset names and
    ``--base_score`` is evaluated as a Python literal, which must be a
    dictionary of the form ``{dataset: {model: score}}``.

    Raises:
        SystemExit: With status 1 when the arguments cannot be parsed, so a
            malformed invocation fails the calling job instead of silently
            skipping validation.
        AssertionError: Propagated from ``validate_scores`` on a mismatch.
    """
    args = parse_arguments()

    dataset_list = args.dataset.split()
    try:
        base_score = ast.literal_eval(args.base_score)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError) as e:
        print(f"Parameter parsing error: {str(e)}")
        # Exit non-zero: returning normally would report success to CI even
        # though nothing was validated.
        raise SystemExit(1) from e

    if not isinstance(base_score, dict):
        print(f"Parameter parsing error: --base_score must be a dictionary, got {type(base_score).__name__}")
        raise SystemExit(1)

    validate_scores(dataset_list, base_score, args.model_name)


# Run the validation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
23 changes: 23 additions & 0 deletions evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/lint.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Runs the repository's pre-commit hooks against every file.
name: lint

on: [push, pull_request]

# Keep one lint run per workflow/ref; a newer run cancels the one in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: true

jobs:
  lint:
    runs-on: ubuntu-latest
    steps:
      # v2 of checkout/setup-python targets an end-of-life Node runtime; use current majors.
      - uses: actions/checkout@v4
      - name: Set up Python 3.10
        uses: actions/setup-python@v5
        with:
          # Quoted so the version is always handled as a string.
          python-version: "3.10.15"
      - name: Install pre-commit hook
        run: |
          pip install pre-commit
          pre-commit install
      - name: Linting
        run: pre-commit run --all-files
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
# Evaluates a small model/dataset matrix and compares the scores with fixed baselines.
name: pr_run_test

on:
  # Pull requests targeting main, except changes limited to docs or Markdown files.
  pull_request:
    branches:
      - "main"
    paths-ignore:
      - "docs/**"
      - "**.md"
  # Manual trigger.
  workflow_dispatch:
  # Daily scheduled run at minute 56 of hour 01.
  schedule:
    - cron: '56 01 * * *'

# One active run per pull request (or per ref when there is no PR); newer runs cancel older ones.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
  cancel-in-progress: true

env:
  # Expected scores as {dataset: {model: score}}; passed to assert_score.py via --base_score.
  BASE_SCORE: '{"MMBench_V11_MINI":{"Qwen2.5-VL-7B-Instruct":0.76363636,"InternVL3-8B":0.92727273,"llava-onevision-qwen2-0.5b-ov-hf":0.45454545},"MMStar_MINI":{"Qwen2.5-VL-7B-Instruct":0.6133333333333333,"InternVL3-8B":0.7,"llava-onevision-qwen2-0.5b-ov-hf":0.36},"AI2D_MINI":{"Qwen2.5-VL-7B-Instruct":0.7651821862348178,"InternVL3-8B":0.8218623481781376,"llava-onevision-qwen2-0.5b-ov-hf":0.48582995951417},"OCRBench_MINI":{"Qwen2.5-VL-7B-Instruct":15.7,"InternVL3-8B":17.3,"llava-onevision-qwen2-0.5b-ov-hf":5.5}}'
  # Hugging Face hub cache location and offline switch (the same values are forwarded to the rjob container).
  HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub
  HF_HUB_OFFLINE: 1
  # Conda installation and environment activated at the start of each step.
  CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3
  CONDA_ENV: vlm_pr_test
  # NOTE(review): not referenced by any step in this workflow; presumably read by the rjob CLI - confirm.
  KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn
  KUBEBRAIN_NAMESPACE: ailab-opencompass

jobs:
  # Evaluate every model x dataset-group combination and compare against BASE_SCORE.
  vlm_test:
    if: ${{!cancelled()}}
    runs-on: [yidian_cu12_mllm]
    strategy:
      fail-fast: false
      matrix:
        # A dataset entry may hold several space-separated dataset names.
        dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI","OCRBench_MINI"]
        model: ['llava-onevision-qwen2-0.5b-ov-hf', 'InternVL3-8B', 'Qwen2.5-VL-7B-Instruct']
        # Short aliases attached to the matching combinations; used to build the rjob job name.
        include:
          - model: llava-onevision-qwen2-0.5b-ov-hf
            model_name: llava
          - model: Qwen2.5-VL-7B-Instruct
            model_name: qwen
          - model: InternVL3-8B
            model_name: internvl
          - dataset: MMBench_V11_MINI MMStar_MINI AI2D_MINI
            dataset_name: mmbench
          - dataset: OCRBench_MINI
            dataset_name: ocrbench
    steps:
      - name: Clean workdir
        run: sudo git clean -ffdx
      - name: clone_repo
        uses: actions/checkout@v3
      - name: reinstall vlmeval
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          pip uninstall vlmeval -y
          pip install .
          pip install numpy==1.23.0 transformers==4.57.1
      - name: evaluation_model
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          pip list

          # Single definition of the remote job name used by submit/get/logs/stop below.
          job_name="vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }}"

          # NOTE(review): --private-machine=group is passed twice below; presumably redundant - confirm before removing.
          rjob submit --metadata-name="$job_name" --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/LMUData --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1'

          # Poll the job status every 6 seconds, up to 1200 times (about two hours).
          for i in {1..1200}; do
            current_status=$(rjob get "$job_name" | grep -oP 'rjob [^:]+: \K[^ ]+')
            echo "Current status: $current_status"
            if [[ $current_status == "Succeeded" ]]; then
              echo "Task succeeded"
              rjob logs job "$job_name" -n 100
              exit 0
            elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then
              echo "Task failed or stopped, fetching logs"
              rjob logs job "$job_name"
              exit 1
            fi
            sleep 6
          done
          # Polling budget exhausted: stop the remote job and fail the step.
          rjob stop "$job_name"
          rjob logs job "$job_name" -n 100
          echo "Task timeout"
          exit 1
      - name: assert_result
        run: |
          . ${{env.CONDA_PATH}}/bin/activate
          conda activate ${{env.CONDA_ENV}}
          # Bring the remote evaluation results into ./outputs, where assert_score.py reads them.
          cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs
          # Quote the expansions so the baseline dictionary always arrives as one unmodified argument.
          python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score "$BASE_SCORE" --model-name "${{matrix.model}}"
Loading