diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.github/scripts/assert_score.py b/evaluation/cosmos3/reasoner/vlmevalkit/.github/scripts/assert_score.py new file mode 100644 index 00000000..5d5fbfe5 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/.github/scripts/assert_score.py @@ -0,0 +1,61 @@ +import argparse +import ast +import json +import os + +import pandas as pd + + +def validate_scores(dataset_list, assert_score, model_name): + for dataset in dataset_list: + base_score = assert_score[dataset][model_name] + if dataset == "OCRBench_MINI": + score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_score.json") + cur_score = 0 + with open(score_file, "r") as f: + total_score = json.load(f) + cur_score = total_score["Final Score Norm"] + assert ( + abs(cur_score - float(base_score)) <= 0.01 + ), f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}" + else: + score_file = os.path.join("outputs", f"{model_name}/{model_name}_{dataset}_acc.csv") + df = pd.read_csv(score_file) + cur_score = df["Overall"].iloc[0] + if dataset == "MMBench_V11_MINI": + cur_score = df.loc[df["split"] == "dev", "Overall"].values + assert ( + abs(cur_score - float(base_score)) <= 0.01 + ), f"{dataset} on {model_name}: cur_score is {cur_score}, base_score is {base_score}" + print(f"cur_score is {cur_score}, base_score is {base_score}") + + +def parse_arguments(): + parser = argparse.ArgumentParser(description="Validate model scores against csv/json data") + + parser.add_argument("--dataset", type=str, required=True, help="Space-separated list of datasets") + + parser.add_argument( + "--base_score", type=str, required=True, help="Dictionary string in format {dataset:{model:score}}" + ) + + parser.add_argument("--model-name", type=str, required=True, help="Name of the model to validate") + + return parser.parse_args() + + +def main(): + args = parse_arguments() + + try: + dataset_list = args.dataset.split() + base_score = 
ast.literal_eval(args.base_score) + except Exception as e: + print(f"Parameter parsing error: {str(e)}") + return + + validate_scores(dataset_list, base_score, args.model_name) + + +if __name__ == "__main__": + main() diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/lint.yml b/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/lint.yml new file mode 100644 index 00000000..1eb46dcb --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/lint.yml @@ -0,0 +1,23 @@ +name: lint + +on: [push, pull_request] + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + lint: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Set up Python 3.10 + uses: actions/setup-python@v2 + with: + python-version: 3.10.15 + - name: Install pre-commit hook + run: | + pip install pre-commit + pre-commit install + - name: Linting + run: pre-commit run --all-files diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/pr-run-test.yml b/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/pr-run-test.yml new file mode 100644 index 00000000..25e214af --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/.github/workflows/pr-run-test.yml @@ -0,0 +1,90 @@ +name: pr_run_test + +on: + pull_request: + branches: + - "main" + paths-ignore: + - "docs/**" + - "**.md" + workflow_dispatch: + schedule: + - cron: '56 01 * * *' + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }} + cancel-in-progress: true + +env: + BASE_SCORE: 
'{"MMBench_V11_MINI":{"Qwen2.5-VL-7B-Instruct":0.76363636,"InternVL3-8B":0.92727273,"llava-onevision-qwen2-0.5b-ov-hf":0.45454545},"MMStar_MINI":{"Qwen2.5-VL-7B-Instruct":0.6133333333333333,"InternVL3-8B":0.7,"llava-onevision-qwen2-0.5b-ov-hf":0.36},"AI2D_MINI":{"Qwen2.5-VL-7B-Instruct":0.7651821862348178,"InternVL3-8B":0.8218623481781376,"llava-onevision-qwen2-0.5b-ov-hf":0.48582995951417},"OCRBench_MINI":{"Qwen2.5-VL-7B-Instruct":15.7,"InternVL3-8B":17.3,"llava-onevision-qwen2-0.5b-ov-hf":5.5}}' + HF_HUB_CACHE: /mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub + HF_HUB_OFFLINE: 1 + CONDA_PATH: /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3 + CONDA_ENV: vlm_pr_test + KUBEBRAIN_CLUSTER_ENTRY: https://h.pjlab.org.cn + KUBEBRAIN_NAMESPACE: ailab-opencompass + +jobs: + vlm_test: + if: ${{!cancelled()}} + runs-on: [yidian_cu12_mllm] + strategy: + fail-fast: false + matrix: + dataset: ["MMBench_V11_MINI MMStar_MINI AI2D_MINI","OCRBench_MINI"] + model: ['llava-onevision-qwen2-0.5b-ov-hf', 'InternVL3-8B', 'Qwen2.5-VL-7B-Instruct'] + include: + - model: llava-onevision-qwen2-0.5b-ov-hf + model_name: llava + - model: Qwen2.5-VL-7B-Instruct + model_name: qwen + - model: InternVL3-8B + model_name: internvl + - dataset: MMBench_V11_MINI MMStar_MINI AI2D_MINI + dataset_name: mmbench + - dataset: OCRBench_MINI + dataset_name: ocrbench + steps: + - name: Clean workdir + run: sudo git clean -ffdx + - name: clone_repo + uses: actions/checkout@v3 + - name: reinstall vlmeval + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + pip uninstall vlmeval -y + pip install . + pip install numpy==1.23.0 transformers==4.57.1 + - name: evaluation_model + run: | + . 
${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + pip list + + rjob submit --metadata-name=vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} --charged-group=opencompass_gpu --private-machine=group --group=opencompass_gpu --gpu=1 --cpu=16 --memory=32568 --private-machine=group --image=registry.h.pjlab.org.cn/ailab-puyu-puyu_wsp_cpu/vlmevalkit:auto-v0.0.10 --env=HF_HUB_CACHE=/mnt/shared-storage-gpfs2/gpfs2-shared-public/huggingface/hub --env=HF_HUB_OFFLINE=1 --env=LMUData=/mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/LMUData --mount=gpfs://gpfs1/qa-llm-cicd:/mnt/shared-storage-user/qa-llm-cicd --mount=gpfs://gpfs1/opencompass-shared:/mnt/shared-storage-user/opencompass-shared --mount=gpfs://gpfs1/auto-eval-pipeline:/mnt/shared-storage-user/auto-eval-pipeline --mount=gpfs://gpfs2/gpfs2-shared-public:/mnt/shared-storage-gpfs2/gpfs2-shared-public --mount=gpfs://gpfs1/mllm:/mnt/shared-storage-user/mllm --host-network=True -- bash -exc 'cd ${{github.workspace}}; source /mnt/shared-storage-user/opencompass-shared/qa-llm-cicd/miniconda3/bin/activate; conda activate ${{env.CONDA_ENV}}; python run.py --data ${{matrix.dataset}} --model ${{matrix.model}} --work-dir /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} --reuse --judge exact_matching 2>&1' + + for i in {1..1200}; do + current_status=$(rjob get vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} | grep -oP 'rjob [^:]+: \K[^ ]+') + echo "Current status: $current_status, stop checking" + if [[ $current_status == "Succeeded" ]]; then + echo "Task succeeded" + rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} -n 100 + exit 0 + elif [[ $current_status == "Failed" || $current_status == "Stopped" ]]; then + echo "Task failed or stopped, fetching logs" + rjob logs job 
vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} + exit 1 + fi + sleep 6 + done + rjob stop vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} + rjob logs job vllm-pr-test-${{ github.run_id }}-${{matrix.model_name}}-${{matrix.dataset_name}}-${{ github.run_attempt }} -n 100 + echo "Task timeout" + exit 1 + - name: assert_result + run: | + . ${{env.CONDA_PATH}}/bin/activate + conda activate ${{env.CONDA_ENV}} + cp -r /mnt/shared-storage-user/mllm/qa-llm-cicd/eval_report/${{ github.run_id }}/${{matrix.model}} outputs + python .github/scripts/assert_score.py --dataset "${{matrix.dataset}}" --base_score $BASE_SCORE --model-name ${{matrix.model}} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.gitignore b/evaluation/cosmos3/reasoner/vlmevalkit/.gitignore new file mode 100644 index 00000000..4f4ca813 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/.gitignore @@ -0,0 +1,227 @@ +.idea/ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST +.vscode/ +.gradio/ + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +environment.yml + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# Images +images/ + +# Video files +*.mp4 +*.avi +*.mov +*.mkv +*.flv +*.wmv +*.webm + +scripts/*ttf +.history +cache_dir/* + +# Evaluation Outputs +outputs/* +demo.ipynb +*json +!vlmeval/dataset/utils/vgrpbench/configs/formating-prompt/**/*.json +.vscode +*.swp +GPT4o_MINI/ + +2weiyun* +script.py +Gemini* +Claude3-5V* +GLM4V* +GPT4o* +GPT4V* +mmmu_debug +bailingMM +BailingMM* +SenseChat* +Step* +DoubaoVL +arch +BlueLM* +mmb_* +gpt-4.1* +Reka* +Taiyi +TeleMM +apple.jpg +assets/LOGO.png +api_list.txt +vlmeval/gemini_tmp.py +run.sh +run_g.sh +tmp/ +vlmeval/dataset/Omni3D/tmp/ +InternVL* +Qwen* +CongRong* +Seed1.5* +aguvis* +grok-* +GLM4.5* +SenseNova* + +.DS_Store + + +slurm-logs/ + +data/ diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/.pre-commit-config.yaml b/evaluation/cosmos3/reasoner/vlmevalkit/.pre-commit-config.yaml new file mode 100644 index 00000000..d0b03227 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/.pre-commit-config.yaml @@ -0,0 +1,45 @@ +exclude: | + (?x)^( + scripts/| + assets/| + vlmeval/config.py | + vlmeval/dataset/utils/wemath.py | + vlmeval/dataset/OmniDocBench/ | + vlmeval/dataset/utils/megabench/ | + vlmeval/dataset/utils/vgrpbench/ | + vlmeval/dataset/utils/chartmimic/ | + vlmeval/vlm/ola/ | + vlmeval/vlm/ursa/ | + vlmeval/vlm/ovis/ | + vlmeval/dataset/utils/mme_reasoning.py + ) +repos: + - repo: https://github.com/PyCQA/flake8 + rev: 7.1.2 + hooks: + - id: flake8 + args: + 
[ + "--max-line-length=120", + "--ignore=W503", + ] + exclude: ^configs/ + - repo: https://github.com/PyCQA/isort + rev: 6.0.1 + hooks: + - id: isort + - repo: https://github.com/google/yapf + rev: v0.43.0 + hooks: + - id: yapf + args: ["--style={column_limit=120}"] + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v6.0.0 + hooks: + - id: trailing-whitespace + - id: check-yaml + - id: end-of-file-fixer + - id: requirements-txt-fixer + - id: check-merge-conflict + - id: mixed-line-ending + args: ["--fix=lf"] diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/LICENSE b/evaluation/cosmos3/reasoner/vlmevalkit/LICENSE new file mode 100644 index 00000000..d67ab032 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/LICENSE @@ -0,0 +1,203 @@ +Copyright 2023 VLMEvalKit Authors. All rights reserved. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright 2023 VLMEvalKit Authors. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/README-vlmevalkit.md b/evaluation/cosmos3/reasoner/vlmevalkit/README-vlmevalkit.md new file mode 100644 index 00000000..e94ce75d --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/README-vlmevalkit.md @@ -0,0 +1,158 @@ +![LOGO](https://opencompass.openxlab.space/utils/MMLB.jpg) + +A Toolkit for Evaluating Large Vision-Language Models. + +[![][github-contributors-shield]][github-contributors-link] • [![][github-forks-shield]][github-forks-link] • [![][github-stars-shield]][github-stars-link] • [![][github-issues-shield]][github-issues-link] • [![][github-license-shield]][github-license-link] + +English | [简体中文](/docs/zh-CN/README_zh-CN.md) | [日本語](/docs/ja/README_ja.md) + +🏆 OC Leaderboard • +🏗️Quickstart • +📊Datasets & Models • +🛠️Development + +🤗 HF Leaderboard • +🤗 Evaluation Records • +🤗 HF Video Leaderboard • + +🔊 Discord • +📝 Report • +🎯Goal • +🖊️Citation + + +**VLMEvalKit** (the Python package name is **vlmeval**) is an **open-source evaluation toolkit** of **large vision-language models (LVLMs)**. It enables **one-command evaluation** of LVLMs on various benchmarks, without the heavy workload of data preparation under multiple repositories. In VLMEvalKit, we adopt **generation-based evaluation** for all LVLMs, and provide the evaluation results obtained with both **exact matching** and **LLM-based answer extraction**.
+ +## Recent Codebase Changes +- **[2025-09-12]** **Major Update: Improved Handling for Models with Thinking Mode** + + A new feature in [PR 1229](https://github.com/open-compass/VLMEvalKit/pull/1229) improves support for models with thinking mode. VLMEvalKit now allows for the use of a custom `split_thinking` function. **We strongly recommend this for models with thinking mode to ensure the accuracy of evaluation**. To use this new functionality, please enable the Environment Variable: `SPLIT_THINK=True`. By default, the function will parse content within `<think>...</think>` tags and store it in the `thinking` key of the output. For more advanced customization, you can also create a `split_think` function for your model. Please see the InternVL implementation for an example. +- **[2025-09-12]** **Major Update: Improved Handling for Long Responses (More than 16k/32k)** + + A new feature in [PR 1229](https://github.com/open-compass/VLMEvalKit/pull/1229) improves support for models with long response outputs. VLMEvalKit can now save prediction files in TSV format. **Since individual cells in an `.xlsx` file are limited to 32,767 characters, we strongly recommend using this feature for models that generate long responses (e.g., exceeding 16k or 32k tokens) to prevent data truncation.** To use this new functionality, please enable the Environment Variable: `PRED_FORMAT=tsv`. +- **[2025-08-04]** In [PR 1175](https://github.com/open-compass/VLMEvalKit/pull/1175), we refined `can_infer_option` and `can_infer_text`, which increasingly route the evaluation to LLM choice extractors and empirically lead to a slight performance improvement for MCQ benchmarks. + +## 🆕 News +- **[2025-07-07]** Supported [**SeePhys**](https://seephys.github.io/), which is a full-spectrum multimodal benchmark for evaluating physics reasoning across different knowledge levels.
Thanks to [**Quinn777**](https://github.com/Quinn777) 🔥🔥🔥 +- **[2025-07-02]** Supported [**OvisU1**](https://huggingface.co/AIDC-AI/Ovis-U1-3B), thanks to [**liyang-7**](https://github.com/liyang-7) 🔥🔥🔥 +- **[2025-06-16]** Supported [**PhyX**](https://phyx-bench.github.io/), a benchmark aiming to assess capacity for physics-grounded reasoning in visual scenarios. 🔥🔥🔥 +- **[2025-05-24]** To facilitate faster evaluations for large-scale or thinking models, **VLMEvalKit supports multi-node distributed inference** using **LMDeploy** (supports *InternVL Series, QwenVL Series, LLaMa4*) or **VLLM** (supports *QwenVL Series, LLaMa4*). You can activate this feature by adding the ```use_lmdeploy``` or ```use_vllm``` flag to your custom model configuration in [config.py](vlmeval/config.py). Leverage these tools to significantly speed up your evaluation workflows 🔥🔥🔥 +- **[2025-05-24]** Supported Models: **InternVL3 Series, Gemini-2.5-Pro, Kimi-VL, LLaMA4, NVILA, Qwen2.5-Omni, Phi4, SmolVLM2, Grok, SAIL-VL-1.5, WeThink-Qwen2.5VL-7B, Bailingmm, VLM-R1, Taichu-VLR**. Supported Benchmarks: **HLE-Bench, MMVP, MM-AlignBench, Creation-MMBench, MM-IFEval, OmniDocBench, OCR-Reasoning, EMMA, CharXiv, MedXpertQA, Physics, MSEarthMCQ, MicroBench, MMSci, VGRP-Bench, wildDoc, TDBench, VisuLogic, CVBench, LEGO-Puzzles, Video-MMLU, QBench-Video, MME-CoT, VLM2Bench, VMCBench, MOAT, Spatial457 Benchmark**. Please refer to [**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb) for more details. Thanks to all contributors 🔥🔥🔥 +- **[2025-02-20]** Supported Models: **InternVL2.5 Series, Qwen2.5VL Series, QVQ-72B, Doubao-VL, Janus-Pro-7B, MiniCPM-o-2.6, InternVL2-MPO, LLaVA-CoT, Hunyuan-Standard-Vision, Ovis2, Valley, SAIL-VL, Ross, Long-VITA, EMU3, SmolVLM**. Supported Benchmarks: **MMMU-Pro, WeMath, 3DSRBench, LogicVista, VL-RewardBench, CC-OCR, CG-Bench, CMMMU, WorldSense**.
Thanks to all contributors 🔥🔥🔥 +- **[2024-12-11]** Supported [**NaturalBench**](https://huggingface.co/datasets/BaiqiL/NaturalBench), a vision-centric VQA benchmark (NeurIPS'24) that challenges vision-language models with simple questions about natural imagery. +- **[2024-12-02]** Supported [**VisOnlyQA**](https://github.com/psunlpgroup/VisOnlyQA/), a benchmark for evaluating the visual perception capabilities 🔥🔥🔥 +- **[2024-11-26]** Supported [**Ovis1.6-Gemma2-27B**](https://huggingface.co/AIDC-AI/Ovis1.6-Gemma2-27B), thanks to [**runninglsy**](https://github.com/runninglsy) 🔥🔥🔥 +- **[2024-11-25]** Create a new flag `VLMEVALKIT_USE_MODELSCOPE`. By setting this environment variable, you can download the video benchmarks supported from [**modelscope**](https://www.modelscope.cn) 🔥🔥🔥 + +## 🏗️ QuickStart + +See [[QuickStart](/docs/en/Quickstart.md) | [快速开始](/docs/zh-CN/Quickstart.md)] for a quick start guide. + +## 📊 Datasets, Models, and Evaluation Results + +### Evaluation Results + +**The performance numbers on our official multi-modal leaderboards can be downloaded from here!** + +[**OpenVLM Leaderboard**](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard): [**Download All DETAILED Results**](http://opencompass.openxlab.space/assets/OpenVLM.json). + +Check **Supported Benchmarks** Tab in [**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb) to view all supported image & video benchmarks (70+). + +Check **Supported LMMs** Tab in [**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb) to view all supported LMMs, including commercial APIs, open-source models, and more (200+). 
+ +**Transformers Version Recommendation:** + +Note that some VLMs may not be able to run under certain `transformers` versions; we recommend the following settings to evaluate each VLM: + +- **Please use** `transformers==4.33.0` **for**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. +- **Please use** `transformers==4.36.2` **for**: `Moondream1`. +- **Please use** `transformers==4.37.0` **for**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`, `Cambrian Series`, `VILA Series`, `Llama-3-MixSenseV1_1`, `Parrot-7B`, `PLLaVA Series`. +- **Please use** `transformers==4.40.0` **for**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. +- **Please use** `transformers==4.42.0` **for**: `AKI`. +- **Please use** `transformers==4.44.0` **for**: `Moondream2`, `H2OVL series`. +- **Please use** `transformers==4.45.0` **for**: `Aria`. +- **Please use** `transformers==4.48.0` (or `4.46.0`) **for**: `LLaVA-Next series` (e.g., `llava-hf/llava-v1.6-vicuna-7b-hf`). +- **Please use** `transformers==latest` **for**: `PaliGemma-3B`, `Chameleon series`, `Video-LLaVA-7B-HF`, `Ovis series`, `Mantis series`, `MiniCPM-V2.6`, `OmChat-v2.0-13B-sinlge-beta`, `Idefics-3`, `GLM-4v-9B`, `VideoChat2-HD`, `RBDash_72b`, `Llama-3.2 series`, `Kosmos series`. +- **Please use** `transformers==4.50.3` (or `4.46.1` or `4.51` or `4.53`) **for**: `Molmo series`. +- **Please use** `transformers>=5.2.0` **for**: `Qwen3.5 series`.
+ +**Torchvision Version Recommendation:** + +Note that some VLMs may not be able to run under certain torchvision versions, we recommend the following settings to evaluate each VLM: + +- **Please use** `torchvision>=0.16` **for**: `Moondream series` and `Aria` + +**Flash-attn Version Recommendation:** + +Note that some VLMs may not be able to run under certain flash-attention versions, we recommend the following settings to evaluate each VLM: + +- **Please use** `pip install flash-attn --no-build-isolation` **for**: `Aria` + +```python +# Demo +from vlmeval.config import supported_VLM +model = supported_VLM['idefics_9b_instruct']() +# Forward Single Image +ret = model.generate(['assets/apple.jpg', 'What is in this image?']) +print(ret) # The image features a red apple with a leaf on it. +# Forward Multiple Images +ret = model.generate(['assets/apple.jpg', 'assets/apple.jpg', 'How many apples are there in the provided images? ']) +print(ret) # There are two apples in the provided images. +``` + +## 🛠️ Development Guide + +To develop custom benchmarks, VLMs, or simply contribute other codes to **VLMEvalKit**, please refer to [[Development_Guide](/docs/en/Development.md) | [开发指南](/docs/zh-CN/Development.md)]. + +**Call for contributions** + +To promote the contribution from the community and share the corresponding credit (in the next report update): + +- All Contributions will be acknowledged in the report. +- Contributors with 3 or more major contributions (implementing an MLLM, benchmark, or major feature) can join the author list of [VLMEvalKit Technical Report](https://www.arxiv.org/abs/2407.11691) on ArXiv. Eligible contributors can create an issue or dm kennyutc in [VLMEvalKit Discord Channel](https://discord.com/invite/evDT4GZmxN). + +Here is a [contributor list](/docs/en/Contributors.md) we curated based on the records. + +## 🎯 The Goal of VLMEvalKit + +**The codebase is designed to:** + +1. 
Provide an **easy-to-use**, **open-source evaluation toolkit** to make it convenient for researchers & developers to evaluate existing LVLMs and make evaluation results **easy to reproduce**. +2. Make it easy for VLM developers to evaluate their own models. To evaluate the VLM on multiple supported benchmarks, one only needs to **implement a single `generate_inner()` function**; all other workloads (data downloading, data preprocessing, prediction inference, metric calculation) are handled by the codebase. + +**The codebase is not designed to:** + +1. Reproduce the exact accuracy numbers reported in the original papers of all **3rd party benchmarks**. The reason can be two-fold: + 1. VLMEvalKit uses **generation-based evaluation** for all VLMs (and optionally with **LLM-based answer extraction**). Meanwhile, some benchmarks may use different approaches (SEEDBench uses PPL-based evaluation, *e.g.*). For those benchmarks, we compare both scores in the corresponding result. We encourage developers to support other evaluation paradigms in the codebase. + 2. By default, we use the same prompt template for all VLMs to evaluate on a benchmark. Meanwhile, **some VLMs may have their specific prompt templates** (some may not be covered by the codebase at this time). We encourage VLM developers to implement their own prompt template in VLMEvalKit, if that is not covered currently. That will help to improve the reproducibility. + +## 🖊️ Citation + +If you find this work helpful, please consider giving this repo a **star🌟**. Thanks for your support! + +[![Stargazers repo roster for @open-compass/VLMEvalKit](https://reporoster.com/stars/open-compass/VLMEvalKit)](https://github.com/open-compass/VLMEvalKit/stargazers) + +If you use VLMEvalKit in your research or wish to refer to published OpenSource evaluation results, please use the following BibTeX entry and the BibTeX entry corresponding to the specific VLM / benchmark you used. 
+ +```bib +@inproceedings{duan2024vlmevalkit, + title={Vlmevalkit: An open-source toolkit for evaluating large multi-modality models}, + author={Duan, Haodong and Yang, Junming and Qiao, Yuxuan and Fang, Xinyu and Chen, Lin and Liu, Yuan and Dong, Xiaoyi and Zang, Yuhang and Zhang, Pan and Wang, Jiaqi and others}, + booktitle={Proceedings of the 32nd ACM International Conference on Multimedia}, + pages={11198--11201}, + year={2024} +} +``` + +

🔝Back to top

+ +[github-contributors-link]: https://github.com/open-compass/VLMEvalKit/graphs/contributors +[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/VLMEvalKit?color=c4f042&labelColor=black&style=flat-square +[github-forks-link]: https://github.com/open-compass/VLMEvalKit/network/members +[github-forks-shield]: https://img.shields.io/github/forks/open-compass/VLMEvalKit?color=8ae8ff&labelColor=black&style=flat-square +[github-issues-link]: https://github.com/open-compass/VLMEvalKit/issues +[github-issues-shield]: https://img.shields.io/github/issues/open-compass/VLMEvalKit?color=ff80eb&labelColor=black&style=flat-square +[github-license-link]: https://github.com/open-compass/VLMEvalKit/blob/main/LICENSE +[github-license-shield]: https://img.shields.io/github/license/open-compass/VLMEvalKit?color=white&labelColor=black&style=flat-square +[github-stars-link]: https://github.com/open-compass/VLMEvalKit/stargazers +[github-stars-shield]: https://img.shields.io/github/stars/open-compass/VLMEvalKit?color=ffcb47&labelColor=black&style=flat-square diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/README.md b/evaluation/cosmos3/reasoner/vlmevalkit/README.md new file mode 100644 index 00000000..5c2f335c --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/README.md @@ -0,0 +1,183 @@ + + +# TAR and VANTAGE Benchmarks for Cosmos3 Model + +This repository is an **NVIDIA-customized [VLMEvalKit](https://github.com/open-compass/VLMEvalKit)** that +releases the NVIDIA Metropolis "smart infra" benchmarks and makes the cosmos-reasoner scores on them +**publicly reproducible**. The reproduction kit lives in [`cosmos_eval/`](cosmos_eval/); the stock +VLMEvalKit inference + scoring engine it drives is the rest of the tree (upstream toolkit docs: +[`README-vlmevalkit.md`](README-vlmevalkit.md)). 
+ +The kit ships, for each benchmark, the exact `run.py --config` settings the internal evaluation uses — +split into a shared dataset layer (`cosmos_eval/data/`) and a per-family model layer +(`cosmos_eval/models/`) — plus a parallel launcher (`cosmos_eval/run_all.py`) and a standalone score +reporter (`cosmos_eval/parse_score.py`). + +You deploy the model behind an OpenAI-compatible endpoint, point the launcher at it, and it composes +each config, runs the stock vlmevalkit `run.py`, and reports the headline score per benchmark. No +extra evaluation logic lives in the kit — inference and scoring are vlmevalkit's; the kit only composes +the inputs and reads the outputs. + +> **This rollout ships the NVIDIA Metropolis "smart infra" benchmarks — `AETCBench_all` (the official +> **TAR** / Traffic Anomaly Reasoning benchmark) + the 8 `VANTAGE_*` (9 total).** +> +> **⚠️ The datasets are not publicly available yet** — both the VANTAGE and TAR +> challenges are still ongoing. **We will update this code as soon as the data and evaluation are ready.** +> Until then these configs are published as a reference; `run_all` runs every benchmark in the manifest. + +``` +. # NVIDIA-customized VLMEvalKit (stock inference + scoring engine) +├── README.md # this file +├── README-vlmevalkit.md # upstream VLMEvalKit toolkit docs +├── run.py # stock vlmevalkit entrypoint (driven once per benchmark) +└── cosmos_eval/ # the reproduction kit + ├── data/<suite>/<bench>.json # dataset section per benchmark (model-independent) + ├── models/<family>.json # per-family model layer (day 1: cosmos) + ├── manifest.json # per-bench run.py CLI flags + ├── run_all.py # parallel launcher (compose → run.py → parse_score) + └── parse_score.py # standalone score reporter +``` + +--- + +## 1. 
Environment setup + +Use a clean virtual environment with **Python 3.10+**, run from the **repo root** (the directory that +holds this README): + +```bash +python -m venv .venv && source .venv/bin/activate + +# vlmevalkit's own dependencies +pip install -r requirements.txt + +# Extra packages required at runtime but not pinned by requirements.txt: +# einops, accelerate -> imported when vlmevalkit loads its dataset modules +# setuptools<81 -> jieba imports pkg_resources, which setuptools>=81 drops +# and Python 3.12 venvs omit +pip install einops accelerate "setuptools<81" + +# install vlmevalkit itself so `import vlmeval` resolves +pip install -e . +``` + +Sanity check: + +```bash +python -c "import vlmeval; print('vlmeval OK')" +``` + +--- + +## 2. Deploy the model endpoint (vLLM) + +Serve the model behind an OpenAI-compatible `/v1/chat/completions` endpoint (see also +**[Reasoner with vLLM](https://github.com/NVIDIA/cosmos/tree/main#reasoner-with-vllm)**): + +```bash +vllm serve nvidia/Cosmos3-Nano \ + --async-scheduling \ + --allowed-local-media-path / \ + --port 8080 \ + --max-model-len 128000 \ + --tensor-parallel-size 1 \ + --gpu-memory-utilization 0.9 \ + --enable-chunked-prefill \ + --mm-processor-cache-gb 0 \ + --mm-encoder-tp-mode data \ + --media-io-kwargs '{"video": {"num_frames": -1, "fps": -1}}' +``` + +The `--media-io-kwargs` / `--mm-*` flags let the server honor the per-benchmark frame sampling the +configs request, and `--allowed-local-media-path /` lets it read local media. (Raise +`--tensor-parallel-size` for larger models.) Any OpenAI-compatible server works (vLLM, SGLang, TGI, a +hosted NIM, …) as long as it answers `POST /v1/chat/completions` and `GET /v1/models`; the wrapper +health-checks the endpoint and auto-detects the served model id from `/v1/models`. + +--- + +## 3. 
Environment variables + +```bash +# Model under test +export COSMOS_API_BASE=http://<host>:8080/v1/chat/completions +export COSMOS_MODEL=<served-model-id> # or any placeholder; auto-detected from /v1/models +export COSMOS_API_KEY=<api-key> +``` + +`run_all.py` substitutes `${COSMOS_MODEL}` / `${COSMOS_API_BASE}` into each composed config at launch. + +--- + +## 4. Run + +From the vlmevalkit repo root: + +```bash +# all benchmarks in the manifest +python cosmos_eval/run_all.py --model cosmos --concurrency 8 --work-dir ./out + +# a specific few +python cosmos_eval/run_all.py --benchmarks VANTAGE_VQA,AETCBench_all --work-dir ./out +``` + +Per benchmark, the launcher composes `model_conf = {class} ∪ defaults ∪ benchmarks[bench]` and +`dataset_conf = data[bench] ∪ {model_family}`, writes `<work-dir>/_configs/<bench>.json`, runs +`run.py --config … --verbose --save-eval-results` with the manifest's CLI flags (one `run.py` +subprocess per benchmark, `--concurrency` at a time), then reports the score. + +**Useful flags** + +| Flag | Effect | +|---|---| +| `--benchmarks A,B` | run only these benchmarks (default: all in the manifest) | +| `--concurrency N` | up to N benchmarks (each its own `run.py`) at once | +| `--dry-run` | print the `run.py` commands and exit | +| `--export-configs DIR` | compose + write each `--config` JSON to `DIR` (placeholders kept), then exit | +| `--import-configs DIR` | run from a dir of (possibly hand-edited) configs instead of composing | + +--- + +## 5. Data & scoring + +These benchmarks' datasets are being published to Hugging Face: +- **VANTAGE** (`nvidia/PhysicalAI-VANTAGE-Bench`) — test inputs are released; ground truth is withheld, + so scores come from the **VANTAGE leaderboard**. +- **TAR — Traffic Anomaly Reasoning** (config key `AETCBench_all`; `nvidia/PhysicalAI-Traffic-Anomaly-Reasoning`) — + test answers are redacted; submit predictions to the **AI City Challenge** evaluation server. 
+ +`run_all` produces per-benchmark predictions (and a local score where ground truth is available); follow +each benchmark's leaderboard instructions to obtain official scores. + +--- + +## 6. Confirm the results + +After a run, each benchmark has its own output dir and a parsed score in the summary table: + +``` +=== cosmos_eval summary === + AETCBench_all ok + VANTAGE_2DGrounding ok + ... + 9/9 ok, 0 failed +``` + +`run.py` writes its evaluation output to `<work-dir>/<bench>/<model>/T*/<model>_<bench>.{dict,df}.eval.json`. +Report any single output directly with the standalone reporter: + +```bash +python cosmos_eval/parse_score.py --work-dir ./out/VANTAGE_VQA --dataset VANTAGE_VQA +# VANTAGE_VQA Overall: <score> +``` + +`parse_score.py` prints the headline **Overall** (0–100) plus every native scalar sub-score key the +benchmark's eval JSON contains. It reads both eval-output shapes vlmevalkit emits +(`*.dict.eval.json`, `*.df.eval.json`). The scores it reports match the internal evaluation pipeline, +benchmark for benchmark. + +Each benchmark dir also keeps `run.log` (full `run.py` output) for debugging; the composed config +used is under `<work-dir>/_configs/<bench>.json`. 
\ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/assets/0_lamp_pred.png b/evaluation/cosmos3/reasoner/vlmevalkit/assets/0_lamp_pred.png new file mode 100644 index 00000000..7f0cccab Binary files /dev/null and b/evaluation/cosmos3/reasoner/vlmevalkit/assets/0_lamp_pred.png differ diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/assets/LOGO.svg b/evaluation/cosmos3/reasoner/vlmevalkit/assets/LOGO.svg new file mode 100644 index 00000000..39b62268 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/assets/LOGO.svg @@ -0,0 +1,24 @@ + + + +Created with Fabric.js 5.3.0 + + + + + + + + + + + + + VLMEvalKit + \ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/assets/apple.jpg b/evaluation/cosmos3/reasoner/vlmevalkit/assets/apple.jpg new file mode 100644 index 00000000..34e1859a Binary files /dev/null and b/evaluation/cosmos3/reasoner/vlmevalkit/assets/apple.jpg differ diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/assets/sample.mp4 b/evaluation/cosmos3/reasoner/vlmevalkit/assets/sample.mp4 new file mode 100644 index 00000000..ff5d7589 Binary files /dev/null and b/evaluation/cosmos3/reasoner/vlmevalkit/assets/sample.mp4 differ diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/.gitignore b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/.gitignore new file mode 100644 index 00000000..09248382 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/.gitignore @@ -0,0 +1,5 @@ +# The committed config kit lives here, but the repo-root .gitignore broadly +# ignores `*json` and `data/`. Re-include this subtree's assets so they track. 
+!*.json +!data/ +!data/** diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/AETCBench_all.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/AETCBench_all.json new file mode 100644 index 00000000..bae48fd7 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/AETCBench_all.json @@ -0,0 +1,4 @@ +{ + "class": "AETCBench", + "dataset": "AETCBench_all" +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_2DGrounding.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_2DGrounding.json new file mode 100644 index 00000000..e8fc9904 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_2DGrounding.json @@ -0,0 +1,4 @@ +{ + "class": "Metropolis2DGroundingDataset", + "dataset": "VANTAGE_2DGrounding" +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_2DPointing.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_2DPointing.json new file mode 100644 index 00000000..adabef3f --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_2DPointing.json @@ -0,0 +1,4 @@ +{ + "class": "VANTAGE_2DPointing", + "dataset": "VANTAGE_2DPointing" +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_Astro2D.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_Astro2D.json new file mode 100644 index 00000000..71032928 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_Astro2D.json @@ -0,0 +1,4 @@ +{ + "class": "Astro2DBenchDataset", + "dataset": "VANTAGE_Astro2D" +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_DVC.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_DVC.json new file mode 100644 index 
00000000..149aaf79 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_DVC.json @@ -0,0 +1,7 @@ +{ + "class": "MetropolisDVC", + "dataset": "VANTAGE_DVC", + "fps": 4, + "max_frames": 128, + "total_pixels": 8388608 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_EventVerification.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_EventVerification.json new file mode 100644 index 00000000..8a2f7c0c --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_EventVerification.json @@ -0,0 +1,7 @@ +{ + "class": "MetropolisEventVerification", + "dataset": "VANTAGE_EventVerification", + "fps": 4, + "max_frames": 256, + "total_pixels": 16777216 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_SOT.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_SOT.json new file mode 100644 index 00000000..48b98be0 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_SOT.json @@ -0,0 +1,6 @@ +{ + "class": "VANTAGE_SOT", + "dataset": "VANTAGE_SOT", + "max_pixels": 921600, + "nframe": 8 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_Temporal.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_Temporal.json new file mode 100644 index 00000000..c3c7788f --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_Temporal.json @@ -0,0 +1,8 @@ +{ + "class": "MetropolisTemporal", + "dataset": "VANTAGE_Temporal", + "fps": 10, + "max_frames": 256, + "max_pixels": 921600, + "total_pixels": 16777216 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_VQA.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_VQA.json new file mode 100644 index 00000000..46aa50b1 
--- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/data/metropolis/VANTAGE_VQA.json @@ -0,0 +1,8 @@ +{ + "class": "MetropolisVQA", + "dataset": "VANTAGE_VQA", + "fps": 4, + "max_frames": 256, + "max_pixels": 921600, + "total_pixels": 16777216 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/manifest.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/manifest.json new file mode 100644 index 00000000..fbe9e6f2 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/manifest.json @@ -0,0 +1,56 @@ +{ + "metropolis/AETCBench_all": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_2DGrounding": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_2DPointing": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_Astro2D": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_DVC": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_EventVerification": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_SOT": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_Temporal": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + }, + "metropolis/VANTAGE_VQA": { + "run": { + "api_nproc": 8, + "judge_nproc": 4 + } + } +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/models/cosmos.json b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/models/cosmos.json new file mode 100644 index 00000000..982b90ad --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/models/cosmos.json @@ -0,0 +1,171 @@ +{ + "model_family": "cr", + "model_key": "cosmos", + "class": "CosmosReason2", + "defaults": { + "api_base": "${COSMOS_API_BASE}", + "chat_template_kwargs": { + "enable_thinking": false + }, + "max_tokens": 8192, + "model": "${COSMOS_MODEL}", + "presence_penalty": null, + "retry": 10, + "seed": 1, + 
"temperature": 0, + "timeout": 600, + "verbose": false + }, + "benchmarks": { + "AETCBench_all": {}, + "VANTAGE_2DGrounding": { + "max_tokens": 4096 + }, + "VANTAGE_2DPointing": { + "max_tokens": 512 + }, + "VANTAGE_Astro2D": {}, + "VANTAGE_DVC": { + "max_tokens": 16384, + "total_pixels": 8388608 + }, + "VANTAGE_EventVerification": { + "total_pixels": 16777216 + }, + "VANTAGE_SOT": { + "response_format": { + "type": "json_schema", + "json_schema": { + "name": "sot_predictions", + "strict": true, + "schema": { + "type": "object", + "properties": { + "frame_1": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + }, + "frame_2": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + }, + "frame_3": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + }, + "frame_4": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + }, + "frame_5": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + }, + "frame_6": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + }, + "frame_7": { + "anyOf": [ + { + "type": "array", + "items": { + "type": "number" + }, + "minItems": 4, + "maxItems": 4 + }, + { + "type": "null" + } + ] + } + }, + "required": [ + "frame_1", + "frame_2", + "frame_3", + "frame_4", + "frame_5", + "frame_6", + "frame_7" + ], + "additionalProperties": false + } + } + } + }, + "VANTAGE_Temporal": { + "total_pixels": 16777216 + }, + "VANTAGE_VQA": { + "total_pixels": 16777216 + } + } +} diff --git 
a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/parse_score.py b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/parse_score.py new file mode 100644 index 00000000..f9783872 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/parse_score.py @@ -0,0 +1,787 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Standalone score reporter for the cosmos_eval kit. + +Reports the headline Overall score (0-100) plus every native sub-score key from a +`run.py --save-eval-results` evaluation output. No database, no parent +`vlmeval-metric` dependency — stdlib only. + +This file is a VERBATIM VENDORED PORT of two internal modules, assembled by the +maintainer generator so the public scores match the internal pipeline exactly: + + * `vlmeval_metric/score_parser.py` — all format detectors (A/B/C/D) + every + per-benchmark override (`extract_overall_score`, `parse_scores`, ...). + * `vlmeval_metric/eval_data.py` — the `*.dict.eval.json` / `*.df.eval.json` loader. + +Do not hand-edit the ported sections; regenerate. A maintainer CI gate runs this +port and the real modules on the same eval JSONs and asserts identical output. 
+ +Score formats handled (see `parse_scores`): + A: flat scalar dict B: multi-split arrays C: indexed lists D: simple numeric +""" + +from __future__ import annotations + +import argparse +import json +import sys +from collections.abc import Callable +from dataclasses import dataclass, field +from pathlib import Path +from typing import Any + +# =========================================================================== +# Vendored verbatim from vlmeval_metric/eval_data.py +# =========================================================================== + +@dataclass +class EvalSummary: + """Evaluation summary with optional column metadata.""" + + data: dict[str, Any] + columns: list[str] = field(default_factory=list) + + +def load_eval_summary(filepath: Path) -> dict[str, Any]: + """Load eval summary — backward compatible wrapper.""" + return load_eval_summary_with_metadata(filepath).data + + +def load_eval_summary_with_metadata(filepath: Path) -> EvalSummary: + """Load eval summary preserving column names for .df.eval.json files. + + For .dict.eval.json: columns will be empty (keys are already column names). + For .df.eval.json: columns contains the DataFrame column headers. + """ + doc = json.loads(filepath.read_text("utf-8")) + + if filepath.name.endswith(".dict.eval.json"): + return EvalSummary(data=doc) + + if not filepath.name.endswith(".df.eval.json"): + raise ValueError("expected filepath to end with known suffix") + + columns = doc["columns"] + index = [str(k) for k in doc["index"]] + data = doc["data"] + + if len(columns) == 1: + data = [x[0] for x in data] + + return EvalSummary( + data=dict(zip(index, data, strict=False)), + columns=[str(c) for c in columns], + ) + + +# =========================================================================== +# Vendored verbatim from vlmeval_metric/score_parser.py +# =========================================================================== + +# Column names to search for the "overall" metric, in priority order. 
+ACCURACY_KEYS = [ + "CV-Bench Accuracy", + "Overall", + "overall", + "English Overall Score", + "Accuracy", + "accuracy", + "acc", + "Acc", +] + + +def normalize_score(value: float) -> float: + """Normalize score to 0-100 range. + + VLMEvalKit is inconsistent: some datasets store 0-1 (AI2D, MMBench), + others store 0-100 (CountBenchQA, HallusionBench). + Heuristic: values <= 1.0 are 0-1 scale, multiply by 100. + """ + return value * 100 if value <= 1.0 else value + + +def _is_format_c(scores_dict: dict) -> bool: + """Check if all non-metadata keys are numeric strings with list values.""" + non_meta_keys = [k for k in scores_dict if k != "__columns__"] + if not non_meta_keys: + return False + if any(not k.isdigit() for k in non_meta_keys): + return False + return isinstance(scores_dict[non_meta_keys[0]], list) + + +def _detect_format(scores_dict: dict) -> str: + """Detect which format the scores dict is in. + + Returns one of: "A", "B", "C", "D", "empty", "unknown". + """ + if not scores_dict: + return "empty" + + if _is_format_c(scores_dict): + return "C" + + for key in ACCURACY_KEYS: + if key in scores_dict and isinstance(scores_dict[key], list): + return "B" + + for key in ACCURACY_KEYS: + if key in scores_dict and isinstance(scores_dict[key], int | float): + return "A" + + for key, value in scores_dict.items(): + if key != "__columns__" and isinstance(value, int | float): + return "D" + + return "unknown" + + +def _parse_format_a(scores_dict: dict) -> dict[str, str]: + """Parse flat dict with scalar values.""" + for key in ACCURACY_KEYS: + if key in scores_dict and isinstance(scores_dict[key], int | float): + score = normalize_score(scores_dict[key]) + return {"overall": f"{score:.2f}"} + return {} + + +def _parse_format_b(scores_dict: dict) -> dict[str, str]: + """Parse dict with array values (multi-split).""" + result: dict[str, str] = {} + splits = scores_dict.get("split", []) + if not isinstance(splits, list): + splits = [splits] + + for key in 
ACCURACY_KEYS: + if key not in scores_dict: + continue + values = scores_dict[key] + if not isinstance(values, list): + continue + + if len(splits) == len(values): + for split_name, val in zip(splits, values, strict=False): + if isinstance(val, int | float): + score = normalize_score(val) + result[str(split_name)] = f"{score:.2f}" + elif len(values) == 1 and isinstance(values[0], int | float): + score = normalize_score(values[0]) + result["overall"] = f"{score:.2f}" + + if result: + return result + + return {} + + +def _extract_accuracy_by_columns(values: list, columns: list[str]) -> float | None: + """Column-aware accuracy extraction. + + columns[0] is the label/Category column; columns[1:] correspond to values[0:]. + Finds the LAST column whose name contains an accuracy keyword. + """ + acc_col_names = {"accuracy", "acc", "overall", "iou"} + data_columns = columns[1:] + last_match: float | None = None + for j, col in enumerate(data_columns): + col_lower = str(col).lower() + if any(name in col_lower for name in acc_col_names): + if j < len(values) and isinstance(values[j], int | float): + last_match = values[j] + if last_match is not None: + return last_match + for j in range(len(values) - 1, -1, -1): + if isinstance(values[j], int | float): + return values[j] + return None + + +def _extract_accuracy_by_heuristic(values: list) -> float | None: + """Heuristic accuracy extraction when no column names available.""" + numeric = [v for v in values if isinstance(v, int | float)] + if not numeric: + return None + if len(numeric) == 1: + return numeric[0] + + small_values = [v for v in numeric if v <= 100] + if small_values and len(small_values) < len(numeric): + return small_values[-1] + + return numeric[0] + + +def _extract_accuracy_from_row(values: list, columns: list[str] | None = None) -> float | None: + """Extract the accuracy value from a data row.""" + if columns is not None and len(columns) > 1: + return _extract_accuracy_by_columns(values, columns) + return 
_extract_accuracy_by_heuristic(values) + + +def _iter_format_c_rows(scores_dict: dict) -> list[tuple[str, list]]: + """Iterate Format C rows in sorted order, skipping metadata.""" + rows = [] + for key, value in sorted(scores_dict.items(), key=lambda x: x[0]): + if key == "__columns__": + continue + if isinstance(value, list) and len(value) >= 2: + rows.append((key, value)) + return rows + + +def _get_column_list(scores_dict: dict) -> list[str] | None: + """Extract __columns__ metadata if present.""" + columns = scores_dict.get("__columns__") + if isinstance(columns, list) and columns: + return [str(c) for c in columns] + return None + + +def _parse_format_c(scores_dict: dict) -> dict[str, str]: + """Parse indexed lists format.""" + col_list = _get_column_list(scores_dict) + rows = _iter_format_c_rows(scores_dict) + + for _key, value in rows: + label = value[0] + if not isinstance(label, str): + continue + if label.lower() not in ("overall", "cv-bench accuracy"): + continue + acc = _extract_accuracy_from_row(value[1:], col_list) + if acc is not None: + return {"overall": f"{normalize_score(acc):.2f}"} + + for _key, value in rows: + label = value[0] if isinstance(value[0], str) else "unknown" + acc = _extract_accuracy_from_row(value[1:], col_list) + if acc is not None: + clean_label = label.lower().replace(" ", "_") + return {clean_label: f"{normalize_score(acc):.2f}"} + + return {} + + +def parse_scores(scores_dict: dict[str, Any]) -> dict[str, str]: + """Parse scores dict and extract metrics. + + Returns: + Dictionary mapping metric names to formatted score strings + (0-100 scale, 2 decimal places). 
+ """ + fmt = _detect_format(scores_dict) + + if fmt == "A": + return _parse_format_a(scores_dict) + if fmt == "B": + return _parse_format_b(scores_dict) + if fmt == "C": + return _parse_format_c(scores_dict) + if fmt == "D": + for value in scores_dict.values(): + if isinstance(value, int | float): + score = normalize_score(value) + return {"overall": f"{score:.2f}"} + + return {} + + +# Dataset-specific score extraction overrides. +# Each entry maps dataset_name to a callable that extracts the overall score +# from the raw scores dict. Only needed for benchmarks where the generic +# parse_scores() logic doesn't pick the right metric. +SCORE_OVERRIDES: dict[str, Callable[[dict[str, Any]], float | None]] = { + # MMMU_DEV_VAL: multi-split, use validation split + "MMMU_DEV_VAL": lambda d: _extract_split_score(d, "validation"), + # Anomaly-detection-style binary classification (sklearn classification_report + # shape) — class distributions are heavily skewed, so macro-F1 is the honest + # summary; accuracy/micro-F1 is flattered by the majority class. 
+ "TailgatingVerification": lambda d: _extract_key_score(d, "macro avg--f1-score"), + "LVEventVerification": lambda d: _extract_key_score(d, "macro avg--f1-score"), + "MetropolisEventVerification": lambda d: _extract_key_score(d, "macro avg--f1-score"), + "VANTAGE_EventVerification": lambda d: _extract_key_score(d, "macro avg--f1-score"), + "WarehouseNearMiss": lambda d: _extract_key_score(d, "macro avg--f1-score"), + "ITSCollision": lambda d: _extract_key_score(d, "macro avg--f1-score"), + "MetropolisDVC": lambda d: _extract_nested_score(d, "overall", "SODA_c"), + "VANTAGE_DVC": lambda d: _extract_nested_score(d, "overall", "SODA_c"), + # MVBench: overall is [correct, total, "pct%"], extract percentage + "MVBench": lambda d: _extract_pct_from_list(d, "overall"), + # Video-MME: nested {"overall": {"overall": "0.606"}}, use overall.overall + "Video-MME": lambda d: _extract_nested_score(d, "overall", "overall"), + # RefCOCO: Format C with columns [Split, Precision@1, Average IoU, Samples], + # macro-average row labeled "Average". Extract Precision@1 from that row. + "RefCOCO": lambda d: _extract_refcoco_precision(d), + # IFBench: Format C with columns [strict, loose], single row. + # Overall = average of strict and loose, normalized from 0-1 to 0-100. 
+ "IFBench": lambda d: _extract_ifbench_avg(d), + # VideoPhy2: correlation score + "VideoPhy2": lambda d: _extract_key_score(d, "Correlation"), + # CausalVQA: average of unpaired and paired accuracy + "CausalVQA": lambda d: _extract_avg_keys(d, "Unpaired Accuracy", "Paired Accuracy"), + # MVPBench: average of single and pair accuracy + "MVPBench": lambda d: _extract_avg_keys(d, "Single Accuracy", "Pair Accuracy"), + # MetropolisTemporal: nested {"overall": {"iou": 0.45, ...}} + "MetropolisTemporal": lambda d: _extract_nested_score(d, "overall", "iou"), + "VANTAGE_Temporal": lambda d: _extract_nested_score(d, "overall", "iou"), + # MetropolisVQA: general-purpose VQA, classes roughly balanced — accuracy + # is the right summary (unlike the anomaly-detection benchmarks above). + "MetropolisVQA": lambda d: _extract_key_score(d, "accuracy"), + "VANTAGE_VQA": lambda d: _extract_key_score(d, "accuracy"), + "Metropolis2DGrounding": lambda d: _extract_key_score(d, "Mean_IoU"), + "VANTAGE_2DGrounding": lambda d: _extract_key_score(d, "Mean_IoU"), + # ThreeDAVGroundingBench: the harness writes a transposed 1-row DF as a flat + # dict — metric names are top-level keys (e.g. {"IoU Accuracy": 0.086, ..., + # "__columns__": ["0"]}). `_extract_key_score` reads the headline directly. + # Fixture: tests/fixtures/threedavgrounding_real_scores.json. 
+ "ThreeDAVGroundingBench": lambda d: _extract_key_score(d, "IoU Accuracy"), + "Astro2DBench": lambda d: _extract_key_score(d, "f1"), + "VANTAGE_Astro2D": lambda d: _extract_key_score(d, "f1"), + "VANTAGE_SOT": lambda d: _extract_key_score(d, "Overall"), + # WarehouseSpatialAI: custom key not in ACCURACY_KEYS + "WarehouseSpatialAI": lambda d: _extract_key_score(d, "Overall_acc"), + # LingoQA: benchmark_score (0-1 scale) + "LingoQA": lambda d: _extract_key_score(d, "benchmark_score"), + # AVSpecial*Bench: "Overall Accuracy" key (not in ACCURACY_KEYS) + "AVSpecialCollisionBench": lambda d: _extract_key_score(d, "Overall Accuracy"), + "AVSpecialStopBehaviorBench": lambda d: _extract_key_score(d, "Overall Accuracy"), + # AVSpecialEnvironmentBench: pooled accuracy on 0-1 scale (normalize_score rescales to 0-100). + "AVSpecialEnvironmentBench": lambda d: _extract_key_score(d, "accuracy"), + # AVSpecialOODReasoningBench: mean Lingo-Judge sigmoid prob on 0-1 scale. + "AVSpecialOODReasoningBench": lambda d: _extract_key_score(d, "lingo_judge_mean"), + # AVPromptFollowingBench: sample-level success rate on 0-1 scale (normalize_score rescales to 0-100). + "AVPromptFollowingBench": lambda d: _extract_key_score(d, "success_rate"), + # AETCBench: harness writes a transposed 1-row DF as a flat dict — metric + # names are top-level keys (e.g. {"bcq_accuracy": ..., "weighted_mean": ..., + # "__columns__": ["0"]}). Pick "weighted_mean" as the headline. + "AETCBench_all": lambda d: _extract_key_score(d, "weighted_mean"), + # LVS_ai_hallucination: aggregate.avg_factual_accuracy on a 0-10 scale → 0-100. + "LVS_ai_hallucination": lambda d: _extract_lvs_ai_hallucination_score(d), + # CameraBench: aggregate.overall_f1 on a 0-1 scale → 0-100. + "CameraBench": lambda d: _extract_camera_bench_score(d), + # Cosmos-CAB-Video (both General and Camera variants): aggregate.overall_score on + # a 0-1 scale → 0-100. General = F1(precision, recall); Camera = mean of three macro-F1. 
+ "Cosmos-CAB-Video_General": lambda d: _extract_cosmos_cab_score(d), + "Cosmos-CAB-Video_Camera": lambda d: _extract_cosmos_cab_score(d), + # Cosmos-CAB-Image: aggregate.overall_score = F1(precision, recall) on 0-1 → 0-100. + "Cosmos-CAB-Image": lambda d: _extract_cosmos_cab_score(d), + # LocateAnythingBench (both Box and Point variants): aggregate.overall_score on + # a 0-1 scale (mean of per-dataset F1 across 7 anchor datasets), rescale to 0-100. + "LocateAnythingBench-Box": lambda d: _extract_locate_anything_bench_score(d), + "LocateAnythingBench-Point": lambda d: _extract_locate_anything_bench_score(d), + # ODinW13: evaluate() returns a wide DataFrame (rows = ['Overall', *13 datasets], + # cols = ['mAP', 'mAP_50']) on the natural 0-1 mAP scale. Headline = average mAP at + # scores['Overall'][0]; the override applies the single 0-1 -> 0-100 conversion. + "ODinW13": lambda d: _extract_odinw_score(d), +} + +# Prefix-based overrides: dataset names starting with these prefixes use the +# corresponding override. Allows e.g. MVBench_8frame, MVBench_64frame to share logic. 
+SCORE_OVERRIDE_PREFIXES: list[tuple[str, Callable[[dict[str, Any]], float | None]]] = [ + ("MVBench", SCORE_OVERRIDES["MVBench"]), + ("Video-MME", SCORE_OVERRIDES["Video-MME"]), + ("tailgating_", SCORE_OVERRIDES["TailgatingVerification"]), + ("CausalVQA", SCORE_OVERRIDES["CausalVQA"]), + ("MetropolisTemporal", SCORE_OVERRIDES["MetropolisTemporal"]), + ("MetropolisVQA", SCORE_OVERRIDES["MetropolisVQA"]), + ("AETCBench", SCORE_OVERRIDES["AETCBench_all"]), + ("LVS_ai_hallucination", SCORE_OVERRIDES["LVS_ai_hallucination"]), + ("CameraBench", SCORE_OVERRIDES["CameraBench"]), + ("Cosmos-CAB-Video_", SCORE_OVERRIDES["Cosmos-CAB-Video_General"]), + ("Cosmos-CAB-Image", SCORE_OVERRIDES["Cosmos-CAB-Image"]), + ("LocateAnythingBench", SCORE_OVERRIDES["LocateAnythingBench-Box"]), +] + + +def _extract_key_score(scores_dict: dict[str, Any], key: str) -> float | None: + """Extract a score by direct key lookup from flattened eval dict.""" + val = scores_dict.get(key) + if isinstance(val, int | float): + return normalize_score(val) + return None + + +def _extract_df_split_column(scores_dict: dict[str, Any], column_name: str) -> float | None: + """Extract a scalar from a one-row .df.eval.json by column name via __columns__.""" + columns = _get_column_list(scores_dict) + if not columns or column_name not in columns: + return None + idx = columns.index(column_name) + rows = _iter_format_c_rows(scores_dict) + if not rows: + return None + _, row = rows[0] + if idx >= len(row): + return None + val = row[idx] + if isinstance(val, int | float): + return normalize_score(val) + return None + + +def _extract_avg_keys(scores_dict: dict[str, Any], *keys: str) -> float | None: + """Average multiple keys and normalize to 0-100.""" + vals = [scores_dict[k] for k in keys if isinstance(scores_dict.get(k), int | float)] + if len(vals) != len(keys): + return None + return normalize_score(sum(vals) / len(vals)) + + +def _extract_split_score(scores_dict: dict[str, Any], split_name: str) -> float | 
None: + """Extract score for a specific split from Format B (multi-split) data.""" + parsed = parse_scores(scores_dict) + val = parsed.get(split_name) + return float(val) if val else None + + +def _extract_pct_from_list(scores_dict: dict[str, Any], key: str) -> float | None: + """Extract percentage from [correct, total, "pct%"] format like MVBench.""" + val = scores_dict.get(key) + if not isinstance(val, list) or len(val) < 3: + return None + pct_str = val[2] + if isinstance(pct_str, str) and pct_str.endswith("%"): + try: + return float(pct_str.rstrip("%")) + except ValueError: + return None + # Fallback: compute from correct/total + if isinstance(val[0], int | float) and isinstance(val[1], int | float) and val[1] > 0: + return val[0] / val[1] * 100 + return None + + +def _extract_nested_score(scores_dict: dict[str, Any], outer_key: str, inner_key: str) -> float | None: + """Extract a value from a nested dict like {"overall": {"mIoU": 0.5}} or {"overall": {"overall": "0.606"}}.""" + outer = scores_dict.get(outer_key) + if isinstance(outer, dict): + val = outer.get(inner_key) + if isinstance(val, int | float): + return normalize_score(val) + if isinstance(val, str): + try: + return normalize_score(float(val)) + except ValueError: + return None + return None + + +def _extract_refcoco_precision(scores_dict: dict[str, Any]) -> float | None: + """Extract Precision@1 from RefCOCO's macro-average row. + + RefCOCO outputs Format C with columns ["Split", "Precision@1", "Average IoU", "Samples"]. + The last row (label "Average") contains the macro-average across all 8 splits. 
+ """ + columns = _get_column_list(scores_dict) + rows = _iter_format_c_rows(scores_dict) + + # Find the column index for "Precision@1" + precision_idx: int | None = None + if columns and len(columns) > 1: + data_columns = columns[1:] # columns[0] is the label column + for j, col in enumerate(data_columns): + if "precision" in col.lower(): + precision_idx = j + break + + for _key, value in rows: + label = value[0] + if not isinstance(label, str) or label.lower() != "average": + continue + data = value[1:] + if precision_idx is not None and precision_idx < len(data): + val = data[precision_idx] + if isinstance(val, int | float): + return float(val) # already 0-100 scale + # Fallback: first numeric value in the row + for v in data: + if isinstance(v, int | float): + return float(v) + + return None + + +def _extract_ifbench_avg(scores_dict: dict[str, Any]) -> float | None: + """Extract average of strict and loose accuracy from IFBench. + + IFBench returns a 1-row, 2-column DataFrame which run.py transposes, + producing: {"strict": scalar, "loose": scalar, "__columns__": [0]}. + Overall = (strict + loose) / 2, normalized from 0-1 to 0-100. + """ + strict = scores_dict.get("strict") + loose = scores_dict.get("loose") + if isinstance(strict, int | float) and isinstance(loose, int | float): + return normalize_score((strict + loose) / 2) + return None + + +def _extract_lvs_ai_hallucination_score(scores_dict: dict[str, Any]) -> float | None: + """Extract aggregate.avg_factual_accuracy (0-10 scale) and rescale to 0-100.""" + aggregate = scores_dict.get("aggregate") + if not isinstance(aggregate, dict): + return None + raw = aggregate.get("avg_factual_accuracy") + if not isinstance(raw, int | float): + return None + return float(raw) * 10.0 + + +def _extract_camera_bench_score(scores_dict: dict[str, Any]) -> float | None: + """Extract the headline overall_f1 (0-100) for CameraBench from the wide + df.eval.json shape: 'Overall' row, columns [precision, recall, f1, accuracy]. 
+ """ + overall_row = scores_dict.get("Overall") + columns = scores_dict.get("__columns__") + if not isinstance(overall_row, list) or not isinstance(columns, list): + return None + try: + idx = columns.index("f1") + except ValueError: + return None + if idx >= len(overall_row): + return None + val = overall_row[idx] + return float(val) if isinstance(val, int | float) else None + + +def _extract_cosmos_cab_score(scores_dict: dict[str, Any]) -> float | None: + """Extract the headline f1 (0-100) for Cosmos-CAB benchmarks from the wide + df.eval.json shape: 'Overall' row, columns starting with 'f1'. + """ + overall_row = scores_dict.get("Overall") + columns = scores_dict.get("__columns__") + if not isinstance(overall_row, list) or not isinstance(columns, list): + return None + try: + idx = columns.index("f1") + except ValueError: + return None + if idx >= len(overall_row): + return None + val = overall_row[idx] + return float(val) if isinstance(val, int | float) else None + + +def _extract_locate_anything_bench_score(scores_dict: dict[str, Any]) -> float | None: + """Extract the headline overall_score (0-100) for LocateAnythingBench. + + `evaluate()` returns a wide DataFrame: rows = ['Overall', *7 datasets]; + cols start with the headline metric (avg_f1 for Box, f1 for Point). + After `eval_data.load_eval_summary_with_metadata`, the 'Overall' row + becomes a top-level list whose first element is the headline score + (already on the 0-100 scale). + """ + overall_row = scores_dict.get("Overall") + if isinstance(overall_row, list) and overall_row and isinstance(overall_row[0], int | float): + return float(overall_row[0]) + return None + + +def _extract_odinw_score(scores_dict: dict[str, Any]) -> float | None: + """Extract the headline average mAP for ODinW13, converted to 0-100. + + `evaluate()` returns a wide DataFrame: rows = ['Overall', *13 datasets]; + cols = ['mAP', 'mAP_50'], values on the natural 0-1 mAP scale. 
After + `load_eval_summary_with_metadata`, the 'Overall' row becomes a top-level list + whose first element is the average mAP (0-1). `normalize_score` applies the + single 0-1 -> 0-100 conversion; since mAP is always in [0,1] this is + unambiguous and cannot double-scale. + """ + overall_row = scores_dict.get("Overall") + if isinstance(overall_row, list) and overall_row and isinstance(overall_row[0], int | float): + return normalize_score(float(overall_row[0])) + return None + + +def extract_overall_score(scores_dict: dict[str, Any], dataset_name: str = "") -> float: + """Extract a single overall score from VLMEvalKit output. + + Uses dataset-specific overrides when available, otherwise falls back to + generic parse_scores() logic. Returns the score on the 0-100 scale, or + 0.0 if extraction fails. + """ + # Check dataset-specific override first (exact match, then prefix match) + override = SCORE_OVERRIDES.get(dataset_name) + if override is None: + for prefix, fn in SCORE_OVERRIDE_PREFIXES: + if dataset_name.startswith(prefix): + override = fn + break + if override is not None: + result = override(scores_dict) + if result is not None: + return result + + # Generic fallback + parsed = parse_scores(scores_dict) + if not parsed: + return 0.0 + if "overall" in parsed: + try: + return float(parsed["overall"]) + except (ValueError, TypeError): + pass + score_str = next(iter(parsed.values())) + try: + return float(score_str) + except (ValueError, TypeError): + return 0.0 + + +# --- RynnScale harness adapter (layer-1: native .json -> standardized scores) --- +# The ``rynnscale`` backend emits a native ``.json`` ({"metrics": {...}}) whose keys +# use RynnScale's own vocabulary ("Object Cognition", "traj", ...). These functions map that +# into standardized 0-100 component scores; the result then flows through the common +# ``extract_overall_score`` like every other backend. Ported verbatim from +# rynnscale-metric/rynnscale_metric/score_parser.py for score parity. 
NOTE: RynnScale always +# reports 0-1, so this uses an unconditional ``round(value*100, 2)`` — deliberately distinct +# from the heuristic ``normalize_score`` above (which serves VLMEvalKit's mixed 0-1/0-100 storage). + +RYNNSCALE_BENCHMARK_REGISTRY: dict[str, dict[str, Any]] = { + "RynnBrainCog": { + "output_file": "RynnBrainCog.json", + "components": { + "RynnBrain-Object": "Object Cognition", + "RynnBrain-Spatial": "Spatial Cognition", + }, + }, + "RynnBrainLoc": { + "output_file": "RynnBrainLoc.json", + "components": { + "RynnBrain-Grounding": "Object Referring", + "RynnBrain-Area": "area", + "RynnBrain-Affordance": "affordance", + "RynnBrain-Trajectory": "traj", + }, + }, +} + + +def extract_rynnscale_suite_scores(metrics: dict[str, Any], suite_name: str) -> dict[str, float]: + """Extract component scores for a RynnScale suite, normalized to 0-100. + + Returns a dict with component names as keys plus an "Overall" key for the average. + """ + registry = RYNNSCALE_BENCHMARK_REGISTRY.get(suite_name) + if not registry: + return {} + + components = registry["components"] + scores: dict[str, float] = {} + values = [] + + for component_name, output_key in components.items(): + if output_key in metrics: + normalized = round(metrics[output_key] * 100.0, 2) # RynnScale is always 0-1; see note above + scores[component_name] = normalized + values.append(normalized) + + if values: + scores["Overall"] = round(sum(values) / len(values), 2) + + return scores + + +def extract_rynnscale_scores( + raw_metrics: dict[str, dict[str, Any]], + benchmarks: list[str], +) -> dict[str, dict[str, float]]: + """Extract scores for all requested RynnScale suites. + + Args: + raw_metrics: suite name -> full metrics dict from the RynnScale ``.json``. + benchmarks: suite names to extract (e.g. ["RynnBrainCog", "RynnBrainLoc"]). + + Returns: + suite name -> {component: score_0_100, "Overall": avg_0_100}. 
+ """ + result: dict[str, dict[str, float]] = {} + for suite in benchmarks: + if suite in raw_metrics: + result[suite] = extract_rynnscale_suite_scores(raw_metrics[suite], suite) + return result + + +# =========================================================================== +# cosmos_eval CLI: locate the eval output, report Overall + native sub-scores. +# (Mirrors vlmeval_run._finalize_scores: load summary, inject __columns__, +# then extract_overall_score keyed on the benchmark name.) +# =========================================================================== + +# Metadata keys injected alongside real scores; never reported as sub-scores. +_META_PREFIX = "__" + + +def find_eval_output(work_dir: Path) -> Path | None: + """Newest `*/*/*.eval.json` under a run.py work-dir (matches the internal finder).""" + files = list(Path(work_dir).glob("*/*/*.eval.json")) + return max(files, key=lambda p: p.stat().st_mtime, default=None) + + +def load_scores(eval_path: Path) -> dict[str, Any]: + """Load an eval output into the scores dict the parser expects (with __columns__).""" + summary = load_eval_summary_with_metadata(Path(eval_path)) + scores = summary.data + if summary.columns: + scores["__columns__"] = summary.columns + return scores + + +def _infer_dataset_name(eval_path: Path) -> str: + """Best-effort benchmark name from a `_.{dict,df}.eval.json` filename. + + Used only when --dataset is omitted; the launcher always passes it explicitly + (the per-benchmark score override is keyed on this name). 
+ """ + name = eval_path.name + for suffix in (".dict.eval.json", ".df.eval.json"): + if name.endswith(suffix): + name = name[: -len(suffix)] + break + return name.split("_", 1)[1] if "_" in name else name + + +def report( + work_dir: str | None = None, + eval_json: str | None = None, + dataset_name: str = "", +) -> dict[str, Any]: + """Resolve the eval output and return {eval_json, dataset, overall, subscores}.""" + eval_path = Path(eval_json) if eval_json else find_eval_output(Path(work_dir)) + if eval_path is None: + return {"eval_json": None, "dataset": dataset_name, "overall": None, "subscores": {}} + scores = load_scores(eval_path) + dataset = dataset_name or _infer_dataset_name(eval_path) + overall = extract_overall_score(scores, dataset_name=dataset) + subscores = { + k: v + for k, v in scores.items() + if not str(k).startswith(_META_PREFIX) and isinstance(v, (int, float)) and not isinstance(v, bool) + } + return {"eval_json": str(eval_path), "dataset": dataset, "overall": overall, "subscores": subscores} + + +def main(argv: list[str] | None = None) -> None: + ap = argparse.ArgumentParser(description="Report Overall + sub-scores from a run.py eval output.") + src = ap.add_mutually_exclusive_group(required=True) + src.add_argument("--work-dir", help="run.py output dir; the newest */*/*.eval.json is used") + src.add_argument("--eval-json", help="explicit path to a *.eval.json") + ap.add_argument("--dataset", default="", help="benchmark name (drives the per-benchmark score override)") + ap.add_argument("--json", action="store_true", help="emit machine-readable JSON") + args = ap.parse_args(argv) + + r = report(work_dir=args.work_dir, eval_json=args.eval_json, dataset_name=args.dataset) + if args.json: + print(json.dumps(r)) + return + if r["eval_json"] is None: + print("no eval output found", file=sys.stderr) + raise SystemExit(1) + print(f"{r['dataset']} Overall: {r['overall']:.2f}") + for k, v in sorted(r["subscores"].items()): + print(f" {k}: {v}") + + +if 
__name__ == "__main__": + main() diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/run_all.py b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/run_all.py new file mode 100644 index 00000000..7a4c7be8 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/cosmos_eval/run_all.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +"""Parallel launcher for the cosmos_eval kit. + +Composes each benchmark's `run.py --config` JSON from the committed split +artifacts (`data//.json` + `models/.json`), runs the +stock vlmevalkit `run.py` for each in a bounded process pool, and reports the +headline score per benchmark via `parse_score.py`. + +This is the END-USER entrypoint: stdlib only, no `vlmeval-metric` dependency. +The internal generator (`gen_oss_configs.py`) is what *produced* the artifacts; +this just consumes them with a dumb dict-merge — no routing/schema logic here. + +Quickstart: + + export COSMOS_API_BASE=https:///v1/chat/completions + export COSMOS_MODEL= COSMOS_API_KEY= + export OPENAI_API_BASE= OPENAI_API_KEY= # gpt-4o judge + python cosmos_eval/run_all.py --model cosmos --concurrency 8 --work-dir ./out + # runs every benchmark in the manifest + + # one/few: --benchmarks VANTAGE_VQA,AETCBench_all + # inspect: --export-configs ./cfgs (compose + write, no run) + # tweak-run: --import-configs ./cfgs (run from a dir of edited configs) +""" + +from __future__ import annotations + +import argparse +import json +import os +import string +import subprocess +import sys +from concurrent.futures import ThreadPoolExecutor, as_completed +from pathlib import Path +from typing import Any + +import parse_score # sibling module in cosmos_eval/ + +HERE = Path(__file__).resolve().parent +VLMEVALKIT_ROOT = HERE.parent +RUN_PY = VLMEVALKIT_ROOT / "run.py" + +# ConcatDataset benchmarks (e.g. 
Astro2D) evaluate per sub-dataset via +# `eval_file.replace(dataset_name, sub_name)`, which rewrites the name EVERYWHERE in the +# path — including the work-dir component. So they need a work-dir whose path does NOT +# contain the benchmark name, otherwise the sub-result targets a sibling dir that is never +# created (FileNotFoundError). These get a name-free (indexed) work-dir; all other benches +# keep the readable / layout. +_CONCAT_BENCHES = {"VANTAGE_Astro2D"} + + +# --------------------------------------------------------------------------- +# Kit loading + composition (the dumb dict-merge; mirrors gen_oss_configs.compose) +# --------------------------------------------------------------------------- + + +def load_model_layer(model: str) -> dict[str, Any]: + path = HERE / "models" / f"{model}.json" + if not path.exists(): + sys.exit(f"Error: no model layer at {path} (have: " + f"{', '.join(p.stem for p in sorted((HERE / 'models').glob('*.json')))})") + return json.loads(path.read_text("utf-8")) + + +def load_manifest() -> dict[str, Any]: + return json.loads((HERE / "manifest.json").read_text("utf-8")) + + +def load_data_conf(domain: str, bench: str) -> dict[str, Any]: + return json.loads((HERE / "data" / domain / f"{bench}.json").read_text("utf-8")) + + +def _render_env(obj: Any) -> Any: + """Substitute ${VAR} placeholders from the environment, recursively.""" + if isinstance(obj, str): + return string.Template(obj).safe_substitute(os.environ) + if isinstance(obj, dict): + return {k: _render_env(v) for k, v in obj.items()} + if isinstance(obj, list): + return [_render_env(v) for v in obj] + return obj + + +def compose(data_conf: dict[str, Any], model_layer: dict[str, Any], bench: str) -> tuple[dict, dict]: + """model_conf = {class} | defaults | benchmarks[bench]; dataset_conf = data | {model_family}. + + Placeholders are kept verbatim (env substitution is applied separately at write time). 
+ """ + model_conf = {"class": model_layer["class"]} + model_conf.update(model_layer["defaults"]) + model_conf.update(model_layer["benchmarks"].get(bench, {})) + dataset_conf = dict(data_conf) + dataset_conf["model_family"] = model_layer["model_family"] + return model_conf, dataset_conf + + +def build_config(model_layer: dict[str, Any], domain: str, bench: str, *, render: bool) -> dict[str, Any]: + """The full `run.py --config` document for one benchmark.""" + model_conf, dataset_conf = compose(load_data_conf(domain, bench), model_layer, bench) + if render: + model_conf = _render_env(model_conf) + return {"model": {model_layer["model_key"]: model_conf}, "data": {bench: dataset_conf}} + + +# --------------------------------------------------------------------------- +# Benchmark selection +# --------------------------------------------------------------------------- + + +def select_benches( + manifest: dict[str, Any], + *, + domains: list[str] | None, + benchmarks: list[str] | None, +) -> list[dict[str, Any]]: + """Resolve the manifest into an ordered list of {key, bench, domain, run}. + + Runs every benchmark in the manifest, optionally narrowed by `domains` / `benchmarks`. + """ + selected: list[dict[str, Any]] = [] + for key, entry in manifest.items(): + domain, bench = key.split("/", 1) + if domains and domain not in domains: + continue + if benchmarks and bench not in benchmarks: + continue + selected.append({"key": key, "bench": bench, "domain": domain, "run": entry.get("run", {})}) + for i, it in enumerate(selected): + it["idx"] = i # stable index for name-free work-dirs (ConcatDataset benches) + return selected + + +def _bench_workdir(work_root: Path, item: dict[str, Any]) -> Path: + """Per-bench work-dir. 
ConcatDataset benches get a NAME-FREE dir (see _CONCAT_BENCHES).""" + if item["bench"] in _CONCAT_BENCHES: + return work_root / f"_concat_{item['idx']:02d}" + return work_root / item["bench"] + + +# --------------------------------------------------------------------------- +# run.py invocation +# --------------------------------------------------------------------------- + + +def _run_cmd(config_path: Path, work_dir: Path, run_flags: dict[str, Any]) -> list[str]: + """Build the stock `run.py --config` command (no --data/--model; cfg drives them).""" + cmd = [sys.executable, str(RUN_PY), "--config", str(config_path), "--work-dir", str(work_dir)] + cmd += ["--api-nproc", str(run_flags.get("api_nproc", 16))] + cmd += ["--judge-nproc", str(run_flags.get("judge_nproc", 4))] + if run_flags.get("judge"): + cmd += ["--judge", str(run_flags["judge"])] + if run_flags.get("judge_args"): + cmd += ["--judge-args", str(run_flags["judge_args"])] + cmd += ["--verbose", "--save-eval-results"] + return cmd + + +def run_one(item: dict[str, Any], model_layer: dict[str, Any], *, work_root: Path, + configs_dir: Path, import_dir: Path | None) -> dict[str, Any]: + """Compose (or import) the config, run run.py, then parse the score. Returns a result row.""" + bench, domain = item["bench"], item["domain"] + bench_out = _bench_workdir(work_root, item) + bench_out.mkdir(parents=True, exist_ok=True) + + if import_dir is not None: + config_path = import_dir / f"{bench}.json" + if not config_path.exists(): + return {**item, "status": "error", "score": None, "detail": f"no imported config {config_path}"} + # Imported configs may still carry ${...} placeholders; render into the work copy. 
+ rendered = _render_env(json.loads(config_path.read_text("utf-8"))) + config_path = configs_dir / f"{bench}.json" + config_path.write_text(json.dumps(rendered, indent=2), "utf-8") + else: + cfg = build_config(model_layer, domain, bench, render=True) + config_path = configs_dir / f"{bench}.json" + config_path.write_text(json.dumps(cfg, indent=2), "utf-8") + + cmd = _run_cmd(config_path, bench_out, item["run"]) + log_path = bench_out / "run.log" + with log_path.open("w") as log: + proc = subprocess.run(cmd, cwd=str(VLMEVALKIT_ROOT), stdout=log, + stderr=subprocess.STDOUT, env=os.environ.copy()) + if proc.returncode != 0: + return {**item, "status": "error", "score": None, "detail": f"run.py exit {proc.returncode}; see {log_path}"} + + rep = parse_score.report(work_dir=str(bench_out), dataset_name=bench) + if rep["eval_json"] is None: + return {**item, "status": "no-eval", "score": None, "detail": f"no eval output; see {log_path}"} + return {**item, "status": "ok", "score": rep["overall"], "subscores": rep["subscores"]} + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + + +def _csv(value: str | None) -> list[str] | None: + return [x.strip() for x in value.split(",") if x.strip()] if value else None + + +def _print_summary(results: list[dict[str, Any]]) -> None: + print("\n=== cosmos_eval summary ===") + width = max([len(r["bench"]) for r in results] + [9]) + for r in sorted(results, key=lambda x: x["key"]): + score = f"{r['score']:.2f}" if isinstance(r.get("score"), (int, float)) else "-" + line = f" {r['bench']:<{width}} {r['status']:<8} {score:>7}" + if r["status"] != "ok": + line += f" {r.get('detail', '')}" + print(line) + ok = sum(1 for r in results if r["status"] == "ok") + print(f"\n {ok}/{len(results)} ok, {len(results) - ok} failed") + + +def main(argv: list[str] | None = None) -> None: + ap = argparse.ArgumentParser(description=__doc__, 
formatter_class=argparse.RawDescriptionHelpFormatter) + ap.add_argument("--model", default="cosmos", help="model layer (models/.json); default cosmos") + ap.add_argument("--domains", default=None, help="comma-separated domains (default: all in the manifest)") + ap.add_argument("--benchmarks", default=None, help="comma-separated benchmark names (default: all in the manifest)") + ap.add_argument("--concurrency", type=int, default=4, help="max concurrent run.py subprocesses") + ap.add_argument("--work-dir", default="./cosmos_eval_out", help="run output root") + ap.add_argument("--dry-run", action="store_true", help="print the run.py commands and exit") + ap.add_argument("--export-configs", metavar="DIR", help="compose + write each config to DIR, then exit") + ap.add_argument("--import-configs", metavar="DIR", help="run from a dir of pre-composed configs instead of composing") + args = ap.parse_args(argv) + + model_layer = load_model_layer(args.model) + manifest = load_manifest() + + selected = select_benches(manifest, domains=_csv(args.domains), benchmarks=_csv(args.benchmarks)) + if not selected: + sys.exit("Error: no benchmarks selected (check --domains/--benchmarks).") + + # --export-configs: compose (placeholders intact) + write, then exit. 
+ if args.export_configs: + out = Path(args.export_configs) + out.mkdir(parents=True, exist_ok=True) + for item in selected: + cfg = build_config(model_layer, item["domain"], item["bench"], render=False) + (out / f"{item['bench']}.json").write_text(json.dumps(cfg, indent=2), "utf-8") + print(f"Exported {len(selected)} config(s) to {out} (placeholders intact).") + return + + work_root = Path(args.work_dir) + configs_dir = work_root / "_configs" + configs_dir.mkdir(parents=True, exist_ok=True) + import_dir = Path(args.import_configs) if args.import_configs else None + + if args.dry_run: + for item in selected: + cfg_path = (import_dir or configs_dir) / f"{item['bench']}.json" + print(" ".join(_run_cmd(cfg_path, work_root / item["bench"], item["run"]))) + print(f"\n[dry-run] {len(selected)} benchmark(s) selected.") + return + + print(f"Running {len(selected)} benchmark(s) with concurrency {args.concurrency} -> {work_root}") + results: list[dict[str, Any]] = [] + with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as pool: + futures = { + pool.submit(run_one, item, model_layer, work_root=work_root, + configs_dir=configs_dir, import_dir=import_dir): item + for item in selected + } + for fut in as_completed(futures): + r = fut.result() + results.append(r) + mark = "ok" if r["status"] == "ok" else r["status"].upper() + score = f"{r['score']:.2f}" if isinstance(r.get("score"), (int, float)) else "-" + print(f" [{mark}] {r['bench']}: {score}") + + _print_summary(results) + if any(r["status"] != "ok" for r in results): + raise SystemExit(1) + + +if __name__ == "__main__": + main() diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/.readthedocs.yaml b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/.readthedocs.yaml new file mode 100644 index 00000000..c6cf8e2a --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/.readthedocs.yaml @@ -0,0 +1,17 @@ +version: 2 + +# Set the version of Python and other tools you might need +build: + os: 
ubuntu-22.04 + tools: + python: "3.8" + +formats: + - epub + +sphinx: + configuration: docs/en/conf.py + +python: + install: + - requirements: requirements/docs.txt diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/ConfigSystem.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/ConfigSystem.md new file mode 100644 index 00000000..120e0cb0 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/ConfigSystem.md @@ -0,0 +1,67 @@ +# Config System + +By default, VLMEvalKit launches the evaluation by setting the model name(s) (defined in `/vlmeval/config.py`) and dataset name(s) (defined in `vlmeval/dataset/__init__.py` or `vlmeval/dataset/video_dataset_config.py`) in the `run.py` script with the `--model` and `--data` arguments. Such approach is simple and efficient in most scenarios, however, it may not be flexible enough when the user wants to evaluate multiple models / datasets with different settings. + +To address this, VLMEvalKit provides a more flexible config system. The user can specify the model and dataset settings in a json file, and pass the path to the config file to the `run.py` script with the `--config` argument. Here is a sample config json: + +```json +{ + "model": { + "GPT4o_20240806_T00_HIGH": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 0, + "img_detail": "high" + }, + "GPT4o_20240806_T10_Low": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 1.0, + "img_detail": "low" + }, + "GPT4o_20241120": {} + }, + "data": { + "MME-RealWorld-Lite": { + "class": "MMERealWorld", + "dataset": "MME-RealWorld-Lite" + }, + "MMBench_DEV_EN_V11": { + "class": "ImageMCQDataset", + "dataset": "MMBench_DEV_EN_V11" + }, + "MMBench_Video_8frame_nopack":{}, + "Video-MME_16frame_subs": { + "class": "VideoMME", + "dataset": "Video-MME", + "nframe": 16, + "use_subtitle": true + } + } +} +``` + +Explanation of the config json: + +1. Now we support two fields: `model` and `data`, each of which is a dictionary. 
The key of the dictionary is the name of the model / dataset (set by the user), and the value is the setting of the model / dataset. +2. For items in `model`, the value is a dictionary containing the following keys: + - `class`: The class name of the model, which should be a class name defined in `vlmeval/vlm/__init__.py` (open-source models) or `vlmeval/api/__init__.py` (API models). + - Other kwargs: Other kwargs are model-specific parameters, please refer to the definition of the model class for detailed usage. For example, `model`, `temperature`, `img_detail` are arguments of the `GPT4V` class. It's noteworthy that the `model` argument is required by most model classes. + - Tip: The defined model in the `supported_VLM` of `vlmeval/config.py` can be used as a shortcut, for example, `GPT4o_20241120: {}` is equivalent to `GPT4o_20241120: {'class': 'GPT4V', 'model': 'gpt-4o-2024-11-20', 'temperature': 0, 'img_size': -1, 'img_detail': 'high', 'retry': 10, 'verbose': False}` +3. For the dictionary `data`, we suggest users to use the official dataset name as the key (or part of the key), since we frequently determine the post-processing / judging settings based on the dataset name. For items in `data`, the value is a dictionary containing the following keys: + - `class`: The class name of the dataset, which should be a class name defined in `vlmeval/dataset/__init__.py`. + - Other kwargs: Other kwargs are dataset-specific parameters, please refer to the definition of the dataset class for detailed usage. Typically, the `dataset` argument is required by most dataset classes. It's noteworthy that the `nframe` argument or `fps` argument is required by most video dataset classes. 
+ - Tip: The defined dataset in the `supported_video_datasets` of `vlmeval/dataset/video_dataset_config.py` can be used as a shortcut, for example, `MMBench_Video_8frame_nopack: {}` is equivalent to `MMBench_Video_8frame_nopack: {'class': 'MMBenchVideo', 'dataset': 'MMBench-Video', 'nframe': 8, 'pack': False}`. +Saving the example config json to `config.json`, you can launch the evaluation by: + +```bash +python run.py --config config.json +``` + +That will generate the following output files under the working directory `$WORK_DIR` (Following the format `{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`): + +- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*` +- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*` +- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*` +- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*` +... diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Contributors.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Contributors.md new file mode 100644 index 00000000..ddf50c6c --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Contributors.md @@ -0,0 +1,21 @@ +# Contributors + +## Contributors w. 3+ Major Contributions + +> In this section, we list all the contributors who have made significant contributions (3+) to the development of VLMEvalKit. + +New Qualified Contributors (2024.09): + +1. [amitbcp](https://github.com/amitbcp): The contributor helped support MUIRBench, Phi-3.5, Idefics3, VILA, and xGen-MM +2. [czczup](https://github.com/czczup): The contributor helped support the InternVL Series (V1.5, Mini-InternVL, V2, etc.) +3. [DseidLi](https://github.com/DseidLi): The contributor helped support LLaVA-OneVision, GQA, and developed the readthedocs site for VLMEvalKit +4. [mayubo2333](https://github.com/mayubo2333): The contributor helped support MMLongBench, SlideVQA, and DUDE +5. 
[sun-hailong](https://github.com/sun-hailong): The contributor helped support A-OKVQA, Parrot, MMMB, and MTL-MMBench +6. [PhoenixZ810](https://github.com/PhoenixZ810): The contributor helped support Video-ChatGPT, Chat-UniVI, and Llama-VID +7. [Cuiunbo](https://github.com/Cuiunbo): The contributor helped support OmniLMM-12B, MiniCPM-V Series (V1, V2, V2.5) + +## Full Contributor List + +> In this section, we list all the contributors as well as their corresponding contributions to the development of VLMEvalKit. + +TBD. diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Development.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Development.md new file mode 100644 index 00000000..0fe5a60e --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Development.md @@ -0,0 +1,145 @@ +# Develop new Benchmark / MLLM + +> 🛠️ How to implement a new Benchmark / VLM in VLMEvalKit? + +## Implement a new benchmark + +Example PR: **Math-Vision Benchmark** ([#292](https://github.com/open-compass/VLMEvalKit/pull/292/files)) + +In VLMEvalKit, benchmarks are organized as dataset classes. When you try to implement a new benchmark, you can either reuse existing dataset classes (*e.g.*, You can reuse `ImageMCQDataset` when implementing a new multi-choice benchmark), or support a new dataset class. Each dataset must have the following two member functions (either reuse the one of the parent class or implement your own): + +- `build_prompt(self, line)`: The function input `line` is an integer (the sample index) or a `pd.Series` object (the raw record of the sample). The function outputs a `multi-modal message`, serving as the input of an MLLM. The `multi-modal message` is an interleaved list of multi-modal messages adopting the following format (the example includes an image and a text message): `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`. 
+- `evaluate(self, eval_file, **judge_kwargs)`: The function input `eval_file` is the MLLM prediction (typically in `.xlsx` format). If the benchmark requires an external LLM (typically GPT) for evaluation, then `judge_kwargs` can pass the arguments for the LLM. The function outputs the benchmark evaluation results (metrics) in the form of `dict` or `pd.DataFrame`. + +We then briefly describe the typical steps to implement a new benchmark under VLMEvalKit: + +### 1. Prepare your benchmark tsv file + +Currently, we organize a benchmark as one single TSV file. During inference, the data file will be automatically downloaded from the defined `DATASET_URL` link to the `$LMUData` directory (default path is `$HOME/LMUData`, if not set explicitly). You can upload the prepared TSV file to a downloadable address (e.g., Huggingface) or send it to us at . We will assist in uploading the dataset to the server. You can also customize the `LMUData` path via the environment variable `LMUData=/path/to/your/data`. + +The contents of the TSV file consist of: + +| Dataset Name \ Fields | index | image | image_path | question | hint | multi-choice
options | answer | category | l2-category | split | +| --------------------------------------- | ----- | ----- | ---------- | -------- | ---- | ----------------------- | ------ | -------- | ----------- | ----- | +| MMBench_DEV_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| MMBench_TEST_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | +| CCBench | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | | +| SEEDBench_IMG | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | | +| MME | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | | +| MMVet | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | | +| MMMU_DEV_VAL | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | +| COCO_VAL | ✅ | ✅ | | | | | ✅ | | | | +| OCRVQA_[TEST/TESTCORE] | ✅ | ✅ | | ✅ | | | ✅ | | | | +| TextVQA_VAL | ✅ | ✅ | | ✅ | | | ✅ | | | | +| VCR_[EN/ZH]\_[EASY/HARD]\_[ALL/500/100] | ✅ | ✅ | | ✅ | | | ✅ | | | | +| MMMB_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | |✅ | +| MMBench_dev_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | + +
Table 1. TSV fields of supported datasets.
+ +**Intro to mandatory fields in the `TSV` file:** + +- **index:** Integer, Unique for each line in `tsv` +- **image:** The base64 of the image, you can use APIs implemented in `vlmeval/smp/vlm.py` for encoding and decoding: + - Encoding: `encode_image_to_base64` (for PIL Image) / `encode_image_file_to_base64` (for image file path) + - Decoding: `decode_base64_to_image` (for PIL Image) / `decode_base64_to_image_file` (for image file path) +- **question**: The question corresponding to the image, a string +- **answer**: The answer to the question, a string. The `test` split does not need this field + +### 2. Customize your benchmark prompt + +`ImageBaseDataset` defines the default prompt format. If you need to add prompts specific to the dataset or input data in the `Interleave` format to the model, you can implement this through the `build_prompt(line)` function. This function takes a line from a TSV file as input, containing fields such as index, image, question, etc. The function returns a list of dictionaries of multimodal messages `msg` in the format `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`, including the image path and the text prompt to be input into VLMs. For interleave type inputs, you can directly place the dictionary of the image path at the image token position. + +### 3. Customize your benchmark metrics + +To add evaluation for a new benchmark, you need to customize a class object to implement the dataset’s metrics calculation. Multimodal datasets inherit from the `ImageBaseDataset` object in `vlmeval/dataset/image_base.py`. The TYPE defines the type of dataset, `DATASET_URL` is the download address of the dataset, and `DATASET_MD5` is the MD5 checksum for consistency checking of the dataset file. + +In this class, **you need to implement** the `evaluate(eval_file, **judge_kwargs)` class function to calculate metrics and output results for the custom dataset. 
The function input `eval_file` is the path to the model prediction results file `{model_name}_{dataset}.xlsx`. This file can be read as a pandas.DataFrame using the `load(eval_file)` method, containing fields such as index, question, answer, category, prediction, etc. The judge_kwargs will pass a dictionary related to evaluation, such as the name of the `judge model`, the number of API request threads, etc. **The return value** of the function is the calculated accuracy and other metrics, formatted as a dictionary composed of lists, organized into a pandas.DataFrame. + +## Implement a new model + +Example PR: **Support LLaVA-Next-Interleave** ([#294](https://github.com/open-compass/VLMEvalKit/pull/294)) + +**1. Support `generate_inner` API (mandatory).** + +All existing models are implemented in `vlmeval/vlm`. For a minimal model, your model class **must implement the method** `generate_inner(msgs, dataset=None)`. In this function, you feed a multi-modal message to your VLM and return the VLM prediction (which is a string). The optional argument `dataset` can be used as the flag for the model to switch among various inference strategies. + +The multi-modal message `msgs` is a list of dictionaries, each dictionary has two keys: type and value: +- `type`: We currently support two types, choices are ["image", "text"]. +- `value`: When type=='text', the value is the text message (a single string); when type=='image', the value can be the local path of an image file, or the image URL. + +Currently a multi-modal message may contain arbitrarily interleaved images and texts. If your model does not support that, a common practice is to take the first image and the concatenated text messages as the input. You can set `INTERLEAVE = False` in your model class and use `self.message_to_promptimg(message, dataset=dataset)` to build your prompt and the first image's path. 
+ +Here are some examples of multi-modal messages: + +```python +IMAGE_PTH = 'assets/apple.jpg' +IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg' +msg1 = [ + dict(type='image', value=IMAGE_PTH), + dict(type='text', value='What is in this image?') +] +msg2 = [ + dict(type='image', value=IMAGE_URL), + dict(type='image', value=IMAGE_URL), + dict(type='text', value='How many apples are there in these images?') +] +response = model.generate(msg1) +``` + +For convenience, we also support taking a list of strings as input. In that case, we will check if a string is an image path or image URL and automatically convert it to the list[dict] format: + +```python +IMAGE_PTH = 'assets/apple.jpg' +IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg' +msg1 = [IMAGE_PTH, 'What is in this image?'] +msg2 = [IMAGE_URL, IMAGE_URL, 'How many apples are there in these images?'] +response = model.generate(msg1) +``` + +**Support Custom Prompt (optional).** + +Besides, your model can support **custom prompt building** by implementing two optional methods: `use_custom_prompt(dataset)` and `build_prompt(line, dataset=None)`. + +Both functions take the dataset name as the input: + +- `use_custom_prompt(dataset)` returns a boolean flag, indicating whether the model should use the custom prompt building strategy. +- If `use_custom_prompt(dataset)` returns True, `build_prompt(line, dataset)` should return a custom-built multimodal message for the corresponding `dataset`, given `line`, which is a dictionary that includes the necessary information of a data sample. If `use_custom_prompt(dataset)` returns False, the default prompt building strategy will be used. + +**Support multi-turn chatting (optional).** + +You can also support the multi-turn chatting and evaluation with your VLM by supporting the `chat_inner(message, dataset)` function. 
The function outputs a single string response, and the `message` is a list of chat history, following the below format. + +```python +# Assume msg1, msg2, msg3, ... are multi-modal messages following the previously described format +# `chat_inner` take the following chat history list as input: +message = [ + dict(role='user', content=msg1), + dict(role='assistant', content=msg2), + dict(role='user', content=msg3), + dict(role='assistant', content=msg4), + ...... + dict(role='user', content=msgn), +] +# `message` should contain an odd number of chat utterances, the role of utterances should be interleaved "user" and "assistant", with the role of the last utterance to be "user". +# The chat function will call `chat_inner` +response = model.chat(message) +``` + +### Example PRs: + +- VLM that doesn't support interleaved images and texts, and does not use custom prompts: [[Model] Support glm-4v-9b](https://github.com/open-compass/VLMEvalKit/pull/221) +- VLM that supports interleaved images and texts and custom prompts: [Add MiniCPM-Llama3-V-2.5](https://github.com/open-compass/VLMEvalKit/pull/205) +- VLM API: [Feature add glmv](https://github.com/open-compass/VLMEvalKit/pull/201) + +## Contribute to VLMEvalKit + +If you want to contribute codes to **VLMEvalKit**, please do the pre-commit check before you submit a PR. That helps to keep the code tidy. + +```bash +# Under the directory of VLMEvalKit, install the pre-commit hook: +pip install pre-commit +pre-commit install +pre-commit run --all-files +# Then you can commit your code. +``` diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/EvalByLMDeploy.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/EvalByLMDeploy.md new file mode 100644 index 00000000..fc0a8c38 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/EvalByLMDeploy.md @@ -0,0 +1,27 @@ +# Using LMDeploy to Accelerate Evaluation and Inference + +VLMEvalKit supports testing VLM models deployed by LMDeploy. 
Below, we use InternVL2-8B as an example to show how to test the model. + +## Step 0: Install LMDeploy + +```bash +pip install lmdeploy +``` +For other installation methods, you can refer to LMDeploy's [documentation](https://github.com/InternLM/lmdeploy). + +## Step 1: Start the Inference Service + +```bash +lmdeploy serve api_server OpenGVLab/InternVL2-8B --model-name InternVL2-8B +``` +> [!IMPORTANT] +> Since models in VLMEvalKit may have custom behaviors when building prompts for different datasets, such as InternVL2's handling of HallusionBench, it is necessary to specify `--model-name` when starting the server. This allows VLMEvalKit to select the appropriate prompt construction strategy based on the name when using the LMDeploy API. +> +> If `--server-port` is specified, the corresponding environment variable `LMDEPLOY_API_BASE` needs to be set. + + +## Step 2: Evaluation + +```bash +python run.py --data MMStar --model lmdeploy --verbose --api-nproc 64 +``` diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Makefile b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
+%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Quickstart.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Quickstart.md new file mode 100644 index 00000000..264d2cdd --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/Quickstart.md @@ -0,0 +1,236 @@ +# Quickstart + +Before running the evaluation script, you need to **configure** the VLMs and set the model_paths properly. + +After that, you can use a single script `run.py` to run inference on and evaluate multiple VLMs and benchmarks at the same time. + +## Step 0. Installation & Setup essential keys + +**Installation.** + +```bash +git clone https://github.com/open-compass/VLMEvalKit.git +cd VLMEvalKit +pip install -e . +``` + +**Setup Keys.** + +To infer with API models (GPT-4v, Gemini-Pro-V, etc.) or use LLM APIs as the **judge or choice extractor**, you first need to set up API keys. VLMEvalKit will use a judge **LLM** to extract the answer from the output if you set the key; otherwise it uses the **exact matching** mode (find "Yes", "No", "A", "B", "C"... in the output strings). **The exact matching can only be applied to the Yes-or-No tasks and the Multi-choice tasks.** +- You can place the required keys in `$VLMEvalKit/.env` or directly set them as environment variables. If you choose to create a `.env` file, its content will look like: + + ```bash + # The .env file, place it under $VLMEvalKit + # API Keys of Proprietary VLMs + # QwenVL APIs + DASHSCOPE_API_KEY= + # Gemini w. 
Google Cloud Backends + GOOGLE_API_KEY= + # OpenAI API + OPENAI_API_KEY= + OPENAI_API_BASE= + # StepAI API + STEPAI_API_KEY= + # REKA API + REKA_API_KEY= + # GLMV API + GLMV_API_KEY= + # CongRong API + CW_API_BASE= + CW_API_KEY= + # SenseNova API + SENSENOVA_API_KEY= + # Hunyuan-Vision API + HUNYUAN_SECRET_KEY= + HUNYUAN_SECRET_ID= + # LMDeploy API + LMDEPLOY_API_BASE= + # MiniMax API + MINIMAX_API_KEY= + # You can also set a proxy for calling api models during the evaluation stage + EVAL_PROXY= + ``` + +- Fill in the blanks with your API keys (if necessary). Those API keys will be automatically loaded when doing the inference and evaluation. +## Step 1. Configuration + +**VLM Configuration**: All VLMs are configured in `vlmeval/config.py`. A few legacy VLMs (like MiniGPT-4, LLaVA-v1-7B) require additional configuration (configuring the code / model_weight root in the config file). During evaluation, you should use the model name specified in `supported_VLM` in `vlmeval/config.py` to select the VLM. Before starting the evaluation, make sure you can successfully infer with the VLM by running the following command: `vlmutil check {MODEL_NAME}`. + +Note: For the Qwen-VL series models (Qwen-VL, Qwen2-VL, Qwen2.5-VL), the upper and lower bounds of the number of pixels specified in vlmeval/config.py are as follows: + +``` +min_pixels=1280 * 28 * 28, +max_pixels=16384 * 28 * 28, +``` +Where 1280 is the maximum value recommended by Qwen for balancing performance, computational resources, and memory, and 16384 is the theoretical maximum value for model input. This setting has a positive effect on some vision tasks that require high resolution (such as document understanding). However, there is no practical basis for considering this setting. If you need to align with the official settings, you can remove these two values, or set them to the following values from the official Qwen demo: + +``` +min_pixels=256 * 28 * 28, +max_pixels=1280 * 28 * 28, +``` + +## Step 2. 
Evaluation + +**New!!!** We integrated a new config system to enable more flexible evaluation settings. Check the [Document](/docs/en/ConfigSystem.md) or run `python run.py --help` for more details 🔥🔥🔥 + +We use `run.py` for evaluation. To use the script, you can use `$VLMEvalKit/run.py` or create a soft-link of the script (to use the script anywhere): + +**Arguments** + +- `--data (list[str])`: Set the dataset names that are supported in VLMEvalKit (names can be found in the codebase README). +- `--model (list[str])`: Set the VLM names that are supported in VLMEvalKit (defined in `supported_VLM` in `vlmeval/config.py`). +- `--mode (str, default to 'all', choices are ['all', 'infer'])`: When `mode` is set to "all", both inference and evaluation are performed; when set to "infer", only inference is performed. +- `--api-nproc (int, default to 4)`: The number of threads for OpenAI API calling. +- `--work-dir (str, default to '.')`: The directory to save evaluation results. + +**Command for Evaluating Image Benchmarks** + +You can run the script with `python` or `torchrun`: + +```bash +# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior). +# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct). + +# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference and Evaluation +python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose +# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference only +python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose --mode infer + +# When running with `torchrun`, one VLM instance is instantiated on each GPU. It can speed up the inference. +# However, that is only suitable for VLMs that consume small amounts of GPU memory. + +# IDEFICS-9B-Instruct, Qwen-VL-Chat, mPLUG-Owl2 on MMBench_DEV_EN, MME, and SEEDBench_IMG. 
On a node with 8 GPUs. Inference and Evaluation. +torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct qwen_chat mPLUG-Owl2 --verbose +# Qwen-VL-Chat on MME. On a node with 2 GPUs. Inference and Evaluation. +torchrun --nproc-per-node=2 run.py --data MME --model qwen_chat --verbose +``` + +**Command for Evaluating Video Benchmarks** + +```bash +# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior). +# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct). + +# IDEFICS2-8B on MMBench-Video, with 8 frames as inputs and vanilla evaluation. On a node with 8 GPUs. MMBench_Video_8frame_nopack is a defined dataset setting in `vlmeval/dataset/video_dataset_config.py`. +torchrun --nproc-per-node=8 run.py --data MMBench_Video_8frame_nopack --model idefics2_8 +# GPT-4o (API model) on MMBench-Video, with 1 frame per second as inputs and pack evaluation (all questions of a video in a single query). +python run.py --data MMBench_Video_1fps_pack --model GPT4o +``` + +The evaluation results will be printed as logs. Besides, **Result Files** will also be generated in the directory `$YOUR_WORKING_DIRECTORY/{model_name}`. Files ending with `.csv` contain the evaluated metrics. + +### Frequently Asked Questions + +#### Constructing Input Prompt: The `build_prompt()` Function +If you find that the model's output does not match the expected results when evaluating a specific benchmark, it could be due to the model not constructing the input prompt correctly. + +In VLMEvalKit, each `dataset` class includes a function named `build_prompt()`, which is responsible for formatting input questions. Different benchmarks can either customize their own `build_prompt()` function or use the default implementation. 
+ +For instance, when handling the default [Multiple-Choice QA](https://github.com/open-compass/VLMEvalKit/blob/43af13e052de6805a8b08cd04aed5e0d74f82ff5/vlmeval/dataset/image_mcq.py#L164), the `ImageMCQDataset.build_prompt()` method combines elements such as `hint`, `question`, and `options` (if present in the dataset) into a complete question format, as shown below: + +``` +HINT +QUESTION +Options: +A. Option A +B. Option B +··· +Please select the correct answer from the options above. +``` + +Additionally, since different models may have varying evaluation requirements, VLMEvalKit also supports customizing the prompt construction method at the model level through `model.build_prompt()`. For an example, you can refer to [InternVL](https://github.com/open-compass/VLMEvalKit/blob/43af13e052de6805a8b08cd04aed5e0d74f82ff5/vlmeval/vlm/internvl_chat.py#L324). + +**Note: If both `model.build_prompt()` and `dataset.build_prompt()` are defined, `model.build_prompt()` will take precedence over `dataset.build_prompt()`, effectively overriding it.** + +Some models, such as Qwen2VL and InternVL, define extensive prompt-building methods for various types of benchmarks. To provide more flexibility in adapting to different benchmarks, VLMEvalKit allows users to customize the `model.use_custom_prompt()` function within the model. By adding or modifying the `use_custom_prompt()` function, you can decide which benchmarks should utilize the model's custom prompt logic. 
Below is an example: + +```python +def use_custom_prompt(self, dataset: str) -> bool: + from vlmeval.dataset import DATASET_TYPE, DATASET_MODALITY + dataset_type = DATASET_TYPE(dataset, default=None) + if not self._use_custom_prompt: + return False + if listinstr(['MMVet'], dataset): + return True + if dataset_type == 'MCQ': + return True + if DATASET_MODALITY(dataset) == 'VIDEO': + return False + return False +``` +Only when the `use_custom_prompt()` function returns `True` will VLMEvalKit call the model's `build_prompt()` function for the current benchmark. +With this approach, you can flexibly control which benchmarks use the model's custom prompt logic based on your specific needs, thereby better adapting to different models and tasks. + +#### Model Splitting + +Currently, VLMEvalKit automatically supports GPU resource allocation and model splitting between processes on the same machine. This feature is supported when the inference backend is `lmdeploy` or `transformers`, with the following behaviors: + +- When launching with `python` command, the model is by default allocated to all available GPUs. If you want to specify which GPUs to use, you can use `CUDA_VISIBLE_DEVICES` environment variable. +- When starting with `torchrun` command, each model instance will be allocated to `N_GPU // N_PROC` GPUs, where `N_PROC` is the number of processes specified by the `--nproc-per-node` parameter in the torchrun command. The value of `N_GPU` is determined as follows: + - If `CUDA_VISIBLE_DEVICES` environment variable is not set, `N_GPU` will be the total number of available GPUs. + - If `CUDA_VISIBLE_DEVICES` environment variable is set, `N_GPU` will be the number of GPUs specified by the `CUDA_VISIBLE_DEVICES` environment variable, and only the specified GPUs will be utilized. 
+Below are specific examples of running evaluation tasks on a machine equipped with 8 GPUs: + +```bash + +torchrun --nproc-per-node=2 run.py --data MMBench_DEV_EN --model InternVL3-78B + +python run.py --data MMBench_DEV_EN --model InternVL3-78B + +CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 torchrun --nproc-per-node=3 run.py --data MMBench_DEV_EN --model InternVL3-38B +``` + +PS: The feature is not compatible with `vllm` backend. When you evaluate a model with `vllm` backend, please use `python` to launch, and all visible GPU devices will be used. + +#### Performance Discrepancies + +Model performance may vary across different environments. As a result, you might observe discrepancies between your evaluation results and those listed on the official VLMEvalKit leaderboard. These differences could be attributed to variations in versions of libraries such as `transformers`, `cuda`, and `torch`. + +Besides, if you encounter unexpected performance, we recommend first reviewing the local generation records (`{model}_{dataset}.xlsx`) or the evaluation records (`{model}_{dataset}_{judge_model}.xlsx`). This may help you better understand the evaluation outcomes and identify potential issues. + +## Deploy a local language model as the judge / choice extractor +The default setting mentioned above uses OpenAI's GPT as the judge LLM. However, you can also deploy a local judge LLM with [LMDeploy](https://github.com/InternLM/lmdeploy). + +First install: +``` +pip install lmdeploy openai +``` + +And then deploy a local judge LLM with the single line of code. LMDeploy will automatically download the model from Huggingface. 
Assuming we use internlm2-chat-1_8b as the judge, port 23333, and the key sk-123456 (the key must start with "sk-" and follow with any number you like): +``` +lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333 +``` + +You need to get the model name registered by LMDeploy with the following python code: +``` +from openai import OpenAI +client = OpenAI( + api_key='sk-123456', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +``` + +Now set some environment variables to tell VLMEvalKit how to use the local judge LLM. As mentioned above, you can also set them in `$VLMEvalKit/.env` file: +``` +OPENAI_API_KEY=sk-123456 +OPENAI_API_BASE=http://0.0.0.0:23333/v1/chat/completions +LOCAL_LLM= +``` + +Finally, you can run the commands in step 2 to evaluate your VLM with the local judge LLM. + +Note that + +- If you hope to deploy the judge LLM in a single GPU and evaluate your VLM on other GPUs because of limited GPU memory, try `CUDA_VISIBLE_DEVICES=x` like +``` +CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333 +CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc-per-node=3 run.py --data HallusionBench --model qwen_chat --verbose +``` +- If the local judge LLM is not good enough in following the instructions, the evaluation may fail. Please report such failures (e.g., by issues). +- It's possible to deploy the judge LLM in different ways, e.g., use a private LLM (not from HuggingFace) or use a quantized LLM. Please refer to the [LMDeploy doc](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html). You can use any other deployment framework if they support OpenAI API. 
+ + +### Using LMDeploy to Accelerate Evaluation and Inference + +You can refer to this [doc](/docs/en/EvalByLMDeploy.md) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/css/readthedocs.css b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/css/readthedocs.css new file mode 100644 index 00000000..c83beffd --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/css/readthedocs.css @@ -0,0 +1,63 @@ +.header-logo { + background-image: url("../image/logo.svg"); + background-size: 275px 80px; + height: 80px; + width: 275px; +} + + +@media screen and (min-width: 1100px) { + .header-logo { + top: -25px; + } +} + +pre { + white-space: pre; +} + +@media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } +} + + +article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; +} + +/* Disable the change in tables */ +article.pytorch-article section table code { + padding: unset; + background-color: unset; + border-radius: unset; +} + +table.autosummary td { + width: 50% +} + +img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +article.pytorch-article p.rubric { + font-weight: bold; +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/image/logo.svg b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/image/logo.svg new file mode 100644 index 00000000..04353057 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/image/logo.svg @@ -0,0 +1,24 @@ + + + +Created with Fabric.js 5.3.0 + + + + + + + + + + + + + VLMEvalKit + diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/image/logo_icon.svg 
b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/image/logo_icon.svg new file mode 100644 index 00000000..c46dd3b5 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/image/logo_icon.svg @@ -0,0 +1,31 @@ + + + + + + + + + + + + + + + + + + + diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/js/custom.js b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/js/custom.js new file mode 100644 index 00000000..84da69d4 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_static/js/custom.js @@ -0,0 +1,10 @@ +var collapsedSections = []; + +$(document).ready(function () { + $('.model-summary').DataTable({ + "stateSave": false, + "lengthChange": false, + "pageLength": 20, + "order": [] + }); +}); diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/404.html b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/404.html new file mode 100644 index 00000000..64910175 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/404.html @@ -0,0 +1,18 @@ +{% extends "layout.html" %} + +{% block body %} + +

Page Not Found

+

+ The page you are looking for cannot be found. +

+

+ If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for it in + the table of contents on the left, or go to the homepage. +

+ + +{% endblock %} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/autosummary/class.rst b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/autosummary/class.rst new file mode 100644 index 00000000..4c3a7a9a --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/autosummary/class.rst @@ -0,0 +1,13 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :members: + +.. + autogenerated from _templates/autosummary/class.rst + note it does not have :inherited-members: diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/callable.rst b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/callable.rst new file mode 100644 index 00000000..3a7b9d2b --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/_templates/callable.rst @@ -0,0 +1,14 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :members: + :special-members: __call__ + +.. + autogenerated from _templates/callable.rst + note it does not have :inherited-members: diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/conf.py b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/conf.py new file mode 100644 index 00000000..e1d103e1 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/conf.py @@ -0,0 +1,234 @@ +# flake8: noqa +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
def get_version():
    """Return the version string declared in ``version_file``.

    The file is read as text and parsed with :mod:`ast`; it is never
    imported, so none of the package's own imports are executed while the
    documentation is being configured.

    Both plain (``__version__ = '1.0'``) and annotated
    (``__version__: str = '1.0'``) top-level assignments are recognised.

    Returns:
        str: The string literal assigned to the top-level ``__version__``.

    Raises:
        ValueError: If no top-level assignment of a string literal to
            ``__version__`` is found.
    """
    # Explicit encoding: the default depends on the build machine's locale.
    with open(version_file, 'r', encoding='utf-8') as f:
        file_content = f.read()
    # Parse the file content into an abstract syntax tree (AST)
    tree = ast.parse(file_content, filename=version_file)

    # Iterate through the body of the AST, looking for an assignment to __version__
    for node in tree.body:
        if isinstance(node, ast.Assign):
            targets = node.targets
        elif isinstance(node, ast.AnnAssign):
            targets = [node.target]
        else:
            continue
        names_version = any(
            isinstance(target, ast.Name) and target.id == '__version__'
            for target in targets
        )
        value = node.value  # None for a bare annotation such as ``__version__: str``
        # ``ast.Constant.value`` replaces the deprecated ``.s`` alias, which
        # newer Python releases no longer provide.
        if names_version and isinstance(value, ast.Constant) and isinstance(value.value, str):
            return value.value
    raise ValueError('__version__ not found')
+root_doc = 'index' +html_context = { + 'github_version': 'latest', +} +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# yapf: disable +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-compass/VLMEvalKit' + }, + ], + # Specify the language of shared menu + 'menu_lang': 'en', + # Disable the default edit on GitHub + 'default_edit_on_github': False, +} +# yapf: enable + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'vlmevalkitdoc' + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). 
+ # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author, + 'manual'), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author], + 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author, + 'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.', + 'Miscellaneous'), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# set priority when building html +StandaloneHTMLBuilder.supported_image_types = [ + 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg' +] + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Auto-generated header anchors +myst_heading_anchors = 3 +# Enable "colon_fence" extension of myst. 
+myst_enable_extensions = ['colon_fence', 'dollarmath'] + +# Configuration for intersphinx +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable', None), + 'torch': ('https://pytorch.org/docs/stable/', None), + 'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None), + 'transformers': + ('https://huggingface.co/docs/transformers/main/en/', None), +} +napoleon_custom_sections = [ + # Custom sections for data elements. + ('Meta fields', 'params_style'), + ('Data fields', 'params_style'), +] + +# Disable docstring inheritance +autodoc_inherit_docstrings = False +# Mock some imports during generate API docs. +autodoc_mock_imports = ['rich', 'attr', 'einops'] +# Disable displaying type annotations, these can be very verbose +autodoc_typehints = 'none' + +# The not found page +notfound_template = '404.html' diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/docutils.conf b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/docutils.conf new file mode 100644 index 00000000..0c00c846 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/docutils.conf @@ -0,0 +1,2 @@ +[html writers] +table_style: colwidths-auto diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/index.rst b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/index.rst new file mode 100644 index 00000000..425c7de4 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/en/index.rst @@ -0,0 +1,41 @@ +Welcome to the VLMEvalKit Tutorial! +========================================== + +VLMEvalKit Getting Started Guide +------------------------------- + +To help users get started quickly, we recommend the following process: + +- For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process. 
+ +- If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial." + +We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit! + +.. _Start Your First Step: +.. toctree:: + :maxdepth: 1 + :caption: Start Your First Step + + Quickstart.md + +.. _Advanced Tutorial: +.. toctree:: + :maxdepth: 1 + :caption: Advanced Tutorial + + Development.md + ConfigSystem.md + +.. _Other Notes: +.. toctree:: + :maxdepth: 1 + :caption: Other Notes + + Contributors.md + +Index and Tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/ja/README_ja.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/ja/README_ja.md new file mode 100644 index 00000000..5bf9564b --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/ja/README_ja.md @@ -0,0 +1,117 @@ +
+ +![LOGO](http://opencompass.openxlab.space/utils/MMLB.jpg) + +VLMEvalKit: 大規模視覚言語モデルの評価ツールキット + +[![][github-contributors-shield]][github-contributors-link] • [![][github-forks-shield]][github-forks-link] • [![][github-stars-shield]][github-stars-link] • [![][github-issues-shield]][github-issues-link] • [![][github-license-shield]][github-license-link] + +[English](/README.md) | [简体中文](/docs/zh-CN/README_zh-CN.md) | 日本語 + +🏆 OpenCompass Leaderboard • +📊Datasets & Models • +🏗️Quickstart • +🛠️Development • +🎯Goal • +🖊️Citation + +🤗 HF Leaderboard • +🤗 Evaluation Records • +🔊 Discord Channel • +📝 Technical Report +
+ +**VLMEvalKit**(pythonパッケージ名は**vlmeval**)は、**大規模視覚言語モデル(LVLMs)**の**オープンソース評価ツールキット**です。このツールキットは、複数のリポジトリでのデータ準備という重労働なしに、さまざまなベンチマークでLVLMsの**ワンコマンド評価**を可能にします。VLMEvalKitでは、すべてのLVLMsに対して**生成ベースの評価**を採用し、**正確なマッチング**と**LLMベースの回答抽出**の両方で得られた評価結果を提供します。 + +PS: 日本語の README には最新のアップデートがすべて含まれていない場合があります。英語版をご確認ください。 + +## 📊 データセット、モデル、および評価結果 + +**公式のマルチモーダルリーダーボードでのパフォーマンス数値は、ここからダウンロードできます!** + +[**OpenVLM Leaderboard**](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard): [すべての詳細な結果をダウンロード](http://opencompass.openxlab.space/assets/OpenVLM.json)。 + +**Supported Benchmarks** in [**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb) を確認して、すべてのサポートされているベンチマーク(70以上)を表示してください。 + +**Supported LMMs** in [**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb) を確認して、すべてのサポートされている LMMs(200以上)を表示してください。 + +**Transformersバージョンの推奨事項:** + +特定のtransformerバージョンで一部のVLMが実行できない可能性があることに注意してください。各VLMを評価するために、以下の設定を推奨します: + +- **`transformers==4.33.0`を使用してください**: `Qwenシリーズ`, `Monkeyシリーズ`, `InternLM-XComposerシリーズ`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICSシリーズ`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4シリーズ`, `InstructBLIPシリーズ`, `PandaGPT`, `VXVERSE`, `GLM-4v-9B`. +- **`transformers==4.37.0`を使用してください**: `LLaVAシリーズ`, `ShareGPT4Vシリーズ`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLMシリーズ`, `EMU2シリーズ`, `Yi-VLシリーズ`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VLシリーズ`, `InternVLシリーズ`, `Cambrianシリーズ`, `VILA-VLシリーズ`. +- **`transformers==4.40.0`を使用してください**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. +- **`transformers==4.42.0`を使用してください**: `AKI`. +- **`transformers==latest`を使用してください**: `LLaVA-Nextシリーズ`, `PaliGemma-3B`, `Chameleon-VLシリーズ`, `Video-LLaVA-7B-HF`, `Ovis1.5シリーズ`, `Mantisシリーズ`, `MiniCPM-V2.6`. 
+ +```python +# デモ +from vlmeval.config import supported_VLM +model = supported_VLM['idefics_9b_instruct']() +# 単一画像のフォワード +ret = model.generate(['assets/apple.jpg', 'この画像には何がありますか?']) +print(ret) # この画像には葉がついた赤いリンゴがあります。 +# 複数画像のフォワード +ret = model.generate(['assets/apple.jpg', 'assets/apple.jpg', '提供された画像にはリンゴが何個ありますか?']) +print(ret) # 提供された画像にはリンゴが2個あります。 +``` + +## 🏗️ クイックスタート + +クイックスタートガイドについては、[クイックスタート](/docs/en/Quickstart.md)を参照してください。 + +## 🛠️ 開発ガイド + +カスタムベンチマーク、VLMsを開発するか、単に**VLMEvalKit**に他のコードを貢献する場合は、[開発ガイド](/docs/en/Development.md)を参照してください。 + +コミュニティからの共有を奨励し、それに応じたクレジットを共有するために、次回のレポート更新では以下のことを実施します: + +- 全ての貢献に対して感謝の意を示します +- 新しいモデル、評価セット、または主要な機能への3つ以上の主要な貢献を持つ貢献者は、テクニカルレポートの著者リストに加わることができます。適格な貢献者は、issueを作成するか、または[VLM評価キット ディスコードチャンネル](https://discord.com/invite/evDT4GZmxN)で kennyutc にDMを送ることができます。私たちはそれに応じてフォローアップします。 + +## 🎯 VLMEvalKitの目標 + +**このコードベースは以下を目的として設計されています:** + +1. 研究者や開発者が既存のLVLMsを評価し、評価結果を**簡単に再現できるようにする**ための**使いやすい**、**オープンソースの評価ツールキット**を提供します。 +2. VLMの開発者が自分のモデルを簡単に評価できるようにします。複数のサポートされているベンチマークでVLMを評価するには、単一の`generate_inner()`関数を**実装するだけで**、他のすべてのワークロード(データのダウンロード、データの前処理、予測の推論、メトリックの計算)はコードベースによって処理されます。 + +**このコードベースは以下を目的として設計されていません:** + +1. すべての**第三者ベンチマーク**の元の論文で報告された正確な精度数値を再現すること。その理由は2つあります: + 1. VLMEvalKitは、すべてのVLMに対して**生成ベースの評価**を使用します(オプションで**LLMベースの回答抽出**を使用)。一方、一部のベンチマークは異なるアプローチを使用する場合があります(SEEDBenchはPPLベースの評価を使用します)。これらのベンチマークについては、対応する結果で両方のスコアを比較します。開発者には、コードベースで他の評価パラダイムをサポートすることをお勧めします。 + 2. デフォルトでは、すべてのVLMに対して同じプロンプトテンプレートを使用してベンチマークを評価します。一方、**一部のVLMには特定のプロンプトテンプレートがある**場合があります(現時点ではコードベースでカバーされていない場合があります)。VLMの開発者には、現在カバーされていない場合でも、VLMEvalKitで独自のプロンプトテンプレートを実装することをお勧めします。これにより、再現性が向上します。 + +## 🖊️ 引用 + +この作業が役立つ場合は、このリポジトリに**スター🌟**を付けてください。サポートありがとうございます! 
+ +[![Stargazers repo roster for @open-compass/VLMEvalKit](https://reporoster.com/stars/open-compass/VLMEvalKit)](https://github.com/open-compass/VLMEvalKit/stargazers) + +研究でVLMEvalKitを使用する場合、または公開されたオープンソースの評価結果を参照する場合は、以下のBibTeXエントリと、使用した特定のVLM/ベンチマークに対応するBibTexエントリを使用してください。 + +```bib +@misc{duan2024vlmevalkit, + title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models}, + author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen}, + year={2024}, + eprint={2407.11691}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2407.11691}, +} +``` + +

🔝Top に戻る

+ +[github-contributors-link]: https://github.com/open-compass/VLMEvalKit/graphs/contributors +[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/VLMEvalKit?color=c4f042&labelColor=black&style=flat-square +[github-forks-link]: https://github.com/open-compass/VLMEvalKit/network/members +[github-forks-shield]: https://img.shields.io/github/forks/open-compass/VLMEvalKit?color=8ae8ff&labelColor=black&style=flat-square +[github-issues-link]: https://github.com/open-compass/VLMEvalKit/issues +[github-issues-shield]: https://img.shields.io/github/issues/open-compass/VLMEvalKit?color=ff80eb&labelColor=black&style=flat-square +[github-license-link]: https://github.com/open-compass/VLMEvalKit/blob/main/LICENSE +[github-license-shield]: https://img.shields.io/github/license/open-compass/VLMEvalKit?color=white&labelColor=black&style=flat-square +[github-stars-link]: https://github.com/open-compass/VLMEvalKit/stargazers +[github-stars-shield]: https://img.shields.io/github/stars/open-compass/VLMEvalKit?color=ffcb47&labelColor=black&style=flat-square diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/.readthedocs.yaml b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/.readthedocs.yaml new file mode 100644 index 00000000..b7e46fe3 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/.readthedocs.yaml @@ -0,0 +1,17 @@ +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + +formats: + - epub + +sphinx: + configuration: docs/zh-CN/conf.py + +python: + install: + - requirements: requirements/docs.txt diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/ConfigSystem.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/ConfigSystem.md new file mode 100644 index 00000000..14e8d495 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/ConfigSystem.md @@ -0,0 +1,69 @@ + +# 配置系统 + 
+默认情况下,VLMEvalKit通过在`run.py`脚本中使用`--model`和`--data`参数设置模型名称(在`/vlmeval/config.py`中定义)和数据集名称(在`vlmeval/dataset/__init__.py` 或 `vlmeval/dataset/video_dataset_config.py` 中定义)来启动评估。这种方法在大多数情况下简单且高效,但当用户希望使用不同设置评估多个模型/数据集时,可能不够灵活。 + +为了解决这个问题,VLMEvalKit提供了一个更灵活的配置系统。用户可以在json文件中指定模型和数据集设置,并通过`--config`参数将配置文件的路径传递给`run.py`脚本。以下是一个示例配置json: + +```json +{ + "model": { + "GPT4o_20240806_T00_HIGH": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 0, + "img_detail": "high" + }, + "GPT4o_20240806_T10_Low": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 1.0, + "img_detail": "low" + }, + "GPT4o_20241120": {} + }, + "data": { + "MME-RealWorld-Lite": { + "class": "MMERealWorld", + "dataset": "MME-RealWorld-Lite" + }, + "MMBench_DEV_EN_V11": { + "class": "ImageMCQDataset", + "dataset": "MMBench_DEV_EN_V11" + }, + "MMBench_Video_8frame_nopack":{}, + "Video-MME_16frame_subs": { + "class": "VideoMME", + "dataset": "Video-MME", + "nframe": 16, + "use_subtitle": true + } + } +} +``` + +配置json的解释: + +1. 现在我们支持两个字段:`model`和`data`,每个字段都是一个字典。字典的键是模型/数据集的名称(由用户设置),值是模型/数据集的设置。 +2. 对于`model`中的项目,值是一个包含以下键的字典: + - `class`:模型的类名,应该是`vlmeval/vlm/__init__.py`(开源模型)或`vlmeval/api/__init__.py`(API模型)中定义的类名。 + - 其他kwargs:其他kwargs是模型特定的参数,请参考模型类的定义以获取详细用法。例如,`model`、`temperature`、`img_detail`是`GPT4V`类的参数。值得注意的是,大多数模型类都需要`model`参数。 + - Tip:在位于`vlmeval/config.py`的变量`supported_VLM`中的已经被定义的模型可以作为`model`的键,而不需要填对应的值即可启动。例如,`GPT4o_20240806_T00_HIGH: {}`是等价于`GPT4o_20240806_T00_HIGH: {'class': 'GPT4V', 'model': 'gpt-4o-2024-08-06', 'temperature': 0, 'img_size': -1, 'img_detail': 'high', 'retry': 10, 'verbose': False}`。 +3. 
对于字典`data`,我们建议用户使用官方数据集名称作为键(或键的一部分),因为我们经常根据数据集名称确定后处理/判断设置。对于`data`中的项目,值是一个包含以下键的字典: + - `class`:数据集的类名,应该是`vlmeval/dataset/__init__.py`中定义的类名。 + - 其他kwargs:其他kwargs是数据集特定的参数,请参考数据集类的定义以获取详细用法。通常,大多数数据集类都需要`dataset`参数。大多数视频数据集类都需要 `nframe` 或 `fps` 参数。 + - Tip:在位于`vlmeval/dataset/video_dataset_config.py`的变量`supported_video_dataset`中的已经被定义的数据集可以作为`data`的键,而不需要填对应的值即可启动。例如,`MMBench_Video_8frame_nopack: {}`是等价于`MMBench_Video_8frame_nopack: {'class': 'MMBenchVideo', 'dataset': 'MMBench-Video', 'nframe': 8, 'pack': False}`。 + +将示例配置json保存为`config.json`,您可以通过以下命令启动评估: + +```bash +python run.py --config config.json +``` + +这将在工作目录`$WORK_DIR`下生成以下输出文件(格式为`{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`): + +- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*` +- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*` +- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*` +- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*` +...... 
diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Development.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Development.md new file mode 100644 index 00000000..69db0649 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Development.md @@ -0,0 +1,139 @@ +# 🛠️ 如何在 VLMEvalKit 中实现一个新的 Benchmark 或多模态模型(VLM) + +## 实现一个新的 benchmark + +示例 PR: **添加 Math-Vision Benchmark** ([#292](https://github.com/open-compass/VLMEvalKit/pull/292/files)) + +目前在 VLMEvalKit 中,benchmark 以数据集类的形式呈现,当你新增一个 benchmark 时,你可以选择复用现有的数据集类 (如单选题 benchmark 可复用 `ImageMCQDataset`),或是实现新的数据集类。你的数据集类必须支持以下两种方法 (复用父类或自行实现): + +- `build_prompt(self, line)`: 方法输入 `line` 类型为 int (对应数据 index) 或 `pd.Series` (对应数据原始 record)。方法输出一条 `multi-modal message` 作为多模态模型输入,`multi-modal message` 是一个图文交错的列表,如以下格式 (一图一文): `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`。 +- `evaluate(self, eval_file, **judge_kwargs)`: 方法输入 `eval_file` 为多模态模型的预测结果 (多以 `.xlsx` 格式存在),如 benchmark evaluation 需要大语言模型 (一般为 GPT) 辅助,则 `judge_kwargs` 传入大语言模型的参数。方法输出 benchmark 的评测结果,以 `dict` 或 `pd.DataFrame` 的形式。 + +以下,我们简述新增数据集的通常步骤: + +### 1. TSV 数据文件准备 (图文评测集) + +目前,我们将每一个 benchmark 数据集设置为一个单独的 TSV 文件。在推理过程中,数据文件将从数据集定义的 `DATASET_URL` 链接地址自动下载到 `$LMUData` 中(如果没有明确设置的话,默认路径是 `$HOME/LMUData`)。你可以将准备好的 TSV 文件上传到一个可下载的地址(如:huggingface),或发送给我们 ,我们将帮助上传数据集到服务器中。此外,你也可以在环境变量中自定义设置下载路径 `LMUData=/path/to/your/data`。 + +TSV 文件中的内容组成为: + +| 数据集名称 \ 字段 | index | image | image_path | question | hint | multi-choice
options | answer | category | l2-category | split | +| ---------------------- | ----- | ----- | ---------- | -------- | ---- | ----------------------- | ------ | -------- | ----------- | ----- | +| MMBench_DEV_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| MMBench_TEST_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | +| CCBench | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | | +| SEEDBench_IMG | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | | +| MME | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | | +| MMVet | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | | +| MMMU_DEV_VAL | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | +| COCO_VAL | ✅ | ✅ | | | | | ✅ | | | | +| OCRVQA_[TEST/TESTCORE] | ✅ | ✅ | | ✅ | | | ✅ | | | | +| TextVQA_VAL | ✅ | ✅ | | ✅ | | | ✅ | | | | +| VCR_[EN/ZH]\_[EASY/HARD]_[ALL/500/100] | ✅ | ✅ | | ✅ | | | ✅ | | | | + +
表 1. 支持的数据集的 TSV 字段。
+ +**TSV 中必须字段的介绍:** + +- **index:** 一个整数,`tsv` 中每一行的唯一标识 +- **image:** 图片的 base64 编码,你可以使用 `vlmeval/smp/vlm.py` 中实现的API进行编码和解码: + - 编码:`encode_image_to_base64`(对于PIL Image)/ `encode_image_file_to_base64`(对于图片文件路径) + - 解码:`decode_base64_to_image`(对于PIL Image)/ `decode_base64_to_image_file`(对于图片文件路径) +- **question:** 针对图像所提取出的问题,类型为字符串 +- **answer:** 问题的答案,类型为字符串,Test 集可缺失这一字段 + +### 2. 自定义数据集的 prompt 构建 + +`ImageBaseDataset` 定义了默认的 prompt 格式。如果需要针对数据集添加 prompt,或给模型输入 `Interleave` 的数据格式,可以通过 `build_prompt(line)` 函数实现。该函数输入为,每次给定 TSV 文件中的一行,包含 index, image, question 等内容作为 line。该函数将返回一个多模态消息 `msg` 的字典列表 `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`,包括图片路径和将被输入到 VLMs 的文本 prompt。对于 interleave 类型输入,可以直接将图片路径的字典放置到 image token 位置。 + +### 3. 自定义数据集的指标实现 + +增加对 benchmark 的评测需要自定义一个该数据集的 class 对象,从而实现数据集的指标计算。图文多模态数据集均继承自 `vlmeval/dataset/image_base.py` 中的 `ImageBaseDataset` 对象。其中 `TYPE` 定义了数据集的类型;`DATASET_URL` 为数据集的下载地址;`DATASET_MD5` 为数据集文件的 md5 一致性编码检查。 + +在 class 中**需要实现** `evaluate(eval_file, **judge_kwargs)` 类函数,对自定义的数据集结果进行指标计算和结果输出。函数输入 `eval_file` 为模型预测结果 `{model_name}_{dataset}.xlsx` 的路径。可以通过 `load(eval_file)` 文件将其读取为 panda.DataFrames 类型,其中包含 index, question, answer, category, prediction 等字段。`judge_kwargs` 参数将传递一个评测相关的字典,如:judge 模型的名称,api 请求线程数等。**函数的返回值**为评估完成的准确度等指标,其格式为由 list 组成的字典,并组织成 panda.DataFrames 类型。 + +## 实现一个新的模型 + +示例 PR: **支持 LLaVA-Next-Interleave** ([#294](https://github.com/open-compass/VLMEvalKit/pull/294)) + +**1. 
支持 `generate_inner` API (必须)** + +现有所有的模型都在 `vlmeval/vlm` 中实现。对于一个最基本的模型,你的模型类**应该实现方法** `generate_inner(msgs, dataset=None)`。这个函数将向 VLM 输入一个多模态数据,并返回 VLM 的预测(一个字符串)。可选参数 `dataset` 可以用作模型在不同推理策略之间切换的标志。 + +其中多模态消息 `msgs` 是一个字典列表,每个字典有两个键:类型和值: +- `type`:我们目前支持两种类型,选项是 ["image", "text"]。 +- `value`:当类型为 `text` 时,值是文本消息(一个字符串);当类型为 `image` 时,值可以是图像文件的本地路径,或者是图像的URL。 + +> 目前,一个多模态消息可能包含任意交错的图像和文本。如果你的模型不支持这一点,我们推荐的做法是取第一张图像和连接的文本消息作为模型的输入。你可以在模型的 class 中设置 `INTERLEAVE = False` 并调用 `self.message_to_promptimg(message, dataset=dataset)` 函数来获取你的 prompt 和第一张图片的地址。 + +一些多模态消息的例子: + +```python +IMAGE_PTH = 'assets/apple.jpg' +IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg' +msg1 = [ + dict(type='image', value=IMAGE_PTH), + dict(type='text', value='What is in this image?') +] +msg2 = [ + dict(type='image', value=IMAGE_URL), + dict(type='image', value=IMAGE_URL), + dict(type='text', value='How many apples are there in these images?') +] +response = model.generate(msg1) +``` + +为了方便起见,我们还支持接受字符串列表作为输入。在这种情况下,我们将检查一个字符串是图像路径还是图像 URL,并自动将其转换为 `list[dict]` 格式: + +```python +IMAGE_PTH = 'assets/apple.jpg' +IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg' +msg1 = [IMAGE_PTH, 'What is in this image?'] +msg2 = [IMAGE_URL, IMAGE_URL, 'How many apples are there in these images?'] +response = model.generate(msg1) +``` + +**2. 支持自定义提示词构建 (可选)** + +此外,你的模型可以通过实现两个可选方法来支持自定义提示构建:`use_custom_prompt(dataset)` 和 `build_prompt(line, dataset=None)`。 + +- `use_custom_prompt(dataset)` 将返回一个布尔值,指示模型是否应使用自定义提示构建策略。 +- 如果`use_custom_prompt(dataset)`返回 True,`build_prompt(line, dataset)` 应该为相应的数据集返回一个自定义构建的多模态消息,line 数据是一个包含数据样本所需信息的字典。如果`use_custom_prompt(dataset)` 返回False,则将使用默认的 prompt 构建策略。 + +**3. 支持多轮对话 (可选)** + +你可以通过支持 `chat_inner(message, dataset)` API 为你的模型新增多轮对话功能并兼容多轮对话评测。这个 API 输出一个字符串型回复,`message` 包含一个聊天记录的列表,格式如下: + +```python +# Assume msg1, msg2, msg3, ... 
are multi-modal messages following the previously described format +# `chat_inner` take the following chat history list as input: +message = [ + dict(role='user', content=msg1), + dict(role='assistant', content=msg2), + dict(role='user', content=msg3), + dict(role='assistant', content=msg4), + ...... + dict(role='user', content=msgn), +] +# `message` should contain an odd number of chat utterances, the role of utterances should be interleaved "user" and "assistant", with the role of the last utterance to be "user". +# The chat function will call `chat_inner` +response = model.chat(message) +``` + +### 示例 PRs: + +- 不支持交错的图像和文本,且不使用自定义提示的VLM:[[模型] 支持 glm-4v-9b](https://github.com/open-compass/VLMEvalKit/pull/221) +- 支持交错的图像和文本及自定义提示的VLM:[添加 MiniCPM-Llama3-V-2.5](https://github.com/open-compass/VLMEvalKit/pull/205) +- VLM API:[特征添加 glmv](https://github.com/open-compass/VLMEvalKit/pull/201) + +## 为 VLMEvalKit 贡献代码 + +如果你想为 **VLMEvalKit** 贡献代码,请在提交PR之前进行预提交检查。这有助于保持代码整洁。 + +```bash +# 在VLMEvalKit的目录下,安装预提交 hook: +pip install pre-commit +pre-commit install +pre-commit run --all-files +# 然后提交你的代码。 +``` diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/EvalByLMDeploy.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/EvalByLMDeploy.md new file mode 100644 index 00000000..cdb46c70 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/EvalByLMDeploy.md @@ -0,0 +1,28 @@ +# 使用 LMDeploy 加速评测推理 + +VLMEvalKit 支持测试由 LMDeploy 部署的 VLM 模型,下面以 InternVL2-8B 为例,展示如何测试模型 + +## 第0步 安装 LMDeploy + +```bash +pip install lmdeploy +``` + +其他安装方式可以参考 LMDeploy 的[文档](https://github.com/InternLM/lmdeploy) + +## 第1步 启动推理服务 + +```bash +lmdeploy serve api_server OpenGVLab/InternVL2-8B --model-name InternVL2-8B +``` +> [!IMPORTANT] +> 因为 VLMEvalKit 中的模型对于不同数据集在构建 prompt 时可能有自定义行为,如 InternVL2 对于 HallusionBench 的处理,所以,server 端在启动的时候需要指定 `--model-name`,这样在使用 LMDEploy api 时可以根据名字选择合适的 prompt 构建策略。 +> +> 如果指定了 `--server-port`,需要设置对应的环境变量 `LMDEPLOY_API_BASE` + + +## 第2步 评测 + 
+```bash +python run.py --data MMStar --model InternVL2-8B --verbose --api-nproc 64 +``` diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Makefile b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Makefile new file mode 100644 index 00000000..d4bb2cbb --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Quickstart.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Quickstart.md new file mode 100644 index 00000000..1245235b --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/Quickstart.md @@ -0,0 +1,231 @@ +# 快速开始 + +在运行评测脚本之前,你需要先**配置** VLMs,并正确设置模型路径。然后你可以使用脚本 `run.py` 进行多个VLMs和基准测试的推理和评估。 + +## 第0步 安装和设置必要的密钥 + +**安装** + +```bash +git clone https://github.com/open-compass/VLMEvalKit.git +cd VLMEvalKit +pip install -e . +``` + +**设置密钥** + +要使用 API 模型(如 GPT-4v, Gemini-Pro-V 等)进行推理,或使用 LLM API 作为**评判者或选择提取器**,你需要首先设置 API 密钥。如果你设置了密钥,VLMEvalKit 将使用一个评判 LLM 从输出中提取答案,否则它将使用**精确匹配模式**(在输出字符串中查找 "Yes", "No", "A", "B", "C"...)。**精确匹配模式只能应用于是或否任务和多项选择任务。** + +- 你可以将所需的密钥放在 `$VLMEvalKit/.env` 中,或直接将它们设置为环境变量。如果你选择创建 `.env` 文件,其内容将如下所示: + + ```bash + # .env 文件,将其放置在 $VLMEvalKit 下 + # 专有 VLMs 的 API 密钥 + # QwenVL APIs + DASHSCOPE_API_KEY= + # Gemini w. 
Google Cloud Backends + GOOGLE_API_KEY= + # OpenAI API + OPENAI_API_KEY= + OPENAI_API_BASE= + # StepAI API + STEPAI_API_KEY= + # REKA API + REKA_API_KEY= + # GLMV API + GLMV_API_KEY= + # CongRong API + CW_API_BASE= + CW_API_KEY= + # SenseNova API + SENSENOVA_API_KEY= + # Hunyuan-Vision API + HUNYUAN_SECRET_KEY= + HUNYUAN_SECRET_ID= + # LMDeploy API + LMDEPLOY_API_BASE= + # MiniMax API + MINIMAX_API_KEY= + # 你可以设置一个评估时代理,评估阶段产生的 API 调用将通过这个代理进行 + EVAL_PROXY= + ``` + +- 如果需要使用 API 在对应键值空白处填写上你的密钥。这些 API 密钥将在进行推理和评估时自动加载。 +## 第1步 配置 + +**VLM 配置**:所有 VLMs 都在 `vlmeval/config.py` 中配置。对于某些 VLMs(如 MiniGPT-4、LLaVA-v1-7B),需要额外的配置(在配置文件中配置代码 / 模型权重根目录)。在评估时,你应该使用 `vlmeval/config.py` 中 `supported_VLM` 指定的模型名称来选择 VLM。确保在开始评估之前,你可以成功使用 VLM 进行推理,使用以下命令 `vlmutil check {MODEL_NAME}`。 + +注:对于Qwen-VL系列模型(Qwen-VL, Qwen2-VL, Qwen2.5-VL),vlmeval/config.py 中所指定的像素数量上下界如下: + +``` +min_pixels=1280 * 28 * 28, +max_pixels=16384 * 28 * 28, +``` +其中,1280为Qwen官方为平衡性能、计算资源与内存的推荐最大值,而16384为模型输入的理论最大值。这种设定对于部分需要高分辨率的视觉任务(如文档理解)有着积极的作用。但考虑这一设定并没有实际的依据,如果需要与官方的设定对齐,可以去掉这两个数值,或是设置为以下来自Qwen官方demo的数值: + +``` +min_pixels=256 * 28 * 28, +max_pixels=1280 * 28 * 28, +``` + +## 第2步 评测 + +**新功能!!!** 我们集成了一个新的配置系统,以实现更灵活的评估设置。查看[文档](/docs/zh-CN/ConfigSystem.md)或运行`python run.py --help`了解更多详情 🔥🔥🔥 + +我们使用 `run.py` 进行评估。你可以使用 `$VLMEvalKit/run.py` 或创建脚本的软链接运行(以便在任何地方使用该脚本): + +**参数** + +- `--data (list[str])`: 设置在 VLMEvalKit 中支持的数据集名称(可以在代码库首页的 README 中找到支持的数据集列表) +- `--model (list[str])`: 设置在 VLMEvalKit 中支持的 VLM 名称(在 `vlmeval/config.py` 中的 `supported_VLM` 中定义) +- `--mode (str, 默认值为 'all', 可选值为 ['all', 'infer'])`:当 mode 设置为 "all" 时,将执行推理和评估;当设置为 "infer" 时,只执行推理 +- `--api-nproc (int, 默认值为 4)`: 调用 API 的线程数 +- `--work-dir (str, default to '.')`: 存放测试结果的目录 + +**用于评测图像多模态评测集的命令** + +你可以使用 `python` 或 `torchrun` 来运行脚本: + +```bash +# 使用 `python` 运行时,只实例化一个 VLM,并且它可能使用多个 GPU。 +# 这推荐用于评估参数量非常大的 VLMs(如 IDEFICS-80B-Instruct)。 + +# 在 MMBench_DEV_EN、MME 和 SEEDBench_IMG 上使用 IDEFICS-80B-Instruct 进行推理和评估 +python run.py --data 
MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose +# 在 MMBench_DEV_EN、MME 和 SEEDBench_IMG 上使用 IDEFICS-80B-Instruct 仅进行推理 +python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose --mode infer + +# 使用 `torchrun` 运行时,每个 GPU 上实例化一个 VLM 实例。这可以加快推理速度。 +# 但是,这仅适用于消耗少量 GPU 内存的 VLMs。 + +# 在 MMBench_DEV_EN、MME 和 SEEDBench_IMG 上使用 IDEFICS-9B-Instruct、Qwen-VL-Chat、mPLUG-Owl2。在具有 8 个 GPU 的节点上进行推理和评估。 +torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct qwen_chat mPLUG-Owl2 --verbose +# 在 MME 上使用 Qwen-VL-Chat。在具有 2 个 GPU 的节点上进行推理和评估。 +torchrun --nproc-per-node=2 run.py --data MME --model qwen_chat --verbose +``` + +**用于评测视频多模态评测集的命令** + +```bash +# 使用 `python` 运行时,只实例化一个 VLM,并且它可能使用多个 GPU。 +# 这推荐用于评估参数量非常大的 VLMs(如 IDEFICS-80B-Instruct)。 + +# 在 MMBench-Video 上评测 IDEFCIS2-8B, 视频采样 8 帧作为输入,不采用 pack 模式评测. MMBench_Video_8frame_nopack 是一个定义在 `vlmeval/dataset/video_dataset_config.py` 的数据集设定. +torchrun --nproc-per-node=8 run.py --data MMBench_Video_8frame_nopack --model idefics2_8 +# 在 MMBench-Video 上评测 GPT-4o (API 模型), 视频采样每秒一帧作为输入,采用 pack 模式评测 +python run.py --data MMBench_Video_1fps_pack --model GPT4o +``` + +评估结果将作为日志打印出来。此外,**结果文件**也会在目录 `$YOUR_WORKING_DIRECTORY/{model_name}` 中生成。以 `.csv` 结尾的文件包含评估的指标。 +### 常见问题 +#### 构建输入prompt:`build_prompt()`函数 +如果您在评测某个benchmark时,发现模型输出的结果与预期不符,可能是因为您使用的模型没有正确构建输入prompt。 + +在VLMEvalkit中,每个`dataset`类都包含一个名为`build_prompt()`的函数,用于构建输入问题的格式。不同的benchmark可以选择自定义`build_prompt()`函数,也可以使用默认的实现。 + +例如,在处理默认的[多选题/Multi-Choice QA]([vlmeval/dataset/image_mcq.py](https://github.com/open-compass/VLMEvalKit/blob/43af13e052de6805a8b08cd04aed5e0d74f82ff5/vlmeval/dataset/image_mcq.py#L164))时,`ImageMCQDataset.build_prompt()`类会将`hint`、`question`、`options`等元素(若数据集中包含)组合成一个完整的问题格式,如下所示: +``` +HINT +QUESTION +Options: +A. Option A +B. Option B +··· +Please select the correct answer from the options above. 
+``` + +此外,由于不同模型对评测的需求可能有所不同,VLMEvalkit也支持在模型层面自定义对不同benchmark构建prompt的方法,即`model.build_prompt()`,具体示例可以参考[InternVL](https://github.com/open-compass/VLMEvalKit/blob/43af13e052de6805a8b08cd04aed5e0d74f82ff5/vlmeval/vlm/internvl_chat.py#L324)。 + +**注意:当同时定义了`model.build_prompt()`以及`dataset.build_prompt()`时,`model.build_prompt()`将优先于`dataset.build_prompt()`,即前者会覆盖后者。** + +由于部分模型(如Qwen2VL,InternVL等)对于不同类型的benchmark定义了广泛的prompt构建方法,为了更灵活地适应不同的benchmark,VLMEvalkit支持在模型中自定义`model.use_custom_prompt()`函数。通过添加或者修改`use_custom_prompt()`函数,您可以决定对于哪些benchmark使用模型自定义的`use_custom_prompt()`方法,示例如下: +``` +def use_custom_prompt(self, dataset: str) -> bool: + from vlmeval.dataset import DATASET_TYPE, DATASET_MODALITY + dataset_type = DATASET_TYPE(dataset, default=None) + if not self._use_custom_prompt: + return False + if listinstr(['MMVet'], dataset): + return True + if dataset_type == 'MCQ': + return True + if DATASET_MODALITY(dataset) == 'VIDEO': + return False + return False +``` +仅当`use_custom_prompt()`函数返回`True`时,VLMEvalkit才会对当前benchmark调用模型的`build_prompt()`函数。 +通过这种方式,您可以根据具体需求灵活地控制哪些benchmark使用模型自定义的prompt构建逻辑,从而更好地适配不同模型和任务的需求。 + +#### 模型切分 + +目前 VLMEvalKit 的启动方式自动支持同机上进程间 GPU 资源的划分与模型切分。该功能在推理后端为 `lmdeploy` 或 `transformers` 时被支持,具体行为如下: + +- 基于 `python` 命令启动时,模型默认分配到所有可用的 GPU 上,如想指定使用哪些 GPU,可以使用 `CUDA_VISIBLE_DEVICES` 环境变量。 +- 基于 `torchrun` 命令启动时,每个模型实例会被分配到 `N_GPU // N_PROC` 个 GPU 上,`N_PROC` 为 torchrun 命令中的 `--nproc-per-node` 参数所指定的进程数。`N_GPU` 的取值为: + - 如 `CUDA_VISIBLE_DEVICES` 环境变量未设置,`N_GPU` 为全部可用 GPU 数量。 + - 如 `CUDA_VISIBLE_DEVICES` 环境变量被设置,`N_GPU` 为 `CUDA_VISIBLE_DEVICES` 环境变量所指定的 GPU 数量,并且,仅有指定的 GPU 会被利用。 + +下面提供了,在一台配备 8 块 GPU 的机器上运行评测任务的具体示例: +```bash +# +torchrun --nproc-per-node=2 run.py --data MMBench_DEV_EN --model InternVL3-78B +# +python run.py --data MMBench_DEV_EN --model InternVL3-78B +# +CUDA_VISIBLE_DEVICES=1,2,3,4,5,6 torchrun --nproc-per-node=3 run.py --data MMBench_DEV_EN --model InternVL3-38B +``` + +注:此方式不支持 `vllm` 后端,基于 `vllm` 后端起评测任务时,请用 `python` 
命令启动,默认调用所有可见的 GPU。 + +#### 性能差距 +在不同的运行环境中,模型的性能表现可能会有所差异。因此,在评估过程中,您可能会发现自己的评测结果与VLMEvalKit官方榜单上的结果存在差距。这种差异可能与`transformers`, `cuda`, `torch`等版本的变化有关。 + +此外,对于异常的表现,我们建议您优先查看运行完成后的本地生成记录`{model}_{dataset}.xlsx`或者评估记录`{model}_{dataset}_{judge_model}.xlsx`,这可能会帮助您更好地理解评估结果并发现问题。 + + + +### 部署本地语言模型作为评判 / 选择提取器 +上述默认设置使用 OpenAI 的 GPT 作为评判 LLM。你也可以使用 [LMDeploy](https://github.com/InternLM/lmdeploy) 部署本地评判 LLM。 + +首先进行安装: +``` +pip install lmdeploy openai +``` + +然后可以通过一行代码部署本地评判 LLM。LMDeploy 将自动从 Huggingface 下载模型。假设我们使用 internlm2-chat-1_8b 作为评判,端口为 23333,密钥为 sk-123456(密钥必须以 "sk-" 开头,后跟任意数字): +``` +lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333 +``` + +使用以下 Python 代码获取由 LMDeploy 注册的模型名称: +``` +from openai import OpenAI +client = OpenAI( + api_key='sk-123456', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +``` + +配置对应环境变量,以告诉 VLMEvalKit 如何使用本地评判 LLM。正如上面提到的,也可以在 `$VLMEvalKit/.env` 文件中设置: +``` +OPENAI_API_KEY=sk-123456 +OPENAI_API_BASE=http://0.0.0.0:23333/v1/chat/completions +LOCAL_LLM= +``` + +最后,你可以运行第2步中的命令,使用本地评判 LLM 来评估你的 VLM。 + +**请注意:** + +- 如果你希望将评判 LLM 部署在单独的一个 GPU 上,并且由于 GPU 内存有限而希望在其他 GPU 上评估你的 VLM,可以使用 `CUDA_VISIBLE_DEVICES=x` 这样的方法,例如: +``` +CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333 +CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc-per-node=3 run.py --data HallusionBench --model qwen_chat --verbose +``` +- 如果本地评判 LLM 在遵循指令方面不够好,评估过程可能会失败。请通过 issues 报告此类失败情况。 +- 可以以不同的方式部署评判 LLM,例如使用私有 LLM(而非来自 HuggingFace)或使用量化 LLM。请参考 [LMDeploy doc](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html) 文档。也可以使用其他支持 OpenAI API 框架的方法。 + +### 使用 LMDeploy 加速模型推理 + +可参考[文档](/docs/zh-CN/EvalByLMDeploy.md) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/README_zh-CN.md b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/README_zh-CN.md new file mode 100644 index 00000000..92c526fc --- /dev/null +++ 
b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/README_zh-CN.md @@ -0,0 +1,131 @@ +
+ +![LOGO](http://opencompass.openxlab.space/utils/MMLB.jpg) + +VLMEvalKit: 一种多模态大模型评测工具 + +[![][github-contributors-shield]][github-contributors-link] • [![][github-forks-shield]][github-forks-link] • [![][github-stars-shield]][github-stars-link] • [![][github-issues-shield]][github-issues-link] • [![][github-license-shield]][github-license-link] + +[English](/README.md) | 简体中文 | [日本語](/docs/ja/README_ja.md) + +🏆 OpenCompass 排行榜 • +🏗️ 快速开始 • +📊 数据集和模型 • +🛠️ 开发指南 • +🎯 我们的目标 • +🖊️ 引用 + +🤗 HuggingFace 排行榜 (存档全部性能) • +🤗 原始评测记录 • +🔊 Discord • +📝 技术报告 +
+ +**VLMEvalKit** (python 包名为 **vlmeval**) 是一款专为大型视觉语言模型 (Large Vision-Language Models, LVLMs) 评测而设计的开源工具包。该工具支持在各种基准测试上对大型视觉语言模型进行**一键评估**,无需进行繁重的数据准备工作,让评估过程更加简便。在 VLMEvalKit 中,我们对所有大型视觉语言模型生成的结果进行评测,并提供基于**精确匹配**与基于 **LLM 的答案提取**两种评测结果。 + +## 🆕 更新 + +- **[2025-04-29]** 优化 `torchrun` 启动逻辑:目前 `torchrun` 启动时,若进程数为 M,机器 GPU 卡数为 N,将会自动调整每个进程分配的 GPU 数量为 `N // M`。目前此分配方式适用于 `transformers`, `lmdeploy` 推理后端,`vllm` 推理后端仅支持使用 python 启动 🔥🔥🔥 +- **[2025-02-20]** 支持新模型:**InternVL2.5 series, QwenVL2.5 series, QVQ-72B, Doubao-VL, Janus-Pro-7B, MiniCPM-o-2.6, InternVL2-MPO, LLaVA-CoT, Hunyuan-Standard-Vision, Ovis2, Valley, SAIL-VL, Ross, Long-VITA, EMU3, SmolVLM**。支持新基准:**MMMU-Pro, WeMath, 3DSRBench, LogicVista, VL-RewardBench, CC-OCR, CG-Bench, CMMMU, WorldSense**。请参考[**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb)以获取更多信息。感谢社区的各位贡献者 🔥🔥🔥 +- **[2024-11-21]** 集成了一个新的配置系统,以实现更灵活的评估设置。查看[文档](/docs/zh-CN/ConfigSystem.md)或运行`python run.py --help`了解更多详情 🔥🔥🔥 +- **[2024-11-21]** 支持 **[QSpatial](https://andrewliao11.github.io/spatial_prompt/)**,一个用于定量空间推理的多模态基准(例如,确定大小/距离),感谢 **[andrewliao11](https://github.com/andrewliao11)** 提供官方支持 🔥🔥🔥 +- **[2024-11-21]** 支持 **[MM-Math](https://github.com/kge-sun/mm-math)**,一个包含约6K初中多模态推理数学问题的新多模态数学基准。GPT-4o-20240806在该基准上达到了22.5%的准确率 🔥🔥🔥 +- **[2024-11-16]** 支持 **[OlympiadBench](https://github.com/OpenBMB/OlympiadBench)**,一个多模态基准,包含奥林匹克级别的数学和物理问题 🔥🔥🔥 +- **[2024-11-16]** 支持 **[WildVision](https://huggingface.co/datasets/WildVision/wildvision-bench)**,一个基于多模态竞技场数据的主观多模态基准 🔥🔥🔥 +- **[2024-11-13]** 支持 **[MIA-Bench](https://arxiv.org/abs/2407.01509)**,一个多模态指令跟随基准 🔥🔥🔥 +- **[2024-11-08]** 支持 **[Aria](https://arxiv.org/abs/2410.05993)**,一个多模态原生 MoE 模型,感谢 **[teowu](https://github.com/teowu)** 🔥🔥🔥 +- **[2024-11-04]** 支持 **[WorldMedQA-V](https://www.arxiv.org/abs/2410.12722)**,该基准包含 1000 多个医学 VQA 问题,涵盖巴西、以色列、日本、西班牙等四个国家的语言,以及它们的英文翻译 🔥🔥🔥 + +## 🏗️ 快速开始 + +请参阅[**快速开始**](/docs/zh-CN/Quickstart.md)获取入门指南。 + 
+## 📊 评测结果,支持的数据集和模型 + +### 评测结果 + +**[OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)**: **[下载全部细粒度测试结果](http://opencompass.openxlab.space/assets/OpenVLM.json)**. + +请查看[**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb)中的 **Supported Benchmarks** 标签,以查看所有支持的图像和视频基准(70+)。 + +请查看[**VLMEvalKit Features**](https://aicarrier.feishu.cn/wiki/Qp7wwSzQ9iK1Y6kNUJVcr6zTnPe?table=tblsdEpLieDoCxtb)中的 **Supported LMMs** 标签,以查看所有支持的 LMMs,包括商业 API、开源模型等(200+)。 + +### 其他 + +**Transformers 的版本推荐:** + +**请注意**,某些 VLM 可能无法在某些特定的 transformers 版本下运行,我们建议使用以下设置来评估对应的VLM: + +- **请用** `transformers==4.33.0` **来运行**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`. +- **请用** `transformers==4.37.0 ` **来运行**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`, `Cambrian Series`, `VILA Series`, `Llama-3-MixSenseV1_1`, `Parrot-7B`, `PLLaVA Series`. +- **请用** `transformers==4.40.0 ` **来运行**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `360VL-70B`, `Phi-3-Vision`, `WeMM`. +- **请用** `transformers==4.42.0 ` **来运行**: `AKI`. +- **请用** `transformers==latest` **来运行**: `LLaVA-Next series`, `PaliGemma-3B`, `Chameleon series`, `Video-LLaVA-7B-HF`, `Ovis series`, `Mantis series`, `MiniCPM-V2.6`, `OmChat-v2.0-13B-sinlge-beta`, `Idefics-3`, `GLM-4v-9B`, `VideoChat2-HD`. 
+ +**如何测试一个 VLM 是否可以正常运行:** + +```python +from vlmeval.config import supported_VLM +model = supported_VLM['idefics_9b_instruct']() +# 前向单张图片 +ret = model.generate(['assets/apple.jpg', 'What is in this image?']) +print(ret) # 这张图片上有一个带叶子的红苹果 +# 前向多张图片 +ret = model.generate(['assets/apple.jpg', 'assets/apple.jpg', 'How many apples are there in the provided images? ']) +print(ret) # 提供的图片中有两个苹果 +``` + +## 🛠️ 开发指南 + +要开发自定义评测数据集,支持其他 VLMs,或为 VLMEvalKit 贡献代码,请参阅[**开发指南**](/docs/zh-CN/Development_zh-CN.md)。 + +为激励来自社区的共享并分享相应的 credit,在下一次 report 更新中,我们将: + +- 致谢所有的 contribution +- 具备三个或以上主要贡献 (支持新模型、评测集、或是主要特性) 的贡献者将可以加入技术报告的作者列表 。合条件的贡献者可以创建 issue 或是在 [VLMEvalKit Discord Channel](https://discord.com/invite/evDT4GZmxN) 私信 kennyutc,我们将进行跟进 + +## 🎯 VLMEvalKit 的目标 + +**该代码库的设计目标是:** + +1. 提供一个**易于使用**的**开源评估工具包**,方便研究人员和开发人员评测现有的多模态大模型,并使评测结果**易于复现**。 +2. 使 VLM 开发人员能够轻松地评测自己的模型。在多个支持的基准测试上评估 VLM,只需实现一个 `generate_inner()` 函数,所有其他工作负载(数据下载、数据预处理、预测推理、度量计算)都由代码库处理。 + +**该代码库的设计目标不是:** + +复现所有**第三方基准测试**原始论文中报告的准确数字。有两个相关的原因: +1. VLMEvalKit 对所有 VLMs 使用基于生成的评估(可选使用基于 LLM 的答案提取)。同时,一些基准测试可能官方使用不同的方法(*例如,SEEDBench 使用基于 PPL 的评估*)。对于这些基准测试,我们在相应的结果中比较两个得分。我们鼓励开发人员在代码库中支持其他评估范式。 +2. 默认情况下,我们对所有多模态模型使用相同的提示模板来评估基准测试。同时,**一些多模态模型可能有他们特定的提示模板**(目前可能未在代码库中涵盖)。我们鼓励 VLM 的开发人员在 VLMEvalKit 中实现自己的提示模板,如果目前未覆盖。这将有助于提高可复现性。 + +## 🖊️ 引用 + +如果我们的工作对您有所帮助,请考虑 **star🌟** VLMEvalKit。感谢支持! 
+ +[![Stargazers repo roster for @open-compass/VLMEvalKit](https://reporoster.com/stars/open-compass/VLMEvalKit)](https://github.com/open-compass/VLMEvalKit/stargazers) + +如果您在研究中使用了 VLMEvalKit,或希望参考已发布的开源评估结果,请使用以下 BibTeX 条目以及与您使用的特定 VLM / 基准测试相对应的 BibTex 条目。 + +```bib +@misc{duan2024vlmevalkit, + title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models}, + author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen}, + year={2024}, + eprint={2407.11691}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2407.11691}, +} +``` + +

🔝回到顶部

+ +[github-contributors-link]: https://github.com/open-compass/VLMEvalKit/graphs/contributors +[github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/VLMEvalKit?color=c4f042&labelColor=black&style=flat-square +[github-forks-link]: https://github.com/open-compass/VLMEvalKit/network/members +[github-forks-shield]: https://img.shields.io/github/forks/open-compass/VLMEvalKit?color=8ae8ff&labelColor=black&style=flat-square +[github-issues-link]: https://github.com/open-compass/VLMEvalKit/issues +[github-issues-shield]: https://img.shields.io/github/issues/open-compass/VLMEvalKit?color=ff80eb&labelColor=black&style=flat-square +[github-license-link]: https://github.com/open-compass/VLMEvalKit/blob/main/LICENSE +[github-license-shield]: https://img.shields.io/github/license/open-compass/VLMEvalKit?color=white&labelColor=black&style=flat-square +[github-stars-link]: https://github.com/open-compass/VLMEvalKit/stargazers +[github-stars-shield]: https://img.shields.io/github/stars/open-compass/VLMEvalKit?color=ffcb47&labelColor=black&style=flat-square diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_static/css/readthedocs.css b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_static/css/readthedocs.css new file mode 100644 index 00000000..c83beffd --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_static/css/readthedocs.css @@ -0,0 +1,63 @@ +.header-logo { + background-image: url("../image/logo.svg"); + background-size: 275px 80px; + height: 80px; + width: 275px; +} + + +@media screen and (min-width: 1100px) { + .header-logo { + top: -25px; + } +} + +pre { + white-space: pre; +} + +@media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } +} + + 
var collapsedSections = [];

// Once the DOM is ready, turn every `.model-summary` table into a DataTable:
// no saved state, no page-length selector, 20 rows per page, and no initial
// sort order (rows keep the order they have in the document).
$(function () {
  var tableOptions = {
    "stateSave": false,
    "lengthChange": false,
    "pageLength": 20,
    "order": []
  };
  $('.model-summary').DataTable(tableOptions);
});
b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/404.html @@ -0,0 +1,18 @@ +{% extends "layout.html" %} + +{% block body %} + +

Page Not Found

+

+ The page you are looking for cannot be found. +

+

+ If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for it in + the table of contents on the left, or go to the homepage. +

+ + +{% endblock %} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/autosummary/class.rst b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/autosummary/class.rst new file mode 100644 index 00000000..4c3a7a9a --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/autosummary/class.rst @@ -0,0 +1,13 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :members: + +.. + autogenerated from _templates/autosummary/class.rst + note it does not have :inherited-members: diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/callable.rst b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/callable.rst new file mode 100644 index 00000000..3a7b9d2b --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/_templates/callable.rst @@ -0,0 +1,14 @@ +.. role:: hidden + :class: hidden-section +.. currentmodule:: {{ module }} + + +{{ name | underline}} + +.. autoclass:: {{ name }} + :members: + :special-members: __call__ + +.. + autogenerated from _templates/callable.rst + note it does not have :inherited-members: diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/conf.py b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/conf.py new file mode 100644 index 00000000..92283478 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/conf.py @@ -0,0 +1,242 @@ +# flake8: noqa +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. 
def get_version(path=None):
    """Return the value assigned to ``__version__`` at the top level of a file.

    The file is parsed with :mod:`ast` instead of being imported, so reading
    the version does not execute the package or require its dependencies.

    Args:
        path: Python source file to inspect. Defaults to the module-level
            ``version_file``.

    Returns:
        The literal assigned to ``__version__``.

    Raises:
        ValueError: If the file has no top-level ``__version__`` assignment,
            or the assigned expression is not a literal.
    """
    if path is None:
        path = version_file
    # Python source is UTF-8 by default; do not depend on the locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        file_content = f.read()
    # Parse the file content into an abstract syntax tree (AST)
    tree = ast.parse(file_content, filename=path)

    # Iterate through the body of the AST, looking for an assignment to __version__
    for node in tree.body:
        if isinstance(node, ast.Assign):
            for target in node.targets:
                if isinstance(target, ast.Name) and target.id == '__version__':
                    # `node.value.s` is the deprecated `ast.Str` accessor;
                    # literal_eval reads the constant on every Python 3 version.
                    return ast.literal_eval(node.value)
    raise ValueError('__version__ not found')
+root_doc = 'index' +html_context = { + 'github_version': 'latest', +} +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +# yapf: disable +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-compass/VLMEvalKit' + }, + ], + # Specify the language of shared menu + 'menu_lang': 'cn', + # Disable the default edit on GitHub + 'default_edit_on_github': False, +} +# yapf: enable + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'vlmevalkitdoc' + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). 
+ # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author, + 'manual'), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author], + 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author, + 'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.', + 'Miscellaneous'), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# set priority when building html +StandaloneHTMLBuilder.supported_image_types = [ + 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg' +] + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Auto-generated header anchors +myst_heading_anchors = 3 +# Enable "colon_fence" extension of myst. 
def builder_inited_handler(app):
    """Sphinx ``builder-inited`` hook: run ``cp_origin_docs.sh`` before building.

    The script sits next to this ``conf.py`` and uses paths relative to its own
    directory (``../en/``, ``./``). It is therefore launched with the
    configuration directory as working directory, so the hook also works when
    ``sphinx-build`` is started from somewhere else.

    Args:
        app: The Sphinx application object passed to the event handler.

    The script's exit status is deliberately not checked (best effort).
    """
    # On POSIX a relative executable path is resolved against `cwd`.
    subprocess.run(['./cp_origin_docs.sh'], cwd=app.confdir)
b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/index.rst new file mode 100644 index 00000000..5147a23b --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/docs/zh-CN/index.rst @@ -0,0 +1,49 @@ +欢迎来到 VLMEvalKit 中文教程! +========================================== + +VLMEvalKit 上手路线 +------------------------------- + +为了用户能够快速上手,我们推荐以下流程: + +- 对于想要使用 VLMEvalKit 的用户,我们推荐先阅读 开始你的第一步_ 部分来设置环境,并启动一个迷你实验熟悉流程。 + +- 若您想进行更多模块的自定义,例如增加数据集和模型,我们提供了 进阶教程_ 。 + +我们始终非常欢迎用户的 PRs 和 Issues 来完善 VLMEvalKit! + +.. _快速开始: +.. toctree:: + :maxdepth: 1 + :caption: 快速开始 + + Quickstart.md + + +.. .. _教程: +.. .. toctree:: +.. :maxdepth: 1 +.. :caption: 教程 + +.. user_guides/framework_overview.md + +.. _进阶教程: +.. toctree:: + :maxdepth: 1 + :caption: 进阶教程 + + Development.md + ConfigSystem.md + +.. .. _其他说明: +.. .. toctree:: +.. :maxdepth: 1 +.. :caption: 其他说明 + +.. notes/contribution_guide.md + +索引与表格 +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/requirements.txt b/evaluation/cosmos3/reasoner/vlmevalkit/requirements.txt new file mode 100644 index 00000000..13466c1f --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/requirements.txt @@ -0,0 +1,80 @@ + +absl-py +anls +apted +bert_score +boto3 +cairosvg +colormath +datasets==2.21.0 +decord; platform_machine != 'arm64' +distance +editdistance +emoji +eva-decord; platform_machine == 'arm64' +imageio +immutabledict +ipdb +jieba +json_repair +langdetect +latex2sympy2 +antlr4-python3-runtime==4.9.3 # latex2sympy2 1.5.4 ships an ATN-v3 parser; unpinned floats to 4.13.2 (ATN-v4) and breaks MathVision eval + +# Upstream benchmark dependencies (imported at module load time) +Levenshtein +loguru +lpips +math-verify==0.9.0 # unpinned floats latex2sympy2->1.9.1, backtracking math-verify to 0.1.0 which imports the uninstalled latex2sympy2_extended +matplotlib +nest_asyncio +nltk +num2words +numpy +omegaconf +openai +opencv-python +openpyxl +pandas +pillow +physical-ai-av==0.2.2 
+polygon3 +portalocker +protobuf +pylatexenc + +### Default to Cloud API use case. Choose proper vllm/accelerate versions for local inference. +# vllm +# acceleratepython-box +python-box +python-dotenv +qwen-vl-utils +qwen-vl-utils==0.0.14 +regex +requests +rich +rouge +s3fs +scikit-learn + +scipy +sentence_transformers +sentencepiece +setuptools +spacy +sty +syllapy +tabulate +termcolor +tiktoken +timeout-decorator +timm +torch>=2.0.1 +torchmetrics +torchvision>=0.15.2 +tqdm +transformers==4.57.1 # 5.0 does not work; 4.57.0 is yanked on PyPI +unicodedata2 +validators +xlsxwriter +zss diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/requirements/docs.txt b/evaluation/cosmos3/reasoner/vlmevalkit/requirements/docs.txt new file mode 100644 index 00000000..02587e64 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/requirements/docs.txt @@ -0,0 +1,11 @@ +docutils==0.18.1 +modelindex +myst-parser +-e git+https://github.com/open-compass/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +sphinx==6.1.3 +sphinx-copybutton +sphinx-design +sphinx-notfound-page +sphinx-tabs +sphinxcontrib-jquery +tabulate diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/run.py b/evaluation/cosmos3/reasoner/vlmevalkit/run.py new file mode 100644 index 00000000..cb9a4166 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/run.py @@ -0,0 +1,551 @@ +import argparse +import copy as cp +import datetime +import json +import os +import os.path as osp +import subprocess +from functools import partial + +import pandas as pd +from tabulate import tabulate + + +# GET the number of GPUs on the node without importing libs like torch +def get_gpu_list(): + CUDA_VISIBLE_DEVICES = os.environ.get('CUDA_VISIBLE_DEVICES', '') + if CUDA_VISIBLE_DEVICES != '': + gpu_list = [int(x) for x in CUDA_VISIBLE_DEVICES.split(',')] + return gpu_list + try: + ps = subprocess.Popen(('nvidia-smi', '--list-gpus'), stdout=subprocess.PIPE) + output = subprocess.check_output(('wc', '-l'), stdin=ps.stdout) + 
return list(range(int(output))) + except Exception: + return [] + + +RANK = int(os.environ.get('RANK', 0)) +WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1)) +LOCAL_WORLD_SIZE = int(os.environ.get("LOCAL_WORLD_SIZE", 1)) +LOCAL_RANK = int(os.environ.get("LOCAL_RANK", 1)) + +GPU_LIST = get_gpu_list() +if LOCAL_WORLD_SIZE > 1 and len(GPU_LIST): + NGPU = len(GPU_LIST) + assert NGPU >= LOCAL_WORLD_SIZE, "The number of processes should be less than or equal to the number of GPUs" + GPU_PER_PROC = NGPU // LOCAL_WORLD_SIZE + DEVICE_START_IDX = GPU_PER_PROC * LOCAL_RANK + CUDA_VISIBLE_DEVICES = [str(i) for i in GPU_LIST[DEVICE_START_IDX: DEVICE_START_IDX + GPU_PER_PROC]] + CUDA_VISIBLE_DEVICES = ','.join(CUDA_VISIBLE_DEVICES) + # Set CUDA_VISIBLE_DEVICES + os.environ['CUDA_VISIBLE_DEVICES'] = CUDA_VISIBLE_DEVICES + print( + f'RANK: {RANK}, LOCAL_RANK: {LOCAL_RANK}, WORLD_SIZE: {WORLD_SIZE},' + f'LOCAL_WORLD_SIZE: {LOCAL_WORLD_SIZE}, CUDA_VISIBLE_DEVICES: {CUDA_VISIBLE_DEVICES}' + ) + + +from vlmeval.config import supported_VLM +from vlmeval.dataset import build_dataset +from vlmeval.dataset.video_dataset_config import supported_video_datasets +from vlmeval.inference import infer_data_job +from vlmeval.inference_mt import infer_data_job_mt +from vlmeval.inference_video import infer_data_job_video +from vlmeval.smp import (MMBenchOfficialServer, get_pred_file_format, githash, listinstr, load, + load_env, ls, prepare_reuse_files, proxy_set, setup_logger, timestr) +from vlmeval.utils.result_transfer import MMMU_result_transfer, MMTBench_result_transfer + + +# Make WORLD_SIZE invisible when build models +def build_model_from_config(cfg, model_name, use_vllm=False): + import vlmeval.api + import vlmeval.vlm + ws_bak = os.environ.pop('WORLD_SIZE', None) + + config = cp.deepcopy(cfg[model_name]) + if use_vllm: + config['use_vllm'] = use_vllm + if 'class' not in config: + return supported_VLM[model_name](**config) + cls_name = config.pop('class') + if hasattr(vlmeval.api, 
cls_name): + model = getattr(vlmeval.api, cls_name)(**config) + elif hasattr(vlmeval.vlm, cls_name): + model = getattr(vlmeval.vlm, cls_name)(**config) + else: + raise ValueError(f'Class {cls_name} is not supported in `vlmeval.api` or `vlmeval.vlm`') + + if ws_bak: + os.environ['WORLD_SIZE'] = ws_bak + return model + + +def build_dataset_from_config(cfg, dataset_name): + import inspect + + import vlmeval.dataset + config = cp.deepcopy(cfg[dataset_name]) + if config == {}: + return supported_video_datasets[dataset_name]() + assert 'class' in config + cls_name = config.pop('class') + if hasattr(vlmeval.dataset, cls_name): + cls = getattr(vlmeval.dataset, cls_name) + sig = inspect.signature(cls.__init__) + if 'dataset' in sig.parameters and 'dataset' not in config: + config['dataset'] = dataset_name + has_var_keyword = any( + p.kind == inspect.Parameter.VAR_KEYWORD for p in sig.parameters.values() + ) + if has_var_keyword: + valid_params = config + else: + valid_params = {k: v for k, v in config.items() if k in sig.parameters} + return cls(**valid_params) + else: + raise ValueError(f'Class {cls_name} is not supported in `vlmeval.dataset`') + + +def parse_args(): + help_msg = """\ +You can launch the evaluation by setting either --data and --model or --config. + +--data and --model: + Each Arg should be a list of strings, specifying the names of datasets and models. + To find all supported model names, please refer to the `vlmeval/config.py` of check the output of the command \ + `vlmutil mlist all` in the terminal (you should first have vlmeval installed). + To find all supported dataset names, please refer to the `vlmeval/dataset/__init__.py` file. The python script \ + to print all supported dataset names is as follows: + ```python + from vlmeval.dataset import SUPPORTED_DATASETS + print(SUPPORTED_DATASETS) + ``` + or you can check the output of the command `vlmutil dlist all` in the terminal. 
+ To find all supported video dataset default settings, please refer to the \ + `vlmeval/dataset/video_dataset_config.py` file. + +--config: + Launch the evaluation by specifying the path to the config json file. Sample Json Content: + ```json + { + "model": { + "GPT4o_20240806_T00_HIGH": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 0, + "img_detail": "high" + }, + "GPT4o_20240806_T10_Low": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 1.0, + "img_detail": "low" + }, + "GPT4o_20241120": {} + }, + "data": { + "MME-RealWorld-Lite": { + "class": "MMERealWorld", + "dataset": "MME-RealWorld-Lite" + }, + "MMBench_DEV_EN_V11": { + "class": "ImageMCQDataset", + "dataset": "MMBench_DEV_EN_V11" + }, + "MMBench_Video_8frame_nopack": {}, + "Video-MME_16frame_subs": { + "class": "VideoMME", + "dataset": "Video-MME", + "nframe": 16, + "use_subtitle": true, + } + } + } + ``` + Currently, only `model` and `data` are supported fields. The content of each field is a dictionary. + For `model`, the key is the name of the model, and the value is a dictionary containing the following keys: + - `class`: The class name of the model, which should be a class in `vlmeval.vlm` or `vlmeval.api`. + - Other keys are specific to the model, please refer to the corresponding class. + - Tip: The defined model in the `supported_VLM` of `vlmeval/config.py` can be used as a shortcut. + For `data`, the key is the name of the dataset (should be the same as the `dataset` field in most cases, \ + except for video datasets), and the value is a dictionary containing the following keys: + - `class`: The class name of the dataset, which should be a class in `vlmeval.dataset`. + - `dataset`: The name of the dataset, which should be a string that is accepted by the `dataset` argument of the \ + corresponding class. + - Other keys are specific to the dataset, please refer to the corresponding class. 
+ - Tip: The defined dataset in the `supported_video_datasets` of `vlmeval/dataset/video_dataset_config.py` \ + can be used as a shortcut. + + The keys in the `model` and `data` fields will be used for naming the prediction files and evaluation results. + When launching with `--config`, args for API VLMs, such as `--retry`, `--verbose`, will be ignored. +""" + parser = argparse.ArgumentParser(description=help_msg, formatter_class=argparse.RawTextHelpFormatter) + # Essential Args, Setting the Names of Datasets and Models + parser.add_argument('--data', type=str, nargs='+', help='Names of Datasets') + parser.add_argument('--model', type=str, nargs='+', help='Names of Models') + parser.add_argument('--config', type=str, help='Path to the Config Json File') + # Work Dir + parser.add_argument('--work-dir', type=str, default='./outputs', help='select the output directory') + # Infer + Eval or Infer Only + parser.add_argument('--mode', type=str, default='all', choices=['all', 'infer', 'eval']) + # API Kwargs, Apply to API VLMs and Judge API LLMs + parser.add_argument('--api-nproc', type=int, default=4, help='Parallel API calling') + parser.add_argument('--retry', type=int, default=None, help='retry numbers for API VLMs') + parser.add_argument('--judge-nproc', type=int, default=None, help='Parallel judge API calls (default: api-nproc)') + parser.add_argument('--judge-args', type=str, default=None, help='Judge arguments in JSON format') + # Explicitly Set the Judge Model + parser.add_argument('--judge', type=str, default=None) + # Logging Utils + parser.add_argument('--verbose', action='store_true') + # Configuration for Resume + # Ignore: will not rerun failed VLM inference + parser.add_argument('--ignore', action='store_true', help='Ignore failed indices. 
') + # Reuse: will reuse the existing prediction files + parser.add_argument('--reuse', action='store_true') + # Reuse-aux: if set, when reuse is True, will also reuse the auxiliary evaluation files + parser.add_argument('--reuse-aux', type=int, default=True, help='reuse auxiliary evaluation files') + parser.add_argument( + '--use-vllm', action='store_true', help='use vllm to generate, the flag is only supported in Llama4 for now') + parser.add_argument('--use-verifier', action='store_true', help='use verifier to evaluate') + parser.add_argument('--save-eval-results', action='store_true', help='save eval results to json') + + args = parser.parse_args() + return args + + +def _save_df_eval_results(eval_results: pd.DataFrame, filepath: str) -> None: + with open(filepath, "w") as outfile: + outfile.write(eval_results.to_json(orient="split")) + + +def _save_dict_eval_results(eval_results: dict, filepath: str) -> None: + with open(filepath, "w") as outfile: + json.dump(eval_results, outfile) + + +def main(): + args = parse_args() + use_config, cfg = False, None + if args.config is not None: + assert args.data is None and args.model is None, '--data and --model should not be set when using --config' + use_config, cfg = True, load(args.config) + args.model = list(cfg['model'].keys()) + args.data = list(cfg['data'].keys()) + else: + assert len(args.data), '--data should be a list of data files' + + if 'MMEVAL_ROOT' in os.environ: + args.work_dir = os.environ['MMEVAL_ROOT'] + + date, commit_id = timestr('day'), githash(digits=8) + eval_id = f"T{date}_G{commit_id}" + logger = setup_logger(log_file=os.path.join(args.work_dir, 'logs', f'{eval_id}_{timestr()}.log')) + + if RANK == 0: + if not args.reuse: + logger.warning('--reuse is not set, will not reuse previous (before one day) temporary files') + else: + logger.warning('--reuse is set, will reuse the latest prediction & temporary pickle files') + + if not use_config: + for k, v in supported_VLM.items(): + if hasattr(v, 
'keywords') and 'retry' in v.keywords and args.retry is not None: + v.keywords['retry'] = args.retry + supported_VLM[k] = v + if hasattr(v, 'keywords') and 'verbose' in v.keywords and args.verbose is not None: + v.keywords['verbose'] = args.verbose + supported_VLM[k] = v + + # If FWD_API is set, will use class `GPT4V` for all API models in the config + if os.environ.get('FWD_API', None) == '1': + from vlmeval.api import GPT4V + from vlmeval.config import api_models as supported_APIs + for m in args.model: + if m in supported_APIs: + kws = supported_VLM[m].keywords + supported_VLM[m] = partial(GPT4V, **kws) + logger.warning(f'FWD_API is set, will use class `GPT4V` for {m}') + + if WORLD_SIZE > 1: + import torch.distributed as dist + dist.init_process_group( + backend='nccl', + timeout=datetime.timedelta(seconds=int(os.environ.get('DIST_TIMEOUT', 3600))) + ) + + for _, model_name in enumerate(args.model): + logger.info(f'=========== {model_name} ===========') + model = None + date, commit_id = timestr('day'), githash(digits=8) + eval_id = f"T{date}_G{commit_id}" + + pred_root = osp.join(args.work_dir, model_name, eval_id) + pred_root_meta = osp.join(args.work_dir, model_name) + os.makedirs(pred_root_meta, exist_ok=True) + + prev_pred_roots = ls(osp.join(args.work_dir, model_name), mode='dir') + if len(prev_pred_roots) and args.reuse: + prev_pred_roots.sort() + + if not osp.exists(pred_root): + os.makedirs(pred_root, exist_ok=True) + + if use_config: + model = build_model_from_config(cfg['model'], model_name, args.use_vllm) + + for _, dataset_name in enumerate(args.data): + logger.info(f'----------- {dataset_name} -----------') + if WORLD_SIZE > 1: + dist.barrier() + + try: + pred_format = get_pred_file_format() + result_file_base = f'{model_name}_{dataset_name}.{pred_format}' + + if use_config: + if WORLD_SIZE > 1: + if RANK == 0: + dataset = build_dataset_from_config(cfg['data'], dataset_name) + dist.barrier() + dataset = build_dataset_from_config(cfg['data'], 
dataset_name) + if dataset is None: + logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ') + continue + else: + dataset_kwargs = {} + if dataset_name in ['MMLongBench_DOC', 'DUDE', 'DUDE_MINI', 'SLIDEVQA', 'SLIDEVQA_MINI']: + dataset_kwargs['model'] = model_name + + # If distributed, first build the dataset on the main process for doing preparation works + if WORLD_SIZE > 1: + if RANK == 0: + dataset = build_dataset(dataset_name, **dataset_kwargs) + dist.barrier() + + dataset = build_dataset(dataset_name, **dataset_kwargs) + if dataset is None: + logger.error(f'Dataset {dataset_name} is not valid, will be skipped. ') + continue + + # Handling Multi-Turn Dataset + result_file = osp.join(pred_root, result_file_base) + # Reuse the previous prediction file if exists + if RANK == 0 and len(prev_pred_roots): + prepare_reuse_files( + pred_root_meta=pred_root_meta, eval_id=eval_id, model_name=model_name, + dataset_name=dataset_name, reuse=args.reuse, reuse_aux=args.reuse_aux + ) + + if WORLD_SIZE > 1: + dist.barrier() + + if model is None: + model = model_name # which is only a name + + if args.mode != "eval": + # Perform the Inference + if dataset.MODALITY == 'VIDEO': + model = infer_data_job_video( + model, + work_dir=pred_root, + model_name=model_name, + dataset=dataset, + result_file_name=result_file_base, + verbose=args.verbose, + api_nproc=args.api_nproc, + use_vllm=args.use_vllm) + elif dataset.TYPE == 'MT': + model = infer_data_job_mt( + model, + work_dir=pred_root, + model_name=model_name, + dataset=dataset, + verbose=args.verbose, + api_nproc=args.api_nproc, + ignore_failed=args.ignore, + use_vllm=args.use_vllm) + else: + model = infer_data_job( + model, + work_dir=pred_root, + model_name=model_name, + dataset=dataset, + verbose=args.verbose, + api_nproc=args.api_nproc, + ignore_failed=args.ignore, + use_vllm=args.use_vllm) + + # Set the judge kwargs first before evaluation or dumping + + judge_kwargs = { + 'nproc': args.judge_nproc if 
args.judge_nproc is not None else args.api_nproc, + 'verbose': args.verbose, + 'retry': args.retry if args.retry is not None else 3, + **(json.loads(args.judge_args) if args.judge_args else {}), + } + + if args.retry is not None: + judge_kwargs['retry'] = args.retry + if args.judge is not None: + judge_kwargs['model'] = args.judge + else: + print(dataset_name) + if dataset.TYPE in ['MCQ', 'Y/N', 'MCQ_MMMU_Pro'] or listinstr( + ['moviechat1k', 'mme-reasoning'], dataset_name.lower() + ): + if listinstr(['WeMath', 'MME-Reasoning'], dataset_name): + judge_kwargs['model'] = 'gpt-4o-mini' + elif listinstr(['VisualPuzzles'], dataset_name): + judge_kwargs['model'] = 'exact_matching' + elif listinstr(['PuzzleVQA'], dataset_name): + judge_kwargs['model'] = 'exact_matching' + elif listinstr(['VisuLogic'], dataset_name): + judge_kwargs['model'] = 'exact_matching' + else: + judge_kwargs['model'] = 'chatgpt-0125' + elif listinstr(['MMVet', 'LLaVABench', 'MMBench_Video'], dataset_name): + if listinstr(['LLaVABench_KO'], dataset_name): + judge_kwargs['model'] = 'gpt-4o-0806' + else: + judge_kwargs['model'] = 'gpt-4-turbo' + elif listinstr(['VGRPBench'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'LENS', 'DynaMath', 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning', 'VTCBench', 'Asclepius', 'MMSafetyBench', 'MSSBench', 'SIUO', 'SIUO_GEN', 'XSTest', 'Flames'], dataset_name): # noqa: E501 + judge_kwargs['model'] = 'gpt-4o-mini' + elif listinstr(['OlympiadBench'], dataset_name): + use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False) + if use_api_judger: + judge_kwargs['model'] = 'gpt-4o-mini' + elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', 'WildVision', 'MMAlignBench', 'MM-IFEval'], dataset_name): # noqa: E501 + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['ChartMimic'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['VDC'], dataset_name): + 
judge_kwargs['model'] = 'llama31-8b' + elif listinstr(['Video_MMLU_QA', 'Video_MMLU_CAP'], dataset_name): + judge_kwargs['model'] = 'qwen-72b' + elif listinstr(['MMVMBench'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['CVQA_EN', 'CVQA_LOC'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1' + elif listinstr(['M4Bench'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['AyaVisionBench'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1' + elif listinstr(['MathCanvas'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1-2025-04-14' + elif listinstr(['MMReason'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1', + elif listinstr(['CoreCognition'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1' + elif listinstr(['WorldVQA'], dataset_name): + judge_kwargs['model'] = 'gpt-4o-1120' + + if args.use_verifier: + judge_kwargs['use_verifier'] = True + if args.use_vllm: + judge_kwargs['use_vllm'] = True + + if RANK == 0: + logger.info(judge_kwargs) + + if WORLD_SIZE > 1: + dist.barrier() + + # Only RANK 0 handles the evaluation part + if RANK == 0: + # Prepare Submission Files for MMMU_TEST AND MMT-Bench_ALL + if dataset_name in ['MMMU_TEST']: + result_json = MMMU_result_transfer(result_file) + logger.info(f'Transfer MMMU_TEST result to json for official evaluation, ' + f'json file saved in {result_json}') + continue + elif 'MMT-Bench_ALL' in dataset_name: + submission_file = MMTBench_result_transfer(result_file, **judge_kwargs) + logger.info(f'Extract options from prediction of MMT-Bench FULL split for official evaluation ' + f'(https://eval.ai/web/challenges/challenge-page/2328/overview), ' + f'submission file saved in {submission_file}') + continue + + # Skip the evaluation part if only infer + if args.mode == 'infer': + continue + + # Skip the evaluation part if the dataset evaluation is not supported or annotations are missing + if 'MLLMGuard_DS' in dataset_name: + logger.info('The evaluation of MLLMGuard_DS is not 
supported yet. ') + continue + elif 'AesBench_TEST' == dataset_name: + logger.info(f'The results are saved in {result_file}. ' + f'Please send it to the AesBench Team via huangyipo@hotmail.com.') + continue + elif dataset_name in ['DocVQA_TEST', 'InfoVQA_TEST', 'Q-Bench1_TEST', 'A-Bench_TEST']: + logger.info(f'{dataset_name} is a test split without ground-truth. ' + 'Thus only the inference part is supported for those datasets. ') + continue + elif dataset_name in [ + 'MMBench_TEST_CN', 'MMBench_TEST_EN', 'MMBench', 'MMBench_CN', + 'MMBench_TEST_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_V11', 'MMBench_CN_V11' + ] and not MMBenchOfficialServer(dataset_name): + logger.error( + f'Can not evaluate {dataset_name} on non-official servers, will skip the evaluation.') + continue + + # Setup the proxy for the evaluation + eval_proxy = os.environ.get('EVAL_PROXY', None) + old_proxy = os.environ.get('HTTP_PROXY', '') + if eval_proxy is not None: + proxy_set(eval_proxy) + + # Perform the Evaluation + eval_results = dataset.evaluate(result_file, **judge_kwargs) + # Display Evaluation Results in Terminal + if eval_results is not None: + assert isinstance(eval_results, dict) or isinstance(eval_results, pd.DataFrame) + logger.info(f'The evaluation of model {model_name} x dataset {dataset_name} has finished! 
') + logger.info('Evaluation Results:') + if isinstance(eval_results, dict): + logger.info('\n' + json.dumps(eval_results, indent=4)) + if args.save_eval_results: + _save_dict_eval_results( + eval_results, + osp.join(pred_root, f'{model_name}_{dataset_name}.dict.eval.json'), + ) + elif isinstance(eval_results, pd.DataFrame): + if len(eval_results) < len(eval_results.columns): + eval_results = eval_results.T + logger.info('\n' + tabulate(eval_results)) + if args.save_eval_results: + _save_df_eval_results( + eval_results, + osp.join(pred_root, f'{model_name}_{dataset_name}.df.eval.json'), + ) + + # Restore the proxy + if eval_proxy is not None: + proxy_set(old_proxy) + + # Create the symbolic links for the prediction files + files = os.listdir(pred_root) + files = [x for x in files if (f'{model_name}_{dataset_name}' in x or "status.json" in x)] + for f in files: + cwd = os.getcwd() + file_addr = osp.join(cwd, pred_root, f) + link_addr = osp.join(cwd, pred_root_meta, f) + if osp.exists(link_addr) or osp.islink(link_addr): + os.remove(link_addr) + os.symlink(file_addr, link_addr) + + except Exception as e: + logger.exception(f'Model {model_name} x Dataset {dataset_name} combination failed: {e}, ' + 'skipping this combination.') + continue + + if WORLD_SIZE > 1: + dist.destroy_process_group() + + +if __name__ == '__main__': + load_env() + main() diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/run_api.py b/evaluation/cosmos3/reasoner/vlmevalkit/run_api.py new file mode 100644 index 00000000..478021aa --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/run_api.py @@ -0,0 +1,397 @@ +import argparse +import asyncio +import datetime +import json +import os +from functools import partial +from pathlib import Path +from typing import List + +from vlmeval.api import LMDeployAPI +from vlmeval.api.adapters import get_adapter_registry +from vlmeval.config import supported_VLM +from vlmeval.dataset import build_dataset +from vlmeval.inference_api import APIEvalPipeline, 
DatasetConfig +from vlmeval.smp import (get_pred_file_format, githash, listinstr, load_env, prepare_reuse_files, + setup_logger, timestr) + +group_dic = { + 'general-mini': ['MMMU_Pro_10c'], + 'math-reasoning-mini': ['MathVista_MINI', 'OlympiadBench', 'IPhO_2025', 'Physics'], + 'sci-reasoning-mini': ['SFE', 'MaCBench', 'MicroVQA', 'XLRS-Bench-lite', 'MSEarthMCQ'], + 'language-mini': ['MM-IFEval'], + 'coding-mini': ['ChartMimic_v2_direct'], + 'svg-mini': ['SArena_MINI'], + 'agent-mini': ['ScreenSpot_v2_Mobile', 'ScreenSpot_v2_Desktop', 'ScreenSpot_v2_Web'], + 'video-mini': ['Video-MME_64frame', 'VideoMMMU_48frame'], + 'sensing-mini': ['RefCOCO', 'OCRBench_v2_MINI', 'CCOCR', 'ChartQAPro', 'BLINK'], +} + + +def get_judge_kwargs(dataset_name: str, args) -> dict: + """Determine the default judge kwargs by dataset name.""" + judge_kwargs = { + 'nproc': args.judge_api_nproc, + 'verbose': args.verbose, + 'retry': args.judge_retry, + 'timeout': args.judge_timeout, + **(json.loads(args.judge_args) if args.judge_args else {}), + } + + if args.judge_base_url: + judge_kwargs['api_base'] = f"{args.judge_base_url.rstrip('/')}/chat/completions" + if args.judge_key: + judge_kwargs['key'] = args.judge_key + + if args.judge is not None: + judge_kwargs['model'] = args.judge + else: + judge_kwargs['model'] = 'gpt-4o-mini' # default + + if listinstr(['WeMath', 'MME-Reasoning'], dataset_name): + judge_kwargs['model'] = 'gpt-4o-mini' + elif listinstr(['VisuLogic'], dataset_name): + judge_kwargs['model'] = 'exact_matching' + elif listinstr(['MMVet', 'LLaVABench', 'MMBench_Video'], dataset_name): + if listinstr(['LLaVABench_KO'], dataset_name): + judge_kwargs['model'] = 'gpt-4o-0806' + else: + judge_kwargs['model'] = 'gpt-4-turbo' + elif listinstr(['VGRPBench'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['MathVista', 'MathVerse', 'MathVision', 'DynaMath', + 'VL-RewardBench', 'LogicVista', 'MOAT', 'OCR_Reasoning'], dataset_name): + judge_kwargs['model'] = 
'gpt-4o-mini' + elif listinstr(['OlympiadBench'], dataset_name): + use_api_judger = judge_kwargs.get("olympiad_use_api_judger", False) + if use_api_judger: + judge_kwargs['model'] = 'gpt-4o-mini' + elif listinstr(['MMLongBench', 'MMDU', 'DUDE', 'SLIDEVQA', 'MIA-Bench', + 'WildVision', 'MMAlignBench', 'MM-IFEval'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['ChartMimic'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['VDC'], dataset_name): + judge_kwargs['model'] = 'llama31-8b' + elif listinstr(['Video_MMLU_QA', 'Video_MMLU_CAP'], dataset_name): + judge_kwargs['model'] = 'qwen-72b' + elif listinstr(['MMVMBench'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['CVQA_EN', 'CVQA_LOC'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1' + elif listinstr(['M4Bench'], dataset_name): + judge_kwargs['model'] = 'gpt-4o' + elif listinstr(['AyaVisionBench'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1' + elif listinstr(['MathCanvas'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1-2025-04-14' + elif listinstr(['MMReason'], dataset_name): + judge_kwargs['model'] = 'gpt-4.1' + elif listinstr(['Video-MME'], dataset_name): + judge_kwargs['model'] = 'chatgpt-0125' + + if args.use_verifier: + judge_kwargs['use_verifier'] = True + if args.use_vllm: + judge_kwargs['use_vllm'] = True + + return judge_kwargs + + +def parse_args(): + help_msg = """\ +VLMEvalKit API Pipeline Runner + +This script uses an optimized pipeline for API-based models with the following improvements: +- Cross-dataset unified inference queue +- Parallel inference and evaluation +- Better remote model utilization + +You can launch the evaluation by setting either --data and --model. + +--data and --model: + Specify dataset names and model configuration for API-based inference. + +For more details, see the documentation in run.py. 
+""" + parser = argparse.ArgumentParser( + description=help_msg, + formatter_class=argparse.RawTextHelpFormatter + ) + + parser.add_argument('--data', type=str, nargs='+', help='Names of Datasets') + parser.add_argument('--group', type=str, nargs='+', default=None, + help='Benchmark groups to evaluate (see group_dic). Use "all" to run all groups.') + + # ================ 推理模型参数 ============== + parser.add_argument('--model', type=str, required=True) + parser.add_argument('--base-url', type=str, default=None, + help='Base URL of OpenAI-compatible API (e.g. http://localhost:8080/v1). ' + 'If set, LMDeployAPI is used for inference without modifying config.py.') + parser.add_argument('--key', type=str, default='sk-admin', help='API key for inference model') + parser.add_argument('--thinker', action='store_true', + help='Enable thinking mode: doubles timeout and max_tokens.') + parser.add_argument('--use-enable-thinking', action='store_true', + help='Pass enable_thinking flag to the model.') + parser.add_argument('--enable-thinking', action='store_true', + help='Value of enable_thinking passed to model (requires --use-enable-thinking).') + parser.add_argument('--max-tokens', type=int, default=2 ** 15, + help='Max tokens for model generation.') + parser.add_argument('--temperature', type=float, default=None) + parser.add_argument('--top-k', type=int, default=None) + parser.add_argument('--top-p', type=float, default=None) + parser.add_argument('--repetition-penalty', type=float, default=None) + parser.add_argument('--presence-penalty', type=float, default=None) + parser.add_argument('--api-nproc', type=int, default=32, + help='Parallel API calling (inference concurrency)') + parser.add_argument('--timeout', type=int, default=1800, + help='Max time in seconds for a single inference request.') + parser.add_argument('--retry', type=int, default=6, + help='Retry times for failed inference.') + parser.add_argument('--custom-prompt', type=str, + 
choices=list(get_adapter_registry().keys()), default=None, + help='Manually select a model adapter by name.') + + # ================ judge 模型参数 ============== + parser.add_argument('--judge', type=str, default=None) + parser.add_argument('--judge-base-url', type=str, default=None, + help='Base URL of judge API') + parser.add_argument('--judge-key', type=str, default=None, + help='API key for judge model') + parser.add_argument('--judge-api-nproc', type=int, default=32, + help='Parallel API calling for judger') + parser.add_argument('--judge-retry', type=int, default=6, + help='Retry times for failed judgement.') + parser.add_argument('--judge-timeout', type=int, default=600, + help='Max time in seconds for judgement.') + # legacy judger parameters + parser.add_argument('--judge-args', type=str, default=None, + help='Judge arguments in JSON format') + + parser.add_argument('--work-dir', type=str, default='./outputs', + help='Select the output directory') + parser.add_argument('--mode', type=str, default='all', + choices=['all', 'infer', 'eval'], + help='Mode: all (infer+eval), infer (only), eval (only)') + parser.add_argument('--verbose', action='store_true') + parser.add_argument('--ignore', action='store_true', + help='Ignore failed indices') + parser.add_argument('--reuse', action='store_true', + help='Reuse existing prediction files') + parser.add_argument('--reuse-aux', type=int, default=True, + help='Reuse auxiliary evaluation files') + parser.add_argument('--use-vllm', action='store_true', + help='Use vllm to generate') + parser.add_argument('--use-verifier', action='store_true', + help='Use verifier to evaluate') + parser.add_argument('--monitor-interval', type=int, default=30, + help='Status monitoring interval (seconds)') + parser.add_argument('--debug', action='store_true', + help='Debug mode: run evaluation in main process') + + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + + # 
============================================== + # Resolve --group into dataset list + # ============================================== + if args.group is not None and len(args.group) > 0: + if 'all' in args.group: + groups = list(group_dic.keys()) + else: + groups = args.group + assert args.data is None, '--data and --group should not be set at the same time' + args.data = [] + for g in groups: + assert g in group_dic, f'Unknown group: {g}. Available: {list(group_dic.keys())}' + args.data.extend(group_dic[g]) + + assert args.data, '--data or --group must be set' + + # ============================================== + # Prepare work dir and logging + # ============================================== + date, commit_id = timestr('day'), githash(digits=8) + eval_id = f"T{date}_G{commit_id}" + model_name = args.model.replace('/', '--') + + # Work dir for the specified model + work_dir = Path(args.work_dir) / model_name + work_dir.mkdir(parents=True, exist_ok=True) + + # Work dir for the current run + pred_root = Path(args.work_dir) / model_name / eval_id + # List previous run + prev_pred_roots = sorted(d for d in work_dir.iterdir() if d.is_dir()) + pred_root.mkdir(exist_ok=True) + + log_file = Path(work_dir) / 'logs' / f'{eval_id}_{datetime.datetime.now().strftime("%H%M%S")}.log' + logger = setup_logger(log_file=str(log_file)) + logger.info(f'Log file: {log_file}') + + if args.mode == 'eval': + args.reuse = True + logger.info('Force to use `reuse=True` for eval mode.') + + if not args.reuse: + logger.warning('--reuse is not set, will not reuse previous temporary files') + else: + logger.info('--reuse is set, will reuse the latest prediction & temporary files') + + WORLD_SIZE = int(os.environ.get('WORLD_SIZE', 1)) + if WORLD_SIZE > 1: + logger.error("API pipeline does not support multi-process mode (WORLD_SIZE > 1).") + return + + # ============================================== + # Build model args (shared across all datasets) + # 
============================================== + use_think_args = args.thinker + if args.base_url is not None: + model_args = dict( + model=args.model, + api_base=f"{args.base_url.rstrip('/')}/chat/completions", + key=args.key, + custom_prompt=args.custom_prompt, + max_tokens=args.max_tokens, + retry=args.retry, + timeout=args.timeout, + temperature=args.temperature, + top_k=args.top_k, + top_p=args.top_p, + repetition_penalty=args.repetition_penalty, + presence_penalty=args.presence_penalty, + verbose=args.verbose, + ) + model_args = {k: v for k, v in model_args.items() if v is not None} + if args.use_enable_thinking: + model_args['enable_thinking'] = args.enable_thinking + if use_think_args: + model_args.update(dict(timeout=args.timeout * 2, max_tokens=args.max_tokens * 2)) + model_builder = partial(LMDeployAPI, **model_args) + else: + assert model_name in supported_VLM, \ + f'Model "{model_name}" not found in supported_VLM. Consider using --base-url to specify an API endpoint.' + model_builder = supported_VLM[model_name] + + # ============================================== + # Prepare all datasets + # ============================================== + dataset_configs: List[DatasetConfig] = [] + + for ds_name in args.data: + logger.info(f'-------------------- {ds_name} --------------------') + + # Construct the dataset. + try: + dataset_kwargs = {} + if ds_name in [ + 'MMLongBench_DOC', 'DUDE', 'DUDE_MINI', + 'SLIDEVQA', 'SLIDEVQA_MINI', + ]: + dataset_kwargs['model'] = model_name + dataset = build_dataset(ds_name, **dataset_kwargs) + + if dataset is None: + logger.error(f'Dataset {ds_name} is not valid, will be skipped.') + continue + + # Prepare the result file. 
+ pred_format = get_pred_file_format() + result_file_base = f'{model_name}_{ds_name}.{pred_format}' + result_file = str(pred_root / result_file_base) + + # Prepare the reuse file + if args.reuse and len(prev_pred_roots): + prepare_reuse_files( + pred_root_meta=str(work_dir), + eval_id=eval_id, + model_name=model_name, + dataset_name=ds_name, + reuse=args.reuse, + reuse_aux=args.reuse_aux + ) + + # Skip special datasets. + if ds_name in ['MMMU_TEST']: + logger.info(f'{ds_name} requires special handling, skipped in pipeline.') + continue + if 'MMT-Bench_ALL' in ds_name: + logger.info(f'{ds_name} requires special handling, skipped in pipeline.') + continue + + judge_kwargs = get_judge_kwargs(ds_name, args) + logger.info(f'Judge kwargs: {judge_kwargs}') + + # Complete the dataset config + if dataset.MODALITY == 'VIDEO': + dataset_type = 'video' + elif dataset.TYPE == 'MT': + dataset_type = 'mt' + else: + dataset_type = 'image' + dataset_config = DatasetConfig( + dataset_name=ds_name, + dataset_obj=dataset, + dataset_type=dataset_type, + model_obj=model_builder(), + model_name=model_name, + work_dir=str(pred_root), + result_file=result_file, + judge_kwargs=judge_kwargs, + verbose=args.verbose + ) + dataset_configs.append(dataset_config) + + except Exception as e: + logger.exception(f'Failed to prepare dataset {ds_name}: {e}') + continue + + # ============================================== + # Create and run pipeline + # ============================================== + if len(dataset_configs) == 0: + logger.warning('No valid datasets to evaluate.') + return + + logger.info(f"Starting API Pipeline for model: {model_name}") + logger.info(f"Total datasets: {len(dataset_configs)}") + + pipeline = APIEvalPipeline( + dataset_configs=dataset_configs, + concurrency=args.api_nproc, + monitor_interval=args.monitor_interval, + run_infer=args.mode in {'infer', 'all'}, + run_eval=args.mode in {'eval', 'all'}, + debug=args.debug + ) + + try: + asyncio.run(pipeline.run()) + except 
KeyboardInterrupt: + logger.warning("Pipeline interrupted by user.") + except Exception as e: + logger.exception(f"Pipeline failed with error: {e}") + + # Create symbolic links. + try: + files = list(pred_root.iterdir()) + for ds_name in args.data: + files_to_link = [f for f in files if f.is_file() and f'{model_name}_{ds_name}' in f.name] + for f in files_to_link: + file_addr = pred_root.absolute() / f.name + link_addr = work_dir.absolute() / f.name + if link_addr.exists() or link_addr.is_symlink(): + link_addr.unlink() + link_addr.symlink_to(file_addr.relative_to(link_addr.parent)) + except Exception as e: + logger.warning(f"Failed to create symbolic links: {e}") + + +if __name__ == '__main__': + load_env() + main() diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/AI2D_preproc.ipynb b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/AI2D_preproc.ipynb new file mode 100644 index 00000000..f93b8a88 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/AI2D_preproc.ipynb @@ -0,0 +1,261 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import os, cv2\n", + "import string\n", + "import os.path as osp\n", + "import numpy as np\n", + "from collections import defaultdict\n", + "from vlmeval.smp import ls, load, dump, download_file, encode_image_file_to_base64, md5, mrlines\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import multiprocessing as mp\n", + "from PIL import Image, ImageFont, ImageDraw\n", + "\n", + "font_URL = 'http://opencompass.openxlab.space/utils/Fonts/timesb.ttf'\n", + "font_file = 'timesb.ttf'\n", + "if not osp.exists(font_file):\n", + " download_file(font_URL)\n", + " \n", + "test_split_URL = 'https://s3-us-east-2.amazonaws.com/prior-datasets/ai2d_test_ids.csv'\n", + "test_split_file = 'ai2d_test_ids.csv'\n", + "if not osp.exists(test_split_file):\n", + " download_file(test_split_URL)\n", + " \n", + "test_ids = 
def detect_image_color(image):
    """Choose a label colour that contrasts with the image.

    The image is converted with ``image.convert('L')`` and the mean of the
    resulting pixel values is compared against 127.

    Returns:
        'white' when the mean is below 127, otherwise 'black'.
    """
    luminance = np.array(image.convert('L'))
    return 'white' if np.mean(luminance) < 127 else 'black'
def puncproc(inText):
    """Normalise punctuation in a question/answer string.

    Every character listed in ``punct`` is deleted when it occurs next to a
    space somewhere in ``inText`` or when ``inText`` contains a comma between
    two digits (e.g. "1,000"); otherwise it is replaced by a single space.
    Afterwards every period that is not directly followed by a digit is
    removed.

    Args:
        inText: the string to normalise.

    Returns:
        The normalised string.
    """
    import re  # local import: the notebook's import cell does not import `re`

    punct = [
        ';', r'/', '[', ']', '"', '{', '}', '(', ')', '=', '+', '\\', '_', '-',
        '>', '<', '@', '`', ',', '?', '!'
    ]
    commaStrip = re.compile(r'(\d)(,)(\d)')
    # NOTE(review): `(?!<=\d)` looks like a typo for the lookbehind `(?<!\d)`.
    # It is kept unchanged so that the normalisation result stays the same.
    periodStrip = re.compile(r'(?!<=\d)(\.)(?!\d)')

    # The comma test only depends on the input, so evaluate it once.
    has_digit_comma = commaStrip.search(inText) is not None

    outText = inText
    for p in punct:
        if p + ' ' in inText or ' ' + p in inText or has_digit_comma:
            outText = outText.replace(p, '')
        else:
            outText = outText.replace(p, ' ')

    # The third positional argument of Pattern.sub() is `count`, not `flags`:
    # passing re.UNICODE there capped the number of removed periods at 32.
    return periodStrip.sub('', outText)
data['questions'].items():\n", + " assert '.png-' in qmeta['questionId']\n", + " main, sub = qmeta['questionId'].split('.png-')\n", + " idx = int(main) * 100 + int(sub)\n", + " \n", + " answers = qmeta['answerTexts']\n", + " correct = qmeta['correctAnswer']\n", + " \n", + " data_all['index'].append(idx)\n", + " data_all['question'].append(q)\n", + " assert len(answers) == 4\n", + " for c, a in zip('ABCD', answers):\n", + " data_all[c].append(a)\n", + " data_all['answer'].append('ABCD'[qmeta['correctAnswer']])\n", + " data_all['category'].append(cate)\n", + " data_all['abcLabel'].append(qmeta['abcLabel'])\n", + " abc = is_abc(qmeta['abcLabel'], {x: data_all[x][-1] for x in 'ABCD'}, q)\n", + " # if qmeta['abcLabel'] and not abc:\n", + " # print(qmeta['abcLabel'], {x: data_all[x][-1] for x in 'ABCD'}, q)\n", + " data_all['image_path'].append(image_pth.replace('images', 'images_abc') if abc else image_pth)\n", + "data = pd.DataFrame(data_all)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "images = []\n", + "image_seen = {}\n", + "for idx, pth in zip(data['index'], data['image_path']):\n", + " images.append(encode_image_file_to_base64(pth))\n", + "\n", + "data['image'] = images\n", + "dump(data, 'AI2D_TEST.tsv')\n", + "print(md5('AI2D_TEST.tsv'))" + ] + } + ], + "metadata": { + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/apires_scan.py b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/apires_scan.py new file mode 100644 index 00000000..e8b33843 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/apires_scan.py @@ -0,0 +1,60 @@ +import os.path as osp +import sys + +from vlmeval.dataset import SUPPORTED_DATASETS +from vlmeval.smp import listinstr, load, ls + +FAIL_MSG = 'Failed to obtain answer via API.' 
+ +root = sys.argv[1] +if root[-1] in '/\\': + root = root[:-1] + +model_name = root.split('/')[-1] + +for d in SUPPORTED_DATASETS: + from vlmeval.smp import get_pred_file_format + pred_format = get_pred_file_format() + fname = f'{model_name}_{d}.{pred_format}' + pth = osp.join(root, fname) + if osp.exists(pth): + data = load(pth) + # Detect Failure + assert 'prediction' in data + data['prediction'] = [str(x) for x in data['prediction']] + fail = [FAIL_MSG in x for x in data['prediction']] + if sum(fail): + nfail = sum(fail) + ntot = len(fail) + print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ') + + eval_files = ls(root, match=f'{model_name}_{d}_') + eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')] + + if len(eval_files) == 0: + print(f'Model {model_name} x Dataset {d} openai missing') + continue + + assert len(eval_files) == 1 + eval_file = eval_files[0] + data = load(eval_file) + + if 'MMVet' in d: + bad = [x for x in data['log'] if 'All 5 retries failed.' 
in str(x)] + if len(bad): + print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') + elif 'MathVista' in d: + bad = [x for x in data['res'] if FAIL_MSG in str(x)] + if len(bad): + print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') + + elif d == 'LLaVABench': + sub = data[data['gpt4_score'] == -1] + sub = sub[sub['gpt4_score'] == -1] + if len(sub): + print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.') + else: + bad = [x for x in data['log'] if FAIL_MSG in str(x)] + if len(bad): + print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.') + \ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/auto_run.py b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/auto_run.py new file mode 100644 index 00000000..1c428236 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/auto_run.py @@ -0,0 +1,44 @@ +import argparse +import os +import os.path as osp + +from vlmeval.config import supported_VLM +from vlmeval.smp import listinstr + + +def is_api(x): + return getattr(supported_VLM[x].func, 'is_api', False) + +models = list(supported_VLM) +models = [x for x in models if 'fs' not in x] +models = [x for x in models if not is_api(x)] +exclude_list = ['cogvlm-grounding-generalist', 'emu2'] +models = [x for x in models if x not in exclude_list] + +def is_large(x): + return '80b' in x or 'emu2' in x or '34B' in x + +small_models = [x for x in models if not is_large(x)] +large_models = [x for x in models if is_large(x)] +models = small_models + large_models + +parser = argparse.ArgumentParser() +parser.add_argument('--data', type=str, nargs='+', required=True) +args = parser.parse_args() + +# Skip some models +models = [x for x in models if not listinstr(['MiniGPT', 'grounding-generalist'], x)] + +for m in models: + from vlmeval.smp import get_pred_file_format + pred_format = 
get_pred_file_format() + unknown_datasets = [x for x in args.data if not osp.exists(f'{m}/{m}_{x}.{pred_format}')] + if len(unknown_datasets) == 0: + continue + dataset_str = ' '.join(unknown_datasets) + if '80b' in m: + cmd = f'python run.py --data {dataset_str} --model {m}' + else: + cmd = f'bash run.sh --data {dataset_str} --model {m}' + print(cmd) + os.system(cmd) \ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/convert_macbench.py b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/convert_macbench.py new file mode 100644 index 00000000..e4ac308f --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/convert_macbench.py @@ -0,0 +1,98 @@ +import argparse +import string +import ast +from pathlib import Path +from tqdm import tqdm +from datasets import load_dataset, Dataset +import pandas as pd +# from vlmeval.smp.vlm import encode_image_to_base64 +from concurrent.futures import ProcessPoolExecutor + +prog_description = """\ +Convert original MaCBench dataset to TSV. 
def process_row(row):
    """Flatten one raw MaCBench record into a dict of TSV columns.

    ``row['examples'][0]`` is a Python-literal string describing the example.
    The result always contains 'image' and 'question'. An 'answer' is added
    when the example has 'target' (stringified) or 'target_scores' (one
    upper-case letter column per option, answer = comma-joined letters of the
    options with a truthy score). 'relative_tolerance' is copied over when it
    is not None.
    """
    example = ast.literal_eval(row['examples'][0])
    modal_entries = example['qentries_modality']['image']

    # Keep only what follows the first comma of the stored value
    # (presumably this strips a data-URI header -- verify against the dataset).
    payload = modal_entries['entry1']['value'].partition(',')[-1]
    assert len(payload) > 1000
    output = {'image': payload}

    # Fill the `{name}` placeholders of the prompt: text entries are inlined,
    # every other entry becomes the literal marker '{image}'.
    prompt = example['input']
    for key, entry in modal_entries.items():
        placeholder = '{' + key + '}'
        filler = entry['value'] if entry['type'] == 'text' else '{image}'
        prompt = prompt.replace(placeholder, filler)
    output['question'] = prompt

    if 'target' in example:
        output['answer'] = str(example['target'])
    elif 'target_scores' in example:
        correct = []
        for pos, (option, grade) in enumerate(example['target_scores'].items()):
            letter = string.ascii_uppercase[pos]
            output[letter] = option
            if grade:
                correct.append(letter)
        output['answer'] = ','.join(correct)

    tolerance = row['relative_tolerance']
    if tolerance is not None:
        output['relative_tolerance'] = tolerance
    return output
def process_single_config(name):
    """Convert the 'train' split of one MaCBench config into a DataFrame.

    Every record of the split is passed through ``process_row``; the config
    name is stored (as a string) in an extra 'category' column.
    """
    train_split: Dataset = load_dataset('jablonkagroup/MaCBench', name)['train']
    frame = pd.DataFrame([process_row(record) for record in train_split])
    frame['category'] = str(name)
    return frame
def _parse_image_field(raw):
    """Return the list of images stored in one 'image' / 'image_path' cell."""
    import ast  # local import: `ast` is not among this file's module-level imports

    if isinstance(raw, str) and raw.startswith('[') and raw.endswith(']'):
        # The cell holds a stringified Python list. Its content comes from a
        # dataset file, so it is parsed as a literal rather than executed with
        # eval(), which would run arbitrary code embedded in the file.
        return ast.literal_eval(raw)
    # Single image; this also covers an empty cell, which used to raise
    # IndexError on `images[0]`.
    return [raw]


def mmqa_display(question, target_size=2048):
    """Render one dataset record as Markdown.

    Args:
        question: mapping of column name to value for one sample. Keys longer
            than one character are lower-cased before use, so single-letter
            option columns such as 'A' keep their case.
        target_size: accepted for interface compatibility; not used here.

    Returns:
        ``(text, image_md)``: the Markdown bullet list describing the sample
        (index, question, choices, remaining fields, answer) and the Markdown
        produced by ``images_to_md`` for the sample's images.
    """
    question = {k.lower() if len(k) > 1 else k: v for k, v in question.items()}

    idx = question.pop('index', 'XXX')
    text = f'\n- INDEX: {idx}\n'

    if 'image' in question:
        images = _parse_image_field(question.pop('image'))
    else:
        # Paths have to be read from disk and base64-encoded first.
        paths = _parse_image_field(question.pop('image_path'))
        images = [encode_image_file_to_base64(x) for x in paths]

    qtext = question.pop('question', None)
    if qtext is not None:
        text += f'- QUESTION: {qtext}\n'

    if 'A' in question:
        text += '- Choices: \n'
        for k in string.ascii_uppercase:
            if k in question:
                text += f'\t-{k}: {question.pop(k)}\n'
    # Pop the answer now so that it is printed last, after the other fields.
    answer = question.pop('answer', None)

    for k in question:
        if not pd.isna(question[k]):
            text += f'- {k.upper()}. {question[k]}\n'

    if answer is not None:
        text += f'- ANSWER: {answer}\n'

    image_md = images_to_md(images)

    return text, image_md
loaded_obj[filename] = data + return btn_submit_click(filename, 0) + + with gr.Blocks() as app: + + filename = gr.Textbox( + value='Dataset Name (supported by VLMEvalKit) or TSV FileName (Relative under `LMURoot` or Real Path)', + label='Dataset', + interactive=True, + visible=True) + + with gr.Row(): + ann_id = gr.Number(0, label='Sample Index (Press Enter)', interactive=True, visible=True) + btn_next = gr.Button("Next") + # btn_translate = gr.Button('CN Translate') + + with gr.Row(): + data_desc = gr.Markdown('Dataset Description', label='Dataset Description') + + with gr.Row(): + image_output = gr.Markdown('Image PlaceHolder', label='Image Visualization') + anno_en = gr.Markdown('Image Annotation', label='Image Annotation') + # anno_cn = gr.Markdown('Image Annotation (Chinese)', label='Image Annotation (Chinese)') + + input_components = [filename, ann_id] + all_components = [filename, ann_id, data_desc, image_output, anno_en] + + filename.submit(filename_change, input_components, all_components) + ann_id.submit(btn_submit_click, input_components, all_components) + btn_next.click(btn_next_click, input_components, all_components) + # btn_translate.click(translate_click, anno_en, anno_cn) + + # app.launch() + app.launch(server_name='0.0.0.0', debug=True, show_error=True, server_port=port) + + +if __name__ == "__main__": + args = parse_args() + gradio_app_vis_dataset(port=args.port) + diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/mmb_eval_gradio.py b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/mmb_eval_gradio.py new file mode 100644 index 00000000..a98034e6 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/mmb_eval_gradio.py @@ -0,0 +1,122 @@ +import datetime +import os +import os.path as osp +import shutil + +import gradio as gr +import numpy as np +import pandas as pd + +from vlmeval.dataset import build_dataset +from vlmeval.smp import LMUDataRoot, cn_string, dump, load, md5 +from vlmeval.tools import EVAL + +HEADER = """ +# 
def prepare_file(file_name):
    """Move an uploaded prediction file into the eval-server folder and validate it.

    The file is renamed to ``<md5>.<suffix>`` under ``<LMUDataRoot>/eval_server``,
    its column names are lower-cased (except the option columns 'A'-'D'), and
    the presence of the 'index', 'prediction' and 'A'-'D' columns is checked
    before the normalised table is written back.

    Args:
        file_name: path of the uploaded file.

    Returns:
        ``(True, path_of_prepared_file)`` on success, or
        ``(False, error_message)`` when the file is rejected. The result is
        always a 2-tuple: the caller unpacks exactly two values.
    """
    file_md5 = md5(file_name)
    root = osp.join(LMUDataRoot(), 'eval_server')
    os.makedirs(root, exist_ok=True)

    suffix = file_name.split('.')[-1]
    if suffix not in ['xlsx', 'tsv', 'csv']:
        return False, "Please submit a file that ends with `.xlsx`, `.tsv`, or `.csv`"

    eval_file = osp.join(root, f'{file_md5}.{suffix}')
    shutil.move(file_name, eval_file)
    try:
        data = load(eval_file)
    except Exception:
        # Any loader failure is reported to the user instead of crashing the
        # app; KeyboardInterrupt / SystemExit are no longer swallowed.
        return False, "Your excel file can not be successfully loaded by `pd.read_excel`, please double check and submit again. "

    # Iterate over a snapshot because the columns are re-inserted in the loop.
    # Option columns are matched exactly (`k not in 'ABCD'` was a substring
    # test) and non-string column names are left untouched.
    for k in list(data.keys()):
        keep_as_is = k in ('A', 'B', 'C', 'D') or not isinstance(k, str)
        data[k if keep_as_is else k.lower()] = data.pop(k)

    if "index" not in data:
        return False, "Your excel file should have a column named `index`, please double check and submit again"
    if "prediction" not in data:
        return False, "Your excel file should have a column named `prediction`, please double check and submit again"
    for ch in 'ABCD':
        if ch not in data:
            return False, f"Your excel file should have a column named `{ch}`, please double check and submit again"

    dump(data, eval_file)
    return True, eval_file
" + + eval_id = eval_file.split('/')[-1].split('.')[0] + ret = f"Evaluation ID: {eval_id}\n" + timestamp = datetime.datetime.now().strftime('%Y.%m.%d %H:%M:%S') + ret += f'Evaluation Timestamp: {timestamp}\n' + eval_data = load(eval_file) + eval_data['index'] = [int(x) for x in eval_data['index']] + base_data = build_dataset(dataset).data + base_index_set = set([int(x) for x in base_data['index']]) + inds_more = {k for k in eval_data['index'] if k not in base_index_set} + if len(inds_more) > 0: + inds_more = set([x % 1e6 for x in inds_more]) + ret += f"Warning: The matched dataset is {dataset}. The following indices are not in the base dataset: {inds_more}\n" + ret += f"We automatically remove those indices, and still recommend you to check the indices in your prediction file.\n" + eval_data = eval_data[eval_data['index'].isin(base_index_set)] + dump(eval_data, eval_file) + + acc = EVAL(dataset, eval_file) + nacc = reformat_acc(acc).round(1) + return ret, nacc + +with gr.Blocks() as demo: + gr.Markdown(HEADER) + file_output = gr.File() + upload_button = gr.UploadButton("Click to upload you prediction files for a supported benchmark") + upload_button.upload(upload_file, upload_button, file_output) + + btn = gr.Button("🚀 Evaluate") + eval_log = gr.Textbox(label="Evaluation Log", placeholder="Your evaluation log will be displayed here") + df_empty = pd.DataFrame([], columns=['Evaluation Result']) + eval_result = gr.components.DataFrame(value=df_empty) + btn.click(evaluate, inputs=[file_output], outputs=[eval_log, eval_result]) + +if __name__ == '__main__': + demo.launch(server_name='0.0.0.0', debug=True, show_error=True) \ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/prepare_spar_bench.py b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/prepare_spar_bench.py new file mode 100644 index 00000000..ec1f1715 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/prepare_spar_bench.py @@ -0,0 +1,55 @@ +"""Convert 
def convert_spar_bench_to_tsv(dataset_name='jasonzhango/SPAR-Bench', output_dir='./data'):
    """Export the 'test' split of a SPAR-Bench dataset as a TSV file.

    Each sample becomes one row with a running 'index', the base64-encoded
    image, the question/answer pair and four metadata fields that fall back
    to defaults when absent. The file is written to ``output_dir`` as
    'SPAR-Bench-Tiny.tsv' when ``dataset_name`` contains 'Tiny', otherwise as
    'SPAR-Bench.tsv'.
    """
    print(f"Loading {dataset_name}...")
    test_split = load_dataset(dataset_name)['test']

    # Metadata columns and the value used when a sample lacks them.
    optional_defaults = {
        'task': 'unknown',
        'img_type': 'single_view',
        'format_type': 'unknown',
        'source': 'unknown',
    }

    records = []
    for row_id, sample in enumerate(tqdm(test_split)):
        picture = sample['image']
        if isinstance(picture, list):
            # Only the first image of a multi-image sample is exported.
            picture = picture[0]
        record = {
            'index': row_id,
            'image': encode_image_to_base64(picture),
            'question': sample['question'],
            'answer': sample['answer'],
        }
        for field, fallback in optional_defaults.items():
            record[field] = sample.get(field, fallback)
        records.append(record)

    frame = pd.DataFrame(records)
    os.makedirs(output_dir, exist_ok=True)
    file_name = 'SPAR-Bench.tsv'
    if 'Tiny' in dataset_name:
        file_name = 'SPAR-Bench-Tiny.tsv'
    destination = os.path.join(output_dir, file_name)
    frame.to_csv(destination, sep='\t', index=False)
    print(f"✅ Saved to {destination}")
GPU=$(nvidia-smi --list-gpus | wc -l) +torchrun --nproc-per-node=$GPU run.py ${@:1} \ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/run_endpoint_bench.sh b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/run_endpoint_bench.sh new file mode 100755 index 00000000..64ee12d4 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/run_endpoint_bench.sh @@ -0,0 +1,286 @@ +#!/usr/bin/env bash +set -euo pipefail + +# Run VLMEvalKit benchmark against a custom inference endpoint. +# Supports running a single dataset or all datasets with --all. + +# dataset_name:class_name +ALL_DATASETS=( + "MMMU_DEV_VAL:MMMUDataset" + "MathVista_MINI:MathVista" + "MathVision:MathVision" + "MMBench_DEV_EN:ImageMCQDataset" + "MMBench_DEV_EN_V11:ImageMCQDataset" + "HallusionBench:ImageYORNDataset" + "InfoVQA_VAL:ImageVQADataset" + "DocVQA_VAL:ImageVQADataset" + "AI2D_TEST:ImageMCQDataset" + "CountBenchQA:CountBenchQA" + "CosmosERQA:CosmosERQA" + "OCRBench_v2:OCRBench_v2" +) + +usage() { + cat <<'EOF' +Usage: ./run_endpoint_bench.sh --endpoint URL --model MODEL (--dataset DATASET | --all) [OPTIONS] + +Required: + --endpoint URL Full chat completions URL (e.g. https://...lepton.run/v1/chat/completions) + --model MODEL Model name served at the endpoint (e.g. Qwen3-VL-8B-Instruct) + --dataset DATASET Dataset name (e.g. CountBenchQA) + --all Run all built-in datasets + +Optional: + --dataset-class CLASS Dataset class name (e.g. 
ImageVQADataset; default: auto-detect or same as dataset) + --temperature FLOAT Sampling temperature (default: 0) + --max-tokens INT Max tokens to generate (default: 16384) + --presence-penalty FLOAT Penalize repeated tokens to reduce repetitive reasoning (default: not set) + --retry INT Number of retries (default: 10) + --timeout INT Request timeout in seconds (default: 300) + --enable-thinking Enable model thinking/reasoning (default: disabled) + --work-dir DIR Output directory (default: ./outputs) + --api-nproc INT Parallel API calls (default: 4) + --nframe INT Number of frames for dataset-side video extraction (override dataset default) + --fps FLOAT FPS for dataset-side video extraction (override dataset default) + --model-nframes INT Number of frames for model-side video processing (for VIDEO_LLM models) + --model-fps FLOAT FPS for model-side video processing (for VIDEO_LLM models) + --model-max-frames INT Max frames cap for model-side video processing + --run-name NAME Custom name for the model entry in config (default: derived from model name) + --keep-config Do not delete the temp config file after run + +Environment: + COSMOS_API_KEY API key for the inference endpoint (required) + OPENAI_API_KEY API key for LLM judge (required by some datasets) + OPENAI_API_BASE Custom base URL for LLM judge (optional) + NGC_API_KEY NGC API key for downloading datasets from DSS (required by some datasets) + NVDATASET_TENANTID Tenant ID for NVIDIA Data Services (required by some datasets) +EOF + exit 1 +} + +# Run a single benchmark. Args: dataset_name, class_name +run_one() { + local dataset="$1" + local dataset_class="$2" + + # Resolve paths relative to this script (which lives alongside run.py) + local script_dir + script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." 
&& pwd)" + + local config_file + config_file=$(mktemp "${script_dir}/tmp_config_XXXXXX.json") + + # Build the data section with optional video fields + local data_json="{\"class\": \"${dataset_class}\", \"dataset\": \"${dataset}\"" + if [[ -n "$NFRAME" ]]; then + data_json="${data_json}, \"nframe\": ${NFRAME}" + fi + if [[ -n "$FPS" ]]; then + data_json="${data_json}, \"fps\": ${FPS}" + fi + data_json="${data_json}}" + + # Build model-side video processing kwargs (independent of dataset config) + local model_video_kwargs="" + if [[ -n "$MODEL_NFRAMES" ]]; then + model_video_kwargs="${model_video_kwargs}, \"nframes\": ${MODEL_NFRAMES}" + fi + if [[ -n "$MODEL_FPS" ]]; then + model_video_kwargs="${model_video_kwargs}, \"fps\": ${MODEL_FPS}" + fi + if [[ -n "$MODEL_MAX_FRAMES" ]]; then + model_video_kwargs="${model_video_kwargs}, \"max_frames\": ${MODEL_MAX_FRAMES}" + fi + + cat > "$config_file" <= len(final.iloc[0].keys()): + print(tabulate(final)) + else: + print(tabulate(final.T)) + +if __name__ == '__main__': + args = parse_args() + if args.data == []: + args.data = list(SUPPORTED_DATASETS) + gen_table(args.model, args.data) \ No newline at end of file diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/scripts/visualize.ipynb b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/visualize.ipynb new file mode 100644 index 00000000..84d08dcf --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/scripts/visualize.ipynb @@ -0,0 +1,266 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import copy as cp\n", + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import matplotlib.font_manager as fm\n", + "\n", + "def download_file(url, filename=None):\n", + " from urllib.request import urlretrieve\n", + " if filename is None:\n", + " filename = url.split('/')[-1]\n", + " urlretrieve(url, filename)\n", + "\n", + "font_URL = 
'http://opencompass.openxlab.space/utils/Fonts/segoepr.ttf'\n", + "download_file(font_URL)\n", + "\n", + "font12 = fm.FontProperties(fname='segoepr.ttf', size=12)\n", + "font15 = fm.FontProperties(fname='segoepr.ttf', size=15, weight='bold')\n", + "font18 = fm.FontProperties(fname='segoepr.ttf', size=18, weight='bold')\n", + "\n", + "DATA_URL = 'http://opencompass.openxlab.space/utils/OpenVLM.json'\n", + "download_file(DATA_URL)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def pre_normalize(raw_data, labels):\n", + " data_list = cp.deepcopy(raw_data)\n", + " minimum, maximum, max_range, range_map = {}, {}, 0, {}\n", + " for lb in labels:\n", + " minimum[lb] = min([x[lb] for x in data_list])\n", + " maximum[lb] = max([x[lb] for x in data_list])\n", + " max_range = max(max_range, maximum[lb] - minimum[lb])\n", + " max_range *= 1.25\n", + " for lb in labels:\n", + " mid = (minimum[lb] + maximum[lb]) / 2\n", + " new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n", + " range_map[lb] = new_range\n", + " for item in data_list:\n", + " assert new_range[0] <= item[lb] <= new_range[1]\n", + " item[lb] = (item[lb] - new_range[0]) / max_range * 100\n", + " return data_list, range_map\n", + "\n", + "# solve the problem that some benchmark score is too high and out of range\n", + "def log_normalize(raw_data, labels):\n", + " data_list = cp.deepcopy(raw_data)\n", + " minimum, maximum, max_range, range_map = {}, {}, 0, {}\n", + " for lb in labels:\n", + " minimum[lb] = min([np.log(x[lb]) for x in data_list])\n", + " maximum[lb] = max([np.log(x[lb]) for x in data_list])\n", + " max_range = max(max_range, maximum[lb] - minimum[lb])\n", + " max_range *= 1.005\n", + " for lb in labels:\n", + " mid = (minimum[lb] + maximum[lb]) / 2\n", + " new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 
100)\n", + " range_map[lb] = new_range\n", + " for item in data_list:\n", + " assert new_range[0] <= np.log(item[lb]) <= new_range[1]\n", + " item[lb] = (np.log(item[lb]) - new_range[0]) / max_range * 100\n", + " return data_list, range_map" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Draw MMBench Radar Graph\n", + "data = json.loads(open('OpenVLM.json').read())['results']\n", + "models = list(data)\n", + "print(models)\n", + "\n", + "# model2vis = [\n", + "# 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n", + "# 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", + "# 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n", + "# ]\n", + "\n", + "model2vis = [\n", + " # 'GPT-4v (detail: low)', 'GeminiProVision', 'InternLM-XComposer2-VL', \n", + " 'GPT-4v (1106, detail-low)', 'Gemini-1.0-Pro', 'Gemini-1.5-Pro', #'Gemini-1.5-Flash', 'Qwen-VL-Plus', \n", + " 'InternLM-XComposer2', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n", + " 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n", + "]\n", + "\n", + "colors = [\n", + " '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', \n", + " '#e377c2', '#7f7f7f', '#bcbd22'\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from collections import defaultdict\n", + "\n", + "split = 'MMBench_TEST_EN'\n", + "# data_sub = {k: v[split] for k, v in data.items()}\n", + "data_sub = {k: defaultdict(int, v)[split] for k, v in data.items()}\n", + "# solve the problem that some model lack the evaluation of MMBench_TEST_EN\n", + "\n", + "labels = list(data_sub[model2vis[0]])\n", + "labels.remove('Overall')\n", + "num_vars = len(labels)\n", + "\n", + "raw_data = [data_sub[m] for m in model2vis]\n", + "data_list, range_map = pre_normalize(raw_data, labels)\n", + "\n", + "alpha = 0.25\n", + "angles = np.linspace(0, 2 * np.pi, num_vars, 
endpoint=False).tolist()\n", + "angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()\n", + "fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))\n", + "\n", + "for i in range(len(data_list)):\n", + " item = data_list[i]\n", + " model_name = model2vis[i]\n", + " color = colors[i]\n", + " tmp_angles = angles[:] + [angles[0]]\n", + " tmp_values = [item[lb] for lb in labels] + [item[labels[0]]]\n", + " ax_base.plot(tmp_angles, tmp_values, color=color, linewidth=1, linestyle='solid', label=model_name)\n", + " ax_base.fill(tmp_angles, tmp_values, color=color, alpha=alpha)\n", + " \n", + "angles += [angles[0]]\n", + "ax_base.set_ylim(0, 100)\n", + "ax_base.set_yticks([40, 60, 80, 100])\n", + "ax_base.set_yticklabels([''] * 4)\n", + "\n", + "ax_base.tick_params(pad=25)\n", + "ax_base.set_xticks(angles[:-1])\n", + "ax_base.set_xticklabels(labels, fontproperties=font18)\n", + "\n", + "leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)\n", + "for line in leg.get_lines():\n", + " line.set_linewidth(2.5)\n", + "\n", + "cx, cy, sz = 0.44, 0.435, 0.34\n", + "axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]\n", + " \n", + "for ax, angle, label in zip(axes, angles_deg, labels):\n", + " ax.patch.set_visible(False)\n", + " ax.grid(False)\n", + " ax.xaxis.set_visible(False)\n", + " cur_range = range_map[label]\n", + " label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]\n", + " label_list = [f'{x:.1f}' for x in label_list]\n", + " ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)\n", + " ax.spines['polar'].set_visible(False)\n", + " ax.set_ylim(0, 100)\n", + "\n", + "title_text = f'{len(model2vis)} Representative VLMs on MMBench Test.'\n", + "plt.figtext(.7, .95, title_text, fontproperties=font18, 
ha='center')\n", + "plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "labels = ['SEEDBench_IMG', 'CCBench', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MME', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']\n", + "num_vars = len(labels)\n", + "\n", + "raw_data = [{k: data[m][k]['Overall'] for k in labels} for m in model2vis]\n", + "data_list, range_map = pre_normalize(raw_data, labels)\n", + "\n", + "alpha = 0.25\n", + "angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()\n", + "angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()\n", + "fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))\n", + "\n", + "for i in range(len(data_list)):\n", + " item = data_list[i]\n", + " model_name = model2vis[i]\n", + " color = colors[i]\n", + " tmp_angles = angles[:] + [angles[0]]\n", + " tmp_values = [item[lb] for lb in labels] + [item[labels[0]]]\n", + " ax_base.plot(tmp_angles, tmp_values, color=color, linewidth=1, linestyle='solid', label=model_name)\n", + " ax_base.fill(tmp_angles, tmp_values, color=color, alpha=alpha)\n", + " \n", + "angles += [angles[0]]\n", + "ax_base.set_ylim(0, 100)\n", + "ax_base.set_yticks([40, 60, 80, 100])\n", + "ax_base.set_yticklabels([''] * 4)\n", + "\n", + "ax_base.tick_params(pad=15)\n", + "ax_base.set_xticks(angles[:-1])\n", + "ax_base.set_xticklabels(labels, fontproperties=font18)\n", + "\n", + "dataset_map = {\n", + " 'MMBench_TEST_EN': 'MMBench (Test)', \n", + " 'MMBench_TEST_CN': 'MMBenchCN (Test)', \n", + " 'MathVista': 'MathVista (TestMini)', \n", + " 'MMMU_VAL': 'MMMU (Val)'\n", + "}\n", + "for i, label in enumerate(ax_base.get_xticklabels()):\n", + " x,y = label.get_position()\n", + " text = label.get_text()\n", + " text = dataset_map[text] if text in dataset_map else text\n", + " lab = ax_base.text(x, y, text, transform=label.get_transform(),\n", + " ha=label.get_ha(), 
va=label.get_va(), font_properties=font15)\n", + " lab.set_rotation(360 / num_vars * i + 270)\n", + " labels.append(lab)\n", + "ax_base.set_xticklabels([])\n", + "\n", + "leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)\n", + "for line in leg.get_lines():\n", + " line.set_linewidth(2.5)\n", + "\n", + "cx, cy, sz = 0.44, 0.435, 0.34\n", + "axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]\n", + " \n", + "for ax, angle, label in zip(axes, angles_deg, labels):\n", + " ax.patch.set_visible(False)\n", + " ax.grid(False)\n", + " ax.xaxis.set_visible(False)\n", + " cur_range = range_map[label]\n", + " label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]\n", + " label_list = [f'{x:.1f}' for x in label_list]\n", + " ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)\n", + " ax.spines['polar'].set_visible(False)\n", + " ax.set_ylim(0, 100)\n", + "\n", + "title_text = f'{len(model2vis)} Representative VLMs on {num_vars} Benchmarks in OpenCompass Multi-Modal Leaderboard.'\n", + "plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')\n", + "plt.show()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "base", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/setup.cfg b/evaluation/cosmos3/reasoner/vlmevalkit/setup.cfg new file mode 100644 index 00000000..8a683f39 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/setup.cfg @@ -0,0 +1,23 @@ +[flake8] +max-line-length = 99 +ignore 
def parse_requirements(fname='requirements.txt', with_version=True):
    """Parse the package dependencies listed in a requirements file.

    Args:
        fname (str): path to requirements file
        with_version (bool, default=True): if True include version specs;
            if False return only the package names (environment markers,
            when present, are still appended)

    Returns:
        List[str]: list of requirements items; empty if ``fname`` is missing

    CommandLine:
        python -c "import setup; print(setup.parse_requirements())"
    """

    require_fpath = fname
    # Longer operators come first so ``>=`` is never split as ``>``.
    version_pat = '(' + '|'.join(['===', '>=', '<=', '==', '~=', '!=', '>', '<']) + ')'

    def parse_line(line):
        """Parse information from a line in a requirements text file."""
        if line.startswith('-r '):
            # Allow specifying requirements in other files
            target = line.split(' ')[1]
            for info in parse_require_file(target):
                yield info
        else:
            info = {'line': line}
            if line.startswith('-e '):
                info['package'] = line.split('#egg=')[1]
            elif '@git+' in line:
                info['package'] = line
            else:
                # Split the environment marker off first: markers contain
                # comparison operators of their own (``sys_platform == ...``)
                # that must not be mistaken for the version specifier.
                requirement, _, marker = line.partition(';')
                parts = re.split(version_pat, requirement.strip(), maxsplit=1)
                parts = [p.strip() for p in parts]

                info['package'] = parts[0]
                if len(parts) > 1:
                    op, version = parts[1:]
                    info['version'] = (op, version)
                marker = marker.strip()
                if marker:
                    # Platform specific dependencies
                    # https://setuptools.readthedocs.io/en/latest/setuptools.html#declaring-platform-specific-dependencies
                    info['platform_deps'] = marker
            yield info

    def parse_require_file(fpath):
        with open(fpath, 'r', encoding='utf-8') as f:
            for line in f:
                line = line.strip()
                if not line or line.startswith('#'):
                    continue
                # Whitespace followed by ``#`` starts an inline comment;
                # ``#egg=`` fragments have no leading whitespace and survive.
                line = re.split(r'\s+#', line, maxsplit=1)[0].strip()
                if line:
                    for info in parse_line(line):
                        yield info

    def gen_packages_items():
        if exists(require_fpath):
            for info in parse_require_file(require_fpath):
                parts = [info['package']]
                if with_version and 'version' in info:
                    parts.extend(info['version'])
                if not sys.version.startswith('3.4'):
                    # apparently package_deps are broken in 3.4
                    platform_deps = info.get('platform_deps')
                    if platform_deps is not None:
                        parts.append(';' + platform_deps)
                item = ''.join(parts)
                yield item

    packages = list(gen_packages_items())
    return packages
def do_setup():
    """Call ``setup`` with the packaging metadata for the ``vlmeval`` distribution."""
    person, contact = 'Haodong Duan', 'dhd.efz@gmail.com'
    python_versions = ('3.7', '3.8', '3.9', '3.10')
    audiences = ('Developers', 'Education', 'Science/Research')

    # Language classifiers first, then audiences, matching the published order.
    classifiers = [f'Programming Language :: Python :: {ver}' for ver in python_versions]
    classifiers += [f'Intended Audience :: {aud}' for aud in audiences]

    metadata = dict(
        name='vlmeval',
        version='0.1.0',
        description='OpenCompass VLM Evaluation Kit',
        author=person,
        author_email=contact,
        maintainer=person,
        maintainer_email=contact,
        long_description=readme,
        long_description_content_type='text/markdown',
        cmdclass={},
        install_requires=parse_requirements('requirements.txt'),
        setup_requires=[],
        python_requires='>=3.7.0',
        packages=find_packages(exclude=['test*', 'paper_test*']),
        keywords=['AI', 'NLP', 'in-context learning'],
        entry_points={'console_scripts': ['vlmutil = vlmeval:cli']},
        classifiers=classifiers,
    )
    setup(**metadata)
def load_env():
    """Configure root logging and export variables from the repository ``.env`` file.

    The file is looked up two directories above this module. When it is
    absent an error is logged and nothing else happens. Otherwise every
    entry with a non-empty value is copied into ``os.environ``.
    """
    import logging
    import os
    from pathlib import Path

    log_format = '[%(asctime)s] %(levelname)s - %(filename)s: %(funcName)s - %(lineno)d: %(message)s'
    logging.basicConfig(format=log_format, datefmt='%Y-%m-%d %H:%M:%S')

    pth = Path(__file__).parent.parent / '.env'
    if pth.exists():
        # Imported lazily so the dependency is only needed when a .env file exists.
        from dotenv import dotenv_values

        non_empty = {
            key: value
            for key, value in dotenv_values(str(pth)).items()
            if value is not None and len(value)
        }
        for key, value in non_empty.items():
            os.environ[key] = value
        logging.info(f'API Keys successfully loaded from {pth}')
    else:
        logging.error(f'Did not detect the .env file at {pth}, failed to load.')
import HunyuanVision +from .jt_vl_chat import JTVLChatAPI +from .jt_vl_chat_mini import JTVLChatAPI_2B, JTVLChatAPI_Mini +from .kimivl_api import KimiVLAPI, KimiVLAPIWrapper +from .lmdeploy import LMDeployAPI, LMDeployWrapper +from .lvs_service import LVS_service +from .mimo import MiMo +from .minimax_api import MiniMaxAPI +from .mug_u import MUGUAPI +from .nemotron import Nemotron +from .nv_gemini import NVGemini +from .openai_sdk import OpenAISDKWrapper +from .qwen_api import QwenAPI +from .qwen_vl_api import Qwen2VLAPI, QwenVLAPI, QwenVLWrapper +from .rbdashmm_chat3_5_api import RBdashMMChat3_5_38B_API, RBdashMMChat3_78B_API +from .rbdashmm_chat3_api import RBdashChat3_5_API, RBdashMMChat3_API +from .reka import Reka +from .sensechat_vision import SenseChatVisionAPI, SenseChatVisionV2API +from .siliconflow import SiliconFlowAPI, TeleMMAPI +from .taichu import TaichuVLAPI, TaichuVLRAPI +from .taiyi import TaiyiAPI +from .telemm import TeleMM2_API +from .telemm_thinking import TeleMM2Thinking_API +from .together import TogetherAPI +from .video_chat_online_v2 import VideoChatOnlineV2API + +__all__ = [ + 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V', 'Gemini', 'QwenVLWrapper', + 'QwenVLAPI', 'QwenAPI', 'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI', 'CWWrapper', + 'SenseChatVisionAPI', 'SenseChatVisionV2API', 'HunyuanVision', 'Qwen2VLAPI', 'BlueLMWrapper', 'BlueLM_API', + 'JTVLChatAPI', 'JTVLChatAPI_Mini', 'JTVLChatAPI_2B', 'bailingMMAPI', 'TaiyiAPI', 'TeleMMAPI', + 'SiliconFlowAPI', 'LMDeployAPI', 'ARM_thinker', 'OpenAISDKWrapper', 'LMDeployWrapper', + 'TaichuVLAPI', 'TaichuVLRAPI', 'DoubaoVL', "MUGUAPI", 'KimiVLAPIWrapper', 'KimiVLAPI', + 'RBdashMMChat3_API', 'RBdashChat3_5_API', 'RBdashMMChat3_78B_API', 'RBdashMMChat3_5_38B_API', + 'VideoChatOnlineV2API', 'TeleMM2_API', 'TeleMM2Thinking_API', 'TogetherAPI', 'GCPVertexAPI', + 'BedrockAPI', 'SenseChatVisionV2API', 'MiniMaxAPI', + 'CosmosReason1', 'CosmosReason1Think', + 'CosmosReason2', 
_ADAPTER_REGISTRY = {}


def register_adapter(name, factory=None):
    """Register an adapter class/factory under the given name.

    Can be used as a decorator::

        @register_adapter('my_adapter')
        class MyAdapter(ModelAdapter): ...

    Or called directly for variants::

        register_adapter('my_adapter-variant', partial(MyAdapter, flag=True))

    Args:
        name: Registry key. An existing entry under the same key is replaced.
        factory: Callable that builds the adapter. When omitted, a class
            decorator is returned instead.

    Returns:
        The decorator (when ``factory`` is None) or ``factory`` itself.

    Raises:
        TypeError: If ``factory`` is given but is not callable.
    """
    if factory is None:
        def decorator(cls):
            _ADAPTER_REGISTRY[name] = cls
            return cls
        return decorator
    if not callable(factory):
        # Fail at the registration site rather than later in build_adapter,
        # where the error would no longer point at the faulty registration.
        raise TypeError(
            f"Adapter factory for '{name}' must be callable, got {type(factory).__name__}")
    _ADAPTER_REGISTRY[name] = factory
    return factory


def build_adapter(name, **kwargs):
    """Instantiate a registered adapter by name.

    Args:
        name: Key previously passed to ``register_adapter``.
        **kwargs: Forwarded to the registered class/factory.

    Raises:
        KeyError: If no adapter is registered under ``name``.
    """
    try:
        factory = _ADAPTER_REGISTRY[name]
    except KeyError:
        available = list(_ADAPTER_REGISTRY)
        raise KeyError(
            f"Adapter '{name}' not found in registry. Available: {available}") from None
    return factory(**kwargs)


def get_adapter_registry():
    """Return a copy of the global adapter registry."""
    return dict(_ADAPTER_REGISTRY)
@register_adapter('cogvlm2')
class CogVLM2Adapter(ModelAdapter):
    """Adapter that builds CogVLM2-specific prompts for multiple-choice datasets."""

    def dump_image(self, line, dataset):
        """Return image path(s) for this sample via the injected dump function."""
        return self.dump_image_func(line)

    def use_custom_prompt(self, dataset, system_prompt=None):
        """Return True only when ``dataset`` is a multiple-choice (``'MCQ'``) dataset."""
        assert dataset is not None
        # Compare for equality: a substring test against 'MCQ' would also
        # accept values such as '' or 'M'.
        return DATASET_TYPE(dataset) == 'MCQ'

    def build_prompt(self, line, dataset=None):
        """Construct the message list (text prompt followed by images) for one sample.

        Args:
            line: Sample record; read via ``line['question']``, the optional
                ``'hint'`` and the option columns ``'A'``..``'Z'``.
            dataset: Dataset name; must satisfy ``use_custom_prompt``.

        Returns:
            list[dict]: One ``type='text'`` entry, then one ``type='image'``
            entry per path returned by ``dump_image``.
        """
        assert dataset is None or isinstance(dataset, str)
        assert self.use_custom_prompt(dataset)
        tgt_path = self.dump_image(line, dataset)

        if dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
            prompt = self._build_mcq_prompt(line)
        else:
            prompt = line['question']

        message = [dict(type='text', value=prompt)]
        message.extend([dict(type='image', value=p) for p in tgt_path])
        return message

    @staticmethod
    def _build_mcq_prompt(line):
        """Assemble hint, question, lettered options and the answer instruction."""
        question = line['question']
        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
        if hint is not None:
            question = hint + '\n' + question

        # Keep only the option letters that are present and not NaN.
        options = {
            cand: line[cand]
            for cand in string.ascii_uppercase
            if cand in line and not pd.isna(line[cand])
        }
        for key, item in options.items():
            question += f'\n{key}. {item}'

        # The instruction language follows the language of the question.
        if not cn_string(question):
            return question + '\n' + "Answer with the option's letter from the given choices directly."
        return question + '\n' + '请直接回答选项字母。'
build_prompt(self, line, dataset=None): + from pathlib import Path + + import yaml + + from vlmeval.dataset import build_dataset, infer_dataset_basename + from vlmeval.vlm.internvl.utils import (build_mcq_cot_prompt, build_multi_choice_prompt, + build_qa_cot_prompt, format_nav_prompt, + pile_action_history) + + assert self.use_custom_prompt(dataset) + + if listinstr(['ChartMimic'], dataset): + input_figure_path_rel = line['input_figure'] + ROOT = LMUDataRoot() + img_root = os.path.join(ROOT, 'images', 'ChartMimic') + tgt_path = [os.path.join(img_root, input_figure_path_rel)] + else: + tgt_path = self.dump_image(line, dataset) + + if DATASET_TYPE(dataset) == 'Y/N': + question = line['question'] + if listinstr(['MME'], dataset): + prompt = question + ' Answer the question using a single word or phrase.' + elif listinstr(['HallusionBench', 'AMBER'], dataset): + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + else: + prompt = question + elif DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if os.getenv('USE_COT') == '1': + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase.' 
+ elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse', + 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial', + 'WeMath', 'LogicVista', 'MM-IFEval', 'ChartMimic'], dataset): + prompt = question + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + else: + prompt = question + '\nAnswer the question using a single word or phrase.' + elif DATASET_TYPE(dataset) == 'GUI': + vlmeval_root = Path(__file__).parent.parent.parent + tmpl_path = os.path.join(vlmeval_root, 'vlm/internvl/gui_template.yaml') + with open(tmpl_path, 'r') as f: + GUI_TEMPLATE = yaml.load(f, Loader=yaml.FullLoader) + ds_basename = infer_dataset_basename(dataset) + ds = build_dataset(dataset, skeleton=True) + action_space = ds.get_action_space() + traj_dict = ds.get_trajectory(line) + prompt_config = GUI_TEMPLATE[ds_basename] + if 'history' in prompt_config['placeholders']: + traj_dict['history'] = pile_action_history(traj_dict['history']) + prompt = format_nav_prompt( + ( + 'Please provide the bounding box coordinate of the region this sentence describes: {task}' # noqa: E501 + if self.screen_parse + else prompt_config['template'] + ), + prompt_config['placeholders'], + action_space=action_space, + **traj_dict, + ) + else: + prompt = line['question'] + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + + message = [dict(type='image', value=s) for s in tgt_path] + message.append(dict(type='text', value=prompt)) + return message + + def process_inputs(self, inputs, dataset=None): + from vlmeval.vlm.internvl.utils import build_video_prompt, reorganize_prompt + + image_items = [x.copy() for x in inputs if x['type'] == 'image'] + image_num = len(image_items) + prompt = reorganize_prompt(inputs, image_num, dataset=dataset) + + if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO': + prompt = build_video_prompt(prompt, dataset) + + if listinstr(['MMMU'], dataset) and 
def postprocess(self, response, dataset=None):
    """Strip the model's reasoning section and return only the final answer.

    Two layouts are handled when ``self.split_think`` is set:

    * both delimiters present: the answer is the text after the first
      closing tag that follows the first opening tag;
    * only the closing delimiter present: the answer is the text after the
      first closing tag.

    The removed reasoning text is written to the module logger. When
    ``split_think`` is off, or no closing tag is found, ``response`` is
    returned unchanged. ``dataset`` is not used.
    """
    # NOTE(review): the delimiters appeared as empty strings in the reviewed
    # text, which makes `'' in response` always true and makes
    # str.partition('') raise ValueError. They are restored here as the
    # <think>/</think> pair the two branches imply - confirm against the model's
    # actual output format.
    open_tag = '<think>'
    close_tag = '</think>'

    if not self.split_think or close_tag not in response:
        return response

    if open_tag in response:
        thinking, _, answer = response.partition(open_tag)[-1].partition(close_tag)
    else:
        thinking, _, answer = response.partition(close_tag)

    logger.info('-----------Thinking-----------\n'
                f'{thinking}\n'
                '------------------------------')
    return answer
+ + tgt_path = self.dump_image(line, dataset) + question_type = line['question_type'] + field = line['category'] + question = line['question'] + + if question_type == 'exact_match': + prompt = EXACT_MATCH_PROMPT.format(discipline=field) + question = prompt + ' ' + question + elif question_type == 'mcq': + prompt = MCQ_PROMPT.format(discipline=field) + question = prompt + ' ' + question + if not pd.isna(line['A']): + question += '\nChoices are:\n' + for ch in string.ascii_uppercase[:15]: + if not pd.isna(line[ch]): + question += f'{ch}. {line[ch]}\n' + else: + break + elif question_type == 'open_ended': + prompt = OPEN_QUESTION_PROMPT.format(discipline=field) + question = prompt + ' ' + question + + prompt_segs = question.split('') + assert len(prompt_segs) == len(tgt_path) + 1 + msgs = [] + for i in range(len(tgt_path)): + text = prompt_segs[i].strip() + if text: + msgs.append(dict(type='text', value=text)) + msgs.append(dict(type='image', value=tgt_path[i])) + text = prompt_segs[-1].strip() + if text: + msgs.append(dict(type='text', value=text)) + return msgs + + def _build_hipho_prompt(self, line, dataset): + def safe_str(val): + return '' if pd.isna(val) or val == '' else str(val) + + context = safe_str(line.get('context', '')) + question = safe_str(line['question']) + information = safe_str(line.get('information', '')) + + SYSTEM_PROMPTS_EN = ( + 'Please answer the problem adhering to the following rules:\n' + '1. Please use LaTeX format to represent the variables and formulas ' + 'used in the solution process and results.\n' + '2. Please put the final answer(s) in \\boxed{}, note that the unit of ' + 'the answer should not be included in \\boxed{}.\n' + '3. 
If the problem requires multiple answers, list them in order, each in a separate \\boxed{}.\n' + 'Problem: Information:{information}\n' + 'Context:{context}\n' + 'Question: {problem}' + ) + formatted_prompt = ( + SYSTEM_PROMPTS_EN + .replace('{context}', context) + .replace('{problem}', question) + .replace('{information}', information) + ) + + msgs = [] + image_val = str(line.get('image', '')).strip() + if image_val and not image_val.startswith('NO_IMAGE_PLACEHOLDER_'): + tgt_path = self.dump_image(line, dataset) + if tgt_path and tgt_path != ['']: + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs.append(dict(type='image', value=tgt_path)) + + msgs.append(dict(type='text', value=formatted_prompt)) + return msgs + + def process_inputs(self, inputs, dataset=None): + from vlmeval.vlm.internvl.utils import build_video_prompt, reorganize_prompt + + image_items = [x.copy() for x in inputs if x['type'] == 'image'] + image_num = len(image_items) + prompt = reorganize_prompt(inputs, image_num, dataset=dataset) + + if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO': + prompt = build_video_prompt(prompt, dataset) + + if listinstr(['MMMU'], dataset) and len(image_items) > 0: + image = Image.open(image_items[0]['value']) + image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR) + tmp_image = io.BytesIO() + image.save(tmp_image, format='PNG') + image_items[0]['value'] = tmp_image + + prompt = prompt.replace('', '') + return [*image_items, dict(type='text', value=prompt)] + + def postprocess(self, response, dataset=None): + if self.split_think and '' in response and '' in response: + thinking, _, answer = response.partition('')[-1].partition('') + logger.info('-----------Thinking-----------\n' + f'{thinking}\n' + '------------------------------') + return answer + elif self.split_think and '' in response: + thinking, _, answer = response.partition('') + 
logger.info('-----------Thinking-----------\n' + f'{thinking}\n' + '------------------------------') + return answer + return response diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/adapters/internvl2.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/adapters/internvl2.py new file mode 100644 index 00000000..81a3cb95 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/adapters/internvl2.py @@ -0,0 +1,166 @@ +import copy +from functools import partial + +from vlmeval.dataset import DATASET_MODALITY, DATASET_TYPE +from vlmeval.smp import LMUDataRoot, listinstr +from .base import ModelAdapter, register_adapter + + +@register_adapter('internvl2') +class InternVL2Adapter(ModelAdapter): + + def __init__(self, use_mpo_prompt=False): + self.use_mpo_prompt = use_mpo_prompt + self.cot_prompt = None + self.screen_parse = False + + def dump_image(self, line, dataset): + return self.dump_image_func(line) + + def use_custom_prompt(self, dataset, system_prompt=None): + assert dataset is not None + if dataset in [ + 'atomic_dataset', 'electro_dataset', 'mechanics_dataset', + 'optics_dataset', 'quantum_dataset', 'statistics_dataset', + 'Physics', 'SFE', 'SFE-zh', + 'XLRS-Bench-lite', 'OmniEarth-Bench', + ]: + return False + if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN', 'WeMath_COT', 'MMAlignBench'], dataset): + return False + if system_prompt is not None and '' in system_prompt and listinstr( + ['MicroVQA', 'MSEarthMCQ', 'MMSci_DEV_MCQ', 'MMMU', 'VisuLogic'], dataset + ): + return False + if DATASET_MODALITY(dataset) == 'VIDEO': + return False + return True + + def build_prompt(self, line, dataset=None): + import os + from pathlib import Path + + import yaml + + from vlmeval.dataset import build_dataset, infer_dataset_basename + from vlmeval.vlm.internvl.utils import (build_mcq_cot_prompt, build_mpo_prompt, + build_multi_choice_prompt, build_qa_cot_prompt, + format_nav_prompt, pile_action_history) + + use_mpo_prompt = 
self.use_mpo_prompt and ( + getattr(self, 'use_cot', False) + or dataset in ['MMStar', 'HallusionBench', 'OCRBench'] + ) + + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + + if not listinstr(['ChartMimic'], dataset): + tgt_path = self.dump_image(line, dataset) + else: + input_figure_path_rel = line['input_figure'] + ROOT = LMUDataRoot() + img_root = os.path.join(ROOT, 'images', 'ChartMimic') + tgt_path = [os.path.join(img_root, input_figure_path_rel)] + + if dataset is not None and DATASET_TYPE(dataset) == 'Y/N': + question = line['question'] + if listinstr(['MME'], dataset): + prompt = question + ' Answer the question using a single word or phrase.' + elif listinstr(['HallusionBench', 'AMBER'], dataset): + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + else: + prompt = question + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if os.getenv('USE_COT') == '1': + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase.' + elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse', + 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial', + 'WeMath', 'LogicVista', 'MM-IFEval', 'ChartMimic'], dataset): + prompt = question + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + else: + prompt = question + '\nAnswer the question using a single word or phrase.' 
def get_max_num(self, dataset):
    """Pick the patch budget that ``process_payload`` writes as ``max_dynamic_patch``.

    Precedence: no dataset -> 6; video datasets -> 1; then the first matching
    name tier (12, 18, 24); then GUI-type datasets -> 12; otherwise 6.
    """
    fallback = 6
    if dataset is None:
        return fallback

    if DATASET_MODALITY(dataset) == 'VIDEO':
        return 1

    # Checked in order; the first tier whose names match wins.
    tiers = (
        (12, ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
              'VCR_EN', 'VCR_ZH', 'OCRVQA']),
        (18, ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']),
        (24, ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']),
    )
    for budget, names in tiers:
        if listinstr(names, dataset):
            return budget

    return 12 if DATASET_TYPE(dataset) == 'GUI' else fallback
def _get_gui_template():
    """Return the parsed GUI prompt templates, caching them in ``_GUI_TEMPLATE``.

    The YAML file ``vlm/internvl/gui_template.yaml`` (resolved three
    directories above this module) is read on the first call; later calls
    reuse the module-level cache as long as it is not ``None``.
    """
    global _GUI_TEMPLATE
    if _GUI_TEMPLATE is None:
        from pathlib import Path

        import yaml
        vlmeval_root = Path(__file__).parent.parent.parent
        tmpl_path = os.path.join(vlmeval_root, 'vlm/internvl/gui_template.yaml')
        # Read as UTF-8 explicitly: open() otherwise uses the platform's
        # default encoding, which can fail or garble non-ASCII template text.
        with open(tmpl_path, 'r', encoding='utf-8') as f:
            _GUI_TEMPLATE = yaml.load(f, Loader=yaml.FullLoader)
    return _GUI_TEMPLATE
def use_custom_prompt(self, dataset, system_prompt=None):
    """Report whether ``build_prompt`` should be used for ``dataset``.

    Exact-name opt-outs, listinstr-matched opt-outs and video datasets all
    yield ``False``; the remaining datasets qualify only when their type is
    Y/N, MCQ, VQA or GUI. ``system_prompt`` is not consulted.
    """
    named_out = dataset in [
        'atomic_dataset', 'electro_dataset', 'mechanics_dataset',
        'optics_dataset', 'quantum_dataset', 'statistics_dataset',
        'Physics', 'SFE', 'SFE-zh', 'IPhO_2025', 'XLRS-Bench-lite',
        'OmniEarth-Bench',
    ]
    # `or` short-circuits, so the helpers run in the same order as the
    # original if/elif ladder and only when the earlier checks did not match.
    if (
        named_out
        or listinstr([
            'MMDU', 'MME-RealWorld', 'MME-RealWorld-CN', 'WeMath_COT',
            'MMAlignBench', 'ScreenSpot', 'ChartQAPro', 'MMMU',
        ], dataset)
        or DATASET_MODALITY(dataset) == 'VIDEO'
    ):
        return False
    return DATASET_TYPE(dataset) in ['Y/N', 'MCQ', 'VQA', 'GUI']
+ elif listinstr(['HallusionBench', 'AMBER'], dataset): + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + else: + prompt = question + elif DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if os.getenv('USE_COT') == '1': + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase.' + elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse', + 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial', + 'WeMath', 'LogicVista', 'MM-IFEval', 'ChartMimic'], dataset): + prompt = question + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + else: + prompt = question + '\nAnswer the question using a single word or phrase.' 
+ elif DATASET_TYPE(dataset) == 'GUI': + GUI_TEMPLATE = _get_gui_template() + ds_basename = infer_dataset_basename(dataset) + ds = build_dataset(dataset, skeleton=True) + action_space = ds.get_action_space() + traj_dict = ds.get_trajectory(line) + prompt_config = GUI_TEMPLATE[ds_basename] + if 'history' in prompt_config['placeholders']: + traj_dict['history'] = pile_action_history(traj_dict['history']) + prompt = format_nav_prompt( + ( + 'Please provide the bounding box coordinate of the region this sentence describes: {task}' # noqa: E501 + if self.screen_parse + else prompt_config['template'] + ), + prompt_config['placeholders'], + action_space=action_space, + **traj_dict, + ) + else: + prompt = line['question'] + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + + message = [dict(type='image', value=s) for s in tgt_path] + message.append(dict(type='text', value=prompt)) + return message + + def process_inputs(self, inputs, dataset=None): + from PIL import Image + + from vlmeval.vlm.internvl.utils import build_video_prompt, reorganize_prompt + + image_items = [x.copy() for x in inputs if x['type'] == 'image'] + image_num = len(image_items) + prompt = reorganize_prompt(inputs, image_num, dataset=dataset) + + if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO': + prompt = build_video_prompt(prompt, dataset) + + if listinstr(['MMMU'], dataset) and len(image_items) > 0: + image = Image.open(image_items[0]['value']) + image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR) + tmp_image = io.BytesIO() + image.save(tmp_image, format='PNG') + image_items[0]['value'] = tmp_image + + prompt = prompt.replace('', '') + return [*image_items, dict(type='text', value=prompt)] + + def get_max_num(self, dataset): + if dataset is None: + return None + res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld', + 'VCR_EN', 'VCR_ZH', 'OCRVQA'] + res_18_datasets = ['DocVQA_VAL', 
@register_adapter('qwen3')
class Qwen3Adapter(ModelAdapter):
    """Adapter that attaches a ``max_pixels`` processor limit to request payloads."""

    def __init__(self, max_pixels=None):
        # Falsy (None or 0) means "leave payloads untouched".
        self.max_pixels = max_pixels

    def process_payload(self, payload, dataset=None):
        """Return ``payload`` with ``mm_processor_kwargs['max_pixels']`` set.

        The caller's dict is never mutated: a new top-level dict and a new
        ``mm_processor_kwargs`` dict are built. Processor kwargs already
        present in the payload are kept (previously they were discarded);
        ``max_pixels`` from this adapter takes precedence. ``dataset`` is
        not used.
        """
        if not self.max_pixels:
            return payload
        existing = payload.get('mm_processor_kwargs') or {}
        return {
            **payload,
            'mm_processor_kwargs': {**existing, 'max_pixels': self.max_pixels},
        }
def __init__(
    self,
    model: str = None,
    retry: int = 5,
    api_base: str = None,
    key: str = "sk-123456",
    verbose: bool = True,
    timeout: int = 60,
    agent_repo_root: str = None,
    use_role_tool: bool = True,
    **kwargs,
):
    """Connect to an LMDeploy endpoint and make the ARM-Thinker agent importable.

    Args:
        model: Preferred model id; the first id served by the endpoint is
            used when this one is not listed.
        retry: Forwarded to ``BaseAPI``.
        api_base: Chat-completions URL; ``LMDEPLOY_API_BASE`` overrides it.
        key: API key; ``LMDEPLOY_API_KEY`` overrides it.
        verbose: Forwarded to ``BaseAPI``.
        timeout: Request timeout in seconds, also applied to the model-list
            lookup performed here.
        agent_repo_root: Path of the ARM-Thinker checkout, added to
            ``sys.path``.
        use_role_tool: Forwarded to ``VerlAgent`` in agent mode.
        **kwargs: ``extra_pt`` and ``debug_mode`` are consumed here; the
            rest goes to ``BaseAPI``.

    Raises:
        ValueError: If ``agent_repo_root`` is missing or ``arm_agent``
            cannot be imported from it.
    """
    # Environment variables take precedence over the constructor arguments.
    key = os.environ.get("LMDEPLOY_API_KEY", key)
    api_base = os.environ.get("LMDEPLOY_API_BASE", api_base)
    assert key is not None, "Please set the environment variable LMDEPLOY_API_KEY."
    assert (
        api_base is not None
    ), "Please set the environment variable LMDEPLOY_API_BASE."
    self.key = key
    self.api_base = api_base

    # Ask the server which models it serves. Bound the wait so an
    # unreachable endpoint cannot hang construction forever.
    model_url = "".join([api_base.split("v1")[0], "v1/models"])
    headers = {"Authorization": f"Bearer {self.key}"}
    resp = requests.get(model_url, headers=headers, timeout=timeout)
    model_id_list = [str(data["id"]) for data in resp.json()["data"]]
    self.model = model if model in model_id_list else model_id_list[0]

    self.fail_msg = "Failed to obtain answer via API. "
    self.timeout = timeout
    self.extra_pt = kwargs.pop("extra_pt", None)
    self.use_role_tool = use_role_tool
    # Set up logging for agent mode if needed
    self.debug_mode = kwargs.pop("debug_mode", False)
    if self.debug_mode:
        logging.basicConfig(level=logging.DEBUG, force=True)

    super().__init__(
        retry=retry, verbose=verbose, **kwargs
    )

    if agent_repo_root is None:
        raise ValueError(
            'Please set `agent_repo_root` to ARM-Thinker directory, '
            'which is cloned from here: https://github.com/InternLM/ARM-Thinker'
        )

    print(f"agent_repo_root: {agent_repo_root}")
    # Avoid piling up duplicate entries when several wrappers are created.
    if agent_repo_root not in sys.path:
        sys.path.append(agent_repo_root)
    try:
        from arm_agent.agent_verl import VerlAgent  # noqa: F401
    except Exception as err:
        # Keep the original failure attached so the real cause stays visible.
        raise ValueError(
            'Please install ARM-Thinker from https://github.com/InternLM/ARM-Thinker'
        ) from err
def generate_inner(self, inputs, **kwargs) -> tuple:
    """Answer ``inputs`` with one chat-completions call or through the ARM agent.

    Args:
        inputs: Message list accepted by ``prepare_inputs``.
        **kwargs: ``mode`` ("direct", the default, or "agent") and
            ``dataset`` are consumed here. Everything else is merged into
            the request payload (direct) or passed to ``VerlAgent`` (agent).

    Returns:
        tuple: ``(ret_code, answer, extra)``. ``ret_code`` is 0 on success.
        In direct mode ``extra`` is the ``requests`` response and a failing
        HTTP status is returned as ``ret_code``; in agent mode ``extra`` is
        a dict with ``tool_call_count`` and ``conversation``.

    Raises:
        ValueError: If ``mode`` is unknown or an agent image cannot be saved.
    """
    mode = kwargs.pop("mode", "direct")
    dataset = kwargs.pop("dataset", None)

    if mode == "direct":
        return self._generate_direct(inputs, kwargs)
    if mode == "agent":
        return self._generate_agent(inputs, dataset, kwargs)
    raise ValueError(f"Invalid mode: {mode}")


def _compose_messages(self, inputs):
    """Build the chat messages and append ``extra_pt`` to the last content item's text."""
    input_msgs = self.prepare_inputs(inputs)
    if self.extra_pt:
        last_item = input_msgs[-1]["content"][-1]
        last_item["text"] = last_item["text"] + self.extra_pt
    return input_msgs


def _generate_direct(self, inputs, kwargs):
    """POST a single chat-completions request; return ``(ret_code, answer, response)``."""
    input_msgs = self._compose_messages(inputs)
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {self.key}",
    }

    payload = {
        "model": self.model,
        "messages": input_msgs,
        "n": 1,
        "extra_body": {},
    }
    # These two samplers travel inside extra_body. Falsy values are left in
    # kwargs and therefore forwarded at the top level, as before.
    for name in ("repetition_penalty", "top_k"):
        if kwargs.get(name, None):
            payload["extra_body"][name] = kwargs.pop(name)
    payload.update(kwargs)

    # Messages may embed base64 images, so they are left out of the echo.
    payload_copy = payload.copy()
    payload_copy.pop("messages")
    print(f"Full payload except messages:\n{payload_copy}")

    response = requests.post(
        self.api_base,
        headers=headers,
        data=json.dumps(payload),
        timeout=self.timeout * 1.1,
    )
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
    answer = self.fail_msg
    try:
        resp_struct = json.loads(response.text)
        answer = resp_struct["choices"][0]["message"]["content"].strip()
    except (ValueError, KeyError, IndexError, TypeError, AttributeError) as err:
        # Best effort: keep the failure message as the answer, but leave a
        # trace instead of swallowing the parse error silently.
        logging.getLogger(__name__).warning(
            "Could not parse chat completion response: %r", err
        )
    return ret_code, answer, response


def _generate_agent(self, inputs, dataset, kwargs):
    """Run the ARM agent loop; return ``(ret_code, answer, extra_records)``."""
    timestamp = dt.datetime.now().strftime("%Y%m%d%H%M%S")
    temp_dir = f"/tmp/agent_images_{timestamp}"
    # The directory is published through the environment before the agent is built.
    os.environ["TOOL_CALL_IMG_TEMP"] = temp_dir
    os.makedirs(temp_dir, exist_ok=True)

    from arm_agent.agent_verl import VerlAgent
    agent = VerlAgent(
        model_name=self.model,
        api_base=self.api_base.split("chat/completions")[0],
        api_key=self.key,
        use_role_tool=self.use_role_tool,
        **kwargs
    )

    input_msgs = self._compose_messages(inputs)
    result, tool_call_count = agent.run(user_messages=input_msgs)
    rtn = result[-1]["content"]

    # How images inside the recorded conversation are handled:
    #   "base64"       keep the inline data URLs (large result files)
    #   "save_to_path" write images to disk and store relative paths
    #   "hidden"       replace user image URLs with the word "hidden"
    option_for_process_image_in_result = "save_to_path"
    if option_for_process_image_in_result == "hidden":
        for msg in result:
            role = msg.get("role", "")
            content = msg.get("content", "")
            # base64 is too long to print, so mask it first
            if role == "user" and isinstance(content, list):
                for item in content:
                    if "type" in item and item["type"] == "image_url":
                        item["image_url"]["url"] = "hidden"
            print(f"{role.upper()}:\n{content}\n")
    elif option_for_process_image_in_result == "base64":
        pass
    elif option_for_process_image_in_result == "save_to_path":
        self._save_conversation_images(result, dataset, timestamp)
    else:
        raise ValueError(f"Invalid option: {option_for_process_image_in_result}")

    extra_records = {"tool_call_count": tool_call_count, "conversation": result}
    print(f"tool_call_count in extra_records: {tool_call_count}")

    if rtn:
        return 0, rtn, extra_records
    return 1, self.fail_msg, extra_records


def _save_conversation_images(self, result, dataset, timestamp):
    """Write inline base64 images of ``result`` to disk and store relative paths instead."""
    # vlmevalkit_root is path like xxx/VLMEvalKit
    vlmevalkit_root = osp.dirname(osp.dirname(osp.dirname(osp.abspath(__file__))))
    # `dataset` is optional for callers; osp.join(None) would raise TypeError.
    dataset_dir = dataset if dataset is not None else "unknown_dataset"
    save_img_root = osp.join(vlmevalkit_root, "temp", dataset_dir, self.model)
    os.makedirs(save_img_root, exist_ok=True)

    # A short random suffix keeps runs started within the same second apart.
    short_uid = uuid.uuid4().hex[:6]
    sub_dir_name = f"{timestamp}_{short_uid}"
    img_counter = 0
    for msg in result:
        content = msg.get("content", "")
        if not isinstance(content, list):
            continue
        for item in content:
            if not (isinstance(item, dict) and item.get("type") == "image_url"):
                continue
            image_url = item.get("image_url", {}).get("url", "")
            if not image_url:
                continue
            try:
                # Only inline data URLs can be stored; remote URLs are rejected.
                if not image_url.startswith("data:image"):
                    raise ValueError(
                        f"Invalid image url: {image_url}, can only process base64 image url"
                    )
                header, b64_data = image_url.split(",", 1)
                ext = header.split("/")[1].split(";")[0]  # extract extension
                img_bytes = base64.b64decode(b64_data)
                img_name = f"{sub_dir_name}/img_{img_counter:04d}.{ext}"

                img_path = os.path.join(save_img_root, img_name)
                os.makedirs(os.path.dirname(img_path), exist_ok=True)
                with open(img_path, "wb") as f:
                    f.write(img_bytes)

                # The conversation keeps the path relative to save_img_root.
                item["image_url"]["url"] = f"{img_name}"
                img_counter += 1

                print(f"[Saved] {img_path}, {item['image_url']['url']}")
            except Exception as e:
                print(f"Error: {e}")
                print("Traceback:\n" + traceback.format_exc())
                raise ValueError(f"Error: {e}") from e
def prepare_inputs(self, inputs):
    """Return a deep copy of `inputs` with every non-text item's value base64-encoded.

    Args:
        inputs: list of dicts carrying at least the keys 'type' and 'value'.

    Returns:
        list(dict): copied items in the original order. Items whose type is
            'text' are passed through untouched; for every other item the
            'value' is replaced by `self.image_to_base64(value)`, or by an
            empty string when that call raises.
    """
    prepared = []
    # Work on a copy so the caller's message dicts are never modified.
    for entry in cp.deepcopy(inputs):
        if entry['type'] != 'text':
            try:
                encoded = self.image_to_base64(entry['value'])
            except Exception as err:
                # Best effort: an unreadable file is sent as an empty payload.
                if self.verbose:
                    logger.error(err)
                encoded = ''
            entry['value'] = encoded
        prepared.append(entry)
    return prepared
' + except Exception as e: + if self.verbose: + logger.error(e) + logger.error(f'The input messages are {inputs}.') + return -1, self.fail_msg, '' + + +class bailingMMAPI(bailingMMWrapper): + + def generate(self, message, dataset=None): + return super(bailingMMAPI, self).generate(message, dataset=dataset) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/base.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/base.py new file mode 100644 index 00000000..fa3d5d21 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/base.py @@ -0,0 +1,335 @@ +import copy as cp +import random as rd +import threading +import time +from abc import abstractmethod + +from vlmeval.smp import concat_images_vlmeval, get_logger, parse_file + +logger = get_logger(__name__) + + +class BaseAPI: + + allowed_types = ['text', 'image', 'video'] + INTERLEAVE = True + INSTALL_REQ = False + + def __init__(self, + retry=10, + wait=1, + system_prompt=None, + verbose=True, + fail_msg='Failed to obtain answer via API.', + **kwargs): + """Base Class for all APIs. + + Args: + retry (int, optional): The retry times for `generate_inner`. Defaults to 10. + wait (int, optional): The wait time after each failed retry of `generate_inner`. Defaults to 1. + system_prompt (str, optional): Defaults to None. + verbose (bool, optional): Defaults to True. + fail_msg (str, optional): The message to return when failed to obtain answer. + Defaults to 'Failed to obtain answer via API.'. + **kwargs: Other kwargs for `generate_inner`. + """ + + self.wait = wait + self.retry = retry + self.system_prompt = system_prompt + self.verbose = verbose + self.fail_msg = fail_msg + self._fail_reason_local = threading.local() + + if len(kwargs): + logger.info(f'BaseAPI received the following kwargs: {kwargs}') + logger.info('Will try to use them as kwargs for `generate`. 
def working(self):
    """If the API model is working, return True, else return False.

    Sends the probe message 'hello' through `self.generate` up to five times.
    While probing, an existing `self.timeout` attribute is temporarily set to
    120 and is always put back afterwards, including when the original value
    was None and when `generate` raises.

    Returns:
        bool: If the API model is working, return True, else return False.
    """
    # Track presence separately from the value: a timeout of None is a valid
    # setting that must be restored too.
    has_timeout = hasattr(self, 'timeout')
    # `old_timeout` stays an instance attribute, as before, for any external reader.
    self.old_timeout = self.timeout if has_timeout else None
    if has_timeout:
        self.timeout = 120

    try:
        for _ in range(5):
            ret = self.generate('hello')
            if ret is not None and ret != '' and self.fail_msg not in ret:
                return True
        return False
    finally:
        if has_timeout:
            self.timeout = self.old_timeout
def chat_inner(self, inputs, **kwargs):
    """Call `generate_inner` on the conversation, dropping the oldest turns on failure.

    The conversation may exceed the context window, so after every exception
    the first message is removed, followed by any leading messages whose role
    is not 'user', and the call is retried on what is left.

    Args:
        inputs: list of message dicts, each with a 'role' key.
        **kwargs: forwarded to `generate_inner`; a 'dataset' entry is discarded first.

    Returns:
        tuple: whatever `generate_inner` returns, or
            `(-1, <fail_msg>: Failed with all possible conversation turns., None)`
            once no messages remain.
    """
    kwargs.pop('dataset', None)
    remaining = inputs
    while len(remaining) > 0:
        try:
            return self.generate_inner(remaining, **kwargs)
        except Exception as err:
            if self.verbose:
                logger.info(f'{type(err)}: {err}')
        # Slicing builds new lists, so the caller's list is left intact.
        remaining = remaining[1:]
        while len(remaining) > 0 and remaining[0]['role'] != 'user':
            remaining = remaining[1:]
    failure = self.fail_msg + ': ' + 'Failed with all possible conversation turns.'
    return -1, failure, None
def generate(self, message, **kwargs1):
    """The main function to generate the answer. Will call `generate_inner` with the preprocessed input messages.

    Args:
        message: raw input messages.
        **kwargs1: per-call options, merged over a copy of `self.default_kwargs`
            and forwarded to `generate_inner`.

    Returns:
        str | dict: The generated answer, or
            `{"prediction": answer, "extra_records": log}` when `generate_inner`
            returns a dict as its log. If every attempt fails, the last
            non-empty answer, or `self.fail_msg` when there is none.
    """
    if self.check_content(message) == 'listdict':
        message = self.preprocess_message_with_role(message)

    assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
    message = self.preproc_content(message)
    assert message is not None and self.check_content(message) == 'listdict'

    # Defensive check: ensure allowed_types is set (fix for multiprocessing/state issues)
    if not hasattr(self, 'allowed_types') or self.allowed_types is None:
        if self.verbose:
            logger.warning(f'allowed_types was None or missing! Setting to default. Class: {self.__class__.__name__}')
        self.allowed_types = ['text', 'image', 'video']

    for item in message:
        try:
            assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
        except TypeError as e:
            logger.error(f'TypeError checking allowed_types: {e}')
            logger.error(f'allowed_types value: {self.allowed_types}')
            logger.error(f'allowed_types type: {type(self.allowed_types)}')
            logger.error(f'Class: {self.__class__.__name__}')
            # Force set it again
            self.allowed_types = ['text', 'image', 'video']
            # Re-check
            assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'

    # merge kwargs
    kwargs = cp.deepcopy(self.default_kwargs)
    kwargs.update(kwargs1)

    answer = None
    ret_code = -1
    # a very small random delay [0s - 0.5s]
    T = rd.random() * 0.5
    time.sleep(T)

    for i in range(self.retry):
        try:
            ret_code, answer, log = self.generate_inner(message, **kwargs)
            # Check answer is valid before using 'in' operator to avoid TypeError with None
            if ret_code == 0 and answer and answer != '' and self.fail_msg not in answer:
                if self.verbose:
                    print(answer)
                # Return both answer and extra_records
                # Add for agent evaluation
                if isinstance(log, dict):
                    return {"prediction": answer, "extra_records": log}
                return answer
            elif self.verbose:
                if not isinstance(log, str):
                    try:
                        log = log.text
                    except Exception as e:
                        logger.warning(f'Failed to parse {log} as an http response: {str(e)}. ')
                logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}')
        except Exception as err:
            if self.verbose:
                logger.error(f'An error occured during try {i}: ')
                logger.error(f'{type(err)}: {err}')
        # delay before each retry; nothing follows the last attempt, so do not
        # sleep after it (same guard as in `chat`)
        if i < self.retry - 1:
            T = rd.random() * self.wait * 2
            time.sleep(T)

    self._fail_reason_local.reason = 2 if ret_code == -2 else 1  # 1=recoverable, 2=content_filter
    return self.fail_msg if answer in ['', None] else answer
00000000..0f97a826 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/bedrock.py @@ -0,0 +1,173 @@ +""" +AWS Bedrock API support via the Converse API. +Supports vision models (e.g. Claude on Bedrock) with image inputs. +Set AWS credentials via environment (AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION) +or default profile. + +Requires: pip install boto3 +""" +import base64 +import os +import os.path as osp + +from ..smp import get_logger +from .base import BaseAPI + +try: + import boto3 + from botocore.exceptions import BotoCoreError, ClientError +except ImportError: + boto3 = None + BotoCoreError = ClientError = Exception + +logger = get_logger(__name__) + +# Common Bedrock vision model IDs (region-specific; use full ID in config) +BEDROCK_VISION_MODELS = { + "claude-3-5-sonnet": "anthropic.claude-3-5-sonnet-20241022-v2:0", + "claude-3-opus": "anthropic.claude-3-opus-20240229-v1:0", + "claude-3-sonnet": "anthropic.claude-3-sonnet-20240229-v1:0", + "claude-3-haiku": "anthropic.claude-3-haiku-20240307-v1:0", +} + + +def _image_path_to_format(path): + ext = osp.splitext(path)[-1].lower() + if ext in (".jpg", ".jpeg"): + return "jpeg" + if ext in (".png", ".gif", ".webp"): + return ext[1:] + return "jpeg" + + +class BedrockAPI(BaseAPI): + """VLM API using AWS Bedrock Converse API (supports text + image).""" + + is_api: bool = True + + def __init__( + self, + model_id: str, + region_name: str = None, + retry: int = 10, + wait: int = 1, + system_prompt: str = None, + verbose: bool = True, + temperature: float = 0, + max_tokens: int = 2048, + img_size: int = -1, + **kwargs, + ): + if boto3 is None: + raise ImportError("boto3 is required for BedrockAPI. 
Install with: pip install boto3") + + self.model_id = BEDROCK_VISION_MODELS.get(model_id, model_id) + self.region_name = region_name or os.environ.get("AWS_REGION", "us-east-1") + self.temperature = temperature + self.max_tokens = max_tokens + self.img_size = img_size + + super().__init__( + retry=retry, + wait=wait, + system_prompt=system_prompt, + verbose=verbose, + **kwargs, + ) + + self._client = boto3.client( + service_name="bedrock-runtime", + region_name=self.region_name, + ) + logger.info( + f"BedrockAPI: model_id={self.model_id}, region={self.region_name}" + ) + + def _encode_image_to_bytes(self, image_path, target_size=-1): + from ..smp import encode_image_file_to_base64 + + suffix = osp.splitext(image_path)[-1].lower() + fmt = "JPEG" if suffix in (".jpg", ".jpeg") else "PNG" + b64 = encode_image_file_to_base64( + image_path, target_size=target_size, fmt=fmt + ) + return base64.b64decode(b64), _image_path_to_format(image_path) + + def _build_content_blocks(self, inputs): + """Build Converse API content list from VLMEvalKit message list.""" + blocks = [] + for item in inputs: + if item["type"] == "text" and item["value"]: + blocks.append({"text": item["value"]}) + elif item["type"] == "image": + path = item["value"] + raw_bytes, fmt = self._encode_image_to_bytes( + path, target_size=self.img_size + ) + blocks.append({ + "image": { + "format": fmt, + "source": {"bytes": raw_bytes}, + } + }) + return blocks + + def _prepare_messages(self, inputs): + """Convert VLMEvalKit message list to Converse API messages. + inputs: either [ {type, value}, ... ] (single turn) or + [ {role, content: [ {type, value}, ... ] }, ... ] (multi-turn). 
+ """ + if inputs and "role" in inputs[0]: + # Multi-turn chat + messages = [] + for msg in inputs: + role = msg["role"] + content = self._build_content_blocks(msg["content"]) + if not content: + content = [{"text": ""}] + messages.append({"role": role, "content": content}) + return messages + # Single turn + content = self._build_content_blocks(inputs) + if not content: + content = [{"text": ""}] + return [{"role": "user", "content": content}] + + def generate_inner(self, inputs, **kwargs): + messages = self._prepare_messages(inputs) + temperature = kwargs.pop("temperature", self.temperature) + max_tokens = kwargs.pop("max_tokens", self.max_tokens) + + request = { + "modelId": self.model_id, + "messages": messages, + "inferenceConfig": { + "maxTokens": max_tokens, + "temperature": temperature, + }, + } + if self.system_prompt: + request["system"] = [{"text": self.system_prompt}] + + try: + response = self._client.converse(**request) + except (BotoCoreError, ClientError) as e: + err_msg = str(e) + if self.verbose: + logger.error(f"Bedrock API error: {err_msg}") + return -1, self.fail_msg, err_msg + + answer = self.fail_msg + try: + content_list = response.get("output", {}).get("message", {}).get("content", []) + texts = [ + block["text"] + for block in content_list + if isinstance(block, dict) and "text" in block + ] + answer = "".join(texts).strip() if texts else self.fail_msg + except Exception as e: + if self.verbose: + logger.error(f"Parse response: {e}") + + return 0, answer, response diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/bluelm_api.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/bluelm_api.py new file mode 100644 index 00000000..1cf8abb5 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/bluelm_api.py @@ -0,0 +1,240 @@ +import base64 +import json +import os +import re + +import numpy as np +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.smp import concat_images_vlmeval, get_logger + 
def split_think(text: str) -> str:
    """Return the content that follows the model's thinking section.

    - If a closing `</think>` tag is present, return the text between the
      first and the second closing tag (or the end of the string).
    - If only an opening `<think>` tag is present, the thinking section was
      never closed; return a fixed placeholder message instead.
    - Otherwise return `text` unchanged.
    """
    # The tag literals must be non-empty: `"" in text` is always true and
    # `text.split("")` raises ValueError.
    if "</think>" in text:
        return text.split("</think>")[1]
    if "<think>" in text:
        return 'Thinking mode too long to extract answer'
    return text


def remove_boxed(s: str):
    """Strip a leading `\\boxed{` and the trailing `}` from `s`.

    Returns:
        The inner text, or None when `s` is not a string of exactly that shape.
    """
    left = '\\boxed{'
    # Explicit checks instead of `assert`, which is removed under `python -O`.
    if not isinstance(s, str) or not s.startswith(left) or not s.endswith('}'):
        return None
    return s[len(left):-1]


def last_boxed_only_string(string: str):
    """Return the last `\\boxed...` (or, failing that, `\\fbox...`) group in `string`.

    Scanning starts at the last occurrence of the command and stops at the
    brace that balances the first opening brace after it.

    Returns:
        The substring from the command through its balancing `}`, or None when
        no command is found or its braces never balance.
    """
    start = string.rfind('\\boxed')
    if start < 0:
        start = string.rfind('\\fbox')
        if start < 0:
            return None

    depth = 0
    for pos in range(start, len(string)):
        ch = string[pos]
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return string[start:pos + 1]
    return None


def extract_boxed(pred_str: str, strip_double_curly_brace=False):
    """Extract the content of the last `\\boxed{...}` in `pred_str`.

    Args:
        pred_str: model output to search.
        strip_double_curly_brace: when True and the extracted content is itself
            wrapped in one pair of braces, remove that pair.

    Returns:
        The extracted content, or `pred_str` unchanged when nothing can be
        extracted (this includes `\\fbox` groups, which `remove_boxed` rejects).
    """
    boxed_str = last_boxed_only_string(pred_str)
    if boxed_str is None:
        return pred_str  # fall back to the original string
    answer = remove_boxed(boxed_str)
    if answer is None:
        return pred_str  # fall back to the original string
    if strip_double_curly_brace:
        match = re.match(r'^\{(.*)\}$', answer)
        if match:
            answer = match.group(1)
    return answer


def extract_boxed_answer(pred_str: str):
    """Return the last boxed answer in `pred_str`, or `pred_str` itself if there is none."""
    if pred_str.rfind('\\boxed') < 0 and pred_str.rfind('\\fbox') < 0:
        return pred_str
    return extract_boxed(pred_str, strip_double_curly_brace=True)
'rb') as f: + pic = base64.b64encode(f.read()).decode('utf-8') + pics.append(pic) + data = { + 'images': pics, 'text': text, 'key': key, 'temperature': temperature, + 'max_tokens': max_tokens, 'top_k': top_k, 'top_p': top_p, 'stream': stream + } + else: + data = { + 'text': text, 'key': key, 'temperature': temperature, + 'max_tokens': max_tokens, 'top_k': top_k, 'top_p': top_p, 'stream': stream + } + response = requests.post(url, json=data, headers={"Content-Type": "application/json"}, timeout=timeout) + if stream: + final_text = '' + for h in get_streaming_response(response): + final_text = h + else: + response_data = response.json() + final_text = response_data.get("result", "") + return final_text + + +class BlueLMWrapper(BaseAPI): + is_api: bool = True + + def __init__(self, + model: str = 'BlueLM-2.5-3B', + retry: int = 5, + verbose: bool = True, + temperature: float = 0.6, + system_prompt: str = None, + max_tokens: int = 32768, + top_k: int = 20, + top_p: float = 0.95, + timeout: int = 60, + key: str = None, + url: str = 'http://api-ai.vivo.com.cn/multimodal', + **kwargs): + + self.model = model + self.fail_msg = 'Failed to obtain answer BlueLM API. 
' + self.max_tokens = max_tokens + self.temperature = temperature + self.top_k = top_k + self.top_p = top_p + self.url = url + self.key = key + self.timeout = timeout + + if self.key is None: + self.key = os.environ.get('BLUELM_API_KEY', None) + assert self.key is not None, ( + 'Please set the API Key (obtain it here: ' + 'contact by email : shuai.ren@vivo.com' + ) + + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def message_to_promptimg(self, message, dataset=None): + + num_images = len([x for x in message if x['type'] == 'image']) + if num_images == 0: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = None + elif num_images == 1: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = [x['value'] for x in message if x['type'] == 'image'] + else: + prompt = '\n'.join([x['value'] if x['type'] == 'text' else '' for x in message]) + if dataset == 'BLINK': + image = concat_images_vlmeval( + [x['value'] for x in message if x['type'] == 'image'], + target_size=512) + else: + image = [x['value'] for x in message if x['type'] == 'image'] + + if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', + 'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL', 'MMStar']: + prompt = prompt.replace('Please select the correct answer from the options above.', + 'Answer with the option’s letter from the given choices directly.') + prompt = prompt.replace('Question: Hint: Please answer the question and provide the correct option letter, e.g., A, B, C, D, at the end.\n', '') # noqa: E501 + elif dataset in ['ChartQA_TEST']: + prompt = prompt.replace('Answer the question using a single word or phrase.', + 'Answer the question using a single number or phrase.') + elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]: + prompt = prompt.replace('Answer the question using a single word or phrase.', + 'Give the short answer directly.') + elif 
dataset in ['TextVQA_VAL']: + prompt = prompt.replace('Answer the question using a single word or phrase.', + 'When the provided information is insufficient, respond with ’Unanswerable’.' + 'Answer the question using a single word or phrase.') + elif dataset in ['MTVQA_TEST']: + prompt = prompt.replace( + '\nAnswer the question using a word or phrase in the language of the question.', '') + elif dataset in ['MathVista_MINI']: + if 'Choices:' in prompt: + prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:') + for i in range(1, 7): # replace A ~ F + prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.') + prompt += '\nAnswer with the option’s letter from the given choices directly.' + else: + prompt += '\nAnswer the question using a single word or phrase.' + elif dataset in ['HallusionBench']: + prompt = prompt + " Please answer yes or no." + return prompt, image + + def generate_inner(self, inputs, **kwargs) -> str: + + assert isinstance(inputs, str) or isinstance(inputs, list) + pure_text = np.all([x['type'] == 'text' for x in inputs]) + assert not pure_text + + prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset']) + + try: + response = multimodal( + images=image_path, text=prompt, url=self.url, key=self.key, temperature=self.temperature, + max_tokens=self.max_tokens, top_k=self.top_k, top_p=self.top_p, timeout=self.timeout) + if kwargs['dataset'] in [ + 'MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', + 'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL', 'MMStar', + 'OCRBench', 'MMVet', 'MathVista_MINI', 'HallusionBench' + ]: + + answer = split_think(response[0]) + answer = extract_boxed_answer(answer) + else: + answer = split_think(response[0]) + logger.info(f'answer : {answer}') + return 0, answer, 'Succeeded! 
' + except Exception as err: + if self.verbose: + logger.error(f'{type(err)}: {err}') + logger.error(f'The input messages are {inputs}.') + return -1, '', '' + + +class BlueLM_API(BlueLMWrapper): + + def generate(self, message, dataset=None): + return super(BlueLM_API, self).generate(message, dataset=dataset) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/claude.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/claude.py new file mode 100644 index 00000000..c86302cb --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/claude.py @@ -0,0 +1,154 @@ +import json +import mimetypes +import os +import os.path as osp + +import numpy as np +import requests +from PIL import Image + +from vlmeval.api.base import BaseAPI +from vlmeval.smp import encode_image_to_base64, get_logger + +logger = get_logger(__name__) + +alles_url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat' +alles_headers = { + 'alles-apin-token': '', + 'Content-Type': 'application/json' +} +official_url = 'https://api.anthropic.com/v1/messages' +official_headers = { + 'x-api-key': '', + 'anthropic-version': '2023-06-01', + 'content-type': 'application/json' +} + + +class Claude_Wrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + backend: str = 'alles', + model: str = 'claude-3-opus-20240229', + key: str = None, + retry: int = 10, + timeout: int = 60, + system_prompt: str = None, + verbose: bool = True, + temperature: float = 0, + max_tokens: int = 2048, + **kwargs): + + if os.environ.get('ANTHROPIC_BACKEND', '') == 'official': + backend = 'official' + + assert backend in ['alles', 'official'], f'Invalid backend: {backend}' + self.backend = backend + self.url = alles_url if backend == 'alles' else official_url + self.model = model + self.temperature = temperature + self.max_tokens = max_tokens + self.headers = alles_headers if backend == 'alles' else official_headers + self.timeout = timeout + + if key is not None: + self.key = key + 
def encode_image_file_to_base64(self, image_path, target_size=-1, fmt='.jpg'):
    """Open the image at `image_path` and return it base64-encoded.

    Args:
        image_path: path of the image file to open.
        target_size: forwarded unchanged to `encode_image_to_base64`.
        fmt: file suffix including the dot. '.jpg'/'.jpeg' select 'JPEG' and
            '.png' selects 'PNG'. Any other suffix prints a warning and is
            encoded as 'JPEG'.

    Returns:
        The value returned by `encode_image_to_base64`.
    """
    # The file handle is released once encoding has finished.
    with Image.open(image_path) as image:
        if fmt in ('.jpg', '.jpeg'):
            pil_format = 'JPEG'
        elif fmt == '.png':
            pil_format = 'PNG'
        else:
            print(f'Unsupported image format: {fmt}, will cause media type match error.')
            # Previously no format was bound on this path and the call below
            # failed with UnboundLocalError right after the warning.
            # NOTE(review): JPEG is an assumed fallback; confirm the intended
            # handling for suffixes such as .gif / .webp.
            pil_format = 'JPEG'

        return encode_image_to_base64(image, target_size=target_size, fmt=pil_format)
def generate_inner(self, inputs, **kwargs) -> tuple:
    """Send one request to the configured Claude endpoint.

    Args:
        inputs: messages accepted by `self.prepare_inputs`.
        **kwargs: extra payload fields; they override the defaults built here,
            including 'temperature'.

    Returns:
        tuple: `(ret_code, answer, response)`. `ret_code` is 0 for any 2xx HTTP
            status and the raw status code otherwise. `answer` is the stripped
            text of the first content item, or `self.fail_msg` when the body
            cannot be parsed. `response` is the `requests` response object.
    """
    payload = {
        'model': self.model,
        'max_tokens': self.max_tokens,
        # The constructor stores a temperature; it was never sent before, so
        # the service default applied. Listed ahead of **kwargs so a per-call
        # value still wins.
        'temperature': self.temperature,
        'messages': self.prepare_inputs(inputs),
        **kwargs
    }
    if self.system_prompt is not None:
        payload['system'] = self.system_prompt

    response = requests.request(
        'POST', self.url, headers=self.headers, data=json.dumps(payload), timeout=self.timeout * 1.1
    )
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
    answer = self.fail_msg

    try:
        resp_struct = json.loads(response.text)
        # The 'alles' gateway nests the message under a 'data' key.
        if self.backend == 'alles':
            answer = resp_struct['data']['content'][0]['text'].strip()
        elif self.backend == 'official':
            answer = resp_struct['content'][0]['text'].strip()
    except Exception as err:
        if self.verbose:
            logger.error(f'{type(err)}: {err}')
            logger.error(response.text if hasattr(response, 'text') else response)

    return ret_code, answer, response
def prepare_inputs(self, inputs):
    """Convert a list of {type, value} items into chat-style messages.

    Args:
        inputs: list of dicts with keys 'type' and 'value'.

    Returns:
        list(dict): an optional system message (when `self.system_prompt` is
            not None) followed by one user message. With at least one image
            item, the user content is a list of text / image_url parts;
            otherwise every item must be text and the content is the values
            joined by newlines.
    """
    messages = []
    if self.system_prompt is not None:
        messages.append(dict(role='system', content=self.system_prompt))

    contains_image = any(part['type'] == 'image' for part in inputs)
    if not contains_image:
        assert all([x['type'] == 'text' for x in inputs])
        joined = '\n'.join(part['value'] for part in inputs)
        messages.append(dict(role='user', content=joined))
        return messages

    parts = []
    for part in inputs:
        kind = part['type']
        if kind == 'text':
            parts.append(dict(type='text', text=part['value']))
        elif kind == 'image':
            from PIL import Image
            picture = Image.open(part['value'])
            encoded = encode_image_to_base64(picture)
            image_url = dict(url=f"data:image/jpeg;base64,{encoded}", detail=self.img_detail)
            parts.append(dict(type='image_url', image_url=image_url))
    messages.append(dict(role='user', content=parts))
    return messages
self.temperature) + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + + if 0 < max_tokens <= 100: + logger.warning( + 'Less than 100 tokens left, ' + 'may exceed the context window with some additional meta symbols. ' + ) + if max_tokens <= 0: + return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. ' + + headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'} + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + **kwargs) + response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + except Exception as err: + if self.verbose: + logger.error(f'{type(err)}: {err}') + logger.error(response.text if hasattr(response, 'text') else response) + + return ret_code, answer, response diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/cosmos_reason.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/cosmos_reason.py new file mode 100644 index 00000000..69f8c70f --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/cosmos_reason.py @@ -0,0 +1,774 @@ +import copy +import os +import random +import re +import string +from typing import Literal + +import requests + +import numpy as np +from qwen_vl_utils import process_vision_info + +from vlmeval.dataset import img_root_map +from ..dataset import DATASET_MODALITY, DATASET_TYPE +from ..smp import * +from ..utils.video_cache import get_video_cache +from .base import BaseAPI + +logger = get_logger(__name__) + + +def build_multi_choice_prompt(line, dataset=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not 
def tensor_video_to_base64(
    video_tensor: torch.Tensor,
    fps: int,
    crf: int = 0,
    base64_decode_code: str = "utf-8"
) -> str:
    """Encode a (T, C, H, W) RGB tensor as H.264/MP4 and return the bytes as base64 text.

    Frames are piped to an ``ffmpeg`` subprocess as raw ``rgb24`` and the
    fragmented MP4 is read back from its stdout. The result is the bare
    base64 string; the caller adds any ``data:video/mp4;base64,`` prefix.

    Args:
        video_tensor: RGB frames, shape (T, C, H, W) with C == 3. ``uint8`` is
            used as-is. Floating tensors whose maximum is <= 1.0 are scaled by
            255. Everything non-``uint8`` is clamped to [0, 255] and cast.
        fps: Frame rate to write; it cannot be inferred from the tensor.
        crf: x264 quality, lower is better. The default 0 is deliberate: the
            original author's note reports that encoding artifacts changed
            downstream answers (tailgating F1 0.70 at crf=14 vs 0.74 at crf=0).
        base64_decode_code: Codec used to turn the base64 bytes into ``str``.

    Returns:
        Base64 text of the encoded MP4.

    Raises:
        AssertionError: If the tensor is not 4-D or does not have 3 channels.
        RuntimeError: If ffmpeg times out (120 s), cannot be fed, or exits non-zero.
    """
    assert video_tensor.ndim == 4, "Expected (T,C,H,W)"
    _, C, H, W = video_tensor.shape
    assert C == 3, "Expected 3 channels (RGB)"

    # Ensure uint8 RGB [0,255]
    if video_tensor.dtype != torch.uint8:
        # If it's float, assume [0,1] or [0,255]; clamp & scale safely
        v = video_tensor
        if torch.is_floating_point(v) and v.max() <= 1.0:
            v = (v * 255.0)
        video_u8 = v.clamp(0, 255).to(torch.uint8)
    else:
        video_u8 = video_tensor

    # (T,C,H,W) -> (T,H,W,C), contiguous
    video_thwc = video_u8.permute(0, 2, 3, 1).contiguous()

    # ffmpeg command: read raw rgb24, encode H.264, write MP4 to stdout
    # Use fragmented MP4 so MP4 can be streamed to stdout (no moov at end)
    cmd = [
        "ffmpeg",
        "-threads", "1",
        "-loglevel", "error",
        "-f", "rawvideo",
        "-pix_fmt", "rgb24",
        "-s", f"{W}x{H}",
        "-r", str(fps),
        "-i", "-",  # stdin
        "-an",
        "-vcodec", "libx264",
        "-preset", "fast",
        "-crf", str(crf),
        "-pix_fmt", "yuv420p",  # broad compatibility
        "-movflags", "frag_keyframe+empty_moov",
        "-f", "mp4",
        "-"  # stdout
    ]
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, bufsize=10**6)
    try:
        # detach()/cpu() are no-ops for plain CPU tensors; they let tensors
        # that track gradients or live on an accelerator be encoded too.
        out, err = proc.communicate(input=video_thwc.detach().cpu().numpy().tobytes(), timeout=120)
    except subprocess.TimeoutExpired:
        proc.kill()
        proc.communicate()
        raise RuntimeError(f"ffmpeg timed out after 120s (shape={video_tensor.shape}, fps={fps})")
    except Exception as e:
        # Do not leave the encoder running when feeding it failed.
        proc.kill()
        proc.wait()
        raise RuntimeError(f"ffmpeg communication failed: {e}") from e

    if proc.returncode != 0:
        err_msg = err.decode('utf-8', errors='ignore') if err else "No error output"
        raise RuntimeError(f"ffmpeg failed (returncode={proc.returncode}): {err_msg}\nVideo shape: {video_tensor.shape}, fps={fps}")

    video_base64 = base64.b64encode(out).decode(base64_decode_code)
    return video_base64
def _process_video_with_timeout(
    video_path,
    image_patch_size,
    use_cache,
    kwargs,
    timeout=300,
):
    """Call process_video_info_to_video_url on a daemon thread, giving up after ``timeout`` seconds.

    A thread is used because, per the original note, decord.VideoReader can
    stall indefinitely on non-faststart MP4s over NFS and SIGALRM cannot
    interrupt blocking I/O inside a C extension.

    Returns whatever the wrapped call returns. Re-raises the exception the
    wrapped call raised, or RuntimeError when the deadline passes. On timeout
    the thread is abandoned, not stopped.
    """
    outcome = {}

    def _run():
        try:
            outcome['payload'] = process_video_info_to_video_url(
                video_path,
                image_patch_size=image_patch_size,
                use_cache=use_cache,
                **kwargs,
            )
        except Exception as exc:
            outcome['error'] = exc

    runner = threading.Thread(target=_run, daemon=True)
    runner.start()
    runner.join(timeout=timeout)

    if runner.is_alive():
        raise RuntimeError(
            f"Video processing timed out after {timeout}s for {video_path}"
        )
    failure = outcome.get('error')
    if failure is not None:
        raise failure
    return outcome.get('payload')
def __init__(self,
             model: str = 'qwen3_30b_a3b',
             retry: int = 5,
             wait: int = 5,
             key: str = None,
             verbose: bool = False,
             system_prompt: str = None,
             temperature: float = 0,
             top_p: float = None,
             top_k: int = None,
             repetition_penalty: float = None,
             presence_penalty: float = None,
             timeout: int = 60,
             api_base: str = None,
             max_tokens: int = 2048,
             seed: int = 1,
             img_size: int = -1,
             img_detail: str = 'high',
             use_video_cache: bool = True,
             **kwargs):
    """Store sampling and video defaults, then probe the endpoint.

    Args:
        model: Declared model name; may be replaced by the name the endpoint
            reports (see ``_resolve_model_name``).
        retry, wait, system_prompt, verbose: Forwarded to ``BaseAPI``.
        key: API key, used when ``COSMOS_API_KEY`` is unset or empty.
        temperature, top_p, top_k, repetition_penalty, presence_penalty,
            max_tokens, seed: Per-request sampling defaults.
        timeout: Request timeout in seconds.
        api_base: Chat-completions URL of the endpoint.
        img_size: Target image size, or -1 to keep the original.
        img_detail: ``'high'`` or ``'low'``.
        use_video_cache: Whether processed videos may be served from cache.
        **kwargs: Video options (``nframes``, ``fps``, ``total_pixels``,
            ``max_pixels``, ``min_pixels``, ``max_frames``), ``use_nim``,
            ``chat_template_kwargs``, ``response_format``, ``extra_body`` and
            ``image_patch_size`` are consumed here; the rest goes to ``BaseAPI``.

    Raises:
        AssertionError: If ``img_size`` or ``img_detail`` is invalid.
    """
    self.model = model
    self.cur_idx = 0
    self.fail_msg = 'Failed to obtain answer via API. '
    self.max_tokens = max_tokens
    self.temperature = temperature
    self.top_p = top_p
    self.top_k = top_k
    self.repetition_penalty = repetition_penalty
    self.presence_penalty = presence_penalty
    self.seed = seed

    # The environment variable wins; the ``key`` argument is the fallback
    # (it used to be discarded unconditionally).
    self.key = os.environ.get('COSMOS_API_KEY') or key or ''

    assert img_size > 0 or img_size == -1
    self.img_size = img_size
    assert img_detail in ['high', 'low']
    self.img_detail = img_detail
    self.timeout = timeout
    self.use_video_cache = use_video_cache

    # args for video processing
    self.nframes = kwargs.pop('nframes', None)
    self.fps = kwargs.pop('fps', None)
    self.total_pixels = kwargs.pop('total_pixels', None)
    self.max_pixels = kwargs.pop('max_pixels', None)
    self.min_pixels = kwargs.pop('min_pixels', None)
    self.max_frames = kwargs.pop('max_frames', None)
    self.use_nim = kwargs.pop('use_nim', False)
    self.chat_template_kwargs = kwargs.pop('chat_template_kwargs', None)
    self.response_format = kwargs.pop('response_format', None)
    # Generic payload passthrough for vLLM-specific fields (e.g. structured_outputs).
    self.extra_body = kwargs.pop('extra_body', None)
    # Left as None here; prepare_itlist asserts it is set before processing video.
    self.image_patch_size = kwargs.pop('image_patch_size', None)

    self.api_base = api_base

    # Track the last error type for inference stats reporting
    self._last_error_type = None

    super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
    # Never write the full credential to the log.
    masked_key = f'{self.key[:4]}***' if self.key else '<empty>'
    logger.info(f'Using API Base: {self.api_base}; API Key: {masked_key}')

    # Check if endpoint is accessible
    self._check_endpoint_health()

    # Resolve model name from endpoint (auto-detect if declared name doesn't match)
    self._resolve_model_name()
def _check_endpoint_health(self):
    """Probe the parent path of ``self.api_base`` once and log the outcome.

    Never raises: any HTTP response counts as reachable, and every failure is
    only logged so that evaluation can still start.
    """
    try:
        # The response itself is not inspected; only transport errors matter.
        requests.get(self.api_base.rsplit('/', 1)[0], timeout=5)
        logger.info(f'✓ Endpoint health check passed: {self.api_base}')
    except requests.exceptions.Timeout:
        logger.warning(f'⚠️ WARNING: Endpoint health check timed out: {self.api_base}')
        logger.warning('⚠️ The endpoint may be slow or down. Evaluation will proceed but may fail.')
    except requests.exceptions.ConnectionError:
        logger.error(f'⚠️ ERROR: Cannot connect to endpoint: {self.api_base}')
        logger.error('⚠️ Please verify the endpoint is running before starting evaluation.')
        logger.error('⚠️ Evaluation will proceed but all requests will likely fail.')
    except Exception as exc:
        logger.warning(f'⚠️ WARNING: Endpoint health check failed: {type(exc).__name__}: {str(exc)}')
+ return False + if listinstr([ + 'SparBench', 'EgoPlanBench2', 'CVBench', 'EmbSpatialBench', + 'RoboSpatialHome', 'RefSpatial', 'SATBench', 'Where2Place', + 'CosmosERQA', 'TemporalLocalization', 'AETCBench', + 'VANTAGE_SOT', 'MetropolisEventVerification', + 'MetropolisVQA', 'VANTAGE_VQA', + 'CountBenchQA', + 'AVSpecialOODReasoningBench', + ], dataset): + return False + if listinstr(['MathVision', 'MathVerse'], dataset): + # Math VQA benchmarks (TYPE='VQA'): use dataset's default prompt (raw question only). + # CosmosReason's "Answer using a single word or phrase" suppresses reasoning, + # which is critical for multi-step math computation. + return False + if listinstr(['IFBench'], dataset): + # IFBench (TYPE='OPENENDED'): use dataset's own build_prompt() (raw prompt). + # CosmosReason's OPENENDED fallthrough sets max_new_tokens=128, far too low + # for instruction-following tasks that require long responses. + return False + return True + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + tgt_path = self.dump_image(line, dataset) + + kwargs_default = dict(do_sample=False, max_new_tokens=128, top_p=None, num_beams=1) + self.kwargs = kwargs_default + + if dataset is not None and DATASET_TYPE(dataset) == 'Y/N': + question = line['question'] + if listinstr(['MME'], dataset): + prompt = question + ' Answer the question using a single word or phrase.' + elif listinstr(['HallusionBench', 'AMBER'], dataset): + prompt = question + ' Please answer yes or no.' 
+ else: + prompt = question + elif dataset is not None and (DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset).startswith('MCQ')): + prompt = build_multi_choice_prompt(line, dataset) + + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['MathVista'], dataset): + prompt = question + ' Answer the question with a step-by-step process if the problem is complex, otherwise answer directly. Finally give the final answer with "The answer is ..."' + elif listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase.' + elif listinstr(['Omni3D'], dataset): + prompt = question + elif listinstr(['Astro2D', 'Metropolis2D', 'Metropolis2DGrounding'], dataset): + prompt = question + else: + prompt = question + '\nAnswer the question using a single word or phrase.' + else: + prompt = line['question'] + # The base model expects image first instead of text first. 
+ message = [dict(type='image', value=s) for s in tgt_path] + message.extend([dict(type='text', value=prompt)]) + return message + + def parse_answer(self, answer: str) -> str: + return answer + + def _get_video_processing_kwargs(self, msg) -> dict: + process_video_kwargs = copy.deepcopy(msg) + process_video_kwargs.pop('type') + process_video_kwargs.pop('value') + video_processing_kwargs_keys = ['nframes', 'fps', 'total_pixels', 'max_pixels', 'min_pixels', 'max_frames'] # if the msg provided the kwargs, do not override anything + if any(video_processing_kwargs_key in process_video_kwargs for video_processing_kwargs_key in video_processing_kwargs_keys): + return process_video_kwargs + for video_processing_kwargs_key in video_processing_kwargs_keys: + if video_processing_kwargs_value := getattr(self, video_processing_kwargs_key, None): + process_video_kwargs[video_processing_kwargs_key] = video_processing_kwargs_value + + return process_video_kwargs + + def prepare_itlist(self, inputs): + video_kwargs = None + assert np.all([isinstance(x, dict) for x in inputs]) + has_multimodal = np.sum([ + x['type'] == 'image' or x['type'] == 'video' or x['type'] == 'video_base64' for x in inputs + ]) + if has_multimodal: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + elif msg['type'] == 'image': + b64 = encode_image_file_to_base64(msg['value'], target_size=self.img_size) + import mimetypes + mime_type = 'image/jpeg' if self.img_size > 0 else ( + mimetypes.guess_type(msg['value'])[0] or 'image/png') + img_struct = dict(url=f'data:{mime_type};base64,{b64}', detail=self.img_detail) + content_list.append(dict(type='image_url', image_url=img_struct)) + elif msg['type'] == 'video_base64': + # Pre-encoded video URL (avoid subprocess in multiprocessing) + content_list.append(dict( + type='video_url', + video_url=dict(url=msg['value']) + )) + elif msg['type'] == 'video': + assert self.image_patch_size is 
def generate_inner(self, inputs, **kwargs) -> tuple:
    """Send one chat-completions request and return ``(ret_code, answer, response)``.

    ``ret_code`` is 0 for any 2xx status, otherwise the HTTP status (408, 503
    and 500 are synthesised for timeout, connection and other request errors).
    ``answer`` is ``self.fail_msg`` unless the reply could be parsed. A reply
    with ``finish_reason == 'stop'`` goes through ``parse_answer``; any other
    finish reason returns the raw content and records ``'truncated'``.
    ``self._last_error_type`` is reset on entry and describes the last failure.

    Sampling options (``temperature``, ``top_p``, ``top_k``,
    ``repetition_penalty``, ``presence_penalty``, ``max_tokens``, ``seed``)
    may be overridden through ``kwargs``; remaining ``kwargs`` are sent in the
    payload.
    """
    self._last_error_type = None  # reset per-sample; prevents leaking across samples
    try:
        input_msgs, video_kwargs = self.prepare_inputs(inputs)
    except Exception as e:
        # Non-fail_msg sentinel skips BaseAPI retry (it retries only when fail_msg in answer).
        logger.warning(f'Video preparation failed (skipping): {type(e).__name__}: {e}')
        return 0, 'VIDEO_LOAD_FAILED', None
    kwargs.pop('dataset', None)
    temperature = kwargs.pop('temperature', self.temperature)
    top_p = kwargs.pop('top_p', self.top_p)
    top_k = kwargs.pop('top_k', self.top_k)
    repetition_penalty = kwargs.pop('repetition_penalty', self.repetition_penalty)
    presence_penalty = kwargs.pop('presence_penalty', self.presence_penalty)
    max_tokens = kwargs.pop('max_tokens', self.max_tokens)
    seed = kwargs.pop('seed', self.seed)
    # Will send request if use Azure, dk how to use openai client for it
    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
    payload = dict(
        model=self.model,
        messages=input_msgs,
        n=1,
        temperature=temperature,
        top_p=top_p,
        seed=seed,
        max_tokens=max_tokens,
        **kwargs,
    )
    if self.chat_template_kwargs is not None:
        payload['chat_template_kwargs'] = self.chat_template_kwargs
    if self.response_format is not None:
        payload['response_format'] = self.response_format
    if self.extra_body:
        payload.update(self.extra_body)

    if video_kwargs is not None:
        if self.use_nim:
            # TODO: NIM evaluation haven't been fully aligned yet.
            fps = video_kwargs['fps']
            # Video preprocessing normalises fps to a scalar, while older
            # values may still be lists; accept both instead of indexing blindly.
            scalar_fps = fps[0] if isinstance(fps, (list, tuple)) else fps
            payload['mm_processor_kwargs'] = {
                "fps": fps,
            }
            payload['media_io_kwargs'] = {  # for NIM deployments
                "fps": scalar_fps,
            }
        else:
            # Verified that this branch will bring expected number of tokens to the vllm server. (from server output)
            # with previously used container (vllm 0.10.0), this is correct
            payload.update({
                'mm_processor_kwargs': video_kwargs
            })

    THINKING_RETRY = 1
    for i in range(THINKING_RETRY):
        # Vary the seed per attempt; leave an unset seed alone instead of crashing.
        if payload.get('seed') is not None:
            payload['seed'] += i
        if self.use_nim:
            # TODO: NIM evaluation haven't been fully aligned yet.
            payload['nvext'] = {  # these go into nvext for NIMs
                "top_k": top_k,
                "repetition_penalty": repetition_penalty,
            }
        else:
            payload['top_k'] = top_k
            payload['repetition_penalty'] = repetition_penalty
        payload['presence_penalty'] = presence_penalty
        try:
            response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
        except requests.exceptions.Timeout:
            logger.error(f'⚠️ ENDPOINT TIMEOUT: {self.api_base}')
            logger.error(f'⚠️ The endpoint took longer than {self.timeout}s to respond. It may be down or overloaded.')
            self._last_error_type = "timeout"
            return 408, self.fail_msg, None
        except requests.exceptions.ConnectionError as e:
            logger.error(f'⚠️ ENDPOINT DOWN: {self.api_base}')
            logger.error('⚠️ Cannot connect to the endpoint. Please check if the service is running.')
            logger.error(f'⚠️ Error: {str(e)}')
            self._last_error_type = "http_error"
            return 503, self.fail_msg, None
        except Exception as e:
            logger.error(f'⚠️ REQUEST FAILED: {type(e).__name__}: {str(e)}')
            self._last_error_type = "http_error"
            return 500, self.fail_msg, None

        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

        # Check for common error status codes
        if ret_code == 401:
            logger.error(f'⚠️ AUTHENTICATION FAILED (401): Invalid API key for {self.api_base}')
        elif ret_code == 403:
            logger.error(f'⚠️ ACCESS FORBIDDEN (403): API key lacks permissions for {self.api_base}')
        elif ret_code == 404:
            logger.error(f'⚠️ ENDPOINT NOT FOUND (404): {self.api_base}')
        elif ret_code == 429:
            logger.error(f'⚠️ RATE LIMIT EXCEEDED (429): Too many requests to {self.api_base}')
        elif ret_code == 500:
            logger.error(f'⚠️ SERVER ERROR (500): The endpoint {self.api_base} encountered an internal error')
        elif ret_code == 503:
            logger.error(f'⚠️ SERVICE UNAVAILABLE (503): {self.api_base} is temporarily unavailable')
        elif ret_code != 0:
            logger.error(f'⚠️ HTTP ERROR ({ret_code}): Request to {self.api_base} failed')
            logger.error(f'⚠️ Response: {response.text[:1000] if hasattr(response, "text") else "N/A"}')

        if ret_code in (408, 504):
            self._last_error_type = "timeout"
        elif ret_code != 0:
            self._last_error_type = "http_error"

        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
            finish_reason = resp_struct['choices'][0]['finish_reason']
            if finish_reason == 'stop':
                if self.verbose:
                    # The last message is the user turn; messages[0] may be the
                    # system prompt, whose content is a plain string.
                    logger.info(f"prompt: {payload['messages'][-1]['content'][-1]}")
                    logger.info(f'API Response: {answer}')
                answer = self.parse_answer(answer)
                if self._last_error_type is not None and self._last_error_type != "truncated":
                    logger.info(f'✅ Recovered after {self._last_error_type}')
                self._last_error_type = None
                break
            else:
                self._last_error_type = "truncated"
                logger.info(f"⚠️ Answer truncated at max_tokens={payload['max_tokens']} (finish_reason={finish_reason}).")
        except Exception as err:
            # Best effort: keep whatever answer we have and let the caller decide.
            if self.verbose:
                logger.error(f'{type(err)}: {err}')
                logger.error(response.text if hasattr(response, 'text') else response)
    return ret_code, answer, response
+ if self._last_error_type == "truncated": + category = "truncated" + elif is_success: + category = "success" + else: + category = self._last_error_type or "other" + try: + with open(stats_file, "a") as f: + f.write(json.dumps({"outcome": category}) + "\n") + except Exception: + pass + return result + + +class CosmosReason1(CosmosReason): + + def __init__(self, model: str = 'cosmos_reason1_7b', **kwargs): + image_patch_size = 14 + super().__init__(model=model, image_patch_size=image_patch_size, **kwargs) + + def parse_answer(self, answer: str) -> str: + pattern = r"^([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>(?:\n|\n\n| |)([\s\S]*?)<\/answer>$" + match = re.search(pattern, answer, re.DOTALL) + if match: + return match.group(2).strip() + else: + return answer + +class CosmosReason1Think(CosmosReason1): + def prepare_inputs(self, inputs): + # NOTE: yilzhao: following cosmos reason1 thinking convention, adding the thinking prompt to the system prompt + input_msgs, video_kwargs = super().prepare_inputs(inputs) + reasoning_prompt = "\n".join([ + "", + "Make your response in the following format:", + "", + "your reasoning", + "", + "", + "your answer", + "", + ]) + for msg in input_msgs: + if msg['role'] == 'system': + if isinstance(msg['content'], list): + for content in msg['content']: + if content['type'] == 'text': + content['text'] += reasoning_prompt + elif isinstance(msg['content'], str): + msg['content'] += reasoning_prompt + return input_msgs, video_kwargs + +class CosmosReason2(CosmosReason): + + def __init__(self, model: str = 'qwen3_30b_a3b', **kwargs): + image_patch_size = 16 + super().__init__(model=model, image_patch_size=image_patch_size, **kwargs) + + def parse_answer(self, answer: str) -> str: + pattern = r"^([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>(?:\n|\n\n| |)([\s\S]*?)$" + match = re.search(pattern, answer, re.DOTALL) + if match: + return match.group(2).strip() # answer + else: + return answer + + +class CosmosReason2Think(CosmosReason): + def 
class Qwen3VLThink(CosmosReason2Think):
    """CosmosReason2Think variant with its own rule for splitting reasoning from the answer."""

    def parse_answer(self, answer: str) -> str:
        """Return the text after the closing think tag, stripped; otherwise ``answer`` unchanged.

        The pattern only matches when no think tag (opening or closing) occurs
        before the first closing tag, i.e. the reply starts inside the
        reasoning section without an opening tag of its own.
        """
        # NOTE (from the original author, yilzhao): this model family does not
        # send the leading token of the reasoning section, hence the override.
        closing_tag_split = r"^([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>(?:\n|\n\n| |)([\s\S]*?)$"
        found = re.search(closing_tag_split, answer, re.DOTALL)
        return found.group(2).strip() if found else answer
def __init__(self,
             model: str = '',
             retry: int = 5,
             verbose: bool = True,
             system_prompt: str = None,
             temperature: float = 0,
             timeout: int = 60,
             max_tokens: int = 4096,
             api_base: str = 'https://ark.cn-beijing.volces.com/api/v3',  # service-region address recommended by the platform
             **kwargs):
    """Read credentials, resolve the endpoint id and create the OpenAI client.

    Args:
        model: Model name. It selects the ``DOUBAO_VL_ENDPOINT_<MODEL>``
            environment variable and is the endpoint id when that is unset.
        retry, system_prompt, verbose: Forwarded to ``BaseAPI``.
        temperature: Default sampling temperature.
        timeout: Client timeout in seconds.
        max_tokens: Default completion budget.
        api_base: Base URL handed to the OpenAI client.
        **kwargs: Forwarded to ``BaseAPI``.

    Raises:
        AssertionError: If ``DOUBAO_VL_KEY`` is not set or ``api_base`` is None.
    """
    # Used below to derive the endpoint env-var name and the default endpoint.
    self.model = model
    self.cur_idx = 0
    self.fail_msg = 'Failed to obtain answer via API. '
    self.temperature = temperature
    self.max_tokens = max_tokens

    assert 'DOUBAO_VL_KEY' in os.environ, 'You may need to set the env variable DOUBAO_VL_KEY to use DOUBAO_VL.'
    self.key = os.environ['DOUBAO_VL_KEY']

    assert api_base is not None, 'Please set the variable API_BASE. '
    self.api_base = api_base
    self.timeout = timeout

    super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

    # Models that require an EP
    # e.g. 'Doubao-1.5-vision-pro' -> DOUBAO_VL_ENDPOINT_DOUBAO_1_5_VISION_PRO
    EP_KEY = 'DOUBAO_VL_ENDPOINT' + '_' + self.model.replace('.', '_').replace('-', '_').upper()
    endpoint = os.getenv(EP_KEY, None)

    if endpoint is not None:
        self.endpoint = endpoint
    else:
        logger.warning(
            f'Endpoint for model {model} is not set (can be set w. environment var {EP_KEY}). '
            f'By default, we will use the model name {model} as the EP if not set. '
        )
        self.endpoint = model

    self.client = OpenAI(
        api_key=self.key,
        base_url=self.api_base,
        timeout=self.timeout
    )

    # Never write the full credential to the log.
    masked_key = f'{self.key[:4]}***' if self.key else '<empty>'
    logger.info(f'Using API Base: {self.api_base}; End Point: {self.endpoint}; API Key: {masked_key}')
def use_custom_prompt(self, dataset_name):
    """Tell whether this wrapper builds its own prompt for ``dataset_name``.

    Returns:
        True only for ``'MathVerse_MINI_Vision_Only'``; False for anything else.
    """
    return bool(dataset_name == 'MathVerse_MINI_Vision_Only')
def prepare_itlist(self, inputs):
    """Turn a flat list of ``{'type', 'value'}`` items into chat content parts.

    When no item is an image, every item must be text and the values are
    joined with newlines into a single text part. Otherwise each text item
    becomes its own text part and each image is opened, base64-encoded and
    emitted as an ``image_url`` part with a JPEG data URL; items of any
    other type are ignored.

    Raises:
        AssertionError: If an item is not a dict, or if an image-free input
            contains a non-text item.
    """
    assert all([isinstance(entry, dict) for entry in inputs])
    contains_image = any([entry['type'] == 'image' for entry in inputs])

    if not contains_image:
        assert all([entry['type'] == 'text' for entry in inputs])
        joined = '\n'.join([entry['value'] for entry in inputs])
        return [dict(type='text', text=joined)]

    parts = []
    for entry in inputs:
        kind = entry['type']
        if kind == 'text':
            parts.append(dict(type='text', text=entry['value']))
        elif kind == 'image':
            from PIL import Image

            picture = Image.open(entry['value'])
            encoded = encode_image_to_base64(picture)
            data_url = dict(url=f'data:image/jpeg;base64,{encoded}')
            parts.append(dict(type='image_url', image_url=data_url))
    return parts
class DoubaoVL(DoubaoVLWrapper):
    """Doubao wrapper whose ``generate`` accepts a ``dataset`` argument."""

    def generate(self, message, dataset=None):
        """Run the inherited ``generate`` on ``message``.

        ``dataset`` is accepted for call-site compatibility but is not
        forwarded to the parent implementation.
        """
        return super().generate(message)
+ +Gemini: pip install google-cloud-aiplatform +Claude: pip install -U google-cloud-aiplatform "anthropic[vertex]" +""" +import mimetypes +import os +import os.path as osp + +import numpy as np + +from ..smp import encode_image_to_base64, get_logger +from .base import BaseAPI + +# Vertex AI (Gemini) +try: + import vertexai + from vertexai.generative_models import GenerationConfig, GenerativeModel + from vertexai.generative_models import Image as VertexImage + from vertexai.generative_models import Part +except ImportError: + vertexai = None + GenerativeModel = Part = GenerationConfig = VertexImage = None + +# Claude on Vertex +try: + from anthropic import AnthropicVertex +except ImportError: + AnthropicVertex = None + +logger = get_logger(__name__) + + +def _encode_image_file_to_base64(image_path, target_size=-1, fmt=".jpg"): + from PIL import Image + + image = Image.open(image_path) + if fmt in (".jpg", ".jpeg"): + pil_fmt = "JPEG" + elif fmt == ".png": + pil_fmt = "PNG" + else: + pil_fmt = "JPEG" + return encode_image_to_base64(image, target_size=target_size, fmt=pil_fmt) + + +def _is_claude_model(model: str) -> bool: + return model.strip().lower().startswith("claude-") + + +class GCPVertexAPI(BaseAPI): + """Single API for GCP Vertex AI: Gemini or Claude, chosen by model name (claude-* -> Claude).""" + + is_api: bool = True + + def __init__( + self, + model: str = "gemini-1.5-flash", + project_id: str = None, + location: str = None, + retry: int = 10, + wait: int = 1, + system_prompt: str = None, + verbose: bool = True, + temperature: float = 0, + max_tokens: int = 2048, + **kwargs, + ): + self.model = model + self._is_claude = _is_claude_model(model) + self.project_id = project_id or os.environ.get("GOOGLE_CLOUD_PROJECT") + self.location = location or os.environ.get( + "GOOGLE_CLOUD_LOCATION", + "global" if self._is_claude else "us-central1", + ) + self.temperature = temperature + self.max_tokens = max_tokens + + if not self.project_id: + raise ValueError( + 
"GCP project_id is required. Set GOOGLE_CLOUD_PROJECT or pass project_id=..." + ) + + super().__init__( + retry=retry, + wait=wait, + system_prompt=system_prompt, + verbose=verbose, + **kwargs, + ) + + if self._is_claude: + if AnthropicVertex is None: + raise ImportError( + "Claude on Vertex requires: pip install -U google-cloud-aiplatform \"anthropic[vertex]\"" + ) + self._client = AnthropicVertex( + project_id=self.project_id, + region=self.location, + ) + self._gemini_model = None + else: + if vertexai is None or GenerativeModel is None: + raise ImportError( + "GCP Vertex AI (Gemini) requires google-cloud-aiplatform. " + "Install with: pip install google-cloud-aiplatform" + ) + vertexai.init(project=self.project_id, location=self.location) + model_id = "gemini-1.0-pro-vision" if model == "gemini-1.0-pro" else model + self._gemini_model = GenerativeModel(model_id) + self._client = None + + logger.info( + f"GCPVertexAPI: model={self.model}, project={self.project_id}, " + f"location={self.location} ({'Claude' if self._is_claude else 'Gemini'})" + ) + + def _build_gemini_contents(self, inputs): + parts = [] + if self.system_prompt: + parts.append(self.system_prompt) + for inp in inputs: + if inp["type"] == "text" and inp["value"]: + parts.append(inp["value"]) + elif inp["type"] == "image": + parts.append(Part.from_image(VertexImage.load_from_file(inp["value"]))) + return parts + + def _generate_gemini(self, inputs, temperature, max_tokens): + contents = self._build_gemini_contents(inputs) + if not contents: + contents = [""] + resp = self._gemini_model.generate_content( + contents, + generation_config=GenerationConfig( + temperature=temperature, + max_output_tokens=max_tokens, + ), + ) + answer = resp.text.strip() if resp.text else self.fail_msg + return 0, answer, resp + + def _prepare_claude_content(self, inputs): + assert all(isinstance(x, dict) for x in inputs) + has_images = np.sum([x["type"] == "image" for x in inputs]) + if has_images: + content_list = [] + 
for item in inputs: + if item["type"] == "text" and item["value"]: + content_list.append({"type": "text", "text": item["value"]}) + elif item["type"] == "image": + pth = item["value"] + suffix = osp.splitext(pth)[-1].lower() + media_type = mimetypes.types_map.get(suffix) or "image/jpeg" + content_list.append({ + "type": "image", + "source": { + "type": "base64", + "media_type": media_type, + "data": _encode_image_file_to_base64( + pth, target_size=4096, fmt=suffix + ), + }, + }) + return content_list + text = "\n".join([x["value"] for x in inputs if x["type"] == "text"]) + return [{"type": "text", "text": text or ""}] + + def _prepare_claude_messages(self, inputs): + if inputs and "role" in inputs[0]: + return [ + {"role": item["role"], "content": self._prepare_claude_content(item["content"])} + for item in inputs + ] + return [{"role": "user", "content": self._prepare_claude_content(inputs)}] + + def _generate_claude(self, inputs, temperature, max_tokens): + messages = self._prepare_claude_messages(inputs) + create_kwargs = { + "model": self.model, + "max_tokens": max_tokens, + "messages": messages, + "temperature": temperature, + } + if self.system_prompt: + create_kwargs["system"] = self.system_prompt + resp = self._client.messages.create(**create_kwargs) + answer = ( + resp.content[0].text.strip() + if resp.content and getattr(resp.content[0], "text", None) + else self.fail_msg + ) + return 0, answer, resp + + def generate_inner(self, inputs, **kwargs): + temperature = kwargs.pop("temperature", self.temperature) + max_tokens = kwargs.pop("max_tokens", self.max_tokens) + + try: + if self._is_claude: + return self._generate_claude(inputs, temperature, max_tokens) + return self._generate_gemini(inputs, temperature, max_tokens) + except Exception as err: + if self.verbose: + logger.error(f"{type(err).__name__}: {err}") + return -1, self.fail_msg, str(err) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/gemini.py 
b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/gemini.py new file mode 100644 index 00000000..918cc679 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/gemini.py @@ -0,0 +1,213 @@ +import os +import time + +from PIL import Image + +from vlmeval.api.base import BaseAPI +from vlmeval.smp import get_logger, proxy_set + +headers = 'Content-Type: application/json' + + +logger = get_logger(__name__) + + +class GeminiWrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'gemini-1.0-pro', + retry: int = 5, + key: str = None, + verbose: bool = True, + temperature: float = 0.0, + system_prompt: str = None, + max_tokens: int = 2048, + proxy: str = None, + backend='genai', + project_id='vlmeval', + thinking_budget: int = None, # range from 0 to 24576 + # see https://ai.google.dev/gemini-api/docs/thinking + fps: int = 1, + media_resolution: str = None, + **kwargs): + + self.model = model + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.temperature = temperature + self.thinking_budget = thinking_budget + self.fps = fps + # for image, high and medium resolution is 258 tokens per image [default], low resolution is 66 tokens per image + # for video, not support high resolution, medium resolution is 258 tokens per image [default], low resolution is 66 tokens per image # noqa: E501 + self.media_resolution = media_resolution + if self.media_resolution: + assert self.media_resolution in ['low', 'medium', 'high'] + if key is None: + key = os.environ.get('GOOGLE_API_KEY', None) + # Try to load backend from environment variable + be = os.environ.get('GOOGLE_API_BACKEND', None) + if be is not None and be in ['genai', 'vertex']: + backend = be + + assert backend in ['genai', 'vertex'] + if backend == 'genai': + # We have not evaluated Gemini-1.5 w. 
def upload_video_genai(self, video_path, poll_interval=2, timeout=None):
    """Upload a video through the GenAI client and wait until it is usable.

    Args:
        video_path: Path of the video file handed to ``client.files.upload``.
        poll_interval: Seconds to sleep between two state checks.
        timeout: Optional upper bound, in seconds, on the total wait.
            ``None`` (the default) waits for as long as the file has not
            reached a final state.

    Returns:
        A ``types.Part`` that references the uploaded file, tagged with
        ``self.fps`` as video metadata.

    Raises:
        RuntimeError: If the service reports the file as ``FAILED``.
        TimeoutError: If ``timeout`` is set and expires before the file
            becomes ``ACTIVE``.
    """
    from google.genai import types

    uploaded = self.client.files.upload(file=video_path)

    video_part = types.Part.from_uri(
        file_uri=uploaded.uri,
        mime_type="video/mp4"
    )
    video_part.video_metadata = types.VideoMetadata(fps=self.fps)

    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        uploaded = self.client.files.get(name=uploaded.name)
        if uploaded.state == "ACTIVE":
            return video_part
        # A failed file never becomes ACTIVE; without this check the loop would spin forever.
        if uploaded.state == "FAILED":
            raise RuntimeError(f'Processing of uploaded video {video_path} failed: {uploaded}')
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f'Uploaded video {video_path} did not become ACTIVE within {timeout} seconds.'
            )
        time.sleep(poll_interval)
def generate_inner(self, inputs, **kwargs) -> tuple:
    """Send one request through the configured backend.

    Args:
        inputs: List of ``{'type', 'value'}`` items (asserted to be a list
            on the genai backend).
        **kwargs: On the genai backend these are merged into the
            generation config and override the instance defaults. The
            vertex backend does not read them.

    Returns:
        A ``(ret_code, answer, log)`` tuple: ``(0, text, 'Succeeded! ')``
        on success, otherwise ``(-1, '', message)`` where ``message``
        describes the failure.
    """
    if self.backend == 'genai':
        from google.genai import types
        assert isinstance(inputs, list)
        model = self.model
        messages, video_in_msg = self.build_msgs_genai(inputs)

        # Configure generation parameters
        config_args = {
            "temperature": self.temperature,
            "max_output_tokens": self.max_tokens
        }
        # set resolution for vision input
        if self.media_resolution:
            if video_in_msg:
                assert self.media_resolution != 'high', "For video input, only support medium and low resolution"
            config_args["media_resolution"] = self.media_resolution_dict[self.media_resolution]

        # If thinking_budget is specified, add thinking_config
        # By default, Gemini 2.5 Pro will automatically select
        # a thinking budget not exceeding 8192 if not specified.
        if self.thinking_budget is not None:
            config_args["thinking_config"] = types.ThinkingConfig(
                thinking_budget=self.thinking_budget
            )
        config_args.update(kwargs)

        try:
            resp = self.client.models.generate_content(
                model=model,
                contents=messages,
                config=types.GenerateContentConfig(**config_args)
            )

            # Check if response has text
            if hasattr(resp, 'text') and resp.text:
                return 0, resp.text, 'Succeeded! '

            # Response succeeded but no text - could be safety filter or model issue
            error_msg = 'Response received but no text. '
            if hasattr(resp, 'prompt_feedback'):
                error_msg += f'Prompt feedback: {resp.prompt_feedback}'
            if hasattr(resp, 'candidates') and resp.candidates:
                error_msg += f' Candidates: {resp.candidates}'
            if self.verbose:
                logger.error(error_msg)
                logger.error(f'Full response object: {resp}')
            return -1, '', error_msg

        except Exception as err:
            if self.verbose:
                logger.error(f'{type(err)}: {err}')
                logger.error(f'The input messages are {inputs}.')

            return -1, '', str(err)
    elif self.backend == 'vertex':
        import vertexai
        from vertexai.generative_models import GenerativeModel
        vertexai.init(project=self.project_id, location='us-central1')
        model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else self.model
        model = GenerativeModel(model_name=model_name)
        messages = self.build_msgs_vertex(inputs)
        # NOTE(review): this branch sends no generation config, so self.temperature and
        # self.max_tokens are not applied here - confirm whether that is intended.
        try:
            resp = model.generate_content(messages)
            answer = resp.text
            return 0, answer, 'Succeeded! '
        except Exception as err:
            if self.verbose:
                logger.error(f'{type(err)}: {err}')
                logger.error(f'The input messages are {inputs}.')

            # Report the failure reason, as the genai branch does, instead of an empty log.
            return -1, '', str(err)

    # Keep the 3-tuple contract even if the backend attribute holds an unexpected value.
    return -1, '', f'Unsupported backend: {self.backend}'
def build_msgs(self, msgs_raw, system_prompt=None, dataset=None):
    """Pack raw ``{'type', 'value'}`` items into a single user message.

    Text items become text parts; image items are base64-encoded from
    their file path and become ``image_url`` parts; other types are
    dropped. For the ``HallusionBench`` and ``POPE`` datasets a trailing
    "Please answer yes or no." text part is appended. ``system_prompt``
    is accepted but not used here.

    Returns:
        A one-element list holding the ``user`` message.
    """
    parts = []
    # Work on a deep copy, as the original implementation does, so the caller's list is never touched.
    for entry in cp.deepcopy(msgs_raw):
        kind = entry['type']
        if kind == 'image':
            encoded = encode_image_file_to_base64(entry['value'])
            parts.append({'type': 'image_url', 'image_url': {'url': encoded}})
        elif kind == 'text':
            parts.append({'type': 'text', 'text': entry['value']})

    if dataset in {'HallusionBench', 'POPE'}:
        parts.append({'type': 'text', 'text': 'Please answer yes or no.'})

    return [{'role': 'user', 'content': parts}]
def GPT_context_window(model):
    """Return the context-window size, in tokens, listed for ``model``.

    Names that are not in the table fall back to 128000.
    """
    sizes_to_models = (
        (8192, ('gpt-4', 'gpt-4-0613')),
        (128000, (
            'gpt-4-turbo-preview',
            'gpt-4-1106-preview',
            'gpt-4-0125-preview',
            'gpt-4-vision-preview',
            'gpt-4-turbo',
            'gpt-4-turbo-2024-04-09',
        )),
        (16385, ('gpt-3.5-turbo', 'gpt-3.5-turbo-0125', 'gpt-3.5-turbo-1106')),
        (4096, ('gpt-3.5-turbo-instruct',)),
    )
    window_by_model = {name: size for size, names in sizes_to_models for name in names}
    return window_by_model.get(model, 128000)
' + self.max_tokens = max_tokens + self.temperature = temperature + self.use_azure = use_azure + + if 'step' in model: + env_key = os.environ.get('STEPAI_API_KEY', '') + if key is None: + key = env_key + elif 'yi-vision' in model: + env_key = os.environ.get('YI_API_KEY', '') + if key is None: + key = env_key + elif 'internvl2-pro' in model: + env_key = os.environ.get('InternVL2_PRO_KEY', '') + if key is None: + key = env_key + elif 'abab' in model: + env_key = os.environ.get('MiniMax_API_KEY', '') + if key is None: + key = env_key + elif 'moonshot' in model: + env_key = os.environ.get('MOONSHOT_API_KEY', '') + if key is None: + key = env_key + elif 'grok' in model: + env_key = os.environ.get('XAI_API_KEY', '') + if key is None: + key = env_key + elif 'gemini' in model and 'preview' in model and not os.environ.get('OPENAI_API_BASE'): + # Direct Google Gemini API. Skip when the user has pinned + # OPENAI_API_BASE to a different OpenAI-compatible gateway + # (e.g. NVIDIA's inference gateway, which serves Gemini under + # gcp/google/* model strings via the standard OpenAI shape). + env_key = os.environ.get('GOOGLE_API_KEY', '') + if key is None: + key = env_key + api_base = "https://generativelanguage.googleapis.com/v1beta/openai/chat/completions" + elif 'ernie' in model: + env_key = os.environ.get('BAIDU_API_KEY', '') + if key is None: + key = env_key + api_base = 'https://qianfan.baidubce.com/v2/chat/completions' + self.baidu_appid = os.environ.get('BAIDU_APP_ID', None) + else: + if use_azure: + env_key = os.environ.get('AZURE_OPENAI_API_KEY', None) + assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. ' + + if key is None: + key = env_key + assert isinstance(key, str), ( + 'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. 
' + ) + else: + env_key = os.environ.get('OPENAI_API_KEY', '') + if key is None: + key = env_key + + self.key = key + assert img_size > 0 or img_size == -1 + self.img_size = img_size + assert total_img_size > 0 or total_img_size == -1 + self.total_img_size = total_img_size + assert img_detail in ['high', 'low'] + self.img_detail = img_detail + self.timeout = timeout + self.is_max_completion_tokens = ('o1' in model) or ('o3' in model) or ('o4' in model) or ('gpt-5' in model) + self.is_o_model = ('o1' in model) or ('o3' in model) or ('o4' in model) + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + if use_azure: + api_base_template = ( + '{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}' + ) + endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None) + assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. ' + deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None) + assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. ' + api_version = os.getenv('OPENAI_API_VERSION', None) + assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. ' + + self.api_base = api_base_template.format( + endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'), + deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'), + api_version=os.getenv('OPENAI_API_VERSION') + ) + else: + if api_base is None: + if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '': + logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ') + api_base = os.environ['OPENAI_API_BASE'] + else: + api_base = 'OFFICIAL' + + assert api_base is not None + + if api_base in APIBASES: + self.api_base = APIBASES[api_base] + elif api_base.startswith('http'): + self.api_base = api_base + else: + logger.error('Unknown API Base. 
def prepare_itlist(self, inputs):
    """Turn a flat list of ``{'type', 'value'}`` items into chat content parts.

    Without images or videos, every item must be text and the values are
    joined with newlines into a single text part. Otherwise each text
    item becomes a text part, each image is base64-encoded into an
    ``image_url`` part (carrying ``self.img_detail``), and each video file
    is base64-encoded into a data URL that is also sent under the
    ``image_url`` part type. Items of any other type are ignored.

    Image size limit: ``self.img_size`` (when positive) caps each image.
    ``self.total_img_size`` (when positive) is shared between the images
    of this message as ``total_img_size / sqrt(number_of_images)``. The
    smaller of the two limits is used; ``-1`` means no limit.

    Raises:
        AssertionError: If an item is not a dict, or if an input without
            images/videos contains a non-text item.
    """
    assert all([isinstance(x, dict) for x in inputs])
    kinds = [x['type'] for x in inputs]
    # Count images only; text and video items must not shrink the per-image budget.
    image_num = kinds.count('image')

    if image_num == 0 and 'video' not in kinds:
        assert all([kind == 'text' for kind in kinds])
        text = '\n'.join([x['value'] for x in inputs])
        return [dict(type='text', text=text)]

    # The limit is the same for every image of the message, so compute it once.
    target_size = -1
    if image_num:
        if self.img_size > 0:
            target_size = self.img_size
        if self.total_img_size > 0:
            per_image = max(1, int(self.total_img_size / math.sqrt(image_num)))
            target_size = per_image if target_size < 0 else min(target_size, per_image)

    content_list = []
    for msg in inputs:
        if msg['type'] == 'text':
            content_list.append(dict(type='text', text=msg['value']))
        elif msg['type'] == 'image':
            from PIL import Image

            # Close the file handle as soon as the image has been encoded.
            with Image.open(msg['value']) as img:
                b64 = encode_image_to_base64(img, target_size=target_size)
            img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
            content_list.append(dict(type='image_url', image_url=img_struct))
        elif msg['type'] == 'video':
            import base64
            import mimetypes

            with open(msg['value'], 'rb') as fh:
                b64 = base64.b64encode(fh.read()).decode('utf-8')
            mime = mimetypes.guess_type(msg['value'])[0] or 'video/mp4'
            vid_struct = dict(url=f'data:{mime};base64,{b64}')
            content_list.append(dict(type='image_url', image_url=vid_struct))
    return content_list
content=self.system_prompt)) + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs))) + return input_msgs + + def generate_inner(self, inputs, **kwargs) -> str: + input_msgs = self.prepare_inputs(inputs) + temperature = kwargs.pop('temperature', self.temperature) + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + + # Will send request if use Azure, dk how to use openai client for it + if self.use_azure: + headers = {'Content-Type': 'application/json', 'api-key': self.key} + elif 'internvl2-pro' in self.model: + headers = {'Content-Type': 'application/json', 'Authorization': self.key} + else: + headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'} + if hasattr(self, 'baidu_appid'): + headers['appid'] = self.baidu_appid + + payload = dict( + model=self.model, + messages=input_msgs, + n=1, + temperature=temperature, + **kwargs) + + if self.is_max_completion_tokens: + payload['max_completion_tokens'] = max_tokens + payload.pop('temperature') + else: + payload['max_tokens'] = max_tokens + + if 'gemini' in self.model: + payload.pop('max_tokens') + payload.pop('n') + payload['reasoning_effort'] = 'high' + + proxies = {} + if os.getenv('http_proxy'): + proxies['http'] = os.getenv('http_proxy') + if os.getenv('https_proxy'): + proxies['https'] = os.getenv('https_proxy') + proxies = proxies or None + + response = requests.post( + self.api_base, + headers=headers, + data=json.dumps(payload), + proxies=proxies, + timeout=self.timeout * 1.1, + ) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + 
def get_image_token_len(self, img_path, detail='low'):
    """Estimate the number of tokens an image contributes to a request.

    Args:
        img_path: Path of the image file; only read when ``detail`` is
            not ``'low'``.
        detail: Detail level. ``'low'`` always costs 85 tokens.

    Returns:
        85 for low detail. Otherwise the image is scaled down so that its
        longer side is at most 1024 pixels, and the result is
        ``85 + 170 * tiles`` where ``tiles`` is the number of 512-pixel
        tiles needed to cover the scaled image.
    """
    if detail == 'low':
        return 85

    # PIL reports size as (width, height); the handle is closed once the size is known.
    with Image.open(img_path) as im:
        width, height = im.size

    if width > 1024 or height > 1024:
        if width > height:
            height = int(height * 1024 / width)
            width = 1024
        else:
            width = int(width * 1024 / height)
            height = 1024

    tiles_high = math.ceil(height / 512)
    tiles_wide = math.ceil(width / 512)
    return 85 + 170 * tiles_high * tiles_wide
def get_gpu_num(model_name):
    """Guess how many GPUs a model needs from size hints in its name.

    The name is lower-cased and searched for substrings such as ``70b``
    or ``13b``. Larger GPU counts are checked first, and the first hit
    wins. Names without any known hint default to 8 GPUs.
    """
    lowered = model_name.lower()
    hints_by_gpu_count = (
        (8, ('65b', '70b')),
        (4, ('30b', '33b', '35b', '40b')),
        (2, ('13b', '14b', '20b', '8b')),
        (1, ('6b', '7b', 'moss')),
    )
    for gpu_count, hints in hints_by_gpu_count:
        if any(hint in lowered for hint in hints):
            return gpu_count
    return 8
' + 'Please read the above code and check if the logic works for you model path' + ) + raise NotImplementedError + + def __init__(self, + model_path, + system_prompt: str = None, + **kwargs): + + if 'vicuna' in model_path.lower() or 'llama' in model_path.lower(): + try: + from fastchat.model import get_conversation_template # noqa: F401 + except Exception as err: + logger.critical('Please install fastchat first to use vicuna. ') + raise err + + self.explicit_device = kwargs.pop('device', None) + if self.explicit_device is None: + # If CUDA_VISIBLE_DEVICES is not properly set + if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] == '0,1,2,3,4,5,6,7': + num_gpu = get_gpu_num(model_path) + gpu_offset = kwargs.pop('gpu_offset', 0) + cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset + num_gpu)]) + os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices + + from transformers import AutoModel, AutoModelForCausalLM, AutoTokenizer + + if model_path not in validated_llms: + logger.warning(f'{model_path} not in validated LLMs, may have inference troubles. 
') + + self.model_path = model_path + if listinstr(Auto_model, model_path): + LoadModel = AutoModel + else: + LoadModel = AutoModelForCausalLM + assert osp.exists(model_path) or len(model_path.split('/')) == 2 + + device = self.explicit_device if self.explicit_device else 'auto' + + precision = {} + if 'internlm-chat-7b' in model_path: + precision = {'torch_dtype': torch.float16} + elif 'internlm-chat-20b' in model_path: + precision = {'torch_dtype': torch.bfloat16} + + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + cuda_devices = os.environ.get('CUDA_VISIBLE_DEVICES', '0') + if ',' in cuda_devices: + device_ids = [int(x) for x in cuda_devices.split(',')] + _ = {i: i for i in range(len(device_ids))} + else: + _ = {'': 0} + + if 'llama' in self.model_path.lower(): + from lmdeploy import GenerationConfig, TurbomindEngineConfig, pipeline + print(f"Loading model {model_path} with {num_gpu} GPUs") + backend_config = TurbomindEngineConfig(tp=num_gpu) + self.gen_config = GenerationConfig(max_new_tokens=256) + model = pipeline(model_path, backend_config=backend_config) + else: + model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision) + model = model.eval() + + if device != 'cpu': + model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda') + try: + from transformers.generation import GenerationConfig + model.generation_config = GenerationConfig.from_pretrained( + model_path, trust_remote_code=True, device_map=device) + except Exception as err: + logger.warning(f'{type(err)}: {err}') + + self.context_length = self._get_context_length_robust(model=model, model_path=model_path) + + torch.cuda.empty_cache() + self.model = model + self.answer_buffer = 192 + self.system_prompt = system_prompt + for k, v in kwargs.items(): + logger.info(f'Following args will be used for generation (If not set specifically), {k}: {v}. 
') + self.kwargs = kwargs + + def generate_str(self, input, **kwargs): + if 'baichuan' in self.model_path.lower(): + messages = [] + messages.append({'role': 'user', 'content': input}) + resp = self.model.chat(self.tokenizer, messages, **kwargs) + elif 'vicuna' in self.model_path.lower(): + from fastchat.model import get_conversation_template + conv = get_conversation_template('vicuna') + conv.append_message(conv.roles[0], input) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + inputs = self.tokenizer([prompt], return_tensors='pt') + if torch.cuda.is_available(): + for k in inputs: + inputs[k] = inputs[k].cuda() + + params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512) + params.update(self.kwargs) + params.update(kwargs) + outputs = self.model.generate(**inputs, **params) + resp = self.tokenizer.decode( + outputs[0][len(inputs['input_ids'][0]):], + skip_special_tokens=True, + spaces_between_special_tokens=False) + elif 'llama' in self.model_path.lower(): + prompt = [{'role': 'system', 'content': self.system_prompt}, {'role': 'user', 'content': input}] + resp = self.model(prompt, gen_config=self.gen_config).text + else: + params = self.kwargs + params.update(kwargs) + resp, _ = self.model.chat(self.tokenizer, input, history=[], **params) + + return resp + + def length_ok(self, inputs): + tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0 + for s in inputs: + tot += len(self.tokenizer.encode(s)) + return tot + self.answer_buffer < self.context_length + + def generate_list(self, full_inputs, offset=0, **kwargs): + assert isinstance(full_inputs, list) + inputs = full_inputs[offset:] + if not self.length_ok(inputs): + return self.chat(full_inputs, offset + 1) + + model_path = self.model_path.lower() + + if sum([x in model_path for x in ['baichuan']]): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='user', 
content=self.system_prompt)) + if len(inputs): + assert isinstance(inputs, list) and isinstance(inputs[0], str) + roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user'] + roles = roles * len(inputs) + for role, msg in zip(roles, inputs): + input_msgs.append(dict(role=role, content=msg)) + response = self.model.chat(self.tokenizer, input_msgs) + elif sum([x in model_path for x in ['vicuna']]): + from fastchat.model import get_conversation_template + conv = get_conversation_template('vicuna') + assert isinstance(inputs, list) and isinstance(inputs[0], str) + if len(inputs) % 2 == 1: + if self.system_prompt is not None: + conv.append_message(conv.roles[0], self.system_prompt) + for i in range(len(inputs) // 2): + conv.append_message(conv.roles[0], inputs[2 * i]) + conv.append_message(conv.roles[1], inputs[2 * i + 1]) + else: + assert self.system_prompt is not None + conv.append_message(conv.roles[0], self.system_prompt) + conv.append_message(conv.roles[1], inputs[0]) + for i in range(len(inputs) // 2 - 1): + conv.append_message(conv.roles[0], inputs[2 * i + 1]) + conv.append_message(conv.roles[1], inputs[2 * i + 2]) + conv.append_message(conv.roles[0], inputs[-1]) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + inputs = self.tokenizer([prompt], return_tensors='pt') + if torch.cuda.is_available(): + for k in inputs: + inputs[k] = inputs[k].cuda() + + params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512) + params.update(self.kwargs) + params.update(kwargs) + + outputs = self.model.generate(**inputs, **params) + response = self.tokenizer.decode( + outputs[0][len(inputs['input_ids'][0]):], + skip_special_tokens=True, + spaces_between_special_tokens=False) + response = response.lstrip('\n') + else: + # The default option, support internlm, chatglm, qwen + history, msg = [], None + if len(inputs) % 2 == 1: + if self.system_prompt is not None: + history = [(self.system_prompt, '')] + for 
i in range(len(inputs) // 2): + history.append((inputs[2 * i], inputs[2 * i + 1])) + else: + assert self.system_prompt is not None + history = [(self.system_prompt, inputs[0])] + for i in range(len(inputs) // 2 - 1): + history.append((inputs[2 * i + 1], inputs[2 * i + 2])) + msg = inputs[-1] + + params = self.kwargs + params.update(kwargs) + response, _ = self.model.chat(self.tokenizer, msg, history=history, **params) + + return response, offset + + def generate(self, inputs, **kwargs): + if isinstance(inputs, str): + return self.generate_str(inputs, **kwargs) + elif isinstance(inputs, list): + return self.generate_list(inputs, **kwargs) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/hunyuan.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/hunyuan.py new file mode 100644 index 00000000..29a75a5d --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/hunyuan.py @@ -0,0 +1,184 @@ +import json +import os +import string +import warnings + +import numpy as np +import pandas as pd + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_TYPE +from vlmeval.smp import encode_image_to_base64, get_logger + +logger = get_logger(__name__) + + +class HunyuanWrapper(BaseAPI): + + is_api: bool = True + _apiVersion = '2024-12-31' + _service = 'hunyuan' + + def __init__(self, + model: str = 'hunyuan-standard-vision', + retry: int = 5, + secret_key: str = None, + secret_id: str = None, + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0, + timeout: int = 60, + api_base: str = 'hunyuan.tencentcloudapi.com', + **kwargs): + + self.model = model + self.cur_idx = 0 + self.fail_msg = 'Failed to obtain answer via API. ' + self.temperature = temperature + + warnings.warn('You may need to set the env variable HUNYUAN_SECRET_ID & HUNYUAN_SECRET_KEY to use Hunyuan. 
') + + secret_key = os.environ.get('HUNYUAN_SECRET_KEY', secret_key) + assert secret_key is not None, 'Please set the environment variable HUNYUAN_SECRET_KEY. ' + secret_id = os.environ.get('HUNYUAN_SECRET_ID', secret_id) + assert secret_id is not None, 'Please set the environment variable HUNYUAN_SECRET_ID. ' + + self.model = model + self.endpoint = api_base + self.secret_id = secret_id + self.secret_key = secret_key + self.timeout = timeout + + try: + from tencentcloud.common import credential + from tencentcloud.common.profile.client_profile import ClientProfile + from tencentcloud.common.profile.http_profile import HttpProfile + from tencentcloud.hunyuan.v20230901 import hunyuan_client + except ImportError as err: + logger.critical('Please install tencentcloud-sdk-python to use Hunyuan API. ') + raise err + + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + cred = credential.Credential(self.secret_id, self.secret_key) + httpProfile = HttpProfile(reqTimeout=300) + httpProfile.endpoint = self.endpoint + clientProfile = ClientProfile() + clientProfile.httpProfile = httpProfile + self.client = hunyuan_client.HunyuanClient(cred, '', clientProfile) + logger.info( + f'Using Endpoint: {self.endpoint}; API Secret ID: {self.secret_id}; API Secret Key: {self.secret_key}' + ) + + def use_custom_prompt(self, dataset_name): + if DATASET_TYPE(dataset_name) == 'MCQ': + return True + else: + return False + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. 
{item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + prompt += 'Answer with the option letter from the given choices directly.' + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + return msgs + + # inputs can be a lvl-2 nested list: [content1, content2, content3, ...] + # content can be a string or a list of image & text + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(Type='text', Text=msg['value'])) + elif msg['type'] == 'image': + from PIL import Image + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img) + img_struct = dict(Url=f'data:image/jpeg;base64,{b64}') + content_list.append(dict(Type='image_url', ImageUrl=img_struct)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = '\n'.join([x['value'] for x in inputs]) + content_list = [dict(Type='text', Text=text)] + return content_list + + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(Role='system', Content=self.system_prompt)) + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(Role=item['role'], Contents=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(Role='user', 
Contents=self.prepare_itlist(inputs))) + return input_msgs + + def generate_inner(self, inputs, **kwargs) -> str: + from tencentcloud.common.exception.tencent_cloud_sdk_exception import \ + TencentCloudSDKException + from tencentcloud.hunyuan.v20230901 import models + + input_msgs = self.prepare_inputs(inputs) + temperature = kwargs.pop('temperature', self.temperature) + + payload = dict( + Model=self.model, + Messages=input_msgs, + Temperature=temperature, + TopK=1, + **kwargs) + + try: + req = models.ChatCompletionsRequest() + req.from_json_string(json.dumps(payload)) + resp = self.client.ChatCompletions(req) + resp = json.loads(resp.to_json_string()) + answer = resp['Choices'][0]['Message']['Content'] + return 0, answer, resp + except TencentCloudSDKException as e: + logger.error(f'Got error code: {e.get_code()}') + if e.get_code() == 'ClientNetworkError': + return -1, self.fail_msg + e.get_code(), None + elif e.get_code() in ['InternalError', 'ServerNetworkError']: + return -1, self.fail_msg + e.get_code(), None + elif e.get_code() in ['LimitExceeded']: + return -1, self.fail_msg + e.get_code(), None + else: + return -1, self.fail_msg + str(e), None + + +class HunyuanVision(HunyuanWrapper): + + def generate(self, message, dataset=None): + return super(HunyuanVision, self).generate(message) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/jt_vl_chat.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/jt_vl_chat.py new file mode 100644 index 00000000..7efac9ac --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/jt_vl_chat.py @@ -0,0 +1,289 @@ +import base64 +import json +import os +import os.path as osp +import string + +import pandas as pd +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_TYPE, img_root_map +from vlmeval.smp import (LMUDataRoot, cn_string, concat_images_vlmeval, + decode_base64_to_image_file, get_logger, listinstr, read_ok, toliststr) + +logger = 
get_logger(__name__) + +API_ENDPOINT = "https://hl.jiutian.10086.cn/kunlun/ingress/api/hl-4a9c15/7b11a3451e1a4612a6661c3e22235df6/ai-4dfc1be4e6854a75a833aab9b956128c/service-e5893ba5c1154a1192cb8e60af11e276/v1/chat/completions" # noqa: E501 +APP_CODE = "B0m6Tuglt5shfY7t3GyoJn1V5yVAm0Ba" + + +class JTVLChatWrapper(BaseAPI): + is_api: bool = True + INTERLEAVE = False + + def __init__(self, + model: str = 'jt-vl-chat', + retry: int = 5, + wait: int = 5, + api_base: str = '', + app_code: str = '', + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0.7, + max_tokens: int = 2048, + proxy: str = None, + **kwargs): + self.model = model + + self.temperature = temperature + self.max_tokens = max_tokens + self.api_base = API_ENDPOINT + self.app_code = APP_CODE + + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def dump_image(self, line, dataset): + """Dump the image(s) of the input line to the corresponding dataset folder. + + Args: + line (line of pd.DataFrame): The raw input line. + dataset (str): The name of the dataset. + + Returns: + str | list[str]: The paths of the dumped images. 
+ """ + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + + img_root = os.path.join( + ROOT, 'images', + img_root_map(dataset) if dataset in img_root_map(dataset) else dataset) + os.makedirs(img_root, exist_ok=True) + if 'image' in line: + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path = [tgt_path] + else: + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + + return tgt_path + + def use_custom_prompt(self, dataset): + assert dataset is not None + if listinstr(['MMMU_DEV_VAL', 'MMMU_TEST'], dataset): + return False + else: + return True + + def build_multi_choice_prompt(self, line, dataset=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += '\n请直接回答选项字母。' if cn_string( + prompt) else "\nAnswer with the option's letter from the given choices directly." + else: + prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 
+ + return prompt + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + + tgt_path = self.dump_image(line, dataset) + + if dataset is not None and listinstr(['MME'], dataset): + question = line['question'] + prompt = question + ' Answer the question using a single word or phrase.' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question = line['question'] + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = self.build_multi_choice_prompt(line, dataset) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + if listinstr(['MathVista', 'MathVision'], dataset): + prompt = line['question'] + elif listinstr(['LLaVABench'], dataset): + question = line['question'] + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['MMVet'], dataset): + prompt = line['question'] + else: + question = line['question'] + prompt = question + '\nAnswer the question using a single word or phrase.' + else: + prompt = line['question'] + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message + + def message_to_promptimg(self, message, dataset=None): + assert not self.INTERLEAVE + model_name = self.__class__.__name__ + import warnings + + warnings.warn(f'Model {model_name} does not support interleaved input. ' + 'Will use the first image and aggregated texts as prompt. 
') + num_images = len([x for x in message if x['type'] == 'image']) + if num_images == 0: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = None + else: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + if dataset == 'BLINK': + image = concat_images_vlmeval( + [x['value'] for x in message if x['type'] == 'image'], target_size=512) + else: + image = [x['value'] for x in message if x['type'] == 'image'][0] + return prompt, image + + def get_send_data(self, + prompt, + image_path, + temperature, + max_tokens, + stream=False, + understanding_plus=False): + image = '' + with open(image_path, 'rb') as f: + image = str(base64.b64encode(f.read()), 'utf-8') + send_data = { + "messages": [{ + "role": "user", + "content": prompt + }], + "image_base64": image, + "max_tokens": max_tokens, + "temperature": temperature, + "do_sample": False, + "understanding_plus": understanding_plus, + "stream": stream + } + return send_data + + def get_send_data_no_image(self, + prompt, + temperature, + max_tokens, + stream=False, + understanding_plus=False): + send_data = { + "messages": [{ + "role": "user", + "content": prompt + }], + "max_tokens": max_tokens, + "temperature": temperature, + "understanding_plus": understanding_plus, + "stream": stream + } + return send_data + + def generate_inner(self, inputs, **kwargs) -> str: + assert isinstance(inputs, str) or isinstance(inputs, list) + inputs = [inputs] if isinstance(inputs, str) else inputs + dataset = kwargs.get('dataset', None) + prompt, image_path = self.message_to_promptimg(message=inputs, dataset=dataset) + # print("prompt:",prompt) + if image_path: + send_data = self.get_send_data( + prompt=prompt, + image_path=image_path, + temperature=self.temperature, + max_tokens=self.max_tokens, + understanding_plus=True, + stream=True) + else: + send_data = self.get_send_data_no_image( + prompt=prompt, + temperature=self.temperature, + max_tokens=self.max_tokens, + 
understanding_plus=True, + stream=True) + + json_data = json.dumps(send_data) + + header_dict = {'Content-Type': 'application/json', 'Authorization': self.app_code} + + r = requests.post( + self.api_base, headers=header_dict, data=json_data, timeout=3000, stream=True) + try: + if send_data.get('stream', False): + chunks = [] + full_content = "" + # last_valid_usage = None # 用于记录最后一个有效的usage + # 流式处理 - 直接迭代并打印结果 + try: + for line in r.iter_lines(): + if line: + decoded_line = line.decode('utf-8') + if decoded_line.startswith('data: '): + event_data = decoded_line[6:] + if event_data == '[DONE]': + break + try: + chunk = json.loads(event_data) + chunks.append(chunk) + + # 记录最后一个有效的usage(不累加) + if 'usage' in chunk: + last_valid_usage = chunk['usage'] # noqa: F841 + + if 'choices' in chunk: + for choice in chunk['choices']: + if 'delta' in choice and 'content' in choice['delta']: + content = choice['delta']['content'] + print(content, end='', flush=True) + full_content += content + except json.JSONDecodeError: + continue + print() # 打印换行 + return 0, full_content, 'Succeeded! ' + except Exception as e: + return -1, f'Error: {str(e)}', '' + + else: + # 非流式处理 + try: + r_json = r.json() + output = r_json['choices'][0]['message']['content'] + return 0, output, 'Succeeded! ' + except Exception: + error_msg = f'Error! 
code {r.status_code} content: {r.content}' + error_con = r.content.decode('utf-8') + if self.verbose: + logger.error(error_msg) + logger.error(error_con) + logger.error(f'The input messages are {inputs}.') + return -1, error_msg, '' + except Exception as e: + return -1, f'Error: {str(e)}', '' + + +class JTVLChatAPI(JTVLChatWrapper): + + def generate(self, message, dataset=None): + return super(JTVLChatAPI, self).generate(message, dataset=dataset) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/jt_vl_chat_mini.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/jt_vl_chat_mini.py new file mode 100644 index 00000000..6299dbb9 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/jt_vl_chat_mini.py @@ -0,0 +1,290 @@ +# flake8: noqa +import base64 +import json +import os +import os.path as osp +import string + +import pandas as pd +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_TYPE, img_root_map +from vlmeval.smp import (LMUDataRoot, cn_string, concat_images_vlmeval, + decode_base64_to_image_file, get_logger, listinstr, read_ok, toliststr) + +API_ENDPOINT = "https://hl.jiutian.10086.cn/kunlun/ingress/api/hl-4a9c15/7b11a3451e1a4612a6661c3e22235df6/ai-e7d64e71b61e421b8a41c0d43424d73a/service-cd5a067331e34cfea25f5a9d7960ffe8//v1/chat/completions" +API_ENDPOINT_2B = 'https://hl.jiutian.10086.cn/kunlun/ingress/api/hl-4a9c15/7b11a3451e1a4612a6661c3e22235df6/ai-3101961c75eb47ad8cc8d8ebb62fb8b0/service-c0bf9bac00824ace8639c0e8e0a4a5da/v1/chat/completions' +APP_CODE = 
"eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI1YjYyODY0ZjZmMWI0Yzg4YWE2ZDk1NzBhNDY1MWI3OSIsImlzcyI6ImFwaS1hdXRoLWtleSIsImV4cCI6NDg5ODU1NzAwN30.jLUbctPdJ74VC3Vwlr0nB4x9N2QxWtSGYE0vWsceZN-agDecVnH8pH8Q5SoCQ3-SBYx5jDx-UOg3kkoMqY9CdxiALauKU_UZ56CV2NKCcHUVeJIgNvfQJMb0z6yCCbSe80e1T8FxrxQXDvubyWtl4pTAhixYaEUqNG8rjUrDuA-vRgZ1e7HilBmU487OI76D9LUnU-zEdMWhzsCkh_Yy3M1Ur4PsKgMFi5QSmMuGSUGJjkpJHiGNx1QcevBLQSOCL2jvg15ifB2n2dD6zb8iPXFkfQTtmvbZofxWACSvkri-x9V3gFWg7DODwKUZsyyogPzRJVbmxDGruMsgiiCsPg" + + +logger = get_logger(__name__) + +class JTVLChatWrapper(BaseAPI): + is_api: bool = True + INTERLEAVE = False + + def __init__(self, + model: str = 'jt-vl-chat-mini', + retry: int = 5, + wait: int = 5, + api_base: str = '', + app_code: str = '', + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0.7, + max_tokens: int = 2048, + proxy: str = None, + **kwargs): + self.model = model + + self.temperature = temperature + self.max_tokens = max_tokens + if model == 'jt-vl-chat-mini': + self.api_base = API_ENDPOINT + else: + self.api_base = API_ENDPOINT_2B + self.app_code = APP_CODE + + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def dump_image(self, line, dataset): + """Dump the image(s) of the input line to the corresponding dataset folder. + + Args: + line (line of pd.DataFrame): The raw input line. + dataset (str): The name of the dataset. + + Returns: + str | list[str]: The paths of the dumped images. 
+ """ + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + + img_root = os.path.join(ROOT, 'images', img_root_map(dataset) if dataset in img_root_map(dataset) else dataset) + os.makedirs(img_root, exist_ok=True) + if 'image' in line: + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path = [tgt_path] + else: + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + + return tgt_path + + def use_custom_prompt(self, dataset): + assert dataset is not None + if listinstr(['MMMU_DEV_VAL', 'MMMU_TEST'], dataset): + return False + else: + return True + + def build_multi_choice_prompt(self, line, dataset=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += '\n请直接回答选项字母。' if cn_string( + prompt) else "\nAnswer with the option's letter from the given choices directly." + else: + prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 
+ + return prompt + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + + tgt_path = self.dump_image(line, dataset) + + if dataset is not None and listinstr(['MME'], dataset): + question = line['question'] + prompt = question + ' Answer the question using a single word or phrase.' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question = line['question'] + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = self.build_multi_choice_prompt(line, dataset) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + if listinstr(['MathVista', 'MathVision'], dataset): + prompt = line['question'] + elif listinstr(['LLaVABench'], dataset): + question = line['question'] + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['MMVet'], dataset): + prompt = line['question'] + else: + question = line['question'] + prompt = question + '\nAnswer the question using a single word or phrase.' + else: + prompt = line['question'] + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message + + def message_to_promptimg(self, message, dataset=None): + assert not self.INTERLEAVE + model_name = self.__class__.__name__ + import warnings + warnings.warn( + f'Model {model_name} does not support interleaved input. ' + 'Will use the first image and aggregated texts as prompt. 
') + num_images = len([x for x in message if x['type'] == 'image']) + if num_images == 0: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = None + else: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + if dataset == 'BLINK': + image = concat_images_vlmeval( + [x['value'] for x in message if x['type'] == 'image'], + target_size=512) + else: + image = [x['value'] for x in message if x['type'] == 'image'][0] + return prompt, image + + def get_send_data(self, prompt, image_path, temperature, max_tokens, stream=False): + image = '' + with open(image_path, 'rb') as f: + image = str(base64.b64encode(f.read()), 'utf-8') + send_data = { + "messages": [ + { + "role": "user", + "content": prompt + } + ], + "image_base64": image, + "max_tokens": max_tokens, + "temperature": temperature, + "do_sample": False, + "stream": stream + } + return send_data + + def get_send_data_no_image(self, prompt, temperature, max_tokens, stream=False): + send_data = { + "messages": [ + { + "role": "user", + "content": prompt + } + ], + "max_tokens": max_tokens, + "temperature": temperature, + "stream": stream, + } + return send_data + + def generate_inner(self, inputs, **kwargs) -> str: + assert isinstance(inputs, str) or isinstance(inputs, list) + inputs = [inputs] if isinstance(inputs, str) else inputs + dataset = kwargs.get('dataset', None) + prompt, image_path = self.message_to_promptimg(message=inputs, dataset=dataset) + # print("prompt:",prompt) + if image_path: + send_data = self.get_send_data( + prompt=prompt, + image_path=image_path, + temperature=self.temperature, + max_tokens=self.max_tokens, + stream=True) + else: + send_data = self.get_send_data_no_image( + prompt=prompt, + temperature=self.temperature, + max_tokens=self.max_tokens, + stream=True) + + json_data = json.dumps(send_data) + + header_dict = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + self.app_code} + + r = requests.post(self.api_base, 
headers=header_dict, data=json_data, timeout=3000, stream=True) + try: + if send_data.get('stream', False): + # 流式处理 + chunks = [] + full_content = "" + + try: + for line in r.iter_lines(): + if line: + decoded_line = line.decode('utf-8') + if decoded_line.startswith('data: '): + event_data = decoded_line[6:] + if event_data == '[DONE]': + break + try: + chunk = json.loads(event_data) + chunks.append(chunk) + + # 记录最后一个有效的usage(不累加) + if 'usage' in chunk: + _ = chunk['usage'] + + # 实时输出内容 + if 'choices' in chunk: + for choice in chunk['choices']: + if 'delta' in choice and 'content' in choice['delta']: + content = choice['delta']['content'] + print(content, end='', flush=True) + full_content += content + except json.JSONDecodeError: + continue + print("\n") # 换行 + + return 0, full_content, 'Succeeded! ' + + except Exception as e: + return -1, f'Error: {str(e)}', '' + else: + # 非流式处理 + try: + r_json = r.json() + output = r_json['choices'][0]['message']['content'] + return 0, output, 'Succeeded! ' + except: + error_msg = f'Error! 
code {r.status_code} content: {r.content}' + error_con = r.content.decode('utf-8') + if self.verbose: + logger.error(error_msg) + logger.error(error_con) + logger.error(f'The input messages are {inputs}.') + return -1, error_msg, '' + except Exception as e: + return -1, f'Error: {str(e)}', '' + + +class JTVLChatAPI_Mini(JTVLChatWrapper): + + def generate(self, message, dataset=None): + return super(JTVLChatAPI_Mini, self).generate(message, dataset=dataset) + + +class JTVLChatAPI_2B(JTVLChatWrapper): + + def generate(self, message, dataset=None): + return super(JTVLChatAPI_2B, self).generate(message, dataset=dataset) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/kimivl_api.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/kimivl_api.py new file mode 100644 index 00000000..2823698c --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/kimivl_api.py @@ -0,0 +1,166 @@ +import json +import os + +import numpy as np +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.smp import encode_image_to_base64, get_logger + +logger = get_logger(__name__) + +APIBASES = { + 'OFFICIAL': 'http://localhost:8000/v1/chat/completions', +} + + +def extract_summary(text: str, bot: str = "◁think▷", eot: str = "◁/think▷") -> str: + # 输出截断, 返回空字符串 + if bot in text and eot not in text: + return "" + if eot in text: + return text[text.index(eot) + len(eot):].strip() + return text + + +class KimiVLAPIWrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'api-kimi-vl-thinking-2506', + retry: int = 5, + key: str = None, + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0.8, + timeout: int = 360, + api_base: str = 'OFFICIAL', + max_tokens: int = 32768, + **kwargs): + + self.model = model + self.cur_idx = 0 + self.fail_msg = 'Failed to obtain answer via API. 
def prepare_itlist(self, inputs):
    """Convert a flat list of ``{'type', 'value'}`` dicts into chat content parts.

    Without any image entry, every entry must be text and the values are
    joined with newlines into a single text part. With at least one image
    entry, each non-empty text becomes its own text part and each image is
    opened with PIL, passed through ``encode_image_to_base64`` and embedded
    as a ``data:image/jpeg;base64`` URL; entries of any other type are
    ignored.

    Args:
        inputs: Iterable of dicts carrying ``type`` and ``value`` keys.

    Returns:
        A list of content-part dicts (``type='text'`` / ``type='image_url'``).
    """
    assert all([isinstance(entry, dict) for entry in inputs])
    kinds = [entry['type'] for entry in inputs]

    if 'image' not in kinds:
        # Text-only message: collapse everything into one text part.
        assert all([kind == 'text' for kind in kinds])
        merged = '\n'.join([entry['value'] for entry in inputs])
        return [dict(type='text', text=merged)]

    parts = []
    for entry in inputs:
        kind = entry['type']
        if kind == 'text':
            # Empty text parts are dropped when images are present.
            if entry["value"] != "":
                parts.append(dict(type='text', text=entry['value']))
        elif kind == 'image':
            from PIL import Image

            encoded = encode_image_to_base64(Image.open(entry['value']))
            parts.append(dict(
                type='image_url',
                image_url=dict(url=f'data:image/jpeg;base64,{encoded}'),
            ))
    return parts
def generate_inner(self, inputs, **kwargs) -> tuple:
    """Send one chat-completion request and return the post-thinking answer.

    Args:
        inputs: Messages accepted by ``self.prepare_inputs``.
        **kwargs: ``temperature`` and ``max_tokens`` override the instance
            defaults; any remaining items are forwarded in the JSON payload.

    Returns:
        ``(ret_code, answer, response)``. ``ret_code`` is 0 for a 2xx HTTP
        status and the raw status code otherwise. ``answer`` is the message
        content after ``extract_summary`` has removed the thinking block, or
        ``self.fail_msg`` when the response could not be parsed.
        ``response`` is the ``requests`` response object.
    """
    input_msgs = self.prepare_inputs(inputs)
    temperature = kwargs.pop('temperature', self.temperature)
    max_tokens = kwargs.pop('max_tokens', self.max_tokens)

    headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
    payload = dict(
        model=self.model,
        messages=input_msgs,
        n=1,
        temperature=temperature,
        **kwargs)
    # Assigned after the dict() call so the popped value always wins.
    payload['max_tokens'] = max_tokens
    # Diagnostics go through the module logger instead of stdout.
    logger.debug(self.model)

    response = requests.post(
        self.api_base,
        headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

    answer = self.fail_msg
    try:
        resp_struct = json.loads(response.text)
        answer = resp_struct['choices'][0]['message']['content'].strip()
        logger.debug(answer)
        # Word counts before/after stripping tell how long the thinking was.
        length_before_es = len(answer.split())
        answer = extract_summary(answer)
        length_after_es = len(answer.split())
        if length_before_es != length_after_es:
            logger.info("Thinking length: {}".format(length_before_es - length_after_es))
    except Exception as err:
        # Best effort: keep the fail_msg / partial answer and report the cause.
        if self.verbose:
            logger.error(f'{type(err)}: {err}')
            logger.error(response.text if hasattr(response, 'text') else response)

    return ret_code, answer, response
self).generate(message) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/lmdeploy.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/lmdeploy.py new file mode 100644 index 00000000..a8d286bc --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/lmdeploy.py @@ -0,0 +1,142 @@ +import os + +import numpy as np + +from ..smp import encode_image_to_base64, get_logger +from .adapters import build_adapter +from .openai_sdk import OpenAISDKWrapper + +logger = get_logger(__name__) + + +class LMDeployWrapper(OpenAISDKWrapper): + """OpenAI-compatible wrapper for lmdeploy-served models. + + Handles lmdeploy-specific details: + + * ``LMDEPLOY_API_KEY`` / ``LMDEPLOY_API_BASE`` environment variables + * Automatic adapter selection based on model name (see :meth:`_detect_adapter`) + * lmdeploy image-encoding format via :meth:`prepare_itlist` + + Model-specific prompt building and payload post-processing are + delegated to a :class:`~vlmeval.api.adapters.ModelAdapter` that is + selected automatically or specified via ``custom_prompt``. + """ + + def __init__(self, + model, + retry=5, + wait=5, + key='sk-123456', + verbose=True, + timeout=120, + api_base=None, + system_prompt=None, + custom_prompt=None, + **kwargs): + self.fail_msg = 'Failed to obtain answer via API. ' + self.timeout = timeout + + key = os.environ.get('LMDEPLOY_API_KEY', key) + api_base = api_base or os.environ.get('LMDEPLOY_API_BASE', None) + assert key is not None, 'Please set the environment variable LMDEPLOY_API_KEY.' + assert api_base, 'Please set the environment variable LMDEPLOY_API_BASE.' + self.key = key + self.api_base = api_base + + kwargs = {'max_tokens': 16384, 'temperature': 0.0, **kwargs} + kwargs = {k: v for k, v in kwargs.items() if v is not None} + + # Call OpenAISDKWrapper without model_adapter; we set self.adapter below + # after resolving the model name. 
+ super().__init__( + retry=retry, + wait=wait, + system_prompt=system_prompt, + verbose=verbose, + **kwargs, + ) + + self.model = model + logger.info(f'lmdeploy evaluate model: {self.model}') + + # Resolve and instantiate adapter + if custom_prompt is None: + custom_prompt = self._detect_adapter(self.model) + if custom_prompt is not None: + self.adapter = build_adapter(custom_prompt) + logger.info(f'Using model adapter: {custom_prompt}') + + # ------------------------------------------------------------------ + # Adapter auto-detection + # ------------------------------------------------------------------ + + def _detect_adapter(self, model_name): + """Return the adapter name for *model_name*, or ``None``.""" + name = model_name.lower() + if 'cogvlm2-llama3-chat-19b' in name: + return 'cogvlm2' + if 'interns1' in name or 'intern-s1' in name: + return 'interns1_1_no_think' + if 'internvl3' in name: + return 'internvl3' + if 'internvl' in name: + if 'mpo' in name: + return 'internvl2-mpo-cot' + return 'internvl2' + if 'qwen3' in name: + return 'qwen3' + return None + + # ------------------------------------------------------------------ + # HTTP message formatting (lmdeploy-specific) + # ------------------------------------------------------------------ + + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + elif msg['type'] == 'image': + from PIL import Image + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img) + extra_args = {k: v for k, v in msg.items() if k not in ('type', 'value')} + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args) + content_list.append(dict(type='image_url', image_url=img_struct)) + else: + assert all(x['type'] == 'text' for x in inputs) + text = 
def prepare_inputs(self, inputs, system_prompt):
    """Build the chat ``messages`` list for one request.

    Args:
        inputs: Non-empty list of dicts. Either every entry has a ``type``
            key (one flat user turn) or every entry has a ``role`` key
            (multi-turn conversation whose last turn must be ``user``).
        system_prompt: Optional system message placed first; skipped when
            ``None``.

    Returns:
        List of ``{'role', 'content'}`` dicts whose content is produced by
        ``self.prepare_itlist``.
    """
    messages = []
    if system_prompt is not None:
        messages.append({'role': 'system', 'content': system_prompt})

    assert isinstance(inputs, list) and isinstance(inputs[0], dict)
    is_flat = all(['type' in entry for entry in inputs])
    assert is_flat or all(['role' in entry for entry in inputs]), inputs

    if 'role' not in inputs[0]:
        # Flat content list: the whole thing is a single user turn.
        messages.append({'role': 'user', 'content': self.prepare_itlist(inputs)})
        return messages

    assert inputs[-1]['role'] == 'user', inputs[-1]
    for turn in inputs:
        messages.append({'role': turn['role'], 'content': self.prepare_itlist(turn['content'])})
    return messages
chunk_duration + self.num_frames_per_chunk = num_frames_per_chunk + + self.key = key + self.fail_msg = fail_msg + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + if api_base is None: + self.logger.info(f'API Base not provided. Using default API Base: {API_BASE}') + else: + self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}') + + import requests + try: + response = requests.get( + self.api_base + '/v1/live', + ) + ret_code = response.status_code + if 200 <= int(ret_code) < 300: + self.logger.info( + f'Successfully connected to LVS at {self.api_base}. ' + f'Health check returned status code: {ret_code}' + f'Ensure LVS is deployed with /files API enabled by setting VIA_DEV_API=true' + ) + else: + raise ConnectionError( + f'LVS is not available at {self.api_base}. ' + f'Health check failed with status code: {ret_code}' + ) + except requests.exceptions.RequestException as e: + raise ConnectionError( + f'LVS is not available at {self.api_base}. 
def generate_inner(self, inputs, *args, **kwargs) -> tuple:
    """Upload a video to LVS and request an event summary for it.

    Args:
        inputs: Iterable of ``{'type', 'value'}`` dicts. The last ``video``
            entry supplies the local file path and the last ``text`` entry
            the prompt. A prompt that parses as a JSON object contributes
            ``event_list``/``events`` and ``scenario``; any other prompt is
            sent verbatim as ``prompt``.

    Returns:
        ``(ret_code, answer, log)``. On success ``ret_code`` is 0 and
        ``answer`` is the JSON-encoded ``events`` list from the response.
        On failure ``answer`` is the JSON-encoded ``self.fail_msg`` and
        ``ret_code`` is non-zero: 404 (file missing), 500 (file unreadable),
        the upload or summarize HTTP status, or 1 (malformed summarize
        response).

    Raises:
        ValueError: If *inputs* has no video entry or no text entry.
    """
    video_url = None
    text_prompt = None
    for item in inputs:
        if item['type'] == 'video':
            video_url = item['value']
        elif item['type'] == 'text':
            text_prompt = item['value']

    if video_url is None:
        raise ValueError('No video URL found in inputs')
    if text_prompt is None:
        raise ValueError('No text prompt found in inputs')

    self.logger.info(f'Video path: {video_url}')
    # Only the first 50 characters of the prompt are logged.
    self.logger.info(f'Text Prompt: {text_prompt[:50]}')

    failure = json.dumps(self.fail_msg)

    # Upload file to LVS using multipart form-data. Equivalent to:
    # curl --form 'file=@"/path/to/video.mp4"' --form 'purpose="vision"' --form 'media_type="video"'
    if not os.path.exists(video_url):
        log = f'Video file not found: {video_url}'
        self.logger.error(log)
        return 404, failure, log

    filename = os.path.basename(video_url)
    mime_type, _ = mimetypes.guess_type(video_url)
    if mime_type is None:
        mime_type = 'video/mp4'  # Default to mp4

    self.logger.info(f'Uploading file: {filename} (MIME: {mime_type})')

    try:
        with open(video_url, 'rb') as video_file:
            upload_response = requests.post(
                f'{self.api_base}/files',
                files={'file': (filename, video_file, mime_type)},
                data={'purpose': 'vision', 'media_type': 'video'},
            )
    except IOError as e:
        log = f'Failed to read video file: {video_url}, error: {e}'
        self.logger.error(log)
        return 500, failure, log

    ret_code = upload_response.status_code
    if not (200 <= ret_code < 300):
        log = f'Failed to upload file. Status: {ret_code}, Response: {upload_response.text}'
        self.logger.error(log)
        return ret_code, failure, log

    try:
        file_id = upload_response.json()['id']
    except (json.JSONDecodeError, KeyError) as e:
        log = f'Failed to parse upload response: {upload_response.text}, error: {e}'
        self.logger.error(log)
        return ret_code, failure, log

    self.logger.info(f'Successfully uploaded file. file_id: {file_id}')

    summarize_data = {
        'id': [file_id],
        'model': self.model,
        'auto_generate_prompt': True,
        'chunk_duration': self.chunk_duration,
        'num_frames_per_chunk': self.num_frames_per_chunk,
    }

    try:
        prompt_data = json.loads(text_prompt)
    except (json.JSONDecodeError, TypeError):
        prompt_data = None
    if isinstance(prompt_data, dict):
        events = prompt_data.get('event_list', prompt_data.get('events', []))
        scenario = prompt_data.get('scenario', '')
        # If events is a string (pipe-separated), convert to list
        if isinstance(events, str):
            events = [e.strip() for e in events.split('|')]
        summarize_data['events'] = events
        summarize_data['scenario'] = scenario
        self.logger.info(f'Sending events: {events}, scenario: {scenario}')
    else:
        # Not JSON, or JSON that is not an object (e.g. "42", a quoted
        # string, a list): send the text as-is so the prompt is never lost.
        summarize_data['prompt'] = text_prompt

    self.logger.info('Calling /summarize API...')
    summarize_response = requests.post(
        f'{self.api_base}/summarize',
        headers={'Content-Type': 'application/json'},
        json=summarize_data
    )

    http_status = summarize_response.status_code
    if not (200 <= http_status < 300):
        log = f'Failed to summarize. Status: {http_status}, Response: {summarize_response.text}'
        self.logger.error(log)
        # Return HTTP status code (non-zero) to trigger retry
        return http_status, failure, log

    try:
        response_data = summarize_response.json()
    except json.JSONDecodeError:
        log = f'Failed to parse summarize response as JSON: {summarize_response.text}'
        self.logger.error(log)
        # Return non-zero ret_code to trigger retry
        return 1, failure, log

    try:
        choices = response_data.get('choices', [])
        if not choices:
            log = f'No choices found in response: {response_data}'
            self.logger.error(log)
            return 1, failure, log

        content = choices[0].get('message', {}).get('content', '')
        if not content:
            log = f'Empty content in response: {response_data}'
            self.logger.warning(log)
            return 1, failure, log

        # The message content is itself a JSON document holding the events.
        events = json.loads(content).get('events', [])
        answer = json.dumps(events)
        log = f'Successfully called summarize. Found {len(events)} events.'
        self.logger.info(log)
        # Return 0 for success (BaseAPI expects ret_code == 0 for success)
        return 0, answer, log
    except Exception as e:
        log = f'Error parsing response: {e}, response_data: {response_data}'
        self.logger.error(log)
        return 1, failure, log
class MiMo(CosmosReason):
    """CosmosReason variant for MiMo models that requests no-thinking output.

    ``NO_THINK_SUFFIX`` is appended to the last text part of every request
    and ``parse_answer`` removes the reasoning envelope matched by
    ``_THINK_PATTERN`` from the reply.
    """

    # Suffix appended to the last text part to ask for a no-thinking reply.
    NO_THINK_SUFFIX = ' /no_think'
    # Group 1: text before the closing think tag; group 2: the answer after it.
    _THINK_PATTERN = r"^([^<]*(?:<(?!/?think>)[^<]*)*)<\/think>(?:\n|\n\n| |)([\s\S]*?)$"

    def __init__(self, *args, **kwargs):
        """Forward to ``CosmosReason`` with ``image_patch_size`` defaulting to 14."""
        kwargs.setdefault('image_patch_size', 14)  # Qwen2.5-VL family
        super().__init__(*args, **kwargs)

    def generate_inner(self, inputs, **kwargs):
        """Append the no-think suffix, then delegate to the parent."""
        inputs = self._append_no_think(inputs)
        return super().generate_inner(inputs, **kwargs)

    @classmethod
    def _append_no_think(cls, inputs):
        """Return a copy of *inputs* with the suffix on its last text part.

        The caller's list and dicts are left untouched: only the affected
        part is replaced by a new dict. Mutating in place would add one more
        suffix each time the same *inputs* object is passed in again.

        Args:
            inputs: Sequence of ``{'type', 'value'}`` parts.

        Returns:
            A new list; identical content when no text part exists.
        """
        patched = list(inputs)
        for idx in range(len(patched) - 1, -1, -1):
            part = patched[idx]
            if isinstance(part, dict) and part.get('type') == 'text':
                patched[idx] = {**part, 'value': part['value'].rstrip() + cls.NO_THINK_SUFFIX}
                break
        return patched

    def parse_answer(self, answer: str) -> str:
        """Return the text after the think envelope, or *answer* if none matches."""
        match = re.search(self._THINK_PATTERN, answer, re.DOTALL)
        return match.group(2).strip() if match else answer
class MiniMaxAPI(BaseAPI):
    """Text LLM API using MiniMax (OpenAI-compatible)."""

    is_api: bool = True

    def __init__(
        self,
        model: str = "MiniMax-M2.7",
        key: str = None,
        api_base: str = None,
        retry: int = 10,
        wait: int = 1,
        system_prompt: str = None,
        verbose: bool = True,
        temperature: float = 0,
        max_tokens: int = 2048,
        timeout: int = 300,
        **kwargs,
    ):
        """Store the request settings and initialise the base API.

        ``key`` falls back to ``$MINIMAX_API_KEY``; ``api_base`` falls back
        to ``$MINIMAX_API_BASE`` and then to ``MINIMAX_API_BASE``.

        Raises:
            ValueError: If no API key is available from either source.
        """
        # Resolve credentials and endpoint: explicit argument first, then env.
        resolved_key = key or os.environ.get("MINIMAX_API_KEY")
        resolved_base = api_base or os.environ.get("MINIMAX_API_BASE", MINIMAX_API_BASE)

        self.model = model
        self.key = resolved_key
        self.api_base = resolved_base
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.timeout = timeout

        if not resolved_key:
            raise ValueError(
                "MiniMax API key is required. Set MINIMAX_API_KEY or pass key=..."
            )

        super().__init__(
            retry=retry,
            wait=wait,
            system_prompt=system_prompt,
            verbose=verbose,
            **kwargs,
        )

        logger.info(f"MiniMaxAPI: model={self.model}, api_base={self.api_base}")

    def _prepare_messages(self, inputs):
        """Build OpenAI-style messages from VLMEvalKit input format.

        Only ``text`` parts are kept; the texts of one turn are joined with
        newlines. Inputs whose first element is a dict with a ``role`` key
        are treated as a multi-turn conversation, anything else as a single
        user turn.
        """
        def joined_text(parts):
            # Non-text parts are dropped: this wrapper is text-only.
            return "\n".join(p["value"] for p in parts if p["type"] == "text")

        history = []
        if self.system_prompt:
            history.append({"role": "system", "content": self.system_prompt})

        multi_turn = bool(inputs) and isinstance(inputs[0], dict) and "role" in inputs[0]
        if not multi_turn:
            history.append({"role": "user", "content": joined_text(inputs)})
            return history

        for turn in inputs:
            body = joined_text(turn.get("content", []))
            history.append({"role": turn["role"], "content": body})
        return history

    def generate_inner(self, inputs, **kwargs):
        """POST one chat-completion request.

        Returns:
            ``(ret_code, answer, detail)``. A request that raises yields
            ``(-1, self.fail_msg, str(err))``. Otherwise ``ret_code`` is 0
            for a 2xx status and the HTTP status code if not, ``answer`` is
            the stripped message content (or ``self.fail_msg`` when the body
            cannot be parsed) and ``detail`` is the response object.
        """
        temperature = kwargs.pop("temperature", self.temperature)
        max_tokens = kwargs.pop("max_tokens", self.max_tokens)

        body = json.dumps({
            "model": self.model,
            "messages": self._prepare_messages(inputs),
            "temperature": temperature,
            "max_tokens": max_tokens,
        })
        request_headers = {
            "Authorization": f"Bearer {self.key}",
            "Content-Type": "application/json",
        }

        try:
            response = requests.post(
                self.api_base,
                headers=request_headers,
                data=body,
                timeout=self.timeout * 1.1,
            )
        except Exception as err:
            if self.verbose:
                logger.error(f"{type(err).__name__}: {err}")
            return -1, self.fail_msg, str(err)

        status = response.status_code
        ret_code = 0 if (200 <= status < 300) else status

        answer = self.fail_msg
        try:
            answer = response.json()["choices"][0]["message"]["content"].strip()
        except Exception as err:
            if self.verbose:
                logger.error(f"{type(err).__name__}: {err}")
                logger.error(response.text)

        return ret_code, answer, response
a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/mug_u.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/mug_u.py new file mode 100644 index 00000000..2d3662b1 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/mug_u.py @@ -0,0 +1,214 @@ +import json +import os + +import numpy as np +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_MODALITY, DATASET_TYPE +from vlmeval.smp import encode_image_to_base64, get_logger, listinstr + +logger = get_logger(__name__) + + +class MUGUWrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str, + retry: int = 5, + key: str = None, + verbose: bool = True, + temperature: float = 0.0, + timeout: int = 60, + api_base: str = None, + system_prompt: str = None, + max_tokens: int = 4096, + use_mpo_prompt: bool = False, + **kwargs): + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.timeout = timeout + + api_base = 'https://shopee.sg/api/v1/compassllvm/v1/chat/completions' + assert api_base is not None, 'Please set the environment variable LMDEPLOY_API_BASE.' 
def get_max_num(self, dataset):
    """Return the ``max_num`` value used for *dataset*.

    Each tier's name list is checked with ``listinstr`` against *dataset*,
    in order, and the first matching tier's value is returned; datasets
    that match no tier get 6.
    """
    assert dataset is not None
    # (value, dataset-name list) pairs, checked top to bottom.
    tiers = (
        (1, ['MMBench-Video', 'Video-MME', 'MVBench', 'Video', 'WorldSense']),
        (12, ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
              'VCR_EN', 'VCR_ZH', 'OCRVQA']),
        (18, ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']),
        (24, ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']),
    )
    for value, names in tiers:
        if listinstr(names, dataset):
            return value
    return 6
'Y/N': + question = line['question'] + if listinstr(['MME'], dataset): + prompt = question + ' Answer the question using a single word or phrase.' + elif listinstr(['HallusionBench', 'AMBER'], dataset): + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + else: + prompt = question + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if os.getenv('USE_COT') == '1': + prompt = build_mcq_cot_prompt(line, prompt) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase.' + elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse', + 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', + 'QSpatial', 'WeMath', 'LogicVista'], dataset): + prompt = question + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt) + else: + prompt = question + '\nAnswer the question using a single word or phrase.' 
def prepare_itlist(self, inputs):
    """Convert a flat list of ``{'type', 'value'}`` dicts into chat content parts.

    Without any image entry, every entry must be text and the values are
    joined with newlines into a single text part. With at least one image
    entry, each text becomes its own text part and each image is opened
    with PIL, passed through ``encode_image_to_base64`` and embedded as a
    ``data:image/jpeg;base64`` URL. Keys of an image entry other than
    ``type`` and ``value`` are copied into its ``image_url`` dict.

    Args:
        inputs: Iterable of dicts carrying ``type`` and ``value`` keys.

    Returns:
        A list of content-part dicts (``type='text'`` / ``type='image_url'``).
    """
    assert all([isinstance(entry, dict) for entry in inputs])
    kinds = [entry['type'] for entry in inputs]

    if 'image' not in kinds:
        # Text-only message: collapse everything into one text part.
        assert all([kind == 'text' for kind in kinds])
        merged = '\n'.join([entry['value'] for entry in inputs])
        return [dict(type='text', text=merged)]

    parts = []
    for entry in inputs:
        kind = entry['type']
        if kind == 'text':
            parts.append(dict(type='text', text=entry['value']))
        elif kind == 'image':
            from PIL import Image

            encoded = encode_image_to_base64(Image.open(entry['value']))
            # Everything except type/value travels along with the image.
            extras = {k: v for k, v in entry.items() if k not in ('type', 'value')}
            image_url = dict(url=f'data:image/jpeg;base64,{encoded}', **extras)
            parts.append(dict(type='image_url', image_url=image_url))
    return parts
def generate_inner(self, inputs, **kwargs) -> tuple:
    """Send one chat-completion request to ``self.api_base``.

    Args:
        inputs: Messages accepted by ``self.prepare_inputs``.
        **kwargs: ``temperature`` and ``max_tokens`` override the instance
            defaults; any remaining items are forwarded in the JSON payload.

    Returns:
        ``(ret_code, answer, response)``. ``ret_code`` is 0 for a 2xx HTTP
        status and the raw status code otherwise. ``answer`` is the stripped
        message content (passed through ``mpo_post_processing`` when
        ``use_mpo_prompt`` is set) or ``self.fail_msg`` when the response
        could not be parsed. ``response`` is the ``requests`` response.
    """
    input_msgs = self.prepare_inputs(inputs)

    temperature = kwargs.pop('temperature', self.temperature)
    logger.info(f'Generate temperature: {temperature}')
    max_tokens = kwargs.pop('max_tokens', self.max_tokens)

    headers = {'Content-Type': 'application/json'}
    payload = dict(
        model=self.model,
        messages=input_msgs,
        max_tokens=max_tokens,
        n=1,
        top_k=1,
        temperature=temperature,
        stream=False,
        **kwargs)

    response = requests.post(
        self.api_base,
        headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

    answer = self.fail_msg
    try:
        resp_struct = json.loads(response.text)
        answer = resp_struct['choices'][0]['message']['content'].strip()

        # for internvl2-8b-mpo-cot
        if getattr(self, 'use_mpo_prompt', False):
            from ..vlm.internvl.utils import mpo_post_processing

            answer = mpo_post_processing(answer, kwargs.get('dataset'))
    except Exception as err:
        # Still best effort (the caller gets fail_msg or the raw answer),
        # but the cause is reported instead of being silently discarded.
        if getattr(self, 'verbose', False):
            logger.error(f'{type(err)}: {err}')
            logger.error(response.text if hasattr(response, 'text') else response)
    return ret_code, answer, response
class Nemotron(CosmosReason):
    """NVIDIA Nemotron VL endpoint wrapper (Nemotron-3-Nano-Omni and successors)."""

    def __init__(self, *args, **kwargs):
        """Forward to ``CosmosReason`` with ``image_patch_size`` defaulting to 16."""
        # Required by CosmosReason.prepare_itlist video branch.
        if 'image_patch_size' not in kwargs:
            kwargs['image_patch_size'] = 16
        super().__init__(*args, **kwargs)

    def prepare_inputs(self, inputs):
        """Return the parent's messages paired with ``None`` in place of its second value."""
        # NanoNemotronVLProcessor 400s on any mm_processor_kwarg.
        messages, _discarded = super().prepare_inputs(inputs)
        return messages, None
def __init__(
    self,
    model: str,
    api_base: str,
    key: str | None = None,
    enable_thinking: bool | None = None,
    thinking_budget: int | None = None,
    max_tokens: int = 8192,
    temperature: float = 0.0,
    retry: int = 10,
    wait: int = 5,
    timeout: int = 600,
    video_proc_timeout: int = 300,
    use_video_cache: bool = True,
    verbose: bool = False,
    image_patch_size: int = 16,
    system_prompt: str | None = None,
    **kwargs,
):
    """Store the endpoint and generation settings, then probe the endpoint.

    ``key`` falls back to ``$OPENAI_API_KEY`` (empty string when unset).
    The thinking budget is derived by ``_resolve_thinking_budget`` from
    ``thinking_budget``, ``enable_thinking`` and an optional
    ``chat_template_kwargs`` entry in ``kwargs``; no other ``kwargs`` entry
    is read here. ``_preflight`` is called once at the end.
    """
    # Endpoint identity and credentials.
    self.model = model
    self.api_base = api_base
    self.key = key or os.environ.get('OPENAI_API_KEY', '')

    # Generation settings.
    template_kwargs = kwargs.get('chat_template_kwargs')
    self.thinking_budget = self._resolve_thinking_budget(
        enable_thinking=enable_thinking,
        thinking_budget=thinking_budget,
        chat_template_kwargs=template_kwargs,
    )
    self.max_tokens = max_tokens
    self.temperature = temperature
    self.timeout = timeout

    # Video preprocessing settings.
    self.video_proc_timeout = video_proc_timeout
    self.use_video_cache = use_video_cache
    self.image_patch_size = image_patch_size

    self._last_error_type = None

    super().__init__(
        retry=retry,
        wait=wait,
        verbose=verbose,
        system_prompt=system_prompt,
    )
    logger.info(
        f'NVGemini: model={self.model}, api_base={self.api_base}, '
        f'thinking_budget={self.thinking_budget}'
    )
    self._preflight()
if flag else 0 + + def _preflight(self): + """One-shot :generateContent probe replacing CosmosReason's + _check_endpoint_health + _resolve_model_name. Logs loudly on failure + but does not raise — transient startup blips shouldn't kill the job, + and a genuinely broken endpoint will surface as http_error counts on + every subsequent request via the normal retry loop. + """ + payload = { + 'contents': [{'role': 'user', 'parts': [{'text': 'ping'}]}], + 'generationConfig': { + 'maxOutputTokens': 8, + 'thinkingConfig': {'thinkingBudget': 0}, + }, + } + headers = { + 'Authorization': f'Bearer {self.key}', + 'Content-Type': 'application/json', + } + try: + r = requests.post( + self.api_base, headers=headers, + data=json.dumps(payload), timeout=30, + ) + except Exception as e: + logger.warning( + f'⚠️ NVGemini preflight network error: {type(e).__name__}: {e}. ' + f'Proceeding; live requests will surface the issue.' + ) + return + if r.status_code == 200 and r.json().get('candidates'): + logger.info(f'✓ NVGemini preflight passed: {self.api_base}') + return + logger.error( + f'⚠️ NVGemini preflight failed (HTTP {r.status_code}): {r.text[:400]}. ' + f'Proceeding; live requests will surface the issue.' 
+ ) + + # ---- part builders ------------------------------------------------------ + + def _video_kwargs(self, msg: dict) -> dict: + kw = copy.deepcopy(msg) + kw.pop('type', None) + kw.pop('value', None) + return {k: v for k, v in kw.items() if k in _VIDEO_KW_KEYS and v is not None} + + def _video_part(self, msg: dict) -> dict: + video_url, video_kwargs = _process_video_with_timeout( + msg['value'], + image_patch_size=self.image_patch_size, + use_cache=self.use_video_cache, + kwargs=self._video_kwargs(msg), + timeout=self.video_proc_timeout, + ) + b64 = video_url.split('base64,', 1)[1] + part = {'inlineData': {'mimeType': 'video/mp4', 'data': b64}} + fps = (video_kwargs or {}).get('fps') + if fps is not None: + part['videoMetadata'] = {'fps': fps if not isinstance(fps, list) else fps[0]} + return part + + def _image_part(self, msg: dict) -> dict: + with open(msg['value'], 'rb') as f: + b64 = base64.b64encode(f.read()).decode() + suffix = msg['value'].lower().rsplit('.', 1)[-1] + if suffix in ('jpg', 'jpeg'): + mime = 'image/jpeg' + elif suffix in ('png', 'webp', 'gif'): + mime = f'image/{suffix}' + else: + mime = 'image/png' + return {'inlineData': {'mimeType': mime, 'data': b64}} + + def _part_for(self, msg: dict) -> dict: + t = msg['type'] + if t == 'text': + return {'text': msg['value']} + if t == 'image': + return self._image_part(msg) + if t == 'video': + return self._video_part(msg) + raise ValueError(f'NVGemini: unsupported message type {t!r}') + + # ---- generate ----------------------------------------------------------- + + def generate_inner(self, inputs, **kwargs): + self._last_error_type = None + try: + parts = [self._part_for(x) for x in inputs] + except Exception as e: + logger.warning(f'NVGemini: input preparation failed: {type(e).__name__}: {e}') + return 0, 'VIDEO_LOAD_FAILED', None + + max_tokens = kwargs.get('max_tokens', self.max_tokens) + temperature = kwargs.get('temperature', self.temperature) + payload = { + 'contents': [{'role': 'user', 
'parts': parts}], + 'generationConfig': { + 'maxOutputTokens': max_tokens, + 'temperature': temperature, + 'thinkingConfig': {'thinkingBudget': self.thinking_budget}, + }, + } + if self.system_prompt: + payload['systemInstruction'] = {'parts': [{'text': self.system_prompt}]} + + headers = { + 'Authorization': f'Bearer {self.key}', + 'Content-Type': 'application/json', + } + try: + r = requests.post( + self.api_base, headers=headers, + data=json.dumps(payload), timeout=self.timeout * 1.1, + ) + except requests.exceptions.Timeout: + logger.error(f'⚠️ ENDPOINT TIMEOUT: {self.api_base}') + self._last_error_type = 'timeout' + return 408, self.fail_msg, None + except requests.exceptions.ConnectionError as e: + logger.error(f'⚠️ ENDPOINT DOWN: {self.api_base}: {e}') + self._last_error_type = 'http_error' + return 503, self.fail_msg, None + except Exception as e: + logger.error(f'⚠️ REQUEST FAILED: {type(e).__name__}: {e}') + self._last_error_type = 'http_error' + return 500, self.fail_msg, None + + if r.status_code != 200: + self._last_error_type = 'timeout' if r.status_code in (408, 504) else 'http_error' + logger.error(f'⚠️ HTTP {r.status_code} from {self.api_base}: {r.text[:400]}') + return r.status_code, self.fail_msg + r.text[:400], r.text + + try: + data = r.json() + except Exception as e: + self._last_error_type = 'http_error' + return -1, self.fail_msg + f'json decode: {e}', r.text + + cands = data.get('candidates') or [] + if not cands: + self._last_error_type = 'http_error' + return -1, self.fail_msg + json.dumps(data)[:400], json.dumps(data) + + finish = cands[0].get('finishReason') + text = ''.join( + p.get('text', '') + for p in cands[0].get('content', {}).get('parts', []) + ) + if finish == 'MAX_TOKENS': + self._last_error_type = 'truncated' + logger.info(f"⚠️ Answer truncated at maxOutputTokens={max_tokens}.") + elif finish == 'SAFETY': + self._last_error_type = 'content_filter' + logger.info('⚠️ Response blocked by Gemini safety filter 
def generate(self, message, **kwargs):
    """Run the inherited ``generate`` and record the sample's outcome.

    When ``$INFERENCE_STATS_FILE`` is set, one JSON line of the form
    ``{"outcome": <category>}`` is appended to that file per call, mirroring
    CosmosReason.generate so run_metric.py aggregation works unchanged.

    Category rules:
      * ``'truncated'`` if ``generate_inner`` recorded a MAX_TOKENS finish;
      * ``'success'`` if the result is not ``None`` and does not contain
        ``self.fail_msg``;
      * otherwise the last recorded error type, or ``'other'`` if none.

    Recording is best-effort: a failed write never affects the returned
    result, but it is logged instead of being silently dropped.

    Returns:
        Whatever the inherited ``generate`` returned, unchanged.
    """
    result = super().generate(message, **kwargs)
    stats_file = os.environ.get('INFERENCE_STATS_FILE')
    if not stats_file:
        return result

    is_success = result is not None and self.fail_msg not in str(result)
    if self._last_error_type == 'truncated':
        category = 'truncated'
    elif is_success:
        category = 'success'
    else:
        category = self._last_error_type or 'other'

    try:
        with open(stats_file, 'a', encoding='utf-8') as f:
            f.write(json.dumps({'outcome': category}) + '\n')
    except OSError as e:
        # Stats are auxiliary; keep the evaluation running but make the
        # failure visible so missing stats can be diagnosed.
        logger.warning(
            f'NVGemini: could not append to INFERENCE_STATS_FILE={stats_file!r}: '
            f'{type(e).__name__}: {e}'
        )
    return result
def use_custom_prompt(self, dataset) -> bool:
    """Ask the adapter whether ``dataset`` gets a custom prompt.

    Returns ``False`` when no adapter is configured; otherwise returns the
    adapter's answer for ``(dataset, self.system_prompt)``.
    """
    adapter = self.adapter
    if adapter is None:
        return False
    return adapter.use_custom_prompt(dataset, self.system_prompt)
def generate_inner(self, inputs, dataset=None, **kwargs) -> tuple:
    """Send one chat-completions request and return ``(ret_code, answer, response)``.

    Args:
        inputs: Internal message list; passed through the adapter's
            ``process_inputs`` (when an adapter is set) and then through
            ``prepare_inputs``.
        dataset: Dataset name handed to every adapter hook.
        **kwargs: Extra payload fields. Adapter model-arg overrides are
            merged on top of them.

    Returns:
        ``ret_code`` is 0 for any 2xx HTTP status, otherwise the HTTP status
        code. ``answer`` is the stripped (and adapter-postprocessed) content
        of the first choice, or ``self.fail_msg`` when the body could not be
        parsed. ``response`` is the raw ``requests`` response object.
    """
    if self.adapter is not None:
        model_args = self.adapter.override_model_args(dataset)
        # An adapter may replace the system prompt for this dataset.
        system_prompt = model_args.pop('system_prompt', self.system_prompt)
        inputs = self.adapter.process_inputs(inputs, dataset)
        kwargs.update(model_args)
    else:
        system_prompt = self.system_prompt

    input_msgs = self.prepare_inputs(inputs, system_prompt)

    headers = {
        'Content-Type': 'application/json',
        'Authorization': f'Bearer {self.key}',
    }
    payload = dict(model=self.model, messages=input_msgs, n=1, **kwargs)

    if self.adapter is not None:
        payload = self.adapter.process_payload(payload, dataset=dataset)

    response = requests.post(
        self.api_base,
        headers=headers,
        data=json.dumps(payload),
        timeout=self.timeout * 1.1,
    )
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
    answer = self.fail_msg
    try:
        resp_struct = json.loads(response.text)
        answer = resp_struct['choices'][0]['message']['content'].strip()
        if self.adapter is not None:
            answer = self.adapter.postprocess(answer, dataset=dataset)
    except Exception as err:
        # Still best-effort (the caller sees fail_msg / the raw response),
        # but surface the reason in verbose mode instead of discarding it.
        if self.verbose:
            logger.error(f'{type(err)}: {err}')
            logger.error(f'Unparsable response (HTTP {response.status_code}): {response.text[:400]}')
    return ret_code, answer, response
+class QwenAPI(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'qwen-max-1201', + retry: int = 5, + verbose: bool = True, + seed: int = 2680, + temperature: float = 0.0, + system_prompt: str = None, + key: str = None, + max_tokens: int = 2048, + proxy: str = None, + **kwargs): + + assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext'] + self.model = model + import dashscope + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.temperature = temperature + self.seed = seed + if key is None: + key = os.environ.get('DASHSCOPE_API_KEY', None) + assert key is not None, ( + 'Please set the API Key (obtain it here: ' + 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)' + ) + dashscope.api_key = key + if proxy is not None: + proxy_set(proxy) + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + @staticmethod + def build_msgs(msgs_raw, system_prompt=None): + msgs = cp.deepcopy(msgs_raw) + ret = [] + if system_prompt is not None: + ret.append(dict(role='system', content=system_prompt)) + for i, msg in enumerate(msgs): + role = 'user' if i % 2 == 0 else 'assistant' + ret.append(dict(role=role, content=msg)) + return ret + + def generate_inner(self, inputs, **kwargs) -> str: + assert isinstance(inputs, str) or isinstance(inputs, list) + inputs = [inputs] if isinstance(inputs, str) else inputs + messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt) + + import dashscope + response = dashscope.Generation.call( + model=self.model, + messages=messages, + seed=self.seed, + temperature=self.temperature, + max_tokens=self.max_tokens, + result_format='message', # set the result to be "message" format. + ) + if response.status_code != HTTPStatus.OK: + return -1, 'Error: Bad Response Statuse Code. ', f'The response status code is {response.status_code}. 
def ensure_image_url(image: str) -> str:
    """Normalise an image reference into a URL string.

    Args:
        image: An ``http(s)://`` or ``file://`` URL, a ``data:image...`` URL,
            or a path to an existing local file.

    Returns:
        ``image`` unchanged when it already carries a recognised scheme,
        otherwise ``'file://' + image`` for an existing local path.

    Raises:
        ValueError: If ``image`` is neither a recognised URL nor an existing
            path.
    """
    # Accept both the 'data:image;base64,...' shorthand and standard data
    # URLs that carry a subtype, e.g. 'data:image/png;base64,...'. The
    # latter were previously rejected as "Invalid image".
    prefixes = ('http://', 'https://', 'file://', 'data:image;', 'data:image/')
    if image.startswith(prefixes):
        return image
    if os.path.exists(image):
        return 'file://' + image
    raise ValueError(f'Invalid image: {image}')
dashscope.api_key = key + super().__init__(use_custom_prompt=use_custom_prompt, **kwargs) + + def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]: + """ + inputs list[dict[str, str]], each dict has keys: ['type', 'value'] + """ + content = [] + for s in inputs: + if s['type'] == 'image': + item = {'type': 'image', 'image': ensure_image_url(s['value'])} + if dataset == 'OCRBench': + item['min_pixels'] = 10 * 10 * 28 * 28 + warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}") + if self.max_pixels is not None: + item['max_pixels'] = self.max_pixels + else: + if self.min_pixels is not None: + item['min_pixels'] = self.min_pixels + if self.max_pixels is not None: + item['max_pixels'] = self.max_pixels + elif s['type'] == 'text': + item = {'type': 'text', 'text': s['value']} + else: + raise ValueError(f"Invalid message type: {s['type']}, {s}") + content.append(item) + return content + + def generate_inner(self, inputs, **kwargs) -> str: + import dashscope + + messages = [] + if self.system_prompt is not None: + messages.append({'role': 'system', 'content': self.system_prompt}) + messages.append( + {'role': 'user', 'content': self._prepare_content(inputs, dataset=kwargs.get('dataset', None))} + ) + if self.verbose: + print(f'\033[31m{messages}\033[0m') + + # generate + generation_kwargs = self.generate_kwargs.copy() + kwargs.pop('dataset', None) + generation_kwargs.update(kwargs) + try: + response = dashscope.MultiModalConversation.call( + model=self.model, + messages=messages, + **generation_kwargs, + ) + if self.verbose: + print(response) + answer = response.output.choices[0]['message']['content'][0]['text'] + return 0, answer, 'Succeeded! 
def __init__(self,
             model: str = 'qwen-vl-plus',
             retry: int = 5,
             key: str = None,
             verbose: bool = True,
             temperature: float = 0.0,
             system_prompt: str = None,
             max_tokens: int = 2048,
             proxy: str = None,
             **kwargs):
    """Configure the DashScope client state and the base API wrapper.

    Args:
        model: Must be ``'qwen-vl-plus'`` or ``'qwen-vl-max'``.
        retry: Forwarded to the base class.
        key: API key; read from ``$DASHSCOPE_API_KEY`` when ``None``.
        verbose: Forwarded to the base class.
        temperature: Stored on the instance.
        system_prompt: Forwarded to the base class.
        max_tokens: Stored on the instance.
        proxy: When not ``None``, passed to ``proxy_set``.
        **kwargs: Forwarded to the base class.
    """
    assert model in ['qwen-vl-plus', 'qwen-vl-max']
    self.model = model
    import dashscope

    self.fail_msg = 'Failed to obtain answer via API. '
    self.temperature = temperature
    self.max_tokens = max_tokens

    # An explicit key wins over the environment variable.
    api_key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
    assert api_key is not None, (
        'Please set the API Key (obtain it here: '
        'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
    )
    # Note: the key is stored on the dashscope module, not on this instance.
    dashscope.api_key = api_key

    if proxy is not None:
        proxy_set(proxy)
    super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
def prepare_itlist(self, inputs):
    """Convert ``{'type', 'value'}`` messages into a content list.

    Without any image, all messages must be text and are merged into a
    single ``{'text': ...}`` item joined by newlines. With at least one
    image, each text message becomes ``{'text': value}`` and each image
    becomes ``{'image': 'file://' + value}``; messages of any other type are
    skipped.
    """
    assert all([isinstance(entry, dict) for entry in inputs])
    kinds = [entry['type'] for entry in inputs]

    if 'image' not in kinds:
        assert all([kind == 'text' for kind in kinds])
        merged = '\n'.join([entry['value'] for entry in inputs])
        return [{'text': merged}]

    parts = []
    for kind, entry in zip(kinds, inputs):
        if kind == 'text':
            parts.append({'text': entry['value']})
        elif kind == 'image':
            parts.append({'image': 'file://' + entry['value']})
    return parts
def process_prediction_text(text):
    """Extract the final answer from a raw model prediction.

    Rules, in order:
      1. Non-string input is returned unchanged.
      2. If the text contains ``\\boxed{``, the brace-balanced content of
         the LAST such box is returned (stripped), provided it is non-empty.
      3. Otherwise any ``<think>...</think>`` reasoning section is removed
         and the stripped remainder is returned.
      4. Otherwise the text is returned unchanged.
    """
    if not isinstance(text, str):
        return text

    marker = "\\boxed{"
    start = text.rfind(marker)
    if start != -1:
        depth = 1
        inner_chars = []
        for ch in text[start + len(marker):]:
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    # Closing brace of the box itself: not part of the answer.
                    break
            inner_chars.append(ch)
        inner = "".join(inner_chars).strip()
        if inner:
            return inner

    # NOTE(review): this pattern previously read r'.*?', which matches at
    # every position, so the branch always fired and mangled every non-boxed
    # prediction. The tag literals appear to have been lost; '<think>' and
    # '</think>' are restored here to match the thinking section the R1
    # system prompt asks for. Confirm against the model's actual output.
    think_pattern = r'<think>.*?</think>'
    if re.search(think_pattern, text, re.DOTALL):
        return re.sub(think_pattern, '', text, flags=re.DOTALL).strip()

    # Default: nothing to extract, return the text unchanged.
    return text
def use_custom_prompt(self, dataset):
    """Return whether this prompt utility builds a custom prompt for ``dataset``.

    ``dataset`` must not be ``None`` and must not be a video benchmark (both
    are asserted). Returns ``False`` for the listed physics datasets and for
    datasets whose name matches one of the excluded markers, ``True``
    otherwise.
    """
    assert dataset is not None
    assert DATASET_MODALITY(dataset) != 'VIDEO', 'not supported'

    physics_datasets = [
        'atomic_dataset', 'electro_dataset', 'mechanics_dataset',
        'optics_dataset', 'quantum_dataset', 'statistics_dataset'
    ]
    # For Multi-Turn we don't have custom prompt
    excluded_markers = ['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN', 'WeMath_COT', 'MMAlignBench']
    if dataset in physics_datasets or listinstr(excluded_markers, dataset):
        return False
    # For Video benchmarks we don't have custom prompt at here
    return not (DATASET_MODALITY(dataset) == 'VIDEO')
+ elif listinstr(['HallusionBench', 'AMBER'], dataset): + prompt = question + cot_prompt = ( + "Answer the preceding yes-or-no question. Think step by step logically, considering all " + "relevant information before answering. If you are uncertain or the problem is ambiguous, make a " + "reasoned guess based on the information provided. The last line of your response must be exactly " + "in this format: 'Answer: \\boxed{Yes}' or 'Answer: \\boxed{No}' (without quotes)." + ) + prompt = prompt + '\n' + cot_prompt + else: + prompt = question + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if os.getenv('USE_COT') == '1': + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase. \ + The image is guaranteed to contain the correct answer. Please provide the most likely \ + answer — do not answer "No." Let us think step by step.' + elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse', + 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', + 'QSpatial', 'WeMath', 'LogicVista'], dataset): + if listinstr(['MMVet'], dataset): + prompt = question + "\nPlease first think and then answer the question." + else: + prompt = question + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + else: + prompt = question + '\nAnswer the question using a single word or phrase.' 
def get_max_num(self, dataset):
    """Return the per-image dynamic-patch budget for ``dataset``.

    ``build_prompt`` passes the result straight to ``min()``, so every branch
    must return an integer. Previously the ``dataset is None`` and video
    branches only stored ``self.max_num`` and returned ``None``; they now
    also return the value they store.

    Side effects:
        Sets ``self.total_max_num`` to 64. Sets ``self.max_num`` in the
        ``None`` and video branches only.

    Returns:
        6 when ``dataset`` is ``None``, 1 for video datasets, 8/12/18/24 for
        datasets matching the lists below, and 6 otherwise.
    """
    self.total_max_num = 64
    if dataset is None:
        self.max_num = 6
        return self.max_num
    res_8_datasets = ['MMStar']
    res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld', 'HallusionBench',
                       'VCR_EN', 'VCR_ZH', 'OCRVQA', 'BMMR', 'MathVista_MINI']
    res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
    res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K', 'MMVet']
    if DATASET_MODALITY(dataset) == 'VIDEO':
        self.max_num = 1
        return self.max_num
    # First matching list wins, checked from the smallest budget upwards.
    if listinstr(res_8_datasets, dataset):
        return 8
    if listinstr(res_12_datasets, dataset):
        return 12
    if listinstr(res_18_datasets, dataset):
        return 18
    if listinstr(res_24_datasets, dataset):
        return 24
    return 6
def use_custom_prompt(self, dataset):
    """Return whether this prompt utility builds a custom prompt for ``dataset``.

    ``dataset`` must not be ``None`` and must not be a video benchmark (both
    are asserted). Returns ``False`` for the listed physics datasets and for
    datasets whose name matches one of the excluded markers, ``True``
    otherwise.
    """
    assert dataset is not None
    assert DATASET_MODALITY(dataset) != 'VIDEO', 'not supported'

    physics_datasets = [
        'atomic_dataset', 'electro_dataset', 'mechanics_dataset',
        'optics_dataset', 'quantum_dataset', 'statistics_dataset'
    ]
    # For Multi-Turn we don't have custom prompt
    excluded_markers = ['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN', 'WeMath_COT', 'MMAlignBench']
    if dataset in physics_datasets or listinstr(excluded_markers, dataset):
        return False
    # For Video benchmarks we don't have custom prompt at here
    return not (DATASET_MODALITY(dataset) == 'VIDEO')
+ ) + prompt = prompt + '\n' + cot_prompt + else: + prompt = question + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if os.getenv('USE_COT') == '1': + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['LLaVABench', 'WildVision'], dataset): + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench', + 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset): + prompt = question + '\nAnswer the question using a single word or phrase. \ + The image is guaranteed to contain the correct answer. Please provide the most likely \ + answer — do not answer "No." Let us think step by step.' + elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse', + 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', + 'QSpatial', 'WeMath', 'LogicVista'], dataset): + prompt = question + if os.getenv('USE_COT') == '1': + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + else: + prompt = question + '\nAnswer the question using a single word or phrase.' 
def get_max_num(self, dataset):
    """Return the per-image dynamic-patch budget for ``dataset``.

    ``build_prompt`` passes the result straight to ``min()``, so every branch
    must return an integer. Previously the ``dataset is None`` and video
    branches only stored ``self.max_num`` and returned ``None``; they now
    also return the value they store.

    Side effects:
        Sets ``self.total_max_num`` to 64. Sets ``self.max_num`` in the
        ``None`` and video branches only.

    Returns:
        6 when ``dataset`` is ``None``, 1 for video datasets, 8/12/18/24 for
        datasets matching the lists below, and 6 otherwise.
    """
    self.total_max_num = 64
    if dataset is None:
        self.max_num = 6
        return self.max_num
    res_8_datasets = ['MMStar']
    res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
                       'VCR_EN', 'VCR_ZH', 'OCRVQA', 'BMMR', 'MathVista_MINI']
    res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
    res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K', 'MMVet']
    if DATASET_MODALITY(dataset) == 'VIDEO':
        self.max_num = 1
        return self.max_num
    # First matching list wins, checked from the smallest budget upwards.
    if listinstr(res_8_datasets, dataset):
        return 8
    if listinstr(res_12_datasets, dataset):
        return 12
    if listinstr(res_18_datasets, dataset):
        return 18
    if listinstr(res_24_datasets, dataset):
        return 24
    return 6
' + self.max_tokens = max_tokens + self.timeout = timeout + + key = os.environ.get('LMDEPLOY_API_KEY', key) + api_base = os.environ.get('LMDEPLOY_API_BASE', api_base) + assert key is not None, 'Please set the environment variable LMDEPLOY_API_KEY.' + assert api_base is not None, 'Please set the environment variable LMDEPLOY_API_BASE.' + self.key = key + self.api_base = api_base + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + model_url = ''.join([api_base.split('v1')[0], 'v1/models']) + resp = requests.get(model_url) + model_id_list = [str(data['id']) for data in resp.json()['data']] + self.model = model if model in model_id_list else model_id_list[0] + logger.info(f'lmdeploy evaluate model: {self.model}') + + self.max_tokens = 20480 + self.temperature = 0.0 + self.custom_prompt = 'rbdash3' + + logger.info(f'using custom prompt {self.custom_prompt}') + logger.info(f'Init temperature: {self.temperature}') + self.system_prompt = R1_SYSTEM_PROMPT + + def set_dump_image(self, dump_image_func): + if self.custom_prompt in self.prompt_map: + self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func + self.dump_image_func = dump_image_func + + def use_custom_prompt(self, dataset): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].use_custom_prompt(dataset) + return False + + def build_prompt(self, line, dataset=None): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].build_prompt(line, dataset) + raise NotImplementedError + + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + print('text msg:', msg['value']) + elif msg['type'] == 'image': + from PIL import Image + img = Image.open(msg['value']) + 
b64 = encode_image_to_base64(img) + extra_args = msg.copy() + extra_args.pop('type') + extra_args.pop('value') + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args) + content_list.append(dict(type='image_url', image_url=img_struct)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = '\n'.join([x['value'] for x in inputs]) + content_list = [dict(type='text', text=text)] + return content_list + + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='system', content=self.system_prompt)) + + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs))) + return input_msgs + + def generate_inner(self, inputs, **kwargs) -> str: + temperature = kwargs.pop('temperature', self.temperature) + logger.info(f'Generate temperature: {temperature}') + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + dataset = kwargs.pop('dataset', None) + if listinstr(['MMMU_DEV_VAL', 'MathVista'], dataset): + self.system_prompt = R1_SYSTEM_PROMPT + else: + self.system_prompt = None + + input_msgs = self.prepare_inputs(inputs) + + if dataset is not None and listinstr(['BMMR'], dataset): + # BMMR dataset has a very long prompt, so we need to increase max_tokens + max_tokens = 8196 + logger.info('BMMR dataset detected, set max_tokens to 8196') + + headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'} + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + do_sample=False, + **kwargs) + response = requests.post( + self.api_base, + 
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + answer = process_prediction_text(answer) + except Exception: + pass + return ret_code, answer, response + + +class RBdashMMChat3_5_38B_Wrapper(BaseAPI): + is_api: bool = True + custom_prompt = 'rbdash3_5' + prompt_map = { + 'rbdash3_5': RBdashMMChat3_5_38B_PromptUtil() + } + + def __init__(self, + model: str = None, + retry: int = 5, + key: str = 'sk-123456', + verbose: bool = True, + temperature: float = 0.0, + timeout: int = 600, + api_base: str = None, + system_prompt: str = None, + max_tokens: int = 20480, + **kwargs): + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.timeout = timeout + + key = os.environ.get('LMDEPLOY_API_KEY', key) + api_base = os.environ.get('LMDEPLOY_API_BASE', api_base) + assert key is not None, 'Please set the environment variable LMDEPLOY_API_KEY.' + assert api_base is not None, 'Please set the environment variable LMDEPLOY_API_BASE.' 
+ self.key = key + self.api_base = api_base + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + model_url = ''.join([api_base.split('v1')[0], 'v1/models']) + resp = requests.get(model_url) + model_id_list = [str(data['id']) for data in resp.json()['data']] + self.model = model if model in model_id_list else model_id_list[0] + logger.info(f'lmdeploy evaluate model: {self.model}') + + self.max_tokens = 20480 + self.temperature = 0.0 + self.custom_prompt = 'rbdash3_5' + + logger.info(f'using custom prompt {self.custom_prompt}') + logger.info(f'Init temperature: {self.temperature}') + self.system_prompt = R1_SYSTEM_PROMPT + + def set_dump_image(self, dump_image_func): + if self.custom_prompt in self.prompt_map: + self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func + self.dump_image_func = dump_image_func + + def use_custom_prompt(self, dataset): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].use_custom_prompt(dataset) + return False + + def build_prompt(self, line, dataset=None): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].build_prompt(line, dataset) + raise NotImplementedError + + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + print('text msg:', msg['value']) + elif msg['type'] == 'image': + from PIL import Image + + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img) + extra_args = msg.copy() + extra_args.pop('type') + extra_args.pop('value') + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args) + content_list.append(dict(type='image_url', image_url=img_struct)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = 
'\n'.join([x['value'] for x in inputs]) + content_list = [dict(type='text', text=text)] + return content_list + + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='system', content=self.system_prompt)) + + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs))) + return input_msgs + + def generate_inner(self, inputs, **kwargs) -> str: + temperature = kwargs.pop('temperature', self.temperature) + logger.info(f'Generate temperature: {temperature}') + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + dataset = kwargs.pop('dataset', None) + if listinstr([ + 'MMMU_DEV_VAL', 'MathVista_MINI', 'MMBench_V11', + 'MMStar', 'DynaMath', 'MathVision', 'MathVerse', + 'LogicVista', 'WeMath'], dataset): + self.system_prompt = R1_SYSTEM_PROMPT + else: + self.system_prompt = None + + input_msgs = self.prepare_inputs(inputs) + + if dataset is not None and listinstr(['BMMR'], dataset): + # BMMR dataset has a very long prompt, so we need to increase max_tokens + max_tokens = 8196 + logger.info('BMMR dataset detected, set max_tokens to 8196') + + headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'} + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + do_sample=False, + **kwargs) + response = requests.post( + self.api_base, + headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = 
_BOXED_OPEN = "\\boxed{"
# NOTE(review): in the original literal the tag name was missing (r'.*?'), which
# matches the empty string and made re.sub() erase the whole answer.  `<think>`
# is assumed from the system prompt's "thinking process" wording — confirm
# against the tags the served model actually emits.
_THINK_BLOCK_RE = re.compile(r'<think>.*?</think>', re.DOTALL)
_MAX_TEXT_LEN = 32767
_TRUNCATED_TAIL_LEN = 30000


def _extract_first_boxed(text):
    """Return the stripped content of the first ``\\boxed{...}`` in *text*, or ''."""
    start = text.find(_BOXED_OPEN)
    if start < 0:
        return ""
    begin = start + len(_BOXED_OPEN)
    depth = 1
    pos = begin
    # Track brace depth so nested groups such as \frac{1}{2} stay inside the box.
    while pos < len(text) and depth > 0:
        ch = text[pos]
        if ch == "{":
            depth += 1
        elif ch == "}":
            depth -= 1
        pos += 1
    # When the box was closed, `pos` is one past the closing brace; an
    # unterminated box yields everything after the opening brace.
    end = pos - 1 if depth == 0 else pos
    return text[begin:end].strip()


def process_prediction_text(text):
    """Reduce a raw model reply to the answer text used for scoring.

    The first rule that applies wins:

    1. Non-string input is returned unchanged.
    2. If the text contains ``\\boxed{...}`` with non-blank content, that content
       (first occurrence, nested braces supported) is returned.
    3. If the text contains ``<think>...</think>`` blocks, they are removed and
       the stripped remainder is returned.
    4. Text longer than 32767 characters is cut to its last 30000 characters.
    5. Otherwise the text is returned as is.
    """
    if not isinstance(text, str):
        return text

    boxed = _extract_first_boxed(text)
    if boxed:
        return boxed

    stripped, removed = _THINK_BLOCK_RE.subn('', text)
    if removed:
        return stripped.strip()

    if len(text) > _MAX_TEXT_LEN:
        return text[-_TRUNCATED_TAIL_LEN:]

    return text
+ elif dataset is not None and listinstr(['AI2D_TEST'], dataset): + prompt = build_multi_choice_prompt(line, dataset) + elif dataset is not None and listinstr(['HallusionBench'], dataset): + self.cot_prompt = None + prompt = build_yesorno_cot_prompt(line, line['question'], self.cot_prompt) + elif dataset is not None and listinstr(['MMVet'], dataset): + prompt = line['question'] + "\nPlease first think and then answer the question." + + message = [dict(type='text', value=prompt)] + image_num = len(tgt_path) + max_num = max(1, min(max_num, 64 // image_num)) + + message.extend([dict(type='image', value=s, max_dynamic_patch=max_num) for s in tgt_path]) + + # reorganize_prompt + prompt = reorganize_prompt(message, image_num, dataset=dataset) + prompt.replace('', '') + message[0] = dict(type='text', value=prompt) + return message + + def get_max_num(self, dataset): + self.total_max_num = 64 + if dataset is None: + self.max_num = 6 + return None + elif dataset is not None and listinstr(['MMBench_V11'], dataset): + return 6 + elif dataset is not None and listinstr(['MMStar'], dataset): + return 8 + elif dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset): + return 12 + elif dataset is not None and listinstr(['MathVista_MINI'], dataset): + return 12 + elif dataset is not None and listinstr(['OCRBench'], dataset): + return 24 + elif dataset is not None and listinstr(['AI2D_TEST'], dataset): + return 6 + elif dataset is not None and listinstr(['HallusionBench'], dataset): + return 12 + elif dataset is not None and listinstr(['MMVet'], dataset): + return 24 + else: + return 6 + + +class RBdashMMChat3_5_PromptUtil: + def dump_image(self, line, dataset): + return self.dump_image_func(line) + + def use_custom_prompt(self, dataset): + assert dataset is not None + return True + + def build_prompt(self, line, dataset=None): + from ..vlm.internvl.utils import (build_mcq_cot_prompt, build_multi_choice_prompt, + build_qa_cot_prompt, build_yesorno_cot_prompt, + 
reorganize_prompt) + tgt_path = self.dump_image(line, dataset) + max_num = self.get_max_num(dataset) + if dataset is not None and listinstr(['MMBench_V11'], dataset): + prompt = build_multi_choice_prompt(line, dataset) + self.cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and listinstr(['MMStar'], dataset): + prompt = build_multi_choice_prompt(line, dataset) + self.cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset): + prompt = build_multi_choice_prompt(line, dataset) + self.cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and listinstr(['MathVista_MINI'], dataset): + prompt = line['question'] + self.cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + prompt = build_qa_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and listinstr(['OCRBench'], dataset): + prompt = line['question'] + '\nAnswer the question using a single word or phrase. \ + The image is guaranteed to contain the correct answer. Please provide the most likely \ + answer — do not answer "No." Let us think step by step.' 
+ elif dataset is not None and listinstr(['AI2D_TEST'], dataset): + prompt = build_multi_choice_prompt(line, dataset) + elif dataset is not None and listinstr(['HallusionBench'], dataset): + self.cot_prompt = None + prompt = build_yesorno_cot_prompt(line, line['question'], self.cot_prompt) + elif dataset is not None and listinstr(['MMVet'], dataset): + prompt = line['question'] + elif listinstr(['MathVista', 'DynaMath', 'MathVision', 'MathVerse', 'LogicVista'], dataset): + cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + prompt = build_qa_cot_prompt(line, line['question'], cot_prompt) + elif listinstr(['WeMath'], dataset): + prompt = build_multi_choice_prompt(line, dataset) + cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + prompt = build_mcq_cot_prompt(line, prompt, cot_prompt) + + message = [dict(type='text', value=prompt)] + image_num = len(tgt_path) + max_num = max(1, min(max_num, 64 // image_num)) + + message.extend([dict(type='image', value=s, max_dynamic_patch=max_num) for s in tgt_path]) + + # reorganize_prompt + prompt = reorganize_prompt(message, image_num, dataset=dataset) + prompt.replace('', '') + message[0] = dict(type='text', value=prompt) + return message + + def get_max_num(self, dataset): + self.total_max_num = 64 + if dataset is None: + self.max_num = 6 + return None + elif dataset is not None and listinstr(['MMBench_V11'], dataset): + return 6 + elif dataset is not None and listinstr(['MMStar'], dataset): + return 8 + elif dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset): + return 12 + elif dataset is not None and listinstr(['MathVista_MINI'], dataset): + return 12 + elif dataset is not None and listinstr(['OCRBench'], dataset): + return 24 + elif dataset is not None and listinstr(['AI2D_TEST'], dataset): + return 6 + elif dataset is not None and listinstr(['HallusionBench'], dataset): + return 6 + elif dataset is not None and listinstr(['MMVet'], dataset): + 
class RBdashMMChat3Wrapper(BaseAPI):
    """HTTP client for an RBdash-MM-Chat3 model behind a chat-completions endpoint.

    The endpoint and key are read from ``LMDEPLOY_API_BASE`` / ``LMDEPLOY_API_KEY``
    (falling back to the constructor arguments).  Prompts are built by the
    utility registered in ``prompt_map``; the system prompt is chosen per
    dataset in :meth:`prepare_inputs`.
    """

    is_api: bool = True
    custom_prompt = ''
    prompt_map = {
        "rbdashmm3chat": RBdashMMChat3_PromptUtil(),
    }

    def __init__(self,
                 model: str = None,
                 retry: int = 5,
                 key: str = 'sk-123456',
                 verbose: bool = True,
                 temperature: float = 0.0,
                 timeout: int = 300,
                 api_base: str = None,
                 system_prompt: str = None,
                 max_tokens: int = 16384,
                 **kwargs):
        """Resolve credentials, discover the served model id and set generation defaults.

        Args:
            model: Preferred model id; the first id reported by the server is used
                when this one is not served.
            retry: Forwarded to ``BaseAPI``.
            key: API key, overridden by ``LMDEPLOY_API_KEY`` when that is set.
            verbose: Forwarded to ``BaseAPI``.
            temperature: Default sampling temperature for requests.
            timeout: Request timeout in seconds.
            api_base: Chat endpoint URL, overridden by ``LMDEPLOY_API_BASE``.
            system_prompt: Forwarded to ``BaseAPI`` (then reset to ``None``).
            max_tokens: Accepted for interface compatibility (see note below).
            **kwargs: Forwarded to ``BaseAPI``.
        """
        self.fail_msg = 'Failed to obtain answer via API. '
        self.timeout = timeout

        key = os.environ.get('LMDEPLOY_API_KEY', key)
        api_base = os.environ.get('LMDEPLOY_API_BASE', api_base)
        assert key is not None, 'Please set the environment variable LMDEPLOY_API_KEY.'
        assert api_base is not None, 'Please set the environment variable LMDEPLOY_API_BASE.'
        self.key = key
        self.api_base = api_base
        super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

        # Everything before the first 'v1' in the chat URL is treated as the
        # server root when asking which models are served.
        model_url = ''.join([api_base.split('v1')[0], 'v1/models'])
        # A timeout keeps construction from hanging forever on a dead server.
        resp = requests.get(model_url, timeout=timeout)
        model_id_list = [str(data['id']) for data in resp.json()['data']]
        self.model = model if model in model_id_list else model_id_list[0]
        logger.info(f'lmdeploy evaluate model: {self.model}')

        self.custom_prompt = "rbdashmm3chat"
        # NOTE(review): the `max_tokens` argument is left overridden by this
        # fixed value, as in the original code — confirm this is intended.
        self.max_tokens = 16384
        logger.info(f'using custom prompt {self.custom_prompt}')

        self.temperature = temperature
        logger.info(f'Init temperature: {self.temperature}')

        self.system_prompt = None

    def set_dump_image(self, dump_image_func):
        """Install the image-dumping callback on this wrapper and on its prompt utility."""
        if self.custom_prompt in self.prompt_map:
            self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func
        self.dump_image_func = dump_image_func

    def use_custom_prompt(self, dataset):
        """Always use the custom prompt; ``dataset`` must not be ``None``."""
        assert dataset is not None
        return True

    def build_prompt(self, line, dataset=None):
        """Build the message list for ``line`` via the registered prompt utility.

        Raises:
            NotImplementedError: no prompt utility is registered for ``custom_prompt``.
        """
        if self.custom_prompt in self.prompt_map:
            return self.prompt_map[self.custom_prompt].build_prompt(line, dataset)
        raise NotImplementedError

    def prepare_itlist(self, inputs):
        """Convert ``{'type', 'value', ...}`` items into chat ``content`` entries.

        With at least one image, text items become ``text`` entries and image items
        become base64 ``image_url`` entries (extra keys of the item are forwarded
        inside ``image_url``).  Without images, all text values are joined with
        newlines into a single ``text`` entry.
        """
        assert np.all([isinstance(x, dict) for x in inputs])
        has_images = np.sum([x['type'] == 'image' for x in inputs])
        if not has_images:
            assert all([x['type'] == 'text' for x in inputs])
            text = '\n'.join([x['value'] for x in inputs])
            return [dict(type='text', text=text)]

        from PIL import Image

        content_list = []
        for msg in inputs:
            if msg['type'] == 'text':
                content_list.append(dict(type='text', text=msg['value']))
            elif msg['type'] == 'image':
                # Context manager so the underlying file handle is released.
                with Image.open(msg['value']) as img:
                    b64 = encode_image_to_base64(img)
                extra_args = msg.copy()
                extra_args.pop('type')
                extra_args.pop('value')
                img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args)
                content_list.append(dict(type='image_url', image_url=img_struct))
        return content_list

    def prepare_inputs(self, inputs, dataset):
        """Assemble the ``messages`` list with a dataset-dependent system prompt.

        Datasets whose name contains ``MMMU_DEV_VAL`` or ``MathVista_MINI`` get
        ``R1_SYSTEM_PROMPT``; every other dataset (including ``None``) gets the
        fixed identity system message below.
        """
        input_msgs = []

        # `dataset` may be None (generate_inner defaults it), so guard the lookup.
        if dataset is not None and listinstr(['MMMU_DEV_VAL', 'MathVista_MINI'], dataset):
            self.system_prompt = R1_SYSTEM_PROMPT
        else:
            self.system_prompt = None

        if self.system_prompt is not None:
            input_msgs.append(dict(role='system', content=self.system_prompt))
        else:
            input_msgs.append(dict(role='system', content="你是书生·万象,英文名是InternVL,是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型。"))

        assert isinstance(inputs, list) and isinstance(inputs[0], dict)
        assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
        if 'role' in inputs[0]:
            assert inputs[-1]['role'] == 'user', inputs[-1]
            for item in inputs:
                input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
        else:
            input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
        return input_msgs

    def generate_inner(self, inputs, **kwargs) -> str:
        """Send one chat request and post-process the reply.

        Recognised ``kwargs``: ``temperature``, ``max_tokens``, ``dataset``; the
        rest is forwarded verbatim in the request payload.

        Returns:
            ``(ret_code, answer, response)`` where ``ret_code`` is 0 for a 2xx
            status (otherwise the HTTP status code), ``answer`` is the processed
            reply or ``self.fail_msg`` when the body could not be parsed, and
            ``response`` is the raw ``requests`` response.
        """
        temperature = kwargs.pop('temperature', self.temperature)
        logger.info(f'Generate temperature: {temperature}')
        max_tokens = kwargs.pop('max_tokens', self.max_tokens)
        dataset = kwargs.pop('dataset', None)

        input_msgs = self.prepare_inputs(inputs, dataset)

        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
        payload = dict(
            model=self.model,
            messages=input_msgs,
            max_tokens=max_tokens,
            n=1,
            temperature=temperature,
            session_len=40960,
            do_sample=False,
            **kwargs)
        response = requests.post(
            self.api_base,
            headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
            answer = process_prediction_text(answer)
        except Exception as err:
            # Still best-effort (the caller receives fail_msg / ret_code), but the
            # reason is now logged instead of being silently discarded.
            logger.warning(
                f'Could not parse API response (HTTP {response.status_code}): '
                f'{type(err).__name__}: {err}')
        return ret_code, answer, response
class RBdashMMChat3_5_Wrapper(BaseAPI):
    """HTTP client for an RBdash-MM-Chat3.5 model behind a chat-completions endpoint.

    The endpoint and key are read from ``LMDEPLOY_API_BASE`` / ``LMDEPLOY_API_KEY``
    (falling back to the constructor arguments).  Prompts are built by the
    utility registered in ``prompt_map``; the system prompt is chosen per
    dataset in :meth:`prepare_inputs`.
    """

    is_api: bool = True
    custom_prompt = ''
    prompt_map = {
        "rbdashmm3_5chat": RBdashMMChat3_5_PromptUtil(),
    }

    # Dataset-name fragments that switch on the R1-style system prompt.
    _R1_DATASETS = [
        'MMMU_DEV_VAL', 'MathVista_MINI', 'MMBench_V11',
        'MMStar', 'DynaMath', 'MathVision', 'MathVerse',
        'LogicVista', 'WeMath']

    def __init__(self,
                 model: str = None,
                 retry: int = 5,
                 key: str = 'sk-123456',
                 verbose: bool = True,
                 temperature: float = 0.0,
                 timeout: int = 300,
                 api_base: str = None,
                 system_prompt: str = None,
                 max_tokens: int = 20480,
                 **kwargs):
        """Resolve credentials, discover the served model id and set generation defaults.

        Args:
            model: Preferred model id; the first id reported by the server is used
                when this one is not served.
            retry: Forwarded to ``BaseAPI``.
            key: API key, overridden by ``LMDEPLOY_API_KEY`` when that is set.
            verbose: Forwarded to ``BaseAPI``.
            temperature: Default sampling temperature for requests.
            timeout: Request timeout in seconds.
            api_base: Chat endpoint URL, overridden by ``LMDEPLOY_API_BASE``.
            system_prompt: Forwarded to ``BaseAPI`` (then reset to ``None``).
            max_tokens: Accepted for interface compatibility (see note below).
            **kwargs: Forwarded to ``BaseAPI``.
        """
        self.fail_msg = 'Failed to obtain answer via API. '
        self.timeout = timeout

        key = os.environ.get('LMDEPLOY_API_KEY', key)
        api_base = os.environ.get('LMDEPLOY_API_BASE', api_base)
        assert key is not None, 'Please set the environment variable LMDEPLOY_API_KEY.'
        assert api_base is not None, 'Please set the environment variable LMDEPLOY_API_BASE.'
        self.key = key
        self.api_base = api_base
        super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)

        # Everything before the first 'v1' in the chat URL is treated as the
        # server root when asking which models are served.
        model_url = ''.join([api_base.split('v1')[0], 'v1/models'])
        # A timeout keeps construction from hanging forever on a dead server.
        resp = requests.get(model_url, timeout=timeout)
        model_id_list = [str(data['id']) for data in resp.json()['data']]
        self.model = model if model in model_id_list else model_id_list[0]
        logger.info(f'lmdeploy evaluate model: {self.model}')

        self.custom_prompt = "rbdashmm3_5chat"
        # NOTE(review): the `max_tokens` argument is left overridden by this
        # fixed value, as in the original code — confirm this is intended.
        self.max_tokens = 20480
        logger.info(f'using custom prompt {self.custom_prompt}')

        self.temperature = temperature
        logger.info(f'Init temperature: {self.temperature}')

        self.system_prompt = None

    def set_dump_image(self, dump_image_func):
        """Install the image-dumping callback on this wrapper and on its prompt utility."""
        if self.custom_prompt in self.prompt_map:
            self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func
        self.dump_image_func = dump_image_func

    def use_custom_prompt(self, dataset):
        """Always use the custom prompt; ``dataset`` must not be ``None``."""
        assert dataset is not None
        return True

    def build_prompt(self, line, dataset=None):
        """Build the message list for ``line`` via the registered prompt utility.

        Raises:
            NotImplementedError: no prompt utility is registered for ``custom_prompt``.
        """
        if self.custom_prompt in self.prompt_map:
            return self.prompt_map[self.custom_prompt].build_prompt(line, dataset)
        raise NotImplementedError

    def prepare_itlist(self, inputs):
        """Convert ``{'type', 'value', ...}`` items into chat ``content`` entries.

        With at least one image, text items become ``text`` entries and image items
        become base64 ``image_url`` entries (extra keys of the item are forwarded
        inside ``image_url``).  Without images, all text values are joined with
        newlines into a single ``text`` entry.
        """
        assert np.all([isinstance(x, dict) for x in inputs])
        has_images = np.sum([x['type'] == 'image' for x in inputs])
        if not has_images:
            assert all([x['type'] == 'text' for x in inputs])
            text = '\n'.join([x['value'] for x in inputs])
            return [dict(type='text', text=text)]

        from PIL import Image

        content_list = []
        for msg in inputs:
            if msg['type'] == 'text':
                content_list.append(dict(type='text', text=msg['value']))
            elif msg['type'] == 'image':
                # Context manager so the underlying file handle is released.
                with Image.open(msg['value']) as img:
                    b64 = encode_image_to_base64(img)
                extra_args = msg.copy()
                extra_args.pop('type')
                extra_args.pop('value')
                img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args)
                content_list.append(dict(type='image_url', image_url=img_struct))
        return content_list

    def prepare_inputs(self, inputs, dataset):
        """Assemble the ``messages`` list with a dataset-dependent system prompt.

        Datasets whose name contains one of ``_R1_DATASETS`` get
        ``R1_SYSTEM_PROMPT``; for every other dataset (including ``None``) no
        system message is sent.
        """
        input_msgs = []

        # `dataset` may be None (generate_inner defaults it), so guard the lookup.
        if dataset is not None and listinstr(self._R1_DATASETS, dataset):
            self.system_prompt = R1_SYSTEM_PROMPT
        else:
            self.system_prompt = None

        if self.system_prompt is not None:
            input_msgs.append(dict(role='system', content=self.system_prompt))

        assert isinstance(inputs, list) and isinstance(inputs[0], dict)
        assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
        if 'role' in inputs[0]:
            assert inputs[-1]['role'] == 'user', inputs[-1]
            for item in inputs:
                input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
        else:
            input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
        return input_msgs

    def generate_inner(self, inputs, **kwargs) -> str:
        """Send one chat request and post-process the reply.

        Recognised ``kwargs``: ``temperature``, ``max_tokens``, ``dataset``; the
        rest is forwarded verbatim in the request payload.

        Returns:
            ``(ret_code, answer, response)`` where ``ret_code`` is 0 for a 2xx
            status (otherwise the HTTP status code), ``answer`` is the processed
            reply or ``self.fail_msg`` when the body could not be parsed, and
            ``response`` is the raw ``requests`` response.
        """
        temperature = kwargs.pop('temperature', self.temperature)
        logger.info(f'Generate temperature: {temperature}')
        max_tokens = kwargs.pop('max_tokens', self.max_tokens)
        dataset = kwargs.pop('dataset', None)

        input_msgs = self.prepare_inputs(inputs, dataset)

        headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
        payload = dict(
            model=self.model,
            messages=input_msgs,
            max_tokens=max_tokens,
            n=1,
            temperature=temperature,
            session_len=40960,
            do_sample=False,
            **kwargs)
        response = requests.post(
            self.api_base,
            headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
            answer = process_prediction_text(answer)
        except Exception as err:
            # Still best-effort (the caller receives fail_msg / ret_code), but the
            # reason is now logged instead of being silently discarded.
            logger.warning(
                f'Could not parse API response (HTTP {response.status_code}): '
                f'{type(err).__name__}: {err}')
        return ret_code, answer, response
class Reka_Wrapper(BaseAPI):
    """Single-image chat client built on the ``reka`` Python package."""

    is_api: bool = True
    INTERLEAVE: bool = False

    def __init__(self,
                 model: str = 'reka-flash-20240226',
                 key: str = None,
                 retry: int = 10,
                 system_prompt: str = None,
                 verbose: bool = True,
                 temperature: float = 0,
                 max_tokens: int = 1024,
                 **kwargs):
        """Store the model name, request defaults and API key.

        The key comes from ``key`` when given, otherwise from the
        ``REKA_API_KEY`` environment variable (empty string if unset).
        Extra ``kwargs`` override the request defaults and are also forwarded
        to ``BaseAPI``.

        Raises:
            ImportError: the ``reka`` package is not installed.
        """
        # Fail early, with an install hint, when the client library is missing.
        try:
            import reka  # noqa: F401
        except ImportError:
            raise ImportError('Please install reka by running "pip install reka-api"')

        self.model = model
        self.key = key if key is not None else os.environ.get('REKA_API_KEY', '')
        # Caller-supplied kwargs take precedence over the two defaults.
        self.kwargs = {'temperature': temperature, 'request_output_len': max_tokens, **kwargs}
        super().__init__(retry=retry, verbose=verbose, system_prompt=system_prompt, **kwargs)

    def generate_inner(self, inputs, **kwargs) -> str:
        """Send the prompt and its image to Reka.

        Returns:
            ``(0, text, response)`` on success, or
            ``(-1, fail_msg + error, response)`` when the response has no
            usable ``'text'`` entry.
        """
        import reka

        reka.API_KEY = self.key
        prompt, image_path = self.message_to_promptimg(inputs, dataset=kwargs.pop('dataset', None))
        encoded = encode_image_file_to_base64(image_path)
        media_url = f'data:image/jpeg;base64,{encoded}'

        response = reka.chat(
            model_name=self.model,
            human=prompt,
            media_url=media_url,
            **self.kwargs)

        try:
            text = response['text']
        except Exception as err:
            return -1, self.fail_msg + str(err), response
        return 0, text, response
return super(Reka_Wrapper, self).generate(message) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/sensechat_vision.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/sensechat_vision.py new file mode 100644 index 00000000..fd72da3e --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/sensechat_vision.py @@ -0,0 +1,411 @@ +import os +import string +import time +from typing import Optional + +import pandas as pd +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_TYPE, img_root_map +from vlmeval.smp import (LMUDataRoot, cn_string, decode_base64_to_image_file, get_logger, + listinstr, osp, read_ok, toliststr) + +logger = get_logger(__name__) + + +class SenseChatVisionWrapper(BaseAPI): + is_api: bool = True + + def __init__( + self, + base_url: str = "https://api.sensenova.cn/v1/llm/chat-completions", + api_key: str = None, + model: str = "SenseNova-V6-5-Pro", + retry: int = 5, + wait: int = 5, + verbose: bool = True, + system_prompt: str = None, + max_tokens: int = 16384, + **kwargs, + ): + self.base_url = base_url + self.model = model + self.fail_msg = "Failed to obtain answer via API. " + self.api_key = os.getenv("SENSENOVA_API_KEY", api_key) + assert self.api_key is not None, ( + "Please set the `SENSENOVA_API_KEY` environment variable or pass `api_key` in the config.json." + ) + self.max_new_tokens = max_tokens + self.thinking = False + super().__init__( + wait=wait, + retry=retry, + system_prompt=system_prompt, + verbose=verbose, + **kwargs, + ) + + def dump_image(self, line, dataset): + """Dump the image(s) of the input line to the corresponding dataset folder. + + Args: + line (line of pd.DataFrame): The raw input line. + dataset (str): The name of the dataset. + + Returns: + str | list[str]: The paths of the dumped images. 
def image_to_base64(self, image_path):
    """Read the file at ``image_path`` and return its bytes as base64 text."""
    import base64

    with open(image_path, "rb") as stream:
        raw_bytes = stream.read()
    return base64.b64encode(raw_bytes).decode("utf-8")
def build_mcq_cot_prompt(self, line, prompt):
    """Build the chain-of-thought prompt for one question record.

    Args:
        line: Mapping-like record providing ``question``, ``id``,
            ``question_type`` and optional option columns ``A``..``Z``.
        prompt: Accepted for signature compatibility; its value is not used.

    Returns:
        The template selected by ``line['question_type']`` with the subject
        filled in, followed by a newline, the question and one
        ``"<letter>. <option>"`` line per non-missing option.
    """
    templates = {
        'multiple-choice': "You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Answer the preceding multiple choice question. The last line of your response should follow this format: 'Answer: \\boxed LETTER', where LETTER is one of the options. If you are uncertain or the problem is too complex, make a reasoned guess based on the information provided. Avoid repeating steps indefinitely—provide your best guess even if unsure. Think step by step logically, considering all relevant information before answering.",  # noqa: E501
        'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"'  # noqa: E501
    }

    question = line["question"]
    # Append every option column that is present and not NaN, in A..Z order.
    question += ''.join(
        f'\n{letter}. {line[letter]}'
        for letter in string.ascii_uppercase
        if letter in line and not pd.isna(line[letter])
    )

    # The subject is the id with its first and last "_"-separated fields removed.
    subject = '_'.join(line['id'].split('_')[1:-1])
    return templates[line['question_type']].format(subject, subject) + '\n' + question
def message_to_promptimg(self, message, dataset=None):
    """Split a message into a single prompt string and a list of image paths.

    Args:
        message: List of dicts with ``type`` (``"text"`` or ``"image"``) and
            ``value`` keys.
        dataset: Optional dataset name. When it is ``None`` or contains
            ``"MMMU"`` or ``"BLINK"``, only the first image is kept.

    Returns:
        ``(prompt, images)`` where ``prompt`` joins all text values with
        newlines and ``images`` is a (possibly empty) list of image values.
    """
    prompt = "\n".join(x["value"] for x in message if x["type"] == "text")
    images = [x["value"] for x in message if x["type"] == "image"]
    if dataset is None or listinstr(["MMMU", "BLINK"], dataset):
        # Slicing instead of indexing keeps text-only messages from raising
        # IndexError; they simply yield an empty image list.
        images = images[:1]
    return prompt, images
+ assert isinstance(inputs, str) or isinstance(inputs, list) + inputs = [inputs] if isinstance(inputs, str) else inputs + dataset = kwargs.get("dataset", None) + + self.set_max_num(dataset=dataset) + + prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset) + content = [ + { + "image_base64": self.image_to_base64(item), + "type": "image_base64", + } + for item in image + ] + + content.append( + { + "text": prompt, + "type": "text", + } + ) + + message = [{"content": content, "role": "user"}] + data = { + "messages": message, + "max_new_tokens": self.max_new_tokens, + "model": self.model, + "stream": False, + "image_split_count": self.max_num, + "thinking": { + "enabled": self.thinking, + } + } + + headers = { + "Content-type": "application/json", + "Authorization": self.api_key, + } + + response = requests.post( + self.base_url, + headers=headers, + json=data, + ) + request_id = response.headers.get("x-request-id", "") + logger.info(f"Request-id: {request_id}") + + time.sleep(1) + try: + assert response.status_code == 200 + response = response.json()["data"]["choices"][0]["message"].strip() + if self.verbose: + logger.info(f"inputs: {inputs}\nanswer: {response}") + return 0, response, "Succeeded! 
" + except Exception as err: + if self.verbose: + logger.error( + "---------------------------ERROR---------------------------" + ) + logger.error(response.json()) + logger.error(err) + logger.error( + "---------------------------request_id---------------------------" + + request_id + ) + logger.error( + "api error" + + response.json()["error"]["message"] + + str( + [ + input["value"] if input["type"] == "image" else None + for input in inputs + ] + ) + ) + logger.error(f"The input messages are {inputs}.") + return -1, response.json()["error"]["message"], "" + + +class SenseChatVisionAPI(SenseChatVisionWrapper): + def generate(self, message, dataset=None): + return super(SenseChatVisionAPI, self).generate(message, dataset=dataset) + + +class SenseChatVisionV2API(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'SenseNova-V6-5-Pro-20251215', + retry: int = 5, + key: str = None, + verbose: bool = False, + system_prompt: str = None, + temperature: float = 0, + timeout: int = 300, + api_base: str = "https://api.sensenova.cn/compatible-mode/v2/chat/completions", + max_completion_tokens: int = 4096, + img_size: int = -1, + **kwargs): + + self.model = model + self.fail_msg = 'Failed to obtain answer via API. 
def prepare_itlist(self, inputs):
    """Convert message items into the ``content`` list of one chat message.

    Args:
        inputs: List of dicts with ``type`` (``"text"`` or ``"image"``) and
            ``value`` keys; for images ``value`` is a path opened with PIL.

    Returns:
        If there is at least one image: one ``text`` part per text item and
        one ``image_url`` part (base64 data URL) per image, in input order.
        Otherwise a single ``text`` part joining all values with newlines.
    """
    assert all(isinstance(x, dict) for x in inputs)
    # Count images only; the previous len() over a boolean list counted every
    # item, so text items wrongly shrank the per-image size budget.
    image_num = sum(x['type'] == 'image' for x in inputs)

    if not image_num:
        assert all(x['type'] == 'text' for x in inputs)
        text = '\n'.join(x['value'] for x in inputs)
        return [dict(type='text', text=text)]

    # Imported lazily so text-only requests do not need the imaging stack.
    from PIL import Image

    from vlmeval.smp import encode_image_to_base64

    if self.img_size > 0:
        # Split the size budget across images (side length scales with 1/sqrt(n)).
        target_size = int(self.img_size / (image_num ** 0.5))
    else:
        # img_size == -1 is this class's "unset" sentinel; pass it through
        # rather than dividing it. NOTE(review): presumably the helper treats
        # any non-positive target_size as "do not resize" - confirm.
        target_size = -1

    content_list = []
    for msg in inputs:
        if msg['type'] == 'text':
            content_list.append(dict(type='text', text=msg['value']))
        elif msg['type'] == 'image':
            img = Image.open(msg['value'])
            b64 = encode_image_to_base64(img, target_size=target_size)
            img_struct = dict(url=f'data:image/jpeg;base64,{b64}')
            content_list.append(dict(type='image_url', image_url=img_struct))
    return content_list
def generate_inner(self, inputs, **kwargs) -> str:
    """POST one chat request and return ``(ret_code, answer, response)``.

    ``ret_code`` is 0 for any 2xx HTTP status and the raw status code
    otherwise. ``answer`` is the stripped message content of the first
    choice, or ``self.fail_msg`` when the body cannot be parsed.
    Extra ``kwargs`` are merged into the JSON payload.
    """
    import json

    # NOTE(review): self.temperature and self.max_completion_tokens are stored
    # by __init__ but are not placed in the payload here - confirm intended.
    request_body = dict(model=self.model, messages=self.prepare_inputs(inputs), stream=False, **kwargs)
    auth_headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}

    # Use the proxy settings from the environment when they are set.
    proxy_map = {}
    for scheme in ('http', 'https'):
        address = os.getenv(f'{scheme}_proxy')
        if address:
            proxy_map[scheme] = address

    response = requests.post(
        self.api_base,
        headers=auth_headers,
        data=json.dumps(request_body),
        proxies=proxy_map if proxy_map else None,
        timeout=self.timeout * 1.1,
    )

    status = response.status_code
    ret_code = 0 if (200 <= int(status) < 300) else status

    answer = self.fail_msg
    try:
        parsed = json.loads(response.text)
        answer = parsed['choices'][0]['message']['content'].strip()
    except Exception as err:
        if self.verbose:
            logger.error(f'{type(err)}: {err}')
            logger.error(response.text if hasattr(response, 'text') else response)

    return ret_code, answer, response
def encode_image(path: str, max_height: int = 1024, max_width: int = 1024) -> str:
    """Load an image, bound its size, and return it as base64-encoded PNG.

    The image is converted to RGB and passed through ``resize_image`` with the
    given limits. If the shorter side of the result is below 50 pixels, the
    image is scaled up so that side becomes 50.
    """
    picture = resize_image(Image.open(path).convert("RGB"), max_height, max_width)

    w, h = picture.size
    shortest = min(h, w)
    if shortest < 50:
        factor = 50 / shortest
        picture = picture.resize((int(w * factor), int(h * factor)))

    sink = io.BytesIO()
    picture.save(sink, format="PNG")
    return base64.b64encode(sink.getvalue()).decode("utf-8")
def generate_inner(self, inputs, **kwargs) -> str:
    """POST one chat request and return ``(ret_code, answer, response)``.

    Args:
        inputs: Raw message items, converted with ``build_msgs``.
        **kwargs: Per-call overrides merged on top of ``self.default_kwargs``
            for this request only.

    Returns:
        ``ret_code`` is 0 for any 2xx status, else the HTTP status code.
        ``answer`` is the stripped message content, or a dict with
        ``content`` and ``reasoning`` when ``self.reasoning`` is set and the
        reply carries ``reasoning_content``; ``self.fail_msg`` if the body
        cannot be parsed.
    """
    # Merge into a fresh dict: updating self.default_kwargs in place made
    # per-call overrides leak into every later request on this instance.
    request_kwargs = {**self.default_kwargs, **kwargs}

    payload = dict(
        model=self.model,
        messages=self.build_msgs(msgs_raw=inputs),
        **request_kwargs,
    )

    response = requests.post(
        self.api_base, headers=self.headers, data=json.dumps(payload), timeout=self.timeout * 1.1
    )
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

    answer = self.fail_msg
    try:
        resp_struct = json.loads(response.text)
        msg = resp_struct["choices"][0]["message"]
        if self.reasoning and 'reasoning_content' in msg:
            answer = {'content': msg['content'], 'reasoning': msg['reasoning_content']}
        else:
            answer = msg["content"].strip()
    except Exception as err:
        # Still best-effort (the caller gets fail_msg), but no longer silent.
        if self.verbose:
            logger.error(f'{type(err)}: {err}')
            logger.error(response.text if hasattr(response, 'text') else response)
    return ret_code, answer, response
+ """ + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + # img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) + img_root = osp.join(ROOT, "images", img_root_map(dataset)) + os.makedirs(img_root, exist_ok=True) + if "image" in line: + if isinstance(line["image"], list): + tgt_path = [] + assert "image_path" in line + for img, im_name in zip(line["image"], line["image_path"]): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line["image"], tgt_path) + tgt_path = [tgt_path] + else: + assert "image_path" in line + tgt_path = toliststr(line["image_path"]) + return tgt_path + + def _prepare_content( + self, inputs: list[dict[str, str]], dataset: str = None + ) -> list[dict[str, str]]: + """ + inputs list[dict[str, str]], each dict has keys: ['type', 'value'] + """ + content = [] + has_image = False + for s in inputs: + if s["type"] == "image": + if not has_image: + item = { + "type": "image_url", + "image_url": { + "url": encode_image( + s["value"], + max_height=self.max_height, + max_width=self.max_width, + ) + }, + } + has_image = True + else: + continue + elif s["type"] == "text": + prompt = s["value"] + if len(prompt) == 0: + continue + if dataset == "HallusionBench": + prompt += " Please answer yes or no directly, without any unnecessary explanation." + elif dataset == "OCRBench": + prompt = ( + prompt + "\nExtract the text from the image intactly and " + + "answer the question concisely and clearly if possible." + ) + + elif ( + dataset == "AI2D_TEST" + or dataset == "MMStar" + or dataset == "MMBench_TEST_EN_V11" + or dataset == "MMVet" + ): + prompt = prompt.replace( + "Please select the correct answer from the options above. 
def generate_inner(self, inputs, **kwargs) -> str:
    """POST one single-turn chat request and return ``(ret_code, answer, response)``.

    Args:
        inputs: Message items; converted with ``self._prepare_content`` using
            the ``dataset`` value from ``kwargs`` (if any).
        **kwargs: Per-call overrides merged on top of ``self.default_kwargs``
            for this request only.

    Returns:
        On success ``ret_code`` is 0 for a 2xx status (else the HTTP status),
        ``answer`` is the stripped message content and ``response`` is the
        ``requests`` response. If the body cannot be parsed, ``(-1, "", "")``.
    """
    # Merge into a fresh dict so per-call options do not accumulate on the
    # instance-level defaults.
    # NOTE(review): a `dataset` kwarg is also sent in the payload, as before.
    request_kwargs = {**self.default_kwargs, **kwargs}

    messages = [
        {
            "role": "user",
            "content": self._prepare_content(
                inputs, dataset=kwargs.get("dataset", None)
            ),
        }
    ]
    payload = dict(model=self.model, messages=messages, **request_kwargs)

    # Same timeout policy as the parent class; without it a stalled
    # connection would block indefinitely.
    response = requests.post(
        self.api_base,
        headers=self.headers,
        data=json.dumps(payload),
        timeout=self.timeout * 1.1,
    )
    ret_code = response.status_code
    ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

    try:
        resp_struct = json.loads(response.text)
        answer = resp_struct["choices"][0]["message"]["content"].strip()
    except Exception as err:
        import traceback

        traceback.print_exc()
        if self.verbose:
            logger.error(f"{type(err)}: {err}")
            logger.error(f"The input messages are {inputs}.")
        return -1, "", ""
    return ret_code, answer, response
class StepAPI_INT(BaseAPI):
    """API wrapper that posts text and image messages to the module-level ``url``."""

    is_api: bool = True

    def __init__(self,
                 model: str = 'step-1v-8k',
                 retry: int = 10,
                 key: str = None,
                 temperature: float = 0,
                 max_tokens: int = 300,
                 verbose: bool = True,
                 system_prompt: str = None,
                 **kwargs):
        """Store request settings and build this instance's HTTP headers.

        Args:
            model: Model name sent in the payload.
            retry, verbose, system_prompt: Forwarded to ``BaseAPI.__init__``.
            key: API key; when ``None`` the ``STEPAI_API_KEY`` environment
                variable is used (empty string if unset).
            temperature: Sampling temperature sent in the payload.
            max_tokens: ``max_tokens`` value sent in the payload.
            **kwargs: Forwarded to ``BaseAPI.__init__``.
        """
        self.model = model
        self.fail_msg = 'Fail to obtain answer via API.'
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.system_prompt = system_prompt
        self.key = key if key is not None else os.environ.get('STEPAI_API_KEY', '')
        # Build per-instance headers from a copy of the module template.
        # Formatting the shared dict in place consumed its "{}" placeholder,
        # so every later instance silently reused the first instance's key.
        self.headers = dict(headers, Authorization=f'Bearer {self.key}')

        super().__init__(retry=retry, verbose=verbose, system_prompt=system_prompt, **kwargs)

    @staticmethod
    def build_msgs(msgs_raw):
        """Wrap raw items into a single ``user`` message.

        Image items become ``image_url`` parts carrying a base64 data URL;
        text items become ``text`` parts. Other item types are ignored.
        """
        messages = []
        message = {'role': 'user', 'content': []}

        for msg in msgs_raw:
            if msg['type'] == 'image':
                image_b64 = encode_image_file_to_base64(msg['value'])
                message['content'].append({
                    'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)},
                    'type': 'image_url'
                })
            elif msg['type'] == 'text':
                message['content'].append({
                    'text': msg['value'],
                    'type': 'text'
                })

        messages.append(message)
        return messages

    def generate_inner(self, inputs, **kwargs) -> str:
        """POST one chat request and return ``(ret_code, answer, response)``.

        ``ret_code`` is 0 for any 2xx status, else the HTTP status code.
        ``answer`` is the stripped message content of the first choice, or
        ``self.fail_msg`` when the body cannot be parsed.
        """
        # Was an unconditional debug print(); now follows the verbose flag.
        if self.verbose:
            logger.info(f'{inputs}\n')
        payload = dict(
            model=self.model,
            max_tokens=self.max_tokens,
            temperature=self.temperature,
            messages=self.build_msgs(msgs_raw=inputs),
            **kwargs)
        # Send this instance's headers, not the shared module-level dict.
        response = requests.post(url, headers=self.headers, data=json.dumps(payload))
        ret_code = response.status_code
        ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code

        answer = self.fail_msg
        try:
            resp_struct = json.loads(response.text)
            answer = resp_struct['choices'][0]['message']['content'].strip()
        except Exception as err:
            if self.verbose:
                logger.error(f'{type(err)}: {err}')
                logger.error(response.text if hasattr(response, 'text') else response)

        return ret_code, answer, response
class ChatResponse(dict):
    """A ``dict`` whose keys can also be read, set and deleted as attributes.

    Attribute reads wrap nested dicts (including dicts inside lists) in
    ``ChatResponse`` so lookups can be chained; a missing key reads as ``None``.
    """

    def __getattr__(self, name):
        found = self.get(name)
        if isinstance(found, list):
            # Wrap only the dict elements of a list; other elements pass through.
            return [ChatResponse(el) if isinstance(el, dict) else el for el in found]
        # Wrap a nested dict so its keys are attribute-accessible as well.
        return ChatResponse(found) if isinstance(found, dict) else found

    def __setattr__(self, name, value):
        self[name] = value

    def __delattr__(self, name):
        del self[name]
def clear_prompt(self, prompt):
    """Strip the hint section and the choices section from ``prompt``.

    Removes the shortest span from ``Hint:`` through ``Question:``, then
    everything from ``"\\nChoices:\\n"`` to the end, trimming surrounding
    whitespace after each removal.
    """
    for pattern in (r"Hint:.*?Question:", r"\nChoices:\n.*"):
        # DOTALL lets "." cross line breaks, since both sections may span lines.
        prompt = re.sub(pattern, "", prompt, flags=re.DOTALL).strip()
    return prompt
def prompt_to_request_messages(self, inputs):
    """Translate message items into chat messages plus a multiple-choice flag.

    Text equal to ``self.system_prompt`` becomes a leading ``system`` message;
    text equal to ``self.mcq_prompt`` is a marker that only sets the flag;
    all other text and every image are appended to the single ``user``
    message. Items of any other type are ignored.

    Returns:
        ``(messages, is_mcq)``.
    """
    user_parts = []
    system_turns = []
    is_mcq = False

    for item in inputs:
        kind = item['type']
        if kind == 'image':
            encoded = self.encode_image(Image.open(item['value']))
            user_parts.append(
                {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}},
            )
            continue
        if kind != 'text':
            continue

        value = item['value']
        if value == self.system_prompt:
            # Each system prompt is placed in front of everything collected so far.
            system_turns.insert(0, {'role': 'system', 'content': [{"type": "text", "text": value}]})
        elif self.mcq_prompt == value:
            is_mcq = True
        else:
            user_parts.append({"type": "text", "text": value})

    return system_turns + [{'role': 'user', 'content': user_parts}], is_mcq
def post_process_func(self, response):
    """Extract the content of the last ``\\boxed{...}`` in ``response``.

    Args:
        response: Raw model output.

    Returns:
        The text between the last ``\\boxed{`` and its matching ``}``, with
        nested braces kept intact. If that box is never closed, the remainder
        of the string is returned. If there is no ``\\boxed{`` at all (or
        nothing follows it), ``response`` is returned unchanged.
    """
    marker = '\\boxed{'
    start = response.rfind(marker)
    if start < 0:
        # No box: leave the text alone instead of brace-scanning it, which
        # could truncate an ordinary answer at a stray "}".
        return response

    body = response[start + len(marker):]
    depth = 1  # the marker itself opened one brace
    for pos, ch in enumerate(body):
        if ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1
            if depth == 0:
                return body[:pos]

    # Unclosed box: fall back to whatever follows the marker, or to the
    # original text when nothing follows it.
    return body or response
inputs): + + messages = [ + {'role': 'user', 'content': []} + ] + for x in inputs: + if x['type'] == 'text': + messages[-1]['content'].append( + {"type": "text", "text": x['value']}, + ) + if x['type'] == 'image': + _url = self.encode_image(Image.open(x['value'])) + messages[-1]['content'].append( + {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{_url}"}}, + ) + else: + continue + + PROMPT = ( + "First thinks about the reasoning process in the mind and then provides the user with the answer. " + "Put your final answer within \\boxed{}. " + "The response of reasoning and answer are formatted in reasoning \\boxed{answer here} .\n" # noqa: E501 + ) + + if self.use_reasoning_prompt: + for content in messages[0]['content']: + if content['type'] == 'text': + content['text'] = PROMPT + content['text'] + break + + return messages + + def generate_inner(self, inputs, **kwargs) -> str: + messages = self.prompt_to_request_messages(inputs) + if self.verbose: + verbose_messages = copy.deepcopy(messages) + for mess in verbose_messages: + if mess['role'] == 'user': + for content in mess['content']: + if content['type'] == 'image_url': + content['image_url']['url'] = '' + print(f'\033[31m{verbose_messages}\033[0m') + + data = { + "model": self.model, + "messages": messages, + "max_tokens": self.max_tokens, + "temperature": 0, + "top_p": 0.8, + "stream": False, + "repetition_penalty": 1.0 + } + + headers = { + 'Authorization': f"Bearer {self.api_key}", + 'Content-Type': 'application/json' + } + + try: + chat_response = requests.post(self.api_url, json=data, headers=headers) + response = ChatResponse(json.loads(chat_response.content)) + result = response.choices[0].message.content + if self.post_process: + result = self.post_process_func(result) + if self.verbose: + print(f'\033[32m{result}\033[0m') + + return 0, result, 'Succeeded! 
def __init__(self,
             model: str = 'taiyi',
             retry: int = 5,
             key: str = None,
             verbose: bool = False,
             system_prompt: str = None,
             temperature: float = 0,
             timeout: int = 60,
             url: str = "https://taiyi.megvii.com/v1/chat/completions",
             max_tokens: int = 1024,
             **kwargs):
    """Store request settings and resolve the API key and endpoint.

    Args:
        model: Model name.
        retry, system_prompt, verbose: Forwarded to ``BaseAPI.__init__``.
        key: API key; when ``None`` the ``TAIYI_API_KEY`` environment
            variable is used.
        temperature: Sampling temperature.
        timeout: Request timeout value stored on the instance.
        url: Endpoint URL.
        max_tokens: Maximum number of tokens to request.
        **kwargs: Forwarded to ``BaseAPI.__init__``.

    Raises:
        AssertionError: If no API key can be resolved or ``url`` is ``None``.
    """
    self.model = model
    self.fail_msg = 'Failed to obtain answer via API. '
    self.max_tokens = max_tokens
    self.temperature = temperature

    if key is None:
        key = os.environ.get('TAIYI_API_KEY', None)
    assert key is not None, ('Please set the API Key ')
    self.key = key

    self.timeout = timeout
    super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
    assert url is not None, ('Please set the url ')
    self.url = url
    # Never write the credential itself to the log; log output is routinely
    # persisted and shared.
    logger.info(f'Using url: {self.url}; API Key: <redacted>')
None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += '\n请直接回答选项字母。' if cn_string( + prompt) else "\nAnswer with the option's letter from the given choices directly." + else: + prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' + + return prompt + + def build_yorn_prompt(self, line, dataset=None): + if listinstr(['HallusionBench'], dataset): + pre_prompt = 'Read the following question carefully, think and solve it step by step.\n\n' + else: + pre_prompt = '' + + prompt = pre_prompt + line['question'] + ' Please answer yes or no as the final answer.' + + return prompt + + def build_vqa_prompt(self, line, dataset=None): + if listinstr(['OCRBench'], dataset): + pre_prompt = 'Carefully identify the text in the image and answer the question.\n\n' + else: + pre_prompt = '' + + if listinstr(['MMVet'], dataset): + post_prompt = '\nAnswer this question in detail.' + else: + post_prompt = '' + + prompt = pre_prompt + line['question'] + post_prompt + + return prompt + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + tgt_path = self.dump_image(line, dataset) + + if DATASET_TYPE(dataset) == 'MCQ': + prompt = self.build_multi_choice_prompt(line, dataset) + elif DATASET_TYPE(dataset) == 'Y/N': + prompt = self.build_yorn_prompt(line, dataset) + elif DATASET_TYPE(dataset) == 'VQA': + prompt = self.build_vqa_prompt(line, dataset) + else: + raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}') + message = [] + message.extend([dict(type='image', value=s) for s in tgt_path]) + message.extend([dict(type='text', value=prompt)]) + + # interleave dataset + if dataset.startswith('MMMU_'): + from .. 
import MMMUDataset + message = MMMUDataset.split_MMMU(message) + message = self.image_first(message) + + return message + + def generate_inner(self, inputs, **kwargs) -> str: + + input_msgs = self.prepare_inputs(inputs) + temperature = kwargs.pop('temperature', self.temperature) + + headers = {'Authorization': f'Bearer {self.key}'} + payload = dict( + model=self.model, + messages=input_msgs, + n=1, + temperature=temperature, + **kwargs) + response = requests.post(self.url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + except Exception: + pass + return ret_code, answer, response + + +class TaiyiAPI(TaiyiWrapper): + + def generate(self, message, dataset=None): + return super(TaiyiAPI, self).generate(message) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/telemm.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/telemm.py new file mode 100644 index 00000000..a2b5a2ef --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/telemm.py @@ -0,0 +1,296 @@ +import json +import os +import re + +import numpy as np +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_MODALITY, DATASET_TYPE +from vlmeval.smp import encode_image_to_base64, get_logger, listinstr + +logger = get_logger(__name__) + +API_BASE = "https://openapi.teleagi.cn:443/aipaas/aiproxy-stream/chat/completions/8005400012" + +PREDEFINED_SYSTEM_PROMPT = """ +You are an AI assistant that rigorously follows this response protocol: + +1. First, conduct a detailed analysis of the question. Consider different \ +angles, potential solutions, and reason through the problem step-by-step. \ +Enclose this entire thinking process within {START} and {END} tags. + +2. 
After the thinking section, provide a clear, concise, and direct answer to \ +the user's question. Separate the answer from the think section with a newline. + +Ensure that the thinking process is thorough but remains focused on the \ +query. The final answer should be standalone and not reference the thinking \ +section. +""".strip() + + +class TeleMM2_PromptUtil: + def __init__(self): + self.cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' + + def dump_image(self, line, dataset): + return self.dump_image_func(line) + + def use_custom_prompt(self, dataset): + assert dataset is not None + assert DATASET_MODALITY(dataset) != 'VIDEO', 'not supported' + return True + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + from ..vlm.internvl.utils import (build_mcq_cot_prompt, build_multi_choice_prompt, + build_qa_cot_prompt, reorganize_prompt) + + tgt_path = self.dump_image(line, dataset) + if dataset is not None and DATASET_TYPE(dataset) == 'Y/N': + question = line['question'] + if listinstr(['HallusionBench'], dataset): + cot_prompt = ( + "Answer the preceding yes-or-no question. Think step by step logically, considering all " + "relevant information before answering. If you are uncertain or the problem is ambiguous, make a " + "reasoned guess based on the information provided. The last line of your response must be exactly " + "in this format: 'Answer: \\boxed{Yes}' or 'Answer: \\boxed{No}' (without quotes)." 
+ ) + prompt = question + '\n' + cot_prompt + else: + prompt = question + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if listinstr(['MMMU_DEV_VAL'], dataset): + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['OCRBench'], dataset): + OCR_prompt = ( + "\nAnswer the question using a single word or phrase. " + "The image is guaranteed to contain the correct answer. Please provide the most likely " + "answer — do not answer \"No.\". Do not answer 'r'. " + ) + prompt = question + OCR_prompt + elif listinstr(['MathVista'], dataset): + prompt = build_qa_cot_prompt(line, question, self.cot_prompt) + elif listinstr(['MMVet'], dataset): + prompt = question + "\nPlease first think and then answer the question. " + else: + prompt = question + '\nAnswer the question using a single word or phrase.' + else: + prompt = line['question'] + + message = [dict(type='text', value=prompt)] + image_num = len(tgt_path) + message.extend([dict(type='image', value=s) for s in tgt_path]) + + prompt = reorganize_prompt(message, image_num, dataset=dataset) + prompt.replace('', '') + message[0] = dict(type='text', value=prompt) + return message + + +class TeleMM2_Wrapper(BaseAPI): + is_api: bool = True + custom_prompt = 'telemm2' + prompt_map = { + 'telemm2': TeleMM2_PromptUtil() + } + + def __init__(self, + model: str = None, + retry: int = 3, + verbose: bool = True, + timeout: int = 600, + authorization: str = None, + app_id: str = None, + system_prompt: str = None, + max_tokens: int = 20480, + **kwargs): + self.fail_msg = 'Failed to obtain answer via API. 
' + self.max_tokens = max_tokens + self.timeout = timeout + + self.api_base = API_BASE + self.authorization = os.environ.get('AUTHOR', authorization) + self.app_id = os.environ.get('APP_ID', app_id) + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + self.model = model + + logger.info(f'evaluate model: {self.model}') + + self.max_tokens = 20480 + self.temperature = 0.0 + self.custom_prompt = 'telemm2' + + logger.info(f'using custom prompt {self.custom_prompt}') + logger.info(f'Init temperature: {self.temperature}') + self.system_prompt = None + + def set_dump_image(self, dump_image_func): + if self.custom_prompt in self.prompt_map: + self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func + self.dump_image_func = dump_image_func + + def use_custom_prompt(self, dataset): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].use_custom_prompt(dataset) + return False + + def build_prompt(self, line, dataset=None): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].build_prompt(line, dataset) + raise NotImplementedError + + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + print('text msg:', msg['value']) + elif msg['type'] == 'image': + from PIL import Image + + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img) + extra_args = msg.copy() + extra_args.pop('type') + extra_args.pop('value') + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args) + content_list.append(dict(type='image_url', image_url=img_struct)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = '\n'.join([x['value'] for x in inputs]) + content_list = [dict(type='text', text=text)] + 
return content_list + + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='system', content=self.system_prompt)) + + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs))) + return input_msgs + + def extract_boxed(self, text): + if "\\boxed{" not in text: + return None + + start = text.rfind("\\boxed{") + i = start + len("\\boxed{") + brace_count = 1 + content_chars = [] + + while i < len(text) and brace_count > 0: + ch = text[i] + if ch == "{": + brace_count += 1 + elif ch == "}": + brace_count -= 1 + if brace_count > 0: + content_chars.append(ch) + i += 1 + + if content_chars: + inner = "".join(content_chars).strip() + if inner: + return inner + return None + + def extract_answer(self, text, dataset_name=None): + if not isinstance(text, str): + return text + + boxed = self.extract_boxed(text) + if boxed is not None: + return boxed + + if listinstr(['MMVet'], dataset_name): + redacted_pattern = r'.*?' 
+ if re.search(redacted_pattern, text, re.DOTALL): + result = re.sub(redacted_pattern, '', text, flags=re.DOTALL) + return result.strip() + return text + + elif listinstr(['MMMU_DEV_VAL'], dataset_name): + last_end = text.rfind("") + if last_end != -1: + result = text[last_end + len(""):] + return result.strip() + return text + else: + last_end = text.rfind("") + if last_end != -1: + result = text[last_end + len(""):] + return result.strip() + return text + + def generate_inner(self, inputs, **kwargs) -> str: + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + dataset = kwargs.pop('dataset', None) + if listinstr(['MathVista'], dataset): + self.system_prompt = PREDEFINED_SYSTEM_PROMPT.format(START="", END="") + elif listinstr(['MMMU_DEV_VAL'], dataset): + self.system_prompt = PREDEFINED_SYSTEM_PROMPT.format(START="", END="") + else: + self.system_prompt = None + + input_msgs = self.prepare_inputs(inputs) + + if listinstr(['MMMU_DEV_VAL'], dataset): + do_sample = True + temperature = 0.6 + else: + do_sample = False + temperature = 0.0 + + logger.info(f'Generate temperature: {temperature}') + + headers = { + 'Content-Type': 'application/json', + 'Authorization': self.authorization, + 'X-APP-ID': self.app_id} + + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + do_sample=do_sample, + **kwargs) + response = requests.post( + self.api_base, + headers=headers, data=json.dumps(payload), timeout=self.timeout) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + answer = self.extract_answer(answer, dataset) + except Exception: + pass + return ret_code, answer, response + + +class TeleMM2_API(TeleMM2_Wrapper): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def generate(self, message, dataset=None): 
+ return super(TeleMM2_API, self).generate(message, dataset=dataset) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/telemm_thinking.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/telemm_thinking.py new file mode 100644 index 00000000..fbf1ec42 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/telemm_thinking.py @@ -0,0 +1,297 @@ +import json +import os +import re + +import numpy as np +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_MODALITY, DATASET_TYPE +from vlmeval.smp import encode_image_to_base64, get_logger, listinstr + +logger = get_logger(__name__) + +API_BASE = "https://openapi.teleagi.cn:443/aipaas/aiproxy-stream/chat/completions/8005400012" + +PREDEFINED_SYSTEM_PROMPT = """ +You are an AI assistant that rigorously follows this response protocol: + +1. First, conduct a detailed analysis of the question. Consider different \ +angles, potential solutions, and reason through the problem step-by-step. \ +Enclose this entire thinking process within {START} and {END} tags. + +2. After the thinking section, provide a clear, concise, and direct answer to \ +the user's question. Separate the answer from the think section with a newline. + +Ensure that the thinking process is thorough but remains focused on the \ +query. The final answer should be standalone and not reference the thinking \ +section. +""".strip() + + +class TeleMM2Thinking_PromptUtil: + def __init__(self): + self.cot_prompt = 'Please answer the question and put the final answer within \\boxed{}.' 
+ + def dump_image(self, line, dataset): + return self.dump_image_func(line) + + def use_custom_prompt(self, dataset): + assert dataset is not None + assert DATASET_MODALITY(dataset) != 'VIDEO', 'not supported' + return True + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + from ..vlm.internvl.utils import (build_mcq_cot_prompt, build_multi_choice_prompt, + build_qa_cot_prompt, reorganize_prompt) + + tgt_path = self.dump_image(line, dataset) + if dataset is not None and DATASET_TYPE(dataset) == 'Y/N': + question = line['question'] + if listinstr(['HallusionBench'], dataset): + cot_prompt = ( + "Answer the preceding yes-or-no question. Think step by step logically, considering all " + "relevant information before answering. If you are uncertain or the problem is ambiguous, make a " + "reasoned guess based on the information provided. The last line of your response must be exactly " + "in this format: 'Answer: \\boxed{Yes}' or 'Answer: \\boxed{No}' (without quotes)." + ) + prompt = question + '\n' + cot_prompt + else: + prompt = question + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = build_multi_choice_prompt(line, dataset) + if listinstr(['MMMU_DEV_VAL'], dataset): + prompt = build_mcq_cot_prompt(line, prompt, self.cot_prompt) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + question = line['question'] + if listinstr(['OCRBench'], dataset): + OCR_prompt = ( + "\nAnswer the question using a single word or phrase. " + "The image is guaranteed to contain the correct answer. Please provide the most likely " + "answer — do not answer \"No.\". Do not answer 'r'. " + ) + prompt = question + OCR_prompt + elif listinstr(['MathVista'], dataset): + prompt = build_qa_cot_prompt(line, question, self.cot_prompt) + elif listinstr(['MMVet'], dataset): + prompt = question + "\nPlease first think and then answer the question. 
" + else: + prompt = question + '\nAnswer the question using a single word or phrase.' + else: + prompt = line['question'] + + message = [dict(type='text', value=prompt)] + image_num = len(tgt_path) + message.extend([dict(type='image', value=s) for s in tgt_path]) + + prompt = reorganize_prompt(message, image_num, dataset=dataset) + prompt.replace('', '') + message[0] = dict(type='text', value=prompt) + return message + + +class TeleMM2Thinking_Wrapper(BaseAPI): + is_api: bool = True + custom_prompt = 'telemm2thinking' + prompt_map = { + 'telemm2thinking': TeleMM2Thinking_PromptUtil() + } + + def __init__(self, + model: str = None, + retry: int = 3, + verbose: bool = True, + timeout: int = 600, + authorization: str = None, + app_id: str = None, + system_prompt: str = None, + max_tokens: int = 20480, + **kwargs): + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.timeout = timeout + + self.api_base = API_BASE + self.authorization = os.environ.get('AUTHOR', authorization) + self.app_id = os.environ.get('APP_ID', app_id) + super().__init__(retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + self.model = model + + logger.info(f'evaluate model: {self.model}') + + self.max_tokens = 20480 + self.temperature = 0.0 + self.custom_prompt = 'telemm2thinking' + + logger.info(f'using custom prompt {self.custom_prompt}') + logger.info(f'Init temperature: {self.temperature}') + self.system_prompt = None + + def set_dump_image(self, dump_image_func): + if self.custom_prompt in self.prompt_map: + self.prompt_map[self.custom_prompt].dump_image_func = dump_image_func + self.dump_image_func = dump_image_func + + def use_custom_prompt(self, dataset): + if self.custom_prompt in self.prompt_map: + return self.prompt_map[self.custom_prompt].use_custom_prompt(dataset) + return False + + def build_prompt(self, line, dataset=None): + if self.custom_prompt in self.prompt_map: + return 
self.prompt_map[self.custom_prompt].build_prompt(line, dataset) + raise NotImplementedError + + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + print('text msg:', msg['value']) + elif msg['type'] == 'image': + from PIL import Image + + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img) + extra_args = msg.copy() + extra_args.pop('type') + extra_args.pop('value') + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', **extra_args) + content_list.append(dict(type='image_url', image_url=img_struct)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = '\n'.join([x['value'] for x in inputs]) + content_list = [dict(type='text', text=text)] + return content_list + + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='system', content=self.system_prompt)) + + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs))) + return input_msgs + + def extract_boxed(self, text): + if "\\boxed{" not in text: + return None + + start = text.rfind("\\boxed{") + i = start + len("\\boxed{") + brace_count = 1 + content_chars = [] + + while i < len(text) and brace_count > 0: + ch = text[i] + if ch == "{": + brace_count += 1 + elif ch == "}": + brace_count -= 1 + if brace_count > 0: + content_chars.append(ch) + i += 1 + + if content_chars: + inner = 
"".join(content_chars).strip() + if inner: + return inner + return None + + def extract_answer(self, text, dataset_name=None): + if not isinstance(text, str): + return text + + boxed = self.extract_boxed(text) + if boxed is not None: + return boxed + + if listinstr(['MMVet'], dataset_name): + redacted_pattern = r'.*?' + if re.search(redacted_pattern, text, re.DOTALL): + result = re.sub(redacted_pattern, '', text, flags=re.DOTALL) + return result.strip() + return text + + elif listinstr(['MMMU_DEV_VAL'], dataset_name): + last_end = text.rfind("") + if last_end != -1: + result = text[last_end + len(""):] + return result.strip() + return text + else: + last_end = text.rfind("") + if last_end != -1: + result = text[last_end + len(""):] + return result.strip() + return text + + def generate_inner(self, inputs, **kwargs) -> str: + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + dataset = kwargs.pop('dataset', None) + if listinstr(['MathVista'], dataset): + self.system_prompt = PREDEFINED_SYSTEM_PROMPT.format(START="", END="") + elif listinstr(['MMMU_DEV_VAL'], dataset): + self.system_prompt = PREDEFINED_SYSTEM_PROMPT.format(START="", END="") + else: + self.system_prompt = None + + input_msgs = self.prepare_inputs(inputs) + + if listinstr(['MMMU_DEV_VAL'], dataset): + do_sample = True + temperature = 0.6 + else: + do_sample = False + temperature = 0.0 + + logger.info(f'Generate temperature: {temperature}') + + headers = { + 'Content-Type': 'application/json', + 'Authorization': self.authorization, + "Model": "TeleMM2Thinking", + 'X-APP-ID': self.app_id} + + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + do_sample=do_sample, + **kwargs) + response = requests.post( + self.api_base, + headers=headers, data=json.dumps(payload), timeout=self.timeout) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = 
json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + answer = self.extract_answer(answer, dataset) + except Exception: + pass + return ret_code, answer, response + + +class TeleMM2Thinking_API(TeleMM2Thinking_Wrapper): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def generate(self, message, dataset=None): + return super(TeleMM2Thinking_API, self).generate(message, dataset=dataset) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/together.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/together.py new file mode 100644 index 00000000..269d1e91 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/together.py @@ -0,0 +1,141 @@ +""" +Together AI API support for vision models. +OpenAI-compatible chat completions with image_url (base64 or URL). +Set TOGETHER_API_KEY or pass key=... + +Requires: pip install together (optional; or use requests with api_base) +""" +import json +import os + +import numpy as np +import requests + +from ..smp import encode_image_to_base64, get_logger +from .base import BaseAPI + +logger = get_logger(__name__) + +TOGETHER_API_BASE = "https://api.together.xyz/v1/chat/completions" + + +class TogetherAPI(BaseAPI): + """VLM API using Together AI (OpenAI-compatible; supports vision).""" + + is_api: bool = True + + def __init__( + self, + model: str = "meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + key: str = None, + api_base: str = None, + retry: int = 10, + wait: int = 1, + system_prompt: str = None, + verbose: bool = True, + temperature: float = 0, + max_tokens: int = 2048, + timeout: int = 300, + img_size: int = -1, + **kwargs, + ): + self.model = model + self.key = key or os.environ.get("TOGETHER_API_KEY") + self.api_base = api_base or os.environ.get("TOGETHER_API_BASE", TOGETHER_API_BASE) + self.temperature = temperature + self.max_tokens = max_tokens + self.timeout = timeout + self.img_size = img_size + + if not self.key: + raise 
ValueError( + "Together API key is required. Set TOGETHER_API_KEY or pass key=..." + ) + + super().__init__( + retry=retry, + wait=wait, + system_prompt=system_prompt, + verbose=verbose, + **kwargs, + ) + + logger.info(f"TogetherAPI: model={self.model}, api_base={self.api_base}") + + def _prepare_content(self, inputs): + """Build OpenAI-style content list (text + image_url with base64).""" + assert all(isinstance(x, dict) for x in inputs) + has_images = np.sum([x["type"] == "image" for x in inputs]) + if has_images: + content_list = [] + for item in inputs: + if item["type"] == "text": + content_list.append({"type": "text", "text": item["value"]}) + elif item["type"] == "image": + from PIL import Image + + img = Image.open(item["value"]) + b64 = encode_image_to_base64(img, target_size=self.img_size) + content_list.append({ + "type": "image_url", + "image_url": {"url": f"data:image/jpeg;base64,{b64}"}, + }) + return content_list + text = "\n".join([x["value"] for x in inputs if x["type"] == "text"]) + return [{"type": "text", "text": text or ""}] + + def _prepare_messages(self, inputs): + if self.system_prompt: + out = [{"role": "system", "content": self.system_prompt}] + else: + out = [] + if inputs and "role" in inputs[0]: + for item in inputs: + out.append({ + "role": item["role"], + "content": self._prepare_content(item["content"]), + }) + else: + out.append({"role": "user", "content": self._prepare_content(inputs)}) + return out + + def generate_inner(self, inputs, **kwargs): + temperature = kwargs.pop("temperature", self.temperature) + max_tokens = kwargs.pop("max_tokens", self.max_tokens) + + messages = self._prepare_messages(inputs) + payload = { + "model": self.model, + "messages": messages, + "temperature": temperature, + "max_tokens": max_tokens, + } + + try: + response = requests.post( + self.api_base, + headers={ + "Authorization": f"Bearer {self.key}", + "Content-Type": "application/json", + }, + data=json.dumps(payload), + timeout=self.timeout * 1.1, 
+ ) + except Exception as err: + if self.verbose: + logger.error(f"{type(err).__name__}: {err}") + return -1, self.fail_msg, str(err) + + ret_code = response.status_code + ret_code = 0 if (200 <= ret_code < 300) else ret_code + answer = self.fail_msg + + try: + data = response.json() + answer = data["choices"][0]["message"]["content"].strip() + except Exception as err: + if self.verbose: + logger.error(f"{type(err).__name__}: {err}") + logger.error(response.text) + + return ret_code, answer, response diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/video_chat_online_v2.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/video_chat_online_v2.py new file mode 100644 index 00000000..5b2da4b8 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/api/video_chat_online_v2.py @@ -0,0 +1,285 @@ +# flake8: noqa +import base64 +import json +import os +import os.path as osp +import string + +import pandas as pd +import requests + +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import DATASET_TYPE, img_root_map +from vlmeval.smp import (LMUDataRoot, cn_string, concat_images_vlmeval, + decode_base64_to_image_file, get_logger, listinstr, read_ok, toliststr) + +logger = get_logger(__name__) + +class VideoChatOnlineV2Wrapper(BaseAPI): + is_api: bool = True + INTERLEAVE = False + + def __init__(self, + model: str = 'VideoChatOnlineV2', + retry: int = 5, + wait: int = 5, + url: str = '', + key: str = '', + verbose: bool = True, + system_prompt: str = None, + temperature: float = 0.7, + max_tokens: int = 2048, + proxy: str = None, + **kwargs): + self.model = model + + self.temperature = temperature + self.max_tokens = max_tokens + self.url = url + self.key = key + + + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def dump_image(self, line, dataset): + """Dump the image(s) of the input line to the corresponding dataset folder. + + Args: + line (line of pd.DataFrame): The raw input line. 
+ dataset (str): The name of the dataset. + + Returns: + str | list[str]: The paths of the dumped images. + """ + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + + img_root = os.path.join(ROOT, 'images', img_root_map(dataset) if dataset in img_root_map(dataset) else dataset) + os.makedirs(img_root, exist_ok=True) + if 'image' in line: + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path = [tgt_path] + else: + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + + return tgt_path + + def use_custom_prompt(self, dataset): + assert dataset is not None + if listinstr(['MMMU_DEV_VAL','MMMU_TEST'], dataset): + return False + else: + return True + + def build_multi_choice_prompt(self, line, dataset=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += '\n请直接回答选项字母。' if cn_string( + prompt) else "\nAnswer with the option's letter from the given choices directly." + else: + prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' 
+ + return prompt + + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + + tgt_path = self.dump_image(line, dataset) + + if dataset is not None and listinstr(['MME'], dataset): + question = line['question'] + prompt = question + ' Answer the question using a single word or phrase.' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question = line['question'] + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + prompt = self.build_multi_choice_prompt(line, dataset) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + if listinstr(['MathVista', 'MathVision'], dataset): + prompt = line['question'] + elif listinstr(['LLaVABench'], dataset): + question = line['question'] + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['MMVet'], dataset): + prompt = line['question'] + else: + question = line['question'] + prompt = question + '\nAnswer the question using a single word or phrase.' + else: + prompt = line['question'] + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message + + + def message_to_promptimg(self, message, dataset=None): + assert not self.INTERLEAVE + model_name = self.__class__.__name__ + import warnings + warnings.warn( + f'Model {model_name} does not support interleaved input. ' + 'Will use the first image and aggregated texts as prompt. 
def message_to_promptimg(self, message, dataset=None):
    """Collapse an interleaved message into one text prompt and one image.

    Args:
        message (list[dict]): Items carrying a ``type`` key (``'text'`` or
            ``'image'``) and a ``value`` key.
        dataset (str | None): Dataset name; for ``'BLINK'`` all images are
            passed to ``concat_images_vlmeval`` instead of keeping the first.

    Returns:
        tuple: ``(prompt, image)``. ``prompt`` is every text value joined by
        newlines; ``image`` is ``None`` when the message holds no image.
    """
    assert not self.INTERLEAVE
    model_name = self.__class__.__name__
    import warnings
    warnings.warn(
        f'Model {model_name} does not support interleaved input. '
        'Will use the first image and aggregated texts as prompt. ')

    # Split the message once instead of re-filtering it in every branch.
    texts = [x['value'] for x in message if x['type'] == 'text']
    images = [x['value'] for x in message if x['type'] == 'image']

    prompt = '\n'.join(texts)
    if not images:
        image = None
    elif dataset == 'BLINK':
        image = concat_images_vlmeval(images, target_size=512)
    else:
        image = images[0]
    return prompt, image


def get_send_data(self, prompt, image_path, temperature, max_tokens, stream=False):
    """Build the request payload for a prompt accompanied by one image file.

    Args:
        prompt (str): User message content.
        image_path (str): Path of the image; its bytes are sent base64-encoded.
        temperature: Value forwarded as ``temperature``.
        max_tokens: Value forwarded as ``max_tokens``.
        stream (bool): Value forwarded as ``stream``.

    Returns:
        dict: JSON-serialisable request body.
    """
    with open(image_path, 'rb') as f:
        image = base64.b64encode(f.read()).decode('utf-8')
    return {
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "image_base64": image,
        "max_tokens": max_tokens,
        "temperature": temperature,
        "do_sample": False,
        "stream": stream
    }


def get_send_data_no_image(self, prompt, temperature, max_tokens, stream=False):
    """Build the request payload for a text-only prompt.

    Same fields as ``get_send_data`` minus ``image_base64``.

    Returns:
        dict: JSON-serialisable request body.
    """
    return {
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "max_tokens": max_tokens,
        "temperature": temperature,
        "do_sample": False,
        "stream": stream
    }


def _collect_stream_text(self, lines):
    """Concatenate the ``delta.content`` fragments of an SSE response.

    Args:
        lines: Iterable of raw ``bytes`` lines (e.g. ``Response.iter_lines()``).

    Returns:
        str: All string fragments joined in arrival order. Reading stops at
        the ``[DONE]`` sentinel.
    """
    pieces = []
    for raw in lines:
        if not raw:
            continue  # blank separator / keep-alive line
        decoded = raw.decode('utf-8')
        if not decoded.startswith('data:'):
            continue  # only ``data`` fields carry chunks
        payload = decoded[len('data:'):].strip()
        if payload == '[DONE]':
            break
        try:
            chunk = json.loads(payload)
        except json.JSONDecodeError:
            continue  # best effort: skip malformed chunks
        if not isinstance(chunk, dict):
            continue
        for choice in chunk.get('choices') or []:
            if not isinstance(choice, dict):
                continue
            delta = choice.get('delta') or {}
            piece = delta.get('content')
            # A chunk may carry ``content: null`` (e.g. role-only or final
            # chunks); concatenating it used to abort the whole response.
            if isinstance(piece, str):
                pieces.append(piece)
    return ''.join(pieces)


def generate_inner(self, inputs, **kwargs) -> tuple:
    """Send one request to the service and return its textual answer.

    Args:
        inputs (str | list[dict]): A plain text prompt, or a message list in
            the format accepted by ``message_to_promptimg``.
        **kwargs: ``dataset`` (optional) is forwarded to
            ``message_to_promptimg``.

    Returns:
        tuple: ``(ret_code, answer, log)``. ``ret_code`` is ``0`` on success
        (``log`` is ``'Succeeded! '``) and ``-1`` on failure, in which case
        ``answer`` holds the error description and ``log`` is empty.

    Raises:
        Whatever ``requests.post`` raises while connecting; as before, these
        errors are not converted into a ``-1`` result.
    """
    assert isinstance(inputs, (str, list))
    if isinstance(inputs, str):
        # A bare string is a text-only prompt. It must be wrapped in the
        # message schema; wrapping it as ``[inputs]`` made the ``x['type']``
        # lookups in message_to_promptimg fail with TypeError.
        inputs = [dict(type='text', value=inputs)]
    dataset = kwargs.get('dataset', None)
    prompt, image_path = self.message_to_promptimg(message=inputs, dataset=dataset)

    common = dict(
        prompt=prompt,
        temperature=self.temperature,
        max_tokens=self.max_tokens,
        stream=True)
    if image_path:
        send_data = self.get_send_data(image_path=image_path, **common)
    else:
        send_data = self.get_send_data_no_image(**common)

    header_dict = {'Content-Type': 'application/json', 'Authorization': self.key}

    # The context manager releases the streamed connection on every exit
    # path; previously the response was never closed.
    with requests.post(self.url, headers=header_dict, data=json.dumps(send_data),
                       timeout=3000, stream=True) as r:
        try:
            if r.ok:
                if send_data.get('stream', False):
                    # Streaming (SSE) response.
                    return 0, self._collect_stream_text(r.iter_lines()), 'Succeeded! '
                # Non-streaming response.
                try:
                    return 0, r.json()['choices'][0]['message']['content'], 'Succeeded! '
                except (ValueError, KeyError, IndexError, TypeError):
                    pass  # unexpected body: reported by the error path below

            # HTTP error status or unparsable body. In stream mode an error
            # status used to be returned as a success with empty text.
            error_msg = f'Error! code {r.status_code} content: {r.content}'
            if self.verbose:
                logger.error(error_msg)
                logger.error(r.content.decode('utf-8', errors='replace'))
                logger.error(f'The input messages are {inputs}.')
            return -1, error_msg, ''
        except Exception as e:
            return -1, f'Error: {str(e)}', ''
+ "Cosmos-Reason1-7B-VideoPhy2": partial( + vlm.CosmosReasonHF, + model_path="shunznv/Cosmos-Reason1-7B-VideoPhy2", + ), + "Video-LLaVA-7B": partial(vlm.VideoLLaVA, model_path="LanguageBind/Video-LLaVA-7B"), + "Video-LLaVA-7B-HF": partial( + vlm.VideoLLaVA_HF, model_path="LanguageBind/Video-LLaVA-7B-hf" + ), + "VideoChat2-HD": partial( + vlm.VideoChat2_HD, + model_path="OpenGVLab/VideoChat2_HD_stage4_Mistral_7B", + root=VideoChat2_ROOT, + config_file="./vlmeval/vlm/video_llm/configs/videochat2_hd.json", + ), + "Chat-UniVi-7B": partial(vlm.Chatunivi, model_path="Chat-UniVi/Chat-UniVi"), + "Chat-UniVi-7B-v1.5": partial( + vlm.Chatunivi, model_path="Chat-UniVi/Chat-UniVi-7B-v1.5" + ), + "LLaMA-VID-7B": partial( + vlm.LLaMAVID, model_path="YanweiLi/llama-vid-7b-full-224-video-fps-1" + ), + "Video-ChatGPT": partial( + vlm.VideoChatGPT, model_path="MBZUAI/Video-ChatGPT-7B", dir_root=VideoChatGPT_ROOT + ), + "PLLaVA-7B": partial(vlm.PLLaVA, model_path="ermu2001/pllava-7b", dir_root=PLLaVA_ROOT), + "PLLaVA-13B": partial( + vlm.PLLaVA, model_path="ermu2001/pllava-13b", dir_root=PLLaVA_ROOT + ), + "PLLaVA-34B": partial( + vlm.PLLaVA, model_path="ermu2001/pllava-34b", dir_root=PLLaVA_ROOT + ), +} + +ungrouped = { + "AKI": partial(vlm.AKI, name="AKI", ckpt_pth="Sony/AKI-4B-phi-3.5-mini"), + "TransCore_M": partial(vlm.TransCoreM, root=TransCore_ROOT), + "PandaGPT_13B": partial(vlm.PandaGPT, name="PandaGPT_13B", root=PandaGPT_ROOT), + "flamingov2": partial( + vlm.OpenFlamingo, + name="v2", + mpt_pth="anas-awadalla/mpt-7b", + ckpt_pth="openflamingo/OpenFlamingo-9B-vitl-mpt7b", + ), + "VisualGLM_6b": partial(vlm.VisualGLM, model_path="THUDM/visualglm-6b"), + "mPLUG-Owl2": partial(vlm.mPLUG_Owl2, model_path="MAGAer13/mplug-owl2-llama2-7b"), + "mPLUG-Owl3": partial(vlm.mPLUG_Owl3, model_path="mPLUG/mPLUG-Owl3-7B-240728"), + "OmniLMM_12B": partial( + vlm.OmniLMM12B, model_path="openbmb/OmniLMM-12B", root=OmniLMM_ROOT + ), + "MGM_7B": partial( + vlm.Mini_Gemini, 
model_path="YanweiLi/MGM-7B-HD", root=Mini_Gemini_ROOT + ), + "Bunny-llama3-8B": partial(vlm.BunnyLLama3, model_path="BAAI/Bunny-v1_1-Llama-3-8B-V"), + "VXVERSE": partial(vlm.VXVERSE, model_name="XVERSE-V-13B", root=VXVERSE_ROOT), + "360VL-70B": partial(vlm.QH_360VL, model_path="qihoo360/360VL-70B"), + "Llama-3-MixSenseV1_1": partial( + vlm.LLama3Mixsense, model_path="Zero-Vision/Llama-3-MixSenseV1_1" + ), + "Parrot": partial(vlm.Parrot, model_path="AIDC-AI/Parrot-7B"), + "OmChat": partial(vlm.OmChat, model_path="omlab/omchat-v2.0-13B-single-beta_hf"), + "RBDash_72b": partial( + vlm.RBDash, model_path="RBDash-Team/RBDash-v1.5", root=RBDash_ROOT + ), + "Pixtral-12B": partial(vlm.Pixtral, model_path="mistralai/Pixtral-12B-2409"), + "Ministral-3-14B-Instruct-2512_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='Ministral-3-14B-Instruct-2512', + temperature=0.15, + max_new_tokens=32768, + retry=6, + timeout=1800, + ), + "Falcon2-VLM-11B": partial(vlm.Falcon2VLM, model_path="tiiuae/falcon-11B-vlm"), + "KVL": partial(vlm.InternVLChat, model_path="amoeba04/KVL", version="V2.0"), +} + +o1_key = os.environ.get('O1_API_KEY', None) +o1_base = os.environ.get('O1_API_BASE', None) +o1_apis = { + 'o1': partial( + api.GPT4V, + model="o1-2024-12-17", + key=o1_key, + api_base=o1_base, + temperature=0, + img_detail='high', + retry=3, + timeout=1800, + max_tokens=16384, + verbose=False, + + ), + 'o3': partial( + api.GPT4V, + model="o3-2025-04-16", + key=o1_key, + api_base=o1_base, + temperature=0, + img_detail='high', + retry=3, + timeout=1800, + max_tokens=16384, + verbose=False, + ), + 'o4-mini': partial( + api.GPT4V, + model="o4-mini-2025-04-16", + key=o1_key, + api_base=o1_base, + temperature=0, + img_detail='high', + retry=3, + timeout=1800, + max_tokens=16384, + verbose=False, + ), +} + +api_models = { + # Lepton API models + "qwen3_235b_a22b": partial( + api.CosmosReason2, + model="Qwen3-VL-235B-A22B-Instruct", + 
api_base="https://b5k2m9x7-qwen3-vl-235b-a22b-instruct.xenon.lepton.run/v1/chat/completions", + temperature=0, + retry=10, + timeout=300, # Increased timeout for video processing (5 minutes) + verbose=False, + ), + "qwen3_30b_a3b": partial( + api.CosmosReason2, + model="Qwen3-VL-30B-A3B-Instruct", + api_base="https://b5k2m9x7-qwen3-vl-30b-a3b-instruct.xenon.lepton.run/v1/chat/completions", + temperature=0, + retry=10, + timeout=300, # Increased timeout for video processing (5 minutes) + ), + "qwen3_8b": partial( + api.CosmosReason2, + model="Qwen3-VL-8B-Instruct", + api_base="https://b5k2m9x7-qwen3-vl-8b.xenon.lepton.run/v1/chat/completions", + temperature=0, + retry=10, + ), + "cosmos_reason1_7b": partial( + api.CosmosReason1, + model="Cosmos-Reason1-7B", + api_base="https://b5k2m9x7-cosmos-reason1-7b-1107.xenon.lepton.run/v1/chat/completions", + temperature=0, + retry=10, + ), + "GeminiPro3-0": partial( + api.Gemini, model="gemini-3-pro-preview", temperature=0, retry=10, max_tokens=16384 + ), + "LVS_service": partial( + api.LVS_service, + model="Qwen/Qwen3-VL-8B-Instruct", + temperature=0, + retry=5, + verbose=False, + max_tokens=16384, + api_base="http://10.111.53.95:8009", + fail_msg={"events": [], "total_events": 0, "video_summary": ""}, + chunk_duration=10, + num_frames_per_chunk=15, + ), + # GPT + "GPT4V": partial( + api.GPT4V, + model="gpt-4-1106-vision-preview", + temperature=0, + img_size=512, + img_detail="low", + retry=10, + verbose=False, + ), + "GPT4V_HIGH": partial( + api.GPT4V, + model="gpt-4-1106-vision-preview", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "GPT4V_20240409": partial( + api.GPT4V, + model="gpt-4-turbo-2024-04-09", + temperature=0, + img_size=512, + img_detail="low", + retry=10, + verbose=False, + ), + "GPT4V_20240409_HIGH": partial( + api.GPT4V, + model="gpt-4-turbo-2024-04-09", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "GPT4o": partial( + 
api.GPT4V, + model="gpt-4o-2024-05-13", + temperature=0, + img_size=512, + img_detail="low", + retry=10, + verbose=False, + ), + "GPT4o_HIGH": partial( + api.GPT4V, + model="gpt-4o-2024-05-13", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "GPT4o_20240806": partial( + api.GPT4V, + model="gpt-4o-2024-08-06", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "GPT4o_20241120": partial( + api.GPT4V, + model="gpt-4o-2024-11-20", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "ChatGPT4o": partial( + api.GPT4V, + model="chatgpt-4o-latest", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "GPT4o_MINI": partial( + api.GPT4V, + model="gpt-4o-mini-2024-07-18", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "GPT4.5": partial( + api.GPT4V, + model='gpt-4.5-preview-2025-02-27', + temperature=0, + timeout=600, + img_size=-1, + img_detail='high', + retry=10, + verbose=False, + ), + "gpt-4.1-2025-04-14": partial( + api.GPT4V, + model="gpt-4.1-2025-04-14", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "gpt-4.1-mini-2025-04-14": partial( + api.GPT4V, + model="gpt-4.1-mini-2025-04-14", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "gpt-4.1-nano-2025-04-14": partial( + api.GPT4V, + model="gpt-4.1-nano-2025-04-14", + temperature=0, + img_size=-1, + img_detail="high", + retry=10, + verbose=False, + ), + "gpt-5-2025-08-07": partial( + api.GPT4V, + model="gpt-5-2025-08-07", + img_detail="high", + retry=3, + verbose=False, + max_tokens=2**14, + timeout=3000, + ), + "gpt-5-mini-2025-08-07": partial( + api.GPT4V, + model="gpt-5-mini-2025-08-07", + img_detail="high", + retry=3, + verbose=False, + max_tokens=2**14, + timeout=300, + ), + "gpt-5-nano-2025-08-07": partial( + api.GPT4V, + 
model="gpt-5-nano-2025-08-07", + img_detail="high", + retry=3, + verbose=False, + max_tokens=2**14, + timeout=300, + ), + "gpt-5.1-2025-11-13": partial( + api.GPT4V, + model="gpt-5.1-2025-11-13", + img_detail="high", + retry=3, + verbose=False, + max_tokens=2**14, + timeout=300, + ), + # Gemini + "GeminiPro1-0": partial( + api.Gemini, model="gemini-1.0-pro", temperature=0, retry=10 + ), # now GeminiPro1-0 is only supported by vertex backend + "GeminiPro1-5": partial( + api.Gemini, model="gemini-1.5-pro", temperature=0, retry=10 + ), + "GeminiFlash1-5": partial( + api.Gemini, model="gemini-1.5-flash", temperature=0, retry=10 + ), + "GeminiPro1-5-002": partial( + api.GPT4V, model="gemini-1.5-pro-002", temperature=0, retry=10 + ), # Internal Use Only + "GeminiFlash1-5-002": partial( + api.GPT4V, model="gemini-1.5-flash-002", temperature=0, retry=10 + ), # Internal Use Only + "GeminiFlash2-0": partial( + api.Gemini, model="gemini-2.0-flash", temperature=0, retry=10 + ), + "GeminiFlashLite2-0": partial( + api.Gemini, model="gemini-2.0-flash-lite", temperature=0, retry=10 + ), + "GeminiFlash2-5": partial( + api.Gemini, model="gemini-2.5-flash", temperature=0, retry=10 + ), + "GeminiPro2-5": partial( + api.GPT4V, + model="gemini-2.5-pro", + temperature=0, + retry=10, + timeout=6000, + max_tokens=65536, + ), + "Gemini-3.1-Pro-Preview": partial( + api.GPT4V, + model="gemini-3.1-pro-preview-thinking", + retry=10, + timeout=3600, + max_tokens=65536, + img_detail='high', + ), + + # GCP Vertex AI – Claude (same GCPVertexAPI; model name selects Claude backend) + "GCP_Claude3-5Sonnet": partial( + api.GCPVertexAPI, + model="claude-3-5-sonnet-20241022", + temperature=0, + retry=10, + ), + "GCP_Claude3-5Haiku": partial( + api.GCPVertexAPI, + model="claude-3-5-haiku@20241022", + temperature=0, + retry=10, + ), + "GCP_Claude3-7Sonnet": partial( + api.GCPVertexAPI, + model="claude-3-7-sonnet@20250219", + temperature=0, + retry=10, + ), + "GCP_ClaudeSonnet4-5": partial( + 
api.GCPVertexAPI, + model="claude-sonnet-4-5@20250929", + temperature=0, + retry=10, + ), + "GCP_ClaudeOpus4-6": partial( + api.GCPVertexAPI, + model="claude-opus-4-6", + temperature=0, + retry=10, + ), + # Qwen-VL + "QwenVLPlus": partial(api.QwenVLAPI, model="qwen-vl-plus", temperature=0, retry=10), + "QwenVLMax": partial(api.QwenVLAPI, model="qwen-vl-max", temperature=0, retry=10), + "QwenVLMax-250408": partial(api.QwenVLAPI, model="qwen-vl-max-2025-04-08", temperature=0, retry=10), + + # Reka + "RekaEdge": partial(api.Reka, model="reka-edge-20240208"), + "RekaFlash": partial(api.Reka, model="reka-flash-20240226"), + "RekaCore": partial(api.Reka, model="reka-core-20240415"), + # Step1V + "Step1V": partial( + api.GPT4V, + model="step-1v-32k", + api_base="https://api.stepfun.com/v1/chat/completions", + temperature=0, + retry=10, + img_size=-1, + img_detail="high", + ), + "Step1.5V-mini": partial( + api.GPT4V, + model="step-1.5v-mini", + api_base="https://api.stepfun.com/v1/chat/completions", + temperature=0, + retry=10, + img_size=-1, + img_detail="high", + ), + "Step1o": partial( + api.GPT4V, + model="step-1o-vision-32k", + api_base="https://api.stepfun.com/v1/chat/completions", + temperature=0, + retry=10, + img_size=-1, + img_detail="high", + ), + "Step3-VL-10B_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='Step3-VL-10B', + temperature=1.0, + max_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20, + retry=6, + timeout=1800, + ), + # Yi-Vision + "Yi-Vision": partial( + api.GPT4V, + model="yi-vision", + api_base="https://api.lingyiwanwu.com/v1/chat/completions", + temperature=0, + retry=10, + ), + # Together AI (set TOGETHER_API_KEY) + "Together_Llama3.2-11B-Vision": partial( + api.TogetherAPI, + model="meta-llama/Llama-3.2-11B-Vision-Instruct-Turbo", + temperature=0, + max_tokens=2048, + retry=10, + ), + "Together_Llama3.2-90B-Vision": partial( + api.TogetherAPI, + 
model="meta-llama/Llama-3.2-90B-Vision-Instruct-Turbo", + temperature=0, + max_tokens=2048, + retry=10, + ), + "Together_Llama4-Scout-17B": partial( + api.TogetherAPI, + model="meta-llama/Llama-4-Scout-17B-16E-Instruct", + temperature=0, + max_tokens=2048, + retry=10, + ), + "Together_Llama4-Maverick-17B": partial( + api.TogetherAPI, + model="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", + temperature=0, + max_tokens=2048, + retry=10, + ), + "Together_Qwen2-VL-72B": partial( + api.TogetherAPI, + model="Qwen/Qwen2-VL-72B-Instruct", + temperature=0, + max_tokens=2048, + retry=10, + ), + # MiniMax (set MINIMAX_API_KEY) + "MiniMax-M2.7": partial( + api.MiniMaxAPI, + model="MiniMax-M2.7", + temperature=0, + max_tokens=2048, + retry=10, + ), + "MiniMax-M2.5": partial( + api.MiniMaxAPI, + model="MiniMax-M2.5", + temperature=0, + max_tokens=2048, + retry=10, + ), + "MiniMax-M2.5-highspeed": partial( + api.MiniMaxAPI, + model="MiniMax-M2.5-highspeed", + temperature=0, + max_tokens=2048, + retry=10, + ), + # Claude + "Claude3V_Opus": partial( + api.Claude3V, model="claude-3-opus-20240229", temperature=0, retry=10, verbose=False + ), + "Claude3V_Sonnet": partial( + api.Claude3V, + model="claude-3-sonnet-20240229", + temperature=0, + retry=10, + verbose=False, + ), + "Claude3V_Haiku": partial( + api.Claude3V, + model="claude-3-haiku-20240307", + temperature=0, + retry=10, + verbose=False, + ), + "Claude3-5V_Sonnet": partial( + api.Claude3V, + model="claude-3-5-sonnet-20240620", + temperature=0, + retry=10, + verbose=False, + ), + "Claude3-5V_Sonnet_20241022": partial( + api.Claude3V, + model="claude-3-5-sonnet-20241022", + temperature=0, + retry=10, + verbose=False, + ), + "Claude3-7V_Sonnet": partial( + api.Claude3V, + model="claude-3-7-sonnet-20250219", + temperature=0, + retry=10, + verbose=False, + ), + "Claude4_Opus": partial( + api.Claude3V, + model="claude-4-opus-20250514", + temperature=0, + retry=10, + verbose=False, + timeout=1800 + ), + "Claude4_Sonnet": 
partial( + api.Claude3V, + model="claude-4-sonnet-20250514", + temperature=0, + retry=10, + verbose=False, + timeout=1800 + ), + "Claude-Opus-4-6": partial( + api.GPT4V, + model="claude-opus-4-6-thinking", + retry=10, + timeout=3600, + max_tokens=65536, + img_detail='high', + ), + "GPT-5.4-2026-03-05": partial( + api.GPT4V, + model="gpt-5.4-2026-03-05", + retry=10, + timeout=3600, + max_tokens=65536, + img_detail='high', + ), + "GPT-5.2": partial( + api.GPT4V, + model="gpt-5.2", + retry=10, + timeout=3600, + max_tokens=65536, + img_detail='high', + ), + # AWS Bedrock (Converse API; set AWS_REGION or pass region_name) + "Bedrock_Claude3-5Sonnet": partial( + api.BedrockAPI, + model_id="anthropic.claude-3-5-sonnet-20241022-v2:0", + temperature=0, + retry=10, + ), + "Bedrock_Claude3Opus": partial( + api.BedrockAPI, + model_id="anthropic.claude-3-opus-20240229-v1:0", + temperature=0, + retry=10, + ), + "Bedrock_Claude3Sonnet": partial( + api.BedrockAPI, + model_id="anthropic.claude-3-sonnet-20240229-v1:0", + temperature=0, + retry=10, + ), + "Bedrock_Claude3Haiku": partial( + api.BedrockAPI, + model_id="anthropic.claude-3-haiku-20240307-v1:0", + temperature=0, + retry=10, + ), + # GLM4V + "GLM4V": partial(api.GLMVisionAPI, model="glm4v-biz-eval", temperature=0, retry=10), + "GLM4V_PLUS": partial(api.GLMVisionAPI, model="glm-4v-plus", temperature=0, retry=10), + "GLM4V_PLUS_20250111": partial( + api.GLMVisionAPI, model="glm-4v-plus-0111", temperature=0, retry=10 + ), + # MiniMax abab + "abab6.5s": partial( + api.GPT4V, + model="abab6.5s-chat", + api_base="https://api.minimax.chat/v1/chat/completions", + temperature=0, + retry=10, + ), + "abab7-preview": partial( + api.GPT4V, + model="abab7-chat-preview", + api_base="https://api.minimax.chat/v1/chat/completions", + temperature=0, + retry=10, + ), + # CongRong + "CongRong-v1.5": partial(api.CWWrapper, model="cw-congrong-v1.5", temperature=0, retry=10), + "CongRong-v2.0": partial(api.CWWrapper, model="cw-congrong-v2.0", 
temperature=0, retry=10), + # SenseNova + "SenseNova-V6-Pro": partial( + api.SenseChatVisionAPI, model="SenseNova-V6-Pro", temperature=0, retry=10 + ), + "SenseNova-V6-Reasoner": partial( + api.SenseChatVisionAPI, model="SenseNova-V6-Reasoner", temperature=0, retry=10 + ), + "SenseNova-V6-5-Pro": partial( + api.SenseChatVisionAPI, model="SenseNova-V6-5-Pro", retry=10 + ), + "SenseNova-V6-5-Pro-20251215": partial( + api.SenseChatVisionV2API, model="SenseNova-V6-5-Pro-20251215", max_completion_tokens=40960, repetition_penalty=1.05, temperature=0.6, top_p=0.95, top_k=20, timeout=1800, retry=3, img_size=4096 + ), + "HunYuan-Vision": partial( + api.HunyuanVision, model="hunyuan-vision", temperature=0, retry=10 + ), + "HunYuan-Standard-Vision": partial( + api.HunyuanVision, model="hunyuan-standard-vision", temperature=0, retry=10 + ), + "HunYuan-Large-Vision": partial( + api.HunyuanVision, model="hunyuan-large-vision", temperature=0, retry=10 + ), + "BailingMM-Lite-1203": partial( + api.bailingMMAPI, model="BailingMM-Lite-1203", temperature=0, retry=10 + ), + "BailingMM-Pro-0120": partial( + api.bailingMMAPI, model="BailingMM-Pro-0120", temperature=0, retry=10 + ), + # BlueLM-2.5 + "BlueLM-2.5-3B": partial(api.BlueLM_API, model="BlueLM-2.5-3B", temperature=0, retry=3), + # JiuTian-VL + "JTVL": partial(api.JTVLChatAPI, model="jt-vl-chat", temperature=0, retry=10), + "JTVL-Mini": partial(api.JTVLChatAPI_Mini, model="jt-vl-chat-mini", temperature=0, retry=10), + "JTVL-2B": partial(api.JTVLChatAPI_2B, model="jt-vl-chat-2b", temperature=0, retry=10), + "VideoChatOnlineV2": partial(api.VideoChatOnlineV2API, model="videochatonline_v2", temperature=0, retry=10), + "Taiyi": partial(api.TaiyiAPI, model="taiyi", temperature=0, retry=10), + # TeleMM + "TeleMM": partial(api.TeleMMAPI, model="TeleAI/TeleMM", temperature=0, retry=10), + "TeleMM2.0": partial(api.TeleMM2_API, model="TeleAI/TeleMM", retry=3, timeout=600), + "TeleMM2.0Thinking": partial(api.TeleMM2Thinking_API, 
model="TeleAI/TeleMM", retry=3, timeout=600), + "Qwen2.5-VL-32B-Instruct-SiliconFlow": partial( + api.SiliconFlowAPI, model="Qwen/Qwen2.5-VL-32B-Instruct", temperature=0, retry=10), + "Qwen3-VL-8B--crop--arm_thinker_prompt--sglang": partial( + api.ARM_thinker, + mode="agent", + agent_repo_root="/path/to/your/ARM-Thinker", + model="Qwen/Qwen3-VL-8B-Instruct", + retry=10, + timeout=300, + api_base="http://100.97.158.184:38888/v1/chat/completions", + key="EMPTY", + temperature=0.0, + max_tokens=4096, + # agent params + max_round=16, + max_tool_response_length=4096, + tool_config_path="/path/to/your/ARM-Thinker/examples/self/multiturn/config/tool_config/image_zoom_in_tool_config.yaml", + # special for sglang server + use_role_tool=False, + system_template_type="CommonSystemTemplate", + # extra prompt to adapt to the ARM-Thinker prompt template [CommonSystemTemplate] + extra_pt="\n\n**Important Requirement:**\nThe given image is `original_image`. You must output your reasoning inside `...`. After reasoning, either output the final answer within `...` or call a tool within `...`. You may call tools multiple times across turns to assist with judgment or verification, **but only one tool per turn**. If a tool call fails, you can retry or stop and give your final answer. 
Once no more tool calls are needed, provide your final answer or judgment within `...`.", + ), + "Qwen3-VL-8B--crop--official_prompt--vllm": partial( + api.ARM_thinker, + mode="agent", + agent_repo_root="/path/to/your/ARM-Thinker", + model="Qwen/Qwen3-VL-8B-Instruct", + retry=10, + timeout=300, + api_base="http://100.97.203.103:40001/v1/chat/completions", + key="EMPTY", + temperature=0.0, + max_tokens=4096, + extra_pt="", + # agent params + max_round=16, + max_tool_response_length=4096, + system_template_type="Qwen3VLSystemTemplateWithTools", + tool_config_path="/path/to/your/ARM-Thinker/examples/self/multiturn/config/tool_config/image_zoom_in_tool_qwen3vl_config.yaml", + use_role_tool=True, + ), + # lmdeploy api + "lmdeploy_internvl_78B_MPO": partial( + api.LMDeployAPI, + model="InternVL2_5-78B-MPO", + api_base="http://0.0.0.0:23333/v1/chat/completions", + temperature=0, + retry=10, + timeout=100, + ), + "lmdeploy_qvq_72B_preview": partial( + api.LMDeployAPI, + model="QVQ-72B-Preview", + api_base="http://0.0.0.0:23333/v1/chat/completions", + temperature=0, + retry=10, + timeout=300, + ), + 'Taichu-VLR-3B': partial( + api.TaichuVLRAPI, + model='taichu_vlr_3b', + url="https://platform.wair.ac.cn/maas/v1/chat/completions" + ), + 'Taichu-VLR-7B': partial( + api.TaichuVLRAPI, + model='taichu_vlr_7b', + url="https://platform.wair.ac.cn/maas/v1/chat/completions" + ), + # doubao_vl + "DoubaoVL": partial( + api.DoubaoVL, model="Doubao-1.5-vision-pro", temperature=0, retry=3, verbose=False + ), + "Seed1.5-VL": partial( + api.DoubaoVL, + model="doubao-1-5-thinking-vision-pro-250428", + temperature=0, + retry=3, + verbose=False, + max_tokens=16384, + ), + "Seed1.6": partial( + api.DoubaoVL, + model="doubao-seed-1.6-250615", + temperature=0, + retry=3, + verbose=False, + max_tokens=16384, + ), + "Seed1.6-Flash": partial( + api.DoubaoVL, + model="doubao-seed-1.6-flash-250615", + temperature=0, + retry=3, + verbose=False, + max_tokens=16384, + ), + "Seed1.6-Thinking": partial( + 
api.DoubaoVL, + model="doubao-seed-1.6-thinking-250615", + temperature=0, + retry=3, + verbose=False, + max_tokens=16384, + ), + "Doubao-Seed-2.0-Pro-260215": partial( + api.GPT4V, + model="doubao-seed-2-0-pro-260215", + retry=3, + timeout=1200, + max_tokens=32768, + ), + # Shopee MUG-U + 'MUG-U-7B': partial( + api.MUGUAPI, + model='MUG-U', + temperature=0, + retry=10, + verbose=False, + timeout=300), + # grok + "grok-vision-beta": partial( + api.GPT4V, + model="grok-vision-beta", + api_base="https://api.x.ai/v1/chat/completions", + temperature=0, + retry=10, + ), + "grok-2-vision-1212": partial( + api.GPT4V, + model="grok-2-vision", + api_base="https://api.x.ai/v1/chat/completions", + temperature=0, + retry=10, + ), + "grok-4-0709": partial( + api.GPT4V, + model="grok-4-0709", + api_base="https://api.x.ai/v1/chat/completions", + temperature=0, + retry=3, + timeout=1200, + max_tokens=16384 + ), + "Grok-4.1-Fast": partial( + api.GPT4V, + model="grok-4-1-fast-reasoning", + retry=3, + timeout=1200, + max_tokens=16384, + ), + # kimi + "moonshot-v1-8k": partial( + api.GPT4V, + model="moonshot-v1-8k-vision-preview", + api_base="https://api.moonshot.cn/v1/chat/completions", + temperature=0, + retry=10, + ), + "moonshot-v1-32k": partial( + api.GPT4V, + model="moonshot-v1-32k-vision-preview", + api_base="https://api.moonshot.cn/v1/chat/completions", + temperature=0, + retry=10, + ), + "moonshot-v1-128k": partial( + api.GPT4V, + model="moonshot-v1-128k-vision-preview", + api_base="https://api.moonshot.cn/v1/chat/completions", + temperature=0, + retry=10, + ), + 'ernie4.5-turbo': partial( + api.GPT4V, + model='ernie-4.5-turbo-vl-32k', + temperature=0, + retry=3, + max_tokens=12000, + ), + 'ernie4.5-a3b': partial( + api.GPT4V, + model='ernie-4.5-vl-28b-a3b', + temperature=0, + retry=3, + max_tokens=8000, + ), + "360zhinao3-vl": + partial( + api.GPT4V, + model="360zhinao3-vl", + api_base="https://api.360.cn/v1/chat/completions", + temperature=0.6, + top_p=0.95, + top_k=20, + 
thinking_budget=18000, + max_tokens=24000, + presence_penalty=0.0, + frequency_penalty=0.0, + retry=3, + ), +} + +api_models['gpt-5'] = cp.deepcopy(api_models['gpt-5-2025-08-07']) +api_models['gpt-5-mini'] = cp.deepcopy(api_models['gpt-5-mini-2025-08-07']) +api_models['gpt-5-nano'] = cp.deepcopy(api_models['gpt-5-nano-2025-08-07']) + +emu_series = { + "emu2_chat": partial(vlm.Emu, model_path="BAAI/Emu2-Chat"), + "emu3_chat": partial(vlm.Emu3_chat, model_path="BAAI/Emu3-Chat"), + "emu3_gen": partial(vlm.Emu3_gen, model_path="BAAI/Emu3-Gen"), +} + +granite_vision_series = { + 'granite_vision_3.1_2b_preview': partial(vlm.GraniteVision3, model_path="ibm-granite/granite-vision-3.1-2b-preview"), + 'granite_vision_3.2_2b': partial(vlm.GraniteVision3, model_path="ibm-granite/granite-vision-3.2-2b"), + 'granite_vision_3.3_2b': partial(vlm.GraniteVision3, model_path="ibm-granite/granite-vision-3.3-2b"), +} + +mmalaya_series = { + "MMAlaya": partial(vlm.MMAlaya, model_path="DataCanvas/MMAlaya"), + "MMAlaya2": partial(vlm.MMAlaya2, model_path="DataCanvas/MMAlaya2"), +} + +minicpm_series = { + "MiniCPM-V": partial(vlm.MiniCPM_V, model_path="openbmb/MiniCPM-V"), + "MiniCPM-V-2": partial(vlm.MiniCPM_V, model_path="openbmb/MiniCPM-V-2"), + "MiniCPM-Llama3-V-2_5": partial( + vlm.MiniCPM_Llama3_V, model_path="openbmb/MiniCPM-Llama3-V-2_5" + ), + "MiniCPM-V-2_6": partial(vlm.MiniCPM_V_2_6, model_path="openbmb/MiniCPM-V-2_6"), + "MiniCPM-o-2_6": partial(vlm.MiniCPM_o_2_6, model_path="openbmb/MiniCPM-o-2_6"), + "MiniCPM-V-4": partial(vlm.MiniCPM_V_4, model_path="openbmb/MiniCPM-V-4"), + "MiniCPM-V-4_5": partial(vlm.MiniCPM_V_4_5, model_path="openbmb/MiniCPM-V-4_5"), + "MiniCPM-o-4_5": partial(vlm.MiniCPM_o_4_5, model_path="openbmb/MiniCPM-o-4_5"), + "MiniCPM-o-4_5_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='MiniCPM-o-4_5', + temperature=0.7, + top_p=0.8, + top_k=100, + repetition_penalty=1.02, + max_tokens=32768, + retry=6, + 
timeout=1800, + ), +} + +xtuner_series = { + "llava-internlm2-7b": partial( + vlm.LLaVA_XTuner, + llm_path="internlm/internlm2-chat-7b", + llava_path="xtuner/llava-internlm2-7b", + visual_select_layer=-2, + prompt_template="internlm2_chat", + ), + "llava-internlm2-20b": partial( + vlm.LLaVA_XTuner, + llm_path="internlm/internlm2-chat-20b", + llava_path="xtuner/llava-internlm2-20b", + visual_select_layer=-2, + prompt_template="internlm2_chat", + ), + "llava-internlm-7b": partial( + vlm.LLaVA_XTuner, + llm_path="internlm/internlm-chat-7b", + llava_path="xtuner/llava-internlm-7b", + visual_select_layer=-2, + prompt_template="internlm_chat", + ), + "llava-v1.5-7b-xtuner": partial( + vlm.LLaVA_XTuner, + llm_path="lmsys/vicuna-7b-v1.5", + llava_path="xtuner/llava-v1.5-7b-xtuner", + visual_select_layer=-2, + prompt_template="vicuna", + ), + "llava-v1.5-13b-xtuner": partial( + vlm.LLaVA_XTuner, + llm_path="lmsys/vicuna-13b-v1.5", + llava_path="xtuner/llava-v1.5-13b-xtuner", + visual_select_layer=-2, + prompt_template="vicuna", + ), + "llava-llama-3-8b": partial( + vlm.LLaVA_XTuner, + llm_path="xtuner/llava-llama-3-8b-v1_1", + llava_path="xtuner/llava-llama-3-8b-v1_1", + visual_select_layer=-2, + prompt_template="llama3_chat", + ), +} + +qwen_series = { + "qwen_base": partial(vlm.QwenVL, model_path="Qwen/Qwen-VL"), + "qwen_chat": partial(vlm.QwenVLChat, model_path="Qwen/Qwen-VL-Chat"), + "monkey": partial(vlm.Monkey, model_path="echo840/Monkey"), + "monkey-chat": partial(vlm.MonkeyChat, model_path="echo840/Monkey-Chat"), + "minimonkey": partial(vlm.MiniMonkey, model_path="mx262/MiniMonkey"), +} + +thyme_series = { + "Thyme-7B": partial(vlm.Thyme, model_path="Kwai-Keye/Thyme-RL") +} + +llava_series = { + "llava_v1.5_7b": partial(vlm.LLaVA, model_path="liuhaotian/llava-v1.5-7b"), + "llava_v1.5_13b": partial(vlm.LLaVA, model_path="liuhaotian/llava-v1.5-13b"), + "llava_v1_7b": partial(vlm.LLaVA, model_path=LLAVA_V1_7B_MODEL_PTH), + "sharegpt4v_7b": partial(vlm.LLaVA, 
model_path="Lin-Chen/ShareGPT4V-7B"), + "sharegpt4v_13b": partial(vlm.LLaVA, model_path="Lin-Chen/ShareGPT4V-13B"), + "llava_next_vicuna_7b": partial( + vlm.LLaVA_Next, model_path="llava-hf/llava-v1.6-vicuna-7b-hf" + ), + "llava_next_vicuna_13b": partial( + vlm.LLaVA_Next, model_path="llava-hf/llava-v1.6-vicuna-13b-hf" + ), + "llava_next_mistral_7b": partial( + vlm.LLaVA_Next, model_path="llava-hf/llava-v1.6-mistral-7b-hf" + ), + "llava_next_yi_34b": partial(vlm.LLaVA_Next, model_path="llava-hf/llava-v1.6-34b-hf"), + "llava_next_llama3": partial( + vlm.LLaVA_Next, model_path="llava-hf/llama3-llava-next-8b-hf" + ), + "llava_next_72b": partial(vlm.LLaVA_Next, model_path="llava-hf/llava-next-72b-hf"), + "llava_next_110b": partial(vlm.LLaVA_Next, model_path="llava-hf/llava-next-110b-hf"), + "llava_next_qwen_32b": partial( + vlm.LLaVA_Next2, model_path="lmms-lab/llava-next-qwen-32b" + ), + "llava_next_interleave_7b": partial( + vlm.LLaVA_Next, model_path="llava-hf/llava-interleave-qwen-7b-hf" + ), + "llava_next_interleave_7b_dpo": partial( + vlm.LLaVA_Next, model_path="llava-hf/llava-interleave-qwen-7b-dpo-hf" + ), + "llava-onevision-qwen2-0.5b-ov-hf": partial( + vlm.LLaVA_OneVision_HF, model_path="llava-hf/llava-onevision-qwen2-0.5b-ov-hf" + ), + "llava-onevision-qwen2-0.5b-si-hf": partial( + vlm.LLaVA_OneVision_HF, model_path="llava-hf/llava-onevision-qwen2-0.5b-si-hf" + ), + "llava-onevision-qwen2-7b-ov-hf": partial( + vlm.LLaVA_OneVision_HF, model_path="llava-hf/llava-onevision-qwen2-7b-ov-hf" + ), + "llava-onevision-qwen2-7b-si-hf": partial( + vlm.LLaVA_OneVision_HF, model_path="llava-hf/llava-onevision-qwen2-7b-si-hf" + ), + "llava_onevision_qwen2_0.5b_si": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/llava-onevision-qwen2-0.5b-si" + ), + "llava_onevision_qwen2_7b_si": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/llava-onevision-qwen2-7b-si" + ), + "llava_onevision_qwen2_72b_si": partial( + vlm.LLaVA_OneVision, 
model_path="lmms-lab/llava-onevision-qwen2-72b-si" + ), + "llava_onevision_qwen2_0.5b_ov": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/llava-onevision-qwen2-0.5b-ov" + ), + "llava_onevision_qwen2_7b_ov": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/llava-onevision-qwen2-7b-ov" + ), + "llava_onevision_qwen2_72b_ov": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/llava-onevision-qwen2-72b-ov-sft" + ), + "Aquila-VL-2B": partial(vlm.LLaVA_OneVision, model_path="BAAI/Aquila-VL-2B-llava-qwen"), + "llava_video_qwen2_7b": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/LLaVA-Video-7B-Qwen2" + ), + "llava_video_qwen2_72b": partial( + vlm.LLaVA_OneVision, model_path="lmms-lab/LLaVA-Video-72B-Qwen2" + ), + "LLaVA-OneVision-1.5-8B-Instruct": partial( + vlm.LLaVA_OneVision_1_5, model_path="lmms-lab/LLaVA-OneVision-1.5-8B-Instruct", max_new_tokens=8192 + ), +} + +varco_vision_series = { + "varco-vision-hf": partial( + vlm.LLaVA_OneVision_HF, model_path="NCSOFT/VARCO-VISION-14B-HF" + ), + "varco-vision-2-1.7b": partial( + vlm.VarcoVision, model_path="NCSOFT/VARCO-VISION-2.0-1.7B" + ), + "varco-vision-2-14b": partial( + vlm.VarcoVision, model_path="NCSOFT/VARCO-VISION-2.0-14B" + ), +} + +vita_series = { + "vita": partial(vlm.VITA, model_path="VITA-MLLM/VITA", root=VITA_ROOT), + "vita_qwen2": partial(vlm.VITAQwen2, model_path="VITA-MLLM/VITA-1.5", root=VITA_ROOT), +} + +long_vita_series = { + "Long-VITA-16K": partial( + vlm.LongVITA, model_path="VITA-MLLM/Long-VITA-16K_HF", max_num_frame=128 + ), + "Long-VITA-128K": partial( + vlm.LongVITA, model_path="VITA-MLLM/Long-VITA-128K_HF", max_num_frame=256 + ), + "Long-VITA-1M": partial( + vlm.LongVITA, model_path="VITA-MLLM/Long-VITA-1M_HF", max_num_frame=256 + ), +} + +interns1_mini = { + "Intern-S1-mini": partial( + vlm.InternS1Chat, model_path="/mnt/shared-storage-user/mllm/lijinsong/models/Intern-S1-mini/" + ), +} + +internvl = { + "InternVL-Chat-V1-1": partial( + vlm.InternVLChat, 
model_path="OpenGVLab/InternVL-Chat-V1-1", version="V1.1" + ), + "InternVL-Chat-V1-2": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL-Chat-V1-2", version="V1.2" + ), + "InternVL-Chat-V1-2-Plus": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL-Chat-V1-2-Plus", version="V1.2" + ), + "InternVL-Chat-V1-5": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL-Chat-V1-5", + version="V1.5", + ) +} + +mini_internvl = { + "Mini-InternVL-Chat-2B-V1-5": partial( + vlm.InternVLChat, model_path="OpenGVLab/Mini-InternVL-Chat-2B-V1-5", version="V1.5" + ), + "Mini-InternVL-Chat-4B-V1-5": partial( + vlm.InternVLChat, model_path="OpenGVLab/Mini-InternVL-Chat-4B-V1-5", version="V1.5" + ), +} + +internvl2 = { + "InternVL2-1B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-1B", version="V2.0" + ), + "InternVL2-2B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-2B", version="V2.0" + ), + "InternVL2-4B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-4B", version="V2.0" + ), + "InternVL2-8B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-8B", version="V2.0" + ), + "InternVL2-26B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-26B", version="V2.0" + ), + "InternVL2-40B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-40B", version="V2.0" + ), + "InternVL2-76B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-Llama3-76B", version="V2.0" + ), + "InternVL2-8B-MPO": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2-8B-MPO", version="V2.0" + ), + "InternVL2-8B-MPO-CoT": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2-8B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), +} + +internvl2_5 = { + "InternVL2_5-1B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-1B", version="V2.0" + ), + "InternVL2_5-2B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-2B", version="V2.0" + ), + "QTuneVL1-2B": partial( + 
vlm.InternVLChat, model_path="hanchaow/QTuneVL1-2B", version="V2.0" + ), + "InternVL2_5-4B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-4B", version="V2.0" + ), + "InternVL2_5-8B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-8B", version="V2.0" + ), + "InternVL2_5-26B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-26B", version="V2.0" + ), + "InternVL2_5-38B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-38B", version="V2.0" + ), + "InternVL2_5-78B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-78B", version="V2.0" + ), + # InternVL2.5 series with Best-of-N evaluation + "InternVL2_5-8B-BoN-8": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL2_5-8B", version="V2.0", + best_of_n=8, reward_model_path="OpenGVLab/VisualPRM-8B", + ), +} + +internvl2_5_mpo = { + "InternVL2_5-1B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-1B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-2B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-2B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-4B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-4B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-8B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-8B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-26B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-26B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-38B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-38B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-78B-MPO": partial( + vlm.InternVLChat, + model_path="OpenGVLab/InternVL2_5-78B-MPO", + version="V2.0", + use_mpo_prompt=True, + ), + "InternVL2_5-8B-GUI": partial( + vlm.InternVLChat, + 
model_path="/fs-computility/mllm1/shared/zhaoxiangyu/models/internvl2_5_8b_internlm2_5_7b_dynamic_res_stage1", + version="V2.0", + max_new_tokens=512, + screen_parse=False, + ), + "InternVL3-7B-GUI": partial( + vlm.InternVLChat, + model_path="/fs-computility/mllm1/shared/zhaoxiangyu/GUI/checkpoints/internvl3_7b_dynamic_res_stage1_56/", + version="V2.0", + max_new_tokens=512, + screen_parse=False, + ), +} + +internvl3 = { + "InternVL3-1B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-1B", version="V2.0" + ), + "InternVL3-2B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-2B", version="V2.0" + ), + "InternVL3-8B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-8B", version="V2.0", + ), + "InternVL3-9B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-9B", version="V2.0" + ), + "InternVL3-14B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-14B", version="V2.0" + ), + "InternVL3-38B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-38B", version="V2.0" + ), + "InternVL3-78B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3-78B", version="V2.0" + ), +} + +internvl3_5 = { + "InternVL3_5-1B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-1B", version="V2.0" + ), + "InternVL3_5-2B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-2B", version="V2.0" + ), + "InternVL3_5-4B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-4B", version="V2.0" + ), + "InternVL3_5-8B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-8B", version="V2.0" + ), + "InternVL3_5-14B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-14B", version="V2.0" + ), + "InternVL3_5-GPT-OSS-20B-A4B-Preview": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", version="V2.0" + ), + "InternVL3_5-30B-A3B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-30B-A3B", version="V2.0" + ), + 
"InternVL3_5-38B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-38B", version="V2.0" + ), + "InternVL3_5-241B-A28B": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-241B-A28B", version="V2.0", + max_new_tokens=16384, + ), + + "InternVL3_5-1B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-1B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-2B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-2B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-4B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-4B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-8B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-8B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-14B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-14B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-GPT-OSS-20B-A4B-Preview-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-GPT-OSS-20B-A4B-Preview", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-30B-A3B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-30B-A3B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-38B-Thinking": partial( + vlm.InternVLChat, model_path="OpenGVLab/InternVL3_5-38B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-241B-A28B-Thinking": partial( + vlm.InternVLChat, 
model_path="OpenGVLab/InternVL3_5-241B-A28B", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), + "InternVL3_5-241B-A28B-Thinking-api": partial( + api.LMDeployAPI, model="internvl-3.5-241b", use_lmdeploy=True, + max_new_tokens=2**16, cot_prompt_version="r1", do_sample=True, version="V2.0" + ), +} + +qwen3vl_series = { + "Qwen3-VL-235B-A22B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-235B-A22B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.7, + max_new_tokens=16384, + repetition_penalty=1.0, + presence_penalty=1.5, + top_p=0.8, + top_k=20 + ), + "Qwen3-VL-235B-A22B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-235B-A22B-Thinking", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + max_new_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20 + ), + "Qwen3-VL-30B-A3B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-30B-A3B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.7, + max_new_tokens=16384, + repetition_penalty=1.0, + presence_penalty=1.5, + top_p=0.8, + top_k=20 + ), + "Qwen3-VL-30B-A3B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-30B-A3B-Thinking", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + max_new_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20 + ), + "Qwen3-VL-8B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-8B-Thinking", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + max_new_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20 + ), + "Qwen3-VL-4B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-4B-Thinking", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + max_new_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20 + ), + 
"Qwen3-VL-8B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-8B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.7, + max_new_tokens=16384, + repetition_penalty=1.0, + presence_penalty=1.5, + top_p=0.8, + top_k=20 + ), + "Qwen3-VL-4B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-4B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.7, + max_new_tokens=16384, + repetition_penalty=1.0, + presence_penalty=1.5, + top_p=0.8, + top_k=20 + ), + "Qwen3-VL-2B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-2B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.7, + max_new_tokens=16384, + repetition_penalty=1.0, + presence_penalty=1.5, + top_p=0.8, + top_k=20 + ), + "Qwen3-VL-32B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-32B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.7, + max_new_tokens=16384, + repetition_penalty=1.0, + presence_penalty=1.5, + top_p=0.8, + top_k=20 + + ), + "Qwen3-VL-2B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-2B-Thinking", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + max_new_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20 + ), + "Qwen3-VL-32B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-VL-32B-Thinking", + use_custom_prompt=False, + use_vllm=False, + temperature=1.0, + max_new_tokens=40960, + repetition_penalty=1.0, + presence_penalty=0.0, + top_p=0.95, + top_k=20 + ), + "Qwen3-Omni-30B-A3B-Instruct": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-Omni-30B-A3B-Instruct", + use_custom_prompt=False, + use_vllm=True, + temperature=0.6, + top_p=0.95, + top_k=20, + max_new_tokens=16384, + ), + "Qwen3-Omni-30B-A3B-Thinking": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-Omni-30B-A3B-Thinking", + use_custom_prompt=False, + use_vllm=True, + temperature=0.6, + top_p=0.95, + 
top_k=20, + max_new_tokens=16384, + ), + "Qwen3-Omni-30B-A3B-Captioner": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3-Omni-30B-A3B-Captioner", + use_custom_prompt=False, + use_vllm=True, + temperature=0.6, + top_p=0.95, + top_k=20, + max_new_tokens=16384, + ), + +} + +qwen3_5_series = { + # vllm serve command example: + # vllm serve Qwen/Qwen3.5-122B-A10B --port 8000 --tensor-parallel-size 8 --max-model-len 262144 --reasoning-parser qwen3 + "Qwen3.5-35B-A3B_api": partial( + api.LMDeployAPI, + model="Qwen/Qwen3.5-122B-A10B", + api_base="http://0.0.0.0:8000/v1/chat/completions", + temperature=0.6, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + repetition_penalty=1.0, + max_new_tokens=32768, + retry=6, + timeout=1800, + ), + "Qwen3.5-397B-A17B_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='Qwen3.5-397B', + temperature=0.6, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + repetition_penalty=1.0, + max_tokens=32768, + retry=10, + timeout=1800, + ), + "Qwen3.5-122B-A10B_ThinkMode_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='Qwen3.5-35B', + temperature=1.0, + top_p=0.95, + top_k=20, + min_p=0.0, + presence_penalty=1.5, + repetition_penalty=1.0, + max_tokens=81920, + retry=10, + timeout=900, + ), + "Qwen3.5-122B-A10B_InstructMode_api": partial( + api.LMDeployAPI, + model="Qwen/Qwen3.5-122B-A10B", + api_base="http://0.0.0.0:8000/v1/chat/completions", + temperature=1.0, + top_p=0.95, + top_k=20, + min_p=0.0, + presence_penalty=1.5, + repetition_penalty=1.0, + max_tokens=81920, + retry=10, + timeout=900, + chat_template_kwargs={"enable_thinking": False}, + ), + "Qwen3.5-397B-A17B": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3.5-397B-A17B", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + max_new_tokens=32768, + ), + "Qwen3.5-122B-A10B": partial( + vlm.Qwen3VLChat, + 
model_path="Qwen/Qwen3.5-122B-A10B", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + max_new_tokens=32768, + ), + "Qwen3.5-35B-A3B": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3.5-35B-A3B", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + max_new_tokens=32768, + ), + "Qwen3.5-27B": partial( + vlm.Qwen3VLChat, + model_path="Qwen/Qwen3.5-27B", + use_custom_prompt=False, + use_vllm=True, + temperature=1.0, + top_p=0.95, + top_k=20, + presence_penalty=1.5, + max_new_tokens=32768, + ), +} + +sail_series = { + "SAIL-VL-2B": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL-2B"), + "SAIL-VL-1.5-2B": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL-1d5-2B", use_msac = True), + "SAIL-VL-1.5-8B": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL-1d5-8B", use_msac = True), + "SAIL-VL-1.6-8B": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL-1d6-8B", use_msac = True), + "SAIL-VL-1.7-Thinking-2B-2507": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL-1d7-Thinking-2B-2507", use_msac = True, use_cot=True, max_new_tokens=4096), + "SAIL-VL-1.7-Thinking-8B-2507": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL-1d7-Thinking-8B-2507", use_msac = True, use_cot=True, max_new_tokens=4096), + "SAIL-VL2-2B": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL2-2B", use_msac = True), + "SAIL-VL2-8B": partial(vlm.SailVL, model_path="BytedanceDouyinContent/SAIL-VL2-8B", use_msac = True), +} + +ristretto_series = { + "Ristretto-3B": partial(vlm.Ristretto, model_path="LiAutoAD/Ristretto-3B"), +} + +yivl_series = { + "Yi_VL_6B": partial(vlm.Yi_VL, model_path="01-ai/Yi-VL-6B", root=Yi_ROOT), + "Yi_VL_34B": partial(vlm.Yi_VL, model_path="01-ai/Yi-VL-34B", root=Yi_ROOT), +} + +xcomposer_series = { + "XComposer": partial(vlm.XComposer, 
model_path="internlm/internlm-xcomposer-vl-7b"), + "sharecaptioner": partial(vlm.ShareCaptioner, model_path="Lin-Chen/ShareCaptioner"), + "XComposer2": partial(vlm.XComposer2, model_path="internlm/internlm-xcomposer2-vl-7b"), + "XComposer2_1.8b": partial( + vlm.XComposer2, model_path="internlm/internlm-xcomposer2-vl-1_8b" + ), + "XComposer2_4KHD": partial( + vlm.XComposer2_4KHD, model_path="internlm/internlm-xcomposer2-4khd-7b" + ), + "XComposer2d5": partial( + vlm.XComposer2d5, model_path="internlm/internlm-xcomposer2d5-7b" + ), +} + +minigpt4_series = { + "MiniGPT-4-v2": partial(vlm.MiniGPT4, mode="v2", root=MiniGPT4_ROOT), + "MiniGPT-4-v1-7B": partial(vlm.MiniGPT4, mode="v1_7b", root=MiniGPT4_ROOT), + "MiniGPT-4-v1-13B": partial(vlm.MiniGPT4, mode="v1_13b", root=MiniGPT4_ROOT), +} + +idefics_series = { + "idefics_9b_instruct": partial( + vlm.IDEFICS, model_path="HuggingFaceM4/idefics-9b-instruct" + ), + "idefics_80b_instruct": partial( + vlm.IDEFICS, model_path="HuggingFaceM4/idefics-80b-instruct" + ), + "idefics2_8b": partial(vlm.IDEFICS2, model_path="HuggingFaceM4/idefics2-8b"), + # Idefics3 follows Idefics2 Pattern + "Idefics3-8B-Llama3": partial( + vlm.IDEFICS2, model_path="HuggingFaceM4/Idefics3-8B-Llama3" + ), + 'granite-docling-258M': partial( + vlm.DOCLING, model_path="ibm-granite/granite-docling-258M" + ) + +} + +smolvlm_series = { + "SmolVLM-256M": partial(vlm.SmolVLM, model_path="HuggingFaceTB/SmolVLM-256M-Instruct"), + "SmolVLM-500M": partial(vlm.SmolVLM, model_path="HuggingFaceTB/SmolVLM-500M-Instruct"), + "SmolVLM": partial(vlm.SmolVLM, model_path="HuggingFaceTB/SmolVLM-Instruct"), + "SmolVLM-DPO": partial(vlm.SmolVLM, model_path="HuggingFaceTB/SmolVLM-Instruct-DPO"), + "SmolVLM-Synthetic": partial(vlm.SmolVLM, model_path="HuggingFaceTB/SmolVLM-Synthetic"), + "SmolVLM2-256M": partial( + vlm.SmolVLM2, model_path="HuggingFaceTB/SmolVLM2-256M-Video-Instruct" + ), + "SmolVLM2-500M": partial( + vlm.SmolVLM2, 
model_path="HuggingFaceTB/SmolVLM2-500M-Video-Instruct" + ), + "SmolVLM2": partial(vlm.SmolVLM2, model_path="HuggingFaceTB/SmolVLM2-2.2B-Instruct"), +} + +instructblip_series = { + "instructblip_7b": partial(vlm.InstructBLIP, name="instructblip_7b"), + "instructblip_13b": partial(vlm.InstructBLIP, name="instructblip_13b"), +} + +deepseekvl_series = { + "deepseek_vl_7b": partial(vlm.DeepSeekVL, model_path="deepseek-ai/deepseek-vl-7b-chat"), + "deepseek_vl_1.3b": partial( + vlm.DeepSeekVL, model_path="deepseek-ai/deepseek-vl-1.3b-chat" + ), +} + +deepseekvl2_series = { + "deepseek_vl2_tiny": partial( + vlm.DeepSeekVL2, model_path="deepseek-ai/deepseek-vl2-tiny" + ), + "deepseek_vl2_small": partial( + vlm.DeepSeekVL2, model_path="deepseek-ai/deepseek-vl2-small" + ), + "deepseek_vl2": partial(vlm.DeepSeekVL2, model_path="deepseek-ai/deepseek-vl2"), +} + +deepseekocr_series = { + "DeepSeek-OCR": partial( + vlm.DeepSeekOCR, model_path="deepseek-ai/DeepSeek-OCR" + ), +} + +janus_series = { + "Janus-1.3B": partial(vlm.Janus, model_path="deepseek-ai/Janus-1.3B"), + "Janus-Pro-1B": partial(vlm.Janus, model_path="deepseek-ai/Janus-Pro-1B"), + "Janus-Pro-7B": partial(vlm.Janus, model_path="deepseek-ai/Janus-Pro-7B"), +} + +cogvlm_series = { + "cogvlm-grounding-generalist": partial( + vlm.CogVlm, + model_path="THUDM/cogvlm-grounding-generalist-hf", + tokenizer_name="lmsys/vicuna-7b-v1.5", + ), + "cogvlm-chat": partial( + vlm.CogVlm, model_path="THUDM/cogvlm-chat-hf", tokenizer_name="lmsys/vicuna-7b-v1.5" + ), + "cogvlm2-llama3-chat-19B": partial( + vlm.CogVlm, model_path="THUDM/cogvlm2-llama3-chat-19B" + ), + "glm-4v-9b": partial(vlm.GLM4v, model_path="THUDM/glm-4v-9b"), + "GLM4_1VThinking-9b": partial(vlm.GLMThinking, model_path="THUDM/GLM-4.1V-9B-Thinking"), + "GLM4_5V": partial(vlm.GLMThinking, model_path="THUDM/GLM-4.5V"), + "GLM4_6V": partial(vlm.GLMThinking, model_path="THUDM/GLM-4.6V"), + "GLM4_6V-api": partial( + api.LMDeployAPI, + 
api_base="http://0.0.0.0:8000/v1/chat/completions", + model='glm-4.6v', + temperature=0.8, + top_p=0.6, + top_k=2, + repetition_penalty=1.1, + max_tokens=16384, + retry=6, + timeout=1800, + ), +} + +wemm_series = { + "WeMM": partial(vlm.WeMM, model_path="feipengma/WeMM"), +} + +cambrian_series = { + "cambrian_8b": partial(vlm.Cambrian, model_path="nyu-visionx/cambrian-8b"), + "cambrian_13b": partial(vlm.Cambrian, model_path="nyu-visionx/cambrian-13b"), + "cambrian_34b": partial(vlm.Cambrian, model_path="nyu-visionx/cambrian-34b"), + + "cambrian-s-0.5b": partial(vlm.CambrianS, model_path="nyu-visionx/Cambrian-S-0.5B"), + "cambrian-s-1.5b": partial(vlm.CambrianS, model_path="nyu-visionx/Cambrian-S-1.5B"), + "cambrian-s-3b": partial(vlm.CambrianS, model_path="nyu-visionx/Cambrian-S-3B"), + "cambrian-s-7b": partial(vlm.CambrianS, model_path="nyu-visionx/Cambrian-S-7B"), +} + +chameleon_series = { + "chameleon_7b": partial(vlm.Chameleon, model_path="facebook/chameleon-7b"), + "chameleon_30b": partial(vlm.Chameleon, model_path="facebook/chameleon-30b"), +} + +vila_series = { + "VILA1.5-3b": partial(vlm.VILA, model_path="Efficient-Large-Model/VILA1.5-3b"), + "Llama-3-VILA1.5-8b": partial( + vlm.VILA, model_path="Efficient-Large-Model/Llama-3-VILA1.5-8b" + ), + "VILA1.5-13b": partial(vlm.VILA, model_path="Efficient-Large-Model/VILA1.5-13b"), + "VILA1.5-40b": partial(vlm.VILA, model_path="Efficient-Large-Model/VILA1.5-40b"), + "NVILA-8B": partial(vlm.NVILA, model_path="Efficient-Large-Model/NVILA-8B"), + "NVILA-15B": partial(vlm.NVILA, model_path="Efficient-Large-Model/NVILA-15B"), +} + +ovis_series = { + "Ovis1.5-Llama3-8B": partial(vlm.Ovis, model_path="AIDC-AI/Ovis1.5-Llama3-8B"), + "Ovis1.5-Gemma2-9B": partial(vlm.Ovis, model_path="AIDC-AI/Ovis1.5-Gemma2-9B"), + "Ovis1.6-Gemma2-9B": partial(vlm.Ovis1_6, model_path="AIDC-AI/Ovis1.6-Gemma2-9B"), + "Ovis1.6-Llama3.2-3B": partial(vlm.Ovis1_6, model_path="AIDC-AI/Ovis1.6-Llama3.2-3B"), + "Ovis1.6-Gemma2-27B": partial( + 
vlm.Ovis1_6_Plus, model_path="AIDC-AI/Ovis1.6-Gemma2-27B" + ), + "Ovis2-1B": partial(vlm.Ovis2, model_path="AIDC-AI/Ovis2-1B"), + "Ovis2-2B": partial(vlm.Ovis2, model_path="AIDC-AI/Ovis2-2B"), + "Ovis2-4B": partial(vlm.Ovis2, model_path="AIDC-AI/Ovis2-4B"), + "Ovis2-8B": partial(vlm.Ovis2, model_path="AIDC-AI/Ovis2-8B"), + "Ovis2-16B": partial(vlm.Ovis2, model_path="AIDC-AI/Ovis2-16B"), + "Ovis2-34B": partial(vlm.Ovis2, model_path="AIDC-AI/Ovis2-34B"), + "Ovis-U1-3B": partial(vlm.OvisU1, model_path="AIDC-AI/Ovis-U1-3B"), + "Ovis2.5-2B": partial(vlm.Ovis2_5, model_path="AIDC-AI/Ovis2.5-2B"), + "Ovis2.5-9B": partial(vlm.Ovis2_5, model_path="AIDC-AI/Ovis2.5-9B"), + "Ovis2.6-30B-A3B_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='Ovis2.6-30B-A3B', + max_tokens=32768, + retry=6, + timeout=1800, + ), +} + +mantis_series = { + "Mantis-8B-siglip-llama3": partial( + vlm.Mantis, model_path="TIGER-Lab/Mantis-8B-siglip-llama3" + ), + "Mantis-8B-clip-llama3": partial( + vlm.Mantis, model_path="TIGER-Lab/Mantis-8B-clip-llama3" + ), + "Mantis-8B-Idefics2": partial(vlm.Mantis, model_path="TIGER-Lab/Mantis-8B-Idefics2"), + "Mantis-8B-Fuyu": partial(vlm.Mantis, model_path="TIGER-Lab/Mantis-8B-Fuyu"), +} + +phi3_series = { + "Phi-3-Vision": partial( + vlm.Phi3Vision, model_path="microsoft/Phi-3-vision-128k-instruct" + ), + "Phi-3.5-Vision": partial( + vlm.Phi3_5Vision, model_path="microsoft/Phi-3.5-vision-instruct" + ), +} + +phi4_series = { + 'Phi-4-Vision': partial(vlm.Phi4Multimodal, model_path='microsoft/Phi-4-multimodal-instruct'), +} + +xgen_mm_series = { + "xgen-mm-phi3-interleave-r-v1.5": partial( + vlm.XGenMM, model_path="Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5" + ), + "xgen-mm-phi3-dpo-r-v1.5": partial( + vlm.XGenMM, model_path="Salesforce/xgen-mm-phi3-mini-instruct-dpo-r-v1.5" + ), +} + +hawkvl_series = { + "HawkVL-2B": partial( + vlm.HawkVL, + model_path="xjtupanda/HawkVL-2B", + min_pixels=4 * 28 * 28, + 
max_pixels=6800 * 28 * 28, + use_custom_prompt=True + ) +} + +qwen2vl_series = { + "Qwen-VL-Max-20250813": partial( + api.Qwen2VLAPI, + model="qwen-vl-max-2025-08-13", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + max_length=8192, + ), + "Qwen-VL-Max-0809": partial( + api.Qwen2VLAPI, + model="qwen-vl-max-0809", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen-VL-Plus-0809": partial( + api.Qwen2VLAPI, + model="qwen-vl-plus-0809", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "QVQ-72B-Preview": partial( + vlm.Qwen2VLChat, + model_path="Qwen/QVQ-72B-Preview", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + system_prompt="You are a helpful and harmless assistant. You are Qwen developed by Alibaba. You should think step-by-step.", + max_new_tokens=8192, + post_process=False, + ), + "Qwen2-VL-72B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-72B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-7B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-7B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-7B-Instruct-AWQ": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-7B-Instruct-AWQ", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-7B-Instruct-GPTQ-Int4": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int4", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-7B-Instruct-GPTQ-Int8": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-7B-Instruct-GPTQ-Int8", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-2B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-2B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-2B-Instruct-AWQ": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-2B-Instruct-AWQ", + min_pixels=1280 * 28 * 28, 
+ max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-2B-Instruct-GPTQ-Int4": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int4", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2-VL-2B-Instruct-GPTQ-Int8": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2-VL-2B-Instruct-GPTQ-Int8", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "XinYuan-VL-2B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Cylingo/Xinyuan-VL-2B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + ), + "Qwen2.5-VL-3B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-3B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-3B-Instruct-AWQ": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-3B-Instruct-AWQ", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-7B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-7B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-7B-Instruct-ForVideo": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-7B-Instruct", + min_pixels=128 * 28 * 28, + max_pixels=768 * 28 * 28, + total_pixels=24576 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-7B-Instruct-AWQ": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-7B-Instruct-AWQ", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-32B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-32B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-72B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-72B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "MiMo-VL-7B-SFT": partial( + vlm.Qwen2VLChat, + 
model_path="XiaomiMiMo/MiMo-VL-7B-SFT", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + use_lmdeploy=True + ), + "MiMo-VL-7B-RL": partial( + vlm.Qwen2VLChat, + model_path="XiaomiMiMo/MiMo-VL-7B-RL", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + use_lmdeploy=True + ), + "Qwen2.5-VL-72B-Instruct-ForVideo": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-72B-Instruct", + min_pixels=128 * 28 * 28, + max_pixels=768 * 28 * 28, + total_pixels=24576 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-VL-72B-Instruct-AWQ": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-VL-72B-Instruct-AWQ", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Qwen2.5-Omni-7B-ForVideo": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-Omni-7B", + min_pixels=128 * 28 * 28, + max_pixels=768 * 28 * 28, + total_pixels=24576 * 28 * 28, + use_custom_prompt=False, + use_audio_in_video=True, # set use audio in video + ), + "Qwen2.5-Omni-7B": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen2.5-Omni-7B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + # Qwen3-VL series (local inference) + "Qwen3-VL-8B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen3-VL-8B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + use_vllm=True, + ), + "Qwen3-VL-30B-A3B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen3-VL-30B-A3B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + use_vllm=True, + ), + "Qwen3-VL-235B-A22B-Instruct": partial( + vlm.Qwen2VLChat, + model_path="Qwen/Qwen3-VL-235B-A22B-Instruct", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + use_vllm=True, + ), + 'VLM-R1': partial( + vlm.VLMR1Chat, + model_path='omlab/VLM-R1-Qwen2.5VL-3B-Math-0305', + 
min_pixels=1280*28*28, + max_pixels=16384*28*28, + use_custom_prompt=False), + 'VLAA-Thinker-Qwen2.5VL-3B': partial( + vlm.VLAAThinkerChat, + model_path='UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-3B', + min_pixels=1280*28*28, + max_pixels=16384*28*28, + use_custom_prompt=False, + post_process=True, # post processing for evaluation + system_prompt=('' + "You are VL-Thinking🤔, a helpful assistant with excellent reasoning ability." + " A user asks you a question, and you should try to solve it." + " You should first think about the reasoning process in the mind and then provides the user with the answer." + " The reasoning process and answer are enclosed within and" + " tags, respectively, i.e., reasoning process here " + " answer here " + ), + ), + 'VLAA-Thinker-Qwen2.5VL-7B': partial( + vlm.VLAAThinkerChat, + model_path='UCSC-VLAA/VLAA-Thinker-Qwen2.5VL-7B', + min_pixels=1280*28*28, + max_pixels=16384*28*28, + use_custom_prompt=False, + post_process=True, # post processing for evaluation + system_prompt=('' + "You are VL-Thinking🤔, a helpful assistant with excellent reasoning ability." + " A user asks you a question, and you should try to solve it." + " You should first think about the reasoning process in the mind and then provides the user with the answer." + " The reasoning process and answer are enclosed within and" + " tags, respectively, i.e., reasoning process here " + " answer here " + ), + ), + 'WeThink-Qwen2.5VL-7B': partial( + vlm.WeThinkVL, + model_path='yangjie-cv/WeThink-Qwen2.5VL-7B', + min_pixels=1280*28*28, + max_pixels=16384*28*28, + use_custom_prompt=False, + system_prompt=("You FIRST think about the reasoning process as an internal monologue and then provide the final answer.\nThe reasoning process MUST BE enclosed within tags. The final answer MUST BE enclosed within tags." 
+ ), + ), +} + +slime_series = { + "Slime-7B": partial(vlm.SliME, model_path="yifanzhang114/SliME-vicuna-7B"), + "Slime-8B": partial(vlm.SliME, model_path="yifanzhang114/SliME-Llama3-8B"), + "Slime-13B": partial(vlm.SliME, model_path="yifanzhang114/SliME-vicuna-13B"), +} + +eagle_series = { + "Eagle-X4-8B-Plus": partial(vlm.Eagle, model_path="NVEagle/Eagle-X4-8B-Plus"), + "Eagle-X4-13B-Plus": partial(vlm.Eagle, model_path="NVEagle/Eagle-X4-13B-Plus"), + "Eagle-X5-7B": partial(vlm.Eagle, model_path="NVEagle/Eagle-X5-7B"), + "Eagle-X5-13B": partial(vlm.Eagle, model_path="NVEagle/Eagle-X5-13B"), + "Eagle-X5-13B-Chat": partial(vlm.Eagle, model_path="NVEagle/Eagle-X5-13B-Chat"), + "Eagle-X5-34B-Chat": partial(vlm.Eagle, model_path="NVEagle/Eagle-X5-34B-Chat"), + "Eagle-X5-34B-Plus": partial(vlm.Eagle, model_path="NVEagle/Eagle-X5-34B-Plus"), +} + +moondream_series = { + "Moondream1": partial(vlm.Moondream1, model_path="vikhyatk/moondream1"), + "Moondream2": partial(vlm.Moondream2, model_path="vikhyatk/moondream2"), + "Moondream3": partial(vlm.Moondream3, model_path="moondream/moondream3-preview"), +} + +llama_series = { + "Llama-3.2-11B-Vision-Instruct": partial( + vlm.llama_vision, model_path="meta-llama/Llama-3.2-11B-Vision-Instruct" + ), + "LLaVA-CoT": partial(vlm.llama_vision, model_path="Xkev/Llama-3.2V-11B-cot"), + "Llama-3.2-90B-Vision-Instruct": partial( + vlm.llama_vision, model_path="meta-llama/Llama-3.2-90B-Vision-Instruct" + ), + "Llama-4-Scout-17B-16E-Instruct": partial( + vlm.llama4, model_path="meta-llama/Llama-4-Scout-17B-16E-Instruct", use_vllm=True + ), +} + +molmo_series = { + "molmoE-1B-0924": partial(vlm.molmo, model_path="allenai/MolmoE-1B-0924"), + "molmo-7B-D-0924": partial(vlm.molmo, model_path="allenai/Molmo-7B-D-0924"), + "molmo-7B-O-0924": partial(vlm.molmo, model_path="allenai/Molmo-7B-O-0924"), + "molmo-72B-0924": partial(vlm.molmo, model_path="allenai/Molmo-72B-0924"), +} + +kosmos_series = { + "Kosmos2": partial(vlm.Kosmos2, 
model_path="microsoft/kosmos-2-patch14-224") +} + +points_series = { + "POINTS-Yi-1.5-9B-Chat": partial( + vlm.POINTS, model_path="WePOINTS/POINTS-Yi-1-5-9B-Chat" + ), + "POINTS-Qwen-2.5-7B-Chat": partial( + vlm.POINTS, model_path="WePOINTS/POINTS-Qwen-2-5-7B-Chat" + ), + "POINTSV15-Qwen-2.5-7B-Chat": partial( + vlm.POINTSV15, model_path="WePOINTS/POINTS-1-5-Qwen-2-5-7B-Chat" + ), +} + +nvlm_series = { + "NVLM": partial(vlm.NVLM, model_path="nvidia/NVLM-D-72B"), + "NVLM-D-72B_api": partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='NVLM-D-72B', + top_p=1.0, + max_new_tokens=32768, + retry=6, + timeout=1800, + ), +} + +vintern_series = { + "Vintern-3B-beta": partial(vlm.VinternChat, model_path="5CD-AI/Vintern-3B-beta"), + "Vintern-1B-v2": partial(vlm.VinternChat, model_path="5CD-AI/Vintern-1B-v2"), +} + +aria_series = {"Aria": partial(vlm.Aria, model_path="rhymes-ai/Aria")} + +h2ovl_series = { + "h2ovl-mississippi-2b": partial(vlm.H2OVLChat, model_path="h2oai/h2ovl-mississippi-2b"), + "h2ovl-mississippi-1b": partial( + vlm.H2OVLChat, model_path="h2oai/h2ovl-mississippi-800m" + ), +} + +valley_series = { + "valley2": partial( + vlm.Valley2Chat, model_path="bytedance-research/Valley-Eagle-7B" + ), + "valley2_dpo": partial( + vlm.Valley2Chat, model_path="bytedance-research/Valley2-DPO" + ), + "valley2.5": partial( + vlm.Valley3Chat, use_gthinker_thinking=True, model_path="bytedance-research/Valley2.5" + ), +} + +ola_series = { + "ola": partial(vlm.Ola, model_path="THUdyh/Ola-7b"), +} + +xvl_series = { + "X-VL-4B": partial(vlm.X_VL_HF, model_path="YannQi/X-VL-4B", temperature=0, retry=10), +} + +ross_series = { + "ross-qwen2-7b": partial(vlm.Ross, model_path="HaochenWang/ross-qwen2-7b"), +} + +ursa_series = { + "URSA-8B": partial(vlm.UrsaChat, model_path="URSA-MATH/URSA-8B"), + "URSA-8B-PS-GRPO": partial(vlm.UrsaChat, model_path="URSA-MATH/URSA-8B-PS-GRPO") +} + +gemma_series = { + "paligemma-3b-mix-448": partial( + vlm.PaliGemma, 
model_path="google/paligemma-3b-mix-448" + ), + + # 3B + "paligemma2-3b-pt-224": partial(vlm.PaliGemma, model_path="google/paligemma2-3b-pt-224"), + "paligemma2-3b-pt-448": partial(vlm.PaliGemma, model_path="google/paligemma2-3b-pt-448"), + "paligemma2-3b-mix-224": partial(vlm.PaliGemma, model_path="google/paligemma2-3b-mix-224"), + "paligemma2-3b-mix-448": partial(vlm.PaliGemma, model_path="google/paligemma2-3b-mix-448"), + + # 10B + "paligemma2-10b-pt-224": partial(vlm.PaliGemma, model_path="google/paligemma2-10b-pt-224"), + "paligemma2-10b-pt-448": partial(vlm.PaliGemma, model_path="google/paligemma2-10b-pt-448"), + "paligemma2-10b-mix-224": partial(vlm.PaliGemma, model_path="google/paligemma2-10b-mix-224"), + "paligemma2-10b-mix-448": partial(vlm.PaliGemma, model_path="google/paligemma2-10b-mix-448"), + + # 28B + "paligemma2-28b-pt-224": partial(vlm.PaliGemma, model_path="google/paligemma2-28b-pt-224"), + "paligemma2-28b-pt-448": partial(vlm.PaliGemma, model_path="google/paligemma2-28b-pt-448"), + "paligemma2-28b-mix-224": partial(vlm.PaliGemma, model_path="google/paligemma2-28b-mix-224"), + "paligemma2-28b-mix-448": partial(vlm.PaliGemma, model_path="google/paligemma2-28b-mix-448"), + + 'Gemma3-4B': partial(vlm.Gemma3, model_path='google/gemma-3-4b-it'), + 'Gemma3-12B': partial(vlm.Gemma3, model_path='google/gemma-3-12b-it'), + 'Gemma3-27B': partial(vlm.Gemma3, model_path='google/gemma-3-27b-it') +} + +aguvis_series = { + "aguvis_7b": partial( + vlm.Qwen2VLChatAguvis, + model_path=os.getenv( + "EVAL_MODEL", + "xlangai/Aguvis-7B-720P", + ), + min_pixels=256 * 28 * 28, + max_pixels=46 * 26 * 28 * 28, + use_custom_prompt=False, + mode='grounding', + ) +} + +kimi_series = { + 'Kimi-VL-A3B-Thinking': partial(vlm.KimiVL, model_path='moonshotai/Kimi-VL-A3B-Thinking'), + 'Kimi-VL-A3B-Instruct': partial(vlm.KimiVL, model_path='moonshotai/Kimi-VL-A3B-Instruct'), + 'Kimi-VL-A3B-Thinking-2506': partial(vlm.KimiVL, model_path='moonshotai/Kimi-VL-A3B-Thinking-2506', 
temperature=0.8, max_tokens=32768, extract_summary=True), + 'Kimi-K2.5-api': partial( + api.LMDeployAPI, + api_base="http://0.0.0.0:8000/v1/chat/completions", + model='Kimi-K2.5', + temperature=1.0, + top_p=0.95, + max_tokens=32768, + retry=6, + timeout=1800, + ) +} + +flash_vl = { + 'Flash-VL-2B-Dynamic-ISS': partial(vlm.FlashVL, model_path='FlashVL/FlashVL-2B-Dynamic-ISS') +} + + +oryx_series = { + 'oryx': partial(vlm.Oryx, model_path="THUdyh/Oryx-1.5-7B"), +} + +# recommend: vllm serve moonshotai/Kimi-VL-A3B-Thinking-2506 +# --served-model-name api-kimi-vl-thinking-2506 --trust-remote-code +# --tensor-parallel-size 2 --max-num-batched-tokens 131072 +# --max-model-len 131072 --limit-mm-per-prompt image=256 +kimi_vllm_series = { + "api-kimi-vl-thinking-2506": partial( + api.KimiVLAPI, + model="api-kimi-vl-thinking-2506", + ), + "api-kimi-vl-thinking": partial( + api.KimiVLAPI, + model="api-kimi-vl-thinking", + ), + "api-kimi-vl": partial( + api.KimiVLAPI, + model="api-kimi-vl", + max_new_tokens=2048, + temperature=0, + ), +} + + +treevgr_series = { + 'TreeVGR-7B': partial( + vlm.TreeVGR, + model_path='HaochenWang/TreeVGR-7B', + min_pixels=1280*28*28, max_pixels=16384*28*28, + ), +} + +# QTuneVL series +qtunevl_series = { + "QTuneVL1_5-2B": partial( + vlm.QTuneVLChat, model_path="hanchaow/QTuneVL1_5-2B", version="V1.5" + ), + + "QTuneVL1_5-3B": partial( + vlm.QTuneVL, + model_path="hanchaow/QTuneVL1_5-3B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=True, + post_process=True + ), +} + +# RbdashMM series via lmdeploy API +rbdashmm_api_series_lmdeploy = { + "rbdashmm3_DPO_38B_api": partial( + api.RBdashMMChat3_API, + api_base="http://0.0.0.0:23333/v1/chat/completions", + temperature=0, + retry=3, + timeout=600 + ), + "rbdashmm3_5_DPO_38B_api": partial( + api.RBdashChat3_5_API, + api_base="http://0.0.0.0:23333/v1/chat/completions", + temperature=0, + retry=3, + timeout=600 + ), + "rbdashmm3_5_38B_api": partial( + 
api.RBdashMMChat3_5_38B_API, + api_base="http://0.0.0.0:23333/v1/chat/completions", + temperature=0, + retry=3, + timeout=600 + ), + "rbdashmm3_78B_api": partial( + api.RBdashMMChat3_78B_API, + api_base="http://0.0.0.0:23333/v1/chat/completions", + temperature=0, + retry=3, + timeout=600 + ) +} + +logics_series = { + "Logics-Thinking-8B": partial(vlm.Logics_Thinking,model_path='Logics-MLLM/Logics-Thinking-8B'), + "Logics-Thinking-32B": partial(vlm.Logics_Thinking,model_path='Logics-MLLM/Logics-Thinking-32B'), +} + +insight_v_series = { + "insightv": partial(vlm.InsightV, pretrained_reason="THUdyh/Insight-V-Reason-LLaMA3", pretrained_summary="THUdyh/Insight-V-Summary-LLaMA3"), +} + +cosmos_series = { + 'Cosmos-Reason1-7B': partial(vlm.Cosmos, model_path='nvidia/Cosmos-Reason1-7B', use_vllm=True), +} + +keye_series = { + "Keye-VL-1.5-8B-auto":partial(vlm.KeyeChat, model_path="Kwai-Keye/Keye-VL-1_5-8B"), + "Keye-VL-1.5-8B-think":partial(vlm.KeyeChat, model_path="Kwai-Keye/Keye-VL-1_5-8B", think=True), + "Keye-VL-1.5-8B-nothink":partial(vlm.KeyeChat, model_path="Kwai-Keye/Keye-VL-1_5-8B", no_think=True), + "Keye-VL-8B-Preview-think":partial(vlm.KeyeChat, model_path="Kwai-Keye/Keye-VL-8B-Preview", think=True), +} + +qianfanvl_series = { + 'Qianfan-VL-3B': partial(vlm.Qianfan_VL, model_path='baidu/Qianfan-VL-3B'), + 'Qianfan-VL-8B': partial(vlm.Qianfan_VL, model_path='baidu/Qianfan-VL-8B'), + 'Qianfan-VL-70B': partial(vlm.Qianfan_VL, model_path='baidu/Qianfan-VL-70B'), +} + +lfm2vl_series = { + "LFM2-VL-450M": partial(vlm.LFM2VL, model_path="LiquidAI/LFM2-VL-450M"), + "LFM2-VL-1.6B": partial(vlm.LFM2VL, model_path="LiquidAI/LFM2-VL-1.6B"), + "LFM2-VL-3B": partial(vlm.LFM2VL, model_path="LiquidAI/LFM2-VL-3B"), + "LFM2.5-VL-1.6B": partial(vlm.LFM2VL, model_path="LiquidAI/LFM2.5-VL-1.6B"), +} + +covt_series = { + "CoVT-7B-seg": partial( + vlm.CoVTChat, + model_path="Wakals/CoVT-7B-seg", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + 
), + "CoVT-7B-depth": partial( + vlm.CoVTChat, + model_path="Wakals/CoVT-7B-depth", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "CoVT-7B-seg_depth_dino": partial( + vlm.CoVTChat, + model_path="Wakals/CoVT-7B-seg_depth_dino", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "CoVT-7B-seg_depth_dino_edge": partial( + vlm.CoVTChat, + model_path="Wakals/CoVT-7B-seg_depth_dino_edge", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), +} + +bagel_series = { + "BAGEL-7B-MoT": partial(vlm.Bagel, model_path='ByteDance-Seed/BAGEL-7B-MoT'), +} + +spatial_related_models = { + # 3B models + "MindCube-Qwen2.5VL-RawQA-SFT": partial( + vlm.Qwen2VLChat, + model_path="MLL-Lab/MindCube-Qwen2.5VL-RawQA-SFT", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "MindCube-Qwen2.5VL-Aug-CGMap-FFR-Out-SFT": partial( + vlm.Qwen2VLChat, + model_path="MLL-Lab/MindCube-Qwen2.5VL-Aug-CGMap-FFR-Out-SFT", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "MindCube-Qwen2.5VL-Plain-CGMap-FFR-Out-SFT": partial( + vlm.Qwen2VLChat, + model_path="MLL-Lab/MindCube-Qwen2.5VL-Plain-CGMap-FFR-Out-SFT", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "SpatialLadder-3B": partial( + vlm.Qwen2VLChat, + model_path="hongxingli/SpatialLadder-3B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "Spatial-MLLM-subset-sft": partial( + vlm.SpatialMLLM, + model_path="Diankun/Spatial-MLLM-subset-sft", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + max_num_frames=16, + use_custom_prompt=False, + post_process=True, + ), + "VST-3B-SFT": partial( + vlm.Qwen2VLChat, + model_path="rayruiyang/VST-3B-SFT", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + + # 7B models + 
"SpaceR-SFT-7B": partial( + vlm.Qwen2VLChat, + model_path="RUBBISHLIKE/SpaceR-SFT-7B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "ViLaSR": partial( + vlm.Qwen2VLChat, + model_path="inclusionAI/ViLaSR", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "VST-7B-SFT": partial( + vlm.Qwen2VLChat, + model_path="rayruiyang/VST-7B-SFT", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "VLM-3R": partial( + vlm.VLM3R, + model_path="Journey9ni/vlm-3r-llava-qwen2-lora", + ), +} + +sensenova_si_series = { + # SenseNova-SI-1.0 series + "SenseNova-SI-InternVL3-2B": partial( + vlm.InternVLChat, + model_path="sensenova/SenseNova-SI-InternVL3-2B", + use_custom_prompt=False, + version="V2.0" + ), + "SenseNova-SI-InternVL3-8B": partial( + vlm.InternVLChat, + model_path="sensenova/SenseNova-SI-InternVL3-8B", + use_custom_prompt=False, + version="V2.0" + ), + # SenseNova-SI-1.1 series + "SenseNova-SI-1.1-InternVL3-2B": partial( + vlm.InternVLChat, + model_path="sensenova/SenseNova-SI-1.1-InternVL3-2B", + use_custom_prompt=False, + version="V2.0" + ), + "SenseNova-SI-1.1-InternVL3-8B": partial( + vlm.InternVLChat, + model_path="sensenova/SenseNova-SI-1.1-InternVL3-8B", + use_custom_prompt=False, + version="V2.0" + ), + "SenseNova-SI-1.1-Qwen2.5-VL-3B": partial( + vlm.Qwen2VLChat, + model_path="sensenova/SenseNova-SI-1.1-Qwen2.5-VL-3B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "SenseNova-SI-1.1-Qwen2.5-VL-7B": partial( + vlm.Qwen2VLChat, + model_path="sensenova/SenseNova-SI-1.1-Qwen2.5-VL-7B", + min_pixels=1280 * 28 * 28, + max_pixels=16384 * 28 * 28, + use_custom_prompt=False, + ), + "SenseNova-SI-1.1-Qwen3-VL-8B": partial( + vlm.Qwen3VLChat, + model_path="sensenova/SenseNova-SI-1.1-Qwen3-VL-8B", + use_custom_prompt=False, + ), + "SenseNova-SI-1.1-BAGEL-7B-MoT": partial( + vlm.Bagel, + 
model_path='sensenova/SenseNova-SI-1.1-BAGEL-7B-MoT' + ), + # SenseNova-SI-1.2 series + "SenseNova-SI-1.2-InternVL3-8B": partial( + vlm.InternVLChat, + model_path="sensenova/SenseNova-SI-1.2-InternVL3-8B", + use_custom_prompt=False, + version="V2.0" + ), + # SenseNova-SI-1.3 series + "SenseNova-SI-1.3-InternVL3-8B": partial( + vlm.InternVLChat, + model_path="sensenova/SenseNova-SI-1.3-InternVL3-8B", + use_custom_prompt=False, + version="V2.0" + ), +} + +internvl_groups = [ + internvl, internvl2, internvl2_5, mini_internvl, internvl2_5_mpo, + internvl3, internvl3_5 +] +internvl_series = {} +for group in internvl_groups: + internvl_series.update(group) + +interns1_groups = [ + interns1_mini +] +interns1_series = {} +for group in interns1_groups: + interns1_series.update(group) + +supported_VLM = {} + +model_groups = [ + ungrouped, o1_apis, api_models, xtuner_series, qwen_series, llava_series, granite_vision_series, + internvl_series, yivl_series, xcomposer_series, minigpt4_series, + idefics_series, instructblip_series, deepseekvl_series, deepseekvl2_series, deepseekocr_series, + janus_series, minicpm_series, cogvlm_series, wemm_series, cambrian_series, + chameleon_series, video_models, ovis_series, vila_series, mantis_series, + mmalaya_series, phi3_series, phi4_series, xgen_mm_series, qwen2vl_series, qwen3vl_series, qwen3_5_series, + slime_series, eagle_series, moondream_series, llama_series, molmo_series, + kosmos_series, points_series, nvlm_series, vintern_series, h2ovl_series, + aria_series, smolvlm_series, sail_series, valley_series, vita_series, + ross_series, emu_series, ola_series, ursa_series, gemma_series, + long_vita_series, ristretto_series, kimi_series, aguvis_series, hawkvl_series, + flash_vl, kimi_vllm_series, oryx_series, treevgr_series, varco_vision_series, qtunevl_series, + xvl_series, thyme_series, logics_series, cosmos_series, keye_series, qianfanvl_series, + lfm2vl_series, rbdashmm_api_series_lmdeploy, interns1_series, insight_v_series, 
covt_series +] + +# add by EASI team +model_groups.extend([bagel_series, spatial_related_models, sensenova_si_series]) + +for grp in model_groups: + supported_VLM.update(grp) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/__init__.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/cg_av_counting.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/cg_av_counting.py new file mode 100644 index 00000000..6ebcdae6 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/cg_av_counting.py @@ -0,0 +1,415 @@ +import json +import os +import os.path as osp +from pathlib import Path + +import numpy as np +import pandas as pd +import portalocker +from huggingface_hub import snapshot_download +from PIL import Image + +from vlmeval.smp import (dump, get_cache_path, get_file_extension, get_intermediate_file_path, + load, md5, modelscope_flag_set) +from ..utils.cgbench import post_process, unzip_hf_zip +from ..video_base import VideoBaseDataset +from .utils import get_timestampes, rating_func + + +class CGAVCounting(VideoBaseDataset): + + dataset = "CG-AV-Counting" + + TYPE = "Video-Counting" + + MD5 = "d1cd8486353ab85178098d443264a7d0" + + SYS = "" + + def __init__( + self, + dataset="CG-AV-Counting", + use_frame_time=False, + nframe=0, + fps=-1, + ): + super().__init__(dataset=dataset, nframe=nframe, fps=fps) + self.use_frame_time = use_frame_time + self.dataset_name = dataset + self.frame_tmpl_clue = 'frame-{}.jpg' + + @classmethod + def supported_datasets(cls): + return ["CGAVCounting"] + + def frame_paths_clue(self, video, timestamp_list): + frame_root = osp.join(self.frame_root, video) + os.makedirs(frame_root, exist_ok=True) + return [osp.join(frame_root, self.frame_tmpl_clue.format(i)) for i in timestamp_list] + + def 
save_video_frames_clue(self, video, uid, timestamp_list): + if type(uid) is not str: + uid = str(uid) + import decord + frame_paths = self.frame_paths_clue(uid, timestamp_list) + flag = np.all([osp.exists(p) for p in frame_paths]) + if flag: + frame = Image.open(frame_paths[0]) + return frame_paths, frame.width, frame.height + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + frames = [] + # 获取视频的帧率 + fps = vid.get_avg_fps() + lock_path = osp.splitext(vid_path)[0] + '.lock' + with portalocker.Lock(lock_path, 'w', timeout=30): + for timestamp_sec in timestamp_list: + # 计算视频帧对应的索引 + frame_idx = int(timestamp_sec * fps) + + # 获取对应帧 + frame = vid[frame_idx] + + # 将帧转换为PIL图像 + img = Image.fromarray(frame.asnumpy()) + frames.append(img) + for im, pth in zip(frames, frame_paths): + if not osp.exists(pth): + im.save(pth) + return frame_paths, frames[0].width, frames[0].height + + def format_time(self, t): + return f"{t:.2f}" + + def get_output_filename(self, item): + video_id = Path(item["video"]).stem + start_str = self.format_time(item["query_interval"][0]) + end_str = self.format_time(item["query_interval"][1]) + return f"{video_id}_{start_str}_{end_str}.mp4" + + def prepare_dataset(self, dataset_name="CG-AV-Counting", repo_id="CG-Bench/CG-AV-Counting"): + + def check_integrity(pth): + data_file = osp.join(pth, f"{dataset_name}.tsv") + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + data = load(data_file) + for video_pth in data["video"]: + if not osp.exists(osp.join(pth, video_pth)): + return False + return True + + cache_path = get_cache_path(repo_id) + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + + def generate_tsv(pth): + + tsv_file = osp.join(pth, f"{dataset_name}.tsv") + + task_modes = ["long_acc", "ref_acc", "clue_acc"] + all_data = [] + for task_mode in task_modes: + with open(osp.join(pth, "cg-av-counting.json"), "r") as 
f: + data_file = pd.DataFrame(json.load(f)) + + data_file = data_file.assign(index=range(len(data_file))) + data_file["video_uid"] = data_file["video"].replace(".mp4", "") + data_file["video"] = data_file["video"].apply(lambda x: f"cg_videos_720p/{x}") + + data_file["ref_video_path"] = "" + data_file["ref_video_uid"] = "" + + if task_mode in ["ref_acc"]: + data_file["ref_video_path"] = data_file.apply( + lambda row: f"ref_videos/{self.get_output_filename(row)}", axis=1 + ) + data_file["ref_video_uid"] = data_file["ref_video_path"].apply( + lambda x: x.split("/")[-1].replace(".mp4", "")) + + data_file["task_mode"] = task_mode + + if task_mode == "clue_acc": + data_file["answer"] = data_file["clue"].apply(json.dumps) + + data_file = data_file[ + [ + "index", + "video_uid", + "video", + "ref_video_path", + "ref_video_uid", + "question", + "answer", + "type", + "category", + "task_mode" + ] + ] + + all_data.append(data_file) + + final_data = pd.concat(all_data, ignore_index=True) + final_data["index"] = range(len(final_data)) + final_data.to_csv(tsv_file, sep="\t", index=False) + dataset_path = cache_path + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + + dataset_path = dataset_snapshot_download(dataset_id=repo_id) + else: + dataset_path = snapshot_download(repo_id=repo_id, repo_type="dataset") + + unzip_hf_zip(dataset_path) + + generate_tsv(dataset_path) + + tsv_file = osp.join(dataset_path, f"{dataset_name}.tsv") + + return dict(data_file=tsv_file, root=dataset_path) + + def build_prompt(self, line, video_llm): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + task_mode = line["task_mode"] + assert task_mode in ["long_acc", "clue_acc", "ref_acc"] + if task_mode == "long_acc": + user_prompt = "" + message = [] + video_path = line["video"] + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + else: + image_paths, frame_indices, vid_fps = 
self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + + user_prompt += ( + f"Please answer the question '{line['question']}' with a number. Just output the number itself, " + "don't output anything else." + ) + message.append(dict(type="text", value=user_prompt)) + elif task_mode == "ref_acc": + user_prompt = "" + message = [] + video_path = line["ref_video_path"] + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["ref_video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + + if self.use_frame_time: + user_prompt += get_timestampes(frame_indices, vid_fps) + user_prompt += ( + f"Please answer the question '{line['question']}' with a number. Just output the number itself, " + "don't output anything else." + ) + message.append(dict(type="text", value=user_prompt)) + elif task_mode == "clue_acc": + if line["category"] == "event": + user_prompt = "" + message = [] + video_path = line["video"] + if video_llm: + message.append(dict(type="video", value=osp.join(self.data_root, video_path))) + else: + image_paths, frame_indices, vid_fps = self.save_video_frames( + video_path, uid=line["video_uid"], num_frames=self.nframe, fps=self.fps + ) + message.extend(dict(type="image", value=im) for im in image_paths) + user_prompt += get_timestampes(frame_indices, vid_fps) + + user_prompt += ( + f"Watch the video and provide your answer to the question '{line['question']}', " + "including the start and end timestamps for each event." + "Format your answer in JSON, enclosed in and tags. " + "The output should look like this: [[\"start_time\", \"end_time\"], ...]. 
" + "Ensure each timestamp is in seconds (e.g., 'xx.xx')." + ) + message.append(dict(type="text", value=user_prompt)) + elif line["category"] == "object": + user_prompt = "" + message = [] + video_path = line["video"] + clue_timestamp_list = [] + for clue in json.loads(line["answer"]): + if clue["timestamp"] not in clue_timestamp_list: + clue_timestamp_list.append(clue["timestamp"]) + image_paths, width, height = self.save_video_frames_clue( + video_path, uid=line["video_uid"], timestamp_list=clue_timestamp_list + ) + message.append( + dict(type="text", value=f"There are {len(image_paths)} frames in the size of {width}x{height}")) + for idx, im in enumerate(image_paths): + message.append(dict(type="text", value=f"Frame{idx + 1}:")) + message.append(dict(type="image", value=im)) + user_prompt += ( + f"Answer the question '{line['question']}', " + "including the bounding box for the query object in the first frame " + "where it appears. For subsequent frames where the object appears, " + "do not provide the bounding box again. " + "Format your answer in JSON, enclosed within and tags. " + "The output should look like this: " + "{\"Frame1\": [[x_min, y_min, x_max, y_max]], \"Frame2\": [...],...}. " + "In the output, each frame should either contain the bounding box of the object " + "(if it appears for the first time in that frame) or an empty list `[]` " + "(if the object does not appear or it has already been labeled in a previous frame). " + "Ensure that bounding boxes are listed as [x_min, y_min, x_max, y_max]." 
+ ) + message.append(dict(type="text", value=user_prompt)) + elif line["category"] == "attribute": + user_prompt = "" + message = [] + video_path = line["video"] + clue_timestamp_list = [] + for clue_ in json.loads(line["answer"]): + for clue in clue_: + if clue["timestamp"] not in clue_timestamp_list: + clue_timestamp_list.append(clue["timestamp"]) + image_paths, width, height = self.save_video_frames_clue( + video_path, uid=line["video_uid"], timestamp_list=clue_timestamp_list + ) + message.append(dict( + type="text", + value=f"There are {len(image_paths)} frames in the size of {width}x{height}")) + for idx, im in enumerate(image_paths): + message.append(dict(type="text", value=f"Frame{idx + 1}:")) + message.append(dict(type="image", value=im)) + user_prompt += ( + f"Answer the question '{line['question']}', clustering the objects according to the question. " + "For each unique cluster, assign a unique label and return the bounding box for each object in " + "the first frame where it appears. For subsequent frames where the object appears, " + "do not output anything. " + "Format your answer in JSON, enclosed within and tags. " + "The output should look like this: " + "{\"Frame 1\": [{\"bbox\": [x_min, y_min, x_max, y_max], 'label': \"Label 1\"}], " + "\"Frame 2\": [...], ...}. " + "In the output, each frame should either contain the bounding box and label for the object " + "(if it appears for the first time in that frame) or an empty list `[]` " + "(if the object has already been labeled or does not appear in that frame). " + "The label should correspond to a unique object cluster according to the question." 
+ ) + message.append(dict(type="text", value=user_prompt)) + print(message) + return message + + def save_video_frames(self, video, uid, num_frames=8, fps=-1): + + if type(uid) is not str: + uid = str(uid) + import decord + vid_path = osp.join(self.data_root, video) + vid = decord.VideoReader(vid_path) + vid_fps = vid.get_avg_fps() + n_frames = len(vid) + + if num_frames > 0 and fps < 0: + step_size = len(vid) / (num_frames + 1) + indices = [int(i * step_size) for i in range(1, num_frames + 1)] + + frame_paths = self.frame_paths(uid) + elif fps > 0: + total_duration = n_frames / vid_fps + required_frames = int(total_duration * fps) + step_size = vid_fps / fps + indices = [int(i * step_size) for i in range(required_frames)] + frame_paths = self.frame_paths_fps(uid, len(indices)) + + # Save and validate frames + valid_paths = [] + valid_indices = [] + lock_path = osp.splitext(vid_path)[0] + '.lock' + with portalocker.Lock(lock_path, 'w', timeout=30): + if not np.all([osp.exists(p) for p in frame_paths]): + images = [vid[i].asnumpy() for i in indices] + for i, (img_array, path) in enumerate(zip(images, frame_paths)): + if osp.exists(path): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + try: + img = Image.fromarray(img_array) + img.save(path) + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + else: + for i, path in enumerate(frame_paths): + try: + with Image.open(path) as img: + img.verify() + valid_paths.append(path) + valid_indices.append(indices[i]) + except Exception: + continue + + return valid_paths, valid_indices, vid_fps + + def evaluate(self, eval_file, **judge_kwargs): + + assert get_file_extension(eval_file) in ['xlsx', 'json', 'tsv'], \ + 'data file should be an supported format (xlsx/json/tsv) file' + + tgt_file = get_intermediate_file_path(eval_file, '_rating', 'json') + score_file = 
get_intermediate_file_path(eval_file, '_score', 'csv') + + data = load(eval_file) + + data_un = data[~pd.isna(data["prediction"])] + data_pred_na = data[pd.isna(data["prediction"])] + + data_pred_na["score"] = -1 + + scores_df = data_un.apply( + lambda row: post_process( + response=row["prediction"], + right_answer=row["answer"], + task_mode=row["task_mode"], + category=row["category"] + ), + axis=1, + result_type='expand' + ) + + data_un = pd.concat([data_un, scores_df], axis=1) + + data = pd.concat([data_pred_na, data_un]) + + rejected_count = (data["score"] == -1).sum() + + print( + f"Among {len(data)} questions, " + f"failed to obtain prediction for {len(data_pred_na)} questions, " + f"failed to obtain the score for {rejected_count - len(data_pred_na)} questions. " + f"Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating." + ) + + dump(data, score_file) + + rating = rating_func(score_file) + + dump(rating, tgt_file) + + return rating diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/requirements.txt b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/requirements.txt new file mode 100644 index 00000000..3c5cbc61 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/requirements.txt @@ -0,0 +1,2 @@ +scipy +word2number diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/utils.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/utils.py new file mode 100644 index 00000000..353ff838 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/CGAVCounting/utils.py @@ -0,0 +1,423 @@ +import json +import math +import re +import signal +import zipfile +from pathlib import Path + +import numpy as np +from tqdm import tqdm + +from vlmeval.smp import load + + +def rating_func(data_path): + df = load(data_path) + + task_mode_fields = { + "long_acc": ["acc", "oboa", "mae", "rmse"], + 
"ref_acc": ["acc", "oboa", "mae", "rmse"], + "clue_acc": ["wcs", "ifa"], + } + + rating = {} + + for task_mode, fields in task_mode_fields.items(): + sub_df = df[df["task_mode"] == task_mode] + for field in fields: + values = sub_df[field] + if field == "rmse": + # RMSE: sqrt(mean(x^2)) + rmse_val = np.sqrt(values.mean()) + rating[f"{task_mode}/rmse"] = round(rmse_val, 4) + else: + rating[f"{task_mode}/{field}"] = round(values.mean(), 4) + + return rating + + +def get_timestampes(frame_indices, fps): + seconds = list(map(lambda x: str(round(x / fps, 4)), frame_indices)) + timestamps = ", ".join(seconds) + return "A total of {frame_num} frames are sampled. Their corresponding timestamps are:\n\n{timestamps}\n\n".format( + frame_num=len(frame_indices), timestamps=timestamps + ) + + +def time_str_to_seconds(time_str: str) -> float: + time_str = time_str.strip() + if '.' in time_str: + time_main, milliseconds = time_str.split('.') + milliseconds = float(f"0.{milliseconds}") + else: + time_main = time_str + milliseconds = 0.0 + + parts = list(map(int, time_main.split(":"))) + + if len(parts) == 2: + minutes, seconds = parts + total_seconds = minutes * 60 + seconds + elif len(parts) == 3: + hours, minutes, seconds = parts + total_seconds = hours * 3600 + minutes * 60 + seconds + else: + raise ValueError(f"Invalid time format: {time_str}") + + return total_seconds + milliseconds + + +def extract_outer_json(text): + stack = [] + start_idx = None + opening = {'{': '}', '[': ']'} + closing = {'}': '{', ']': '['} + + for i, char in enumerate(text): + if char in opening: + if not stack: + start_idx = i # 最外层起点 + stack.append(char) + elif char in closing: + if stack and stack[-1] == closing[char]: + stack.pop() + if not stack and start_idx is not None: + candidate = text[start_idx:i + 1] + try: + return json.dumps(json.loads(candidate)) + except json.JSONDecodeError: + continue # 尝试下一个 JSON 块 + return None + + +def compute_tiou(t1, t2): + """Temporal IoU""" + inter_start = 
max(t1[0], t2[0]) + inter_end = min(t1[1], t2[1]) + inter = max(0.0, inter_end - inter_start) + union = max(t1[1], t2[1]) - min(t1[0], t2[0]) + return inter / union if union > 0 else 0.0 + + +def compute_sIoU(box1, box2): + """ + Complete IoU (sIoU) between two bounding boxes. + Args: + box1 (list or np.array): [x1, y1, x2, y2] of ground truth box + box2 (list or np.array): [x1, y1, x2, y2] of predicted box + + Returns: + IoU (float): The IoU score between the two boxes. + """ + + # Ensure the coordinates are ordered: [min_x, min_y, max_x, max_y] + box1 = np.array([min(box1[0], box1[2]), min(box1[1], box1[3]), + max(box1[0], box1[2]), max(box1[1], box1[3])]) + box2 = np.array([min(box2[0], box2[2]), min(box2[1], box2[3]), + max(box2[0], box2[2]), max(box2[1], box2[3])]) + + # Compute the intersection area + inter_x1 = max(box1[0], box2[0]) + inter_y1 = max(box1[1], box2[1]) + inter_x2 = min(box1[2], box2[2]) + inter_y2 = min(box1[3], box2[3]) + + inter_area = max(0, inter_x2 - inter_x1) * max(0, inter_y2 - inter_y1) + + # Compute areas of the individual boxes + area1 = (box1[2] - box1[0]) * (box1[3] - box1[1]) + area2 = (box2[2] - box2[0]) * (box2[3] - box2[1]) + + # Compute union area + union = area1 + area2 - inter_area + iou = inter_area / union if union > 0 else 0.0 + + return iou + + +def greedy_matching(gt_instances, pred_instances, iou_func): + """Greedy matching based on maximum IoU""" + unmatched_gt = set(range(len(gt_instances))) + unmatched_pred = set(range(len(pred_instances))) + matches = [] + + while unmatched_gt and unmatched_pred: + max_iou = -1 + best_match = None + for gt_idx in unmatched_gt: + for pred_idx in unmatched_pred: + iou = iou_func(gt_instances[gt_idx], pred_instances[pred_idx]) + if iou > max_iou: + max_iou = iou + best_match = (gt_idx, pred_idx) + + if best_match: + gt_idx, pred_idx = best_match + matches.append((gt_idx, pred_idx)) + unmatched_gt.remove(gt_idx) + unmatched_pred.remove(pred_idx) + + return matches + + +def 
compute_cluster_pair_wcs(gt, pred, iou_type): + if iou_type == 'tIoU': + loc_sum = 0.0 + for g in gt: + loc_sum += max([compute_tiou(g, p) for p in pred] or [0.0]) + loc_acc = loc_sum / len(gt) if gt else 0.0 + count_penalty = 1.0 - abs(len(pred) - len(gt)) / max(len(gt), 1) + # count_penalty = 1.0 + return math.sqrt(loc_acc * max(0, count_penalty)) + + elif iou_type == 'sIoU': + # group by frame index + from collections import defaultdict + gt_by_f = defaultdict(list) + pred_by_f = defaultdict(list) + for f, box in gt: + gt_by_f[f].append(box) + for f, box in pred: + pred_by_f[f].append(box) + + all_f = set(gt_by_f) | set(pred_by_f) + wcs = 0.0 + for f in all_f: + gt_f = gt_by_f.get(f, []) + pred_f = pred_by_f.get(f, []) + matches = greedy_matching(gt_f, pred_f, compute_sIoU) + loc_sum = sum([compute_sIoU(gt_f[i], pred_f[j]) for i, j in matches]) + loc_acc = loc_sum / len(gt_f) if gt_f else 0.0 + count_penalty = 1.0 - abs(len(pred_f) - len(gt_f)) / max(len(gt_f), 1) + # count_penalty = 1.0 + wcs += math.sqrt(loc_acc * max(0, count_penalty)) + return wcs / max(len(all_f), 1) + + else: + raise ValueError("Unsupported iou_type") + + +class TimeoutException(Exception): + pass + + +def timeout_handler(signum, frame): + raise TimeoutException("Function execution exceeded the time limit.") + + +def compute_wcs_unlabeled(gt_clusters, pred_clusters, iou_type='tIoU', + timeout=10): # 主要是给attribute用的,但是object和event视作一个cluster也能用 + from scipy.optimize import linear_sum_assignment + + # Set the timeout signal handler + signal.signal(signal.SIGALRM, timeout_handler) + signal.alarm(timeout) # Set the alarm to go off in 'timeout' seconds + + try: + # Original function logic + K = len(gt_clusters) + M = len(pred_clusters) + + # Build cost matrix (we want max score → min cost) + score_matrix = np.zeros((K, M)) + for i in range(K): + for j in range(M): + score_matrix[i, j] = compute_cluster_pair_wcs(gt_clusters[i], pred_clusters[j], iou_type) + + cost_matrix = -score_matrix # 
maximize score → minimize cost + + row_ind, col_ind = linear_sum_assignment(cost_matrix) + + matched_scores = [score_matrix[i, j] for i, j in zip(row_ind, col_ind)] + + # WCS = average over gt clusters (including unmatched = 0) + total_wcs = sum(matched_scores) + return total_wcs / K + + except TimeoutException: + print(gt_clusters, pred_clusters) + print("Function execution exceeded the time limit.") + return None # or you can return some default value to indicate timeout + + finally: + signal.alarm(0) # Cancel the alarm after the function completes or times out + + +def post_process(response, right_answer, task_mode, category): + from word2number import w2n + if task_mode in ["long_acc", "ref_acc"]: + result = {"acc": 0, "oboa": 0, "mae": 0, "rmse": 0} + if response: + try: + pred = w2n.word_to_num(response) + except Exception: + pred = 0 + if abs(float(right_answer) - float(pred)) <= 1e-5: + result["acc"] = 1 + + if abs(float(right_answer) - float(pred)) <= 1: + result["oboa"] = 1 + + if abs(float(right_answer) - float(pred)) <= max(2 * float(right_answer), 100): + result["mae"] = abs(float(right_answer) - float(pred)) + result["rmse"] = abs(float(right_answer) - float(pred)) ** 2 + else: + result["mae"] = abs(float(right_answer) * 2) + result["rmse"] = abs(float(right_answer) * 2) ** 2 + elif task_mode == "clue_acc": + result = {"wcs": 0, "ifa": 0} + if response: + clues = json.loads(right_answer) + content_match = re.search(r"(.*?)", response, re.DOTALL) + student_answer = content_match.group(1).strip() if content_match else response.strip() + j = None + try: + try: + j = json.loads(student_answer) + except Exception: + j = json.loads(extract_outer_json(student_answer)) + except Exception: + pass + if j is not None: + try: + if category == "event": + pred = [] + for e in j: + + if isinstance(e[0], str) and isinstance(e[1], str) and ":" in e[0] and ":" in e[1]: + pred.append([time_str_to_seconds(e[0]), time_str_to_seconds(e[1])]) + else: + 
pred.append([float(e[0].split(" ")[0]) if isinstance(e[0], str) else e[0], + float(e[1].split(" ")[0]) if isinstance(e[1], str) else e[1]]) + gt = [] + for e in clues: + gt.append([float(e['start']), float(e['end'])]) + + result["wcs"] = compute_wcs_unlabeled([gt], [pred], "tIoU") + result["ifa"] = 1 + elif category == "object": + gt = [] + clue_timestamp_list = [] + for clue in clues: + if clue["timestamp"] not in clue_timestamp_list: + clue_timestamp_list.append(clue["timestamp"]) + for clue in clues: + gt.append((clue_timestamp_list.index(clue["timestamp"]), clue['bbox'])) + pred = [] + for key in j.keys(): + if "Frame" not in key: + continue + idx = int(key.replace("Frame", "")) - 1 + if len(j[key]) == 0: + continue + if isinstance(j[key][0], list) and len(j[key][0]) == 4: + for e in j[key]: + if isinstance(e, list) and len(e) == 4: + pred.append((idx, e)) + elif isinstance(j[key][0], list) and len(j[key][0]) == 2: + for ii in range(int(len(j[key]) // 2)): + if isinstance(j[key][ii * 2], list) and len(j[key][ii * 2]) == 2 and isinstance( + j[key][ii * 2 + 1], list) and len(j[key][ii * 2 + 1]) == 2: + pred.append((idx, [j[key][ii * 2][0], j[key][ii * 2][1], j[key][ii * 2 + 1][0], + j[key][ii * 2 + 1][1]])) + result["wcs"] = compute_wcs_unlabeled([gt], [pred], "sIoU") + result["ifa"] = 1 + elif category == "attribute": + gt = [] + clue_timestamp_list = [] + for clue_ in clues: + for clue in clue_: + if clue["timestamp"] not in clue_timestamp_list: + clue_timestamp_list.append(clue["timestamp"]) + for clue_ in clues: + gt_ = [] + for clue in clue_: + gt_.append((clue_timestamp_list.index(clue["timestamp"]), clue['bbox'])) + gt.append(gt_) + pred = {} + for key in j.keys(): + if "Frame" not in key: + continue + idx = int(key.replace("Frame", "")) - 1 + for e in j[key]: + if e['label'] not in pred.keys(): + pred[e['label']] = [] + if 'bbox' in e: + if isinstance(e['bbox'], list) and len(e['bbox']) == 4: + pred[e['label']].append((idx, e['bbox'])) + if 'bbox_2d' in 
e: + if isinstance(e['bbox_2d'], list) and len(e['bbox_2d']) == 4: + pred[e['label']].append((idx, e['bbox_2d'])) + pred_list = [pred[key] for key in pred] + result["wcs"] = compute_wcs_unlabeled(gt, pred_list, "sIoU") + result["ifa"] = 1 + except Exception: + pass + + return result + + +def get_chunk_number(filename): + try: + num = filename.split("chunk_")[1].split(".zip")[0] + return int(num) + except Exception: + return float('inf') + + +def auto_merge_and_unzip_parts(target_dir, extract_dir, zip_prefix=None): + target_dir = Path(target_dir) + extract_dir = Path(extract_dir) + extract_dir.mkdir(parents=True, exist_ok=True) + + # 匹配 zip 分卷:例如 video_chunk_001.zip.part000 + part_files = sorted(target_dir.glob("*.zip.part*")) + groups = {} + + # 分组:根据前缀提取 group 名(即 zip 文件名) + for part_file in part_files: + match = re.match(r"(.*\.zip)\.part\d+$", part_file.name) + if match: + zip_name = match.group(1) + if zip_prefix is None or Path(zip_name).stem.startswith(zip_prefix): + groups.setdefault(zip_name, []).append(part_file) + + if not groups: + print(f"No matching zip parts found with prefix: {zip_prefix}") + return + + # 合并每一组分卷 -> 解压 + for zip_name, parts in tqdm(groups.items(), desc="Merging and unzipping"): + parts = sorted(parts, key=lambda p: int(p.name.split("part")[-1])) + zip_path = target_dir / zip_name + + # 合并分卷 + with open(zip_path, 'wb') as outfile: + for part in parts: + with open(part, 'rb') as infile: + outfile.write(infile.read()) + + # 解压合并后的 zip 文件 + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(extract_dir) + + # 删除合并后的 zip 文件(可注释) + zip_path.unlink() + + +def unzip_hf_zip(target_dir): + target_dir = Path(target_dir) + + videos_dir = target_dir / "cg_videos_720p" + ref_videos_dir = target_dir / "ref_videos" + + if videos_dir.exists() and ref_videos_dir.exists(): + print("all target dirs exist, skip.") + return + + videos_dir.mkdir(parents=True, exist_ok=True) + + auto_merge_and_unzip_parts(target_dir, ref_videos_dir, 
zip_prefix="ref_videos") + auto_merge_and_unzip_parts(target_dir, videos_dir, zip_prefix="videos") + + print("sucessfully unzip all files.") diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/README.md b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/README.md new file mode 100644 index 00000000..34491189 --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/README.md @@ -0,0 +1,79 @@ +# EgoExoBench + +This is the official repository of [EgoExoBench: A +Benchmark for First- and Third-person View Video +Understanding in MLLMs]() + +## 📊 Benchmark Overview + +**EgoExoBench** is a large-scale benchmark designed to evaluate cross-view video understanding in multimodal large language models (MLLMs). It contains paired egocentric–exocentric videos and over **7,300 multiple-choice questions** across **11 subtasks**, covering three key dimensions of ego–exo reasoning: + +* **Ego-Exo Relation** +* **Ego-Exo View Transition** +* **Ego-Exo Temporal Reasoning** + +## 📝 Data Preparation + +### Video Data + +EgoExoBench builds upon six publicly available ego–exo datasets. + +* [Ego-Exo4D](https://ego-exo4d-data.org/) +* [LEMMA](https://sites.google.com/view/lemma-activity) +* [EgoExoLearn](https://huggingface.co/datasets/hyf015/EgoExoLearn) +* [TF2023](https://github.com/ziweizhao1993/PEN) +* [EgoMe](https://huggingface.co/datasets/HeqianQiu/EgoMe) +* [CVMHAT](https://github.com/RuizeHan/CVMHT) + +The script will automatically download the processed video data, **except Ego-Exo4D**, due to license restrictions. You need to manually download it from the [official website](https://ego-exo4d-data.org/) and organize it as shown below. 
+ +If you prefer to download all datasets manually, you can simply create empty `processed_videos/` and `processed_frames/` folders and organize the datasets in the following structure: + +``` +[LMUData]/videos/EgoExoBench +├── CVMHAT/ +│ └── data/ +├── EgoExo4D/ +│ └── takes/ +├── EgoExoLearn/ +├── EgoMe/ +├── LEMMA/ +├── TF2023/ +│ └── data/ +├── processed_frames/ +└── processed_videos/ +``` +### Multiple-Choice Questions (MCQs) + +The script will automatically download the EgoExoBench **multiple-choice questions (MCQs)** file from this [link](https://huggingface.co/datasets/Heleun/EgoExoBench_MCQ). + +## 🚀 Model Evaluation + +Use the following commands to evaluate your VLMs on EgoExoBench: + +```shell +# For lightweight vision-language models +torchrun --nproc-per-node=1 run.py \ + --data EgoExoBench_MCQ \ + --model Qwen2.5-VL-7B-Instruct-ForVideo + +# For larger models with higher memory usage +python run.py \ + --data EgoExoBench_MCQ \ + --model Qwen2.5-VL-72B-Instruct-ForVideo +``` + +To skip evaluation on the **Ego-Exo4D** portion of the benchmark, specify the `EgoExoBench_64frame_skip_EgoExo4D` configuration with the **`--data`** argument. + +``` +# Example command to skip Ego-Exo4D +torchrun --nproc-per-node=1 run.py \ + --data EgoExoBench_64frame_skip_EgoExo4D \ + --model [Your_Model_Name] +``` + +> 💡 Note: If you encounter errors related to stacking videos with varying frame counts, try using `transformers==4.49.0` as a temporary workaround. + +## 🙏 Acknowledgements + +EgoExoBench builds upon publicly available ego–exo datasets: [Ego-Exo4D](https://ego-exo4d-data.org/), [LEMMA](https://sites.google.com/view/lemma-activity), [EgoExoLearn](https://huggingface.co/datasets/hyf015/EgoExoLearn), [TF2023](https://github.com/ziweizhao1993/PEN), [EgoMe](https://huggingface.co/datasets/HeqianQiu/EgoMe), [CVMHAT](https://github.com/RuizeHan/CVMHT). Thanks for open-sourcing! 
diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/__init__.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/__init__.py new file mode 100644 index 00000000..f3fe93da --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/__init__.py @@ -0,0 +1 @@ +from .egoexobench import EgoExoBench_MCQ # noqa: F401 diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/cvmhat_preprocess.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/cvmhat_preprocess.py new file mode 100644 index 00000000..2045461a --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/cvmhat_preprocess.py @@ -0,0 +1,46 @@ +import json +import os +import xml.etree.ElementTree as ET + +import cv2 + +# replace with your actual path +ann_file = 'EgoExoBench/MCQ/Ego-Exo-Relation/person_relation.json' + + +def add_bbox(bbox_img_path): + bbox_dir = os.path.dirname(bbox_img_path) + os.makedirs(bbox_dir, exist_ok=True) + ori_img_dir = os.path.dirname(bbox_img_path).replace('bbox', 'frame_sel') + frame_idx, person_id = os.path.basename(bbox_img_path).split('.')[0].split('_') + ori_img_path = os.path.join(ori_img_dir, frame_idx + '.jpg') + xml_file = ori_img_path.replace('data', 'GT_xml').replace('frame_sel/', '').replace('.jpg', '.xml') + + tree = ET.parse(xml_file) + root = tree.getroot() + im = cv2.imread(ori_img_path) + for object in root.findall('object'): + object_name = object.find('name').text + if object_name != person_id: + continue + im_copy = im.copy() + Xmin = int(object.find('rectangle').find('xmin').text) + Ymin = int(object.find('rectangle').find('ymin').text) + Xmax = int(object.find('rectangle').find('xmax').text) + Ymax = int(object.find('rectangle').find('ymax').text) + color = (255, 0, 0) + cv2.rectangle(im_copy, (Xmin, Ymin), (Xmax, Ymax), color, 3) + cv2.imwrite(bbox_img_path, im_copy) + return + + +with open(ann_file, 'r') as f: + 
ann_data = json.load(f) + for aitem in ann_data.values(): + image_paths = [] + image_paths.extend(aitem['query']['image_paths']) + for oitem in aitem['options']: + image_paths.extend(oitem['image_paths']) + + for image_path in image_paths: + add_bbox(image_path) diff --git a/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/egoexobench.py b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/egoexobench.py new file mode 100644 index 00000000..1d37651c --- /dev/null +++ b/evaluation/cosmos3/reasoner/vlmevalkit/vlmeval/dataset/EgoExoBench/egoexobench.py @@ -0,0 +1,305 @@ +import glob +import os +import os.path as osp +import re +import shutil +import warnings + +import numpy as np +import pandas as pd +import torch +import torchvision.transforms as T +from huggingface_hub import snapshot_download +from PIL import Image +from torchvision import transforms + +from vlmeval.smp import (dump, get_cache_path, get_file_extension, get_intermediate_file_path, + load, md5) +from vlmeval.smp.file import LMUDataRoot +from ..utils import DEBUG_MESSAGE, build_judge +from ..video_base import VideoBaseDataset +from .utils import Stack, ToTorchFormatTensor + +FAIL_MSG = 'Failed to obtain answer via API.' 
+ + +class EgoExoBench_MCQ(VideoBaseDataset): + MD5 = '9c0aa8da235d766d02dd7e9a19182719' + TYPE = 'Video-MCQ' + DEFAULT_JUDGE = ['chatgpt-0125', 'gpt-4-0125'] + + def __init__(self, dataset='EgoExoBench_MCQ', nframe=64, skip_EgoExo4D=False): + super().__init__(dataset=dataset, nframe=nframe) + self.frame_fps = 2 + self.skip_EgoExo4D = skip_EgoExo4D + + @classmethod + def supported_datasets(cls): + return ['EgoExoBench_MCQ'] + + def prepare_dataset(self, dataset_name='EgoExoBench_MCQ', repo_id='Heleun/EgoExoBench_MCQ', video_repo_id='onlyfaces/EgoExoBench'): # noqa: E501 + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not osp.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + + return True + cache_path = get_cache_path(repo_id) + self.video_root = os.path.join(LMUDataRoot(), 'videos', 'EgoExoBench') + os.makedirs(self.video_root, exist_ok=True) + if not osp.exists(osp.join(self.video_root, 'processed_videos')) or not osp.exists(osp.join(self.video_root, 'processed_frames')): # noqa: E501 + snapshot_download( + repo_id=video_repo_id, + repo_type='dataset', + allow_patterns=['*.tar.gz.part*'], + local_dir=self.video_root + ) + + def combine_and_extract(root_dir, prefix, remove_parts=True): + parts_pattern = osp.join(root_dir, f'{prefix}.tar.gz.part*') + combined_archive = osp.join(root_dir, f'{prefix}.tar.gz') + if not osp.exists(combined_archive): + parts = sorted(glob.glob(parts_pattern)) + with open(combined_archive, 'wb') as outfile: + for part in parts: + with open(part, 'rb') as infile: + shutil.copyfileobj(infile, outfile) + shutil.unpack_archive(combined_archive, root_dir) + if remove_parts: + for part in parts: + os.remove(part) + os.remove(combined_archive) + + combine_and_extract(self.video_root, 'processed_videos') + combine_and_extract(self.video_root, 'processed_frames') + + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + dataset_path 
= snapshot_download(repo_id=repo_id, repo_type='dataset') + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + + # transform + self.transform = T.Compose([ + Stack(), + ToTorchFormatTensor() + ]) + + return dict(root=dataset_path, data_file=data_file) + + def get_index(self, bound, fps, max_frame, first_idx=0, num_segments=16): + start, end = bound if bound else (-100000, 100000) + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = (end_idx - start_idx) / num_segments + mid_seg_size = seg_size / 2 + indices = np.arange(num_segments) + frame_indices = start_idx + mid_seg_size + np.round(seg_size * indices) + return frame_indices.astype(int) + + def load_into_video_and_process(self, media, mcq_idx): + try: + from moviepy.editor import ImageSequenceClip, VideoFileClip + except Exception: + raise ImportError( + 'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"' + ) + video_root = self.video_root + if media['type'] in ['image']: + original_image_path = osp.join(video_root, media['image_paths'][0]) + processed_video_path = osp.join(video_root, 'processed_videos', f'{mcq_idx}.jpg') + if not os.path.exists(processed_video_path): + shutil.copy(original_image_path, processed_video_path) + return dict(type='image', value=processed_video_path) + elif media['type'] in ['frames']: + input_images = [osp.join(video_root, im) for im in media['image_paths']] + processed_video_path = osp.join(video_root, 'processed_videos', f'{mcq_idx}.mp4') + media['nframes'] = len(input_images) // 2 * 2 + if not os.path.exists(processed_video_path): + # using MoviePy to transform images into mp4 + image_files = sorted(input_images) + image_clip = ImageSequenceClip(image_files, fps=self.frame_fps) + image_clip.write_videofile(processed_video_path, codec='libx264') + image_clip.close() + elif media['type'] in ['video']: + original_video_path = osp.join(video_root, media['video_path']) + 
processed_video_path = osp.join(video_root, 'processed_videos', f'{mcq_idx}.mp4') + if 'video_start' in media and 'video_end' in media and media['video_start'] is not None and media['video_end'] is not None: # noqa: E501 + video_start, video_end = media['video_start'], media['video_end'] + if not os.path.exists(processed_video_path): + video_clip = VideoFileClip(original_video_path) + clip = video_clip.subclip(video_start, min(video_end, video_clip.duration)) + clip.write_videofile(processed_video_path) + clip.close() + else: + if not os.path.exists(processed_video_path): + shutil.copy(original_video_path, processed_video_path) + else: + raise ValueError(f"Unsupported media type: {media['type']}") + + return dict(type='video', value=processed_video_path, nframes=media.get('nframes', 8)) + + def save_video_into_images(self, media, mcq_idx): + bound = None + video_root = self.video_root + + if media['type'] in ['frames', 'image']: + media_paths = [osp.join(video_root, im) for im in media['image_paths']] + save_dir = osp.join(video_root, 'processed_frames', str(mcq_idx)) + os.makedirs(save_dir, exist_ok=True) + input_images = [] + for media_path in media_paths: + img_path = media_path.split('/')[-1] + save_image_path = osp.join(save_dir, img_path) + shutil.copy(media_path, save_image_path) + input_images.append(save_image_path) + return input_images + + if 'video_start' in media and 'video_end' in media and media['video_start'] is not None and media['video_end'] is not None: # noqa: E501 + bound = ( + media['video_start'], media['video_end'] + ) + video_path = os.path.join(video_root, media['video_path']) + + def read_video(video_path, bound=None, num_segments=16): + from decord import VideoReader, cpu + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + + images_group = list() + frame_indices = self.get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments) + save_dir = 
osp.join(video_root, 'processed_frames', str(mcq_idx)) + + if osp.exists(save_dir) and len(os.listdir(save_dir)) > 0: + return None, frame_indices + + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()) + images_group.append(img) + torch_imgs = self.transform(images_group) + return torch_imgs, frame_indices + + def save_video_frames(imgs, video_root, frame_indices, mcq_idx): + save_dir = osp.join(video_root, 'processed_frames', str(mcq_idx)) + os.makedirs(save_dir, exist_ok=True) + frame_paths = [osp.join(save_dir, f'{fidx:07d}.jpg') for fidx in frame_indices] + + flag = np.all([osp.exists(pth) for pth in frame_paths]) + + if not flag: + block_size = imgs.size(0) // len(frame_indices) + split_tensors = torch.split(imgs, block_size) + to_pil = transforms.ToPILImage() + images = [to_pil(arr) for arr in split_tensors] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + torch_imgs, frame_indices = read_video(video_path, bound, media['nframes']) + img_frame_paths = save_video_frames(torch_imgs, video_root, frame_indices, mcq_idx) + return img_frame_paths + + def process_text_and_media(self, text, media_list, video_llm, mcq_idx): + + message = [] + chunks = re.split(r'(|