From 9c2591967070228783a06c146a70d801e79b1c97 Mon Sep 17 00:00:00 2001 From: Wei Du Date: Wed, 24 Jun 2026 21:25:09 -0700 Subject: [PATCH] Add apex_2025 benchmark Signed-off-by: Wei Du --- benchmarks/apex_2025/README.md | 51 ++++++++++++++++++++++++++ benchmarks/apex_2025/__init__.py | 0 benchmarks/apex_2025/config.yaml | 24 +++++++++++++ benchmarks/apex_2025/data/.gitignore | 1 + benchmarks/apex_2025/prepare.py | 54 ++++++++++++++++++++++++++++ 5 files changed, 130 insertions(+) create mode 100644 benchmarks/apex_2025/README.md create mode 100644 benchmarks/apex_2025/__init__.py create mode 100644 benchmarks/apex_2025/config.yaml create mode 100644 benchmarks/apex_2025/data/.gitignore create mode 100644 benchmarks/apex_2025/prepare.py diff --git a/benchmarks/apex_2025/README.md b/benchmarks/apex_2025/README.md new file mode 100644 index 0000000000..d02e89d28c --- /dev/null +++ b/benchmarks/apex_2025/README.md @@ -0,0 +1,51 @@ +# APEX 2025 + +Math problems from MathArena's APEX 2025 benchmark, sourced from +`MathArena/apex_2025` on HuggingFace. This benchmark is intended as a +newer alternative to `apex_shortlist`. + +## Verification + +Reuses the `math_with_judge` resource server in **symbolic-only** mode +(`should_use_judge: false`) to mirror NeMo Skills' `eval_type=math` +default for this benchmark. The HuggingFace `math-verify` library does +symbolic equivalence of the model-extracted `\boxed{...}` answer against +`expected_answer`. + +## Prompt + +User-only prompt, character-for-character match with NeMo Skills' +`generic/math.yaml`: + +``` +Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{}. + + +``` + +## Data preparation + +```bash +ng_prepare_benchmark '+config_paths=[benchmarks/apex_2025/config.yaml]' +``` + +Writes `data/apex_2025_benchmark.jsonl` with one row per problem: +`{"problem_idx": 1, "source": "...", "question": "...", "expected_answer": "..."}`. 
+ +## Running servers + +```bash +config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\ +benchmarks/apex_2025/config.yaml" +ng_run "+config_paths=[$config_paths]" +``` + +## Collecting rollouts + +```bash +ng_collect_rollouts \ + +agent_name=apex_2025_math_with_judge_simple_agent \ + +input_jsonl_fpath=benchmarks/apex_2025/data/apex_2025_benchmark.jsonl \ + +output_jsonl_fpath=results/apex_2025_rollouts.jsonl \ + +num_repeats=4 +``` diff --git a/benchmarks/apex_2025/__init__.py b/benchmarks/apex_2025/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/benchmarks/apex_2025/config.yaml b/benchmarks/apex_2025/config.yaml new file mode 100644 index 0000000000..bf134c2657 --- /dev/null +++ b/benchmarks/apex_2025/config.yaml @@ -0,0 +1,24 @@ +# Chain to existing resource server + agent config +config_paths: + - resources_servers/math_with_judge/configs/math_with_judge.yaml + +# We use `_inherit_from` directives to inherit from the generic config above rather than +# using it directly, which keeps this benchmark config isolated. 
+apex_2025_math_with_judge_resources_server: + _inherit_from: math_with_judge + resources_servers: + math_with_judge: + should_use_judge: false + +apex_2025_math_with_judge_simple_agent: + _inherit_from: math_with_judge_simple_agent + responses_api_agents: + simple_agent: + resources_server: + name: apex_2025_math_with_judge_resources_server + datasets: + - name: apex_2025 + type: benchmark + jsonl_fpath: benchmarks/apex_2025/data/apex_2025_benchmark.jsonl + prompt_config: benchmarks/prompts/generic/math.yaml + prepare_script: benchmarks/apex_2025/prepare.py diff --git a/benchmarks/apex_2025/data/.gitignore b/benchmarks/apex_2025/data/.gitignore new file mode 100644 index 0000000000..b06d45fe69 --- /dev/null +++ b/benchmarks/apex_2025/data/.gitignore @@ -0,0 +1 @@ +*benchmark.jsonl diff --git a/benchmarks/apex_2025/prepare.py b/benchmarks/apex_2025/prepare.py new file mode 100644 index 0000000000..14ee1671e2 --- /dev/null +++ b/benchmarks/apex_2025/prepare.py @@ -0,0 +1,54 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
"""Prepare the APEX 2025 benchmark data.

Running this module as a script calls :func:`prepare`, which writes the
benchmark JSONL file under this package's ``data`` directory.
"""

import json
from pathlib import Path

from datasets import load_dataset


BENCHMARK_DIR = Path(__file__).parent
DATA_DIR = BENCHMARK_DIR / "data"
OUTPUT_FPATH = DATA_DIR / "apex_2025_benchmark.jsonl"

HF_REPO_ID = "MathArena/apex_2025"


def _to_benchmark_row(record) -> dict:
    """Map one source dataset record onto the benchmark row layout."""
    # Key order is significant: it fixes the field order in the emitted JSON.
    return {
        "problem_idx": record["problem_idx"],
        "source": record["source"],
        "question": record["problem"],
        # The answer is always serialized as a string.
        "expected_answer": str(record["answer"]),
    }


def prepare() -> Path:
    """Fetch the APEX 2025 dataset and dump it as JSON Lines.

    Creates ``DATA_DIR`` if needed, loads the ``train`` split of
    ``HF_REPO_ID``, and writes one JSON object per dataset row to
    ``OUTPUT_FPATH`` (overwriting any existing file).

    Returns:
        The path of the written JSONL file.
    """
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    print(f"Loading APEX 2025 data from {HF_REPO_ID}...")
    dataset = load_dataset(HF_REPO_ID, split="train")

    # Stays 0 when the dataset yields no rows; otherwise ends as the row count.
    written = 0
    with OUTPUT_FPATH.open("w", encoding="utf-8") as sink:
        for written, record in enumerate(dataset, start=1):
            line = json.dumps(_to_benchmark_row(record), ensure_ascii=False)
            sink.write(line + "\n")

    print(f"Wrote {written} problems to {OUTPUT_FPATH}")
    return OUTPUT_FPATH


if __name__ == "__main__":
    prepare()