From 9c2591967070228783a06c146a70d801e79b1c97 Mon Sep 17 00:00:00 2001
From: Wei Du <wedu@nvidia.com>
Date: Wed, 24 Jun 2026 21:25:09 -0700
Subject: [PATCH] Add apex_2025 benchmark

Signed-off-by: Wei Du <wedu@nvidia.com>
---
 benchmarks/apex_2025/README.md       | 51 ++++++++++++++++++++++++++
 benchmarks/apex_2025/__init__.py     |  0
 benchmarks/apex_2025/config.yaml     | 24 +++++++++++++
 benchmarks/apex_2025/data/.gitignore |  1 +
 benchmarks/apex_2025/prepare.py      | 54 ++++++++++++++++++++++++++++
 5 files changed, 130 insertions(+)
 create mode 100644 benchmarks/apex_2025/README.md
 create mode 100644 benchmarks/apex_2025/__init__.py
 create mode 100644 benchmarks/apex_2025/config.yaml
 create mode 100644 benchmarks/apex_2025/data/.gitignore
 create mode 100644 benchmarks/apex_2025/prepare.py
diff --git a/benchmarks/apex_2025/README.md b/benchmarks/apex_2025/README.md
new file mode 100644
index 0000000000..d02e89d28c
--- /dev/null
+++ b/benchmarks/apex_2025/README.md
@@ -0,0 +1,51 @@
+# APEX 2025
+
+Math problems from MathArena's APEX 2025 benchmark, sourced from
+`MathArena/apex_2025` on HuggingFace. This benchmark is intended as a
+newer alternative to `apex_shortlist`.
+
+## Verification
+
+Reuses the `math_with_judge` resource server in **symbolic-only** mode
+(`should_use_judge: false`) to mirror NeMo Skills' `eval_type=math`
+default for this benchmark. The HuggingFace `math-verify` library checks
+the model-extracted `\boxed{...}` answer for symbolic equivalence with
+`expected_answer`.
+
+## Prompt
+
+User-only prompt, character-for-character match with NeMo Skills'
+`generic/math.yaml`:
+
+```
+Solve the following math problem. Make sure to put the answer (and only answer) inside \boxed{}.
+
+<question>
+```
+
+## Data preparation
+
+```bash
+ng_prepare_benchmark '+config_paths=[benchmarks/apex_2025/config.yaml]'
+```
+
+Writes `benchmarks/apex_2025/data/apex_2025_benchmark.jsonl` with one row per problem:
+`{"problem_idx": 1, "source": "...", "question": "...", "expected_answer": "..."}`.
+
+## Running servers
+
+```bash
+config_paths="responses_api_models/vllm_model/configs/vllm_model.yaml,\
+benchmarks/apex_2025/config.yaml"
+ng_run "+config_paths=[$config_paths]"
+```
+
+## Collecting rollouts
+
+```bash
+ng_collect_rollouts \
+    +agent_name=apex_2025_math_with_judge_simple_agent \
+    +input_jsonl_fpath=benchmarks/apex_2025/data/apex_2025_benchmark.jsonl \
+    +output_jsonl_fpath=results/apex_2025_rollouts.jsonl \
+    +num_repeats=4
+```
diff --git a/benchmarks/apex_2025/__init__.py b/benchmarks/apex_2025/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/benchmarks/apex_2025/config.yaml b/benchmarks/apex_2025/config.yaml
new file mode 100644
index 0000000000..bf134c2657
--- /dev/null
+++ b/benchmarks/apex_2025/config.yaml
@@ -0,0 +1,24 @@
+# Chain to existing resource server + agent config
+config_paths:
+  - resources_servers/math_with_judge/configs/math_with_judge.yaml
+
+# Use `_inherit_from` to derive benchmark-specific entries rather than using the
+# generic config above directly, so that this benchmark config stays isolated.
+apex_2025_math_with_judge_resources_server:
+  _inherit_from: math_with_judge
+  resources_servers:
+    math_with_judge:
+      should_use_judge: false
+
+apex_2025_math_with_judge_simple_agent:
+  _inherit_from: math_with_judge_simple_agent
+  responses_api_agents:
+    simple_agent:
+      resources_server:
+        name: apex_2025_math_with_judge_resources_server
+      datasets:
+      - name: apex_2025
+        type: benchmark
+        jsonl_fpath: benchmarks/apex_2025/data/apex_2025_benchmark.jsonl
+        prompt_config: benchmarks/prompts/generic/math.yaml
+        prepare_script: benchmarks/apex_2025/prepare.py
diff --git a/benchmarks/apex_2025/data/.gitignore b/benchmarks/apex_2025/data/.gitignore
new file mode 100644
index 0000000000..b06d45fe69
--- /dev/null
+++ b/benchmarks/apex_2025/data/.gitignore
@@ -0,0 +1 @@
+*benchmark.jsonl
diff --git a/benchmarks/apex_2025/prepare.py b/benchmarks/apex_2025/prepare.py
new file mode 100644
index 0000000000..14ee1671e2
--- /dev/null
+++ b/benchmarks/apex_2025/prepare.py
@@ -0,0 +1,54 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare the APEX 2025 benchmark data."""
+
+import json
+from pathlib import Path
+
+from datasets import load_dataset
+
+
+BENCHMARK_DIR = Path(__file__).parent  # directory containing this script
+DATA_DIR = BENCHMARK_DIR / "data"  # generated benchmark files (git-ignored via data/.gitignore)
+OUTPUT_FPATH = DATA_DIR / "apex_2025_benchmark.jsonl"  # matches `jsonl_fpath` in config.yaml
+
+HF_REPO_ID = "MathArena/apex_2025"  # HuggingFace dataset repository to load from
+
+
+def prepare() -> Path:
+    """Download and prepare APEX 2025 data. Returns the output file path."""
+    DATA_DIR.mkdir(parents=True, exist_ok=True)
+
+    print(f"Loading APEX 2025 data from {HF_REPO_ID}...")
+    ds = load_dataset(HF_REPO_ID, split="train")
+
+    count = 0
+    with open(OUTPUT_FPATH, "w", encoding="utf-8") as f:
+        for row in ds:
+            out = {
+                "problem_idx": row["problem_idx"],
+                "source": row["source"],
+                "question": row["problem"],
+                "expected_answer": str(row["answer"]),
+            }
+            f.write(json.dumps(out, ensure_ascii=False) + "\n")
+            count += 1
+
+    print(f"Wrote {count} problems to {OUTPUT_FPATH}")
+    return OUTPUT_FPATH
+
+
+if __name__ == "__main__":  # allow running this file directly as a script
+    prepare()