From 0985c39f66f3ec281cc0d29e0788e6fc72216220 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Wed, 17 Jun 2026 17:36:27 +0200
Subject: [PATCH 01/22] Add reasoning benchmark scaffold

---
 .gitmodules         |   4 +
 README.md           |   1 +
 reasoning/README.md | 196 ++++++++++++++++++++++++++++++++++++++++++++
 reasoning/RL        |   1 +
 4 files changed, 202 insertions(+)
 create mode 100644 reasoning/README.md
 create mode 160000 reasoning/RL
diff --git a/.gitmodules b/.gitmodules
index 51d8eac03..10f6e78cc 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,3 +6,7 @@
 	path = text_to_image/torchtitan
 	url = https://github.com/pytorch/torchtitan.git
 	branch = mlperf-training-flux.1
+[submodule "reasoning/RL"]
+	path = reasoning/RL
+	url = https://github.com/CarlosGomes98/RL.git
+	branch = mlperf-training
diff --git a/README.md b/README.md
index f3477e86f..f16c412a3 100644
--- a/README.md
+++ b/README.md
@@ -47,6 +47,7 @@ Some these benchmarks are rather slow or take a long time to run on the referenc
 | Model | reference implementation | framework* | dataset | model parameter count**
 | ---- | ---- | ---- | ---- | ----
 | flux.1 | [text_to_image](https://github.com/mlcommons/training/tree/master/text_to_image) | torchtitan | CC12M subset | 11.9B
+| qwen3_235b_a22b_swe_grpo | [reasoning](https://github.com/mlcommons/training/tree/master/reasoning) | NeMo-RL / NeMo-Gym | SWE tasks | 235B
 | llama3.1_8b | [small_llm_pretraining](https://github.com/mlcommons/training/tree/master/small_llm_pretraining) | NeMo | C4 | 8b
 | llama2_70b_lora | [llama2_70b_lora](https://github.com/mlcommons/training/tree/master/llama2_70b_lora) | pytorch | SCROLLS GovReport | 70B
 | llama3.1_405b | [large_language_model_pretraining](https://github.com/mlcommons/training/tree/master/large_language_model_pretraining) | NeMo | C4 | 405B
diff --git a/reasoning/README.md b/reasoning/README.md
new file mode 100644
index 000000000..071251c13
--- /dev/null
+++ b/reasoning/README.md
@@ -0,0 +1,196 @@
+# 1. Problem
+
+## Reasoning - GRPO with NeMo-Gym SWE/OpenHands.
+
+[NeMo-RL](https://github.com/CarlosGomes98/RL) provides the implementation used for this benchmark. The benchmark trains `Qwen/Qwen3-235B-A22B-Instruct-2507` with GRPO against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
+
+The relevant config files are under `RL/examples/nemo_gym`. The benchmark launch entrypoint is `RL/examples/nemo_gym/launch_nemo_gym_multinode_training.sh`, using `RL/examples/nemo_gym/grpo_qwen3_235b_swe_openhands_async.yaml`.
+
+Formal benchmark description: <to be completed>
+
+# 2. Directions
+
+## Steps to configure machine
+
+To use this repository, please ensure your system can run containers and has appropriate GPU support. The multinode launch script assumes a Slurm cluster with Pyxis/Enroot-style container support, Ray, and 8 GPUs per node.
+
+### Container setup
+
+The Dockerfile to build for this benchmark is the NeMo-RL v0.6.0 Gym overlay Dockerfile.
+
+```bash
+cd RL
+docker buildx build \
+  --platform <linux/amd64 or linux/arm64> \
+  -t <tag> \
+  -f docker/Dockerfile.gym_v0.6.0 \
+  .
+```
+
+The Dockerfile overlays the SWE/NeMo-Gym pieces on top of `nvcr.io/nvidia/nemo-rl:v0.6.0` and prefetches Gym virtual environments for `examples/nemo_gym/grpo_qwen3_235b_swe_openhands_async.yaml`.
+
+## Steps to download and verify data
+
+The run requires the following artifacts:
+
+| Artifact | Description | Status |
+|---|---|---|
+| Policy model | Host directory containing `Qwen/Qwen3-235B-A22B-Instruct-2507`, passed as `HF_CKPT_PATH` and mounted into the container | <to be completed> |
+| Megatron-Core checkpoint cache | Host directory for the HF-to-Megatron converted checkpoint cache, passed as `NRL_MEGATRON_CHECKPOINT_DIR` and mounted into the container | <to be completed> |
+| Training JSONL | Host path to NeMo-Gym SWE training tasks, passed as `NEMO_GYM_SWE_TRAIN_DATA_PATH` and mounted into the container | <to be completed> |
+| Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | <to be completed> |
+| Task containers | Host directory containing Apptainer/Singularity SIF images, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | <to be completed> |
+
+Dataset download commands: <to be completed>
+
+Dataset verification commands: <to be completed>
+
+### Model cache setup
+
+From outside the container, download the Hugging Face model into a host directory. The launcher requires `HF_CKPT_PATH` to point at this directory, then mounts it at `/inputs/nemo_gym/hf_ckpt` by default and passes that container path to the recipe as `policy.model_name`.
+
+```bash
+python -m venv .venv
+source .venv/bin/activate
+pip install huggingface_hub
+
+export HF_CKPT_PATH=$(pwd)/hf/Qwen/Qwen3-235B-A22B-Instruct-2507
+mkdir -p "$HF_CKPT_PATH"
+HF_TOKEN=<your hf token> hf download Qwen/Qwen3-235B-A22B-Instruct-2507 --local-dir "$HF_CKPT_PATH"
+```
+
+The launcher also creates and mounts a host Hugging Face cache. Set `HF_HOME` before launch if you want to use a cache outside `$(pwd)/.cache`.
+
+## Steps to run and time
+
+All steps below are assumed to be run from this `reasoning` directory on the host; `cd RL` enters the NeMo-RL submodule checkout. The launcher submits `ray.sub` and runs training from the checkout baked into the container at `/opt/nemo-rl` by default.
+
+```bash
+cd RL
+
+export REPO_LOCATION=$(pwd)
+export EXP_NAME=<experiment name>
+export CONTAINER_IMAGE_PATH=<container image path or tag>
+export SLURM_ACCOUNT=<account>
+export SLURM_PARTITION=<partition>
+export HF_CKPT_PATH=<host path to HF checkpoint directory. empty dir is ok, will be converted in the first run>
+export NRL_MEGATRON_CHECKPOINT_DIR=<host path to Megatron-Core checkpoint cache directory>
+export NEMO_GYM_SWE_TRAIN_DATA_PATH=<host path to training JSONL>
+export NEMO_GYM_SWE_VALIDATION_DATA_PATH=<host path to validation JSONL>
+export NEMO_GYM_SWE_SIF_DIR=<host directory containing SWE task SIF images>
+
+# Defaults are defined by the launcher and may be overridden here.
+export TRAIN_NODES=<number of training nodes>        # default: 16
+export GEN_NODES=<number of generation nodes>        # default: 24
+export SLURM_TIME=<walltime>                         # default: 1:0:0
+export RECIPE=examples/nemo_gym/grpo_qwen3_235b_swe_openhands_async.yaml
+
+# Optional extra mounts. The launcher automatically mounts the paths above.
+export EXTRA_MOUNTS=<host_path>:<container_path>[,<host_path>:<container_path>...]
+
+bash examples/nemo_gym/launch_nemo_gym_multinode_training.sh
+```
+
+The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAINER_REPO_LOCATION` to override the baked checkout path `/opt/nemo-rl`, `CONTAINER_INPUT_ROOT` and the `CONTAINER_*` path variables to override the stable container mount targets, `SLURM_EXCLUDE` to exclude problematic nodes, and `SLURM_COMMENT`/`SLURM_IDLE_EXEMPT_MINS` to customize the idle-GPU reaper exemption.
+
+# 3. Dataset/Environment
+
+### Publication/Attribution
+
+Dataset and task-container attribution: <to be completed>
+
+### Data preprocessing
+
+The recipe consumes prebuilt JSONL files through `NemoGymDataset`. Each row represents a software-engineering task for the NeMo-Gym environment. The exact transformation from the source task corpus into the benchmark JSONL format is <to be completed>.
+
+The environment also requires per-task SIF images. The recipe resolves task containers from `sif_dir` with these templates:
+
+```yaml
+- "${sif_dir}/swebench_sweb.eval.x86_64.{instance_id}.sif"
+- "${sif_dir}/swegym_sweb.eval.x86_64.{instance_id}.sif"
+- "${sif_dir}/r2egym_{instance_id}.sif"
+```
+
+### Training and test data separation
+
+The config uses separate training and validation JSONL files:
+
+```yaml
+policy:
+  model_name: ${oc.env:HF_CKPT_PATH}
+data:
+  train:
+    data_path: ${oc.env:NEMO_GYM_SWE_TRAIN_DATA_PATH}
+  validation:
+    data_path: ${oc.env:NEMO_GYM_SWE_VALIDATION_DATA_PATH}
+sif_dir: ${oc.env:NEMO_GYM_SWE_SIF_DIR}
+```
+
+The official split procedure is <to be completed>.
+
+### Training data order
+
+Training data order is preserved by the recipe with `data.shuffle: false`. The intended ordering or curriculum of the training JSONL is <to be completed>.
+
+### Test data order
+
+Validation data order is preserved by the recipe. The config uses `grpo.max_val_samples: null`, so validation thoroughness is inferred from the validation dataset size unless overridden.
+
+### Simulation environment (RL models only)
+
+The benchmark uses NeMo-Gym with the SWE/OpenHands agent configuration. Rollouts are collected through a vLLM-backed policy server, with OpenHands interacting with task containers via Apptainer/Singularity. The async recipe uses non-colocated generation and training, with one-step-stale trajectories corrected by importance sampling.
+
+# 4. Model
+
+### Publication/Attribution
+
+Model and implementation attribution: <to be completed>
+
+### List of layers
+
+| **Component** | **Architecture** | **Parameters** | **Technical Details** |
+|---------------|------------------|----------------|-----------------------|
+| **Policy model** | Qwen3 MoE Transformer | 235B total, 22B active | `Qwen/Qwen3-235B-A22B-Instruct-2507` |
+| **Training runtime** | Megatron-Core through NeMo-RL | Same policy weights | TP4 x CP4 x PP8 minimum training replica; expert model parallel size 16 |
+| **Generation runtime** | vLLM | Same policy weights | TP16, BF16, HTTP server exposed for NeMo-Gym |
+| **SWE environment** | NeMo-Gym + OpenHands | N/A | Agent max turns 15 |
+
+Exact layer-by-layer model description: <to be completed>
+
+### Weight and bias initialization
+
+Training starts from a pretrained Hugging Face checkpoint converted to Megatron-Core format. Random initialization is not used for the policy model. Any benchmark-specific initialization details are <to be completed>.
+
+### Loss function
+
+The recipe uses token-level GRPO with reward normalization and a leave-one-out baseline. Reference-policy KL is disabled (`reference_policy_kl_penalty: 0`), and the async recipe uses importance-sampling correction for one-step-stale rollouts.
+
+### Optimizer
+
+Adam with distributed optimizer state. The async recipe sets `lr: 5.0e-6`, `weight_decay: 0.0`, BF16 training, and FP32 optimizer parameters.
+
+### Randomness
+
+The recipe sets `grpo.seed: 42`. Additional determinism and environment-seeding requirements are <to be completed>.
+
+### Precision
+
+The recipe uses BF16 policy precision by default.
+
+# 5. Quality
+
+### Quality metric
+
+The checkpointing metric is `val:total_reward/mean`, computed from NeMo-Gym validation rollouts.
+
+### Quality target
+
+<to be completed>
+
+### Evaluation frequency
+
+The async recipe sets `grpo.val_period: 5`, `grpo.val_at_start: true`, and `grpo.val_at_end: true`.
+
+### Evaluation thoroughness
+
+The validation set size is <to be completed>.
diff --git a/reasoning/RL b/reasoning/RL
new file mode 160000
index 000000000..d141b7a21
--- /dev/null
+++ b/reasoning/RL
@@ -0,0 +1 @@
+Subproject commit d141b7a2130aa77e8bee0a50e6e3b1b33078536d

From 0a27166e9350d15092f77cb9c9dfab057a287064 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Wed, 17 Jun 2026 17:40:33 +0200
Subject: [PATCH 02/22] Remove setting seed

---
 reasoning/README.md | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 071251c13..29ea0ec09 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -169,10 +169,6 @@ The recipe uses token-level GRPO with reward normalization and a leave-one-out b
 
 Adam with distributed optimizer state. The async recipe sets `lr: 5.0e-6`, `weight_decay: 0.0`, BF16 training, and FP32 optimizer parameters.
 
-### Randomness
-
-The recipe sets `grpo.seed: 42`. Additional determinism and environment-seeding requirements are <to be completed>.
-
 ### Precision
 
 The recipe uses BF16 policy precision by default.

From ab73c222b94aeafa8d0b439cef3c43e4f3933baa Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Wed, 17 Jun 2026 17:41:21 +0200
Subject: [PATCH 03/22] Update reasoning submodule pointer

---
 reasoning/README.md | 8 ++++++--
 reasoning/RL        | 2 +-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 29ea0ec09..d7bce7208 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -73,12 +73,16 @@ export EXP_NAME=<experiment name>
 export CONTAINER_IMAGE_PATH=<container image path or tag>
 export SLURM_ACCOUNT=<account>
 export SLURM_PARTITION=<partition>
-export HF_CKPT_PATH=<host path to HF checkpoint directory. empty dir is ok, will be converted in the first run>
-export NRL_MEGATRON_CHECKPOINT_DIR=<host path to Megatron-Core checkpoint cache directory>
+export HF_CKPT_PATH=<host path to HF checkpoint directory>
+export NRL_MEGATRON_CHECKPOINT_DIR=<host path to Megatron-Core checkpoint cache directory>  # may be empty on first run
 export NEMO_GYM_SWE_TRAIN_DATA_PATH=<host path to training JSONL>
 export NEMO_GYM_SWE_VALIDATION_DATA_PATH=<host path to validation JSONL>
 export NEMO_GYM_SWE_SIF_DIR=<host directory containing SWE task SIF images>
 
+# Optional authentication/logging.
+export HF_TOKEN=<huggingface token>
+export WANDB_API_KEY=<wandb token>
+
 # Defaults are defined by the launcher and may be overridden here.
 export TRAIN_NODES=<number of training nodes>        # default: 16
 export GEN_NODES=<number of generation nodes>        # default: 24
diff --git a/reasoning/RL b/reasoning/RL
index d141b7a21..34dbecc08 160000
--- a/reasoning/RL
+++ b/reasoning/RL
@@ -1 +1 @@
-Subproject commit d141b7a2130aa77e8bee0a50e6e3b1b33078536d
+Subproject commit 34dbecc085abc1543079f2a4b38a35a1a3a721e1

From 743c959b813861cd186ac5f8638095aab1012011 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Wed, 17 Jun 2026 17:52:49 +0200
Subject: [PATCH 04/22] Document GPUs per node for reasoning

---
 reasoning/README.md | 3 ++-
 reasoning/RL        | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index d7bce7208..955d22e3d 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -12,7 +12,7 @@ Formal benchmark description: <to be completed>
 
 ## Steps to configure machine
 
-To use this repository, please ensure your system can run containers and has appropriate GPU support. The multinode launch script assumes a Slurm cluster with Pyxis/Enroot-style container support, Ray, and 8 GPUs per node.
+To use this repository, please ensure your system can run containers and has appropriate GPU support.
 
 ### Container setup
 
@@ -73,6 +73,7 @@ export EXP_NAME=<experiment name>
 export CONTAINER_IMAGE_PATH=<container image path or tag>
 export SLURM_ACCOUNT=<account>
 export SLURM_PARTITION=<partition>
+export GPUS_PER_NODE=<number of GPUs per Slurm node>
 export HF_CKPT_PATH=<host path to HF checkpoint directory>
 export NRL_MEGATRON_CHECKPOINT_DIR=<host path to Megatron-Core checkpoint cache directory>  # may be empty on first run
 export NEMO_GYM_SWE_TRAIN_DATA_PATH=<host path to training JSONL>
diff --git a/reasoning/RL b/reasoning/RL
index 34dbecc08..e91c5badf 160000
--- a/reasoning/RL
+++ b/reasoning/RL
@@ -1 +1 @@
-Subproject commit 34dbecc085abc1543079f2a4b38a35a1a3a721e1
+Subproject commit e91c5badf74ee1c2d2b4e7285af3a8232aadea37

From 148377cf31ee85ba07c6035a11d5aceda95a474a Mon Sep 17 00:00:00 2001
From: Hristo Filaretov <hfilaretov@nvidia.com>
Date: Thu, 18 Jun 2026 09:41:54 +0200
Subject: [PATCH 05/22] add data processing instructions

---
 reasoning/README.md | 87 ++++++++++++++++++++++++++++++++++++++++-----
 reasoning/RL        |  2 +-
 2 files changed, 79 insertions(+), 10 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 955d22e3d..40506f63d 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -41,9 +41,26 @@ The run requires the following artifacts:
 | Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | <to be completed> |
 | Task containers | Host directory containing Apptainer/Singularity SIF images, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | <to be completed> |
 
-Dataset download commands: <to be completed>
+To download the training and validation JSONL files:
 
-Dataset verification commands: <to be completed>
+```bash
+; hf download hfilaretov/Benchmark-R2E-Gym-Easy --repo-type dataset --local-dir hfilaretov__Benchmark-R2E-Gym-Easy
+...
+
+; tree hfilaretov__Benchmark-R2E-Gym-Easy
+hfilaretov__Benchmark-R2E-Gym-Easy/
+├── mlperf_r2e_gym_easy_train.jsonl
+├── mlperf_r2e_gym_easy_val.jsonl
+└── README.md
+```
+
+The environment also requires per-task SIF images. The recipe resolves task containers from `sif_dir` with this template:
+
+```yaml
+- "${sif_dir}/r2egym_{instance_id}.sif"
+```
+
+The task containers have to be built and converted to SIF format; please see [Section 3](#data-preprocessing) below.
 
 ### Model cache setup
 
@@ -102,20 +119,72 @@ The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAI
 
 ### Publication/Attribution
 
-Dataset and task-container attribution: <to be completed>
+We use a subset of the [R2E-Gym/R2E-Gym-Subset](https://huggingface.co/datasets/R2E-Gym/R2E-Gym-Subset) dataset.
 
 ### Data preprocessing
 
-The recipe consumes prebuilt JSONL files through `NemoGymDataset`. Each row represents a software-engineering task for the NeMo-Gym environment. The exact transformation from the source task corpus into the benchmark JSONL format is <to be completed>.
+The recipe consumes prebuilt JSONL files through from [Benchmark-R2E-Gym-Easy](https://huggingface.co/datasets/hfilaretov/Benchmark-R2E-Gym-Easy).
+Each row represents a software-engineering task for the NeMo-Gym environment.
 
-The environment also requires per-task SIF images. The recipe resolves task containers from `sif_dir` with these templates:
+The JSONL files refer to SIF container files that need to be generated.
+This is a two-step process:
+1. Images are built from the repository and git revision specified in the dataset.
+2. These images are converted to SIF file format.
 
-```yaml
-- "${sif_dir}/swebench_sweb.eval.x86_64.{instance_id}.sif"
-- "${sif_dir}/swegym_sweb.eval.x86_64.{instance_id}.sif"
-- "${sif_dir}/r2egym_{instance_id}.sif"
+For x86_64 platforms, there are already pre-built images, so step 1 can be skipped.
+
+For arm64 platforms, we provide a script to build the images.
+
+You can build the container defined in `./dataset-processing-container` that already pre-packages all necessary dependencies and can be used for both steps.
+
+You should be able to run the following:
+
+Prepare the builder image:
+
+```bash
+cd RL/docker/dataset-processing-container
+export REGISTRY=<your-container-registry>
+docker build --push -t $REGISTRY/grpo-data-builder:latest .
 ```
 
+Note: to build the dataset images within the builder image, you need to mount the Docker daemon socket inside the container.
+If you do not want to do that, please set up an environment equivalent to the builder image, and then run the scripts outside a container.
+
+To build the images and push them to a registry, ideally on a GB200 system to build natively:
+
+```bash
+export HF_TOKEN=<read-token-for-huggingface>
+export DOCKER_REGISTRY=<url-to-docker-registry>
+export DOCKER_TOKEN=<docker-registry-token>
+export DOCKER_USER=<docker-registry-username>
+export STATE_DIR=<path-to-persistent-storage>
+export MAX_WORKERS=<maximum-number-of-parallel-build-tasks>
+
+# R2E-Gym Easy subset
+docker run -it --rm \
+    -v /var/run/docker.sock:/var/run/docker.sock \
+    -v $STATE_DIR:/workspace/state \
+    -e DOCKER_REGISTRY -e DOCKER_TOKEN -e DOCKER_USER -e HF_TOKEN -e MAX_WORKERS \
+    $REGISTRY/grpo-data-builder:latest \
+    /workspace/run-r2e-gym-build-images.sh
+```
+
+To convert the images from the registry to SIF files:
+
+```bash
+export SIF_LOCAL_DIR=<local-directory-to-store-sif-containers>
+
+# SIF images, final dataset
+docker run -it --rm \
+    -v $SIF_LOCAL_DIR:/opt/data \
+    -e DOCKER_REGISTRY -e DOCKER_TOKEN -e DOCKER_USER -e HF_TOKEN -e MAX_WORKERS \
+    $REGISTRY/grpo-data-builder:latest \
+    /workspace/run-build-sif-images.sh
+```
+
+Please note that the above container will use its local storage to build the SIF files and then copy them over to your `SIF_LOCAL_DIR`.
+The number of parallel workers (`MAX_WORKERS`) you can use may therefore be limited by your available local storage.
+
 ### Training and test data separation
 
 The config uses separate training and validation JSONL files:
diff --git a/reasoning/RL b/reasoning/RL
index e91c5badf..85f36e0c3 160000
--- a/reasoning/RL
+++ b/reasoning/RL
@@ -1 +1 @@
-Subproject commit e91c5badf74ee1c2d2b4e7285af3a8232aadea37
+Subproject commit 85f36e0c3dd967ca05b1879228cbbf42a024ddcb

From 811010cdc372301b2a9bd98d6871702c314ae797 Mon Sep 17 00:00:00 2001
From: Hristo Filaretov <hfilaretov@nvidia.com>
Date: Thu, 18 Jun 2026 11:00:36 +0200
Subject: [PATCH 06/22] update readme

---
 reasoning/README.md | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 40506f63d..512c7188a 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -41,17 +41,19 @@ The run requires the following artifacts:
 | Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | <to be completed> |
 | Task containers | Host directory containing Apptainer/Singularity SIF images, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | <to be completed> |
 
-To download the training and validation JSONL files:
+To download the training and validation JSONL files using the HuggingFace CLI:
 
 ```bash
 ; hf download hfilaretov/Benchmark-R2E-Gym-Easy --repo-type dataset --local-dir hfilaretov__Benchmark-R2E-Gym-Easy
 ...
 
 ; tree hfilaretov__Benchmark-R2E-Gym-Easy
-hfilaretov__Benchmark-R2E-Gym-Easy/
-├── mlperf_r2e_gym_easy_train.jsonl
-├── mlperf_r2e_gym_easy_val.jsonl
+hfilaretov__Benchmark-R2E-Gym-Easy
+├── benchmark_r2e_gym_easy_train.jsonl
+├── benchmark_r2e_gym_easy_val.jsonl
 └── README.md
+
+1 directory, 3 files
 ```
 
 The environment also requires per-task SIF images. The recipe resolves task containers from `sif_dir` with this template:
@@ -143,8 +145,8 @@ Prepare the builder image:
 
 ```bash
 cd RL/docker/dataset-processing-container
-export REGISTRY=<your-container-registry>
-docker build --push -t $REGISTRY/grpo-data-builder:latest .
+export DOCKER_REGISTRY=<your-container-registry>
+docker build --push -t $DOCKER_REGISTRY/grpo-data-builder:latest .
 ```
 
 Note: to build the dataset images within the builder image, you need to mount the Docker daemon socket inside the container.
@@ -165,7 +167,7 @@ docker run -it --rm \
     -v /var/run/docker.sock:/var/run/docker.sock \
     -v $STATE_DIR:/workspace/state \
     -e DOCKER_REGISTRY -e DOCKER_TOKEN -e DOCKER_USER -e HF_TOKEN -e MAX_WORKERS \
-    $REGISTRY/grpo-data-builder:latest \
+    $DOCKER_REGISTRY/grpo-data-builder:latest \
     /workspace/run-r2e-gym-build-images.sh
 ```
 
@@ -178,7 +180,7 @@ export SIF_LOCAL_DIR=<local-directory-to-store-sif-containers>
 docker run -it --rm \
     -v $SIF_LOCAL_DIR:/opt/data \
     -e DOCKER_REGISTRY -e DOCKER_TOKEN -e DOCKER_USER -e HF_TOKEN -e MAX_WORKERS \
-    $REGISTRY/grpo-data-builder:latest \
+    $DOCKER_REGISTRY/grpo-data-builder:latest \
     /workspace/run-build-sif-images.sh
 ```
 

From 5612b772411c97e00dfd4d91143630a5047e26e8 Mon Sep 17 00:00:00 2001
From: Hristo Filaretov <hfilaretov@nvidia.com>
Date: Thu, 18 Jun 2026 11:02:32 +0200
Subject: [PATCH 07/22] update readme

---
 reasoning/README.md | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 512c7188a..bdefb9db5 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -133,10 +133,6 @@ This is a two-step process:
 1. Images are built from the repository and git revision specified in the dataset.
 2. These images are converted to SIF file format.
 
-For x86_64 platforms, there are already pre-built images, so step 1 can be skipped.
-
-For arm64 platforms, we provide a script to build the images.
-
 You can build the container defined in `./dataset-processing-container` that already pre-packages all necessary dependencies and can be used for both steps.
 
 You should be able to run the following:
@@ -152,7 +148,7 @@ docker build --push -t $DOCKER_REGISTRY/grpo-data-builder:latest .
 Note: to build the dataset images within the builder image, you need to mount the Docker daemon socket inside the container.
 If you do not want to do that, please set up an environment equivalent to the builder image, and then run the scripts outside a container.
 
-To build the images and push them to a registry, ideally on a GB200 system to build natively:
+To build the images and push them to a registry, run the following on a host with your target architecture so that the images are built natively:
 
 ```bash
 export HF_TOKEN=<read-token-for-huggingface>

From 32cc9b95db431047a80b82cb149cd45d7493165f Mon Sep 17 00:00:00 2001
From: Hristo Filaretov <hfilaretov@nvidia.com>
Date: Thu, 18 Jun 2026 11:03:59 +0200
Subject: [PATCH 08/22] update readme

---
 reasoning/README.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index bdefb9db5..df4a0fc51 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -135,8 +135,6 @@ This is a two-step process:
 
 You can build the container defined in `./dataset-processing-container` that already pre-packages all necessary dependencies and can be used for both steps.
 
-You should be able to run the following:
-
 Prepare the builder image:
 
 ```bash

From 7734f53b16c6440a5e69272031e510855f8b99f0 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Thu, 18 Jun 2026 16:01:22 +0200
Subject: [PATCH 09/22] update submodule

---
 reasoning/README.md | 14 ++++++++------
 reasoning/RL        |  2 +-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index df4a0fc51..c2f4f942d 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -35,11 +35,11 @@ The run requires the following artifacts:
 
 | Artifact | Description | Status |
 |---|---|---|
-| Policy model | Host directory containing `Qwen/Qwen3-235B-A22B-Instruct-2507`, passed as `HF_CKPT_PATH` and mounted into the container | <to be completed> |
+| Policy model | Host directory containing `Qwen/Qwen3-235B-A22B-Instruct-2507`, passed as `HF_CKPT_PATH`, mounted into the container, and exposed to the recipe through `CONTAINER_HF_CKPT_PATH` | <to be completed> |
 | Megatron-Core checkpoint cache | Host directory for the HF-to-Megatron converted checkpoint cache, passed as `NRL_MEGATRON_CHECKPOINT_DIR` and mounted into the container | <to be completed> |
 | Training JSONL | Host path to NeMo-Gym SWE training tasks, passed as `NEMO_GYM_SWE_TRAIN_DATA_PATH` and mounted into the container | <to be completed> |
 | Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | <to be completed> |
-| Task containers | Host directory containing Apptainer/Singularity SIF images, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | <to be completed> |
+| Task containers | Host directory containing Apptainer/Singularity SIF images in the layout expected by the recipe, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | <to be completed> |
 
 To download the training and validation JSONL files using the HuggingFace CLI:
 
@@ -66,7 +66,7 @@ The task containers have to be built and converted to SIF format, please see [Se
 
 ### Model cache setup
 
-From outside the container, download the Hugging Face model into a host directory. The launcher requires `HF_CKPT_PATH` to point at this directory, then mounts it at `/inputs/nemo_gym/hf_ckpt` by default and passes that container path to the recipe as `policy.model_name`.
+From outside the container, download the Hugging Face model into a host directory. The launcher requires `HF_CKPT_PATH` to point at this directory, mounts that path into the container, and exports `CONTAINER_HF_CKPT_PATH` to the recipe as `policy.model_name`. By default, `CONTAINER_HF_CKPT_PATH` is the same path as `HF_CKPT_PATH`.
 
 ```bash
 python -m venv .venv
@@ -102,6 +102,8 @@ export NEMO_GYM_SWE_SIF_DIR=<host directory containing SWE task SIF images>
 # Optional authentication/logging.
 export HF_TOKEN=<huggingface token>
 export WANDB_API_KEY=<wandb token>
+export MLPERF_TARGET_ACCURACY=<target accuracy>      # default: 1.0
+export GRPO_SEED=<integer seed>                      # default: random per launch
 
 # Defaults are defined by the launcher and may be overridden here.
 export TRAIN_NODES=<number of training nodes>        # default: 16
@@ -115,7 +117,7 @@ export EXTRA_MOUNTS=<host_path>:<container_path>[,<host_path>:<container_path>..
 bash examples/nemo_gym/launch_nemo_gym_multinode_training.sh
 ```
 
-The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAINER_REPO_LOCATION` to override the baked checkout path `/opt/nemo-rl`, `CONTAINER_INPUT_ROOT` and the `CONTAINER_*` path variables to override the stable container mount targets, `SLURM_EXCLUDE` to exclude problematic nodes, and `SLURM_COMMENT`/`SLURM_IDLE_EXEMPT_MINS` to customize the idle-GPU reaper exemption.
+The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAINER_REPO_LOCATION` to override the baked checkout path `/opt/nemo-rl`, `CONTAINER_INPUT_ROOT` and the `CONTAINER_*` path variables to override container-side paths, `SLURM_EXCLUDE` to exclude problematic nodes, and `SLURM_COMMENT`/`SLURM_IDLE_EXEMPT_MINS` to customize the idle-GPU reaper exemption.
 
 # 3. Dataset/Environment
 
@@ -187,7 +189,7 @@ The config uses separate training and validation JSONL files:
 
 ```yaml
 policy:
-  model_name: ${oc.env:HF_CKPT_PATH}
+  model_name: ${oc.env:CONTAINER_HF_CKPT_PATH}
 data:
   train:
     data_path: ${oc.env:NEMO_GYM_SWE_TRAIN_DATA_PATH}
@@ -237,7 +239,7 @@ The recipe uses token-level GRPO with reward normalization and a leave-one-out b
 
 ### Optimizer
 
-Adam with distributed optimizer state. The async recipe sets `lr: 5.0e-6`, `weight_decay: 0.0`, BF16 training, and FP32 optimizer parameters.
+Adam with distributed optimizer state. The async recipe sets `lr: 3.0e-6`, `weight_decay: 0.0`, BF16 training, and FP32 optimizer parameters.
 
 ### Precision
 
diff --git a/reasoning/RL b/reasoning/RL
index 85f36e0c3..3cded0c90 160000
--- a/reasoning/RL
+++ b/reasoning/RL
@@ -1 +1 @@
-Subproject commit 85f36e0c3dd967ca05b1879228cbbf42a024ddcb
+Subproject commit 3cded0c90d1af70e6c5c985a5d112f8a594a9cb3

From 3d527bc3fb75713efc0942c456c6235f172d3ce7 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Thu, 25 Jun 2026 14:54:56 +0200
Subject: [PATCH 10/22] remove unnecessary parts of readme

---
 reasoning/README.md | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index c2f4f942d..0bb59ed68 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -102,7 +102,6 @@ export NEMO_GYM_SWE_SIF_DIR=<host directory containing SWE task SIF images>
 # Optional authentication/logging.
 export HF_TOKEN=<huggingface token>
 export WANDB_API_KEY=<wandb token>
-export MLPERF_TARGET_ACCURACY=<target accuracy>      # default: 1.0
 export GRPO_SEED=<integer seed>                      # default: random per launch
 
 # Defaults are defined by the launcher and may be overridden here.
@@ -117,7 +116,7 @@ export EXTRA_MOUNTS=<host_path>:<container_path>[,<host_path>:<container_path>..
 bash examples/nemo_gym/launch_nemo_gym_multinode_training.sh
 ```
 
-The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAINER_REPO_LOCATION` to override the baked checkout path `/opt/nemo-rl`, `CONTAINER_INPUT_ROOT` and the `CONTAINER_*` path variables to override container-side paths, `SLURM_EXCLUDE` to exclude problematic nodes, and `SLURM_COMMENT`/`SLURM_IDLE_EXEMPT_MINS` to customize the idle-GPU reaper exemption.
+The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAINER_REPO_LOCATION` to override the baked checkout path `/opt/nemo-rl`, `CONTAINER_INPUT_ROOT` and the `CONTAINER_*` path variables to override container-side paths.
 
 # 3. Dataset/Environment
 

From be8e05990f287bff52054d320474b26fb919d5f8 Mon Sep 17 00:00:00 2001
From: Hristo Filaretov <hfilaretov@nvidia.com>
Date: Thu, 25 Jun 2026 15:01:33 +0200
Subject: [PATCH 11/22] update data instructions

---
 reasoning/README.md | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/reasoning/README.md b/reasoning/README.md
index c2f4f942d..1f40f9073 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -130,6 +130,30 @@ We use a subset of the [R2E-Gym/R2E-Gym-Subset](https://huggingface.co/datasets/
 The recipe consumes prebuilt JSONL files through from [Benchmark-R2E-Gym-Easy](https://huggingface.co/datasets/hfilaretov/Benchmark-R2E-Gym-Easy).
 Each row represents a software-engineering task for the NeMo-Gym environment.
 
+To build the JSONL files, please run:
+
+```bash
+# Optional token
+export HF_TOKEN=<read-token>
+hf download R2E-Gym/R2E-Gym-Subset --repo-type dataset --local-dir tmp/R2E-Gym__R2E-Gym-Subset
+uv run --with pyarrow python RL/tools/create_r2e_gym_easy_subset_jsonl.py \
+  --dataset-dir tmp/R2E-Gym__R2E-Gym-Subset \
+  --output-dir outputs/data/ \
+  --cache-dir tmp/r2e_repo_cache \
+  --train-ids RL/tools/train-instance-ids.txt \
+  --val-ids RL/tools/val-instance-ids.txt
+```
+
+You'll have the relevant output files in `outputs`:
+
+```bash
+; wc -l outputs/data/*.jsonl
+     721 outputs/data/benchmark_r2e_gym_easy_train.jsonl
+     256 outputs/data/benchmark_r2e_gym_easy_val.jsonl
+    4578 outputs/data/r2e_gym_subset_full.jsonl
+    5555 total
+```
+
 The JSONL files refer to SIF container files that need to be generated.
 This is a two-step process:
 1. Images are built from the repository and git revision specified in the dataset.

From ebdcd0993f77b4e893d00359d231143e296fcb35 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Thu, 25 Jun 2026 20:29:11 +0200
Subject: [PATCH 12/22] Update SWE GRPO README

Use placeholders for some parameters that still need to be fixed.
---
 reasoning/README.md | 119 ++++++++++++++++++++++++++++++--------------
 1 file changed, 81 insertions(+), 38 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index f487bb2ca..b36568d2c 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -1,18 +1,18 @@
 # 1. Problem
 
-## Reasoning - GRPO with NeMo-Gym SWE/OpenHands.
+## SWE Agent Reinforcement Learning  - GRPO with NeMo-Gym SWE/OpenHands.
 
-[NeMo-RL](https://github.com/CarlosGomes98/RL) provides the implementation used for this benchmark. The benchmark trains `Qwen/Qwen3-235B-A22B-Instruct-2507` with GRPO against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
+[NeMo-RL](https://github.com/CarlosGomes98/RL) provides the implementation used for this benchmark. The benchmark uses reinforcement learning to train `Qwen/Qwen3.5-397B-A17B` with GRPO against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
 
-The relevant config files are under `RL/examples/nemo_gym`. The benchmark launch entrypoint is `RL/examples/nemo_gym/launch_nemo_gym_multinode_training.sh`, using `RL/examples/nemo_gym/grpo_qwen3_235b_swe_openhands_async.yaml`.
+The task is to improve the SWE agent's accuracy in solving held-out R2E-Gym software-engineering tasks. A rollout receives reward 1 when the generated patch passes the task evaluation and reward 0 otherwise.
 
-Formal benchmark description: <to be completed>
+The relevant config files are under `RL/examples/nemo_gym` and `RL/qwen_35`. The benchmark launch entrypoint is `RL/examples/nemo_gym/launch_nemo_gym_multinode_training.sh`, using `RL/qwen_35/configs/grpo_qwen35_397b_swe_openhands_async_benchmark.yaml`.
 
 # 2. Directions
 
 ## Steps to configure machine
 
-To use this repository, please ensure your system can run containers and has appropriate GPU support.
+To use this repository, please ensure your have access to a SLURM cluster with Enroot/Pyxis and at least 64x4 GB200 GPUs.
 
 ### Container setup
 
@@ -27,7 +27,7 @@ docker buildx build \
   .
 ```
 
-The Dockerfile overlays the SWE/NeMo-Gym pieces on top of `nvcr.io/nvidia/nemo-rl:v0.6.0` and prefetches Gym virtual environments for `examples/nemo_gym/grpo_qwen3_235b_swe_openhands_async.yaml`.
+The Dockerfile overlays the SWE/NeMo-Gym pieces on top of `nvcr.io/nvidia/nemo-rl:v0.6.0` and prefetches Gym virtual environments for `examples/nemo_gym/grpo_qwen35_397b_swe_openhands_async.yaml`.
 
 ## Steps to download and verify data
 
@@ -35,19 +35,19 @@ The run requires the following artifacts:
 
 | Artifact | Description | Status |
 |---|---|---|
-| Policy model | Host directory containing `Qwen/Qwen3-235B-A22B-Instruct-2507`, passed as `HF_CKPT_PATH`, mounted into the container, and exposed to the recipe through `CONTAINER_HF_CKPT_PATH` | <to be completed> |
-| Megatron-Core checkpoint cache | Host directory for the HF-to-Megatron converted checkpoint cache, passed as `NRL_MEGATRON_CHECKPOINT_DIR` and mounted into the container | <to be completed> |
-| Training JSONL | Host path to NeMo-Gym SWE training tasks, passed as `NEMO_GYM_SWE_TRAIN_DATA_PATH` and mounted into the container | <to be completed> |
-| Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | <to be completed> |
-| Task containers | Host directory containing Apptainer/Singularity SIF images in the layout expected by the recipe, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | <to be completed> |
+| Policy model | Host directory containing `Qwen/Qwen3.5-397B-A17B`, passed as `HF_CKPT_PATH`, mounted into the container, and exposed to the recipe through `CONTAINER_HF_CKPT_PATH` | Download from Hugging Face |
+| Megatron-Core checkpoint cache | Host directory for the HF-to-Megatron converted checkpoint cache, passed as `NRL_MEGATRON_CHECKPOINT_DIR` and mounted into the container | Empty directory is allowed on first run |
+| Training JSONL | Host path to NeMo-Gym SWE training tasks, passed as `NEMO_GYM_SWE_TRAIN_DATA_PATH` and mounted into the container | Build with `RL/tools/create_r2e_gym_easy_subset_jsonl.py` |
+| Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | Build with `RL/tools/create_r2e_gym_easy_subset_jsonl.py` |
+| Task containers | Host directory containing Apptainer/Singularity SIF images in the layout expected by the recipe, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | Build with `RL/docker/dataset-processing-container` |
 
 To download the training and validation JSONL files using the HuggingFace CLI:
 
 ```bash
-; hf download hfilaretov/Benchmark-R2E-Gym-Easy --repo-type dataset --local-dir hfilaretov__Benchmark-R2E-Gym-Easy
+hf download hfilaretov/Benchmark-R2E-Gym-Easy --repo-type dataset --local-dir hfilaretov__Benchmark-R2E-Gym-Easy
 ...
 
-; tree hfilaretov__Benchmark-R2E-Gym-Easy
+tree hfilaretov__Benchmark-R2E-Gym-Easy
 hfilaretov__Benchmark-R2E-Gym-Easy
 ├── benchmark_r2e_gym_easy_train.jsonl
 ├── benchmark_r2e_gym_easy_val.jsonl
@@ -59,7 +59,7 @@ hfilaretov__Benchmark-R2E-Gym-Easy
 The environment also requires per-task SIF images. The recipe resolves task containers from `sif_dir` with this template:
 
 ```yaml
-- "${sif_dir}/r2egym_{instance_id}.sif"
+- "${sif_dir}/r2egym/{instance_id}.sif"
 ```
 
 The task containers have to be built and converted to SIF format, please see [Section 3](#data-preprocessing) below.
@@ -73,9 +73,9 @@ python -m venv .venv
 source .venv/bin/activate
 pip install huggingface_hub
 
-export HF_CKPT_PATH=$(pwd)/hf/Qwen/Qwen3-235B-A22B-Instruct-2507
+export HF_CKPT_PATH=$(pwd)/hf/Qwen/Qwen3.5-397B-A17B
 mkdir -p "$HF_CKPT_PATH"
-HF_TOKEN=<your hf token> hf download Qwen/Qwen3-235B-A22B-Instruct-2507 --local-dir "$HF_CKPT_PATH"
+HF_TOKEN=<your hf token> hf download Qwen/Qwen3.5-397B-A17B --local-dir "$HF_CKPT_PATH"
 ```
 
 The launcher also creates and mounts a host Hugging Face cache. Set `HF_HOME` before launch if you want to use a cache outside `$(pwd)/.cache`.
@@ -102,13 +102,14 @@ export NEMO_GYM_SWE_SIF_DIR=<host directory containing SWE task SIF images>
 # Optional authentication/logging.
 export HF_TOKEN=<huggingface token>
 export WANDB_API_KEY=<wandb token>
+export MLPERF_TARGET_ACCURACY=<target reward mean>  # default: 1.0 until the target is finalized
 export GRPO_SEED=<integer seed>                      # default: random per launch
 
 # Defaults are defined by the launcher and may be overridden here.
 export TRAIN_NODES=<number of training nodes>        # default: 16
 export GEN_NODES=<number of generation nodes>        # default: 24
 export SLURM_TIME=<walltime>                         # default: 1:0:0
-export RECIPE=examples/nemo_gym/grpo_qwen3_235b_swe_openhands_async.yaml
+export RECIPE=qwen_35/configs/grpo_qwen35_397b_swe_openhands_async_benchmark.yaml
 
 # Optional extra mounts. The launcher automatically mounts the paths above.
 export EXTRA_MOUNTS=<host_path>:<container_path>[,<host_path>:<container_path>...]
@@ -127,9 +128,14 @@ We use a subset of the [R2E-Gym/R2E-Gym-Subset](https://huggingface.co/datasets/
 ### Data preprocessing
 
 The recipe consumes prebuilt JSONL files through from [Benchmark-R2E-Gym-Easy](https://huggingface.co/datasets/hfilaretov/Benchmark-R2E-Gym-Easy).
-Each row represents a software-engineering task for the NeMo-Gym environment.
+Each row represents a software-engineering task for the NeMo-Gym environment. We filtered the original `R2E-Gym/R2E-Gym-Subset` dataset based on these two conditions:
+* whether an environment container image successfully builds for both x86_64 and aarch64
+* complexity using the following condition:
+  ```
+  where num_non_test_func_methods == 1 | where num_non_test_files == 1 | where num_non_test_lines <= 20
+  ```
 
-To build the JSONL files, please run:
+To build the JSONL files yourself, please run:
 
 ```bash
 # Optional token
@@ -146,7 +152,7 @@ uv run --with pyarrow python RL/tools/create_r2e_gym_easy_subset_jsonl.py \
 You'll have the relevant output files in `outputs`:
 
 ```bash
-; wc -l outputs/data/*.jsonl
+wc -l outputs/data/*.jsonl
      721 outputs/data/benchmark_r2e_gym_easy_train.jsonl
      256 outputs/data/benchmark_r2e_gym_easy_val.jsonl
     4578 outputs/data/r2e_gym_subset_full.jsonl
@@ -221,15 +227,15 @@ data:
 sif_dir: ${oc.env:NEMO_GYM_SWE_SIF_DIR}
 ```
 
-The official split procedure is <to be completed>.
+The official split is defined by the fixed instance-id files `RL/tools/train-instance-ids.txt` and `RL/tools/val-instance-ids.txt`. The conversion script validates that the lists do not overlap, writes matching rows to `benchmark_r2e_gym_easy_train.jsonl` and `benchmark_r2e_gym_easy_val.jsonl`, and leaves rows in neither list only in `r2e_gym_subset_full.jsonl`.
 
 ### Training data order
 
-Training data order is preserved by the recipe with `data.shuffle: false`. The intended ordering or curriculum of the training JSONL is <to be completed>.
+Training data order is preserved by the recipe with `data.shuffle: false`. The converter writes the training JSONL in the order encountered in the converted R2E-Gym subset after filtering by `RL/tools/train-instance-ids.txt`; the benchmark does not add runtime shuffling.
 
 ### Test data order
 
-Validation data order is preserved by the recipe. The config uses `grpo.max_val_samples: null`, so validation thoroughness is inferred from the validation dataset size unless overridden.
+Validation data order is preserved by the recipe. The config uses `grpo.max_val_samples: null`, so validation thoroughness is inferred from the validation dataset size unless overridden. The benchmark does not add runtime shuffling of validation data.
 
 ### Simulation environment (RL models only)
 
@@ -239,22 +245,48 @@ The benchmark uses NeMo-Gym with the SWE/OpenHands agent configuration. Rollouts
 
 ### Publication/Attribution
 
-Model and implementation attribution: <to be completed>
-
-### List of layers
+The policy starts from the [`Qwen/Qwen3.5-397B-A17B`](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) checkpoint released by the Qwen team. The reference training implementation is NeMo-RL with the Qwen 3.5.
+
+### Model details
+
+Architecture values below are taken from the [Hugging Face model card](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) and [`config.json`](https://huggingface.co/Qwen/Qwen3.5-397B-A17B/blob/main/config.json).
+
+| Config | Value |
+| :-- | :-- |
+| # Total Parameters | 397B |
+| # Active Parameters | 17B |
+| # Layers | 60 |
+| Hidden Layout | 3 Gated DeltaNet + 1 Gated Attention layers per block (15 blocks) |
+| Attention Type | Hybrid Gated DeltaNet + Gated Attention |
+| Gated DeltaNet Heads (V / QK) | 64 / 16 |
+| Gated DeltaNet Head Dimension | 128 |
+| Gated Attention Heads (Q / KV) | 32 / 2 |
+| Gated Attention Head Dimension | 256 |
+| RoPE Dimension | 64 |
+| Model Dimension | 4,096 |
+| # Routed Experts | 512 |
+| # Active Routed Experts | 10 |
+| # Shared Experts | 1 |
+| Expert Intermediate Dimension | 1,024 |
+| Activation | SiLU (SwiGLU in MoE) |
+| Normalization | RMSNorm |
+| Vocab Size | 248,320 |
+| Native Context Length | 262,144 |
+| Benchmark Context Length | 65,536 |
+
+### Benchmark runtime
 
 | **Component** | **Architecture** | **Parameters** | **Technical Details** |
 |---------------|------------------|----------------|-----------------------|
-| **Policy model** | Qwen3 MoE Transformer | 235B total, 22B active | `Qwen/Qwen3-235B-A22B-Instruct-2507` |
-| **Training runtime** | Megatron-Core through NeMo-RL | Same policy weights | TP4 x CP4 x PP8 minimum training replica; expert model parallel size 16 |
-| **Generation runtime** | vLLM | Same policy weights | TP16, BF16, HTTP server exposed for NeMo-Gym |
-| **SWE environment** | NeMo-Gym + OpenHands | N/A | Agent max turns 15 |
-
-Exact layer-by-layer model description: <to be completed>
+| **Training runtime** | Megatron-Core through NeMo-RL | Same policy weights | TP4 x PP2 x CP1, EP16, BF16 |
+| **Generation runtime** | vLLM | Same policy weights | TP8, EP8, 64k benchmark context, HTTP server exposed for NeMo-Gym |
+| **SWE environment** | NeMo-Gym + OpenHands | N/A | CodeActAgent, max 30 turns |
 
 ### Weight and bias initialization
 
-Training starts from a pretrained Hugging Face checkpoint converted to Megatron-Core format. Random initialization is not used for the policy model. Any benchmark-specific initialization details are <to be completed>.
+Training starts from the pretrained Hugging Face checkpoint converted to Megatron-Core format. Random initialization is not used for the policy model. The first run can populate `NRL_MEGATRON_CHECKPOINT_DIR` with the converted checkpoint cache.
+
+MoE router weights are kept frozen.
 
 ### Loss function
 
@@ -262,7 +294,9 @@ The recipe uses token-level GRPO with reward normalization and a leave-one-out b
 
 ### Optimizer
 
-Adam with distributed optimizer state. The async recipe sets `lr: 3.0e-6`, `weight_decay: 0.0`, BF16 training, and FP32 optimizer parameters.
+TODO: final
+
+AdamW with distributed optimizer state. The benchmark config sets `lr: 2.0e-6`, `min_lr: 2.0e-6`, `weight_decay: 0.0`, 2 warmup steps, BF16 training, and FP32 optimizer parameters.
 
 ### Precision
 
@@ -272,16 +306,25 @@ The recipe uses BF16 policy precision by default.
 
 ### Quality metric
 
-The checkpointing metric is `val:total_reward/mean`, computed from NeMo-Gym validation rollouts.
+The quality metric is `val:accuracy`, computed from NeMo-Gym validation rollouts.
 
 ### Quality target
 
-<to be completed>
+TODO: final
+
+The quality target is pending MLCommons ratification. The current launcher reads `MLPERF_TARGET_ACCURACY` and defaults to `0.6`.
 
 ### Evaluation frequency
 
-The async recipe sets `grpo.val_period: 5`, `grpo.val_at_start: true`, and `grpo.val_at_end: true`.
+TODO: final
+
+The benchmark recipe sets:
+```
+grpo.val_period: 2
+grpo.val_at_start: true
+grpo.val_at_end: true
+```
 
 ### Evaluation thoroughness
 
-The validation set size is <to be completed>.
+The validation JSONL contains 256 R2E-Gym tasks and each evaluation uses the full validation set.
\ No newline at end of file

From 35bd44b0a61cd2561a25cdbf03c181c492073e56 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Thu, 25 Jun 2026 21:59:07 +0200
Subject: [PATCH 13/22] Update submodule to mlperf-training-qwen35 branch

---
 .gitmodules  | 2 +-
 reasoning/RL | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 10f6e78cc..591b4eaf9 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -9,4 +9,4 @@
 [submodule "reasoning/RL"]
 	path = reasoning/RL
 	url = https://github.com/CarlosGomes98/RL.git
-	branch = mlperf-training
+	branch = mlperf-training-qwen35
diff --git a/reasoning/RL b/reasoning/RL
index 3cded0c90..e4d0b38c3 160000
--- a/reasoning/RL
+++ b/reasoning/RL
@@ -1 +1 @@
-Subproject commit 3cded0c90d1af70e6c5c985a5d112f8a594a9cb3
+Subproject commit e4d0b38c3e9146ebf055647c85d305994d2bdb42

From ae9b0c3a59bcc3b4eafced85f4c4b82e16be58b4 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Thu, 25 Jun 2026 22:07:31 +0200
Subject: [PATCH 14/22] Update SWE GRPO paths for Qwen3.5 branch

---
 reasoning/README.md | 68 ++++++++++++++++++++++++++++-----------------
 1 file changed, 42 insertions(+), 26 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index b36568d2c..d6660a159 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -2,11 +2,11 @@
 
 ## SWE Agent Reinforcement Learning  - GRPO with NeMo-Gym SWE/OpenHands.
 
-[NeMo-RL](https://github.com/CarlosGomes98/RL) provides the implementation used for this benchmark. The benchmark uses reinforcement learning to train `Qwen/Qwen3.5-397B-A17B` with GRPO against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
+[NeMo-RL](https://github.com/CarlosGomes98/RL/tree/mlperf-training-qwen35) provides the implementation used for this benchmark from branch `mlperf-training-qwen35` at commit `e4d0b38c3e9146ebf055647c85d305994d2bdb42`. The benchmark uses reinforcement learning to train `Qwen/Qwen3.5-397B-A17B` with GRPO against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
 
 The task is to improve the SWE agent's accuracy in solving held-out R2E-Gym software-engineering tasks. A rollout receives reward 1 when the generated patch passes the task evaluation and reward 0 otherwise.
 
-The relevant config files are under `RL/examples/nemo_gym` and `RL/qwen_35`. The benchmark launch entrypoint is `RL/examples/nemo_gym/launch_nemo_gym_multinode_training.sh`, using `RL/qwen_35/configs/grpo_qwen35_397b_swe_openhands_async_benchmark.yaml`.
+The relevant config files are under `RL/examples/nemo_gym` and `RL/qwen_35`. The benchmark launch entrypoint is `RL/examples/nemo_gym/launch_qwen35_nemo_gym_multinode_training.sh`, using `RL/qwen_35/configs/grpo_qwen35_397b_swe_openhands_async_benchmark.yaml`.
 
 # 2. Directions
 
@@ -27,7 +27,7 @@ docker buildx build \
   .
 ```
 
-The Dockerfile overlays the SWE/NeMo-Gym pieces on top of `nvcr.io/nvidia/nemo-rl:v0.6.0` and prefetches Gym virtual environments for `examples/nemo_gym/grpo_qwen35_397b_swe_openhands_async.yaml`.
+The Dockerfile overlays the SWE/NeMo-Gym pieces on top of `nvcr.io/nvidia/nemo-rl:v0.6.0` and prefetches Gym virtual environments for `qwen_35/configs/grpo_qwen35_397b_swe_openhands_async.yaml`.
 
 ## Steps to download and verify data
 
@@ -37,8 +37,8 @@ The run requires the following artifacts:
 |---|---|---|
 | Policy model | Host directory containing `Qwen/Qwen3.5-397B-A17B`, passed as `HF_CKPT_PATH`, mounted into the container, and exposed to the recipe through `CONTAINER_HF_CKPT_PATH` | Download from Hugging Face |
 | Megatron-Core checkpoint cache | Host directory for the HF-to-Megatron converted checkpoint cache, passed as `NRL_MEGATRON_CHECKPOINT_DIR` and mounted into the container | Empty directory is allowed on first run |
-| Training JSONL | Host path to NeMo-Gym SWE training tasks, passed as `NEMO_GYM_SWE_TRAIN_DATA_PATH` and mounted into the container | Build with `RL/tools/create_r2e_gym_easy_subset_jsonl.py` |
-| Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | Build with `RL/tools/create_r2e_gym_easy_subset_jsonl.py` |
+| Training JSONL | Host path to NeMo-Gym SWE training tasks, passed as `NEMO_GYM_SWE_TRAIN_DATA_PATH` and mounted into the container | Download `benchmark_r2e_gym_easy_train.jsonl` or rebuild with `RL/tools/create_r2e_gym_easy_subset_jsonl.py` |
+| Validation JSONL | Host path to NeMo-Gym SWE validation tasks, passed as `NEMO_GYM_SWE_VALIDATION_DATA_PATH` and mounted into the container | Download `benchmark_r2e_gym_easy_val.jsonl` or rebuild with `RL/tools/create_r2e_gym_easy_subset_jsonl.py` |
 | Task containers | Host directory containing Apptainer/Singularity SIF images in the layout expected by the recipe, passed as `NEMO_GYM_SWE_SIF_DIR` and mounted into the container | Build with `RL/docker/dataset-processing-container` |
 
 To download the training and validation JSONL files using the HuggingFace CLI:
@@ -114,7 +114,7 @@ export RECIPE=qwen_35/configs/grpo_qwen35_397b_swe_openhands_async_benchmark.yam
 # Optional extra mounts. The launcher automatically mounts the paths above.
 export EXTRA_MOUNTS=<host_path>:<container_path>[,<host_path>:<container_path>...]
 
-bash examples/nemo_gym/launch_nemo_gym_multinode_training.sh
+bash examples/nemo_gym/launch_qwen35_nemo_gym_multinode_training.sh
 ```
 
 The launcher also accepts `NODES` to override `TRAIN_NODES + GEN_NODES`, `CONTAINER_REPO_LOCATION` to override the baked checkout path `/opt/nemo-rl`, `CONTAINER_INPUT_ROOT` and the `CONTAINER_*` path variables to override container-side paths.
@@ -127,7 +127,7 @@ We use a subset of the [R2E-Gym/R2E-Gym-Subset](https://huggingface.co/datasets/
 
 ### Data preprocessing
 
-The recipe consumes prebuilt JSONL files through from [Benchmark-R2E-Gym-Easy](https://huggingface.co/datasets/hfilaretov/Benchmark-R2E-Gym-Easy).
+The recipe consumes prebuilt JSONL files from [Benchmark-R2E-Gym-Easy](https://huggingface.co/datasets/hfilaretov/Benchmark-R2E-Gym-Easy).
 Each row represents a software-engineering task for the NeMo-Gym environment. We filtered the original `R2E-Gym/R2E-Gym-Subset` dataset based on these two conditions:
 * whether an environment container image successfully builds for both x86_64 and aarch64
 * complexity using the following condition:
@@ -135,24 +135,28 @@ Each row represents a software-engineering task for the NeMo-Gym environment. We
   where num_non_test_func_methods == 1 | where num_non_test_files == 1 | where num_non_test_lines <= 20
   ```
 
-To build the JSONL files yourself, please run:
+To build the JSONL files yourself, run the converter from the RL checkout:
 
 ```bash
+cd RL
+
 # Optional token
 export HF_TOKEN=<read-token>
 hf download R2E-Gym/R2E-Gym-Subset --repo-type dataset --local-dir tmp/R2E-Gym__R2E-Gym-Subset
-uv run --with pyarrow python RL/tools/create_r2e_gym_easy_subset_jsonl.py \
+uv run --with pyarrow python tools/create_r2e_gym_easy_subset_jsonl.py \
   --dataset-dir tmp/R2E-Gym__R2E-Gym-Subset \
   --output-dir outputs/data/ \
   --cache-dir tmp/r2e_repo_cache \
-  --train-ids RL/tools/train-instance-ids.txt \
-  --val-ids RL/tools/val-instance-ids.txt
+  --train-ids tools/train-instance-ids.txt \
+  --val-ids tools/val-instance-ids.txt
 ```
 
 You'll have the relevant output files in `outputs`:
 
 ```bash
-wc -l outputs/data/*.jsonl
+wc -l outputs/data/benchmark_r2e_gym_easy_train.jsonl \
+      outputs/data/benchmark_r2e_gym_easy_val.jsonl \
+      outputs/data/r2e_gym_subset_full.jsonl
      721 outputs/data/benchmark_r2e_gym_easy_train.jsonl
      256 outputs/data/benchmark_r2e_gym_easy_val.jsonl
     4578 outputs/data/r2e_gym_subset_full.jsonl
@@ -164,7 +168,7 @@ This is a two-step process:
 1. Images are built from the repository and git revision specified in the dataset.
 2. These images are converted to SIF file format.
 
-You can build the container defined in `./dataset-processing-container` that already pre-packages all necessary dependencies and can be used for both steps.
+You can build the container defined in `RL/docker/dataset-processing-container` that already pre-packages all necessary dependencies and can be used for both steps.
 
 Prepare the builder image:
 
@@ -245,7 +249,7 @@ The benchmark uses NeMo-Gym with the SWE/OpenHands agent configuration. Rollouts
 
 ### Publication/Attribution
 
-The policy starts from the [`Qwen/Qwen3.5-397B-A17B`](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) checkpoint released by the Qwen team. The reference training implementation is NeMo-RL with the Qwen 3.5.
+The policy starts from the [`Qwen/Qwen3.5-397B-A17B`](https://huggingface.co/Qwen/Qwen3.5-397B-A17B) checkpoint released by the Qwen team. The reference training implementation is NeMo-RL with Qwen 3.5 support from the `mlperf-training-qwen35` branch.
 
 ### Model details
 
@@ -294,9 +298,23 @@ The recipe uses token-level GRPO with reward normalization and a leave-one-out b
 
 ### Optimizer
 
-TODO: final
+AdamW with distributed optimizer state.
 
-AdamW with distributed optimizer state. The benchmark config sets `lr: 2.0e-6`, `min_lr: 2.0e-6`, `weight_decay: 0.0`, 2 warmup steps, BF16 training, and FP32 optimizer parameters.
+| Parameter | Value |
+| :-- | :-- |
+| Optimizer | AdamW |
+| Base learning rate | `2.0e-6` |
+| End learning rate | `2.0e-6` |
+| Learning-rate schedule | Constant |
+| Warmup steps | 2 |
+| Weight decay | `0.0` |
+| Adam beta1 | `0.9` |
+| Adam beta2 | `0.999` |
+| Adam epsilon | `1e-8` |
+| Gradient clipping | `1.0` |
+| Distributed optimizer | Enabled |
+| Optimizer parameters | FP32 |
+| Training precision | BF16 |
 
 ### Precision
 
@@ -312,19 +330,17 @@ The quality metric is `val:accuracy`, computed from NeMo-Gym validation rollouts
 
 TODO: final
 
-The quality target is pending MLCommons ratification. The current launcher reads `MLPERF_TARGET_ACCURACY` and defaults to `0.6`.
+The quality target is pending MLCommons ratification. The current launcher reads `MLPERF_TARGET_ACCURACY` and defaults to `1.0`.
 
 ### Evaluation frequency
 
-TODO: final
-
-The benchmark recipe sets:
-```
-grpo.val_period: 2
-grpo.val_at_start: true
-grpo.val_at_end: true
-```
+| Parameter | Value |
+| :-- | :-- |
+| Evaluate at start | Yes |
+| Evaluation period | Every 2 training steps |
+| Evaluate at end | Yes |
+| Maximum training steps | 20 |
 
 ### Evaluation thoroughness
 
-The validation JSONL contains 256 R2E-Gym tasks and each evaluation uses the full validation set.
\ No newline at end of file
+The validation JSONL contains 256 R2E-Gym tasks and each evaluation uses the full validation set.

From a8e006640f5f393f000d7ac14b9cbe9e1abae713 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Thu, 25 Jun 2026 22:28:37 +0200
Subject: [PATCH 15/22] Add software versions

---
 reasoning/README.md | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index d6660a159..3920525cc 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -282,10 +282,20 @@ Architecture values below are taken from the [Hugging Face model card](https://h
 
 | **Component** | **Architecture** | **Parameters** | **Technical Details** |
 |---------------|------------------|----------------|-----------------------|
-| **Training runtime** | Megatron-Core through NeMo-RL | Same policy weights | TP4 x PP2 x CP1, EP16, BF16 |
+| **Training runtime** | Megatron-Bridge and Megatron-Core through NeMo-RL | Same policy weights | TP4 x PP2 x CP1, EP16, BF16 |
 | **Generation runtime** | vLLM | Same policy weights | TP8, EP8, 64k benchmark context, HTTP server exposed for NeMo-Gym |
 | **SWE environment** | NeMo-Gym + OpenHands | N/A | CodeActAgent, max 30 turns |
 
+Source revisions identify the checked-out editable packages used by the reference implementation.
+
+| **Runtime package** | **Package version** | **Source revision** |
+|---------------------|---------------------|---------------------|
+| NeMo-RL | `0.6.0` | `e4d0b38c` |
+| Megatron-Bridge | `0.5.0` | `95e5f38f` |
+| Megatron-Core | `0.18.0` | `d30c3ae54` |
+| vLLM | `0.17.1` | PyPI package pin |
+| NeMo-Gym | `0.3.0rc0` | `1a4912e` |
+
 ### Weight and bias initialization
 
 Training starts from the pretrained Hugging Face checkpoint converted to Megatron-Core format. Random initialization is not used for the policy model. The first run can populate `NRL_MEGATRON_CHECKPOINT_DIR` with the converted checkpoint cache.

From f3c219e049184423774d9771ca89dcd0e55cfacd Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Fri, 26 Jun 2026 10:47:59 +0200
Subject: [PATCH 16/22] Rephrase first paragraph

* explicitly mention RLVR
* expand GRPO
* remove commit
---
 reasoning/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 3920525cc..5ce850859 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -2,7 +2,7 @@
 
 ## SWE Agent Reinforcement Learning  - GRPO with NeMo-Gym SWE/OpenHands.
 
-[NeMo-RL](https://github.com/CarlosGomes98/RL/tree/mlperf-training-qwen35) provides the implementation used for this benchmark from branch `mlperf-training-qwen35` at commit `e4d0b38c3e9146ebf055647c85d305994d2bdb42`. The benchmark uses reinforcement learning to train `Qwen/Qwen3.5-397B-A17B` with GRPO against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
+[NeMo-RL](https://github.com/CarlosGomes98/RL/tree/mlperf-training-qwen35) provides the implementation used for this benchmark from branch `mlperf-training-qwen35`. The benchmark uses Reinforcement Learning with Verifiable Rewards (RLVR) to train `Qwen/Qwen3.5-397B-A17B` with Group Relative Policy Optimization (GRPO) against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
 
 The task is to improve the SWE agent's accuracy in solving held-out R2E-Gym software-engineering tasks. A rollout receives reward 1 when the generated patch passes the task evaluation and reward 0 otherwise.
 

From 34fc67d66f45e62cc52df103755461ad85e2e5f5 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Fri, 26 Jun 2026 13:40:23 +0200
Subject: [PATCH 17/22] Address review comments and add note on reasoning

---
 reasoning/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/reasoning/README.md b/reasoning/README.md
index 5ce850859..177f755fa 100644
--- a/reasoning/README.md
+++ b/reasoning/README.md
@@ -12,7 +12,7 @@ The relevant config files are under `RL/examples/nemo_gym` and `RL/qwen_35`. The
 
 ## Steps to configure machine
 
-To use this repository, please ensure your have access to a SLURM cluster with Enroot/Pyxis and at least 64x4 GB200 GPUs.
+To use this repository, please ensure you have access to a SLURM cluster with Enroot/Pyxis and at least 256 GB200 GPUs.
 
 ### Container setup
 
@@ -243,7 +243,7 @@ Validation data order is preserved by the recipe. The config uses `grpo.max_val_
 
 ### Simulation environment (RL models only)
 
-The benchmark uses NeMo-Gym with the SWE/OpenHands agent configuration. Rollouts are collected through a vLLM-backed policy server, with OpenHands interacting with task containers via Apptainer/Singularity. The async recipe uses non-colocated generation and training, with one-step-stale trajectories corrected by importance sampling.
+The benchmark uses NeMo-Gym with the SWE/OpenHands agent configuration. Rollouts are collected through a vLLM-backed policy server, with OpenHands interacting with task containers via Apptainer/Singularity. Model reasoning/thinking is disabled through the vLLM chat template configuration. The async recipe uses non-colocated generation and training, with one-step-stale trajectories corrected by importance sampling.
 
 # 4. Model
 

From a8524916c62ab1b1f02252162b56f57e627ff33a Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Fri, 26 Jun 2026 14:26:07 +0200
Subject: [PATCH 18/22] Rename benchmark to llm_moe_grpo

---
 .gitmodules                           | 2 +-
 {reasoning => llm_moe_grpo}/README.md | 2 +-
 {reasoning => llm_moe_grpo}/RL        | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename {reasoning => llm_moe_grpo}/README.md (98%)
 rename {reasoning => llm_moe_grpo}/RL (100%)

diff --git a/.gitmodules b/.gitmodules
index 591b4eaf9..ba12fb274 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -7,6 +7,6 @@
 	url = https://github.com/pytorch/torchtitan.git
 	branch = mlperf-training-flux.1
 [submodule "reasoning/RL"]
-	path = reasoning/RL
+	path = llm_moe_grpo/RL
 	url = https://github.com/CarlosGomes98/RL.git
 	branch = mlperf-training-qwen35
diff --git a/reasoning/README.md b/llm_moe_grpo/README.md
similarity index 98%
rename from reasoning/README.md
rename to llm_moe_grpo/README.md
index 177f755fa..a1d60b26d 100644
--- a/reasoning/README.md
+++ b/llm_moe_grpo/README.md
@@ -82,7 +82,7 @@ The launcher also creates and mounts a host Hugging Face cache. Set `HF_HOME` be
 
 ## Steps to run and time
 
-All steps below are assumed to be run from this `reasoning` directory on the host; `cd RL` enters the NeMo-RL submodule checkout. The launcher submits `ray.sub` and runs training from the checkout baked into the container at `/opt/nemo-rl` by default.
+All steps below are assumed to be run from this `llm_moe_grpo` directory on the host; `cd RL` enters the NeMo-RL submodule checkout. The launcher submits `ray.sub` and runs training from the checkout baked into the container at `/opt/nemo-rl` by default.
 
 ```bash
 cd RL
diff --git a/reasoning/RL b/llm_moe_grpo/RL
similarity index 100%
rename from reasoning/RL
rename to llm_moe_grpo/RL

From 2c3803625ec49b20180fc450c20590131ee0acc3 Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Fri, 26 Jun 2026 15:02:51 +0200
Subject: [PATCH 19/22] Add index placeholder for GRPO benchmark

---
 README.md | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f16c412a3..dcd47c170 100644
--- a/README.md
+++ b/README.md
@@ -42,12 +42,21 @@ Each benchmark will run until the target quality is reached and then stop, print
 
 Some these benchmarks are rather slow or take a long time to run on the reference hardware. We expect to see significant performance improvements with more hardware and optimized implementations.
 
+# MLPerf Training v6.1 (Submission Deadline XXX, 2026)
+
+| Model | reference implementation | framework* | dataset | model parameter count**
+| ---- | ---- | ---- | ---- | ----
+| qwen3.5_397b_a17b_swe_grpo | [llm_moe_grpo](https://github.com/mlcommons/training/tree/master/llm_moe_grpo) | NeMo-RL / NeMo-Gym | SWE tasks | 397B
+
+*Framework here is given for the reference implementation. Submitters are free to use their own frameworks to run the benchmark.
+
+**Model parameter count is not the same as active parameters that are being trained in the benchmark.
+
 # MLPerf Training v6.0 (Submission Deadline May 15, 2026)
 
 | Model | reference implementation | framework* | dataset | model parameter count**
 | ---- | ---- | ---- | ---- | ----
 | flux.1 | [text_to_image](https://github.com/mlcommons/training/tree/master/text_to_image) | torchtitan | CC12M subset | 11.9B
-| qwen3_235b_a22b_swe_grpo | [reasoning](https://github.com/mlcommons/training/tree/master/reasoning) | NeMo-RL / NeMo-Gym | SWE tasks | 235B
 | llama3.1_8b | [small_llm_pretraining](https://github.com/mlcommons/training/tree/master/small_llm_pretraining) | NeMo | C4 | 8b
 | llama2_70b_lora | [llama2_70b_lora](https://github.com/mlcommons/training/tree/master/llama2_70b_lora) | pytorch | SCROLLS GovReport | 70B
 | llama3.1_405b | [large_language_model_pretraining](https://github.com/mlcommons/training/tree/master/large_language_model_pretraining) | NeMo | C4 | 405B

From dbaeb7efa29920fbd585967f0bb6235c3ed613db Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Fri, 26 Jun 2026 15:53:18 +0200
Subject: [PATCH 20/22] llm_moe_grpo: Update reference and sync README with it

---
 .gitmodules            |  2 +-
 llm_moe_grpo/README.md | 10 +++++-----
 llm_moe_grpo/RL        |  2 +-
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index ba12fb274..c93db8b62 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -6,7 +6,7 @@
 	path = text_to_image/torchtitan
 	url = https://github.com/pytorch/torchtitan.git
 	branch = mlperf-training-flux.1
-[submodule "reasoning/RL"]
+[submodule "llm_moe_grpo/RL"]
 	path = llm_moe_grpo/RL
 	url = https://github.com/CarlosGomes98/RL.git
 	branch = mlperf-training-qwen35
diff --git a/llm_moe_grpo/README.md b/llm_moe_grpo/README.md
index a1d60b26d..f7d33ec2f 100644
--- a/llm_moe_grpo/README.md
+++ b/llm_moe_grpo/README.md
@@ -92,7 +92,7 @@ export EXP_NAME=<experiment name>
 export CONTAINER_IMAGE_PATH=<container image path or tag>
 export SLURM_ACCOUNT=<account>
 export SLURM_PARTITION=<partition>
-export GPUS_PER_NODE=<number of GPUs per Slurm node>
+export GPUS_PER_NODE=4                                # GB200 reference configuration
 export HF_CKPT_PATH=<host path to HF checkpoint directory>
 export NRL_MEGATRON_CHECKPOINT_DIR=<host path to Megatron-Core checkpoint cache directory>  # may be empty on first run
 export NEMO_GYM_SWE_TRAIN_DATA_PATH=<host path to training JSONL>
@@ -106,8 +106,8 @@ export MLPERF_TARGET_ACCURACY=<target reward mean>  # default: 1.0 until the tar
 export GRPO_SEED=<integer seed>                      # default: random per launch
 
 # Defaults are defined by the launcher and may be overridden here.
-export TRAIN_NODES=<number of training nodes>        # default: 16
-export GEN_NODES=<number of generation nodes>        # default: 24
+export TRAIN_NODES=<number of training nodes>        # default: 32
+export GEN_NODES=<number of generation nodes>        # default: 32
 export SLURM_TIME=<walltime>                         # default: 1:0:0
 export RECIPE=qwen_35/configs/grpo_qwen35_397b_swe_openhands_async_benchmark.yaml
 
@@ -282,7 +282,7 @@ Architecture values below are taken from the [Hugging Face model card](https://h
 
 | **Component** | **Architecture** | **Parameters** | **Technical Details** |
 |---------------|------------------|----------------|-----------------------|
-| **Training runtime** | Megatron-Bridge and Megatron-Core through NeMo-RL | Same policy weights | TP4 x PP2 x CP1, EP16, BF16 |
+| **Training runtime** | Megatron-Bridge and Megatron-Core through NeMo-RL | Same policy weights | TP4 x PP2 x CP1, EP32, BF16 |
 | **Generation runtime** | vLLM | Same policy weights | TP8, EP8, 64k benchmark context, HTTP server exposed for NeMo-Gym |
 | **SWE environment** | NeMo-Gym + OpenHands | N/A | CodeActAgent, max 30 turns |
 
@@ -290,7 +290,7 @@ Source revisions identify the checked-out editable packages used by the referenc
 
 | **Runtime package** | **Package version** | **Source revision** |
 |---------------------|---------------------|---------------------|
-| NeMo-RL | `0.6.0` | `e4d0b38c` |
+| NeMo-RL | `0.6.0` | `fbc91daf` |
 | Megatron-Bridge | `0.5.0` | `95e5f38f` |
 | Megatron-Core | `0.18.0` | `d30c3ae54` |
 | vLLM | `0.17.1` | PyPI package pin |
diff --git a/llm_moe_grpo/RL b/llm_moe_grpo/RL
index e4d0b38c3..fbc91dafe 160000
--- a/llm_moe_grpo/RL
+++ b/llm_moe_grpo/RL
@@ -1 +1 @@
-Subproject commit e4d0b38c3e9146ebf055647c85d305994d2bdb42
+Subproject commit fbc91dafeadf923095845e4058a6419beda05e85

From fc2542a4e7d70df046e3ea7108318a7f462d540a Mon Sep 17 00:00:00 2001
From: Jeremi Piotrowski <jpiotrowski@nvidia.com>
Date: Fri, 26 Jun 2026 19:41:02 +0200
Subject: [PATCH 21/22] llm_moe_grpo: Update submodule reference to NVIDIA-NeMo
 org

---
 .gitmodules | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index c93db8b62..7c190ac21 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -8,5 +8,5 @@
 	branch = mlperf-training-flux.1
 [submodule "llm_moe_grpo/RL"]
 	path = llm_moe_grpo/RL
-	url = https://github.com/CarlosGomes98/RL.git
-	branch = mlperf-training-qwen35
+	url = https://github.com/NVIDIA-NeMo/RL.git
+	branch = v0.6.0-mlperf-training-qwen35

From f40e314ff92dec99ca6e7302ad9a0f7042187205 Mon Sep 17 00:00:00 2001
From: Carlos Gomes <cgomes@nvidia.com>
Date: Tue, 30 Jun 2026 09:12:58 +0200
Subject: [PATCH 22/22] update readme link to correct repo

---
 llm_moe_grpo/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm_moe_grpo/README.md b/llm_moe_grpo/README.md
index f7d33ec2f..32763e8e0 100644
--- a/llm_moe_grpo/README.md
+++ b/llm_moe_grpo/README.md
@@ -2,7 +2,7 @@
 
 ## SWE Agent Reinforcement Learning  - GRPO with NeMo-Gym SWE/OpenHands.
 
-[NeMo-RL](https://github.com/CarlosGomes98/RL/tree/mlperf-training-qwen35) provides the implementation used for this benchmark from branch `mlperf-training-qwen35`. The benchmark uses Reinforcement Learning with Verifiable Rewards (RLVR) to train `Qwen/Qwen3.5-397B-A17B` with  Group Relative Policy Optimization (GRPO) against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
+[NeMo-RL](https://github.com/NVIDIA-NeMo/RL/tree/v0.6.0-mlperf-training-qwen35) provides the implementation used for this benchmark from branch `v0.6.0-mlperf-training-qwen35`. The benchmark uses Reinforcement Learning with Verifiable Rewards (RLVR) to train `Qwen/Qwen3.5-397B-A17B` with Group Relative Policy Optimization (GRPO) against a NeMo-Gym software-engineering environment driven by an OpenHands SWE agent.
 
 The task is to improve the SWE agent's accuracy in solving held-out R2E-Gym software-engineering tasks. A rollout receives reward 1 when the generated patch passes the task evaluation and reward 0 otherwise.