verl-project · yyDing1 · Jun 24, 2026 · Jun 24, 2026 · gemini-code-assist · Jun 24, 2026
diff --git a/examples/agent_interaction/parallel_infer.py b/examples/agent_interaction/parallel_infer.py
@@ -1,4 +1,5 @@
 import argparse
+import json
 import logging
 import os
 from pathlib import Path
@@ -111,11 +112,30 @@ def run_inference(args: argparse.Namespace):
 
     # 6. Process results
     rm_scores = output.batch["rm_scores"].sum(dim=-1).tolist()
-    mean_score = np.mean(rm_scores)
+    mean_score = float(np.mean(rm_scores)) if len(rm_scores) > 0 else 0.0
 
     logger.info(f"Generation completed. Mean RM Score: {mean_score:.4f}")
     print(f"\n=> Mean RM Score: {mean_score:.4f}\n")
 
+    # 7. Optionally persist a machine-readable result file (used by eval_checkpoints.py).
+    if args.result_path:
+        result_path = os.path.expanduser(args.result_path)
+        os.makedirs(os.path.dirname(result_path) or ".", exist_ok=True)
+        result = {
+            "model_path": os.path.expanduser(args.model_path),
+            "data_path": data_path,
+            "agent_config_path": os.path.expanduser(args.agent_config_path),
+            "n": config.actor_rollout_ref.rollout.n,
+            "num_samples": len(rm_scores),
+            "mean_rm_score": mean_score,
+            "rm_scores": rm_scores,
+        }
+        with open(result_path, "w", encoding="utf-8") as f:
+            json.dump(result, f, indent=2)
+        logger.info(f"Wrote result file to: {result_path}")
+
+    return mean_score
+
 
 def main():
     parser = argparse.ArgumentParser(description="Uni-Agent Inference Runner")
@@ -140,6 +160,12 @@ def main():
         default="examples/agent_interaction/agent_config.yaml",
         help="Path to the agent loop configuration YAML.",
     )
+    parser.add_argument(
+        "--result-path",
+        type=str,
+        default=None,
+        help="Optional path to write a JSON result file (mean reward and per-rollout scores).",
+    )
 
     # Inference parameters
     parser.add_argument("--max-turns", type=int, default=100, help="Maximum number of interaction turns per episode.")

diff --git a/examples/agent_train/train_qwen3_moe.sh b/examples/agent_train/train_qwen3_moe.sh
@@ -79,6 +79,22 @@ trigger_parameter_sync_step=4
 require_batches=1
 partial_rollout=True
 
+# ============================================================================
+# Decoupled PPO (bypass_mode=False) + Rollout Correction (Rollout IS)
+# ============================================================================
+bypass_mode=${BYPASS_MODE:-False}                                # False => decoupled PPO (recompute old_log_prob as proximal anchor)
+rollout_is=${ROLLOUT_IS:-token}                                  # token | sequence | null  (IS aggregation level)
+rollout_is_threshold=${ROLLOUT_IS_THRESHOLD:-2.0}                # single float => TIS upper clamp; "lo_hi" string => IcePop
+rollout_is_batch_normalize=${ROLLOUT_IS_BATCH_NORMALIZE:-False}  # normalize IS weights to mean=1.0 within a batch
+rollout_rs=${ROLLOUT_RS:-seq_mean_k1}                            # seq_mean_k1 | seq_mean_k3 | token_k1 | null
+rollout_rs_threshold=${ROLLOUT_RS_THRESHOLD:-"0.999_1.001"}      # k1: "lo_hi" ratio band; k3: single upper bound
+
+# ============================================================================
+# 30B MoE Router Replay (R3)
+# ============================================================================
+router_replay_mode=${ROUTER_REPLAY_MODE:-R3}                          # disabled | R2 | R3
+enable_rollout_routing_replay=${ENABLE_ROLLOUT_ROUTING_REPLAY:-True}  # required for R3 (rollout-side replay)
+
 ray job submit --no-wait --runtime-env $RUNTIME_ENV \
     -- python3 -m verl.experimental.fully_async_policy.fully_async_main \
     --config-name='fully_async_ppo_megatron_trainer.yaml' \
@@ -140,10 +156,17 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \
     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
     +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
-    actor_rollout_ref.actor.router_replay.mode="R3" \
-    actor_rollout_ref.rollout.enable_rollout_routing_replay=True \
+    algorithm.rollout_correction.bypass_mode=${bypass_mode} \
+    algorithm.rollout_correction.rollout_is=${rollout_is} \
+    algorithm.rollout_correction.rollout_is_threshold=${rollout_is_threshold} \
+    algorithm.rollout_correction.rollout_is_batch_normalize=${rollout_is_batch_normalize} \
+    algorithm.rollout_correction.rollout_rs=${rollout_rs} \
+    algorithm.rollout_correction.rollout_rs_threshold="${rollout_rs_threshold}" \
+    actor_rollout_ref.actor.router_replay.mode=${router_replay_mode} \
+    actor_rollout_ref.rollout.enable_rollout_routing_replay=${enable_rollout_routing_replay} \
     actor_rollout_ref.actor.entropy_coeff=0 \
     actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+    +actor_rollout_ref.actor.checkpoint.save_contents="['model','hf_model']" \
     actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
     actor_rollout_ref.rollout.multi_turn.enable=True \
     actor_rollout_ref.rollout.multi_turn.max_parallel_calls=1 \
@@ -153,6 +176,7 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \
     actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
     actor_rollout_ref.rollout.enable_chunked_prefill=True \
     actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
+    actor_rollout_ref.rollout.max_model_len=$((max_prompt_length + max_response_length)) \
     actor_rollout_ref.rollout.temperature=${temperature} \
     actor_rollout_ref.rollout.top_p=${top_p} \
     actor_rollout_ref.rollout.top_k=${top_k} \
@@ -165,6 +189,7 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \
     actor_rollout_ref.rollout.mode=${rollout_mode} \
     actor_rollout_ref.rollout.calculate_log_probs=True \
     actor_rollout_ref.hybrid_engine=False \
+    actor_rollout_ref.nccl_timeout=9600 \
     actor_rollout_ref.rollout.enforce_eager=False \
     actor_rollout_ref.rollout.free_cache_engine=True \
     actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \

diff --git a/examples/data_preprocess/swe_rebench.py b/examples/data_preprocess/swe_rebench.py
@@ -153,9 +153,9 @@ def process_swe_rebench(example):
         }
         return sample
 
-    data_source = "dyyyyyyyy/swe-rebench-filtered"
+    data_source = "nebius/SWE-rebench"
     print(f"Loading the {data_source} dataset from huggingface...", flush=True)
-    dataset = load_dataset(data_source, split="train")
+    dataset = load_dataset(data_source, split="filtered")
     dataset = dataset.map(process_swe_rebench, remove_columns=dataset.column_names)
     dataset = dataset.filter(lambda ex: ex["extra_info"]["tools_kwargs"]["env"]["deployment"]["image"] is not None)
     return dataset