diff --git a/examples/agent_interaction/parallel_infer.py b/examples/agent_interaction/parallel_infer.py index 58bece2f..d9965e09 100644 --- a/examples/agent_interaction/parallel_infer.py +++ b/examples/agent_interaction/parallel_infer.py @@ -1,4 +1,5 @@ import argparse +import json import logging import os from pathlib import Path @@ -111,11 +112,30 @@ def run_inference(args: argparse.Namespace): # 6. Process results rm_scores = output.batch["rm_scores"].sum(dim=-1).tolist() - mean_score = np.mean(rm_scores) + mean_score = float(np.mean(rm_scores)) if len(rm_scores) > 0 else 0.0 logger.info(f"Generation completed. Mean RM Score: {mean_score:.4f}") print(f"\n=> Mean RM Score: {mean_score:.4f}\n") + # 7. Optionally persist a machine-readable result file (used by eval_checkpoints.py). + if args.result_path: + result_path = os.path.expanduser(args.result_path) + os.makedirs(os.path.dirname(result_path) or ".", exist_ok=True) + result = { + "model_path": os.path.expanduser(args.model_path), + "data_path": data_path, + "agent_config_path": os.path.expanduser(args.agent_config_path), + "n": config.actor_rollout_ref.rollout.n, + "num_samples": len(rm_scores), + "mean_rm_score": mean_score, + "rm_scores": rm_scores, + } + with open(result_path, "w") as f: + json.dump(result, f, indent=2) + logger.info(f"Wrote result file to: {result_path}") + + return mean_score + def main(): parser = argparse.ArgumentParser(description="Uni-Agent Inference Runner") @@ -140,6 +160,12 @@ def main(): default="examples/agent_interaction/agent_config.yaml", help="Path to the agent loop configuration YAML.", ) + parser.add_argument( + "--result-path", + type=str, + default=None, + help="Optional path to write a JSON result file (mean reward and per-rollout scores).", + ) # Inference parameters parser.add_argument("--max-turns", type=int, default=100, help="Maximum number of interaction turns per episode.") diff --git a/examples/agent_train/train_qwen3_moe.sh b/examples/agent_train/train_qwen3_moe.sh index 97e62f5b..ccc14827 100644 --- a/examples/agent_train/train_qwen3_moe.sh +++ b/examples/agent_train/train_qwen3_moe.sh @@ -79,6 +79,22 @@ trigger_parameter_sync_step=4 require_batches=1 partial_rollout=True +# ============================================================================ +# Decoupled PPO (bypass_mode=False) + Rollout Correction (Rollout IS) +# ============================================================================ +bypass_mode=${BYPASS_MODE:-False} # False => decoupled PPO (recompute old_log_prob as proximal anchor) +rollout_is=${ROLLOUT_IS:-token} # token | sequence | null (IS aggregation level) +rollout_is_threshold=${ROLLOUT_IS_THRESHOLD:-2.0} # single float => TIS upper clamp; "lo_hi" string => IcePop +rollout_is_batch_normalize=${ROLLOUT_IS_BATCH_NORMALIZE:-False} # normalize IS weights to mean=1.0 within a batch +rollout_rs=${ROLLOUT_RS:-seq_mean_k1} # seq_mean_k1 | seq_mean_k3 | token_k1 | null +rollout_rs_threshold=${ROLLOUT_RS_THRESHOLD:-"0.999_1.001"} # k1: "lo_hi" ratio band; k3: single upper bound + +# ============================================================================ +# 30B MoE Router Replay (R3) +# ============================================================================ +router_replay_mode=${ROUTER_REPLAY_MODE:-R3} # disabled | R2 | R3 +enable_rollout_routing_replay=${ENABLE_ROLLOUT_ROUTING_REPLAY:-True} # required for R3 (rollout-side replay) + ray job submit --no-wait --runtime-env $RUNTIME_ENV \ -- python3 -m verl.experimental.fully_async_policy.fully_async_main \ --config-name='fully_async_ppo_megatron_trainer.yaml' \ @@ -140,10 +156,17 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \ +actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \ - actor_rollout_ref.actor.router_replay.mode="R3" \ - actor_rollout_ref.rollout.enable_rollout_routing_replay=True \ + algorithm.rollout_correction.bypass_mode=${bypass_mode} \ + algorithm.rollout_correction.rollout_is=${rollout_is} \ + algorithm.rollout_correction.rollout_is_threshold=${rollout_is_threshold} \ + algorithm.rollout_correction.rollout_is_batch_normalize=${rollout_is_batch_normalize} \ + algorithm.rollout_correction.rollout_rs=${rollout_rs} \ + algorithm.rollout_correction.rollout_rs_threshold="${rollout_rs_threshold}" \ + actor_rollout_ref.actor.router_replay.mode=${router_replay_mode} \ + actor_rollout_ref.rollout.enable_rollout_routing_replay=${enable_rollout_routing_replay} \ actor_rollout_ref.actor.entropy_coeff=0 \ actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \ + +actor_rollout_ref.actor.checkpoint.save_contents=['model','hf_model'] \ actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ actor_rollout_ref.rollout.multi_turn.enable=True \ actor_rollout_ref.rollout.multi_turn.max_parallel_calls=1 \ @@ -153,6 +176,7 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \ actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \ actor_rollout_ref.rollout.enable_chunked_prefill=True \ actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \ + actor_rollout_ref.rollout.max_model_len=$((max_prompt_length + max_response_length)) \ actor_rollout_ref.rollout.temperature=${temperature} \ actor_rollout_ref.rollout.top_p=${top_p} \ actor_rollout_ref.rollout.top_k=${top_k} \ @@ -165,6 +189,7 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \ actor_rollout_ref.rollout.mode=${rollout_mode} \ actor_rollout_ref.rollout.calculate_log_probs=True \ actor_rollout_ref.hybrid_engine=False \ + actor_rollout_ref.nccl_timeout=9600 \ actor_rollout_ref.rollout.enforce_eager=False \ actor_rollout_ref.rollout.free_cache_engine=True \ actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \ diff --git a/examples/data_preprocess/swe_rebench.py b/examples/data_preprocess/swe_rebench.py index 5a56c0dd..3add8b28 100644 --- a/examples/data_preprocess/swe_rebench.py +++ b/examples/data_preprocess/swe_rebench.py @@ -153,9 +153,9 @@ def process_swe_rebench(example): } return sample - data_source = "dyyyyyyyy/swe-rebench-filtered" + data_source = "nebius/SWE-rebench" print(f"Loading the {data_source} dataset from huggingface...", flush=True) - dataset = load_dataset(data_source, split="train") + dataset = load_dataset(data_source, split="filtered") dataset = dataset.map(process_swe_rebench, remove_columns=dataset.column_names) dataset = dataset.filter(lambda ex: ex["extra_info"]["tools_kwargs"]["env"]["deployment"]["image"] is not None) return dataset