Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 27 additions & 1 deletion examples/agent_interaction/parallel_infer.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import argparse
import json
import logging
import os
from pathlib import Path
Expand Down Expand Up @@ -111,11 +112,30 @@ def run_inference(args: argparse.Namespace):

# 6. Process results
rm_scores = output.batch["rm_scores"].sum(dim=-1).tolist()
mean_score = np.mean(rm_scores)
mean_score = float(np.mean(rm_scores)) if len(rm_scores) > 0 else 0.0

logger.info(f"Generation completed. Mean RM Score: {mean_score:.4f}")
print(f"\n=> Mean RM Score: {mean_score:.4f}\n")

# 7. Optionally persist a machine-readable result file (used by eval_checkpoints.py).
if args.result_path:
result_path = os.path.expanduser(args.result_path)
os.makedirs(os.path.dirname(result_path) or ".", exist_ok=True)
result = {
"model_path": os.path.expanduser(args.model_path),
"data_path": data_path,
"agent_config_path": os.path.expanduser(args.agent_config_path),
"n": config.actor_rollout_ref.rollout.n,
"num_samples": len(rm_scores),
"mean_rm_score": mean_score,
"rm_scores": rm_scores,
}
with open(result_path, "w") as f:
json.dump(result, f, indent=2)
Comment on lines +133 to +134

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

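When opening a file for writing text, it is highly recommended to pass `encoding="utf-8"` explicitly. Without it, Python falls back to the locale's preferred encoding, which differs across platforms and environments (e.g., Windows vs. Linux), so the same script can produce differently encoded output files.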

Suggested change
with open(result_path, "w") as f:
json.dump(result, f, indent=2)
with open(result_path, "w", encoding="utf-8") as f:
json.dump(result, f, indent=2)

logger.info(f"Wrote result file to: {result_path}")

return mean_score


def main():
parser = argparse.ArgumentParser(description="Uni-Agent Inference Runner")
Expand All @@ -140,6 +160,12 @@ def main():
default="examples/agent_interaction/agent_config.yaml",
help="Path to the agent loop configuration YAML.",
)
parser.add_argument(
"--result-path",
type=str,
default=None,
help="Optional path to write a JSON result file (mean reward and per-rollout scores).",
)

# Inference parameters
parser.add_argument("--max-turns", type=int, default=100, help="Maximum number of interaction turns per episode.")
Expand Down
29 changes: 27 additions & 2 deletions examples/agent_train/train_qwen3_moe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,22 @@ trigger_parameter_sync_step=4
require_batches=1
partial_rollout=True

# ============================================================================
# Decoupled PPO (bypass_mode=False) + Rollout Correction (Rollout IS)
# ============================================================================
bypass_mode=${BYPASS_MODE:-False} # False => decoupled PPO (recompute old_log_prob as proximal anchor)
rollout_is=${ROLLOUT_IS:-token} # token | sequence | null (IS aggregation level)
rollout_is_threshold=${ROLLOUT_IS_THRESHOLD:-2.0} # single float => TIS upper clamp; "lo_hi" string => IcePop
rollout_is_batch_normalize=${ROLLOUT_IS_BATCH_NORMALIZE:-False} # normalize IS weights to mean=1.0 within a batch
rollout_rs=${ROLLOUT_RS:-seq_mean_k1} # seq_mean_k1 | seq_mean_k3 | token_k1 | null
rollout_rs_threshold=${ROLLOUT_RS_THRESHOLD:-"0.999_1.001"} # k1: "lo_hi" ratio band; k3: single upper bound

# ============================================================================
# 30B MoE Router Replay (R3)
# ============================================================================
router_replay_mode=${ROUTER_REPLAY_MODE:-R3} # disabled | R2 | R3
enable_rollout_routing_replay=${ENABLE_ROLLOUT_ROUTING_REPLAY:-True} # required for R3 (rollout-side replay)

ray job submit --no-wait --runtime-env $RUNTIME_ENV \
-- python3 -m verl.experimental.fully_async_policy.fully_async_main \
--config-name='fully_async_ppo_megatron_trainer.yaml' \
Expand Down Expand Up @@ -140,10 +156,17 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_method=uniform \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_granularity=full \
+actor_rollout_ref.actor.megatron.override_transformer_config.recompute_num_layers=1 \
actor_rollout_ref.actor.router_replay.mode="R3" \
actor_rollout_ref.rollout.enable_rollout_routing_replay=True \
algorithm.rollout_correction.bypass_mode=${bypass_mode} \
algorithm.rollout_correction.rollout_is=${rollout_is} \
algorithm.rollout_correction.rollout_is_threshold=${rollout_is_threshold} \
algorithm.rollout_correction.rollout_is_batch_normalize=${rollout_is_batch_normalize} \
algorithm.rollout_correction.rollout_rs=${rollout_rs} \
algorithm.rollout_correction.rollout_rs_threshold="${rollout_rs_threshold}" \
actor_rollout_ref.actor.router_replay.mode=${router_replay_mode} \
actor_rollout_ref.rollout.enable_rollout_routing_replay=${enable_rollout_routing_replay} \
actor_rollout_ref.actor.entropy_coeff=0 \
actor_rollout_ref.actor.loss_agg_mode=${loss_agg_mode} \
+actor_rollout_ref.actor.checkpoint.save_contents=['model','hf_model'] \

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

In bash, an unquoted argument containing square brackets `[` and `]` (like `['model','hf_model']`) is treated as a glob pattern: if any file in the current directory matches it, the shell silently replaces the argument with the matching filename(s). The shell also strips the inner single quotes before Hydra ever sees the value. To prevent unexpected shell expansion and ensure robust parsing by Hydra, please quote the entire argument.

Suggested change
+actor_rollout_ref.actor.checkpoint.save_contents=['model','hf_model'] \
+actor_rollout_ref.actor.checkpoint.save_contents="['model','hf_model']" \

actor_rollout_ref.rollout.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
actor_rollout_ref.rollout.multi_turn.enable=True \
actor_rollout_ref.rollout.multi_turn.max_parallel_calls=1 \
Expand All @@ -153,6 +176,7 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \
actor_rollout_ref.rollout.tensor_model_parallel_size=${gen_tp} \
actor_rollout_ref.rollout.enable_chunked_prefill=True \
actor_rollout_ref.rollout.max_num_batched_tokens=$((max_prompt_length + max_response_length)) \
actor_rollout_ref.rollout.max_model_len=$((max_prompt_length + max_response_length)) \
actor_rollout_ref.rollout.temperature=${temperature} \
actor_rollout_ref.rollout.top_p=${top_p} \
actor_rollout_ref.rollout.top_k=${top_k} \
Expand All @@ -165,6 +189,7 @@ ray job submit --no-wait --runtime-env $RUNTIME_ENV \
actor_rollout_ref.rollout.mode=${rollout_mode} \
actor_rollout_ref.rollout.calculate_log_probs=True \
actor_rollout_ref.hybrid_engine=False \
actor_rollout_ref.nccl_timeout=9600 \
actor_rollout_ref.rollout.enforce_eager=False \
actor_rollout_ref.rollout.free_cache_engine=True \
actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=${infer_ppo_max_token_len} \
Expand Down
4 changes: 2 additions & 2 deletions examples/data_preprocess/swe_rebench.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,9 +153,9 @@ def process_swe_rebench(example):
}
return sample

data_source = "dyyyyyyyy/swe-rebench-filtered"
data_source = "nebius/SWE-rebench"
print(f"Loading the {data_source} dataset from huggingface...", flush=True)
dataset = load_dataset(data_source, split="train")
dataset = load_dataset(data_source, split="filtered")
dataset = dataset.map(process_swe_rebench, remove_columns=dataset.column_names)
dataset = dataset.filter(lambda ex: ex["extra_info"]["tools_kwargs"]["env"]["deployment"]["image"] is not None)
return dataset
Expand Down
Loading