Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 92 additions & 21 deletions scripts/run-qwen3-30B-A3B.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash

# for rerun the task
pkill -9 vllm
pkill -9 -f "vllm serve"
sleep 3
ray stop --force
pkill -9 ray
Expand All @@ -24,17 +24,53 @@ else
fi
echo "HAS_NVLINK: $HAS_NVLINK (detected $NVLINK_COUNT NVLink references)"

if command -v nvidia-smi >/dev/null 2>&1; then
DETECTED_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l | tr -d ' ')
DETECTED_GPU_NAME=$(nvidia-smi --query-gpu=name --format=csv,noheader 2>/dev/null | head -n 1)
else
DETECTED_GPUS=0
DETECTED_GPU_NAME="unknown"
fi
NUM_GPUS=${NUM_GPUS:-8}
if [ -z "$NUM_GPUS" ] || [ "$NUM_GPUS" -le 0 ]; then
NUM_GPUS=8
fi
if [ "$DETECTED_GPUS" -gt 0 ] && [ "$NUM_GPUS" -gt "$DETECTED_GPUS" ]; then
echo "Requested NUM_GPUS=$NUM_GPUS but only detected $DETECTED_GPUS GPUs" >&2
exit 1
fi
echo "BENCHMARK_GPU: ${DETECTED_GPU_NAME}"
echo "NUM_GPUS: $NUM_GPUS"

SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)"
VIME_ROOT="$(cd -- "${SCRIPT_DIR}/.." &>/dev/null && pwd)"
source "${SCRIPT_DIR}/models/qwen3-30B-A3B.sh"

MEGATRON_TP=${MEGATRON_TP:-2}
MEGATRON_EP=${MEGATRON_EP:-${NUM_GPUS}}
MEGATRON_CP=${MEGATRON_CP:-1}
MAX_TOKENS_PER_GPU=${MAX_TOKENS_PER_GPU:-20480}
NUM_ROLLOUT=${NUM_ROLLOUT:-3000}
ROLLOUT_BATCH_SIZE=${ROLLOUT_BATCH_SIZE:-32}
N_SAMPLES_PER_PROMPT=${N_SAMPLES_PER_PROMPT:-8}
ROLLOUT_MAX_RESPONSE_LEN=${ROLLOUT_MAX_RESPONSE_LEN:-8192}
GLOBAL_BATCH_SIZE=${GLOBAL_BATCH_SIZE:-$((ROLLOUT_BATCH_SIZE * N_SAMPLES_PER_PROMPT))}
ROLLOUT_NUM_GPUS_PER_ENGINE=${ROLLOUT_NUM_GPUS_PER_ENGINE:-${NUM_GPUS}}
VLLM_GPU_MEMORY_UTILIZATION=${VLLM_GPU_MEMORY_UTILIZATION:-0.7}
VIME_CKPT_DIR=${VIME_CKPT_DIR:-/root/Qwen3-30B-A3B_vime}

CKPT_ARGS=(
--hf-checkpoint /root/Qwen3-30B-A3B
#--hf-checkpoint /root/Qwen3-30B-A3B-FP8
--ref-load /root/Qwen3-30B-A3B_torch_dist
--load /root/Qwen3-30B-A3B_vime/
--save /root/Qwen3-30B-A3B_vime/
--save-interval 20
--load "${VIME_CKPT_DIR}/"
)
if [[ "${VIME_DISABLE_SAVE:-0}" != "1" ]]; then
CKPT_ARGS+=(
--save "${VIME_CKPT_DIR}/"
--save-interval "${VIME_SAVE_INTERVAL:-20}"
)
fi

ROLLOUT_ARGS=(
--prompt-data /root/dapo-math-17k/dapo-math-17k.jsonl
Expand All @@ -43,13 +79,13 @@ ROLLOUT_ARGS=(
--apply-chat-template
--rollout-shuffle
--rm-type deepscaler
--num-rollout 3000
--rollout-batch-size 32
--n-samples-per-prompt 8
--rollout-max-response-len 8192
--num-rollout "${NUM_ROLLOUT}"
--rollout-batch-size "${ROLLOUT_BATCH_SIZE}"
--n-samples-per-prompt "${N_SAMPLES_PER_PROMPT}"
--rollout-max-response-len "${ROLLOUT_MAX_RESPONSE_LEN}"
--rollout-temperature 1

--global-batch-size 256
--global-batch-size "${GLOBAL_BATCH_SIZE}"
--balance-data
)

Expand All @@ -60,13 +96,16 @@ EVAL_ARGS=(
--eval-max-response-len 16384
--eval-top-p 1
)
if [[ "${VIME_SKIP_EVAL_BEFORE_TRAIN:-0}" == "1" ]]; then
EVAL_ARGS+=(--skip-eval-before-train)
fi

PERF_ARGS=(
--tensor-model-parallel-size 4
--tensor-model-parallel-size "${MEGATRON_TP}"
--sequence-parallel
--pipeline-model-parallel-size 1
--context-parallel-size 1
--expert-model-parallel-size 8
--context-parallel-size "${MEGATRON_CP}"
--expert-model-parallel-size "${MEGATRON_EP}"
--expert-tensor-parallel-size 1

--recompute-granularity full
Expand All @@ -75,8 +114,11 @@ PERF_ARGS=(

# --micro-batch-size 1
--use-dynamic-batch-size
--max-tokens-per-gpu 20480
--max-tokens-per-gpu "${MAX_TOKENS_PER_GPU}"
)
if [[ "${VIME_NO_GRAD_ACCUM_FUSION:-0}" == "1" ]]; then
PERF_ARGS+=(--no-gradient-accumulation-fusion)
fi

GRPO_ARGS=(
--advantage-estimator grpo
Expand Down Expand Up @@ -108,11 +150,24 @@ WANDB_ARGS=(
# --wandb-key ${WANDB_KEY}
)

TB_ARGS=()
if [[ "${VIME_TENSORBOARD:-0}" == "1" ]]; then
export TENSORBOARD_DIR="${TENSORBOARD_DIR:-${VIME_ROOT}/tensorboard_log/${TB_EXPERIMENT_NAME:-qwen3-30B-A3B}}"
TB_ARGS+=(--use-tensorboard)
TB_ARGS+=(--tb-project-name "${TB_PROJECT_NAME:-vime-rlk}")
TB_ARGS+=(--tb-experiment-name "${TB_EXPERIMENT_NAME:-qwen3-30B-A3B}")
fi

VLLM_ARGS=(
--rollout-num-gpus-per-engine 8
--vllm-gpu-memory-utilization 0.7
--vllm-cudagraph-capture-sizes 1 2 4 8 $(seq 16 8 256)
--rollout-num-gpus-per-engine "${ROLLOUT_NUM_GPUS_PER_ENGINE}"
--vllm-gpu-memory-utilization "${VLLM_GPU_MEMORY_UTILIZATION}"
--vllm-enable-expert-parallel
)
if [[ "${VIME_VLLM_ENFORCE_EAGER:-0}" == "1" ]]; then
VLLM_ARGS+=(--vllm-enforce-eager)
else
VLLM_ARGS+=(--vllm-cudagraph-capture-sizes 1 2 4 8 $(seq 16 8 256))
fi

MISC_ARGS=(
# default dropout in megatron is 0.1
Expand All @@ -125,32 +180,48 @@ MISC_ARGS=(
--attention-backend flash
)

RLK_ARGS=()
if [[ "${VIME_RL_KERNEL:-0}" == "1" ]]; then
RLK_ARGS+=(--enable-rl-kernel --rl-kernel-ops "${VIME_RL_KERNEL_OPS:-linear_logp}")
if [[ "${VIME_RL_KERNEL_STRICT:-0}" == "1" ]]; then
RLK_ARGS+=(--rl-kernel-strict)
fi
fi

# launch the master node of ray in container
export MASTER_ADDR=${MASTER_ADDR:-"127.0.0.1"}
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus 8 --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265
ray start --head --node-ip-address ${MASTER_ADDR} --num-gpus ${NUM_GPUS} --disable-usage-stats --dashboard-host=0.0.0.0 --dashboard-port=8265

# Build the runtime environment JSON with proper variable substitution
RUNTIME_ENV_JSON="{
\"env_vars\": {
\"PYTHONPATH\": \"/root/Megatron-LM/\",
\"PYTHONPATH\": \"${VIME_ROOT}:/root/Megatron-LM/\",
\"PATH\": \"${PATH}\",
\"CUDA_HOME\": \"${CUDA_HOME:-}\",
\"LD_LIBRARY_PATH\": \"${LD_LIBRARY_PATH:-}\",
\"CPATH\": \"${CPATH:-}\",
\"LIBRARY_PATH\": \"${LIBRARY_PATH:-}\",
\"CUDA_DEVICE_MAX_CONNECTIONS\": \"1\",
\"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\"
\"NCCL_NVLS_ENABLE\": \"${HAS_NVLINK}\",
\"TENSORBOARD_DIR\": \"${TENSORBOARD_DIR:-}\"
}
}"

ray job submit --address="http://127.0.0.1:8265" \
--runtime-env-json="${RUNTIME_ENV_JSON}" \
-- python3 train.py \
--actor-num-nodes 1 \
--actor-num-gpus-per-node 8 \
--actor-num-gpus-per-node ${NUM_GPUS} \
--colocate \
${MODEL_ARGS[@]} \
${CKPT_ARGS[@]} \
${ROLLOUT_ARGS[@]} \
${OPTIMIZER_ARGS[@]} \
${GRPO_ARGS[@]} \
${WANDB_ARGS[@]} \
${TB_ARGS[@]} \
${PERF_ARGS[@]} \
${EVAL_ARGS[@]} \
${VLLM_ARGS[@]} \
${MISC_ARGS[@]}
${MISC_ARGS[@]} \
${RLK_ARGS[@]}
89 changes: 3 additions & 86 deletions tests/_unit_stubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,8 @@ def install_rollout_optional_stubs() -> None:
"""Stub rollout-side optional imports when not installed."""
ensure_ray_stub()

install_vllm_router_stub()
if not real_module_available("vllm_router"):
sys.modules["vllm_router"] = types.ModuleType("vllm_router")

if not real_module_available("PIL"):
pil = types.ModuleType("PIL")
Expand Down Expand Up @@ -96,48 +97,6 @@ def _raise_os_error(*args, **kwargs):
sys.modules["pylatexenc"] = pylatexenc
sys.modules["pylatexenc.latex2text"] = latex2text

install_wandb_stub()


def install_vllm_router_stub() -> None:
if real_module_available("vllm_router"):
return

class RouterArgs:
@classmethod
def add_cli_args(cls, parser, *args, **kwargs): # noqa: ARG003
return parser

@classmethod
def from_cli_args(cls, args, *unused_args, **unused_kwargs): # noqa: ARG003
return types.SimpleNamespace()

router_mod = types.ModuleType("vllm_router")
router_mod.__path__ = []
launch_router_mod = types.ModuleType("vllm_router.launch_router")
router_args_mod = types.ModuleType("vllm_router.router_args")
launch_router_mod.RouterArgs = RouterArgs
router_args_mod.RouterArgs = RouterArgs
router_mod.launch_router = launch_router_mod
router_mod.router_args = router_args_mod
sys.modules["vllm_router"] = router_mod
sys.modules["vllm_router.launch_router"] = launch_router_mod
sys.modules["vllm_router.router_args"] = router_args_mod


def install_wandb_stub() -> None:
if real_module_available("wandb"):
return
wandb_mod = types.ModuleType("wandb")
wandb_mod.run = None
wandb_mod.log = MagicMock()
wandb_mod.finish = MagicMock()
wandb_mod.login = MagicMock()
wandb_mod.init = MagicMock()
wandb_mod.Settings = MagicMock()
wandb_mod.util = types.SimpleNamespace(generate_id=lambda: "unit-test")
sys.modules["wandb"] = wandb_mod


def save_sys_modules(names: Iterable[str]) -> dict[str, Any]:
return {k: sys.modules.get(k) for k in names}
Expand Down Expand Up @@ -237,57 +196,14 @@ def add_cli_args(cls, parser): # noqa: ARG003

arg_utils.AsyncEngineArgs = AsyncEngineArgs
engine_mod.arg_utils = arg_utils
system_utils_mod = types.ModuleType("vllm.utils.system_utils")
system_utils_mod.kill_process_tree = lambda pid, include_parent=True: None # noqa: ARG005
utils_mod.system_utils = system_utils_mod

# vllm.entrypoints stubs (used by arguments.add_vllm_arguments and vllm_engine._vllm_server_field_names)
entrypoints_mod = types.ModuleType("vllm.entrypoints")
entrypoints_mod.__path__ = []
openai_mod = types.ModuleType("vllm.entrypoints.openai")
openai_mod.__path__ = []
cli_args_mod = types.ModuleType("vllm.entrypoints.openai.cli_args")

import dataclasses as _dc

@_dc.dataclass
class FrontendArgs:
@classmethod
def add_cli_args(cls, parser): # noqa: ARG003
return parser

cli_args_mod.FrontendArgs = FrontendArgs
cli_args_mod.make_arg_parser = lambda parser=None: parser
cli_args_mod.validate_parsed_serve_args = lambda args: args
openai_mod.cli_args = cli_args_mod
entrypoints_mod.openai = openai_mod
vllm_mod.entrypoints = entrypoints_mod

cli_mod = types.ModuleType("vllm.entrypoints.cli")
cli_mod.__path__ = []
serve_mod = types.ModuleType("vllm.entrypoints.cli.serve")

class ServeSubcommand:
pass

serve_mod.ServeSubcommand = ServeSubcommand
cli_mod.serve = serve_mod
entrypoints_mod.cli = cli_mod

vllm_mod.engine = engine_mod
vllm_mod.utils = utils_mod

sys.modules["vllm"] = vllm_mod
sys.modules["vllm.utils"] = utils_mod
sys.modules["vllm.utils.argparse_utils"] = argparse_utils
sys.modules["vllm.utils.system_utils"] = system_utils_mod
sys.modules["vllm.engine"] = engine_mod
sys.modules["vllm.engine.arg_utils"] = arg_utils
sys.modules["vllm.entrypoints"] = entrypoints_mod
sys.modules["vllm.entrypoints.openai"] = openai_mod
sys.modules["vllm.entrypoints.openai.cli_args"] = cli_args_mod
sys.modules["vllm.entrypoints.cli"] = cli_mod
sys.modules["vllm.entrypoints.cli.serve"] = serve_mod


def install_triton_stub() -> None:
Expand All @@ -307,4 +223,5 @@ def install_triton_stub() -> None:
def install_vime_distributed_utils_stub() -> None:
vime_utils = types.ModuleType("vime.utils.distributed_utils")
vime_utils.get_gloo_group = MagicMock(return_value="gloo")
vime_utils.distributed_masked_whiten = MagicMock(side_effect=lambda values, *args, **kwargs: values)
sys.modules.setdefault("vime.utils.distributed_utils", vime_utils)
Loading