From 886e619bc42fbf28d15e6a9883fd295c7f56c3d1 Mon Sep 17 00:00:00 2001 From: Yeswanth K Date: Fri, 22 May 2026 13:08:04 -0700 Subject: [PATCH] [GB300][SGLang] Add GLM5 FP8 dynamo-sglang disagg configs Port PR69 GLM5 FP8 GB300 disaggregated SGLang recipes to SA upstream and wire gb300-nv launcher support while keeping SA-default SLURM account/partition and sqsh paths. --- .github/configs/nvidia-master.yaml | 210 ++++++++++++++++++ .../1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml | 148 ++++++++++++ .../1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml | 148 ++++++++++++ .../1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml | 148 ++++++++++++ .../1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml | 148 ++++++++++++ .../1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml | 148 ++++++++++++ .../1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml | 140 ++++++++++++ .../1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml | 140 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml | 148 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml | 148 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml | 148 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml | 148 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml | 140 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml | 140 ++++++++++++ .../8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml | 140 ++++++++++++ perf-changelog.yaml | 8 + runners/launch_gb300-nv.sh | 12 +- 17 files changed, 2261 insertions(+), 1 deletion(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index da46276e2..48493e06b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9234,3 +9234,213 @@ qwen3.5-fp8-h100-sglang-mtp: osl: 1024 search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + +glm5-fp8-gb300-dynamo-sglang: + image: lmsysorg/sglang:v0.5.11-cu130 + model: zai-org/GLM-5-FP8 + model-prefix: glm5 + runner: gb300-nv + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + # ---------- 8k1k high-throughput (wide-EP decode) ---------- + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [2800] + prefill: + num-worker: 14 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [1700] + prefill: + num-worker: 12 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true + - conc-list: [1300] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [900] + prefill: + num-worker: 8 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml" + decode: + num-worker: 1 + tp: 40 + ep: 40 + dp-attn: true + # ---------- 8k1k low-latency (per-node TP=4 decode workers) ---------- + - isl: 8192 + osl: 1024 + search-space: + - conc-list: [150] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml" + decode: + num-worker: 9 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [128, 64, 32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [24] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + # ---------- 1k1k high-throughput (wide-EP decode) ---------- + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [8192] + prefill: + num-worker: 12 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml" + decode: + num-worker: 1 + tp: 24 + ep: 24 + dp-attn: true + - conc-list: [7500] + prefill: + num-worker: 10 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [7300] + prefill: + num-worker: 8 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml" + decode: + num-worker: 1 + tp: 40 + ep: 40 + dp-attn: true + - conc-list: [6500] + prefill: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + - conc-list: [5700] + prefill: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml" + decode: + num-worker: 1 + tp: 56 + ep: 56 + dp-attn: true + # ---------- 1k1k low-latency (per-node TP=4 decode workers) ---------- + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [512, 256, 128, 64] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false + - conc-list: [32] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml" + decode: + num-worker: 17 + tp: 4 + ep: 1 + dp-attn: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml new file mode 100644 index 000000000..33da57e94 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_1k1k_hightpt_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 12 + decode_nodes: 6 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 24 + expert-parallel-size: 24 + data-parallel-size: 24 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 8192 + cuda-graph-max-bs: 512 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '8192' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml new file mode 100644 index 000000000..03cb7e671 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_1k1k_hightpt_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 10 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 32 + expert-parallel-size: 32 + data-parallel-size: 32 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 8192 + cuda-graph-max-bs: 256 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '7500' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml new file mode 100644 index 000000000..ef1a646d0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_1k1k_hightpt_2 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 8 + decode_nodes: 10 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 40 + expert-parallel-size: 40 + data-parallel-size: 40 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 24 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 256 + cuda-graph-max-bs: 180 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '7300' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml new file mode 100644 index 000000000..377ebac10 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_1k1k_hightpt_3 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 6 + prefill_workers: 6 + decode_nodes: 12 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 48 + expert-parallel-size: 48 + data-parallel-size: 48 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 6500 + cuda-graph-max-bs: 128 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '6500' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml new file mode 100644 index 000000000..54cef10cc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_1k1k_hightpt_4 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 4 + prefill_workers: 4 + decode_nodes: 14 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 56 + expert-parallel-size: 56 + data-parallel-size: 56 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 24 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 5700 + cuda-graph-max-bs: 100 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '5700' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..3ace5647c --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml @@ -0,0 +1,140 @@ +name: gb300-fp8-glm5_1k1k_lowlat_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 32 + cuda-graph-max-bs: 32 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: 512x256x128x64 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml new file mode 100644 index 000000000..965b6f148 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml @@ -0,0 +1,140 @@ +name: gb300-fp8-glm5_1k1k_lowlat_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 1 + cuda-graph-max-bs: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 1024 + osl: 1024 + concurrencies: '32' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml new file mode 100644 index 000000000..150e62233 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_8k1k_hightpt_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 14 + prefill_workers: 14 + decode_nodes: 4 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 16 + expert-parallel-size: 16 + data-parallel-size: 16 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 2800 + cuda-graph-max-bs: 175 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '2800' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml new file mode 100644 index 000000000..2f5ccb4bf --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_8k1k_hightpt_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 12 + decode_nodes: 6 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 24 + expert-parallel-size: 24 + data-parallel-size: 24 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 1700 + cuda-graph-max-bs: 70 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '1700' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml new file mode 100644 index 000000000..fea7335cc --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_8k1k_hightpt_2 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 10 + prefill_workers: 10 + decode_nodes: 8 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 32 + expert-parallel-size: 32 + data-parallel-size: 32 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 32 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 1300 + cuda-graph-max-bs: 40 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '1300' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml new file mode 100644 index 000000000..ec84f3261 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml @@ -0,0 +1,148 @@ +name: gb300-fp8-glm5_8k1k_hightpt_3 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 8 + prefill_workers: 8 + decode_nodes: 10 + decode_workers: 1 +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 9 +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 40 + expert-parallel-size: 40 + data-parallel-size: 40 + enable-dp-lm-head: true + enable-dp-attention: true + moe-dense-tp-size: 1 + ep-num-redundant-experts: 24 + ep-dispatch-algorithm: static + moe-a2a-backend: deepep + deepep-mode: low_latency + deepep-config: /configs/deepep_config.json + max-running-requests: 900 + cuda-graph-max-bs: 22 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '900' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml new file mode 100644 index 000000000..8065160bd --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml @@ -0,0 +1,140 @@ +name: gb300-fp8-glm5_8k1k_lowlat_0 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 9 + decode_workers: 9 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 15 + cuda-graph-max-bs: 15 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '150' diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml new file mode 100644 index 000000000..33f0324b6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml @@ -0,0 +1,140 @@ +name: gb300-fp8-glm5_8k1k_lowlat_1 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 8 + cuda-graph-max-bs: 8 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: 128x64x32 diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml new file mode 100644 index 000000000..64d4c701a --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml @@ -0,0 +1,140 @@ +name: gb300-fp8-glm5_8k1k_lowlat_2 + +model: + path: glm-5-fp8 + container: "lmsysorg/sglang:v0.5.11-cu130" + precision: fp8 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 17 + decode_workers: 17 +frontend: + type: dynamo +dynamo: + version: 1.1.0 + +backend: + prefill_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + + decode_environment: + TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800' + PYTHONUNBUFFERED: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + MC_TE_METRIC: 'true' + MC_FORCE_MNNVL: '1' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0' + SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1' + DYN_REQUEST_PLANE: nats + # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size). + # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024. + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512' + + sglang_config: + prefill: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: prefill + disaggregation-transfer-backend: nixl + + # Size limits + max-running-requests: 256 + cuda-graph-max-bs: 256 + mem-fraction-static: 0.7 + context-length: 9600 + chunked-prefill-size: 32768 + max-prefill-tokens: 8192 + + # Parallelism + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 1 + enable-dp-attention: true + enable-dp-lm-head: true + load-balance-method: total_tokens + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + moe-runner-backend: flashinfer_trtllm + + # Other flags + enable-flashinfer-allreduce-fusion: true + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + + decode: + # Model configuration + served-model-name: GLM-5-FP8 + trust-remote-code: true + + quantization: fp8 + kv-cache-dtype: fp8_e4m3 + + # Disaggregation mode + disaggregation-mode: decode + disaggregation-transfer-backend: nixl + + # Memory and token limits + mem-fraction-static: 0.8 + context-length: 9600 + + # Backend + nsa-decode-backend: trtllm + nsa-prefill-backend: trtllm + # moe-runner-backend: "cutedsl" + + # Detokenizer + skip-tokenizer-init: true + stream-interval: 30 + + # Other flags + disable-radix-cache: true + weight-loader-prefetch-checkpoints: true + model-loader-extra-config: '{"enable_multithread_load": true}' + tensor-parallel-size: 4 + expert-parallel-size: 1 + data-parallel-size: 1 + enable-flashinfer-allreduce-fusion: true + moe-runner-backend: flashinfer_trtllm + max-running-requests: 1 + cuda-graph-max-bs: 1 +health_check: + max_attempts: 360 + interval_seconds: 10 + +benchmark: + type: sa-bench + req_rate: inf + isl: 8192 + osl: 1024 + concurrencies: '24' diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 982a651e9..b3f462d05 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3100,3 +3100,11 @@ description: - "Truncate sweep to conc=1 and conc=2 only: set conc-start=1, conc-end=2 in every search-space across all six DSR1 SGLang agg configs" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1534 + +- config-keys: + - glm5-fp8-gb300-dynamo-sglang + description: + - "Add GLM-5 FP8 GB300 Dynamo SGLang disaggregated multi-node coverage using lmsysorg/sglang:v0.5.11-cu130" + - "1k1k and 8k1k STP hightpt and lowlat srt-slurm recipes under benchmarks/multi_node/srt-slurm-recipes/sglang/glm5/gb300-fp8/ (resolved from upstream srt-slurm PR #160 via srtctl resolve-override)" + - "Wire glm5/fp8 model + dynamo-sglang framework branches into runners/launch_gb300-nv.sh with SA upstream defaults (SLURM_PARTITION=batch_1, SLURM_ACCOUNT=benchmark, SQUASH_FILE under /home/sa-shared/gharunners/squash/)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX diff --git a/runners/launch_gb300-nv.sh b/runners/launch_gb300-nv.sh index 4ef4709d2..b964a89be 100644 --- a/runners/launch_gb300-nv.sh +++ b/runners/launch_gb300-nv.sh @@ -21,8 +21,11 @@ elif [[ $MODEL_PREFIX == "dsr1" && $PRECISION == "fp8" ]]; then elif [[ $MODEL_PREFIX == "dsv4" && $PRECISION == "fp4" ]]; then export MODEL_PATH=/scratch/models/DeepSeek-V4-Pro export SRT_SLURM_MODEL_PREFIX="deepseek-v4-pro" +elif [[ $MODEL_PREFIX == "glm5" && $PRECISION == "fp8" ]]; then + export MODEL_PATH=/scratch/models/GLM-5-FP8 + export SRT_SLURM_MODEL_PREFIX="glm-5-fp8" else - echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4" + echo "Unsupported model: $MODEL_PREFIX-$PRECISION. Supported models are: dsr1-fp4, dsr1-fp8, dsv4-fp4, glm5-fp8" exit 1 fi @@ -68,6 +71,12 @@ if [[ $FRAMEWORK == "dynamo-vllm" && $MODEL_PREFIX == "dsv4" ]]; then git checkout aflowers/gb200-dsv4-recipes mkdir -p recipes/vllm/deepseek-v4 cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/vllm/deepseek-v4" recipes/vllm/deepseek-v4 +elif [[ $FRAMEWORK == "dynamo-sglang" && $MODEL_PREFIX == "glm5" ]]; then + git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" + cd "$SRT_REPO_DIR" + git checkout sa-submission-q2-2026 + mkdir -p recipes/sglang/glm5 + cp -rT "$GITHUB_WORKSPACE/benchmarks/multi_node/srt-slurm-recipes/sglang/glm5" recipes/sglang/glm5 else git clone https://github.com/NVIDIA/srt-slurm.git "$SRT_REPO_DIR" cd "$SRT_REPO_DIR" @@ -116,6 +125,7 @@ model_paths: containers: dynamo-trtllm: ${SQUASH_FILE} dynamo-sglang: ${SQUASH_FILE} + "${IMAGE}": ${SQUASH_FILE} nginx-sqsh: ${NGINX_SQUASH_FILE} use_segment_sbatch_directive: false EOF