Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
210 changes: 210 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9234,3 +9234,213 @@ qwen3.5-fp8-h100-sglang-mtp:
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }

# GLM-5 FP8 sweep on GB300 with the dynamo-sglang framework, run multi-node
# with disaggregated prefill and decode workers.
# Each search-space entry names the concurrency points to run, the
# prefill/decode worker topology, and (via CONFIG_FILE) the recipe file that
# describes that topology.
glm5-fp8-gb300-dynamo-sglang:
  image: lmsysorg/sglang:v0.5.11-cu130
  model: zai-org/GLM-5-FP8
  model-prefix: glm5
  runner: gb300-nv
  precision: fp8
  framework: dynamo-sglang
  multinode: true
  disagg: true
  scenarios:
    fixed-seq-len:
      # ---------- 8k1k high-throughput (wide-EP decode) ----------
      # Prefill workers shrink (14 -> 8) as the single decode worker widens
      # (tp/ep 16 -> 40). If each worker occupies `tp` GPUs, every point sums
      # to 72 GPUs (e.g. 14*4 + 16) - presumably one full rack; TODO confirm.
      - isl: 8192
        osl: 1024
        search-space:
          - conc-list: [2800]
            prefill:
              num-worker: 14
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_0.yaml"
            decode:
              num-worker: 1
              tp: 16
              ep: 16
              dp-attn: true
          - conc-list: [1700]
            prefill:
              num-worker: 12
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_1.yaml"
            decode:
              num-worker: 1
              tp: 24
              ep: 24
              dp-attn: true
          - conc-list: [1300]
            prefill:
              num-worker: 10
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_2.yaml"
            decode:
              num-worker: 1
              tp: 32
              ep: 32
              dp-attn: true
          - conc-list: [900]
            prefill:
              num-worker: 8
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_hightpt_3.yaml"
            decode:
              num-worker: 1
              tp: 40
              ep: 40
              dp-attn: true
      # ---------- 8k1k low-latency (per-node TP=4 decode workers) ----------
      # One prefill worker feeds many small decode workers (tp 4, ep 1,
      # dp-attn off). lowlat_1 and lowlat_2 share the same topology here and
      # differ only in concurrency list and recipe file.
      - isl: 8192
        osl: 1024
        search-space:
          - conc-list: [150]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_0.yaml"
            decode:
              num-worker: 9
              tp: 4
              ep: 1
              dp-attn: false
          - conc-list: [128, 64, 32]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_1.yaml"
            decode:
              num-worker: 17
              tp: 4
              ep: 1
              dp-attn: false
          - conc-list: [24]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/8k1k/disagg/stp/8k1k_stp_lowlat_2.yaml"
            decode:
              num-worker: 17
              tp: 4
              ep: 1
              dp-attn: false
      # ---------- 1k1k high-throughput (wide-EP decode) ----------
      # Same trade-off as the 8k1k high-throughput group: prefill workers
      # 12 -> 4 while decode tp/ep grows 24 -> 56 (again 72 GPUs per point
      # under the same one-GPU-per-tp-rank assumption; TODO confirm).
      - isl: 1024
        osl: 1024
        search-space:
          - conc-list: [8192]
            prefill:
              num-worker: 12
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_0.yaml"
            decode:
              num-worker: 1
              tp: 24
              ep: 24
              dp-attn: true
          - conc-list: [7500]
            prefill:
              num-worker: 10
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_1.yaml"
            decode:
              num-worker: 1
              tp: 32
              ep: 32
              dp-attn: true
          - conc-list: [7300]
            prefill:
              num-worker: 8
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_2.yaml"
            decode:
              num-worker: 1
              tp: 40
              ep: 40
              dp-attn: true
          - conc-list: [6500]
            prefill:
              num-worker: 6
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_3.yaml"
            decode:
              num-worker: 1
              tp: 48
              ep: 48
              dp-attn: true
          - conc-list: [5700]
            prefill:
              num-worker: 4
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_hightpt_4.yaml"
            decode:
              num-worker: 1
              tp: 56
              ep: 56
              dp-attn: true
      # ---------- 1k1k low-latency (per-node TP=4 decode workers) ----------
      # Both points use 1 prefill worker and 17 decode workers; they differ
      # only in concurrency list and recipe file.
      - isl: 1024
        osl: 1024
        search-space:
          - conc-list: [512, 256, 128, 64]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_0.yaml"
            decode:
              num-worker: 17
              tp: 4
              ep: 1
              dp-attn: false
          - conc-list: [32]
            prefill:
              num-worker: 1
              tp: 4
              ep: 1
              dp-attn: true
              additional-settings:
                - "CONFIG_FILE=recipes/sglang/glm5/gb300-fp8/1k1k/disagg/stp/1k1k_stp_lowlat_1.yaml"
            decode:
              num-worker: 17
              tp: 4
              ep: 1
              dp-attn: false
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
# Recipe identifier: GB300, FP8, GLM-5, 1k/1k sequence lengths,
# high-throughput point 0.
name: gb300-fp8-glm5_1k1k_hightpt_0

# Model to serve and the container image used to serve it.
model:
  path: glm-5-fp8
  container: "lmsysorg/sglang:v0.5.11-cu130"
  precision: fp8

# Cluster allocation for this recipe.
# Prefill: 12 nodes for 12 workers, i.e. one worker per 4-GPU node.
# Decode: 6 nodes x 4 GPUs = 24 GPUs for a single worker, which matches the
# decode tensor-parallel-size of 24 under backend.sglang_config.
resources:
  gpu_type: gb300
  gpus_per_node: 4
  prefill_nodes: 12
  prefill_workers: 12
  decode_nodes: 6
  decode_workers: 1
# Request frontend: Dynamo, with multiple frontends enabled and 9 additional
# instances requested (presumably 10 in total including the primary one;
# TODO confirm how the launcher counts them).
frontend:
  type: dynamo
  enable_multiple_frontends: true
  num_additional_frontends: 9
# Dynamo release to use. A scalar with two dots is loaded by YAML as a
# string, so 1.1.0 needs no quoting to stay a version string.
dynamo:
  version: 1.1.0

# Backend settings: per-role environment variables plus the SGLang options for
# the prefill and decode workers (the sglang_config keys mirror SGLang launch
# flags - presumably passed through as command-line arguments; TODO confirm).
backend:
  # Environment for prefill workers. Numeric and boolean-looking values are
  # quoted so YAML keeps them as strings.
  # NOTE(review): the MC_* and SGLANG_MOONCAKE_* variables look
  # Mooncake-related while disaggregation-transfer-backend below is nixl -
  # confirm they are still needed.
  prefill_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
    PYTHONUNBUFFERED: '1'
    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
    MC_TE_METRIC: 'true'
    MC_FORCE_MNNVL: '1'
    NCCL_MNNVL_ENABLE: '1'
    NCCL_CUMEM_ENABLE: '1'
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
    DYN_REQUEST_PLANE: nats

  # Environment for decode workers: the same variables and values as
  # prefill_environment, plus the DeepEP dispatch-buffer size at the end.
  decode_environment:
    TORCH_DISTRIBUTED_DEFAULT_TIMEOUT: '1800'
    PYTHONUNBUFFERED: '1'
    DYN_SKIP_SGLANG_LOG_FORMATTING: '1'
    SGLANG_DISAGGREGATION_HEARTBEAT_MAX_FAILURE: '100000'
    SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000'
    SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000'
    MC_TE_METRIC: 'true'
    MC_FORCE_MNNVL: '1'
    NCCL_MNNVL_ENABLE: '1'
    NCCL_CUMEM_ENABLE: '1'
    SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True'
    SGLANG_USE_MESSAGE_QUEUE_BROADCASTER: '0'
    SGLANG_DISABLE_TP_MEMORY_INBALANCE_CHECK: '1'
    DYN_REQUEST_PLANE: nats
    # DeepEP per-rank dispatch buffer; must be >= ceil(cuda_graph_max_bs / dp_size).
    # Default 128 overflows with large DP + batch (e.g. 4096/24 ~= 171 > 128). Limit 1024.
    # This recipe's decode section sets cuda-graph-max-bs 512 and
    # data-parallel-size 24.
    SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '512'

  sglang_config:
    # Options for the prefill workers.
    prefill:
      # Model configuration
      served-model-name: GLM-5-FP8
      trust-remote-code: true
      quantization: fp8
      kv-cache-dtype: fp8_e4m3

      # Disaggregation mode
      disaggregation-mode: prefill
      disaggregation-transfer-backend: nixl

      # Size limits
      # NOTE(review): context-length 9600 is far above the 1024 + 1024 tokens
      # this recipe benchmarks; it looks carried over from an 8k1k recipe -
      # confirm it is intentional.
      max-running-requests: 256
      cuda-graph-max-bs: 256
      mem-fraction-static: 0.7
      context-length: 9600
      chunked-prefill-size: 32768
      max-prefill-tokens: 8192

      # Parallelism
      # tp 4 equals resources.gpus_per_node, consistent with one prefill
      # worker per node (prefill_nodes == prefill_workers == 12).
      tensor-parallel-size: 4
      data-parallel-size: 4
      expert-parallel-size: 1
      enable-dp-attention: true
      enable-dp-lm-head: true
      load-balance-method: total_tokens

      # Backend
      nsa-decode-backend: trtllm
      nsa-prefill-backend: trtllm
      moe-runner-backend: flashinfer_trtllm

      # Other flags
      enable-flashinfer-allreduce-fusion: true
      disable-radix-cache: true
      weight-loader-prefetch-checkpoints: true
      model-loader-extra-config: '{"enable_multithread_load": true}'

    # Options for the single wide-EP decode worker.
    decode:
      # Model configuration
      served-model-name: GLM-5-FP8
      trust-remote-code: true

      quantization: fp8
      kv-cache-dtype: fp8_e4m3

      # Disaggregation mode
      disaggregation-mode: decode
      disaggregation-transfer-backend: nixl

      # Memory and token limits
      mem-fraction-static: 0.8
      context-length: 9600

      # Backend
      # No moe-runner-backend is set for decode; the cutedsl choice is left
      # commented out.
      nsa-decode-backend: trtllm
      nsa-prefill-backend: trtllm
      # moe-runner-backend: "cutedsl"

      # Detokenizer
      skip-tokenizer-init: true
      stream-interval: 30

      # Other flags
      disable-radix-cache: true
      weight-loader-prefetch-checkpoints: true
      model-loader-extra-config: '{"enable_multithread_load": true}'
      # Parallelism: tp = ep = dp = 24, i.e. resources.decode_nodes (6) x
      # resources.gpus_per_node (4).
      tensor-parallel-size: 24
      expert-parallel-size: 24
      data-parallel-size: 24
      enable-dp-lm-head: true
      enable-dp-attention: true
      moe-dense-tp-size: 1
      # Expert-parallel dispatch via DeepEP in low_latency mode, with 32
      # redundant experts and a static dispatch algorithm.
      ep-num-redundant-experts: 32
      ep-dispatch-algorithm: static
      moe-a2a-backend: deepep
      deepep-mode: low_latency
      deepep-config: /configs/deepep_config.json
      # max-running-requests equals the benchmark concurrency (8192).
      max-running-requests: 8192
      cuda-graph-max-bs: 512
# Startup health check: up to 360 attempts, 10 seconds apart
# (360 x 10 s = 3600 s, i.e. at most one hour of waiting).
health_check:
  max_attempts: 360
  interval_seconds: 10

# Benchmark run: sa-bench at 1024 input / 1024 output tokens and a single
# concurrency point of 8192.
# NOTE(review): `inf` is a plain YAML string (the YAML float spelling is
# `.inf`), and concurrencies is a quoted string rather than an integer -
# presumably both are parsed by the benchmark tooling; confirm.
benchmark:
  type: sa-bench
  req_rate: inf
  isl: 1024
  osl: 1024
  concurrencies: '8192'
Loading