diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2e1b43a97..11175f0f5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9034,7 +9034,7 @@ glm5-fp8-b200-dynamo-sglang: # MTP variant of dsv4-fp4-gb300-dynamo-sglang. dsv4-fp4-gb300-dynamo-sglang-mtp: - image: lmsysorg/sglang:nightly-dev-cu13-20260509-9ee83034 + image: lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: gb300-cw diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml index accb5e56f..8e4b47b6d 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p1d-tp4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" precision: "mxfp4" sbatch_directives: @@ -31,14 +31,12 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" @@ -49,14 +47,12 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 3e8fca87b..09619bedc 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" precision: "mxfp4" sbatch_directives: @@ -31,23 +31,16 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -60,14 +53,11 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" NCCL_MNNVL_ENABLE: "1" NCCL_CUMEM_ENABLE: "1" SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml index 6745aa37e..9357c0017 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" precision: "mxfp4" sbatch_directives: @@ -33,23 +33,16 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -62,22 +55,16 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml index 842fbb556..5a2c99669 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" precision: "mxfp4" sbatch_directives: @@ -33,23 +33,16 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -62,22 +55,16 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml index 49b0d31c3..7ada85405 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" precision: "mxfp4" sbatch_directives: @@ -33,23 +33,16 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -62,22 +55,16 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml index d907c369e..833b44f33 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml @@ -11,7 +11,7 @@ dynamo: model: path: "deepseek-v4-pro" - container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + container: "lmsysorg/sglang:nightly-dev-cu13-20260519-dbac4647" precision: "mxfp4" sbatch_directives: @@ -33,23 +33,16 @@ backend: prefill_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" @@ -62,22 +55,16 @@ backend: decode_environment: PYTHONUNBUFFERED: "1" - SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_RADIX_FORCE_MISS: "1" SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" SGLANG_DEFAULT_THINKING: "1" SGLANG_DSV4_REASONING_EFFORT: "max" SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" - SGLANG_OPT_USE_JIT_NORM: "1" - SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" - SGLANG_OPT_USE_TOPK_V2: "1" - SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" - SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" - SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" - SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" NCCL_MNNVL_ENABLE: "1" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c4910b13..df762b507 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2934,3 +2934,9 @@ description: - "Update SGLang ROCm image from v0.5.11/v0.5.10rc0 to v0.5.12-rocm720-mi35x-20260517" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1440 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp + description: + - "Update SGLang image to nightly-20260519 and remove obsolete env vars for MTP configs" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1478