From 8880ed0d211902535021c1f63d425aedbef25983 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Sat, 23 May 2026 00:13:47 +0800 Subject: [PATCH] Update DSV4 GB300 8k1k MTP disagg configs - Add dep16 mid-curve configs (2p1d, 4p1d, 8p1d) and 12p1d-dep4-dep12 - Update 1p6d low-latency dynamo hash - Remove dep8 mid-curve configs (replaced by dep16) --- .github/configs/nvidia-master.yaml | 53 +++--- .../disagg-low-latency-1p6d-dep4-tp4-mtp.yaml | 2 +- ...disagg-mid-curve-12p1d-dep4-dep12-mtp.yaml | 157 ++++++++++++++++++ ...disagg-mid-curve-2p1d-dep4-dep16-mtp.yaml} | 12 +- ...disagg-mid-curve-4p1d-dep4-dep16-mtp.yaml} | 12 +- ...disagg-mid-curve-8p1d-dep4-dep16-mtp.yaml} | 28 ++-- perf-changelog.yaml | 8 + 7 files changed, 226 insertions(+), 46 deletions(-) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-12p1d-dep4-dep12-mtp.yaml rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml => disagg-mid-curve-2p1d-dep4-dep16-mtp.yaml} (95%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml => disagg-mid-curve-4p1d-dep4-dep16-mtp.yaml} (95%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/{disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml => disagg-mid-curve-8p1d-dep4-dep16-mtp.yaml} (90%) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0eba30478..d21e3db6c 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9116,7 +9116,7 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: tp: 4 ep: 1 dp-attn: false - # Mid curve 1p1d-dep4-dep8. 3 nodes. + # Mid curve 1p1d-dep4-dep16. 5 nodes. - spec-decoding: "mtp" conc-list: [256] prefill: @@ -9125,56 +9125,71 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 dp-attn: true - # Mid curve 1p1d-dep4-dep16. 5 nodes. + # Mid curve 2p1d-dep4-dep16. 6 nodes. - spec-decoding: "mtp" - conc-list: [256] + conc-list: [512] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep16-mtp.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep16-mtp.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - # Mid curve 2p1d-dep4-dep8. 4 nodes. + # Mid curve 4p1d-dep4-dep16. 8 nodes. - spec-decoding: "mtp" - conc-list: [512] + conc-list: [1024] prefill: - num-worker: 2 + num-worker: 4 tp: 4 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep16-mtp.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 dp-attn: true - # Mid curve 4p1d-dep4-dep8. 6 nodes. + # Mid curve 8p1d-dep4-dep16. 12 nodes. - spec-decoding: "mtp" conc-list: [1024] prefill: - num-worker: 4 + num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml" + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep4-dep16-mtp.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 + dp-attn: true + # Mid curve 12p1d-dep4-dep12. 15 nodes. + - spec-decoding: "mtp" + conc-list: [6144] + prefill: + num-worker: 12 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-12p1d-dep4-dep12-mtp.yaml" + decode: + num-worker: 1 + tp: 12 + ep: 12 dp-attn: true kimik2.5-int4-h100-vllm: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml index 3e8fca87b..2f9487068 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-low-latency-1p6d-dep4-tp4-mtp.yaml @@ -6,7 +6,7 @@ frontend: num_additional_frontends: 8 dynamo: - hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + hash: "9d3c913d300eb368cda28b3f98a23a5762621e0d" install: true model: diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-12p1d-dep4-dep12-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-12p1d-dep4-dep12-mtp.yaml new file mode 100644 index 000000000..c7d15d9c8 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-12p1d-dep4-dep12-mtp.yaml @@ -0,0 +1,157 @@ +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-12p1d-dep4-dep12-mtp" + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + +dynamo: + hash: "34d55a596fb8d3d44daefe425ec1e303131f4d2c" + install: true + +model: + path: "deepseek-v4-pro" + container: "lmsysorg/sglang:nightly-dev-cu13-20260510-2473659e" + precision: "mxfp4" + +sbatch_directives: + cpus-per-task: "144" + mem: "0" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 12 + prefill_workers: 12 + gpus_per_prefill: 4 + decode_nodes: 3 + decode_workers: 1 + gpus_per_decode: 12 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_RADIX_DISABLE_REUSE: "1" + SGLANG_JIT_DEEPGEMM_FAST_WARMUP: "1" + SGLANG_DEFAULT_THINKING: "1" + SGLANG_DSV4_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "0" # CAR_V2 is single-node only. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.9 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 + chunked-prefill-size: 32768 + stream-interval: 60 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + tool-call-parser: deepseekv4 # gates dsv4 chat-encoding spec. + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 12 + data-parallel-size: 12 + expert-parallel-size: 12 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + speculative-algo: "EAGLE" + speculative-num-steps: 3 + speculative-eagle-topk: 1 + speculative-num-draft-tokens: 4 + + mem-fraction-static: 0.9 + max-running-requests: 6144 + cuda-graph-max-bs: 1024 + swa-full-tokens-ratio: 0.15 + context-length: 16384 + stream-interval: 60 + +benchmark: + type: "sa-bench" + isl: 8192 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "6144" + req_rate: "inf" + use_chat_template: true + custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep16-mtp.yaml similarity index 95% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep16-mtp.yaml index 49b0d31c3..95e7a3dd8 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-2p1d-dep4-dep16-mtp.yaml @@ -1,4 +1,4 @@ -name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-2p1d-dep4-dep8-mtp" +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-2p1d-dep4-dep16-mtp" frontend: type: dynamo @@ -24,9 +24,9 @@ resources: prefill_nodes: 2 prefill_workers: 2 gpus_per_prefill: 4 - decode_nodes: 2 + decode_nodes: 4 decode_workers: 1 - gpus_per_decode: 8 + gpus_per_decode: 16 backend: type: sglang @@ -124,9 +124,9 @@ backend: disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 enable-dp-attention: true enable-dp-lm-head: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep16-mtp.yaml similarity index 95% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep16-mtp.yaml index d907c369e..a56fb575b 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-4p1d-dep4-dep16-mtp.yaml @@ -1,4 +1,4 @@ -name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-4p1d-dep4-dep8-mtp" +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-4p1d-dep4-dep16-mtp" frontend: type: dynamo @@ -24,9 +24,9 @@ resources: prefill_nodes: 4 prefill_workers: 4 gpus_per_prefill: 4 - decode_nodes: 2 + decode_nodes: 4 decode_workers: 1 - gpus_per_decode: 8 + gpus_per_decode: 16 backend: type: sglang @@ -124,9 +124,9 @@ backend: disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 enable-dp-attention: true enable-dp-lm-head: true diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep4-dep16-mtp.yaml similarity index 90% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep4-dep16-mtp.yaml index 842fbb556..156ac3a58 100644 --- a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-1p1d-dep4-dep8-mtp.yaml +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/8k1k/disagg-mid-curve-8p1d-dep4-dep16-mtp.yaml @@ -1,4 +1,4 @@ -name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-1p1d-dep4-dep8-mtp" +name: "dsv4-pro-gb300-disagg-8k1k-mid-curve-8p1d-dep4-dep16-mtp" frontend: type: dynamo @@ -21,12 +21,12 @@ sbatch_directives: resources: gpu_type: "gb300" gpus_per_node: 4 - prefill_nodes: 1 - prefill_workers: 1 + prefill_nodes: 8 + prefill_workers: 8 gpus_per_prefill: 4 - decode_nodes: 2 + decode_nodes: 4 decode_workers: 1 - gpus_per_decode: 8 + gpus_per_decode: 16 backend: type: sglang @@ -75,7 +75,7 @@ backend: SGLANG_OPT_USE_FAST_MASK_EP: "1" SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" - SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "2048" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "4096" SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" @@ -110,8 +110,8 @@ backend: deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' mem-fraction-static: 0.9 - max-running-requests: 256 - cuda-graph-max-bs: 256 + max-running-requests: 1024 + cuda-graph-max-bs: 1024 chunked-prefill-size: 32768 stream-interval: 60 @@ -124,9 +124,9 @@ backend: disaggregation-mode: "decode" disaggregation-transfer-backend: mooncake - tensor-parallel-size: 8 - data-parallel-size: 8 - expert-parallel-size: 8 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 enable-dp-attention: true enable-dp-lm-head: true @@ -139,9 +139,9 @@ backend: speculative-eagle-topk: 1 speculative-num-draft-tokens: 4 - mem-fraction-static: 0.94 + mem-fraction-static: 0.9 max-running-requests: 3072 - cuda-graph-max-bs: 256 + cuda-graph-max-bs: 1024 swa-full-tokens-ratio: 0.15 context-length: 16384 stream-interval: 60 @@ -151,7 +151,7 @@ benchmark: isl: 8192 osl: 1024 random_range_ratio: 0.8 - concurrencies: "256" + concurrencies: "3072" req_rate: "inf" use_chat_template: true custom_tokenizer: "sa_bench_tokenizers.sglang_deepseek_v4.SGLangDeepseekV4Tokenizer" diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 38df66d21..35e268e52 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3089,3 +3089,11 @@ description: - "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-mtp + description: + - "Add dep16 mid-curve configs (2p1d, 4p1d, 8p1d) and 12p1d-dep4-dep12" + - "Update 1p6d low-latency dynamo hash" + - "Remove dep8 mid-curve configs (replaced by dep16)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1529