From 8cebe42a7e9c0c17b93a014aea404035cad34d4d Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 20 May 2026 22:19:51 +0800 Subject: [PATCH 1/4] Add DSV4 GB300 1k1k STP disagg configs Port 9 non-MTP disagg configs from NVIDIA/srt-slurm#161: - 1p1d dep8/dep16, 1p4d, 1p6d, 2p1d dep12/dep16/dep48 - low-latency dep4/tp4 with zip overrides --- .../1k1k/stp/disagg-1p1d-dep16-conc8192.yaml | 211 +++++++++++++++++ .../1k1k/stp/disagg-1p1d-dep8-conc8192.yaml | 210 +++++++++++++++++ .../1k1k/stp/disagg-1p4d-dep4-tp4.yaml | 132 +++++++++++ .../1k1k/stp/disagg-1p6d-dep4-tp4.yaml | 131 +++++++++++ .../1k1k/stp/disagg-2p1d-dep12-conc18432.yaml | 154 +++++++++++++ .../1k1k/stp/disagg-2p1d-dep16-conc16384.yaml | 211 +++++++++++++++++ .../1k1k/stp/disagg-2p1d-dep48-conc16384.yaml | 213 ++++++++++++++++++ .../1k1k/stp/disagg-low-latency-dep4.yaml | 140 ++++++++++++ .../1k1k/stp/disagg-low-latency-tp4.yaml | 126 +++++++++++ 9 files changed, 1528 insertions(+) create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep16-conc8192.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep8-conc8192.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p4d-dep4-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p6d-dep4-tp4.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep12-conc18432.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep16-conc16384.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep48-conc16384.yaml create mode 100644 benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-dep4.yaml create mode 100644 
benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-tp4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep16-conc8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep16-conc8192.yaml new file mode 100644 index 000000000..6343c9ef0 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep16-conc8192.yaml @@ -0,0 +1,211 @@ +# 1k1k 1P1D / DEP16 / total 5 nodes +# conc sweep: 2048, 4096, 8192 (c/4, c/2, c where c = X * 8192) +# stack: baizhou-revert + sglang-cw-dev (dirty) + dynamo dd37acac04 (locked to PR #130 / job 1350) + +name: "disagg-1p1d-dep16-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + nodelist: 'slurm-gb300-138-[235,239,243,245,247]' + nodes: '5' + +dynamo: + hash: "dd37acac04fa8b00a95de4f089d717080c98e2f4" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +model: + path: dsv4-pro + container: "dsv4-grace-blackwell-baizhou-revert" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +extra_mount: +- /mnt/home/yangminl/sglang-cw-dev:/sgl-workspace/sglang +- /mnt/home/yangminl/sglang-cw-dev:/workspace/sglang + +backend: + type: sglang + + prefill_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + 
SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + decode_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '1152' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + 
DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + moe-dense-tp-size: 1 + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.12 + context-length: 16384 + max-running-requests: 18432 + cuda-graph-max-bs: 1152 + +benchmark: + type: custom + command: | + set -e + ulimit -n 1048576 + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd 
"$REPO/utils/bench_serving" + + echo "=== block conc=2048 prompts=6144 warmups=2048 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 6144 --max-concurrency 2048 --request-rate inf \ + --num-warmups 2048 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc2048.json + + echo "=== block conc=4096 prompts=12288 warmups=4096 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 12288 --max-concurrency 4096 --request-rate inf \ + --num-warmups 4096 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc4096.json + + echo "=== block conc=8192 prompts=24576 warmups=8192 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 24576 --max-concurrency 8192 --request-rate inf \ + --num-warmups 8192 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc8192.json diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep8-conc8192.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep8-conc8192.yaml new file mode 100644 index 000000000..42c9627f6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep8-conc8192.yaml @@ -0,0 +1,210 @@ +# 1k1k 1P1D / DEP8 / total 3 nodes +# conc sweep: 2048, 4096, 8192 (c/4, c/2, c where c = X * 8192) +# stack: baizhou-revert + sglang-cw-dev (dirty) + dynamo dd37acac04 (locked to PR #130 / job 1350) + +name: "disagg-1p1d-dep8-conc8192" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + +dynamo: + hash: "dd37acac04fa8b00a95de4f089d717080c98e2f4" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +model: + path: dsv4-pro + container: "dsv4-grace-blackwell-baizhou-revert" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + gpus_per_prefill: 4 + decode_nodes: 2 + decode_workers: 1 + gpus_per_decode: 8 + +extra_mount: +- /mnt/home/yangminl/sglang-cw-dev:/sgl-workspace/sglang +- /mnt/home/yangminl/sglang-cw-dev:/workspace/sglang + +backend: + type: sglang + + prefill_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + 
SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + decode_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '1152' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + sglang_config: + prefill: + served-model-name: 
deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + moe-dense-tp-size: 1 + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 8 + data-parallel-size: 8 + expert-parallel-size: 8 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.12 + context-length: 16384 + max-running-requests: 9216 + cuda-graph-max-bs: 1152 + +benchmark: + type: custom + command: | + set -e + ulimit -n 1048576 + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + + echo "=== block conc=2048 prompts=6144 warmups=2048 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name 
random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 6144 --max-concurrency 2048 --request-rate inf \ + --num-warmups 2048 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc2048.json + + echo "=== block conc=4096 prompts=12288 warmups=4096 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 12288 --max-concurrency 4096 --request-rate inf \ + --num-warmups 4096 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc4096.json + + echo "=== block conc=8192 prompts=24576 warmups=8192 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 24576 --max-concurrency 8192 --request-rate inf \ + --num-warmups 8192 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc8192.json + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p4d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p4d-dep4-tp4.yaml new file mode 100644 index 000000000..c5950fec6 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p4d-dep4-tp4.yaml @@ -0,0 +1,132 @@ +name: 
"dsv4-pro-gb300-disagg-1k1k-1p4d-dep4-tp4" + +# DeepSeek-V4-Pro 1k/1k disagg on GB300 — 1P + 4D, DEP4 prefill / TP4 decode. +# +# Single-config recipe derived from disagg-low-latency-dep4.yaml zip override +# variant [2] (decode_nodes=4). Concurrencies pruned to the Pareto-frontier +# points from the original sweep. + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 4 + decode_workers: 4 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 
"True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "8x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p6d-dep4-tp4.yaml new file mode 100644 index 000000000..ecebfe3c3 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p6d-dep4-tp4.yaml @@ -0,0 +1,131 @@ +name: 
"dsv4-pro-gb300-disagg-1k1k-1p6d-dep4-tp4" +# DeepSeek-V4-Pro 1k/1k disagg on GB300 — 1P + 6D, DEP4 prefill / TP4 decode. +# +# Single-config recipe derived from disagg-low-latency-dep4.yaml zip override +# variant [3] (decode_nodes=6). Concurrencies pruned to the Pareto-frontier +# points from the original sweep. + +frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + +model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + +resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 6 + decode_workers: 6 + +backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 
"True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 128 + cuda-graph-max-bs: 128 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + +benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "32x64" + req_rate: "inf" + use_chat_template: false diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep12-conc18432.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep12-conc18432.yaml new file mode 100644 index 000000000..295d64726 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep12-conc18432.yaml @@ -0,0 +1,154 @@ +# 1k1k 
2P1D / DEP12 / total 5 nodes +# rate-match point between 2:4 and 3:4; c8bd stack, single conc=18432, mrr=13824 +# stack: baizhou-revert + clean deepseek_v4_dev at c8bd6cf74 + dynamo dd37acac04 + +name: "disagg-2p1d-dep12-conc18432" +slurm: + time_limit: 03:00:00 +sbatch_directives: + cpus-per-task: '144' + mem: '0' + nodelist: 'slurm-gb300-133-[183,185,187,189,191]' + nodes: '5' +dynamo: + hash: dd37acac04fa8b00a95de4f089d717080c98e2f4 + install: true +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: '1' + args: + router-mode: kv + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true +model: + path: dsv4-pro + container: dsv4-grace-blackwell-baizhou-revert + precision: fp4 +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 3 + decode_workers: 1 + gpus_per_decode: 12 +extra_mount: +- /mnt/home/yangminl/sglang-cw-dev-clean-deepseek-v4-dev-0509:/sgl-workspace/sglang +- /mnt/home/yangminl/sglang-cw-dev-clean-deepseek-v4-dev-0509:/workspace/sglang +backend: + type: sglang + prefill_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '8192' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + 
NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: info,dynamo_runtime::pipeline::network::ingress::push_handler=warn + decode_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '1152' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: '1' + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: '8' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: info,dynamo_runtime::pipeline::network::ingress::push_handler=warn + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + 
moe-dense-tp-size: 1 + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 12 + data-parallel-size: 12 + expert-parallel-size: 12 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.12 + context-length: 16384 + max-running-requests: 13824 + cuda-graph-max-bs: 1152 +benchmark: + type: custom + command: "set -e\nulimit -n 1048576\nREPO=/configs/upstream-sa-bench/InferenceX\n[ -d \"$REPO\" ] || git clone https://github.com/fzyzcjy/InferenceX.git \"$REPO\"\ncd \"$REPO/utils/bench_serving\"\n\necho \"=== 2P1D DEP12 rate-match point: conc=18432 prompts=55296 warmups=18432 mrr=13824 ===\"\npython3 benchmark_serving.py \\\n --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \\\n --host 127.0.0.1 --port 8000 --endpoint /v1/completions \\\n --dataset-name random \\\n --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \\\n --random-num-workers 96 \\\n --num-prompts 55296 --max-concurrency 18432 --request-rate inf \\\n --num-warmups 
18432 \\\n --ignore-eos --trust-remote-code \\\n --percentile-metrics ttft,tpot,itl,e2el \\\n --save-result --result-dir /logs --result-filename results_conc18432_mrr13824.json\n" diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep16-conc16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep16-conc16384.yaml new file mode 100644 index 000000000..a59721bb9 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep16-conc16384.yaml @@ -0,0 +1,211 @@ +# 1k1k 2P1D / DEP16 / total 6 nodes +# conc sweep: 4096, 8192, 16384 (c/4, c/2, c where c = X * 8192) +# stack: baizhou-revert + sglang-cw-dev (dirty) + dynamo dd37acac04 (locked to PR #130 / job 1350) + +name: "disagg-2p1d-dep16-conc16384" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + nodelist: 'slurm-gb300-139-[001,003,005,007,009,011]' + nodes: '6' + +dynamo: + hash: "dd37acac04fa8b00a95de4f089d717080c98e2f4" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +model: + path: dsv4-pro + container: "dsv4-grace-blackwell-baizhou-revert" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 4 + decode_workers: 1 + gpus_per_decode: 16 + +extra_mount: +- /mnt/home/yangminl/sglang-cw-dev:/sgl-workspace/sglang +- /mnt/home/yangminl/sglang-cw-dev:/workspace/sglang + +backend: + type: sglang + + prefill_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + 
SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + decode_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '1152' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
'100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + moe-dense-tp-size: 1 + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 16 + data-parallel-size: 16 + expert-parallel-size: 16 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.12 + context-length: 16384 + max-running-requests: 18432 + cuda-graph-max-bs: 1152 + +benchmark: + type: custom + command: | + set -e + ulimit -n 1048576 + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone 
https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + + echo "=== block conc=4096 prompts=12288 warmups=4096 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 12288 --max-concurrency 4096 --request-rate inf \ + --num-warmups 4096 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc4096.json + + echo "=== block conc=8192 prompts=24576 warmups=8192 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 24576 --max-concurrency 8192 --request-rate inf \ + --num-warmups 8192 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc8192.json + + echo "=== block conc=16384 prompts=49152 warmups=16384 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 49152 --max-concurrency 16384 --request-rate inf \ + --num-warmups 16384 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc16384.json diff --git 
a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep48-conc16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep48-conc16384.yaml new file mode 100644 index 000000000..4976b61db --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep48-conc16384.yaml @@ -0,0 +1,213 @@ +# 1k1k 2P1D / DEP48 / total 14 nodes +# conc sweep: 4096, 8192, 16384 (c/4, c/2, c where c = X * 8192) +# stack: baizhou-revert + sglang-cw-dev (dirty) + dynamo dd37acac04 (locked to PR #130 / job 1350) + +name: "disagg-2p1d-dep48-conc16384" + +slurm: + time_limit: "03:00:00" + +sbatch_directives: + cpus-per-task: '144' + mem: '0' + nodelist: 'slurm-gb300-133-181,slurm-gb300-133-183,slurm-gb300-133-185,slurm-gb300-133-187,slurm-gb300-133-189,slurm-gb300-133-191,slurm-gb300-133-193,slurm-gb300-133-195,slurm-gb300-133-197,slurm-gb300-133-199,slurm-gb300-133-201,slurm-gb300-133-203,slurm-gb300-133-205,slurm-gb300-133-207' + nodes: '14' + +dynamo: + hash: "dd37acac04fa8b00a95de4f089d717080c98e2f4" + install: true + +frontend: + type: dynamo + enable_multiple_frontends: true + num_additional_frontends: 8 + env: + DYN_ROUTER_LOAD_BLOCK_SIZE: "1" + args: + router-mode: "kv" + router-kv-overlap-score-weight: 0 + router-queue-threshold: 64 + router-temperature: 0.5 + no-kv-events: true + +model: + path: dsv4-pro + container: "dsv4-grace-blackwell-baizhou-revert" + precision: fp4 + +resources: + gpu_type: gb300 + gpus_per_node: 4 + prefill_nodes: 2 + prefill_workers: 2 + gpus_per_prefill: 4 + decode_nodes: 12 + decode_workers: 1 + gpus_per_decode: 48 + +extra_mount: +- /mnt/home/yangminl/sglang-cw-dev:/sgl-workspace/sglang +- /mnt/home/yangminl/sglang-cw-dev:/workspace/sglang + +backend: + type: sglang + + prefill_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + 
SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "8192" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + MC_FORCE_MNNVL: '1' + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + decode_environment: + SGLANG_DG_CACHE_DIR: /configs/deepgemm_cache + PYTHONUNBUFFERED: '1' + SGLANG_JIT_DEEPGEMM_PRECOMPILE: '0' + SGLANG_ENABLE_THINKING: '1' + SGLANG_REASONING_EFFORT: max + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: '1' + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: '1' + SGLANG_OPT_USE_JIT_NORM: '1' + SGLANG_OPT_USE_JIT_INDEXER_METADATA: '1' + SGLANG_OPT_USE_TOPK_V2: '1' + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: '1' + SGLANG_OPT_FIX_HASH_MEGA_MOE: '1' + SGLANG_OPT_USE_FAST_MASK_EP: '1' + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: '1' + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: '1152' + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: '1' + SGLANG_OPT_USE_ONLINE_COMPRESS: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: '0' + NCCL_MNNVL_ENABLE: '1' + NCCL_CUMEM_ENABLE: '1' + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: 'True' + SGLANG_CLIP_MAX_NEW_TOKENS_ESTIMATION: "8" + MC_FORCE_MNNVL: '1' + 
SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: '100000' + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: '100000' + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: '1' + DYN_SKIP_SGLANG_LOG_FORMATTING: '1' + SGLANG_LOG_FORWARD_ITERS: '1' + SGLANG_LOG_MS: '1' + SGLANG_REQUEST_STATE_WAIT_TIMEOUT: '60' + DYN_LOG: 'info,dynamo_runtime::pipeline::network::ingress::push_handler=warn' + + sglang_config: + prefill: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + enable-dp-attention: true + moe-dense-tp-size: 1 + enable-dp-lm-head: true + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: prefill + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 32768 + + decode: + served-model-name: deepseek-ai/DeepSeek-V4-Pro + trust-remote-code: true + watchdog-timeout: 86400 + skip-tokenizer-init: true + stream-interval: 60 + tensor-parallel-size: 48 + data-parallel-size: 48 + expert-parallel-size: 48 + enable-dp-attention: true + enable-dp-lm-head: true + moe-dense-tp-size: 1 + moe-a2a-backend: deepep + deepep-config: '{"normal_dispatch":{"num_sms":88,"num_max_nvl_chunked_send_tokens":28,"num_max_nvl_chunked_recv_tokens":512},"normal_combine": {"num_sms":88,"num_max_nvl_chunked_send_tokens":16,"num_max_nvl_chunked_recv_tokens":512}}' + disaggregation-mode: decode + disaggregation-transfer-backend: mooncake + mem-fraction-static: 0.94 + swa-full-tokens-ratio: 0.12 + context-length: 16384 + max-running-requests: 55296 + cuda-graph-max-bs: 1152 + +benchmark: + type: custom + command: | + set -e + 
ulimit -n 1048576 + REPO=/configs/upstream-sa-bench/InferenceX + [ -d "$REPO" ] || git clone https://github.com/fzyzcjy/InferenceX.git "$REPO" + cd "$REPO/utils/bench_serving" + + echo "=== block conc=4096 prompts=12288 warmups=4096 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 12288 --max-concurrency 4096 --request-rate inf \ + --num-warmups 4096 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc4096.json + + echo "=== block conc=8192 prompts=24576 warmups=8192 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 24576 --max-concurrency 8192 --request-rate inf \ + --num-warmups 8192 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc8192.json + + echo "=== block conc=16384 prompts=49152 warmups=16384 ===" + python3 benchmark_serving.py \ + --backend vllm --model deepseek-ai/DeepSeek-V4-Pro --tokenizer /model \ + --host 127.0.0.1 --port 8000 --endpoint /v1/completions \ + --dataset-name random \ + --random-input-len 1024 --random-output-len 1024 --random-range-ratio 0.8 \ + --random-num-workers 96 \ + --num-prompts 49152 --max-concurrency 16384 --request-rate inf \ + --num-warmups 16384 \ + --ignore-eos --trust-remote-code \ + --percentile-metrics ttft,tpot,itl,e2el \ + --save-result --result-dir /logs --result-filename results_conc16384.json 
+ + diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-dep4.yaml new file mode 100644 index 000000000..bd7cc7010 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-dep4.yaml @@ -0,0 +1,140 @@ +base: + name: "dsv4-pro-gb300-disagg-1k1k-stp-low-latency-dep4" + + frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + + model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + + resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + + backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + + SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN: "1" + SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2: "1" + SGLANG_OPT_USE_FAST_MASK_EP: "1" + SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE: "1" + SGLANG_OPT_FIX_HASH_MEGA_MOE: "1" + SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK: "9216" + SGLANG_OPT_FIX_MEGA_MOE_MEMORY: "1" + SGLANG_OPT_FIX_NEXTN_MEGA_MOE: "1" + SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK: "0" + + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + 
SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 4 + expert-parallel-size: 4 + + enable-dp-attention: true + enable-dp-lm-head: true + + moe-a2a-backend: "deepep" + deepep-config: '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" + use_chat_template: false + + +zip_override_1k1k_lowlat: + resources: + decode_nodes: [1, 2, 4, 6] + decode_workers: [1, 2, 4, 6] + backend: + sglang_config: + decode: + max-running-requests: [128, 128, 128, 128] + cuda-graph-max-bs: [128, 128, 128, 
128] + benchmark: + concurrencies: ["1x8x32x64x128", "1x8x32x64x128x256", "1x8x32x64x128x256x512", "1x8x32x64x128x256x512x1024"] diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-tp4.yaml new file mode 100644 index 000000000..4edfc27b2 --- /dev/null +++ b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-tp4.yaml @@ -0,0 +1,126 @@ +base: + name: "dsv4-pro-gb300-disagg-1k1k-stp-low-latency-tp4" + + frontend: + type: sglang + enable_multiple_frontends: false + args: + policy: "cache_aware" + + model: + path: "dsv4-pro" + container: "dsv4-grace-blackwell" + precision: "mxfp4" + + resources: + gpu_type: "gb300" + gpus_per_node: 4 + prefill_nodes: 1 + prefill_workers: 1 + decode_nodes: 1 + decode_workers: 1 + + backend: + type: sglang + + prefill_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: "100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + + decode_environment: + PYTHONUNBUFFERED: "1" + SGLANG_JIT_DEEPGEMM_PRECOMPILE: "0" + SGLANG_ENABLE_THINKING: "1" + SGLANG_REASONING_EFFORT: "max" + SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT: "1" + SGLANG_OPT_USE_JIT_NORM: "1" + SGLANG_OPT_USE_JIT_INDEXER_METADATA: "1" + SGLANG_OPT_USE_TOPK_V2: "1" + NCCL_MNNVL_ENABLE: "1" + NCCL_CUMEM_ENABLE: "1" + SGLANG_MOONCAKE_CUSTOM_MEM_POOL: "True" + MC_FORCE_MNNVL: "1" + SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT: "100000" + SGLANG_DISAGGREGATION_WAITING_TIMEOUT: 
"100000" + SGLANG_OPT_SWA_RELEASE_LEAF_LOCK_AFTER_WINDOW: "1" + # SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2 intentionally NOT set: CAR_V2 + # is single-node only and corrupts results in 2-node decode setups. + + sglang_config: + prefill: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "prefill" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.90 + max-running-requests: 512 + cuda-graph-max-bs: 512 + chunked-prefill-size: 4096 + + decode: + served-model-name: "deepseek-ai/DeepSeek-V4-Pro" + model-path: "/model/" + trust-remote-code: true + disable-radix-cache: true + + disaggregation-mode: "decode" + disaggregation-transfer-backend: mooncake + + tensor-parallel-size: 4 + data-parallel-size: 1 + expert-parallel-size: 1 + + moe-runner-backend: "flashinfer_mxfp4" + disable-flashinfer-autotune: true + + mem-fraction-static: 0.9 + max-running-requests: 512 + cuda-graph-max-bs: 512 + swa-full-tokens-ratio: 0.1 + context-length: 4096 + + benchmark: + type: "sa-bench" + isl: 1024 + osl: 1024 + random_range_ratio: 0.8 + concurrencies: "1x2x4x8x16x32x64x128x256x512x1024" + req_rate: "inf" + use_chat_template: false + + +zip_override_1k1k_lowlat: + resources: + decode_nodes: [1, 2, 4, 6] + decode_workers: [1, 2, 4, 6] + backend: + sglang_config: + decode: + max-running-requests: [8, 8, 8, 8] + cuda-graph-max-bs: [8, 8, 8, 8] + benchmark: + concurrencies: "1x2" From 3a31fa3dc6308c1de7a5867d68e99ee3f629ca70 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 20 May 2026 22:28:35 +0800 Subject: [PATCH 2/4] Move 1k1k configs out of stp/ subdirectory --- .../deepseek-v4/1k1k/{stp => }/disagg-1p1d-dep16-conc8192.yaml | 0 .../deepseek-v4/1k1k/{stp => }/disagg-1p1d-dep8-conc8192.yaml | 0 
.../sglang/deepseek-v4/1k1k/{stp => }/disagg-1p4d-dep4-tp4.yaml | 0 .../sglang/deepseek-v4/1k1k/{stp => }/disagg-1p6d-dep4-tp4.yaml | 0 .../deepseek-v4/1k1k/{stp => }/disagg-2p1d-dep12-conc18432.yaml | 0 .../deepseek-v4/1k1k/{stp => }/disagg-2p1d-dep16-conc16384.yaml | 0 .../deepseek-v4/1k1k/{stp => }/disagg-2p1d-dep48-conc16384.yaml | 0 .../deepseek-v4/1k1k/{stp => }/disagg-low-latency-dep4.yaml | 0 .../sglang/deepseek-v4/1k1k/{stp => }/disagg-low-latency-tp4.yaml | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-1p1d-dep16-conc8192.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-1p1d-dep8-conc8192.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-1p4d-dep4-tp4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-1p6d-dep4-tp4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-2p1d-dep12-conc18432.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-2p1d-dep16-conc16384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-2p1d-dep48-conc16384.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-low-latency-dep4.yaml (100%) rename benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/{stp => }/disagg-low-latency-tp4.yaml (100%) diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep16-conc8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep16-conc8192.yaml rename to 
benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep8-conc8192.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p1d-dep8-conc8192.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p4d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p4d-dep4-tp4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p4d-dep4-tp4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p4d-dep4-tp4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p6d-dep4-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p6d-dep4-tp4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-1p6d-dep4-tp4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-1p6d-dep4-tp4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep12-conc18432.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep12-conc18432.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep12-conc18432.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep12-conc18432.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep16-conc16384.yaml 
b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc16384.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep16-conc16384.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc16384.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep48-conc16384.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep48-conc16384.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-2p1d-dep48-conc16384.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep48-conc16384.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-dep4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-dep4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-dep4.yaml diff --git a/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-tp4.yaml b/benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4.yaml similarity index 100% rename from benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/stp/disagg-low-latency-tp4.yaml rename to benchmarks/multi_node/srt-slurm-recipes/sglang/deepseek-v4/1k1k/disagg-low-latency-tp4.yaml From 4535bbb3b45f2514f57022f3d56b03d423127c07 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Wed, 20 May 2026 22:39:57 +0800 Subject: [PATCH 3/4] Append perf-changelog entries for PR #1529 and #1530 --- perf-changelog.yaml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 
38df66d21..bfa010481 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3089,3 +3089,11 @@ description: - "Update SGLang image from v0.5.10.post1-cu130 / v0.5.11-cu130 (30d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1451 + +- config-keys: + - dsv4-fp4-gb300-dynamo-sglang-1k1k-stp + description: + - "Add 7 DSV4 GB300 1k1k STP disagg configs from NVIDIA/srt-slurm#161" + - "Configs: 1p1d dep8/dep16, 1p4d, 1p6d, 2p1d dep12/dep16/dep48" + - "Register in nvidia-master.yaml for CI sweep" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1530 From 4f923af63b7e59b1d48e15cd87c6344583ac71c0 Mon Sep 17 00:00:00 2001 From: yhyang201 Date: Thu, 21 May 2026 00:42:48 +0800 Subject: [PATCH 4/4] Register dsv4-fp4-gb300-dynamo-sglang-1k1k-stp in nvidia-master.yaml --- .github/configs/nvidia-master.yaml | 114 +++++++++++++++++++++++++++++ 1 file changed, 114 insertions(+) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0eba30478..6e05beb5a 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -9177,6 +9177,120 @@ dsv4-fp4-gb300-dynamo-sglang-mtp: ep: 8 dp-attn: true +# DSV4 1k1k STP (non-MTP) disagg configs from NVIDIA/srt-slurm#161. +dsv4-fp4-gb300-dynamo-sglang-1k1k-stp: + image: dsv4-grace-blackwell-baizhou-revert + model: deepseek-ai/DeepSeek-V4-Pro + model-prefix: dsv4 + runner: gb300-cw + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + scenarios: + fixed-seq-len: + - isl: 1024 + osl: 1024 + search-space: + # 1p1d-dep8. 3 nodes. Custom conc sweep 2048/4096/8192. + - conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep8-conc8192.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + # 1p1d-dep16. 5 nodes. Custom conc sweep 2048/4096/8192. 
+ - conc-list: [8192] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p1d-dep16-conc8192.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 1p4d-dep4-tp4. 5 nodes. sa-bench conc 8x64. + - conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p4d-dep4-tp4.yaml" + decode: + num-worker: 4 + tp: 4 + ep: 1 + dp-attn: false + # 1p6d-dep4-tp4. 7 nodes. sa-bench conc 32x64. + - conc-list: [64] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-1p6d-dep4-tp4.yaml" + decode: + num-worker: 6 + tp: 4 + ep: 1 + dp-attn: false + # 2p1d-dep12. 5 nodes. Custom conc 18432. + - conc-list: [18432] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep12-conc18432.yaml" + decode: + num-worker: 1 + tp: 12 + ep: 12 + dp-attn: true + # 2p1d-dep16. 6 nodes. Custom conc sweep 4096/8192/16384. + - conc-list: [16384] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep16-conc16384.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + # 2p1d-dep48. 14 nodes. Custom conc sweep 4096/8192/16384. + - conc-list: [16384] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/sglang/deepseek-v4/1k1k/disagg-2p1d-dep48-conc16384.yaml" + decode: + num-worker: 1 + tp: 48 + ep: 48 + dp-attn: true + kimik2.5-int4-h100-vllm: image: vllm/vllm-openai:v0.20.2 model: moonshotai/Kimi-K2.5