diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4a683d372..152c3d53c 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -9210,11 +9210,15 @@ qwen3.5-fp8-h100-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
+    - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
+    - { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
 
 qwen3.5-fp8-h100-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh
index 4c70657aa..fa7697b25 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh
@@ -1,17 +1,15 @@
 #!/usr/bin/env bash
 
 # Qwen-3.5-397B-A17B FP8 on H100 via sglang.
-# Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
-# H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
-# more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
-# chunked-prefill-size from 16384 → 8192 to leave more headroom.
-# Sweep tops out at conc=32 instead of 64 for the same reason.
+# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
+# and TP8/EP8 with DP attention at conc 128-256.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
 check_env_vars \
     MODEL \
     TP \
+    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -35,7 +33,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
 fi
 
-echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
+PARALLEL_ARGS=(--tp "$TP")
+if [ "${EP_SIZE}" -gt 1 ]; then
+    PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE")
+fi
+
+SCHEDULER_RECV_INTERVAL=
+if [ "${DP_ATTENTION}" != "true" ]; then
+    case "$CONC" in
+        1|2|4)
+            SCHEDULER_RECV_INTERVAL=2
+            ;;
+        8)
+            SCHEDULER_RECV_INTERVAL=60
+            ;;
+        16)
+            SCHEDULER_RECV_INTERVAL=30
+            ;;
+        32)
+            SCHEDULER_RECV_INTERVAL=1200
+            ;;
+        64)
+            SCHEDULER_RECV_INTERVAL=600
+            ;;
+        *)
+            echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
+            exit 1
+            ;;
+    esac
+fi
+
+SCHEDULER_ARGS=()
+if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then
+    SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL")
+fi
+if [ "${DP_ATTENTION}" = "true" ]; then
+    PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
+fi
+
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
+echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}"
+echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"
 
 start_gpu_monitor
 
@@ -44,15 +82,14 @@ python3 -m sglang.launch_server \
     --model "$MODEL" \
     --host 0.0.0.0 \
     --port "$PORT" \
-    --tp "$TP" \
-    --expert-parallel-size "$EP_SIZE" \
+    "${PARALLEL_ARGS[@]}" \
     --reasoning-parser qwen3 \
     --tool-call-parser qwen3_coder \
     --enable-flashinfer-allreduce-fusion \
-    --max-running-requests 64 \
-    --chunked-prefill-size 8192 \
+    --max-running-requests 256 \
+    --chunked-prefill-size 16384 \
     --decode-log-interval 1 \
-    --mem-fraction-static 0.75 \
+    --mem-fraction-static 0.8 \
     --cuda-graph-max-bs "$CONC" \
     --context-length "$MAX_SEQ_LEN" \
     --kv-cache-dtype fp8_e4m3 \
@@ -62,7 +99,9 @@ python3 -m sglang.launch_server \
     --tokenizer-worker-num 6 \
     --mamba-ssm-dtype bfloat16 \
     --disable-radix-cache \
+    --enable-symm-mem \
     --trust-remote-code \
+    "${SCHEDULER_ARGS[@]}" \
     > "$SERVER_LOG" 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9c4894c47..f2fe7ef75 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3050,3 +3050,12 @@
   description:
     - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
+
+- config-keys:
+    - qwen3.5-fp8-h100-sglang
+  description:
+    - "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps"
+    - "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256"
+    - "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP conc 1-4/8/16/32/64"
+    - "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544