Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9210,11 +9210,15 @@ qwen3.5-fp8-h100-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 1, conc-start: 1, conc-end: 8 }
- { tp: 8, ep: 8, conc-start: 16, conc-end: 64 }
- { tp: 8, ep: 8, dp-attn: true, conc-start: 128, conc-end: 256 }

qwen3.5-fp8-h100-sglang-mtp:
image: lmsysorg/sglang:v0.5.12-cu130
Expand Down
61 changes: 50 additions & 11 deletions benchmarks/single_node/qwen3.5_fp8_h100.sh
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
#!/usr/bin/env bash

# Qwen-3.5-397B-A17B FP8 on H100 via sglang.
# Mirrors qwen3.5_fp8_h200.sh but with tighter memory accommodations:
# H100 has 80GB HBM3 vs H200's 141GB HBM3e, so weights + KV cache fit
# more snugly. Mem-fraction-static lowered from 0.8 → 0.75 and
# chunked-prefill-size from 16384 → 8192 to leave more headroom.
# Sweep tops out at conc=32 instead of 64 for the same reason.
# Uses TP8/EP1 at conc 1-8, TP8/EP8 at conc 16-64,
# and TP8/EP8 with DP attention at conc 128-256.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
MODEL \
TP \
DP_ATTENTION \
CONC \
ISL \
OSL \
Expand All @@ -35,7 +33,47 @@ if [ "${EVAL_ONLY}" = "true" ]; then
MAX_SEQ_LEN="$EVAL_MAX_MODEL_LEN"
fi

echo "CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
# ---------------------------------------------------------------------------
# Assemble the per-run sglang.launch_server flags.
#
# Reads:  TP, EP_SIZE, DP_ATTENTION, CONC (plus ISL, OSL, MAX_SEQ_LEN for the
#         diagnostic echo only).
# Writes: PARALLEL_ARGS            - array, tensor/expert/data parallel flags
#         SCHEDULER_RECV_INTERVAL  - string, empty when no interval applies
#         SCHEDULER_ARGS           - array, empty or --scheduler-recv-interval N
# Exits 1 on a malformed EP_SIZE or on a CONC that has no tuned interval.
# ---------------------------------------------------------------------------

# An unset or empty EP_SIZE means "no expert parallelism". Anything that is
# not a plain non-negative integer is rejected up front: otherwise the
# numeric test below would only print "integer expression expected" and the
# run would silently continue without --expert-parallel-size.
ep_size="${EP_SIZE:-1}"
case "$ep_size" in
  *[!0-9]*)
    echo "Invalid EP_SIZE=$EP_SIZE for qwen3.5 FP8 H100 SGLang recipe" >&2
    exit 1
    ;;
esac

# --tp is always passed; the expert-parallel flag only when EP_SIZE > 1.
PARALLEL_ARGS=(--tp "$TP")
if [ "$ep_size" -gt 1 ]; then
  PARALLEL_ARGS+=(--expert-parallel-size "$EP_SIZE")
fi
# DP attention sets the data-parallel size to the TP value.
if [ "${DP_ATTENTION}" = "true" ]; then
  PARALLEL_ARGS+=(--dp-size "$TP" --enable-dp-attention)
fi

# The scheduler receive interval is chosen per concurrency, and only when DP
# attention is off. With DP attention on it stays empty and the flag is not
# passed at all.
SCHEDULER_RECV_INTERVAL=
if [ "${DP_ATTENTION}" != "true" ]; then
  case "$CONC" in
    1|2|4)
      SCHEDULER_RECV_INTERVAL=2
      ;;
    8)
      SCHEDULER_RECV_INTERVAL=60
      ;;
    16)
      SCHEDULER_RECV_INTERVAL=30
      ;;
    32)
      SCHEDULER_RECV_INTERVAL=1200
      ;;
    64)
      SCHEDULER_RECV_INTERVAL=600
      ;;
    *)
      # No tuned value for this concurrency: refuse to run rather than
      # benchmark with an arbitrary interval.
      echo "Unsupported CONC=$CONC for qwen3.5 FP8 H100 SGLang recipe" >&2
      exit 1
      ;;
  esac
fi

# Kept as an array so that it expands to zero words when no interval is set.
SCHEDULER_ARGS=()
if [ -n "$SCHEDULER_RECV_INTERVAL" ]; then
  SCHEDULER_ARGS=(--scheduler-recv-interval "$SCHEDULER_RECV_INTERVAL")
fi

# Record the resolved configuration in the job output.
echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL, MAX_SEQ_LEN: $MAX_SEQ_LEN"
echo "SCHEDULER_RECV_INTERVAL: ${SCHEDULER_RECV_INTERVAL:-none}"
echo "SCHEDULER_ARGS: ${SCHEDULER_ARGS[*]}"

start_gpu_monitor

Expand All @@ -44,15 +82,14 @@ python3 -m sglang.launch_server \
--model "$MODEL" \
--host 0.0.0.0 \
--port "$PORT" \
--tp "$TP" \
--expert-parallel-size "$EP_SIZE" \
"${PARALLEL_ARGS[@]}" \
--reasoning-parser qwen3 \
--tool-call-parser qwen3_coder \
--enable-flashinfer-allreduce-fusion \
--max-running-requests 64 \
--chunked-prefill-size 8192 \
--max-running-requests 256 \
--chunked-prefill-size 16384 \
--decode-log-interval 1 \
--mem-fraction-static 0.75 \
--mem-fraction-static 0.8 \
--cuda-graph-max-bs "$CONC" \
--context-length "$MAX_SEQ_LEN" \
--kv-cache-dtype fp8_e4m3 \
Expand All @@ -62,7 +99,9 @@ python3 -m sglang.launch_server \
--tokenizer-worker-num 6 \
--mamba-ssm-dtype bfloat16 \
--disable-radix-cache \
--enable-symm-mem \
--trust-remote-code \
"${SCHEDULER_ARGS[@]}" \
> "$SERVER_LOG" 2>&1 &

SERVER_PID=$!
Expand Down
9 changes: 9 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3050,3 +3050,12 @@
description:
- "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475

- config-keys:
- qwen3.5-fp8-h100-sglang
description:
- "Tune Qwen3.5-397B-A17B-FP8 H100 SGLang aggregate recipe for 1k/1k and 8k/1k sweeps"
- "Use TP8/EP1 for conc 1-8, TP8/EP8 for conc 16-64, and TP8/EP8 DP-attention for conc 128-256"
- "Use scheduler-recv-interval values 2/60/30/1200/600 for non-DP conc 1-4/8/16/32/64"
- "Set max-running-requests=256, chunked-prefill-size=16384, mem-fraction-static=0.8, cuda-graph-max-bs=CONC, and enable symm-mem"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1544