diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..9648e52e5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1883,14 +1883,7 @@ dsr1-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - + - { tp: 4, ep: 4, conc-start: 4, conc-end: 4 } dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14 model: nvidia/DeepSeek-R1-0528-FP4-V2 @@ -1990,20 +1983,9 @@ dsr1-fp8-b300-sglang: scenarios: fixed-seq-len: - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 osl: 1024 search-space: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } - -# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# lists B200 (not B300) as the Blackwell target. This config reuses the -# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300 -# until a B300-specific recipe ships. Prefix caching is disabled. -# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15 model: deepseek-ai/DeepSeek-V4-Pro @@ -2025,26 +2007,7 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 } - - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 } - - # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is - # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by - # DP_ATTENTION: - # dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192 - # + EAGLE (3,1,4) + mem-fraction 0.90 - # dp-attn: true -> DP-attn + flashinfer_mxfp4 + chunked-prefill 32768 - # + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256 + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 } dsv4-fp4-b300-sglang-mtp: image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3 model: deepseek-ai/DeepSeek-V4-Pro @@ -2062,14 +2025,7 @@ dsv4-fp4-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B @@ -2281,12 +2237,7 @@ glm5-fp8-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } glm5-fp8-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: zai-org/GLM-5-FP8 @@ -2300,12 +2251,7 @@ glm5-fp8-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } glm5-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/GLM-5-NVFP4 @@ -2364,14 +2310,7 @@ glm5-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 } glm5-fp4-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.11-cu130 model: nvidia/GLM-5-NVFP4 @@ -2385,14 +2324,7 @@ glm5-fp4-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } qwen3.5-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2428,12 +2360,7 @@ qwen3.5-fp8-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } qwen3.5-fp8-b300-sglang: image: lmsysorg/sglang:v0.5.10.post1-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -2447,12 +2374,7 @@ qwen3.5-fp8-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 } qwen3.5-fp4-b300-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 @@ -2466,14 +2388,7 @@ qwen3.5-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 } - + - { tp: 2, ep: 2, conc-start: 4, conc-end: 4 } qwen3.5-fp4-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: nvidia/Qwen3.5-397B-A17B-NVFP4 @@ -2487,14 +2402,7 @@ qwen3.5-fp4-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp } - + - { tp: 2, ep: 2, conc-start: 4, conc-end: 4, spec-decoding: mtp } qwen3.5-bf16-b300-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B @@ -2508,14 +2416,7 @@ qwen3.5-bf16-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 } qwen3.5-bf16-b300-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B @@ -2529,14 +2430,7 @@ qwen3.5-bf16-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } kimik2.5-int4-b200-vllm: image: vllm/vllm-openai:v0.21.0 model: moonshotai/Kimi-K2.5 @@ -2600,14 +2494,7 @@ kimik2.5-int4-b300-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 } kimik2.5-int4-h200-vllm: image: vllm/vllm-openai:v0.21.0 model: moonshotai/Kimi-K2.5 @@ -2716,14 +2603,7 @@ kimik2.5-fp4-b300-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 } - + - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 } dsr1-fp8-b200-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 model: deepseek-ai/DeepSeek-R1-0528 @@ -2760,20 +2640,7 @@ dsr1-fp8-b300-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp } - -# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below; -# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130' -# - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4' -# - model-prefix: 'dsr1' -> 'kimik2.5' -# - precision: 'fp8' -> 'fp4' -# - framework: 'sglang' -> 'vllm' + - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp } kimik2.5-fp4-b300-vllm-agentic: # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the @@ -3014,24 +2881,7 @@ dsv4-fp4-b300-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 1, conc-end: 128 } - - { tp: 8, conc-start: 1, conc-end: 4 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 2048, conc-end: 2048 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 8192 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 64 } - - { tp: 8, conc-start: 1, conc-end: 4 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 } - -# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is -# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this -# PR adds an agentic-coding scenarios block that differs from main -# (either main had none or had a different conc/offload sweep). -# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main. + - { tp: 4, conc-start: 4, conc-end: 4 } dsv4-fp4-b300-vllm-agentic: image: vllm/vllm-openai:v0.20.0-cu130 model: deepseek-ai/DeepSeek-V4-Pro @@ -3064,16 +2914,7 @@ dsv4-fp4-b300-trt: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 1, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - + - { tp: 4, conc-start: 4, conc-end: 4 } dsv4-fp4-b300-trt-mtp: image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715 model: deepseek-ai/DeepSeek-V4-Pro @@ -3087,16 +2928,7 @@ dsv4-fp4-b300-trt-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp } - + - { tp: 4, conc-start: 4, conc-end: 4, spec-decoding: mtp } dsv4-fp4-b300-vllm-mtp: image: vllm/vllm-openai:v0.21.0 model: deepseek-ai/DeepSeek-V4-Pro @@ -3110,17 +2942,7 @@ dsv4-fp4-b300-vllm-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp } - - { tp: 8, conc-start: 1, conc-end: 8, spec-decoding: mtp } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp } - + - { tp: 4, conc-start: 4, conc-end: 4, spec-decoding: mtp } qwen3.5-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.12-cu130 model: Qwen/Qwen3.5-397B-A17B-FP8 @@ -4373,21 +4195,7 @@ minimaxm2.5-fp8-b300-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 } - - { tp: 2, ep: 2, conc-start: 512, conc-end: 1024 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 1024, conc-end: 1024 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, conc-start: 64, conc-end: 256 } - - { tp: 4, conc-start: 4, conc-end: 8 } - -# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below; -# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so -# its fixed-seq-len sweep is unaffected. -# - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1' + - { tp: 1, conc-start: 4, conc-end: 4 } minimaxm2.5-fp8-b300-vllm-agentic: image: vllm/vllm-openai:v0.19.1 model: MiniMaxAI/MiniMax-M2.5 @@ -4477,20 +4285,7 @@ minimaxm2.5-fp4-b300-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 4, conc-end: 8 } - - { tp: 2, ep: 2, conc-start: 128, conc-end: 128 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 2048 } - - { tp: 4, conc-start: 8, conc-end: 8 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 256 } - - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 4 } - + - { tp: 1, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.21.0 model: openai/gpt-oss-120b diff --git a/benchmarks/single_node/dsr1_fp4_b300.sh b/benchmarks/single_node/dsr1_fp4_b300.sh index 917f4f5f3..56b5f6256 100644 --- a/benchmarks/single_node/dsr1_fp4_b300.sh +++ b/benchmarks/single_node/dsr1_fp4_b300.sh @@ -16,11 +16,22 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi @@ -44,8 +55,8 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +--tensor-parallel-size $TP --data-parallel-size 1 \ --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 16384 \ --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ diff --git a/benchmarks/single_node/dsr1_fp8_b300.sh b/benchmarks/single_node/dsr1_fp8_b300.sh index 2d475bc0b..e84abfe64 100644 --- a/benchmarks/single_node/dsr1_fp8_b300.sh +++ b/benchmarks/single_node/dsr1_fp8_b300.sh @@ -16,13 +16,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGL_ENABLE_JIT_DEEPGEMM=false export SGLANG_ENABLE_FLASHINFER_GEMM=true @@ -76,8 +87,8 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ ---tensor-parallel-size=$TP --data-parallel-size=1 \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ +--tensor-parallel-size $TP --data-parallel-size 1 \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \ diff --git a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh index d16cbcf8e..0e132a4c0 100755 --- a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh +++ b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh @@ -16,13 +16,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_ENABLE_JIT_DEEPGEMM=false @@ -70,11 +81,11 @@ start_gpu_monitor set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ - --model-path=$MODEL \ - --host=0.0.0.0 \ - --port=$PORT \ - --tensor-parallel-size=$TP \ - --data-parallel-size=1 \ + --model-path $MODEL_PATH --served-model-name $MODEL \ + --host 0.0.0.0 \ + --port $PORT \ + --tensor-parallel-size $TP \ + --data-parallel-size 1 \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \ --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC \ @@ -84,7 +95,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \ --enable-flashinfer-allreduce-fusion \ --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --disable-radix-cache \ - --fp8-gemm-backend=flashinfer_trtllm \ + --fp8-gemm-backend flashinfer_trtllm \ --attention-backend trtllm_mla \ --stream-interval 30 \ --ep-size $EP_SIZE \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 8f43ea8a3..b6ced157c 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -12,14 +12,20 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" fi -# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip -# `hf download`. Only fetch when MODEL looks like a HF repo ID. -if [[ "$MODEL" != /* ]]; then - hf download "$MODEL" +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi @@ -172,7 +178,7 @@ fi set -x PYTHONNOUSERSITE=1 sglang serve \ - --model-path $MODEL \ + --model-path $MODEL_PATH --served-model-name $MODEL \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh index 03102778d..3f0e24cee 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh @@ -23,14 +23,20 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" fi -# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip -# `hf download`. Only fetch when MODEL looks like a HF repo ID. -if [[ "$MODEL" != /* ]]; then - hf download "$MODEL" +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi @@ -121,7 +127,7 @@ fi set -x PYTHONNOUSERSITE=1 sglang serve \ - --model-path $MODEL \ + --model-path $MODEL_PATH --served-model-name $MODEL \ --host 0.0.0.0 \ --port $PORT \ --trust-remote-code \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh index 754846912..99be66eac 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh @@ -18,6 +18,18 @@ check_env_vars \ DP_ATTENTION \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi @@ -47,10 +59,6 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" -if [[ "$MODEL" != /* ]]; then - hf download "$MODEL" -fi - nvidia-smi SERVER_LOG="$PWD/server.log" @@ -108,7 +116,7 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x SERVE_CMD=( - trtllm-serve "$MODEL" \ + trtllm-serve "$MODEL_PATH" \ --host 0.0.0.0 \ --port "$PORT" \ --trust_remote_code \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh index 8aa9d0e78..6f617ddf0 100644 --- a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh @@ -17,6 +17,18 @@ check_env_vars \ DP_ATTENTION \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi @@ -46,10 +58,6 @@ sanitize_slurm_mpi_env_for_trtllm export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}" echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE" -if [[ "$MODEL" != /* ]]; then - hf download "$MODEL" -fi - nvidia-smi SERVER_LOG="$PWD/server.log" @@ -111,7 +119,7 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x SERVE_CMD=( - trtllm-serve "$MODEL" \ + trtllm-serve "$MODEL_PATH" \ --host 0.0.0.0 \ --port "$PORT" \ --trust_remote_code \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh index 92d4bf4ad..fbdacb120 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh @@ -17,13 +17,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -67,7 +78,7 @@ fi start_gpu_monitor set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --pipeline-parallel-size 1 \ --kv-cache-dtype fp8 \ diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh index cb41a9eb1..3a35a84c3 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh @@ -13,13 +13,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -60,7 +71,7 @@ NUM_SPEC_TOKENS=2 start_gpu_monitor set -x -vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \ +vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \ "${PARALLEL_ARGS[@]}" \ --pipeline-parallel-size 1 \ --kv-cache-dtype fp8 \ diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/glm5_fp4_b300.sh index b751ddf7a..64c3551b2 100755 --- a/benchmarks/single_node/glm5_fp4_b300.sh +++ b/benchmarks/single_node/glm5_fp4_b300.sh @@ -16,13 +16,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -38,9 +49,9 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP \ +--tensor-parallel-size $TP \ --data-parallel-size 1 --expert-parallel-size $EP_SIZE \ --disable-radix-cache \ --quantization modelopt_fp4 \ @@ -56,7 +67,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --stream-interval 30 \ --scheduler-recv-interval 10 \ --tokenizer-worker-num 6 \ ---tokenizer-path $MODEL $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +--tokenizer-path $MODEL_PATH $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/glm5_fp4_b300_mtp.sh index db586dad8..546bb429b 100755 --- a/benchmarks/single_node/glm5_fp4_b300_mtp.sh +++ b/benchmarks/single_node/glm5_fp4_b300_mtp.sh @@ -15,13 +15,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" @@ -43,9 +54,9 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP \ +--tensor-parallel-size $TP \ --data-parallel-size 1 --expert-parallel-size 1 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh index 730cc3950..1bdfbdfda 100644 --- a/benchmarks/single_node/glm5_fp8_b300.sh +++ b/benchmarks/single_node/glm5_fp8_b300.sh @@ -15,13 +15,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" @@ -46,9 +57,9 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP \ +--tensor-parallel-size $TP \ --data-parallel-size 1 --expert-parallel-size 1 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh index 0d4290dd3..00137c62a 100755 --- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/glm5_fp8_b300_mtp.sh @@ -15,13 +15,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1" @@ -47,9 +58,9 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP \ +--tensor-parallel-size $TP \ --data-parallel-size 1 --expert-parallel-size 1 \ --tool-call-parser glm47 \ --reasoning-parser glm45 \ diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/kimik2.5_fp4_b300.sh index 44a06ebd5..65cc13e30 100755 --- a/benchmarks/single_node/kimik2.5_fp4_b300.sh +++ b/benchmarks/single_node/kimik2.5_fp4_b300.sh @@ -16,11 +16,22 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi @@ -38,8 +49,8 @@ fi start_gpu_monitor set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ ---tensor-parallel-size=$TP \ +vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ +--tensor-parallel-size $TP \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ --max-num-seqs $CONC \ diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/kimik2.5_int4_b300.sh index 6674ad8dd..105543e5f 100755 --- a/benchmarks/single_node/kimik2.5_int4_b300.sh +++ b/benchmarks/single_node/kimik2.5_int4_b300.sh @@ -16,11 +16,22 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi @@ -38,7 +49,7 @@ fi start_gpu_monitor set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT \ +vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --gpu-memory-utilization 0.95 \ --tensor-parallel-size $TP \ --max-model-len $MAX_MODEL_LEN \ diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh index a2861b441..446472e35 100755 --- a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh +++ b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh @@ -18,13 +18,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -32,11 +43,11 @@ PORT=${PORT:-8888} export VLLM_FLOAT32_MATMUL_PRECISION=high if [ "${DP_ATTENTION}" = "true" ]; then - PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel" + PARALLEL_ARGS="--tensor-parallel-size 1 --data-parallel-size $TP --enable-expert-parallel" elif [ "$EP_SIZE" -gt 1 ]; then - PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel" + PARALLEL_ARGS="--tensor-parallel-size $TP --enable-expert-parallel" else - PARALLEL_ARGS="--tensor-parallel-size=$TP" + PARALLEL_ARGS="--tensor-parallel-size $TP" fi if [ "${EVAL_ONLY}" = "true" ]; then @@ -47,7 +58,7 @@ fi start_gpu_monitor set -x -vllm serve $MODEL --port $PORT \ +vllm serve $MODEL_PATH --served-model-name $MODEL --port $PORT \ $PARALLEL_ARGS \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh index 30821961f..11b48e848 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh @@ -17,13 +17,24 @@ check_env_vars \ RANDOM_RANGE_RATIO \ RESULT_FILENAME +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} @@ -44,12 +55,12 @@ fi start_gpu_monitor set -x -vllm serve $MODEL --port $PORT \ ---tensor-parallel-size=$TP \ +vllm serve $MODEL_PATH --served-model-name $MODEL --port $PORT \ +--tensor-parallel-size $TP \ $EP \ --gpu-memory-utilization 0.90 \ --max-model-len $MAX_MODEL_LEN \ ---block-size=32 \ +--block-size 32 \ --kv-cache-dtype fp8 \ --max-cudagraph-capture-size 2048 \ --max-num-batched-tokens "$((ISL * 2 ))" \ diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh index f1056c896..29c3f7551 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b300.sh +++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh @@ -12,13 +12,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export NCCL_NVLS_ENABLE=1 export SGL_ENABLE_JIT_DEEPGEMM=false @@ -52,9 +63,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ ---served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --host 0.0.0.0 --port $PORT \ +--served-model-name $MODEL --trust-remote-code \ +--tensor-parallel-size $TP --data-parallel-size 1 --ep-size $EP_SIZE \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh index 705ca9775..91678a12e 100755 --- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh @@ -12,13 +12,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export NCCL_NVLS_ENABLE=1 export SGL_ENABLE_JIT_DEEPGEMM=false @@ -52,9 +63,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ ---served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --host 0.0.0.0 --port $PORT \ +--served-model-name $MODEL --trust-remote-code \ +--tensor-parallel-size $TP --data-parallel-size 1 --ep-size $EP_SIZE \ --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \ --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \ --context-length $CONTEXT_LENGTH --disable-radix-cache \ diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/qwen3.5_fp4_b300.sh index 18b6cda09..41c891e05 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b300.sh +++ b/benchmarks/single_node/qwen3.5_fp4_b300.sh @@ -15,13 +15,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export NCCL_NVLS_ENABLE=1 export SGL_ENABLE_JIT_DEEPGEMM=false @@ -61,9 +72,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +--tensor-parallel-size $TP --data-parallel-size 1 --ep-size $EP_SIZE \ --reasoning-parser qwen3 \ --tool-call-parser qwen3_coder \ --mamba-scheduler-strategy no_buffer \ diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh index 9cb5d5464..cc957a623 100755 --- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh @@ -15,13 +15,24 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export NCCL_NVLS_ENABLE=1 export SGL_ENABLE_JIT_DEEPGEMM=false @@ -61,9 +72,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +--tensor-parallel-size $TP --data-parallel-size 1 --ep-size $EP_SIZE \ --reasoning-parser qwen3 \ --tool-call-parser qwen3_coder \ --mamba-scheduler-strategy no_buffer \ diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh index b87d25e91..42c7a3c0b 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh @@ -12,6 +12,18 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi @@ -31,9 +43,9 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \ +--tensor-parallel-size $TP --data-parallel-size 1 --expert-parallel-size $EP_SIZE \ --enable-symm-mem \ --disable-radix-cache \ --quantization fp8 \ @@ -49,7 +61,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0. --stream-interval 50 \ --scheduler-recv-interval 10 \ --tokenizer-worker-num 6 \ ---tokenizer-path $MODEL \ +--tokenizer-path $MODEL_PATH \ --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh index a0c5f4828..ea521769d 100644 --- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh @@ -12,6 +12,18 @@ check_env_vars \ RESULT_FILENAME \ EP_SIZE +# `hf download` creates the target dir if missing and is itself idempotent. +# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE +# Either way, MODEL_PATH is what the server is launched with. +if [[ -n "${MODEL_PATH:-}" ]]; then + if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then + hf download "$MODEL" --local-dir "$MODEL_PATH" + fi +else + hf download "$MODEL" + export MODEL_PATH="$MODEL" +fi + if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi @@ -31,9 +43,9 @@ fi start_gpu_monitor set -x -SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \ +--tensor-parallel-size $TP --data-parallel-size 1 --expert-parallel-size $EP_SIZE \ --enable-symm-mem \ --disable-radix-cache \ --quantization fp8 \ @@ -49,7 +61,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod --stream-interval 50 \ --scheduler-recv-interval 10 \ --tokenizer-worker-num 6 \ ---tokenizer-path $MODEL \ +--tokenizer-path $MODEL_PATH \ --speculative-algorithm EAGLE \ --speculative-num-steps 3 \ --speculative-eagle-topk 1 \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 684d40dcc..423bad7cb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3068,3 +3068,32 @@ description: - "Bump image to rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1548 + +- config-keys: + - dsr1-fp4-b300-sglang + - dsr1-fp8-b300-sglang + - dsr1-fp8-b300-sglang-mtp + - dsv4-fp4-b300-sglang + - dsv4-fp4-b300-sglang-mtp + - dsv4-fp4-b300-vllm + - dsv4-fp4-b300-vllm-mtp + - dsv4-fp4-b300-trt + - dsv4-fp4-b300-trt-mtp + - glm5-fp8-b300-sglang + - glm5-fp8-b300-sglang-mtp + - glm5-fp4-b300-sglang + - glm5-fp4-b300-sglang-mtp + - qwen3.5-fp8-b300-sglang + - qwen3.5-fp8-b300-sglang-mtp + - qwen3.5-fp4-b300-sglang + - qwen3.5-fp4-b300-sglang-mtp + - qwen3.5-bf16-b300-sglang + - qwen3.5-bf16-b300-sglang-mtp + - kimik2.5-int4-b300-vllm + - kimik2.5-fp4-b300-vllm + - minimaxm2.5-fp8-b300-vllm + - minimaxm2.5-fp4-b300-vllm + description: + - "Migrate B300 agg model path to /scratch/models/; set HF_HUB_CACHE separately for eval dataset" + - "Introduce MODEL_PATH (filesystem path) separately from MODEL (HF id)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1539 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index cca8b4ab0..7b4a712f9 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -279,17 +279,48 @@ done find . -name '.nfs*' -delete 2>/dev/null || true else + # HF_HUB_CACHE is set to help with dataset download inside the container + # for eval jobs. Can be updated to some other path on the cluster and + # mounted just like HF_HUB_CACHE_MOUNT. + export HF_HUB_CACHE="$HOME/.cache/huggingface" + + # HF_HUB_CACHE_MOUNT is read-only and holds the pre-staged weights below. + # WRITABLE_MODELS_DIR is writable; the benchmark script downloads anything not + # in the staged list there. + HF_HUB_CACHE_MOUNT="/scratch/models/" + WRITABLE_MODELS_DIR="/data/models/" + + # Pre-staged model + STAGED_MODELS=( + DeepSeek-R1-0528 + DeepSeek-R1-0528-NVFP4-v2 + DeepSeek-V4-Flash + DeepSeek-V4-Pro + GLM-5-FP8 + GLM-5-NVFP4 + GLM-5.1 + Kimi-K2.5 + Kimi-K2.5-NVFP4 + Kimi-K2.6 + MiniMax-M2.5 + MiniMax-M2.5-NVFP4 + MiniMax-M2.7 + MiniMax-M2.7-NVFP4 + Qwen3.5-397B-A17B + Qwen3.5-397B-A17B-FP8 + Qwen3.5-397B-A17B-NVFP4 + gpt-oss-120b + ) - # Pre-staged models on the B300 cluster live under /data/models. Point MODEL - # at the local copy so the benchmark skips `hf download` and reads from the - # mounted dir. Other models fall through and use `hf download` from their - # benchmark script. - HF_HUB_CACHE_MOUNT="/data/models" - if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then - export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" - elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then - export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" + # MODEL stays as the HF id for the client (--served-model-name, tokenizer); + # MODEL_PATH is what the server reads weights from. + MODEL_BASENAME="${MODEL##*/}" + if [[ " ${STAGED_MODELS[*]} " == *" ${MODEL_BASENAME} "* ]]; then + export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}/${MODEL_BASENAME}" + else + export MODEL_PATH="${WRITABLE_MODELS_DIR%/}/${MODEL_BASENAME}" fi + SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') # Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models @@ -337,7 +368,7 @@ else srun --jobid=$JOB_ID \ --mpi=none \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \ --no-container-mount-home \ --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \