From 6377d3531df3b9f69187b5c0f15900a31857046b Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Wed, 20 May 2026 16:19:22 -0700
Subject: [PATCH 1/5] b300(agg): migrate model path to /scratch/models/, drop
 hf download

- launcher (launch_b300-nv.sh, agg block): switch HF_HUB_CACHE_MOUNT from /data/models to /scratch/models/ (audit-recommended root), set HF_HUB_CACHE to the container's ~/.cache/huggingface for eval dataset downloads, and replace the Qwen-FP8/dsv4 special-case rewrite with a universal MODEL rewrite that maps an HF id (org/name) to an absolute local path under the mount.
- bench scripts: drop "hf download $MODEL" from 21 in-scope B300 scripts; the contract is now staged-or-fail, i.e. the scripts no longer fetch the model themselves and rely on it already being staged.
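- configs: pin 23 in-scope B300 keys to runner: b300-nv so jobs route only to the NVIDIA runner, and reduce search-space to a single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end. This drops the isl=8192 scenarios and also removes the explanatory comments that preceded several of the following config entries.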

Out of scope: disagg/multi-node block, multi-node B300 configs (dynamo-trt/-vllm), and agentic configs/bench scripts.
---
 .github/configs/nvidia-master.yaml            | 297 +++---------------
 benchmarks/single_node/dsr1_fp4_b300.sh       |   1 -
 benchmarks/single_node/dsr1_fp8_b300.sh       |   1 -
 benchmarks/single_node/dsr1_fp8_b300_mtp.sh   |   1 -
 .../single_node/dsv4_fp4_b300_sglang.sh       |   6 -
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   |   6 -
 benchmarks/single_node/dsv4_fp4_b300_trt.sh   |   4 -
 .../single_node/dsv4_fp4_b300_trt_mtp.sh      |   4 -
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh  |   1 -
 .../single_node/dsv4_fp4_b300_vllm_mtp.sh     |   1 -
 benchmarks/single_node/glm5_fp4_b300.sh       |   1 -
 benchmarks/single_node/glm5_fp4_b300_mtp.sh   |   1 -
 benchmarks/single_node/glm5_fp8_b300.sh       |   1 -
 benchmarks/single_node/glm5_fp8_b300_mtp.sh   |   1 -
 benchmarks/single_node/kimik2.5_fp4_b300.sh   |   1 -
 benchmarks/single_node/kimik2.5_int4_b300.sh  |   1 -
 .../single_node/minimaxm2.5_fp4_b300.sh       |   1 -
 .../single_node/minimaxm2.5_fp8_b300.sh       |   1 -
 benchmarks/single_node/qwen3.5_bf16_b300.sh   |   1 -
 .../single_node/qwen3.5_bf16_b300_mtp.sh      |   1 -
 benchmarks/single_node/qwen3.5_fp4_b300.sh    |   1 -
 .../single_node/qwen3.5_fp4_b300_mtp.sh       |   1 -
 perf-changelog.yaml                           |  31 ++
 runners/launch_b300-nv.sh                     |  21 +-
 24 files changed, 89 insertions(+), 297 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4a683d372..90a3b35c1 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1874,7 +1874,7 @@ dsr1-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -1883,14 +1883,7 @@ dsr1-fp4-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
-
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 4 }
 dsr1-fp4-b200-trt:
   image: nvcr.io#nvidia/tensorrt-llm/release:1.3.0rc14
   model: nvidia/DeepSeek-R1-0528-FP4-V2
@@ -1983,7 +1976,7 @@ dsr1-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: sglang
   multinode: false
@@ -1992,23 +1985,12 @@ dsr1-fp8-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
-
-# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# lists B200 (not B300) as the Blackwell target. This config reuses the
-# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
-# until a B300-specific recipe ships. Prefix caching is disabled.
-# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 dsv4-fp4-b300-sglang:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -2025,31 +2007,12 @@ dsv4-fp4-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 8192, conc-end: 8192 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-      - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 4096 }
-
-  # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
-  # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by
-  # DP_ATTENTION:
-  #   dp-attn: false -> TP-only + flashinfer_mxfp4 + chunked-prefill 8192
-  #                     + EAGLE (3,1,4) + mem-fraction 0.90
-  #   dp-attn: true  -> DP-attn  + flashinfer_mxfp4 + chunked-prefill 32768
-  #                     + EAGLE (1,1,2) + mem-fraction 0.92 + max-running 256
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 dsv4-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -2062,14 +2025,7 @@ dsv4-fp4-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
@@ -2272,7 +2228,7 @@ glm5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: sglang
   multinode: false
@@ -2281,17 +2237,12 @@ glm5-fp8-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 }
-
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
 glm5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: sglang
   multinode: false
@@ -2300,12 +2251,7 @@ glm5-fp8-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 glm5-fp4-b200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/GLM-5-NVFP4
@@ -2355,7 +2301,7 @@ glm5-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -2364,19 +2310,12 @@ glm5-fp4-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 glm5-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -2385,14 +2324,7 @@ glm5-fp4-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 qwen3.5-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -2419,7 +2351,7 @@ qwen3.5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: sglang
   multinode: false
@@ -2428,17 +2360,12 @@ qwen3.5-fp8-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256, spec-decoding: mtp }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 qwen3.5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.10.post1-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: sglang
   multinode: false
@@ -2447,17 +2374,12 @@ qwen3.5-fp8-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 256 }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 qwen3.5-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
   model-prefix: qwen3.5
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -2466,19 +2388,12 @@ qwen3.5-fp4-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 128 }
-      - { tp: 2, ep: 2, conc-start: 4, conc-end: 128 }
-
+      - { tp: 2, ep: 2, conc-start: 4, conc-end: 4 }
 qwen3.5-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
   model-prefix: qwen3.5
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: sglang
   multinode: false
@@ -2487,19 +2402,12 @@ qwen3.5-fp4-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-      - { tp: 2, ep: 2, conc-start: 4, conc-end: 128, spec-decoding: mtp }
-
+      - { tp: 2, ep: 2, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 qwen3.5-bf16-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
-  runner: b300
+  runner: b300-nv
   precision: bf16
   framework: sglang
   multinode: false
@@ -2508,19 +2416,12 @@ qwen3.5-bf16-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 qwen3.5-bf16-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
-  runner: b300
+  runner: b300-nv
   precision: bf16
   framework: sglang
   multinode: false
@@ -2529,14 +2430,7 @@ qwen3.5-bf16-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64, spec-decoding: mtp }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 kimik2.5-int4-b200-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: moonshotai/Kimi-K2.5
@@ -2591,7 +2485,7 @@ kimik2.5-int4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
-  runner: b300
+  runner: b300-nv
   precision: int4
   framework: vllm
   multinode: false
@@ -2600,14 +2494,7 @@ kimik2.5-int4-b300-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 kimik2.5-int4-h200-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: moonshotai/Kimi-K2.5
@@ -2707,7 +2594,7 @@ kimik2.5-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: vllm
   multinode: false
@@ -2716,14 +2603,7 @@ kimik2.5-fp4-b300-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 64 }
-
+      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
 dsr1-fp8-b200-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: deepseek-ai/DeepSeek-R1-0528
@@ -2751,7 +2631,7 @@ dsr1-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: sglang
   multinode: false
@@ -2760,20 +2640,7 @@ dsr1-fp8-b300-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 1, conc-start: 4, conc-end: 512, spec-decoding: mtp }
-
-# Diverged from kimik2.5-fp4-b300-vllm (agentic-coding sibling). Reasons below;
-# the original kimik2.5-fp4-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'lmsysorg/sglang:v0.5.10.post1-cu130' -> 'vllm/vllm-openai:v0.20.0-cu130'
-#   - model: 'deepseek-ai/DeepSeek-R1-0528' -> 'nvidia/Kimi-K2.5-NVFP4'
-#   - model-prefix: 'dsr1' -> 'kimik2.5'
-#   - precision: 'fp8' -> 'fp4'
-#   - framework: 'sglang' -> 'vllm'
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 kimik2.5-fp4-b300-vllm-agentic:
   # v0.20.2 (cu129) lacks the flashinfer kernels for B300's reported SM
   # (sm_12x); workers hit "Only SM 10.x and 11.x are supported" in the
@@ -3005,7 +2872,7 @@ dsv4-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: vllm
   multinode: false
@@ -3014,24 +2881,7 @@ dsv4-fp4-b300-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 1, conc-end: 128 }
-      - { tp: 8, conc-start: 1, conc-end: 4 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 4096, conc-end: 8192 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 64 }
-      - { tp: 8, conc-start: 1, conc-end: 4 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 2048, conc-end: 2048 }
-
-# Diverged from dsv4-fp4-b300-vllm (agentic-coding sibling). Metadata is
-# identical to origin/main's dsv4-fp4-b300-vllm; the split exists because this
-# PR adds an agentic-coding scenarios block that differs from main
-# (either main had none or had a different conc/offload sweep).
-# The original dsv4-fp4-b300-vllm entry stays byte-identical to origin/main.
+      - { tp: 4, conc-start: 4, conc-end: 4 }
 dsv4-fp4-b300-vllm-agentic:
   image: vllm/vllm-openai:v0.20.0-cu130
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -3055,7 +2905,7 @@ dsv4-fp4-b300-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: trt
   multinode: false
@@ -3064,21 +2914,12 @@ dsv4-fp4-b300-trt:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 1, conc-end: 64 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 32 }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64 }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
-
+      - { tp: 4, conc-start: 4, conc-end: 4 }
 dsv4-fp4-b300-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: trt
   multinode: false
@@ -3087,21 +2928,12 @@ dsv4-fp4-b300-trt-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 512, conc-end: 2048, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 32, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp }
-
+      - { tp: 4, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 dsv4-fp4-b300-vllm-mtp:
   image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: vllm
   multinode: false
@@ -3110,17 +2942,7 @@ dsv4-fp4-b300-vllm-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 1, conc-end: 256, spec-decoding: mtp }
-      - { tp: 8, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 1024, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 4, conc-start: 1, conc-end: 64, spec-decoding: mtp }
-      - { tp: 8, conc-start: 1, conc-end: 8, spec-decoding: mtp }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 256, spec-decoding: mtp }
-      - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 512, spec-decoding: mtp }
-
+      - { tp: 4, conc-start: 4, conc-end: 4, spec-decoding: mtp }
 qwen3.5-fp8-h200-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
@@ -4364,7 +4186,7 @@ minimaxm2.5-fp8-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
-  runner: b300
+  runner: b300-nv
   precision: fp8
   framework: vllm
   multinode: false
@@ -4373,21 +4195,7 @@ minimaxm2.5-fp8-b300-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, conc-start: 4, conc-end: 128 }
-      - { tp: 4, ep: 4, conc-start: 256, conc-end: 512 }
-      - { tp: 2, ep: 2, conc-start: 512, conc-end: 1024 }
-      - { tp: 2, ep: 2, dp-attn: true, conc-start: 1024, conc-end: 1024 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 16 }
-      - { tp: 2, conc-start: 64, conc-end: 256 }
-      - { tp: 4, conc-start: 4, conc-end: 8 }
-
-# Diverged from minimaxm2.5-fp8-b300-vllm (agentic-coding sibling). Reasons below;
-# the original minimaxm2.5-fp8-b300-vllm entry is left identical to origin/main so
-# its fixed-seq-len sweep is unaffected.
-#   - image: 'vllm/vllm-openai:v0.19.0-cu130' -> 'vllm/vllm-openai:v0.19.1'
+      - { tp: 1, conc-start: 4, conc-end: 4 }
 minimaxm2.5-fp8-b300-vllm-agentic:
   image: vllm/vllm-openai:v0.19.1
   model: MiniMaxAI/MiniMax-M2.5
@@ -4468,7 +4276,7 @@ minimaxm2.5-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: nvidia/MiniMax-M2.5-NVFP4
   model-prefix: minimaxm2.5
-  runner: b300
+  runner: b300-nv
   precision: fp4
   framework: vllm
   multinode: false
@@ -4477,20 +4285,7 @@ minimaxm2.5-fp4-b300-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 1, conc-start: 4, conc-end: 8 }
-      - { tp: 2, ep: 2, conc-start: 128, conc-end: 128 }
-      - { tp: 2, ep: 2, dp-attn: true, conc-start: 256, conc-end: 2048 }
-      - { tp: 4, conc-start: 8, conc-end: 8 }
-      - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 }
-      - { tp: 8, conc-start: 4, conc-end: 8 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 1, conc-start: 4, conc-end: 256 }
-      - { tp: 2, ep: 2, dp-attn: true, conc-start: 512, conc-end: 512 }
-      - { tp: 4, conc-start: 4, conc-end: 8 }
-      - { tp: 8, conc-start: 4, conc-end: 4 }
-
+      - { tp: 1, conc-start: 4, conc-end: 4 }
 gptoss-fp4-h100-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: openai/gpt-oss-120b
diff --git a/benchmarks/single_node/dsr1_fp4_b300.sh b/benchmarks/single_node/dsr1_fp4_b300.sh
index 917f4f5f3..afd26b3e1 100644
--- a/benchmarks/single_node/dsr1_fp4_b300.sh
+++ b/benchmarks/single_node/dsr1_fp4_b300.sh
@@ -20,7 +20,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 nvidia-smi
 
diff --git a/benchmarks/single_node/dsr1_fp8_b300.sh b/benchmarks/single_node/dsr1_fp8_b300.sh
index 2d475bc0b..94d73782f 100644
--- a/benchmarks/single_node/dsr1_fp8_b300.sh
+++ b/benchmarks/single_node/dsr1_fp8_b300.sh
@@ -22,7 +22,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGL_ENABLE_JIT_DEEPGEMM=false
 export SGLANG_ENABLE_FLASHINFER_GEMM=true
diff --git a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
index d16cbcf8e..abf8a86c8 100755
--- a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
@@ -22,7 +22,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGLANG_ENABLE_JIT_DEEPGEMM=false
 
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 8f43ea8a3..20320916d 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -16,12 +16,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
-# `hf download`. Only fetch when MODEL looks like a HF repo ID.
-if [[ "$MODEL" != /* ]]; then
-    hf download "$MODEL"
-fi
-
 nvidia-smi
 
 # Common SGLANG env vars (apply to every config).
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 03102778d..463acb964 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -27,12 +27,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
-# `hf download`. Only fetch when MODEL looks like a HF repo ID.
-if [[ "$MODEL" != /* ]]; then
-    hf download "$MODEL"
-fi
-
 nvidia-smi
 
 # Common SGLANG env vars (apply to every config).
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
index 754846912..0fb0a7674 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -47,10 +47,6 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
-if [[ "$MODEL" != /* ]]; then
-    hf download "$MODEL"
-fi
-
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
index 8aa9d0e78..290b0b782 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
@@ -46,10 +46,6 @@ sanitize_slurm_mpi_env_for_trtllm
 export NCCL_NVLS_ENABLE="${NCCL_NVLS_ENABLE:-0}"
 echo "NCCL_NVLS_ENABLE: $NCCL_NVLS_ENABLE"
 
-if [[ "$MODEL" != /* ]]; then
-    hf download "$MODEL"
-fi
-
 nvidia-smi
 
 SERVER_LOG="$PWD/server.log"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 92d4bf4ad..001c7ce10 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -23,7 +23,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index cb41a9eb1..d2c8e29b0 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -19,7 +19,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/glm5_fp4_b300.sh
index b751ddf7a..73ef6caa2 100755
--- a/benchmarks/single_node/glm5_fp4_b300.sh
+++ b/benchmarks/single_node/glm5_fp4_b300.sh
@@ -22,7 +22,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
index db586dad8..86be76c5d 100755
--- a/benchmarks/single_node/glm5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
@@ -21,7 +21,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 
diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh
index 730cc3950..c5debfd6b 100644
--- a/benchmarks/single_node/glm5_fp8_b300.sh
+++ b/benchmarks/single_node/glm5_fp8_b300.sh
@@ -21,7 +21,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 
diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
index 0d4290dd3..dc2178e15 100755
--- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
@@ -21,7 +21,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 pip install --no-deps "transformers==5.2.0" "huggingface-hub==1.4.1"
 
diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/kimik2.5_fp4_b300.sh
index 44a06ebd5..a68773b8a 100755
--- a/benchmarks/single_node/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_b300.sh
@@ -20,7 +20,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 nvidia-smi
 
diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/kimik2.5_int4_b300.sh
index 6674ad8dd..17d081159 100755
--- a/benchmarks/single_node/kimik2.5_int4_b300.sh
+++ b/benchmarks/single_node/kimik2.5_int4_b300.sh
@@ -20,7 +20,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 nvidia-smi
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
index a2861b441..3a18fda41 100755
--- a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
@@ -24,7 +24,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
index 30821961f..ba8cc6117 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
@@ -23,7 +23,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh
index f1056c896..2092391fb 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh
@@ -18,7 +18,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export NCCL_NVLS_ENABLE=1
 export SGL_ENABLE_JIT_DEEPGEMM=false
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
index 705ca9775..ed039f182 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
@@ -18,7 +18,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export NCCL_NVLS_ENABLE=1
 export SGL_ENABLE_JIT_DEEPGEMM=false
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/qwen3.5_fp4_b300.sh
index 18b6cda09..ddcc96ca0 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300.sh
+++ b/benchmarks/single_node/qwen3.5_fp4_b300.sh
@@ -21,7 +21,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export NCCL_NVLS_ENABLE=1
 export SGL_ENABLE_JIT_DEEPGEMM=false
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
index 9cb5d5464..eac6f08cd 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
@@ -21,7 +21,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export NCCL_NVLS_ENABLE=1
 export SGL_ENABLE_JIT_DEEPGEMM=false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 684d40dcc..4333fc1b3 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3068,3 +3068,34 @@
   description:
     - "Bump image to rocm/sgl-dev:rocm720-mi35x-8c3b5aa-20260521-DSv4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1548
+
+- config-keys:
+    - dsr1-fp4-b300-sglang
+    - dsr1-fp8-b300-sglang
+    - dsr1-fp8-b300-sglang-mtp
+    - dsv4-fp4-b300-sglang
+    - dsv4-fp4-b300-sglang-mtp
+    - dsv4-fp4-b300-vllm
+    - dsv4-fp4-b300-vllm-mtp
+    - dsv4-fp4-b300-trt
+    - dsv4-fp4-b300-trt-mtp
+    - glm5-fp8-b300-sglang
+    - glm5-fp8-b300-sglang-mtp
+    - glm5-fp4-b300-sglang
+    - glm5-fp4-b300-sglang-mtp
+    - qwen3.5-fp8-b300-sglang
+    - qwen3.5-fp8-b300-sglang-mtp
+    - qwen3.5-fp4-b300-sglang
+    - qwen3.5-fp4-b300-sglang-mtp
+    - qwen3.5-bf16-b300-sglang
+    - qwen3.5-bf16-b300-sglang-mtp
+    - kimik2.5-int4-b300-vllm
+    - kimik2.5-fp4-b300-vllm
+    - minimaxm2.5-fp8-b300-vllm
+    - minimaxm2.5-fp4-b300-vllm
+  description:
+    - "Migrate B300 agg model path to /scratch/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
+    - "Replace the partial Qwen-FP8/dsv4 special-case MODEL rewrite with a universal rewrite from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
+    - "Pin in-scope configs to runner: b300-nv so jobs are picked up by the NVIDIA runner only"
+    - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index cca8b4ab0..6d574b61c 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -280,16 +280,19 @@ find . -name '.nfs*' -delete 2>/dev/null || true
 
 else
 
-    # Pre-staged models on the B300 cluster live under /data/models. Point MODEL
-    # at the local copy so the benchmark skips `hf download` and reads from the
-    # mounted dir. Other models fall through and use `hf download` from their
-    # benchmark script.
-    HF_HUB_CACHE_MOUNT="/data/models"
-    if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
-        export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
-    elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
-        export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
+    HF_HUB_CACHE_MOUNT="/scratch/models/"
+
+    # HF_HUB_CACHE is set to help with dataset download inside the container
+    # for eval jobs. Can be updated to some other path on the cluster and
+    # mounted just like HF_HUB_CACHE_MOUNT.
+    export HF_HUB_CACHE="$HOME/.cache/huggingface"
+
+    # Rewrite MODEL from HF id (org/name) to the pre-staged local path under
+    # HF_HUB_CACHE_MOUNT. Skip if MODEL is already an absolute path.
+    if [[ -n "$MODEL" && "$MODEL" != /* ]]; then
+        export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}"
     fi
+
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     # Prefer a framework-tagged script (e.g. dsv4_fp4_b300_sglang.sh) so models

From 629c2ac7400e2cf5a296bd9e66908888a079f345 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Wed, 20 May 2026 16:19:46 -0700
Subject: [PATCH 2/5] perf-changelog: set PR link to #1539

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 4333fc1b3..45fc75f0f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3098,4 +3098,4 @@
     - "Replace the partial Qwen-FP8/dsv4 special-case MODEL rewrite with a universal rewrite from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
     - "Pin in-scope configs to runner: b300-nv so jobs are picked up by the NVIDIA runner only"
     - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1539

From b46504ac6c44ed5ad8c30c2847aed4da578dd6a6 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Wed, 20 May 2026 17:06:17 -0700
Subject: [PATCH 3/5] revert runner to b300 (drop b300-nv suffix)

---
 .github/configs/nvidia-master.yaml | 46 +++++++++++++++---------------
 perf-changelog.yaml                |  1 -
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 90a3b35c1..04a5d7e64 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1874,7 +1874,7 @@ dsr1-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/DeepSeek-R1-0528-FP4-V2
   model-prefix: dsr1
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -1976,7 +1976,7 @@ dsr1-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: sglang
   multinode: false
@@ -1990,7 +1990,7 @@ dsv4-fp4-b300-sglang:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -2012,7 +2012,7 @@ dsv4-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -2228,7 +2228,7 @@ glm5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: sglang
   multinode: false
@@ -2242,7 +2242,7 @@ glm5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: zai-org/GLM-5-FP8
   model-prefix: glm5
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: sglang
   multinode: false
@@ -2301,7 +2301,7 @@ glm5-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -2315,7 +2315,7 @@ glm5-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: nvidia/GLM-5-NVFP4
   model-prefix: glm5
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -2351,7 +2351,7 @@ qwen3.5-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.11-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: sglang
   multinode: false
@@ -2365,7 +2365,7 @@ qwen3.5-fp8-b300-sglang:
   image: lmsysorg/sglang:v0.5.10.post1-cu130
   model: Qwen/Qwen3.5-397B-A17B-FP8
   model-prefix: qwen3.5
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: sglang
   multinode: false
@@ -2379,7 +2379,7 @@ qwen3.5-fp4-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
   model-prefix: qwen3.5
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -2393,7 +2393,7 @@ qwen3.5-fp4-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: nvidia/Qwen3.5-397B-A17B-NVFP4
   model-prefix: qwen3.5
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: sglang
   multinode: false
@@ -2407,7 +2407,7 @@ qwen3.5-bf16-b300-sglang:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
-  runner: b300-nv
+  runner: b300
   precision: bf16
   framework: sglang
   multinode: false
@@ -2421,7 +2421,7 @@ qwen3.5-bf16-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: Qwen/Qwen3.5-397B-A17B
   model-prefix: qwen3.5
-  runner: b300-nv
+  runner: b300
   precision: bf16
   framework: sglang
   multinode: false
@@ -2485,7 +2485,7 @@ kimik2.5-int4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: moonshotai/Kimi-K2.5
   model-prefix: kimik2.5
-  runner: b300-nv
+  runner: b300
   precision: int4
   framework: vllm
   multinode: false
@@ -2594,7 +2594,7 @@ kimik2.5-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: nvidia/Kimi-K2.5-NVFP4
   model-prefix: kimik2.5
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: vllm
   multinode: false
@@ -2631,7 +2631,7 @@ dsr1-fp8-b300-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: sglang
   multinode: false
@@ -2872,7 +2872,7 @@ dsv4-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: vllm
   multinode: false
@@ -2905,7 +2905,7 @@ dsv4-fp4-b300-trt:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: trt
   multinode: false
@@ -2919,7 +2919,7 @@ dsv4-fp4-b300-trt-mtp:
   image: ghcr.io#semianalysisai/trtllm-deepseek-v4:feat-deepseek_v4-9aa3715
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: trt
   multinode: false
@@ -2933,7 +2933,7 @@ dsv4-fp4-b300-vllm-mtp:
   image: vllm/vllm-openai:v0.21.0
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: vllm
   multinode: false
@@ -4186,7 +4186,7 @@ minimaxm2.5-fp8-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: MiniMaxAI/MiniMax-M2.5
   model-prefix: minimaxm2.5
-  runner: b300-nv
+  runner: b300
   precision: fp8
   framework: vllm
   multinode: false
@@ -4276,7 +4276,7 @@ minimaxm2.5-fp4-b300-vllm:
   image: vllm/vllm-openai:v0.21.0
   model: nvidia/MiniMax-M2.5-NVFP4
   model-prefix: minimaxm2.5
-  runner: b300-nv
+  runner: b300
   precision: fp4
   framework: vllm
   multinode: false
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 45fc75f0f..e31331b3a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3096,6 +3096,5 @@
   description:
     - "Migrate B300 agg model path to /scratch/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
     - "Replace the partial Qwen-FP8/dsv4 special-case MODEL rewrite with a universal rewrite from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
-    - "Pin in-scope configs to runner: b300-nv so jobs are picked up by the NVIDIA runner only"
     - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1539

From 5d2e1d4e29d4a3541d61a61b42544ca74ca56636 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Thu, 21 May 2026 13:00:30 -0700
Subject: [PATCH 4/5] b300(agg): add hf download fallback, separate MODEL_PATH
 from MODEL

Launcher resolves MODEL_PATH from a B300 staged allow-list: pre-staged
models map to read-only /scratch/models/<basename>; everything else maps
to writable /data/models/<basename> (now bind-mounted into the srun
container). Bench scripts run hf download --local-dir "$MODEL_PATH"
(idempotent: hf manages a .cache/huggingface/ inside --local-dir and
skips up-to-date files) then serve weights from $MODEL_PATH; when
MODEL_PATH is unset (stand-alone runs) they fall back to the default HF
cache and serve the HF id directly. $MODEL stays the HF id for the
client (tokenizer, --served-model-name).
---
 benchmarks/single_node/dsr1_fp4_b300.sh       | 12 ++++-
 benchmarks/single_node/dsr1_fp8_b300.sh       | 12 ++++-
 benchmarks/single_node/dsr1_fp8_b300_mtp.sh   | 12 ++++-
 .../single_node/dsv4_fp4_b300_sglang.sh       | 12 ++++-
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh   | 12 ++++-
 benchmarks/single_node/dsv4_fp4_b300_trt.sh   | 12 ++++-
 .../single_node/dsv4_fp4_b300_trt_mtp.sh      | 12 ++++-
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh  | 12 ++++-
 .../single_node/dsv4_fp4_b300_vllm_mtp.sh     | 12 ++++-
 benchmarks/single_node/glm5_fp4_b300.sh       | 14 +++++-
 benchmarks/single_node/glm5_fp4_b300_mtp.sh   | 12 ++++-
 benchmarks/single_node/glm5_fp8_b300.sh       | 12 ++++-
 benchmarks/single_node/glm5_fp8_b300_mtp.sh   | 12 ++++-
 benchmarks/single_node/kimik2.5_fp4_b300.sh   | 12 ++++-
 benchmarks/single_node/kimik2.5_int4_b300.sh  | 12 ++++-
 .../single_node/minimaxm2.5_fp4_b300.sh       | 12 ++++-
 .../single_node/minimaxm2.5_fp8_b300.sh       | 12 ++++-
 benchmarks/single_node/qwen3.5_bf16_b300.sh   | 12 ++++-
 .../single_node/qwen3.5_bf16_b300_mtp.sh      | 12 ++++-
 benchmarks/single_node/qwen3.5_fp4_b300.sh    | 12 ++++-
 .../single_node/qwen3.5_fp4_b300_mtp.sh       | 12 ++++-
 benchmarks/single_node/qwen3.5_fp8_b300.sh    | 14 +++++-
 .../single_node/qwen3.5_fp8_b300_mtp.sh       | 14 +++++-
 perf-changelog.yaml                           |  5 +--
 runners/launch_b300-nv.sh                     | 44 +++++++++++++++----
 25 files changed, 294 insertions(+), 37 deletions(-)

diff --git a/benchmarks/single_node/dsr1_fp4_b300.sh b/benchmarks/single_node/dsr1_fp4_b300.sh
index afd26b3e1..8d86c7d3e 100644
--- a/benchmarks/single_node/dsr1_fp4_b300.sh
+++ b/benchmarks/single_node/dsr1_fp4_b300.sh
@@ -16,6 +16,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -43,7 +53,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 \
 --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 16384 \
diff --git a/benchmarks/single_node/dsr1_fp8_b300.sh b/benchmarks/single_node/dsr1_fp8_b300.sh
index 94d73782f..780f9d2d7 100644
--- a/benchmarks/single_node/dsr1_fp8_b300.sh
+++ b/benchmarks/single_node/dsr1_fp8_b300.sh
@@ -16,6 +16,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -75,7 +85,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --tensor-parallel-size=$TP --data-parallel-size=1 \
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
diff --git a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
index abf8a86c8..87bdf26dd 100755
--- a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
@@ -16,6 +16,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -69,7 +79,7 @@ start_gpu_monitor
 
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
-    --model-path=$MODEL \
+    --model-path=$MODEL_PATH \
     --host=0.0.0.0 \
     --port=$PORT \
     --tensor-parallel-size=$TP \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index 20320916d..b441125cf 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -12,6 +12,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -166,7 +176,7 @@ fi
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
-    --model-path $MODEL \
+    --model-path $MODEL_PATH \
     --host 0.0.0.0 \
     --port $PORT \
     --trust-remote-code \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index 463acb964..ae2f59dc5 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -23,6 +23,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -115,7 +125,7 @@ fi
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
-    --model-path $MODEL \
+    --model-path $MODEL_PATH \
     --host 0.0.0.0 \
     --port $PORT \
     --trust-remote-code \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
index 0fb0a7674..1a9205483 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -18,6 +18,16 @@ check_env_vars \
     DP_ATTENTION \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -104,7 +114,7 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
 SERVE_CMD=(
-    trtllm-serve "$MODEL" \
+    trtllm-serve "$MODEL_PATH" \
     --host 0.0.0.0 \
     --port "$PORT" \
     --trust_remote_code \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
index 290b0b782..c84d41648 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
@@ -17,6 +17,16 @@ check_env_vars \
     DP_ATTENTION \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -107,7 +117,7 @@ start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
 SERVE_CMD=(
-    trtllm-serve "$MODEL" \
+    trtllm-serve "$MODEL_PATH" \
     --host 0.0.0.0 \
     --port "$PORT" \
     --trust_remote_code \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 001c7ce10..270b896d1 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -17,6 +17,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -66,7 +76,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+vllm serve "$MODEL_PATH" --host 0.0.0.0 --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --pipeline-parallel-size 1 \
     --kv-cache-dtype fp8 \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index d2c8e29b0..3f2953df0 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -13,6 +13,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -59,7 +69,7 @@ NUM_SPEC_TOKENS=2
 start_gpu_monitor
 
 set -x
-vllm serve "$MODEL" --host 0.0.0.0 --port "$PORT" \
+vllm serve "$MODEL_PATH" --host 0.0.0.0 --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --pipeline-parallel-size 1 \
     --kv-cache-dtype fp8 \
diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/glm5_fp4_b300.sh
index 73ef6caa2..ba3fc5d57 100755
--- a/benchmarks/single_node/glm5_fp4_b300.sh
+++ b/benchmarks/single_node/glm5_fp4_b300.sh
@@ -16,6 +16,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -37,7 +47,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP \
 --data-parallel-size 1 --expert-parallel-size $EP_SIZE \
@@ -55,7 +65,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --stream-interval 30 \
 --scheduler-recv-interval 10 \
 --tokenizer-worker-num 6 \
---tokenizer-path $MODEL $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+--tokenizer-path $MODEL_PATH $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
index 86be76c5d..12bce3024 100755
--- a/benchmarks/single_node/glm5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
@@ -15,6 +15,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -42,7 +52,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP \
 --data-parallel-size 1 --expert-parallel-size 1 \
diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh
index c5debfd6b..adb05653a 100644
--- a/benchmarks/single_node/glm5_fp8_b300.sh
+++ b/benchmarks/single_node/glm5_fp8_b300.sh
@@ -15,6 +15,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -45,7 +55,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP \
 --data-parallel-size 1 --expert-parallel-size 1 \
diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
index dc2178e15..2d330ee59 100755
--- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
@@ -15,6 +15,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -46,7 +56,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP \
 --data-parallel-size 1 --expert-parallel-size 1 \
diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/kimik2.5_fp4_b300.sh
index a68773b8a..0a02a4371 100755
--- a/benchmarks/single_node/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_b300.sh
@@ -16,6 +16,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -37,7 +47,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --host 0.0.0.0 --port $PORT \
+vllm serve $MODEL_PATH --host 0.0.0.0 --port $PORT \
 --tensor-parallel-size=$TP \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/kimik2.5_int4_b300.sh
index 17d081159..1bb8fbdd3 100755
--- a/benchmarks/single_node/kimik2.5_int4_b300.sh
+++ b/benchmarks/single_node/kimik2.5_int4_b300.sh
@@ -16,6 +16,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -37,7 +47,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --host 0.0.0.0 --port $PORT \
+vllm serve $MODEL_PATH --host 0.0.0.0 --port $PORT \
 --gpu-memory-utilization 0.95 \
 --tensor-parallel-size $TP \
 --max-model-len $MAX_MODEL_LEN \
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
index 3a18fda41..ea8fc5c98 100755
--- a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
@@ -18,6 +18,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the default HF cache.
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -46,7 +56,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --port $PORT \
+vllm serve $MODEL_PATH --port $PORT \
 $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
index ba8cc6117..c5592d16a 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
@@ -17,6 +17,16 @@ check_env_vars \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -43,7 +53,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL --port $PORT \
+vllm serve $MODEL_PATH --port $PORT \
 --tensor-parallel-size=$TP \
 $EP \
 --gpu-memory-utilization 0.90 \
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh
index 2092391fb..14dfcc33d 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh
@@ -12,6 +12,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -51,7 +61,7 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
index ed039f182..a2a9dbee2 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
@@ -12,6 +12,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -51,7 +61,7 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/qwen3.5_fp4_b300.sh
index ddcc96ca0..83c18a84f 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300.sh
+++ b/benchmarks/single_node/qwen3.5_fp4_b300.sh
@@ -15,6 +15,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -60,7 +70,7 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
 --reasoning-parser qwen3 \
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
index eac6f08cd..8f1cc92ac 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
@@ -15,6 +15,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -60,7 +70,7 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
 --reasoning-parser qwen3 \
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh
index b87d25e91..39cf0f8b1 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh
@@ -12,6 +12,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -31,7 +41,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \
 --enable-symm-mem \
@@ -49,7 +59,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.
 --stream-interval 50 \
 --scheduler-recv-interval 10 \
 --tokenizer-worker-num 6 \
---tokenizer-path $MODEL \
+--tokenizer-path $MODEL_PATH \
 --context-length $CONTEXT_LENGTH > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
index a0c5f4828..62538b3c8 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
@@ -12,6 +12,16 @@ check_env_vars \
     RESULT_FILENAME \
     EP_SIZE
 
+# `hf download` creates the target dir if missing and is itself idempotent.
+# When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
+# Either way, MODEL_PATH is what the server is launched with.
+if [[ -n "${MODEL_PATH:-}" ]]; then
+    hf download "$MODEL" --local-dir "$MODEL_PATH"
+else
+    hf download "$MODEL"
+    export MODEL_PATH="$MODEL"
+fi
+
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
@@ -31,7 +41,7 @@ fi
 start_gpu_monitor
 
 set -x
-SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
 --tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \
 --enable-symm-mem \
@@ -49,7 +59,7 @@ SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --mod
 --stream-interval 50 \
 --scheduler-recv-interval 10 \
 --tokenizer-worker-num 6 \
---tokenizer-path $MODEL \
+--tokenizer-path $MODEL_PATH \
 --speculative-algorithm EAGLE \
 --speculative-num-steps 3 \
 --speculative-eagle-topk 1 \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index e31331b3a..423bad7cb 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3094,7 +3094,6 @@
     - minimaxm2.5-fp8-b300-vllm
     - minimaxm2.5-fp4-b300-vllm
   description:
-    - "Migrate B300 agg model path to /scratch/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
-    - "Replace the partial Qwen-FP8/dsv4 special-case MODEL rewrite with a universal rewrite from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
-    - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
+    - "Migrate B300 agg model path to /scratch/models/; set HF_HUB_CACHE separately for eval dataset"
+    - "Introduce MODEL_PATH (filesystem path) separately from MODEL (HF id)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1539
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 6d574b61c..7b4a712f9 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -279,18 +279,46 @@ done
 find . -name '.nfs*' -delete 2>/dev/null || true
 
 else
-
-    HF_HUB_CACHE_MOUNT="/scratch/models/"
-
     # HF_HUB_CACHE is set to help with dataset download inside the container
     # for eval jobs. Can be updated to some other path on the cluster and
     # mounted just like HF_HUB_CACHE_MOUNT.
     export HF_HUB_CACHE="$HOME/.cache/huggingface"
 
-    # Rewrite MODEL from HF id (org/name) to the pre-staged local path under
-    # HF_HUB_CACHE_MOUNT. Skip if MODEL is already an absolute path.
-    if [[ -n "$MODEL" && "$MODEL" != /* ]]; then
-        export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}"
+    # HF_HUB_CACHE_MOUNT is read-only and holds the pre-staged weights listed in
+    # STAGED_MODELS. WRITABLE_MODELS_DIR is writable; the benchmark script
+    # downloads any model that is not in the staged list there.
+    HF_HUB_CACHE_MOUNT="/scratch/models/"
+    WRITABLE_MODELS_DIR="/data/models/"
+
+    # Basenames of the models pre-staged under HF_HUB_CACHE_MOUNT.
+    STAGED_MODELS=(
+        DeepSeek-R1-0528
+        DeepSeek-R1-0528-NVFP4-v2
+        DeepSeek-V4-Flash
+        DeepSeek-V4-Pro
+        GLM-5-FP8
+        GLM-5-NVFP4
+        GLM-5.1
+        Kimi-K2.5
+        Kimi-K2.5-NVFP4
+        Kimi-K2.6
+        MiniMax-M2.5
+        MiniMax-M2.5-NVFP4
+        MiniMax-M2.7
+        MiniMax-M2.7-NVFP4
+        Qwen3.5-397B-A17B
+        Qwen3.5-397B-A17B-FP8
+        Qwen3.5-397B-A17B-NVFP4
+        gpt-oss-120b
+    )
+
+    # MODEL stays as the HF id for the client (--served-model-name, tokenizer);
+    # MODEL_PATH is what the server reads weights from.
+    MODEL_BASENAME="${MODEL##*/}"
+    if [[ " ${STAGED_MODELS[*]} " == *" ${MODEL_BASENAME} "* ]]; then
+        export MODEL_PATH="${HF_HUB_CACHE_MOUNT%/}/${MODEL_BASENAME}"
+    else
+        export MODEL_PATH="${WRITABLE_MODELS_DIR%/}/${MODEL_BASENAME}"
     fi
 
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
@@ -340,7 +368,7 @@ else
     srun --jobid=$JOB_ID \
         --mpi=none \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT,$WRITABLE_MODELS_DIR:$WRITABLE_MODELS_DIR \
         --no-container-mount-home \
         --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From bb1a4ae4b18ff6c877901d76b9703d72b6f111f5 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Thu, 21 May 2026 15:16:30 -0700
Subject: [PATCH 5/5] b300(agg): fix vllm 404s, skip hf download on read-only
 mount, normalize flag style

- Add --served-model-name $MODEL to vllm and sglang launches so the API name
  stays the HF id even though the server loads weights from $MODEL_PATH.
  Without this, vllm strictly rejects client requests (HTTP 404) when the
  request's model field doesn't match the served name (= the local path).
  sglang and trtllm are lenient about the mismatch.
- Drop the hardcoded --served-model-name "Qwen/Qwen3.5-397B-A17B" in the
  qwen3.5-bf16 scripts in favor of $MODEL.
- Guard hf download with a dir-non-empty check: hf download --local-dir
  writes per-file metadata locks under <local-dir>/.cache/huggingface/download/
  even on no-op runs, which fails on the read-only /scratch/models/ mount.
- Normalize all server-launch flags to --flag value (space) style; sglang
  scripts previously used --flag=value.
- nvidia-master.yaml: dsr1-fp8-b300-sglang 1k1k tp 4 -> 8 (the script's
  benchmarks/single_node/dsr1_fp8_b300.sh:57-61 guard doesn't support tp=4
  at isl=osl=1024).
---
 .github/configs/nvidia-master.yaml               |  2 +-
 benchmarks/single_node/dsr1_fp4_b300.sh          |  8 +++++---
 benchmarks/single_node/dsr1_fp8_b300.sh          |  8 +++++---
 benchmarks/single_node/dsr1_fp8_b300_mtp.sh      | 16 +++++++++-------
 benchmarks/single_node/dsv4_fp4_b300_sglang.sh   |  6 ++++--
 .../single_node/dsv4_fp4_b300_sglang_mtp.sh      |  6 ++++--
 benchmarks/single_node/dsv4_fp4_b300_trt.sh      |  4 +++-
 benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh  |  4 +++-
 benchmarks/single_node/dsv4_fp4_b300_vllm.sh     |  6 ++++--
 benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh |  6 ++++--
 benchmarks/single_node/glm5_fp4_b300.sh          |  8 +++++---
 benchmarks/single_node/glm5_fp4_b300_mtp.sh      |  8 +++++---
 benchmarks/single_node/glm5_fp8_b300.sh          |  8 +++++---
 benchmarks/single_node/glm5_fp8_b300_mtp.sh      |  8 +++++---
 benchmarks/single_node/kimik2.5_fp4_b300.sh      |  8 +++++---
 benchmarks/single_node/kimik2.5_int4_b300.sh     |  6 ++++--
 benchmarks/single_node/minimaxm2.5_fp4_b300.sh   | 12 +++++++-----
 benchmarks/single_node/minimaxm2.5_fp8_b300.sh   | 10 ++++++----
 benchmarks/single_node/qwen3.5_bf16_b300.sh      | 10 ++++++----
 benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh  | 10 ++++++----
 benchmarks/single_node/qwen3.5_fp4_b300.sh       |  8 +++++---
 benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh   |  8 +++++---
 benchmarks/single_node/qwen3.5_fp8_b300.sh       |  8 +++++---
 benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh   |  8 +++++---
 24 files changed, 116 insertions(+), 70 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 04a5d7e64..9648e52e5 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1985,7 +1985,7 @@ dsr1-fp8-b300-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 4, ep: 1, conc-start: 4, conc-end: 4 }
+      - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
 dsv4-fp4-b300-sglang:
   image: lmsysorg/sglang:deepseek-v4-b300@sha256:2fec8d7958bb0d53b50d7bf04d6ae6a7de8a35503775826e0550a45dd8c3ee15
   model: deepseek-ai/DeepSeek-V4-Pro
diff --git a/benchmarks/single_node/dsr1_fp4_b300.sh b/benchmarks/single_node/dsr1_fp4_b300.sh
index 8d86c7d3e..56b5f6256 100644
--- a/benchmarks/single_node/dsr1_fp4_b300.sh
+++ b/benchmarks/single_node/dsr1_fp4_b300.sh
@@ -20,7 +20,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -53,8 +55,8 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --host 0.0.0.0 --port $PORT --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+--tensor-parallel-size $TP --data-parallel-size 1 \
 --cuda-graph-max-bs 256 --max-running-requests 256 --mem-fraction-static 0.85 --kv-cache-dtype fp8_e4m3 \
 --chunked-prefill-size 16384 \
 --ep-size $EP_SIZE --quantization modelopt_fp4 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
diff --git a/benchmarks/single_node/dsr1_fp8_b300.sh b/benchmarks/single_node/dsr1_fp8_b300.sh
index 780f9d2d7..e84abfe64 100644
--- a/benchmarks/single_node/dsr1_fp8_b300.sh
+++ b/benchmarks/single_node/dsr1_fp8_b300.sh
@@ -20,7 +20,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -85,8 +87,8 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
---tensor-parallel-size=$TP --data-parallel-size=1 \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
+--tensor-parallel-size $TP --data-parallel-size 1 \
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --kv-cache-dtype fp8_e4m3 --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --enable-flashinfer-allreduce-fusion --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL --disable-radix-cache \
diff --git a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
index 87bdf26dd..0e132a4c0 100755
--- a/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/dsr1_fp8_b300_mtp.sh
@@ -20,7 +20,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -79,11 +81,11 @@ start_gpu_monitor
 
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
-    --model-path=$MODEL_PATH \
-    --host=0.0.0.0 \
-    --port=$PORT \
-    --tensor-parallel-size=$TP \
-    --data-parallel-size=1 \
+    --model-path $MODEL_PATH --served-model-name $MODEL \
+    --host 0.0.0.0 \
+    --port $PORT \
+    --tensor-parallel-size $TP \
+    --data-parallel-size 1 \
     --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE \
     --max-running-requests $MAX_RUNNING_REQUESTS \
     --mem-fraction-static $MEM_FRAC_STATIC \
@@ -93,7 +95,7 @@ PYTHONNOUSERSITE=1 python3 -m sglang.launch_server \
     --enable-flashinfer-allreduce-fusion \
     --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
     --disable-radix-cache \
-    --fp8-gemm-backend=flashinfer_trtllm \
+    --fp8-gemm-backend flashinfer_trtllm \
     --attention-backend trtllm_mla \
     --stream-interval 30 \
     --ep-size $EP_SIZE \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index b441125cf..b6ced157c 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -16,7 +16,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -176,7 +178,7 @@ fi
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
-    --model-path $MODEL_PATH \
+    --model-path $MODEL_PATH --served-model-name $MODEL \
     --host 0.0.0.0 \
     --port $PORT \
     --trust-remote-code \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
index ae2f59dc5..3f0e24cee 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh
@@ -27,7 +27,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -125,7 +127,7 @@ fi
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
-    --model-path $MODEL_PATH \
+    --model-path $MODEL_PATH --served-model-name $MODEL \
     --host 0.0.0.0 \
     --port $PORT \
     --trust-remote-code \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt.sh b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
index 1a9205483..99be66eac 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt.sh
@@ -22,7 +22,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
index c84d41648..6f617ddf0 100644
--- a/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_trt_mtp.sh
@@ -21,7 +21,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
index 270b896d1..fbdacb120 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm.sh
@@ -21,7 +21,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -76,7 +78,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve "$MODEL_PATH" --host 0.0.0.0 --port "$PORT" \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --pipeline-parallel-size 1 \
     --kv-cache-dtype fp8 \
diff --git a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
index 3f2953df0..3a35a84c3 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_vllm_mtp.sh
@@ -17,7 +17,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -69,7 +71,7 @@ NUM_SPEC_TOKENS=2
 start_gpu_monitor
 
 set -x
-vllm serve "$MODEL_PATH" --host 0.0.0.0 --port "$PORT" \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
     "${PARALLEL_ARGS[@]}" \
     --pipeline-parallel-size 1 \
     --kv-cache-dtype fp8 \
diff --git a/benchmarks/single_node/glm5_fp4_b300.sh b/benchmarks/single_node/glm5_fp4_b300.sh
index ba3fc5d57..64c3551b2 100755
--- a/benchmarks/single_node/glm5_fp4_b300.sh
+++ b/benchmarks/single_node/glm5_fp4_b300.sh
@@ -20,7 +20,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -47,9 +49,9 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code \
---tensor-parallel-size=$TP \
+--tensor-parallel-size $TP \
 --data-parallel-size 1 --expert-parallel-size $EP_SIZE \
 --disable-radix-cache \
 --quantization modelopt_fp4 \
diff --git a/benchmarks/single_node/glm5_fp4_b300_mtp.sh b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
index 12bce3024..546bb429b 100755
--- a/benchmarks/single_node/glm5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/glm5_fp4_b300_mtp.sh
@@ -19,7 +19,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -52,9 +54,9 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code \
---tensor-parallel-size=$TP \
+--tensor-parallel-size $TP \
 --data-parallel-size 1 --expert-parallel-size 1 \
 --tool-call-parser glm47 \
 --reasoning-parser glm45 \
diff --git a/benchmarks/single_node/glm5_fp8_b300.sh b/benchmarks/single_node/glm5_fp8_b300.sh
index adb05653a..1bdfbdfda 100644
--- a/benchmarks/single_node/glm5_fp8_b300.sh
+++ b/benchmarks/single_node/glm5_fp8_b300.sh
@@ -19,7 +19,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -55,9 +57,9 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code \
---tensor-parallel-size=$TP \
+--tensor-parallel-size $TP \
 --data-parallel-size 1 --expert-parallel-size 1 \
 --tool-call-parser glm47 \
 --reasoning-parser glm45 \
diff --git a/benchmarks/single_node/glm5_fp8_b300_mtp.sh b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
index 2d330ee59..00137c62a 100755
--- a/benchmarks/single_node/glm5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/glm5_fp8_b300_mtp.sh
@@ -19,7 +19,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -56,9 +58,9 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code \
---tensor-parallel-size=$TP \
+--tensor-parallel-size $TP \
 --data-parallel-size 1 --expert-parallel-size 1 \
 --tool-call-parser glm47 \
 --reasoning-parser glm45 \
diff --git a/benchmarks/single_node/kimik2.5_fp4_b300.sh b/benchmarks/single_node/kimik2.5_fp4_b300.sh
index 0a02a4371..65cc13e30 100755
--- a/benchmarks/single_node/kimik2.5_fp4_b300.sh
+++ b/benchmarks/single_node/kimik2.5_fp4_b300.sh
@@ -20,7 +20,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -47,8 +49,8 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL_PATH --host 0.0.0.0 --port $PORT \
---tensor-parallel-size=$TP \
+vllm serve $MODEL_PATH --served-model-name $MODEL --host 0.0.0.0 --port $PORT \
+--tensor-parallel-size $TP \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
 --max-num-seqs $CONC \
diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/kimik2.5_int4_b300.sh
index 1bb8fbdd3..105543e5f 100755
--- a/benchmarks/single_node/kimik2.5_int4_b300.sh
+++ b/benchmarks/single_node/kimik2.5_int4_b300.sh
@@ -20,7 +20,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -47,7 +49,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL_PATH --host 0.0.0.0 --port $PORT \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
 --gpu-memory-utilization 0.95 \
 --tensor-parallel-size $TP \
 --max-model-len $MAX_MODEL_LEN \
diff --git a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
index ea8fc5c98..446472e35 100755
--- a/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp4_b300.sh
@@ -22,7 +22,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -41,11 +43,11 @@ PORT=${PORT:-8888}
 export VLLM_FLOAT32_MATMUL_PRECISION=high
 
 if [ "${DP_ATTENTION}" = "true" ]; then
-  PARALLEL_ARGS="--tensor-parallel-size=1 --data-parallel-size=$TP --enable-expert-parallel"
+  PARALLEL_ARGS="--tensor-parallel-size 1 --data-parallel-size $TP --enable-expert-parallel"
 elif [ "$EP_SIZE" -gt 1 ]; then
-  PARALLEL_ARGS="--tensor-parallel-size=$TP --enable-expert-parallel"
+  PARALLEL_ARGS="--tensor-parallel-size $TP --enable-expert-parallel"
 else
-  PARALLEL_ARGS="--tensor-parallel-size=$TP"
+  PARALLEL_ARGS="--tensor-parallel-size $TP"
 fi
 
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -56,7 +58,7 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL_PATH --port $PORT \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --port "$PORT" \
 $PARALLEL_ARGS \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
index c5592d16a..11b48e848 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_b300.sh
@@ -21,7 +21,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -53,12 +55,12 @@ fi
 start_gpu_monitor
 
 set -x
-vllm serve $MODEL_PATH --port $PORT \
---tensor-parallel-size=$TP \
+vllm serve "$MODEL_PATH" --served-model-name "$MODEL" --port "$PORT" \
+--tensor-parallel-size "$TP" \
 $EP \
 --gpu-memory-utilization 0.90 \
 --max-model-len $MAX_MODEL_LEN \
---block-size=32 \
+--block-size 32 \
 --kv-cache-dtype fp8 \
 --max-cudagraph-capture-size 2048 \
 --max-num-batched-tokens "$((ISL * 2 ))" \
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300.sh b/benchmarks/single_node/qwen3.5_bf16_b300.sh
index 14dfcc33d..29c3f7551 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300.sh
@@ -16,7 +16,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -61,9 +63,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
---served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL_PATH" --host 0.0.0.0 --port "$PORT" \
+--served-model-name "$MODEL" --trust-remote-code \
+--tensor-parallel-size "$TP" --data-parallel-size 1 --ep-size "$EP_SIZE" \
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
diff --git a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
index a2a9dbee2..91678a12e 100755
--- a/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_bf16_b300_mtp.sh
@@ -16,7 +16,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -61,9 +63,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
---served-model-name "Qwen/Qwen3.5-397B-A17B" --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL_PATH" --host 0.0.0.0 --port "$PORT" \
+--served-model-name "$MODEL" --trust-remote-code \
+--tensor-parallel-size "$TP" --data-parallel-size 1 --ep-size "$EP_SIZE" \
 --cuda-graph-max-bs $CUDA_GRAPH_MAX_BATCH_SIZE --max-running-requests $MAX_RUNNING_REQUESTS \
 --mem-fraction-static $MEM_FRAC_STATIC --chunked-prefill-size $CHUNKED_PREFILL_SIZE --max-prefill-tokens $MAX_PREFILL_TOKENS \
 --context-length $CONTEXT_LENGTH --disable-radix-cache \
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300.sh b/benchmarks/single_node/qwen3.5_fp4_b300.sh
index 83c18a84f..41c891e05 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300.sh
+++ b/benchmarks/single_node/qwen3.5_fp4_b300.sh
@@ -19,7 +19,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -70,9 +72,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
 --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+--tensor-parallel-size "$TP" --data-parallel-size 1 --ep-size "$EP_SIZE" \
 --reasoning-parser qwen3 \
 --tool-call-parser qwen3_coder \
 --mamba-scheduler-strategy no_buffer \
diff --git a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
index 8f1cc92ac..cc957a623 100755
--- a/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp4_b300_mtp.sh
@@ -19,7 +19,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -70,9 +72,9 @@ echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL,
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
 --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+--tensor-parallel-size "$TP" --data-parallel-size 1 --ep-size "$EP_SIZE" \
 --reasoning-parser qwen3 \
 --tool-call-parser qwen3_coder \
 --mamba-scheduler-strategy no_buffer \
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300.sh b/benchmarks/single_node/qwen3.5_fp8_b300.sh
index 39cf0f8b1..42c7a3c0b 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b300.sh
@@ -16,7 +16,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -41,9 +43,9 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
 --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \
+--tensor-parallel-size "$TP" --data-parallel-size 1 --expert-parallel-size "$EP_SIZE" \
 --enable-symm-mem \
 --disable-radix-cache \
 --quantization fp8 \
diff --git a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
index 62538b3c8..ea521769d 100644
--- a/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_b300_mtp.sh
@@ -16,7 +16,9 @@ check_env_vars \
 # When MODEL_PATH is unset (stand-alone runs), fall back to the HF_HUB_CACHE
 # Either way, MODEL_PATH is what the server is launched with.
 if [[ -n "${MODEL_PATH:-}" ]]; then
-    hf download "$MODEL" --local-dir "$MODEL_PATH"
+    if [[ ! -d "$MODEL_PATH" || -z "$(ls -A "$MODEL_PATH" 2>/dev/null)" ]]; then
+        hf download "$MODEL" --local-dir "$MODEL_PATH"
+    fi
 else
     hf download "$MODEL"
     export MODEL_PATH="$MODEL"
@@ -41,9 +43,9 @@ fi
 start_gpu_monitor
 
 set -x
-SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL_PATH --host=0.0.0.0 --port=$PORT \
+SGLANG_ENABLE_SPEC_V2=1 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path "$MODEL_PATH" --served-model-name "$MODEL" --host 0.0.0.0 --port "$PORT" \
 --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --expert-parallel-size=$EP_SIZE \
+--tensor-parallel-size "$TP" --data-parallel-size 1 --expert-parallel-size "$EP_SIZE" \
 --enable-symm-mem \
 --disable-radix-cache \
 --quantization fp8 \