From ce884ceaa7441570a31117921be4ef103ae5a7f7 Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Wed, 20 May 2026 15:28:42 -0700
Subject: [PATCH 1/2] h100(agg): migrate model path to
 /mnt/numa1/shared/models/, drop hf download

- launcher: switch HF_HUB_CACHE_MOUNT to /mnt/numa1/shared/models/; export HF_HUB_CACHE as "$HOME/.cache/huggingface" (expanded by the launcher) for eval dataset downloads; rewrite MODEL from an HF id (org/name) to an absolute local path under the mount, keeping only the name component; and bind-mount the host path at the same path inside the container so the rewritten MODEL resolves there.
- bench scripts: drop "hf download $MODEL" from the four in-scope H100 scripts (gptoss, minimaxm2.5, qwen3.5, qwen3.5-mtp); contract is now staged-or-fail.
- configs: reduce the same four H100 configs to a single (isl=1024, osl=1024, conc=4) test point to verify model-path wiring end-to-end without burning a full sweep.

Out of scope: disagg/multi-node block, agentic configs/bench scripts, and kimik2.5-int4-h100-vllm (agentic-only).
---
 .github/configs/nvidia-master.yaml            | 30 +++----------------
 benchmarks/single_node/gptoss_fp4_h100.sh     |  1 -
 .../single_node/minimaxm2.5_fp8_h100.sh       |  1 -
 benchmarks/single_node/qwen3.5_fp8_h100.sh    |  1 -
 .../single_node/qwen3.5_fp8_h100_mtp.sh       |  1 -
 perf-changelog.yaml                           | 11 +++++++
 runners/launch_h100-dgxc-slurm.sh             | 16 ++++++++--
 7 files changed, 29 insertions(+), 32 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 4a683d372..4d54a1814 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -4504,15 +4504,7 @@ gptoss-fp4-h100-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 2, conc-start: 4, conc-end: 64 }
-      - { tp: 4, conc-start: 4, conc-end: 64 }
-      - { tp: 8, conc-start: 4, conc-end: 16 }
+      - { tp: 2, conc-start: 4, conc-end: 4 }
 
 minimaxm2.5-fp8-h100-vllm:
   image: vllm/vllm-openai:v0.21.0
@@ -4527,13 +4519,7 @@ minimaxm2.5-fp8-h100-vllm:
     - isl: 1024
       osl: 1024
       search-space:
-      # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
-      - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
+      - { tp: 4, ep: 4, conc-start: 4, conc-end: 4 }
 
 # Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
 # identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
@@ -9210,11 +9196,7 @@ qwen3.5-fp8-h100-sglang:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }
 
 qwen3.5-fp8-h100-sglang-mtp:
   image: lmsysorg/sglang:v0.5.12-cu130
@@ -9229,8 +9211,4 @@ qwen3.5-fp8-h100-sglang-mtp:
     - isl: 1024
       osl: 1024
       search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
-    - isl: 8192
-      osl: 1024
-      search-space:
-      - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+      - { tp: 8, ep: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh
index 7208e1b19..c5109306a 100644
--- a/benchmarks/single_node/gptoss_fp4_h100.sh
+++ b/benchmarks/single_node/gptoss_fp4_h100.sh
@@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 MAX_MODEL_LEN=10240
 
diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
index 5fd0482cf..076838dd2 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh
@@ -17,7 +17,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 nvidia-smi
 
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh
index 4c70657aa..cfc5bc9b4 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h100.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh
@@ -25,7 +25,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
index 86b35f5e7..8f774e641 100755
--- a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
+++ b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
@@ -22,7 +22,6 @@ fi
 
 nvidia-smi
 
-if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi
 
 export SGLANG_ENABLE_SPEC_V2=1
 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9c4894c47..870452c87 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3050,3 +3050,14 @@
   description:
     - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475
+
+- config-keys:
+    - gptoss-fp4-h100-vllm
+    - minimaxm2.5-fp8-h100-vllm
+    - qwen3.5-fp8-h100-sglang
+    - qwen3.5-fp8-h100-sglang-mtp
+  description:
+    - "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
+    - "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
+    - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh
index b4f594d51..f18f586cd 100644
--- a/runners/launch_h100-dgxc-slurm.sh
+++ b/runners/launch_h100-dgxc-slurm.sh
@@ -279,7 +279,19 @@ EOF
 
 else
 
-    HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/"
+    HF_HUB_CACHE_MOUNT="/mnt/numa1/shared/models/"
+
+    # HF_HUB_CACHE is exported so eval jobs can download datasets inside the
+    # container. NOTE(review): $HOME is expanded by this launcher, not in the
+    # container; it can instead be a cluster path mounted like HF_HUB_CACHE_MOUNT.
+    export HF_HUB_CACHE="$HOME/.cache/huggingface"
+
+    # Rewrite MODEL from an HF id (org/name) to its pre-staged local path,
+    # ${HF_HUB_CACHE_MOUNT}<name> (org dropped). Skip if empty or already absolute.
+    if [[ -n "$MODEL" && "$MODEL" != /* ]]; then
+        export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}"
+    fi
+
     SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     LOCK_FILE="${SQUASH_FILE}.lock"
 
@@ -306,7 +318,7 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From 3ab2f5fe7ec46f8afdd7efc5f83f2402e077d1cb Mon Sep 17 00:00:00 2001
From: Ankur-singh <ankusingh@nvidia.com>
Date: Wed, 20 May 2026 15:29:45 -0700
Subject: [PATCH 2/2] perf-changelog: set PR link to #1537

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 870452c87..c5603ffe9 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -3060,4 +3060,4 @@
     - "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
     - "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
     - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1537