From ce884ceaa7441570a31117921be4ef103ae5a7f7 Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Wed, 20 May 2026 15:28:42 -0700 Subject: [PATCH 1/2] h100(agg): migrate model path to /mnt/numa1/shared/models/, drop hf download - launcher: switch HF_HUB_CACHE_MOUNT to /mnt/numa1/shared/models/, export HF_HUB_CACHE to container's ~/.cache/huggingface for eval datasets, and rewrite MODEL from HF id (org/name) to absolute local path under the mount; align the bind-mount target with the host path so the rewritten MODEL resolves inside the container. - bench scripts: drop "hf download $MODEL" from the four in-scope H100 scripts (gptoss, minimaxm2.5, qwen3.5, qwen3.5-mtp); contract is now staged-or-fail. - configs: reduce the same four H100 configs to a single (isl=1024, osl=1024, conc=4) test point to verify model-path wiring end-to-end without burning a full sweep. Out of scope: disagg/multi-node block, agentic configs/bench scripts, and kimik2.5-int4-h100-vllm (agentic-only). --- .github/configs/nvidia-master.yaml | 30 +++---------------- benchmarks/single_node/gptoss_fp4_h100.sh | 1 - .../single_node/minimaxm2.5_fp8_h100.sh | 1 - benchmarks/single_node/qwen3.5_fp8_h100.sh | 1 - .../single_node/qwen3.5_fp8_h100_mtp.sh | 1 - perf-changelog.yaml | 11 +++++++ runners/launch_h100-dgxc-slurm.sh | 16 ++++++++-- 7 files changed, 29 insertions(+), 32 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..4d54a1814 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -4504,15 +4504,7 @@ gptoss-fp4-h100-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 2, conc-start: 4, conc-end: 4 } minimaxm2.5-fp8-h100-vllm: image: vllm/vllm-openai:v0.21.0 @@ -4527,13 +4519,7 @@ minimaxm2.5-fp8-h100-vllm: - isl: 1024 osl: 1024 search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 4 } # Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is # identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this @@ -9210,11 +9196,7 @@ qwen3.5-fp8-h100-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 } qwen3.5-fp8-h100-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 @@ -9229,8 +9211,4 @@ qwen3.5-fp8-h100-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 7208e1b19..c5109306a 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi MAX_MODEL_LEN=10240 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh index 5fd0482cf..076838dd2 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh @@ -17,7 +17,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh index 4c70657aa..cfc5bc9b4 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh @@ -25,7 +25,6 @@ fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh index 86b35f5e7..8f774e641 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh @@ -22,7 +22,6 @@ fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_ENABLE_SPEC_V2=1 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c4894c47..870452c87 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3050,3 +3050,14 @@ description: - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475 + +- config-keys: + - gptoss-fp4-h100-vllm + - minimaxm2.5-fp8-h100-vllm + - qwen3.5-fp8-h100-sglang + - qwen3.5-fp8-h100-sglang-mtp + description: + - "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads" + - "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail" + - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index b4f594d51..f18f586cd 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -279,7 +279,19 @@ EOF else - HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" + HF_HUB_CACHE_MOUNT="/mnt/numa1/shared/models/" + + # HF_HUB_CACHE is set to help with dataset download inside the container + # for eval jobs. Can be updated to some other path on the cluster and + # mounted just like HF_HUB_CACHE_MOUNT. + export HF_HUB_CACHE="$HOME/.cache/huggingface" + + # Rewrite MODEL from HF id (org/name) to the pre-staged local path under + # HF_HUB_CACHE_MOUNT. Skip if MODEL is already an absolute path. + if [[ -n "$MODEL" && "$MODEL" != /* ]]; then + export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}" + fi + SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" @@ -306,7 +318,7 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ From 3ab2f5fe7ec46f8afdd7efc5f83f2402e077d1cb Mon Sep 17 00:00:00 2001 From: Ankur-singh Date: Wed, 20 May 2026 15:29:45 -0700 Subject: [PATCH 2/2] perf-changelog: set PR link to #1537 --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 870452c87..c5603ffe9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3060,4 +3060,4 @@ - "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads" - "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail" - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1537