diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 4a683d372..4d54a1814 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -4504,15 +4504,7 @@ gptoss-fp4-h100-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 2, conc-start: 4, conc-end: 4 } minimaxm2.5-fp8-h100-vllm: image: vllm/vllm-openai:v0.21.0 @@ -4527,13 +4519,7 @@ minimaxm2.5-fp8-h100-vllm: - isl: 1024 osl: 1024 search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - # - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 4 } # Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is # identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this @@ -9210,11 +9196,7 @@ qwen3.5-fp8-h100-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4 } qwen3.5-fp8-h100-sglang-mtp: image: lmsysorg/sglang:v0.5.12-cu130 @@ -9229,8 +9211,4 @@ qwen3.5-fp8-h100-sglang-mtp: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp } diff --git a/benchmarks/single_node/gptoss_fp4_h100.sh b/benchmarks/single_node/gptoss_fp4_h100.sh index 7208e1b19..c5109306a 100644 --- a/benchmarks/single_node/gptoss_fp4_h100.sh +++ b/benchmarks/single_node/gptoss_fp4_h100.sh @@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi MAX_MODEL_LEN=10240 diff --git a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh index 5fd0482cf..076838dd2 100755 --- a/benchmarks/single_node/minimaxm2.5_fp8_h100.sh +++ b/benchmarks/single_node/minimaxm2.5_fp8_h100.sh @@ -17,7 +17,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi nvidia-smi diff --git a/benchmarks/single_node/qwen3.5_fp8_h100.sh b/benchmarks/single_node/qwen3.5_fp8_h100.sh index 4c70657aa..cfc5bc9b4 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100.sh @@ -25,7 +25,6 @@ fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} diff --git a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh index 86b35f5e7..8f774e641 100755 --- a/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh +++ b/benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh @@ -22,7 +22,6 @@ fi nvidia-smi -if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi export SGLANG_ENABLE_SPEC_V2=1 diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c4894c47..c5603ffe9 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -3050,3 +3050,14 @@ description: - "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475 + +- config-keys: + - gptoss-fp4-h100-vllm + - minimaxm2.5-fp8-h100-vllm + - qwen3.5-fp8-h100-sglang + - qwen3.5-fp8-h100-sglang-mtp + description: + - "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads" + - "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail" + - "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1537 diff --git a/runners/launch_h100-dgxc-slurm.sh b/runners/launch_h100-dgxc-slurm.sh index b4f594d51..f18f586cd 100644 --- a/runners/launch_h100-dgxc-slurm.sh +++ b/runners/launch_h100-dgxc-slurm.sh @@ -279,7 +279,19 @@ EOF else - HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/" + HF_HUB_CACHE_MOUNT="/mnt/numa1/shared/models/" + + # HF_HUB_CACHE is set to help with dataset download inside the container + # for eval jobs. Can be updated to some other path on the cluster and + # mounted just like HF_HUB_CACHE_MOUNT. + export HF_HUB_CACHE="$HOME/.cache/huggingface" + + # Rewrite MODEL from HF id (org/name) to the pre-staged local path under + # HF_HUB_CACHE_MOUNT. Skip if MODEL is already an absolute path. + if [[ -n "$MODEL" && "$MODEL" != /* ]]; then + export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}" + fi + SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" @@ -306,7 +318,7 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \