Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 4 additions & 26 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4504,15 +4504,7 @@ gptoss-fp4-h100-vllm:
- isl: 1024
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
- { tp: 2, conc-start: 4, conc-end: 64 }
- { tp: 4, conc-start: 4, conc-end: 64 }
- { tp: 8, conc-start: 4, conc-end: 16 }
- { tp: 2, conc-start: 4, conc-end: 4 }

minimaxm2.5-fp8-h100-vllm:
image: vllm/vllm-openai:v0.21.0
Expand All @@ -4527,13 +4519,7 @@ minimaxm2.5-fp8-h100-vllm:
- isl: 1024
osl: 1024
search-space:
# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
- isl: 8192
osl: 1024
search-space:
# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 }
- { tp: 4, ep: 4, conc-start: 4, conc-end: 4 }

# Diverged from minimaxm2.5-fp8-h100-vllm (agentic-coding sibling). Metadata is
# identical to origin/main's minimaxm2.5-fp8-h100-vllm; the split exists because this
Expand Down Expand Up @@ -9210,11 +9196,7 @@ qwen3.5-fp8-h100-sglang:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32 }
- { tp: 8, ep: 8, conc-start: 4, conc-end: 4 }

qwen3.5-fp8-h100-sglang-mtp:
image: lmsysorg/sglang:v0.5.12-cu130
Expand All @@ -9229,8 +9211,4 @@ qwen3.5-fp8-h100-sglang-mtp:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 8, conc-start: 4, conc-end: 32, spec-decoding: mtp }
- { tp: 8, ep: 8, conc-start: 4, conc-end: 4, spec-decoding: mtp }
1 change: 0 additions & 1 deletion benchmarks/single_node/gptoss_fp4_h100.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

MAX_MODEL_LEN=10240

Expand Down
1 change: 0 additions & 1 deletion benchmarks/single_node/minimaxm2.5_fp8_h100.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
fi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

nvidia-smi

Expand Down
1 change: 0 additions & 1 deletion benchmarks/single_node/qwen3.5_fp8_h100.sh
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,6 @@ fi

nvidia-smi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
Expand Down
1 change: 0 additions & 1 deletion benchmarks/single_node/qwen3.5_fp8_h100_mtp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,6 @@ fi

nvidia-smi

if [[ "$MODEL" != /* ]]; then hf download "$MODEL"; fi

export SGLANG_ENABLE_SPEC_V2=1

Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3050,3 +3050,14 @@
description:
- "Update SGLang image from v0.5.11-cu130 (5d old) to v0.5.12-cu130"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1475

- config-keys:
- gptoss-fp4-h100-vllm
- minimaxm2.5-fp8-h100-vllm
- qwen3.5-fp8-h100-sglang
- qwen3.5-fp8-h100-sglang-mtp
description:
- "Migrate H100 agg model path to /mnt/numa1/shared/models/ (audit-recommended root); set HF_HUB_CACHE to container's ~/.cache/huggingface for eval dataset downloads"
- "Rewrite MODEL in launcher from HF id (org/name) to absolute local path under HF_HUB_CACHE_MOUNT; remove hf download from bench scripts so contract is staged-or-fail"
- "Reduce search-space to single (isl=1024, osl=1024, conc=4) point per config to verify model-path wiring end-to-end"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1537
16 changes: 14 additions & 2 deletions runners/launch_h100-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -279,7 +279,19 @@ EOF

else

HF_HUB_CACHE_MOUNT="/mnt/nfs/sa-shared/gharunners/hf-hub-cache/"
HF_HUB_CACHE_MOUNT="/mnt/numa1/shared/models/"

# HF_HUB_CACHE is set so that eval jobs can download datasets inside the
# container. It can be changed to another path on the cluster and mounted
# just like HF_HUB_CACHE_MOUNT.
export HF_HUB_CACHE="$HOME/.cache/huggingface"

# Rewrite MODEL from an HF id (org/name) to the pre-staged local path under
# HF_HUB_CACHE_MOUNT, keeping only the final path component (name). Skipped
# when MODEL is empty or already an absolute path.
if [[ -n "$MODEL" && "$MODEL" != /* ]]; then
export MODEL="${HF_HUB_CACHE_MOUNT}${MODEL##*/}"
fi

SQUASH_FILE="/mnt/nfs/lustre/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

Expand All @@ -306,7 +318,7 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--no-container-mount-home \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL,PORT=8888 \
Expand Down