diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 5ea1c86b7..2f700d4e7 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -187,7 +187,11 @@ else LOCK_FILE="${SQUASH_FILE}.lock" set -x - salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" + # Exclude known-bad mi355x compute nodes from the allocation (KLAUD_DEBUG §5.1 / §5.2): + # mia1-p01-g09: pyxis is broken (persistently fails to create the container filesystem) + # mia1-p01-g11: docker.sock permission denied (the cluster-cleanup step fails) + # Both were root-caused from the #1431/#1432/#1440/#1441/#1443 sweep failures; TODO: drop a node from --exclude once it is repaired. + salloc --partition=$PARTITION --exclude=mia1-p01-g09,mia1-p01-g11 --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"