From 48c33883dfe641ec5e88d887a4d406cd4a17c043 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Mon, 18 May 2026 12:26:58 -0400 Subject: [PATCH] runners(mi355x): exclude broken nodes mia1-p01-g09 + mia1-p01-g11 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root-caused via the failed sweeps on #1431, #1432, #1440, #1441, #1443 — every failure landed on either: mia1-p01-g09 pyxis: failed to create container filesystem (extended attributes not supported on the destination filesystem; pyxis can't mount the squashfs) mia1-p01-g11 permission denied while trying to connect to docker.sock (cluster-cleanup `docker stop` step fails; cascading into pyxis-init failure) Both are already known-bad per KLAUD_DEBUG.md §5.1 / §5.2, but the launcher wasn't excluding them. This mirrors the existing pattern in runners/launch_mi300x-amds.sh (#1462 — pin to known-good nodes) and runners/launch_mi325x-amds.sh (#1477 — exclude chi-mi325x-pod1-121). Once this lands, the 5 affected mi355x PRs can be rebased to pick it up and the failed jobs will land on healthy nodes only. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_mi355x-amds.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 5ea1c86b7..2f700d4e7 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -187,7 +187,11 @@ else LOCK_FILE="${SQUASH_FILE}.lock" set -x - salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" + # Exclude known-bad mi355x compute nodes (KLAUD_DEBUG.md §5.1 / §5.2): + # mia1-p01-g09: pyxis broken (persistently fails to create container filesystem) + # mia1-p01-g11: docker.sock permission denied (cluster-cleanup step fails) + # Both have been root-caused via #1431/#1432/#1440/#1441/#1443 sweep failures. 
+ salloc --partition="$PARTITION" --exclude=mia1-p01-g09,mia1-p01-g11 --gres="gpu:$TP" --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1) srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"