SemiAnalysisAI · functionstackx · May 18, 2026 · May 18, 2026 · claude · May 18, 2026
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
@@ -187,7 +187,11 @@
     LOCK_FILE="${SQUASH_FILE}.lock"
 
     set -x
-    salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME"
+    # Exclude known-bad mi355x compute nodes (KLAUD_DEBUG §5.1 / §5.2):
+    #   mia1-p01-g09: pyxis broken (persistently fails to create container filesystem)
+    #   mia1-p01-g11: docker.sock permission denied (cluster-cleanup step fails)
+    # Both were root-caused from the #1431/#1432/#1440/#1441/#1443 sweep failures.
+    salloc --partition=$PARTITION --exclude=mia1-p01-g09,mia1-p01-g11 --gres=gpu:$TP --exclusive --cpus-per-task=128 --time=500 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -h -o %A | head -n1)
 
     srun --jobid=$JOB_ID bash -c "docker stop \$(docker ps -a -q)"