From 12c32f1454e9f391741b7bca21e79da52da77a58 Mon Sep 17 00:00:00 2001
From: runame <re393@cam.ac.uk>
Date: Sat, 11 Apr 2026 20:38:32 +0100
Subject: [PATCH 1/3] Remove stale preconditioner config names from
 `__init__.py`

`AmortizedPreconditionerConfig` was renamed to
`BaseShampooPreconditionerConfig` and `ShampooPreconditionerConfig` to
`ClassicShampooPreconditionerConfig` in `shampoo_types.py`, but
`__init__.py` still imported and re-exported the old names. Replace
them with the new names in the imports and `__all__`.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 distributed_shampoo/__init__.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/distributed_shampoo/__init__.py b/distributed_shampoo/__init__.py
index b767a52..767cf6a 100644
--- a/distributed_shampoo/__init__.py
+++ b/distributed_shampoo/__init__.py
@@ -34,7 +34,8 @@
 from distributed_shampoo.shampoo_types import (
     AdaGradPreconditionerConfig,
     AdamPreconditionerConfig,
-    AmortizedPreconditionerConfig,
+    BaseShampooPreconditionerConfig,
+    ClassicShampooPreconditionerConfig,
     DDPDistributedConfig,
     DefaultEigenvalueCorrectedShampooConfig,
     DefaultShampooConfig,
@@ -59,7 +60,6 @@
     RootInvShampooPreconditionerConfig,
     ScheduleFreeConfig,
     SGDPreconditionerConfig,
-    ShampooPreconditionerConfig,
     ShampooPT2CompileConfig,
     SignDescentPreconditionerConfig,
     SingleDeviceDistributedConfig,
@@ -85,14 +85,14 @@
     # `precision_config`.
     # `preconditioner_config` options.
     "PreconditionerConfig",  # Abstract base class.
-    "AmortizedPreconditionerConfig",  # Abstract base class (based on `PreconditionerConfig`).
-    "ShampooPreconditionerConfig",  # Abstract base class (based on `AmortizedPreconditionerConfig`).
-    "RootInvShampooPreconditionerConfig",  # Based on `ShampooPreconditionerConfig`.
+    "BaseShampooPreconditionerConfig",  # Abstract base class (based on `PreconditionerConfig`).
+    "ClassicShampooPreconditionerConfig",  # Abstract base class (based on `BaseShampooPreconditionerConfig`).
+    "RootInvShampooPreconditionerConfig",  # Based on `ClassicShampooPreconditionerConfig`.
     "DefaultShampooConfig",  # Default `RootInvShampooPreconditionerConfig` using `EigenConfig`.
     "RootInvKLShampooPreconditionerConfig",  # Based on `RootInvShampooPreconditionerConfig`.
-    "EigendecomposedShampooPreconditionerConfig",  # Based on `ShampooPreconditionerConfig`.
+    "EigendecomposedShampooPreconditionerConfig",  # Based on `ClassicShampooPreconditionerConfig`.
     "EigendecomposedKLShampooPreconditionerConfig",  # Based on `EigendecomposedShampooPreconditionerConfig`.
-    "EigenvalueCorrectedShampooPreconditionerConfig",  # Based on `AmortizedPreconditionerConfig`.
+    "EigenvalueCorrectedShampooPreconditionerConfig",  # Based on `BaseShampooPreconditionerConfig`.
     "DefaultEigenvalueCorrectedShampooConfig",  # Default `EigenvalueCorrectedShampooPreconditionerConfig` using `EighEigendecompositionConfig`.
     "DefaultSOAPConfig",  # Default `EigenvalueCorrectedShampooPreconditionerConfig` using `QREigendecompositionConfig`.
     "SpectralDescentPreconditionerConfig",  # Based on `PreconditionerConfig`.

From 010e368d22d30a6b7ecdc9f382ea987a155924a8 Mon Sep 17 00:00:00 2001
From: runame <re393@cam.ac.uk>
Date: Sat, 25 Apr 2026 17:03:21 +0100
Subject: [PATCH 2/3] Fix examples workflow grafting config overrides
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CLI overrides `optimizer.grafting_config={...}` are merged into the
existing node under OmegaConf's default merge semantics, so fields from
the base shampoo.yaml grafting_config (notably `beta2: 0.999`) leak into
overrides whose `_target_` does not accept them — breaking the AdaGrad
and SGD grafting cases. Replace each override with a delete (`~`)
followed by an add (`+`) so the new mapping fully replaces the inherited
one.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/examples.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/examples.yaml b/.github/workflows/examples.yaml
index 2cb302e..e0f3e99 100644
--- a/.github/workflows/examples.yaml
+++ b/.github/workflows/examples.yaml
@@ -26,19 +26,19 @@ jobs:
         - name: Run single GPU examples with Distributed Shampoo and different graftings on CPU.
           run: |
             source .venv/bin/activate
-            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdaGradPreconditionerConfig,epsilon:1e-8}' epochs=1 batch_size=1024
-            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
-            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
-            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024
+            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdaGradPreconditionerConfig,epsilon:1e-8}' epochs=1 batch_size=1024
+            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
+            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
+            CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024
         - name: Run single GPU example on GPU.
           run: |
             source .venv/bin/activate
-            python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
+            python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
         - name: Run DDP example on CPU.
           run: |
             source .venv/bin/activate
-            CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo
+            CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo
         - name: Run DDP example on GPU.
           run: |
             source .venv/bin/activate
-            torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024
+            torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024

From cafa945a93735c9cbdcc82b11b81ccef72e4c45e Mon Sep 17 00:00:00 2001
From: runame <re393@cam.ac.uk>
Date: Sat, 25 Apr 2026 17:33:46 +0100
Subject: [PATCH 3/3] Skip GPU-only example steps when runner has no usable GPU

The `4-core-ubuntu-gpu-t4` runner currently has an outdated NVIDIA
driver, so `torch.cuda.is_available()` returns False and the
"Run DDP example on GPU." step crashes with
`ProcessGroupNCCL is only supported with GPUs, no GPUs found!`.
The "Run single GPU example on GPU." step "passes" only because
torch silently falls back to CPU.

Add a `gpu_check` step that probes `torch.cuda.is_available()` and
gate both GPU-only steps on its `has_gpu` output. If no GPU is
detected, those steps are skipped (with a workflow warning) and the job
stays green.
When the runner image is fixed and a GPU is actually available, both
steps run as before with no other changes needed.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/workflows/examples.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/.github/workflows/examples.yaml b/.github/workflows/examples.yaml
index e0f3e99..5a0fe19 100644
--- a/.github/workflows/examples.yaml
+++ b/.github/workflows/examples.yaml
@@ -19,6 +19,16 @@ jobs:
           run: |
             uv venv && source .venv/bin/activate
             uv pip install ".[examples]"
+        - name: Detect GPU availability.
+          id: gpu_check
+          run: |
+            source .venv/bin/activate
+            if python -c "import torch; raise SystemExit(0 if torch.cuda.is_available() else 1)"; then
+              echo "has_gpu=true" >> "$GITHUB_OUTPUT"
+            else
+              echo "has_gpu=false" >> "$GITHUB_OUTPUT"
+              echo "::warning::No usable GPU detected on this runner; GPU-only steps will be skipped."
+            fi
         - name: Run single GPU example with Adam to serve as a baseline.
           run: |
             source .venv/bin/activate
@@ -31,6 +41,7 @@ jobs:
             CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
             CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024
         - name: Run single GPU example on GPU.
+          if: steps.gpu_check.outputs.has_gpu == 'true'
           run: |
             source .venv/bin/activate
             python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024
@@ -39,6 +50,7 @@ jobs:
             source .venv/bin/activate
             CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo
         - name: Run DDP example on GPU.
+          if: steps.gpu_check.outputs.has_gpu == 'true'
           run: |
             source .venv/bin/activate
             torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024