From 12c32f1454e9f391741b7bca21e79da52da77a58 Mon Sep 17 00:00:00 2001 From: runame Date: Sat, 11 Apr 2026 20:38:32 +0100 Subject: [PATCH 1/3] Remove stale preconditioner config names from `__init__.py` `AmortizedPreconditionerConfig` was renamed to `BaseShampooPreconditionerConfig` and `ShampooPreconditionerConfig` to `ClassicShampooPreconditionerConfig` in `shampoo_types.py`, but `__init__.py` still imported and re-exported the old names. Replace them with the new names in the imports and `__all__`. Co-Authored-By: Claude Opus 4.6 (1M context) --- distributed_shampoo/__init__.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/distributed_shampoo/__init__.py b/distributed_shampoo/__init__.py index b767a52..767cf6a 100644 --- a/distributed_shampoo/__init__.py +++ b/distributed_shampoo/__init__.py @@ -34,7 +34,8 @@ from distributed_shampoo.shampoo_types import ( AdaGradPreconditionerConfig, AdamPreconditionerConfig, - AmortizedPreconditionerConfig, + BaseShampooPreconditionerConfig, + ClassicShampooPreconditionerConfig, DDPDistributedConfig, DefaultEigenvalueCorrectedShampooConfig, DefaultShampooConfig, @@ -59,7 +60,6 @@ RootInvShampooPreconditionerConfig, ScheduleFreeConfig, SGDPreconditionerConfig, - ShampooPreconditionerConfig, ShampooPT2CompileConfig, SignDescentPreconditionerConfig, SingleDeviceDistributedConfig, @@ -85,14 +85,14 @@ # `precision_config`. # `preconditioner_config` options. "PreconditionerConfig", # Abstract base class. - "AmortizedPreconditionerConfig", # Abstract base class (based on `PreconditionerConfig`). - "ShampooPreconditionerConfig", # Abstract base class (based on `AmortizedPreconditionerConfig`). - "RootInvShampooPreconditionerConfig", # Based on `ShampooPreconditionerConfig`. + "BaseShampooPreconditionerConfig", # Abstract base class (based on `PreconditionerConfig`). + "ClassicShampooPreconditionerConfig", # Abstract base class (based on `BaseShampooPreconditionerConfig`). + "RootInvShampooPreconditionerConfig", # Based on `ClassicShampooPreconditionerConfig`. "DefaultShampooConfig", # Default `RootInvShampooPreconditionerConfig` using `EigenConfig`. "RootInvKLShampooPreconditionerConfig", # Based on `RootInvShampooPreconditionerConfig`. - "EigendecomposedShampooPreconditionerConfig", # Based on `ShampooPreconditionerConfig`. + "EigendecomposedShampooPreconditionerConfig", # Based on `ClassicShampooPreconditionerConfig`. "EigendecomposedKLShampooPreconditionerConfig", # Based on `EigendecomposedShampooPreconditionerConfig`. - "EigenvalueCorrectedShampooPreconditionerConfig", # Based on `AmortizedPreconditionerConfig`. + "EigenvalueCorrectedShampooPreconditionerConfig", # Based on `BaseShampooPreconditionerConfig`. "DefaultEigenvalueCorrectedShampooConfig", # Default `EigenvalueCorrectedShampooPreconditionerConfig` using `EighEigendecompositionConfig`. "DefaultSOAPConfig", # Default `EigenvalueCorrectedShampooPreconditionerConfig` using `QREigendecompositionConfig`. "SpectralDescentPreconditionerConfig", # Based on `PreconditionerConfig`. From 010e368d22d30a6b7ecdc9f382ea987a155924a8 Mon Sep 17 00:00:00 2001 From: runame Date: Sat, 25 Apr 2026 17:03:21 +0100 Subject: [PATCH 2/3] Fix examples workflow grafting config overrides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI overrides `optimizer.grafting_config={...}` use OmegaConf's default merge semantics, so fields from the base shampoo.yaml grafting_config (notably `beta2: 0.999`) leak into overrides whose `_target_` does not accept them — breaking the AdaGrad and SGD grafting cases. Replace each override with a delete (`~`) followed by add (`+`) so the new mapping fully replaces the inherited one. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/examples.yaml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/examples.yaml b/.github/workflows/examples.yaml index 2cb302e..e0f3e99 100644 --- a/.github/workflows/examples.yaml +++ b/.github/workflows/examples.yaml @@ -26,19 +26,19 @@ jobs: - name: Run single GPU examples with Distributed Shampoo and different graftings on CPU. run: | source .venv/bin/activate - CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdaGradPreconditionerConfig,epsilon:1e-8}' epochs=1 batch_size=1024 - CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 - CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 - CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024 + CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdaGradPreconditionerConfig,epsilon:1e-8}' epochs=1 batch_size=1024 + CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 + CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 + CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024 - name: Run single GPU example on GPU. run: | source .venv/bin/activate - python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 + python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 - name: Run DDP example on CPU. run: | source .venv/bin/activate - CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo + CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo - name: Run DDP example on GPU. run: | source .venv/bin/activate - torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 'optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 + torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 From cafa945a93735c9cbdcc82b11b81ccef72e4c45e Mon Sep 17 00:00:00 2001 From: runame Date: Sat, 25 Apr 2026 17:33:46 +0100 Subject: [PATCH 3/3] Skip GPU-only example steps when runner has no usable GPU The `4-core-ubuntu-gpu-t4` runner currently has an outdated NVIDIA driver, so `torch.cuda.is_available()` returns False and the "Run DDP example on GPU." step crashes with `ProcessGroupNCCL is only supported with GPUs, no GPUs found!`. The "Run single GPU example on GPU." step "passes" only because torch silently falls back to CPU. Add a `gpu_check` step that probes `torch.cuda.is_available()` and gate both GPU-only steps on its output. If no GPU is detected, those steps are skipped (with a workflow warning) and the job stays green. When the runner image is fixed and a GPU is actually available, both steps run as before with no other changes needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/examples.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/examples.yaml b/.github/workflows/examples.yaml index e0f3e99..5a0fe19 100644 --- a/.github/workflows/examples.yaml +++ b/.github/workflows/examples.yaml @@ -19,6 +19,16 @@ jobs: run: | uv venv && source .venv/bin/activate uv pip install ".[examples]" + - name: Detect GPU availability. + id: gpu_check + run: | + source .venv/bin/activate + if python -c "import torch; raise SystemExit(0 if torch.cuda.is_available() else 1)"; then + echo "has_gpu=true" >> "$GITHUB_OUTPUT" + else + echo "has_gpu=false" >> "$GITHUB_OUTPUT" + echo "::warning::No usable GPU detected on this runner; GPU-only steps will be skipped." + fi - name: Run single GPU example with Adam to serve as a baseline. run: | source .venv/bin/activate @@ -31,6 +41,7 @@ jobs: CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.RMSpropPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 CUDA_VISIBLE_DEVICES="" python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.SGDPreconditionerConfig}' epochs=1 batch_size=1024 - name: Run single GPU example on GPU. + if: steps.gpu_check.outputs.has_gpu == 'true' run: | source .venv/bin/activate python -m distributed_shampoo.examples.cifar10_example optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 batch_size=1024 @@ -39,6 +50,7 @@ jobs: source .venv/bin/activate CUDA_VISIBLE_DEVICES="" torchrun --standalone --nnodes=1 --nproc_per_node=2 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=15 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024 backend=gloo - name: Run DDP example on GPU. + if: steps.gpu_check.outputs.has_gpu == 'true' run: | source .venv/bin/activate torchrun --standalone --nnodes=1 --nproc_per_node=1 -m distributed_shampoo.examples.cifar10_example parallelism=ddp optimizer=shampoo optimizer.precondition_frequency=30 '~optimizer.grafting_config' '+optimizer.grafting_config={_target_:distributed_shampoo.AdamPreconditionerConfig,beta2:0.999,epsilon:1e-8}' epochs=1 local_batch_size=1024