NVIDIA · yuanchen8911 · Jun 11, 2026 · coderabbitai · Jun 11, 2026 · coderabbitai
@@ -1,6 +1,8 @@
 # GKE TCPXO Networking Prerequisites
 
-For `*-gke-cos-training*` recipes, GPUDirect TCPXO enables high-speed inter-node GPU communication on GKE. Without it, NCCL falls back to TCP (~4 GB/s vs ~340 GB/s with TCPXO).
+For the **H100 GKE COS training** recipes (`h100-gke-cos-training*`, on `a3-megagpu-8g` nodes), GPUDirect TCPXO enables high-speed inter-node GPU communication on GKE. Without it, NCCL falls back to TCP (~4 GB/s vs ~340 GB/s with TCPXO).
+
+> **A100 (a2) exception:** the `a100-gke-cos-training*` recipes intentionally omit the `gke-nccl-tcpxo` component — GPUDirect TCPXO targets H100 `a3-megagpu-8g` nodes, not the A100 `a2-highgpu`/`a2-ultragpu` machine family. The prerequisites below do **not** apply to A100 GKE recipes, and the generated A100 bundle does not install the TCPXO DaemonSets.
 
 ## Infrastructure Prerequisites
 

@@ -0,0 +1,54 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Cross-cutting overlay applied via criteria-wildcard matching.
+#
+# Carries the deployment-phase floor (the 4 standard checks plus the
+# gpu-operator version pin) and applies to every A100 query regardless
+# of service or intent, so every concrete A100 leaf (training or
+# inference, any service) inherits both the checks and the version pin.
+#
+# Per-field union merge (see pkg/recipe/metadata.go) means concrete leaves
+# that declare their own `deployment:` block add to or override this floor
+# without dropping its inherited checks — same-name constraints from the
+# leaf win, additional checks are appended.
+#
+# See docs/contributor/recipe.md#criteria-wildcard-overlays for details.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: a100-any
+
+spec:
+  base: base  # parent recipe this overlay layers on
+
+  criteria:
+    service: any  # wildcard: matches every service; intent is left unset, so it is not restricted either
+    accelerator: a100
+
+  validation:
+    deployment:
+      checks:  # the four standard deployment-phase checks
+        - operator-health
+        - expected-resources
+        - gpu-operator-version
+        - check-nvidia-smi
+      constraints:
+        # A100 has been supported since the early gpu-operator releases
+        # (v22.9). Floor at the same generation baseline as H100/H200
+        # (v24.6.0) rather than the Blackwell floor; concrete leaves can
+        # tighten if they pin to a later working version.
+        - name: Deployment.gpu-operator.version
+          value: ">= v24.6.0"  # minimum version floor; a leaf's same-name constraint overrides it
@@ -0,0 +1,49 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: a100-gke-cos-training-kubeflow
+
+spec:
+  # Inherits from the a100-gke-cos-training recipe (A100 + GKE COS + training settings).
+  # This overlay adds the Kubeflow Training Operator for distributed training with TrainJob.
+  base: a100-gke-cos-training
+
+  criteria:
+    service: gke
+    accelerator: a100
+    os: cos
+    intent: training
+    platform: kubeflow  # the only criterion added on top of the base recipe's criteria
+
+  # Constraints for A100 on GKE COS for Kubeflow training workloads.
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.30"  # same floor as the a100-gke-cos-training base
+
+  # Kubeflow Training Operator for TrainJob support.
+  # Declared inline (not via the platform-kubeflow mixin) to match the GKE COS
+  # pattern in h100-gke-cos-training-kubeflow.
+  componentRefs:
+    - name: kubeflow-trainer
+      type: Helm
+      valuesFile: components/kubeflow-trainer/values.yaml
+      manifestFiles:
+        - components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml
+      dependencyRefs:
+        - cert-manager
+        - kube-prometheus-stack
+        - gpu-operator
@@ -0,0 +1,89 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+kind: RecipeMetadata
+apiVersion: aicr.nvidia.com/v1alpha1
+metadata:
+  name: a100-gke-cos-training
+
+spec:
+  # Inherits from the gke-cos-training recipe (GKE COS + training settings).
+  base: gke-cos-training
+
+  criteria:
+    service: gke
+    accelerator: a100
+    os: cos
+    intent: training
+
+  # Specific constraints for A100 on GKE COS training workloads.
+  # A100 has no IMEX/NVLink ComputeDomain requirement, so the recipe keeps
+  # the GKE COS training baseline rather than the H100 1.32 floor.
+  constraints:
+    - name: K8s.server.version
+      value: ">= 1.30"  # quoted so the comparison expression stays a string
+
+  componentRefs:
+    # A100-specific GPU Operator overrides (inherits valuesFile from gke-cos-training).
+    #
+    # Nodewright tuning is intentionally omitted. The nvidia-tuning-gke package
+    # ships baked-in profiles only for gke-h100 / gke-b200; there is no A100
+    # target. The EKS nvidia-tuned "generic" profile is not a fallback here: it
+    # applies reboot/bootloader changes, but GKE COS is immutable and
+    # tuning-gke.yaml deliberately limits itself to non-disruptive tuning. The
+    # nodewright-operator itself is still inherited from gke-cos.
+    #
+    # gke-nccl-tcpxo is also omitted: GPUDirect-TCPXO targets H100 a3-mega
+    # nodes, not the A100 a2 (a2-highgpu / a2-ultragpu) machine family.
+    - name: gpu-operator  # no valuesFile here: it is inherited from gke-cos-training
+      type: Helm
+      dependencyRefs:
+        - nfd
+        - cert-manager
+        - kube-prometheus-stack
+      overrides:
+        cdi:
+          enabled: true
+
+    - name: nfd
+      type: Helm
+      overrides:
+        topologyUpdater:
+          enable: true
+
+  # Validation checks for A100 on GKE COS training workloads.
+  # Defined on this training overlay (which pins os: cos) so variants based on it inherit them.
+  #
+  # The deployment-phase floor (4 standard checks + gpu-operator version pin)
+  # is contributed by the a100-any cross-cutting overlay and is not duplicated
+  # here.
+  #
+  # Performance gating is intentionally omitted until an empirical A100-on-GKE
+  # NCCL baseline is established. The H100 GKE training overlay pins an absolute
+  # nccl-all-reduce-bw floor (>= 250) calibrated on 8-GPU H100 NVLink nodes;
+  # that value is neither fabric-class aware (https://github.com/NVIDIA/aicr/issues/1256)
+  # nor valid for A100, so carrying it would only false-fail healthy runs.
+  validation:
+    conformance:  # only the conformance phase is declared in this recipe
+      checks:
+        - platform-health
+        - gpu-operator-health
+        - dra-support
+        - accelerator-metrics
+        - ai-service-metrics
+        - gang-scheduling
+        - pod-autoscaling
+        - cluster-autoscaling
+        - robust-controller
+        - secure-accelerator-access