diff --git a/docs/integrator/gke-tcpxo-networking.md b/docs/integrator/gke-tcpxo-networking.md index 28b869867..6935f7b3a 100644 --- a/docs/integrator/gke-tcpxo-networking.md +++ b/docs/integrator/gke-tcpxo-networking.md @@ -1,6 +1,8 @@ # GKE TCPXO Networking Prerequisites -For `*-gke-cos-training*` recipes, GPUDirect TCPXO enables high-speed inter-node GPU communication on GKE. Without it, NCCL falls back to TCP (~4 GB/s vs ~340 GB/s with TCPXO). +For the **H100 GKE COS training** recipes (`h100-gke-cos-training*`, on `a3-megagpu-8g` nodes), GPUDirect TCPXO enables high-speed inter-node GPU communication on GKE. Without it, the NVIDIA Collective Communications Library (NCCL) falls back to TCP (~4 GB/s vs ~340 GB/s with TCPXO). + +> **A100 (a2) exception:** the `a100-gke-cos-training*` recipes intentionally omit the `gke-nccl-tcpxo` component — GPUDirect TCPXO targets H100 `a3-megagpu-8g` nodes, not the A100 `a2-highgpu`/`a2-ultragpu` machine family. The prerequisites below do **not** apply to A100 GKE recipes, and the generated A100 bundle does not install the TCPXO DaemonSets. ## Infrastructure Prerequisites diff --git a/recipes/overlays/a100-any.yaml b/recipes/overlays/a100-any.yaml new file mode 100644 index 000000000..7199be02c --- /dev/null +++ b/recipes/overlays/a100-any.yaml @@ -0,0 +1,54 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# Cross-cutting overlay applied via criteria-wildcard matching. +# +# Carries the deployment-phase floor (the 4 standard checks plus the +# gpu-operator version pin) and applies to every A100 query regardless +# of service or intent, so every concrete A100 leaf (training or +# inference, any service) inherits the version pin. +# +# Per-field union merge (see pkg/recipe/metadata.go) means concrete leaves +# that declare their own `deployment:` block add to or override this floor +# without dropping its inherited checks — same-name constraints from the +# leaf win, additional checks are appended. +# +# See docs/contributor/recipe.md#criteria-wildcard-overlays for details. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: a100-any + +spec: + base: base + + criteria: + service: any + accelerator: a100 + + validation: + deployment: + checks: + - operator-health + - expected-resources + - gpu-operator-version + - check-nvidia-smi + constraints: + # A100 has been supported since the early gpu-operator releases + # (v22.9). Floor at the same generation baseline as H100/H200 + # (v24.6.0) rather than the Blackwell floor; concrete leaves can + # tighten if they pin to a later working version. + - name: Deployment.gpu-operator.version + value: ">= v24.6.0" diff --git a/recipes/overlays/a100-gke-cos-training-kubeflow.yaml b/recipes/overlays/a100-gke-cos-training-kubeflow.yaml new file mode 100644 index 000000000..22520f5d8 --- /dev/null +++ b/recipes/overlays/a100-gke-cos-training-kubeflow.yaml @@ -0,0 +1,49 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: a100-gke-cos-training-kubeflow + +spec: + # Inherits from a100-gke-cos-training recipe (A100 + GKE COS + training settings) + # This overlay adds Kubeflow Training Operator for distributed training with TrainJob + base: a100-gke-cos-training + + criteria: + service: gke + accelerator: a100 + os: cos + intent: training + platform: kubeflow + + # Constraints for A100 on GKE COS for Kubeflow training workloads + constraints: + - name: K8s.server.version + value: ">= 1.30" + + # Kubeflow Training Operator for TrainJob support. + # Declared inline (not via the platform-kubeflow mixin) to match the GKE COS + # pattern in h100-gke-cos-training-kubeflow. + componentRefs: + - name: kubeflow-trainer + type: Helm + valuesFile: components/kubeflow-trainer/values.yaml + manifestFiles: + - components/kubeflow-trainer/manifests/torch-distributed-cluster-training-runtime.yaml + dependencyRefs: + - cert-manager + - kube-prometheus-stack + - gpu-operator diff --git a/recipes/overlays/a100-gke-cos-training.yaml b/recipes/overlays/a100-gke-cos-training.yaml new file mode 100644 index 000000000..d8450b745 --- /dev/null +++ b/recipes/overlays/a100-gke-cos-training.yaml @@ -0,0 +1,89 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +kind: RecipeMetadata +apiVersion: aicr.nvidia.com/v1alpha1 +metadata: + name: a100-gke-cos-training + +spec: + # Inherits from gke-cos-training recipe (GKE COS + training settings) + base: gke-cos-training + + criteria: + service: gke + accelerator: a100 + os: cos + intent: training + + # Specific constraints for A100 on GKE COS training workloads. + # A100 has no IMEX/NVLink ComputeDomain requirement, so the recipe keeps + # the GKE COS training baseline rather than the H100 1.32 floor. + constraints: + - name: K8s.server.version + value: ">= 1.30" + + componentRefs: + # A100-specific GPU Operator overrides (inherits valuesFile from gke-cos-training). + # + # Nodewright tuning is intentionally omitted. The nvidia-tuning-gke package + # ships baked-in profiles only for gke-h100 / gke-b200; there is no A100 + # target. The EKS nvidia-tuned "generic" profile is not a fallback here: it + # applies reboot/bootloader changes, but GKE COS is immutable and + # tuning-gke.yaml deliberately limits itself to non-disruptive tuning. The + # nodewright-operator itself is still inherited from gke-cos. + # + # gke-nccl-tcpxo is also omitted: GPUDirect-TCPXO targets H100 a3-megagpu-8g + # nodes, not the A100 a2 (a2-highgpu / a2-ultragpu) machine family. + - name: gpu-operator + type: Helm + dependencyRefs: + - nfd + - cert-manager + - kube-prometheus-stack + overrides: + cdi: + enabled: true + + - name: nfd + type: Helm + overrides: + topologyUpdater: + enable: true + + # Validation checks for A100 on GKE COS training workloads. 
+ # Defined at the intent layer (not OS-specific) so all variants inherit them. + # + # The deployment-phase floor (4 standard checks + gpu-operator version pin) + # is contributed by the a100-any cross-cutting overlay and is not duplicated + # here. + # + # Performance gating is intentionally omitted until an empirical A100-on-GKE + # NCCL baseline is established. The H100 GKE training overlay pins an absolute + # nccl-all-reduce-bw floor (>= 250) calibrated on 8-GPU H100 NVLink nodes; + # that value is neither fabric-class aware (https://github.com/NVIDIA/aicr/issues/1256) + # nor valid for A100, so carrying it over would only cause false failures on healthy runs. + validation: + conformance: + checks: + - platform-health + - gpu-operator-health + - dra-support + - accelerator-metrics + - ai-service-metrics + - gang-scheduling + - pod-autoscaling + - cluster-autoscaling + - robust-controller + - secure-accelerator-access