NVIDIA · dmitsh · May 27, 2026 · greptile-apps · May 27, 2026
@@ -50,6 +50,10 @@
               "type": "string",
               "description": "Scheduler-output engine. Must match a registered engine in pkg/registry/registry.go.",
               "enum": ["graph", "k8s", "slinky", "slurm"]
+            },
+            "params": {
+              "type": "object",
+              "description": "Engine-specific parameters. For slinky, useGpuCliqueLabel=true reads nvidia.com/gpu.clique as the topology/block domain source."
             }
           },
           "required": ["name"]

@@ -20,6 +20,7 @@ global:
           app.kubernetes.io/component: compute
       plugin: topology/block
       blockSizes: [4]
+      # useGpuCliqueLabel: true
       topologyConfigPath: topology.conf
       topologyConfigmapName: slurm-config
       useDynamicNodes: false

@@ -0,0 +1,67 @@
+# Default values for topology-generator.
+# This is a YAML-formatted file.
+# Declare variables to be passed into your templates.
+
+global:
+  provider:
+    # name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
+    name: infiniband-k8s
+    params:
+      nodeSelector:
+        slurmCluster: my-cluster
+      useGpuCliqueLabel: true
+  engine:
+    name: slinky
+    params:
+      namespace: slurm
+      nodeSelector:
+        slurmCluster: my-cluster
+      podSelector:
+        matchLabels:
+          app.kubernetes.io/component: compute
+      plugin: topology/block
+      blockSizes: [4]
+      useGpuCliqueLabel: true
+      topologyConfigPath: topology.conf
+      topologyConfigmapName: slurm-config
+
+nodeSelector:
+  dedicated: user-workload
+
+tolerations:
+  - operator: Exists
+
+node-observer:
+  nodeSelector:
+    dedicated: user-workload
+  topograph:
+    trigger:
+      podSelector:
+        matchLabels:
+          app.kubernetes.io/component: compute
+
+node-data-broker:
+  image:
+    repository: ghcr.io/nvidia/topograph/ib
+    pullPolicy: IfNotPresent
+    tag: main
+  initc:
+    enabled: true
+  verbosity: 4
+  securityContext:
+    privileged: true
+  nodeSelector:
+    node.dgxc.nvidia.com/has-gpu: "true"
+  volumeMounts:
+  - name: sys-class-volume
+    mountPath: /sys/class
+  volumes:
+  - name: sys-class-volume
+    hostPath:
+      path: /sys/class
+      type: Directory
+  tolerations:
+  - key: "nvidia.com/gpu"
+    operator: "Equal"
+    value: "present"
+    effect: "NoSchedule"
@@ -44,6 +44,7 @@ global:
           clusterDefault: true
       topologyConfigPath: topology.conf
       topologyConfigmapName: slurm-config
+      # useGpuCliqueLabel: true
       useDynamicNodes: true
       configUpdateMode: skeleton-only
 

@@ -14,6 +14,10 @@ global:
   engine:
     # name: "k8s", "slinky", "slurm" or "graph"
     name: k8s
+    # params:
+    #   # For slinky topology/block output, use the GPU Operator's existing
+    #   # Kubernetes node label as the block-domain source.
+    #   useGpuCliqueLabel: true
 
   service:
     type: ClusterIP

@@ -92,6 +92,7 @@ Topograph exposes three endpoints for interacting with the service. Below are th
       - **nodeSelector**: (optional) Used in: [`k8s`, `slinky`]. A Kubernetes node label map that filters which nodes participate in topology generation.
       - **topologyConfigmapName**: Used in: [`slinky`]. The required name of the ConfigMap containing the topology config.
       - **useDynamicNodes**: (optional) Used in: [`slinky`]. If `true`, Kubernetes nodes matched by the Node Selector will be annotated with the topology spec.
+      - **useGpuCliqueLabel**: (optional) Used in: [`slinky`]. If `true`, `topology/block` domains are built from the GPU Operator's `nvidia.com/gpu.clique` node label instead of provider accelerator-domain data.
       - **configUpdateMode**: (optional) Used in: [`slinky`]. By default, the full topology YAML is written in the Slurm ConfigMap. `skeleton-only` overrides to include switches or blocks only (no node lines); `none` skips updating the topology key in the ConfigMap.
   - **nodes**: (optional) Supplies the cluster nodes used for topology generation as an array of regions mapping instance IDs to node names.
 

@@ -76,6 +76,30 @@ global:
         clusterDefault: true                           # no podSelector, no nodes → scontrol fallback
 ```
 
+### Using `nvidia.com/gpu.clique` for block topology
+
+On Multi-Node NVLink (MNNVL) Kubernetes clusters, the NVIDIA GPU Operator can label nodes with `nvidia.com/gpu.clique`. When `useGpuCliqueLabel` is enabled, the Slinky engine uses that label as the source for `topology/block` domains instead of the accelerator domains returned by the provider. This is useful with cloud API providers whose `InstanceTopology.AcceleratorID` describes a broader provider domain than the GPU Operator clique label.
+
+The option only affects block topology. Tree topology still comes from the selected provider, and the engine still maps Kubernetes nodes to Slurm nodes through the configured slurmd pod selector.
+
+```yaml
+global:
+  engine:
+    name: slinky
+    params:
+      namespace: ns-slinky
+      podSelector:
+        matchLabels:
+          app.kubernetes.io/component: compute
+      plugin: topology/block
+      blockSizes: [8, 16]
+      topologyConfigmapName: slurm-config
+      topologyConfigPath: topology.conf
+      useGpuCliqueLabel: true
+```
+
+If `useGpuCliqueLabel` is enabled for a block topology, a node contributes to a block only when it maps to a Slurm node through a slurmd pod, carries a non-empty `nvidia.com/gpu.clique` label, and has the Topograph instance annotation. If no selected node meets all three conditions, topology generation fails with a `502` error instead of falling back to provider accelerator domains.
+
 ## ConfigMap Annotations
 
 Slinky automatically adds metadata annotations to managed ConfigMaps for improved observability:

@@ -33,7 +33,7 @@ Not all providers produce both topology types:
 | `infiniband-bm` | Yes (`ClusterUUID.CliqueId`) | Yes (IB switch hierarchy) |
 | `infiniband-k8s` | Yes (`ClusterUUID.CliqueId`) | Yes (IB switch hierarchy) |
 
-**Relationship to `nvidia.com/gpu.clique`**: The GPU Operator device plugin sets `nvidia.com/gpu.clique` on nodes with Multi-Node NVLink (MNNVL) GPUs. The k8s engine treats that label as authoritative when present and does not write Topograph's configured accelerator label for that node, regardless of whether the selected provider also returned an accelerator domain from API data. For `infiniband-k8s`, setting `global.provider.params.useGpuCliqueLabel: true` also makes the provider read that existing node label instead of collecting the same value through `nvidia-smi`. The `netq` provider uses a `DomainUUID` from the NMX management API — a different identifier that refers to the same physical domain but cannot be compared as a string.
+**Relationship to `nvidia.com/gpu.clique`**: The GPU Operator device plugin sets `nvidia.com/gpu.clique` on nodes with Multi-Node NVLink (MNNVL) GPUs. The k8s engine treats that label as authoritative when present and does not write Topograph's configured accelerator label for that node, regardless of whether the selected provider also returned an accelerator domain from API data. For Slinky block topology, setting `global.engine.params.useGpuCliqueLabel: true` makes the Slinky engine build `topology/block` domains from `nvidia.com/gpu.clique` instead of provider accelerator-domain data. For `infiniband-k8s`, setting `global.provider.params.useGpuCliqueLabel: true` also makes the provider read that existing node label instead of collecting the same value through `nvidia-smi`. The `netq` provider uses a `DomainUUID` from the NMX management API — a different identifier that refers to the same physical domain but cannot be compared as a string.
 
 [NVIDIA Fabric Manager](https://docs.nvidia.com/datacenter/tesla/fabric-manager-user-guide/) runs at node init on MNNVL-capable hardware, discovers the NVLink fabric across GPUs, and registers each GPU with [NVML](https://docs.nvidia.com/deploy/nvml-api/) (NVIDIA Management Library — a C API that exposes per-GPU state). The GPU Operator's IMEX labeler writes `nvidia.com/gpu.clique` only once NVML reports the node's fabric state as `GPU_FABRIC_STATE_COMPLETED` — meaning Fabric Manager finished initialization successfully and the node is part of an NVLink domain.
 

@@ -58,6 +58,11 @@ type SlinkyEngine struct {
 	params *Params
 }
 
+// clusterNodes bundles the two results of getClusterNodes: the Kubernetes
+// nodes selected for topology generation and the mapping from Kubernetes node
+// name to Slurm node name.
+type clusterNodes struct {
+	// nodes is the list of Kubernetes nodes selected for topology generation.
+	nodes   *corev1.NodeList
+	// nodeMap maps a Kubernetes node name to the corresponding Slurm node name.
+	nodeMap map[string]string
+}
+
 type Params struct {
 	slurm.BaseParams `mapstructure:",squash"`
 	// Namespace specifies the namespace where Slinky cluster is deployed
@@ -72,6 +77,9 @@ type Params struct {
 	ConfigPath string `mapstructure:"topologyConfigPath"`
 	// UseDynamicNodes specifies whether to use dynamic nodes for reporting: true or false
 	UseDynamicNodes bool `mapstructure:"useDynamicNodes" default:"false"`
+	// UseGPUCliqueLabel uses the GPU Operator's nvidia.com/gpu.clique node label
+	// as the block-domain source for topology/block output.
+	UseGPUCliqueLabel bool `mapstructure:"useGpuCliqueLabel"`
 	// ConfigUpdateMode specifies the mode for updating the slurm config: valid values {"none", "skeleton-only"}
 	ConfigUpdateMode string `mapstructure:"configUpdateMode,omitempty"`
 	// Topologies specifies per-partition topology configuration
@@ -172,24 +180,27 @@ func isEmptySelector(sel *metav1.LabelSelector) bool {
 }
 
+// GetComputeInstances loads the cluster nodes through getClusterNodes and
+// passes the node list and the Kubernetes-to-Slurm node name map to
+// getComputeInstances. The second argument is ignored. An error from
+// getClusterNodes is returned unchanged.
 func (eng *SlinkyEngine) GetComputeInstances(ctx context.Context, _ any) ([]topology.ComputeInstances, *httperr.Error) {
-
-	nodes, nodeMap, err := eng.getClusterNodes(ctx)
+	clusterNodes, err := eng.getClusterNodes(ctx)
 	if err != nil {
 		return nil, err
 	}
 
-	return getComputeInstances(nodes, nodeMap)
+	return getComputeInstances(clusterNodes.nodes, clusterNodes.nodeMap)
 }
 
-func (eng *SlinkyEngine) getClusterNodes(ctx context.Context) (*corev1.NodeList, map[string]string, *httperr.Error) {
+// getClusterNodes returns the Kubernetes nodes selected for topology generation
+// and a map from Kubernetes node name to Slurm node name. The mapping is built
+// from Ready slurmd pods in the configured namespace and pod selector, using the
+// slurm.node.name label when present and falling back to pod.spec.hostname.
+func (eng *SlinkyEngine) getClusterNodes(ctx context.Context) (*clusterNodes, *httperr.Error) {
 	nodes, err := k8s.GetNodes(ctx, eng.client, eng.params.nodeListOpt)
 	if err != nil {
-		return nil, nil, httperr.NewError(http.StatusBadGateway, err.Error())
+		return nil, httperr.NewError(http.StatusBadGateway, err.Error())
 	}
 
 	pods, err := eng.client.CoreV1().Pods(eng.params.Namespace).List(ctx, *eng.params.podListOpt)
 	if err != nil {
-		return nil, nil, httperr.NewError(http.StatusBadGateway,
+		return nil, httperr.NewError(http.StatusBadGateway,
 			fmt.Sprintf("failed to list SLURM pods in the cluster: %v", err))
 	}
 
@@ -208,7 +219,10 @@ func (eng *SlinkyEngine) getClusterNodes(ctx context.Context) (*corev1.NodeList,
 		klog.V(4).Infof("Mapping k8s node %s to SLURM node %s", pod.Spec.NodeName, host)
 		nodeMap[pod.Spec.NodeName] = host
 	}
-	return nodes, nodeMap, nil
+	return &clusterNodes{
+		nodes:   nodes,
+		nodeMap: nodeMap,
+	}, nil
 }
 
 func getComputeInstances(nodes *corev1.NodeList, nodeMap map[string]string) ([]topology.ComputeInstances, *httperr.Error) {
@@ -246,6 +260,64 @@ func getComputeInstances(nodes *corev1.NodeList, nodeMap map[string]string) ([]t
 	return cis, nil
 }
 
+// withGPUCliqueDomains returns a graph whose Domains are rebuilt from the GPU
+// clique node label (topology.KeyNvidiaGPUClique) of the given cluster nodes.
+//
+// A node contributes a host only when it has a non-empty Slurm name in
+// clusterNodes.nodeMap, a clique label that is non-empty after trimming
+// whitespace, and the topology.KeyNodeInstance annotation. If no node
+// qualifies, a 502 (Bad Gateway) error is returned.
+//
+// The input graph is never modified: a nil graph is replaced by an empty one,
+// and a non-nil graph is shallow-copied before its Domains are overwritten.
+func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*topology.Graph, *httperr.Error) {
+	cliqueDomains := topology.NewDomainMap()
+
+	items := clusterNodes.nodes.Items
+	for i := range items {
+		k8sNode := &items[i]
+
+		// A missing map entry yields "", so one check covers both
+		// "not mapped" and "mapped to an empty name".
+		slurmHost := clusterNodes.nodeMap[k8sNode.Name]
+		if slurmHost == "" {
+			klog.V(4).Infof("Skipping node %s as it does not have a corresponding SLURM name", k8sNode.Name)
+			continue
+		}
+
+		// Nodes without a usable clique label are silently left out.
+		clique := strings.TrimSpace(k8sNode.Labels[topology.KeyNvidiaGPUClique])
+		if clique == "" {
+			continue
+		}
+
+		instanceID, annotated := k8sNode.Annotations[topology.KeyNodeInstance]
+		if !annotated {
+			klog.Warningf("missing %q annotation in node %s", topology.KeyNodeInstance, k8sNode.Name)
+			continue
+		}
+
+		cliqueDomains.AddHost(clique, instanceID, slurmHost)
+	}
+
+	if len(cliqueDomains) == 0 {
+		msg := fmt.Sprintf("useGpuCliqueLabel=true but no matching nodes found; check label %q and annotation %q",
+			topology.KeyNvidiaGPUClique, topology.KeyNodeInstance)
+		return nil, httperr.NewError(http.StatusBadGateway, msg)
+	}
+
+	// Start from an empty graph and copy the caller's graph over it when one
+	// was supplied, so the caller's value is left untouched.
+	result := &topology.Graph{}
+	if graph != nil {
+		*result = *graph
+	}
+	result.Domains = cliqueDomains
+
+	return result, nil
+}
+
+// usesBlockTopology reports whether cfg selects the block topology plugin
+// (topology.TopologyBlock), either as its top-level Plugin or in any non-nil
+// entry of cfg.Topologies. A nil cfg yields false.
+func usesBlockTopology(cfg *translate.Config) bool {
+	switch {
+	case cfg == nil:
+		return false
+	case cfg.Plugin == topology.TopologyBlock:
+		return true
+	}
+
+	for _, topoSpec := range cfg.Topologies {
+		if topoSpec == nil {
+			continue
+		}
+		if topoSpec.Plugin == topology.TopologyBlock {
+			return true
+		}
+	}
+
+	return false
+}
+
 // generateConfigMapAnnotations creates metadata annotations for ConfigMaps
 func (eng *SlinkyEngine) generateConfigMapAnnotations() map[string]string {
 	annotations := map[string]string{
@@ -283,6 +355,27 @@ func (eng *SlinkyEngine) GenerateOutput(ctx context.Context, graph *topology.Gra
 		return nil, httperr.NewError(http.StatusInternalServerError, err.Error())
 	}
 
+	var clusterNodeData *clusterNodes
+	loadClusterNodes := func() (*clusterNodes, *httperr.Error) {
+		if clusterNodeData != nil {
+			return clusterNodeData, nil
+		}
+		var httpErr *httperr.Error
+		clusterNodeData, httpErr = eng.getClusterNodes(ctx)
+		return clusterNodeData, httpErr
+	}
+
+	if p.UseGPUCliqueLabel && usesBlockTopology(cfg) {
+		clusterNodeData, httpErr := loadClusterNodes()
+		if httpErr != nil {
+			return nil, httpErr
+		}
+		graph, httpErr = withGPUCliqueDomains(graph, clusterNodeData)
+		if httpErr != nil {
+			return nil, httpErr
+		}
+	}
+
 	nt, err := translate.NewNetworkTopology(graph, cfg)
 	if err != nil {
 		return nil, httperr.NewError(http.StatusBadRequest, err.Error())
@@ -306,7 +399,11 @@ func (eng *SlinkyEngine) GenerateOutput(ctx context.Context, graph *topology.Gra
 
 	// For dynamic mode, perform reconciliation using the latest topology information from the provider (root) and the cluster (nodes and their annotations)
 	if p.UseDynamicNodes {
-		httpErr := eng.performReconciliation(ctx, nt, topologies)
+		clusterNodeData, httpErr := loadClusterNodes()
+		if httpErr != nil {
+			return nil, httpErr
+		}
+		httpErr = eng.performReconciliation(ctx, nt, topologies, clusterNodeData)
 		if httpErr != nil {
 			return nil, httpErr
 		}
@@ -466,17 +563,11 @@ func (eng *SlinkyEngine) getPartitionNodes(ctx context.Context, partition string
 	return "", fmt.Errorf("no running pods with labels %v", labels)
 }
 
-func (eng *SlinkyEngine) performReconciliation(ctx context.Context, nt *translate.NetworkTopology, topologies []*translate.TopologyUnit) *httperr.Error {
-
-	nodes, nodeMap, err := eng.getClusterNodes(ctx)
-	if err != nil {
-		return err
-	}
-
+func (eng *SlinkyEngine) performReconciliation(ctx context.Context, nt *translate.NetworkTopology, topologies []*translate.TopologyUnit, clusterNodes *clusterNodes) *httperr.Error {
 	// Update node annotations based on the desired topology and the current cluster state.
 	// This will trigger Slinky to reconfigure the nodes accordingly.
-	for _, node := range nodes.Items {
-		slurmName, ok := nodeMap[node.Name]
+	for _, node := range clusterNodes.nodes.Items {
+		slurmName, ok := clusterNodes.nodeMap[node.Name]
 		if !ok {
 			klog.V(4).Infof("Skipping node %s as it does not have a corresponding SLURM name", node.Name)
 			continue