Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions charts/topograph/values.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@
"type": "string",
"description": "Scheduler-output engine. Must match a registered engine in pkg/registry/registry.go.",
"enum": ["graph", "k8s", "slinky", "slurm"]
},
"params": {
"type": "object",
"description": "Engine-specific parameters. For slinky, useGpuCliqueLabel=true reads nvidia.com/gpu.clique as the topology/block domain source."
}
},
"required": ["name"]
Expand Down
1 change: 1 addition & 0 deletions charts/topograph/values.slinky.block-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ global:
app.kubernetes.io/component: compute
plugin: topology/block
blockSizes: [4]
# useGpuCliqueLabel: true
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config
useDynamicNodes: false
Expand Down
67 changes: 67 additions & 0 deletions charts/topograph/values.slinky.ib.block-example.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# Default values for topology-generator.
# This is a YAML-formatted file.
# Declare variables to be passed into your templates.

global:
provider:
# name: "aws", "oci", "gcp", "nebius", "netq", "infiniband-k8s", "dra" or "test"
name: infiniband-k8s
params:
nodeSelector:
slurmCluster: my-cluster
useGpuCliqueLabel: true
engine:
name: slinky
params:
namespace: slurm
nodeSelector:
slurmCluster: my-cluster
podSelector:
matchLabels:
app.kubernetes.io/component: compute
plugin: topology/block
blockSizes: [4]
useGpuCliqueLabel: true
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config

nodeSelector:
dedicated: user-workload

tolerations:
- operator: Exists

node-observer:
nodeSelector:
dedicated: user-workload
topograph:
trigger:
podSelector:
matchLabels:
app.kubernetes.io/component: compute

node-data-broker:
image:
repository: ghcr.io/nvidia/topograph/ib
pullPolicy: IfNotPresent
tag: main
initc:
enabled: true
verbosity: 4
securityContext:
privileged: true
nodeSelector:
node.dgxc.nvidia.com/has-gpu: "true"
volumeMounts:
- name: sys-class-volume
mountPath: /sys/class
volumes:
- name: sys-class-volume
hostPath:
path: /sys/class
type: Directory
tolerations:
- key: "nvidia.com/gpu"
operator: "Equal"
value: "present"
effect: "NoSchedule"
1 change: 1 addition & 0 deletions charts/topograph/values.slinky.partition-example.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ global:
clusterDefault: true
topologyConfigPath: topology.conf
topologyConfigmapName: slurm-config
# useGpuCliqueLabel: true
useDynamicNodes: true
configUpdateMode: skeleton-only

Expand Down
4 changes: 4 additions & 0 deletions charts/topograph/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,10 @@ global:
engine:
# name: "k8s", "slinky", "slurm" or "graph"
name: k8s
# params:
# # For slinky topology/block output, use the GPU Operator's
# # nvidia.com/gpu.clique Kubernetes node label as the block-domain source.
# useGpuCliqueLabel: true

service:
type: ClusterIP
Expand Down
1 change: 1 addition & 0 deletions docs/api.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ Topograph exposes three endpoints for interacting with the service. Below are th
- **nodeSelector**: (optional) Used in: [`k8s`, `slinky`]. A Kubernetes node label map that filters which nodes participate in topology generation.
- **topologyConfigmapName**: Used in: [`slinky`]. The required name of the ConfigMap containing the topology config.
- **useDynamicNodes**: (optional) Used in: [`slinky`]. If `true`, Kubernetes nodes matched by the Node Selector will be annotated with the topology spec.
- **useGpuCliqueLabel**: (optional) Used in: [`slinky`]. If `true`, `topology/block` domains are built from the GPU Operator's `nvidia.com/gpu.clique` node label instead of provider accelerator-domain data.
- **configUpdateMode**: (optional) Used in: [`slinky`]. By default, the full topology YAML is written in the Slurm ConfigMap. `skeleton-only` overrides to include switches or blocks only (no node lines); `none` skips updating the topology key in the ConfigMap.
- **nodes**: (optional) Supplies the cluster nodes used for topology generation as an array of regions mapping instance IDs to node names.

Expand Down
24 changes: 24 additions & 0 deletions docs/engines/slinky.md
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,30 @@ global:
clusterDefault: true # no podSelector, no nodes → scontrol fallback
```

### Using `nvidia.com/gpu.clique` for block topology

On Multi-Node NVLink (MNNVL) Kubernetes clusters, the NVIDIA GPU Operator can label nodes with `nvidia.com/gpu.clique`. When `useGpuCliqueLabel` is enabled, the Slinky engine uses that label as the source for `topology/block` domains instead of the accelerator domains returned by the provider. This is useful with cloud API providers whose `InstanceTopology.AcceleratorID` describes a broader domain than the one identified by the GPU Operator clique label.

The option only affects block topology. Tree topology still comes from the selected provider, and the engine still maps Kubernetes nodes to Slurm nodes through the configured slurmd pod selector.

```yaml
global:
engine:
name: slinky
params:
namespace: ns-slinky
podSelector:
matchLabels:
app.kubernetes.io/component: compute
plugin: topology/block
blockSizes: [8, 16]
topologyConfigmapName: slurm-config
topologyConfigPath: topology.conf
useGpuCliqueLabel: true
```

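If `useGpuCliqueLabel` is enabled for a block topology and no selected node has a matching slurmd pod, the `nvidia.com/gpu.clique` label, and the Topograph instance annotation, topology generation fails with a `502` error instead of falling back to provider accelerator domains. Nodes that lack any one of these are skipped individually, so a clique in which only some nodes qualify produces a smaller block rather than an error.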

## ConfigMap Annotations

Slinky automatically adds metadata annotations to managed ConfigMaps for improved observability:
Expand Down
2 changes: 1 addition & 1 deletion docs/reference/node-labels.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ Not all providers produce both topology types:
| `infiniband-bm` | Yes (`ClusterUUID.CliqueId`) | Yes (IB switch hierarchy) |
| `infiniband-k8s` | Yes (`ClusterUUID.CliqueId`) | Yes (IB switch hierarchy) |

**Relationship to `nvidia.com/gpu.clique`**: The GPU Operator device plugin sets `nvidia.com/gpu.clique` on nodes with Multi-Node NVLink (MNNVL) GPUs. The k8s engine treats that label as authoritative when present and does not write Topograph's configured accelerator label for that node, regardless of whether the selected provider also returned an accelerator domain from API data. For `infiniband-k8s`, setting `global.provider.params.useGpuCliqueLabel: true` also makes the provider read that existing node label instead of collecting the same value through `nvidia-smi`. The `netq` provider uses a `DomainUUID` from the NMX management API — a different identifier that refers to the same physical domain but cannot be compared as a string.
**Relationship to `nvidia.com/gpu.clique`**: The GPU Operator device plugin sets `nvidia.com/gpu.clique` on nodes with Multi-Node NVLink (MNNVL) GPUs. The k8s engine treats that label as authoritative when present and does not write Topograph's configured accelerator label for that node, regardless of whether the selected provider also returned an accelerator domain from API data. For Slinky block topology, setting `global.engine.params.useGpuCliqueLabel: true` makes the Slinky engine build `topology/block` domains from `nvidia.com/gpu.clique` instead of provider accelerator-domain data. For `infiniband-k8s`, setting `global.provider.params.useGpuCliqueLabel: true` also makes the provider read that existing node label instead of collecting the same value through `nvidia-smi`. The `netq` provider uses a `DomainUUID` from the NMX management API — a different identifier that refers to the same physical domain but cannot be compared as a string.

[NVIDIA Fabric Manager](https://docs.nvidia.com/datacenter/tesla/fabric-manager-user-guide/) runs at node init on MNNVL-capable hardware, discovers the NVLink fabric across GPUs, and registers each GPU with [NVML](https://docs.nvidia.com/deploy/nvml-api/) (NVIDIA Management Library — a C API that exposes per-GPU state). The GPU Operator's IMEX labeler writes `nvidia.com/gpu.clique` only once NVML reports the node's fabric state as `GPU_FABRIC_STATE_COMPLETED` — meaning Fabric Manager finished initialization successfully and the node is part of an NVLink domain.

Expand Down
125 changes: 108 additions & 17 deletions pkg/engines/slinky/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ type SlinkyEngine struct {
params *Params
}

// clusterNodes bundles the two values returned by getClusterNodes so they can
// be fetched once and passed around together.
type clusterNodes struct {
// nodes is the list of Kubernetes nodes selected for topology generation.
nodes *corev1.NodeList
// nodeMap maps a Kubernetes node name to the corresponding Slurm node name.
nodeMap map[string]string
}

type Params struct {
slurm.BaseParams `mapstructure:",squash"`
// Namespace specifies the namespace where Slinky cluster is deployed
Expand All @@ -72,6 +77,9 @@ type Params struct {
ConfigPath string `mapstructure:"topologyConfigPath"`
// UseDynamicNodes specifies whether to use dynamic nodes for reporting: true or false
UseDynamicNodes bool `mapstructure:"useDynamicNodes" default:"false"`
// UseGPUCliqueLabel specifies whether to use the GPU Operator's nvidia.com/gpu.clique
// node label as the block-domain source for topology/block output: true or false
UseGPUCliqueLabel bool `mapstructure:"useGpuCliqueLabel"`
// ConfigUpdateMode specifies the mode for updating the slurm config: valid values {"none", "skeleton-only"}
ConfigUpdateMode string `mapstructure:"configUpdateMode,omitempty"`
// Topologies specifies per-partition topology configuration
Expand Down Expand Up @@ -172,24 +180,27 @@ func isEmptySelector(sel *metav1.LabelSelector) bool {
}

func (eng *SlinkyEngine) GetComputeInstances(ctx context.Context, _ any) ([]topology.ComputeInstances, *httperr.Error) {

nodes, nodeMap, err := eng.getClusterNodes(ctx)
clusterNodes, err := eng.getClusterNodes(ctx)
if err != nil {
return nil, err
}

return getComputeInstances(nodes, nodeMap)
return getComputeInstances(clusterNodes.nodes, clusterNodes.nodeMap)
}

func (eng *SlinkyEngine) getClusterNodes(ctx context.Context) (*corev1.NodeList, map[string]string, *httperr.Error) {
// getClusterNodes returns the Kubernetes nodes selected for topology generation
// and a map from Kubernetes node name to Slurm node name. The mapping is built
// from Ready slurmd pods in the configured namespace and pod selector, using the
// slurm.node.name label when present and falling back to pod.spec.hostname.
func (eng *SlinkyEngine) getClusterNodes(ctx context.Context) (*clusterNodes, *httperr.Error) {
nodes, err := k8s.GetNodes(ctx, eng.client, eng.params.nodeListOpt)
if err != nil {
return nil, nil, httperr.NewError(http.StatusBadGateway, err.Error())
return nil, httperr.NewError(http.StatusBadGateway, err.Error())
}

pods, err := eng.client.CoreV1().Pods(eng.params.Namespace).List(ctx, *eng.params.podListOpt)
if err != nil {
return nil, nil, httperr.NewError(http.StatusBadGateway,
return nil, httperr.NewError(http.StatusBadGateway,
fmt.Sprintf("failed to list SLURM pods in the cluster: %v", err))
}

Expand All @@ -208,7 +219,10 @@ func (eng *SlinkyEngine) getClusterNodes(ctx context.Context) (*corev1.NodeList,
klog.V(4).Infof("Mapping k8s node %s to SLURM node %s", pod.Spec.NodeName, host)
nodeMap[pod.Spec.NodeName] = host
}
return nodes, nodeMap, nil
return &clusterNodes{
nodes: nodes,
nodeMap: nodeMap,
}, nil
}

func getComputeInstances(nodes *corev1.NodeList, nodeMap map[string]string) ([]topology.ComputeInstances, *httperr.Error) {
Expand Down Expand Up @@ -246,6 +260,64 @@ func getComputeInstances(nodes *corev1.NodeList, nodeMap map[string]string) ([]t
return cis, nil
}

func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*topology.Graph, *httperr.Error) {
domains := topology.NewDomainMap()
for _, node := range clusterNodes.nodes.Items {
slurmName, ok := clusterNodes.nodeMap[node.Name]
if !ok || slurmName == "" {
klog.V(4).Infof("Skipping node %s as it does not have a corresponding SLURM name", node.Name)
continue
}

gpuClique := strings.TrimSpace(node.Labels[topology.KeyNvidiaGPUClique])
if gpuClique == "" {
continue
}

instance, ok := node.Annotations[topology.KeyNodeInstance]
if !ok {
klog.Warningf("missing %q annotation in node %s", topology.KeyNodeInstance, node.Name)
continue
}

domains.AddHost(gpuClique, instance, slurmName)
Comment on lines +277 to +283

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Nodes with gpu.clique label but missing instance annotation are silently dropped

A node that carries nvidia.com/gpu.clique (meaning the GPU Operator confirmed NVLink fabric is complete) but is missing the topograph.nvidia.com/instance annotation is skipped with only a klog.Warningf. If a clique spans, say, 8 nodes and one is missing the annotation, the generated block will contain only 7 nodes — no error is surfaced to the caller. The existing len(domains) == 0 guard catches the fully-empty case, but a partially populated domain that silently excludes valid GPU nodes could confuse Slurm topology scheduling without any user-visible signal beyond a log line.

}

if len(domains) == 0 {
return nil, httperr.NewError(http.StatusBadGateway,
fmt.Sprintf("useGpuCliqueLabel=true but no matching nodes found; check label %q and annotation %q",
topology.KeyNvidiaGPUClique, topology.KeyNodeInstance))
}

if graph == nil {
graph = &topology.Graph{}
} else {
cloned := *graph
graph = &cloned
}
graph.Domains = domains

return graph, nil
}

// usesBlockTopology reports whether cfg asks for topology/block output, either
// through its top-level plugin or through at least one entry in cfg.Topologies.
// A nil cfg is treated as not using block topology.
func usesBlockTopology(cfg *translate.Config) bool {
	switch {
	case cfg == nil:
		return false
	case cfg.Plugin == topology.TopologyBlock:
		return true
	}

	// The top-level plugin is not block; look for any per-topology override.
	for _, entry := range cfg.Topologies {
		if entry == nil {
			continue
		}
		if entry.Plugin == topology.TopologyBlock {
			return true
		}
	}

	return false
}

// generateConfigMapAnnotations creates metadata annotations for ConfigMaps
func (eng *SlinkyEngine) generateConfigMapAnnotations() map[string]string {
annotations := map[string]string{
Expand Down Expand Up @@ -283,6 +355,27 @@ func (eng *SlinkyEngine) GenerateOutput(ctx context.Context, graph *topology.Gra
return nil, httperr.NewError(http.StatusInternalServerError, err.Error())
}

var clusterNodeData *clusterNodes
loadClusterNodes := func() (*clusterNodes, *httperr.Error) {
if clusterNodeData != nil {
return clusterNodeData, nil
}
var httpErr *httperr.Error
clusterNodeData, httpErr = eng.getClusterNodes(ctx)
return clusterNodeData, httpErr
}

if p.UseGPUCliqueLabel && usesBlockTopology(cfg) {
clusterNodeData, httpErr := loadClusterNodes()
if httpErr != nil {
return nil, httpErr
}
graph, httpErr = withGPUCliqueDomains(graph, clusterNodeData)
if httpErr != nil {
return nil, httpErr
}
}

nt, err := translate.NewNetworkTopology(graph, cfg)
if err != nil {
return nil, httperr.NewError(http.StatusBadRequest, err.Error())
Expand All @@ -306,7 +399,11 @@ func (eng *SlinkyEngine) GenerateOutput(ctx context.Context, graph *topology.Gra

// For dynamic mode, perform reconciliation using the latest topology information from the provider (root) and the cluster (nodes and their annotations)
if p.UseDynamicNodes {
httpErr := eng.performReconciliation(ctx, nt, topologies)
clusterNodeData, httpErr := loadClusterNodes()
if httpErr != nil {
return nil, httpErr
}
httpErr = eng.performReconciliation(ctx, nt, topologies, clusterNodeData)
if httpErr != nil {
return nil, httpErr
}
Expand Down Expand Up @@ -466,17 +563,11 @@ func (eng *SlinkyEngine) getPartitionNodes(ctx context.Context, partition string
return "", fmt.Errorf("no running pods with labels %v", labels)
}

func (eng *SlinkyEngine) performReconciliation(ctx context.Context, nt *translate.NetworkTopology, topologies []*translate.TopologyUnit) *httperr.Error {

nodes, nodeMap, err := eng.getClusterNodes(ctx)
if err != nil {
return err
}

func (eng *SlinkyEngine) performReconciliation(ctx context.Context, nt *translate.NetworkTopology, topologies []*translate.TopologyUnit, clusterNodes *clusterNodes) *httperr.Error {
// Update node annotations based on the desired topology and the current cluster state.
// This will trigger Slinky to reconfigure the nodes accordingly.
for _, node := range nodes.Items {
slurmName, ok := nodeMap[node.Name]
for _, node := range clusterNodes.nodes.Items {
slurmName, ok := clusterNodes.nodeMap[node.Name]
if !ok {
klog.V(4).Infof("Skipping node %s as it does not have a corresponding SLURM name", node.Name)
continue
Expand Down
Loading
Loading