Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rest-api/flow/docs/component-manager-architecture.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ type FirmwareStatusReader interface {

type BringUpController interface {
// CapabilityBringUpControl
BringUpControl(ctx, target) error
BringUpControl(ctx, target, info) error
}

type BringUpStatusReader interface {
Expand Down
24 changes: 15 additions & 9 deletions rest-api/flow/internal/converter/protobuf/converter.go
Original file line number Diff line number Diff line change
Expand Up @@ -1163,7 +1163,8 @@ func ScheduledOperationFrom(
}

return &operations.PowerControlTaskInfo{
Operation: operations.PowerOperationPowerOn,
Operation: operations.PowerOperationPowerOn,
OverrideAssignmentCheck: r.PowerOn.GetOverrideAssignmentCheck(),
}, ts, r.PowerOn.GetQueueOptions(), r.PowerOn.GetRuleId(), nil

case *pb.ScheduledOperation_PowerOff:
Expand All @@ -1180,8 +1181,9 @@ func ScheduledOperationFrom(
}

return &operations.PowerControlTaskInfo{
Operation: powerOp,
Forced: r.PowerOff.GetForced(),
Operation: powerOp,
Forced: r.PowerOff.GetForced(),
OverrideAssignmentCheck: r.PowerOff.GetOverrideAssignmentCheck(),
}, ts, r.PowerOff.GetQueueOptions(), r.PowerOff.GetRuleId(), nil

case *pb.ScheduledOperation_PowerReset:
Expand All @@ -1198,8 +1200,9 @@ func ScheduledOperationFrom(
}

return &operations.PowerControlTaskInfo{
Operation: powerOp,
Forced: r.PowerReset.GetForced(),
Operation: powerOp,
Forced: r.PowerReset.GetForced(),
OverrideAssignmentCheck: r.PowerReset.GetOverrideAssignmentCheck(),
}, ts, r.PowerReset.GetQueueOptions(), r.PowerReset.GetRuleId(), nil

case *pb.ScheduledOperation_BringUp:
Expand All @@ -1210,7 +1213,9 @@ func ScheduledOperationFrom(
)
}

return &operations.BringUpTaskInfo{}, ts, nil, r.BringUp.GetRuleId(), nil
return &operations.BringUpTaskInfo{
OverrideAssignmentCheck: r.BringUp.GetOverrideAssignmentCheck(),
}, ts, nil, r.BringUp.GetRuleId(), nil

case *pb.ScheduledOperation_Ingest:
ts, err := TargetSpecFrom(r.Ingest.GetTargetSpec())
Expand All @@ -1224,9 +1229,10 @@ func ScheduledOperationFrom(

case *pb.ScheduledOperation_UpgradeFirmware:
info := &operations.FirmwareControlTaskInfo{
Operation: operations.FirmwareOperationUpgrade,
TargetVersion: r.UpgradeFirmware.GetTargetVersion(),
SubTargets: r.UpgradeFirmware.GetSubTargets(),
Operation: operations.FirmwareOperationUpgrade,
TargetVersion: r.UpgradeFirmware.GetTargetVersion(),
SubTargets: r.UpgradeFirmware.GetSubTargets(),
OverrideAssignmentCheck: r.UpgradeFirmware.GetOverrideAssignmentCheck(),
}

if r.UpgradeFirmware.GetStartTime() != nil {
Expand Down
110 changes: 110 additions & 0 deletions rest-api/flow/internal/nicoapi/grpc.go
Original file line number Diff line number Diff line change
Expand Up @@ -276,6 +276,104 @@ func (c *grpcClient) FindMachinesByIds(ctx context.Context, machineIds []string)
return result, nil
}

// FindHostMachineIdsByRack queries Core for host machines (DPUs excluded) on
// the given rack and returns their machine IDs.
//
// An empty rackID is rejected with an error. The call is bounded by
// c.grpcTimeout. Machine IDs that come back empty are dropped. When the rack
// has no (non-empty) host machine IDs the result is nil, matching the contract
// documented on the Client interface.
func (c *grpcClient) FindHostMachineIdsByRack(ctx context.Context, rackID string) ([]string, error) {
	if rackID == "" {
		return nil, errors.New("rack ID is required")
	}

	ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout)
	defer cancel()

	cfg := &pb.MachineSearchConfig{
		RackId: &pb.RackId{Id: rackID},
		// include_dpus defaults to false; exclude_hosts defaults to false.
		// We want hosts only because Assigned is a host-only state.
	}

	res, err := c.gclient.FindMachineIds(ctx, cfg)
	if err != nil {
		return nil, fmt.Errorf("FindMachineIds for rack %s: %w", rackID, err)
	}

	machineIDs := res.GetMachineIds()
	if len(machineIDs) == 0 {
		// Nothing to convert; avoid allocating an empty slice.
		return nil, nil
	}

	ids := make([]string, 0, len(machineIDs))
	for _, mid := range machineIDs {
		if id := mid.GetId(); id != "" {
			ids = append(ids, id)
		}
	}
	if len(ids) == 0 {
		// Every returned ID was empty: report "no hosts" as nil rather than
		// as an empty, non-nil slice.
		return nil, nil
	}
	return ids, nil
}

// FindSwitchRackIDs asks Core for the given switches and returns a map from
// switch ID to rack ID.
//
// An empty switchIds slice short-circuits to (nil, nil) without issuing a
// request. The call is bounded by c.grpcTimeout. Entries whose switch ID or
// rack ID is empty in the response are left out of the map.
func (c *grpcClient) FindSwitchRackIDs(ctx context.Context, switchIds []string) (map[string]string, error) {
	if len(switchIds) == 0 {
		return nil, nil
	}

	ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout)
	defer cancel()

	wanted := make([]*pb.SwitchId, len(switchIds))
	for i, switchID := range switchIds {
		wanted[i] = &pb.SwitchId{Id: switchID}
	}

	resp, err := c.gclient.FindSwitchesByIds(ctx, &pb.SwitchesByIdsRequest{SwitchIds: wanted})
	if err != nil {
		return nil, fmt.Errorf("FindSwitchesByIds: %w", err)
	}

	switches := resp.GetSwitches()
	rackBySwitch := make(map[string]string, len(switches))
	for _, sw := range switches {
		switchID := sw.GetId().GetId()
		if switchID == "" {
			continue
		}
		rackID := sw.GetRackId().GetId()
		if rackID == "" {
			// No rack assignment: omit the switch instead of mapping it to "".
			continue
		}
		rackBySwitch[switchID] = rackID
	}
	return rackBySwitch, nil
}

// FindPowerShelfRackIDs asks Core for the given power shelves and returns a
// map from power-shelf ID to rack ID.
//
// An empty shelfIds slice short-circuits to (nil, nil) without issuing a
// request. The call is bounded by c.grpcTimeout. Entries whose shelf ID or
// rack ID is empty in the response are left out of the map.
func (c *grpcClient) FindPowerShelfRackIDs(ctx context.Context, shelfIds []string) (map[string]string, error) {
	if len(shelfIds) == 0 {
		return nil, nil
	}

	ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout)
	defer cancel()

	wanted := make([]*pb.PowerShelfId, len(shelfIds))
	for i, shelfID := range shelfIds {
		wanted[i] = &pb.PowerShelfId{Id: shelfID}
	}

	resp, err := c.gclient.FindPowerShelvesByIds(ctx, &pb.PowerShelvesByIdsRequest{PowerShelfIds: wanted})
	if err != nil {
		return nil, fmt.Errorf("FindPowerShelvesByIds: %w", err)
	}

	shelves := resp.GetPowerShelves()
	rackByShelf := make(map[string]string, len(shelves))
	for _, shelf := range shelves {
		shelfID := shelf.GetId().GetId()
		if shelfID == "" {
			continue
		}
		rackID := shelf.GetRackId().GetId()
		if rackID == "" {
			// No rack assignment: omit the shelf instead of mapping it to "".
			continue
		}
		rackByShelf[shelfID] = rackID
	}
	return rackByShelf, nil
}

// GetMachinePositionInfo returns position information for the given machine IDs
func (c *grpcClient) GetMachinePositionInfo(ctx context.Context, machineIds []string) ([]MachinePosition, error) {
ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout)
Expand Down Expand Up @@ -648,3 +746,15 @@ func (c *grpcClient) SetLeakingMachineIds(ids []string) {
func (c *grpcClient) SetLeakingSwitchIds(ids []string) {
panic("Not a unit test")
}

// SetSwitchRackID exists only so grpcClient provides the same method set as
// the mock client. It is a test-fixture hook: calling it on the gRPC-backed
// client always panics.
func (c *grpcClient) SetSwitchRackID(switchID, rackID string) {
panic("Not a unit test")
}

// SetPowerShelfRackID exists only so grpcClient provides the same method set
// as the mock client. It is a test-fixture hook: calling it on the gRPC-backed
// client always panics.
func (c *grpcClient) SetPowerShelfRackID(shelfID, rackID string) {
panic("Not a unit test")
}

// SetRackHostMachineIDs exists only so grpcClient provides the same method set
// as the mock client. It is a test-fixture hook: calling it on the gRPC-backed
// client always panics.
func (c *grpcClient) SetRackHostMachineIDs(rackID string, machineIDs []string) {
panic("Not a unit test")
}
73 changes: 69 additions & 4 deletions rest-api/flow/internal/nicoapi/mock.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ package nicoapi

import (
"context"
"errors"
"time"

"github.com/NVIDIA/infra-controller-rest/flow/internal/common/utils"
Expand All @@ -21,15 +22,23 @@ type mockClient struct {
firmwareUpdateTimeWindowErr error // If set, SetFirmwareUpdateTimeWindow will return this error
adminPowerControlErr error // If set, AdminPowerControl will return this error
desiredFirmwareVersions []*pb.DesiredFirmwareVersionEntry
// Topology lookups exercised by the rack-assignment safety check. Tests
// populate these via the SetSwitchRackID, SetPowerShelfRackID, and
// SetRackHostMachineIDs helpers.
switchRackIDs map[string]string // switch ID → rack ID
powerShelfRackIDs map[string]string // power shelf ID → rack ID
hostMachinesByRackID map[string][]string
}

// NewMockClient returns a "GRPC" client that returns mock values so it can be used in unit tests.
func NewMockClient() Client {
return &mockClient{
machines: map[string]MachineDetail{},
powerStates: map[string]PowerState{},
machineInterfaces: map[string]MachineInterface{},
expectedSwitches: map[string]ExpectedSwitchInfo{},
machines: map[string]MachineDetail{},
powerStates: map[string]PowerState{},
machineInterfaces: map[string]MachineInterface{},
expectedSwitches: map[string]ExpectedSwitchInfo{},
switchRackIDs: map[string]string{},
powerShelfRackIDs: map[string]string{},
hostMachinesByRackID: map[string][]string{},
}
}

Expand Down Expand Up @@ -121,6 +130,62 @@ func (c *mockClient) FindMachinesByIds(ctx context.Context, machineIds []string)
return result, nil
}

// FindHostMachineIdsByRack returns a copy of the host machine IDs recorded for
// rackID. An empty rackID is rejected with an error; a rack with no recorded
// machines yields (nil, nil). The context argument is ignored.
func (c *mockClient) FindHostMachineIdsByRack(_ context.Context, rackID string) ([]string, error) {
	if rackID == "" {
		return nil, errors.New("rack ID is required")
	}

	stored := c.hostMachinesByRackID[rackID]
	if len(stored) == 0 {
		return nil, nil
	}

	// Hand back a fresh slice so callers cannot mutate the mock's state.
	return append(make([]string, 0, len(stored)), stored...), nil
}

// FindSwitchRackIDs returns the recorded rack ID for each requested switch.
// Switches with no recorded rack, or with an empty rack ID, are left out of
// the result. Empty input yields (nil, nil). The context argument is ignored.
func (c *mockClient) FindSwitchRackIDs(_ context.Context, switchIds []string) (map[string]string, error) {
	if len(switchIds) == 0 {
		return nil, nil
	}

	racks := make(map[string]string, len(switchIds))
	for _, switchID := range switchIds {
		// A missing key reads as "", so one check covers both "unknown
		// switch" and "known switch without a rack".
		rackID := c.switchRackIDs[switchID]
		if rackID == "" {
			continue
		}
		racks[switchID] = rackID
	}
	return racks, nil
}

// FindPowerShelfRackIDs returns the recorded rack ID for each requested power
// shelf. Shelves with no recorded rack, or with an empty rack ID, are left out
// of the result. Empty input yields (nil, nil). The context argument is
// ignored.
func (c *mockClient) FindPowerShelfRackIDs(_ context.Context, shelfIds []string) (map[string]string, error) {
	if len(shelfIds) == 0 {
		return nil, nil
	}

	racks := make(map[string]string, len(shelfIds))
	for _, shelfID := range shelfIds {
		// A missing key reads as "", so one check covers both "unknown
		// shelf" and "known shelf without a rack".
		rackID := c.powerShelfRackIDs[shelfID]
		if rackID == "" {
			continue
		}
		racks[shelfID] = rackID
	}
	return racks, nil
}

// SetSwitchRackID records the rack assignment for a switch (mock only).
// A later call for the same switchID overwrites the earlier value. Storing an
// empty rackID makes FindSwitchRackIDs omit the switch, since that lookup
// skips empty rack IDs.
func (c *mockClient) SetSwitchRackID(switchID, rackID string) {
c.switchRackIDs[switchID] = rackID
}

// SetPowerShelfRackID records the rack assignment for a power shelf (mock
// only). A later call for the same shelfID overwrites the earlier value.
// Storing an empty rackID makes FindPowerShelfRackIDs omit the shelf, since
// that lookup skips empty rack IDs.
func (c *mockClient) SetPowerShelfRackID(shelfID, rackID string) {
c.powerShelfRackIDs[shelfID] = rackID
}

// SetRackHostMachineIDs records which host machines a rack contains (mock
// only). The IDs are copied, so later changes to the caller's slice do not
// affect the mock. Any previous list for rackID is replaced.
func (c *mockClient) SetRackHostMachineIDs(rackID string, machineIDs []string) {
	snapshot := append(make([]string, 0, len(machineIDs)), machineIDs...)
	c.hostMachinesByRackID[rackID] = snapshot
}

func (c *mockClient) GetMachinePositionInfo(ctx context.Context, machineIds []string) ([]MachinePosition, error) {
// Mock implementation returns empty for now
return nil, nil
Expand Down
18 changes: 18 additions & 0 deletions rest-api/flow/internal/nicoapi/mod.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,21 @@ type Client interface {
// FindMachinesByIds returns detailed machine information for the given machine IDs
FindMachinesByIds(ctx context.Context, machineIds []string) ([]MachineDetail, error)

// FindHostMachineIdsByRack returns the IDs of host (non-DPU) machines that
// belong to the given rack. Empty rackID is rejected. Returns nil when the
// rack has no host machines.
FindHostMachineIdsByRack(ctx context.Context, rackID string) ([]string, error)

// FindSwitchRackIDs returns the mapping from switch ID to rack ID for the
// given switches. A switch without a rack assignment is omitted from the
// result rather than reported as an empty string.
FindSwitchRackIDs(ctx context.Context, switchIds []string) (map[string]string, error)

// FindPowerShelfRackIDs returns the mapping from power-shelf ID to rack ID
// for the given shelves. A shelf without a rack assignment is omitted from
// the result rather than reported as an empty string.
FindPowerShelfRackIDs(ctx context.Context, shelfIds []string) (map[string]string, error)

// GetMachinePositionInfo returns position information for the given machine IDs
GetMachinePositionInfo(ctx context.Context, machineIds []string) ([]MachinePosition, error)

Expand Down Expand Up @@ -114,4 +129,7 @@ type Client interface {
AddExpectedSwitchInfo(info ExpectedSwitchInfo)
SetLeakingMachineIds(ids []string)
SetLeakingSwitchIds([]string)
SetSwitchRackID(switchID, rackID string)
SetPowerShelfRackID(shelfID, rackID string)
SetRackHostMachineIDs(rackID string, machineIDs []string)
}
25 changes: 15 additions & 10 deletions rest-api/flow/internal/service/server_impl.go
Original file line number Diff line number Diff line change
Expand Up @@ -650,7 +650,8 @@ func (rs *FlowServerImpl) PowerOnRack(
req.GetQueueOptions(),
req.GetRuleId(),
&operations.PowerControlTaskInfo{
Operation: operations.PowerOperationPowerOn,
Operation: operations.PowerOperationPowerOn,
OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(),
},
)
}
Expand All @@ -670,8 +671,9 @@ func (rs *FlowServerImpl) PowerOffRack(
req.GetQueueOptions(),
req.GetRuleId(),
&operations.PowerControlTaskInfo{
Operation: op,
Forced: req.GetForced(),
Operation: op,
Forced: req.GetForced(),
OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(),
},
)
}
Expand All @@ -691,8 +693,9 @@ func (rs *FlowServerImpl) PowerResetRack(
req.GetQueueOptions(),
req.GetRuleId(),
&operations.PowerControlTaskInfo{
Operation: op,
Forced: req.GetForced(),
Operation: op,
Forced: req.GetForced(),
OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(),
},
)
}
Expand All @@ -715,7 +718,8 @@ func (rs *FlowServerImpl) BringUpRack(
}

info := &operations.BringUpTaskInfo{
RuleID: protobuf.UUIDStringFrom(req.GetRuleId()),
RuleID: protobuf.UUIDStringFrom(req.GetRuleId()),
OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(),
}
opReq, err := rs.convertTargetSpecToOperationRequest(
targetSpec, req.GetDescription(), info,
Expand Down Expand Up @@ -1233,10 +1237,11 @@ func (rs *FlowServerImpl) UpgradeFirmware(

// Build FirmwareControlTaskInfo
info := &operations.FirmwareControlTaskInfo{
Operation: operations.FirmwareOperationUpgrade,
TargetVersion: req.GetTargetVersion(),
RuleID: protobuf.UUIDStringFrom(req.GetRuleId()),
SubTargets: req.GetSubTargets(),
Operation: operations.FirmwareOperationUpgrade,
TargetVersion: req.GetTargetVersion(),
RuleID: protobuf.UUIDStringFrom(req.GetRuleId()),
SubTargets: req.GetSubTargets(),
OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(),
}

// Parse optional time parameters for scheduled upgrade
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -445,7 +445,7 @@ func TestServiceCatalog(t *testing.T) {
name: "nvswitch nvswitchmanager",
componentType: devicetypes.ComponentTypeNVSwitch,
implementation: nvswitchnsm.ImplementationName,
requiredProviders: []string{nsmprovider.ProviderName},
requiredProviders: []string{nsmprovider.ProviderName, nicoprovider.ProviderName},
capabilities: capability.CapabilitySet{
capability.CapabilityFirmwareControl,
capability.CapabilityFirmwareStatus,
Expand Down
Loading
Loading