From 82356adcb4406bd9eb2895220340ac9b3bcf0dd4 Mon Sep 17 00:00:00 2001 From: Kun Zhao Date: Tue, 2 Jun 2026 11:06:27 -0700 Subject: [PATCH] feat: Gate Flow power/firmware ops on host assignment state Power control, firmware update, and bring-up power-cycle the host or the rack's fabric/power feed, and are unsafe while any target host is still attached to a tenant Instance. Add a per-Manager policy gate (ensureMachinesOperable for compute, ensureRackOperable for nvswitch/powershelf) that polls Core at a fixed interval and fails with a clear error after a 30-minute timeout if any relevant host stays in ManagedHostState::Assigned/*. The gate delegates to a new AssignmentChecker primitive in the nicoprovider package. The NSM-based NVSwitch manager now also requires the nico provider so the rack check can run. Three nicoapi.Client methods are added for the rack-scope check: FindHostMachineIdsByRack, FindSwitchRackIDs, FindPowerShelfRackIDs - all using existing Core gRPC surface. Switches and power shelves not yet associated with a rack are logged and skipped rather than blocked. An override_assignment_check bool is added to the five disruptive request messages (UpgradeFirmware, PowerOn/Off/Reset Rack, BringUpRack) and plumbed end-to-end. When set, the gate logs a warning and skips the AssignmentChecker. Defaults to false. Signed-off-by: Kun Zhao --- .../docs/component-manager-architecture.md | 2 +- .../internal/converter/protobuf/converter.go | 24 +- rest-api/flow/internal/nicoapi/grpc.go | 110 ++++++++ rest-api/flow/internal/nicoapi/mock.go | 73 +++++- rest-api/flow/internal/nicoapi/mod.go | 18 ++ rest-api/flow/internal/service/server_impl.go | 25 +- .../componentmanager/builtin/builtin_test.go | 2 +- .../componentmanager/compute/nico/nico.go | 56 +++++ .../compute/nico/nico_test.go | 123 +++++++++ .../task/componentmanager/mock/mock.go | 5 +- .../componentmanager/nvswitch/nico/nico.go | 70 ++++++ .../nvswitch/nico/nico_test.go | 99 ++++++++ .../nvswitchmanager/nvswitchmanager.go | 97 +++++++- .../task/componentmanager/operations.go | 7 +- .../componentmanager/powershelf/nico/nico.go | 73 +++++- .../powershelf/nico/nico_test.go | 99 ++++++++ .../providers/nico/assignment.go | 235 ++++++++++++++++++ .../providers/nico/assignment_test.go | 144 +++++++++++ .../temporalworkflow/activity/activity.go | 7 +- .../activity/activity_test.go | 4 +- .../temporalworkflow/workflow/actions.go | 80 +++++- .../temporalworkflow/workflow/actions_test.go | 97 ++++++++ .../temporalworkflow/workflow/bringup_test.go | 4 +- .../workflow/child_workflow_test.go | 4 +- .../internal/task/operations/operations.go | 21 ++ rest-api/flow/pkg/proto/v1/flow.pb.go | 156 ++++++++---- rest-api/flow/proto/v1/flow.proto | 32 +++ 27 files changed, 1580 insertions(+), 87 deletions(-) create mode 100644 rest-api/flow/internal/task/componentmanager/providers/nico/assignment.go create mode 100644 rest-api/flow/internal/task/componentmanager/providers/nico/assignment_test.go create mode 100644 rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions_test.go diff --git a/rest-api/flow/docs/component-manager-architecture.md b/rest-api/flow/docs/component-manager-architecture.md index 6487238b81..4fa85ab71d 100644 --- a/rest-api/flow/docs/component-manager-architecture.md +++ b/rest-api/flow/docs/component-manager-architecture.md @@ -101,7 +101,7 @@ type FirmwareStatusReader interface { type BringUpController interface { // CapabilityBringUpControl - BringUpControl(ctx, target) error + BringUpControl(ctx, target, info) error } type BringUpStatusReader interface { diff --git a/rest-api/flow/internal/converter/protobuf/converter.go b/rest-api/flow/internal/converter/protobuf/converter.go index 269058be55..6cec9ab0a9 100644 --- a/rest-api/flow/internal/converter/protobuf/converter.go +++ b/rest-api/flow/internal/converter/protobuf/converter.go @@ -1163,7 +1163,8 @@ func ScheduledOperationFrom( } return &operations.PowerControlTaskInfo{ - Operation: operations.PowerOperationPowerOn, + Operation: operations.PowerOperationPowerOn, + OverrideAssignmentCheck: r.PowerOn.GetOverrideAssignmentCheck(), }, ts, r.PowerOn.GetQueueOptions(), r.PowerOn.GetRuleId(), nil case *pb.ScheduledOperation_PowerOff: @@ -1180,8 +1181,9 @@ func ScheduledOperationFrom( } return &operations.PowerControlTaskInfo{ - Operation: powerOp, - Forced: r.PowerOff.GetForced(), + Operation: powerOp, + Forced: r.PowerOff.GetForced(), + OverrideAssignmentCheck: r.PowerOff.GetOverrideAssignmentCheck(), }, ts, r.PowerOff.GetQueueOptions(), r.PowerOff.GetRuleId(), nil case *pb.ScheduledOperation_PowerReset: @@ -1198,8 +1200,9 @@ func ScheduledOperationFrom( } return &operations.PowerControlTaskInfo{ - Operation: powerOp, - Forced: r.PowerReset.GetForced(), + Operation: powerOp, + Forced: r.PowerReset.GetForced(), + OverrideAssignmentCheck: r.PowerReset.GetOverrideAssignmentCheck(), }, ts, r.PowerReset.GetQueueOptions(), r.PowerReset.GetRuleId(), nil case *pb.ScheduledOperation_BringUp: @@ -1210,7 +1213,9 @@ func ScheduledOperationFrom( ) } - return &operations.BringUpTaskInfo{}, ts, nil, r.BringUp.GetRuleId(), nil + return &operations.BringUpTaskInfo{ + OverrideAssignmentCheck: r.BringUp.GetOverrideAssignmentCheck(), + }, ts, nil, r.BringUp.GetRuleId(), nil case *pb.ScheduledOperation_Ingest: ts, err := TargetSpecFrom(r.Ingest.GetTargetSpec()) @@ -1224,9 +1229,10 @@ func ScheduledOperationFrom( case *pb.ScheduledOperation_UpgradeFirmware: info := &operations.FirmwareControlTaskInfo{ - Operation: operations.FirmwareOperationUpgrade, - TargetVersion: r.UpgradeFirmware.GetTargetVersion(), - SubTargets: r.UpgradeFirmware.GetSubTargets(), + Operation: operations.FirmwareOperationUpgrade, + TargetVersion: r.UpgradeFirmware.GetTargetVersion(), + SubTargets: r.UpgradeFirmware.GetSubTargets(), + OverrideAssignmentCheck: r.UpgradeFirmware.GetOverrideAssignmentCheck(), } if r.UpgradeFirmware.GetStartTime() != nil { diff --git a/rest-api/flow/internal/nicoapi/grpc.go b/rest-api/flow/internal/nicoapi/grpc.go index b0018d1a29..adcc82bb00 100644 --- a/rest-api/flow/internal/nicoapi/grpc.go +++ b/rest-api/flow/internal/nicoapi/grpc.go @@ -276,6 +276,104 @@ func (c *grpcClient) FindMachinesByIds(ctx context.Context, machineIds []string) return result, nil } +// FindHostMachineIdsByRack queries Core for host machines (DPUs excluded) on +// the given rack and returns their machine IDs. +func (c *grpcClient) FindHostMachineIdsByRack(ctx context.Context, rackID string) ([]string, error) { + if rackID == "" { + return nil, errors.New("rack ID is required") + } + + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + cfg := &pb.MachineSearchConfig{ + RackId: &pb.RackId{Id: rackID}, + // include_dpus defaults to false; exclude_hosts defaults to false. + // We want hosts only because Assigned is a host-only state. + } + + res, err := c.gclient.FindMachineIds(ctx, cfg) + if err != nil { + return nil, fmt.Errorf("FindMachineIds for rack %s: %w", rackID, err) + } + + ids := make([]string, 0, len(res.GetMachineIds())) + for _, mid := range res.GetMachineIds() { + if id := mid.GetId(); id != "" { + ids = append(ids, id) + } + } + return ids, nil +} + +// FindSwitchRackIDs returns the rack assignment of each given switch. +func (c *grpcClient) FindSwitchRackIDs(ctx context.Context, switchIds []string) (map[string]string, error) { + if len(switchIds) == 0 { + return nil, nil + } + + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + req := &pb.SwitchesByIdsRequest{ + SwitchIds: make([]*pb.SwitchId, 0, len(switchIds)), + } + for _, id := range switchIds { + req.SwitchIds = append(req.SwitchIds, &pb.SwitchId{Id: id}) + } + + resp, err := c.gclient.FindSwitchesByIds(ctx, req) + if err != nil { + return nil, fmt.Errorf("FindSwitchesByIds: %w", err) + } + + result := make(map[string]string, len(resp.GetSwitches())) + for _, sw := range resp.GetSwitches() { + sid := sw.GetId().GetId() + if sid == "" { + continue + } + if rid := sw.GetRackId().GetId(); rid != "" { + result[sid] = rid + } + } + return result, nil +} + +// FindPowerShelfRackIDs returns the rack assignment of each given power shelf. +func (c *grpcClient) FindPowerShelfRackIDs(ctx context.Context, shelfIds []string) (map[string]string, error) { + if len(shelfIds) == 0 { + return nil, nil + } + + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + req := &pb.PowerShelvesByIdsRequest{ + PowerShelfIds: make([]*pb.PowerShelfId, 0, len(shelfIds)), + } + for _, id := range shelfIds { + req.PowerShelfIds = append(req.PowerShelfIds, &pb.PowerShelfId{Id: id}) + } + + resp, err := c.gclient.FindPowerShelvesByIds(ctx, req) + if err != nil { + return nil, fmt.Errorf("FindPowerShelvesByIds: %w", err) + } + + result := make(map[string]string, len(resp.GetPowerShelves())) + for _, ps := range resp.GetPowerShelves() { + pid := ps.GetId().GetId() + if pid == "" { + continue + } + if rid := ps.GetRackId().GetId(); rid != "" { + result[pid] = rid + } + } + return result, nil +} + // GetMachinePositionInfo returns position information for the given machine IDs func (c *grpcClient) GetMachinePositionInfo(ctx context.Context, machineIds []string) ([]MachinePosition, error) { ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) @@ -648,3 +746,15 @@ func (c *grpcClient) SetLeakingMachineIds(ids []string) { func (c *grpcClient) SetLeakingSwitchIds(ids []string) { panic("Not a unit test") } + +func (c *grpcClient) SetSwitchRackID(switchID, rackID string) { + panic("Not a unit test") +} + +func (c *grpcClient) SetPowerShelfRackID(shelfID, rackID string) { + panic("Not a unit test") +} + +func (c *grpcClient) SetRackHostMachineIDs(rackID string, machineIDs []string) { + panic("Not a unit test") +} diff --git a/rest-api/flow/internal/nicoapi/mock.go b/rest-api/flow/internal/nicoapi/mock.go index b05ae3a407..b96089708a 100644 --- a/rest-api/flow/internal/nicoapi/mock.go +++ b/rest-api/flow/internal/nicoapi/mock.go @@ -5,6 +5,7 @@ package nicoapi import ( "context" + "errors" "time" "github.com/NVIDIA/infra-controller-rest/flow/internal/common/utils" @@ -21,15 +22,23 @@ type mockClient struct { firmwareUpdateTimeWindowErr error // If set, SetFirmwareUpdateTimeWindow will return this error adminPowerControlErr error // If set, AdminPowerControl will return this error desiredFirmwareVersions []*pb.DesiredFirmwareVersionEntry + // Topology lookups exercised by the rack-assignment safety check. Tests + // populate these via Set...RackId / Set...HostMachineIds helpers. + switchRackIDs map[string]string // switch ID → rack ID + powerShelfRackIDs map[string]string // power shelf ID → rack ID + hostMachinesByRackID map[string][]string } // NewMockClient returns a "GRPC" client that returns mock values so it can be used in unit tests. func NewMockClient() Client { return &mockClient{ - machines: map[string]MachineDetail{}, - powerStates: map[string]PowerState{}, - machineInterfaces: map[string]MachineInterface{}, - expectedSwitches: map[string]ExpectedSwitchInfo{}, + machines: map[string]MachineDetail{}, + powerStates: map[string]PowerState{}, + machineInterfaces: map[string]MachineInterface{}, + expectedSwitches: map[string]ExpectedSwitchInfo{}, + switchRackIDs: map[string]string{}, + powerShelfRackIDs: map[string]string{}, + hostMachinesByRackID: map[string][]string{}, } } @@ -121,6 +130,62 @@ func (c *mockClient) FindMachinesByIds(ctx context.Context, machineIds []string) return result, nil } +func (c *mockClient) FindHostMachineIdsByRack(_ context.Context, rackID string) ([]string, error) { + if rackID == "" { + return nil, errors.New("rack ID is required") + } + ids := c.hostMachinesByRackID[rackID] + if len(ids) == 0 { + return nil, nil + } + out := make([]string, len(ids)) + copy(out, ids) + return out, nil +} + +func (c *mockClient) FindSwitchRackIDs(_ context.Context, switchIds []string) (map[string]string, error) { + if len(switchIds) == 0 { + return nil, nil + } + out := make(map[string]string, len(switchIds)) + for _, id := range switchIds { + if rid, ok := c.switchRackIDs[id]; ok && rid != "" { + out[id] = rid + } + } + return out, nil +} + +func (c *mockClient) FindPowerShelfRackIDs(_ context.Context, shelfIds []string) (map[string]string, error) { + if len(shelfIds) == 0 { + return nil, nil + } + out := make(map[string]string, len(shelfIds)) + for _, id := range shelfIds { + if rid, ok := c.powerShelfRackIDs[id]; ok && rid != "" { + out[id] = rid + } + } + return out, nil +} + +// SetSwitchRackID records the rack assignment for a switch (mock only). +func (c *mockClient) SetSwitchRackID(switchID, rackID string) { + c.switchRackIDs[switchID] = rackID +} + +// SetPowerShelfRackID records the rack assignment for a power shelf (mock only). +func (c *mockClient) SetPowerShelfRackID(shelfID, rackID string) { + c.powerShelfRackIDs[shelfID] = rackID +} + +// SetRackHostMachineIDs records which host machines a rack contains (mock only). +func (c *mockClient) SetRackHostMachineIDs(rackID string, machineIDs []string) { + out := make([]string, len(machineIDs)) + copy(out, machineIDs) + c.hostMachinesByRackID[rackID] = out +} + func (c *mockClient) GetMachinePositionInfo(ctx context.Context, machineIds []string) ([]MachinePosition, error) { // Mock implementation returns empty for now return nil, nil diff --git a/rest-api/flow/internal/nicoapi/mod.go b/rest-api/flow/internal/nicoapi/mod.go index e39e887556..5f10731714 100644 --- a/rest-api/flow/internal/nicoapi/mod.go +++ b/rest-api/flow/internal/nicoapi/mod.go @@ -35,6 +35,21 @@ type Client interface { // FindMachinesByIds returns detailed machine information for the given machine IDs FindMachinesByIds(ctx context.Context, machineIds []string) ([]MachineDetail, error) + // FindHostMachineIdsByRack returns the IDs of host (non-DPU) machines that + // belong to the given rack. Empty rackID is rejected. Returns nil when the + // rack has no host machines. + FindHostMachineIdsByRack(ctx context.Context, rackID string) ([]string, error) + + // FindSwitchRackIDs returns the mapping from switch ID to rack ID for the + // given switches. A switch without a rack assignment is omitted from the + // result rather than reported as an empty string. + FindSwitchRackIDs(ctx context.Context, switchIds []string) (map[string]string, error) + + // FindPowerShelfRackIDs returns the mapping from power-shelf ID to rack ID + // for the given shelves. A shelf without a rack assignment is omitted from + // the result rather than reported as an empty string. + FindPowerShelfRackIDs(ctx context.Context, shelfIds []string) (map[string]string, error) + // GetMachinePositionInfo returns position information for the given machine IDs GetMachinePositionInfo(ctx context.Context, machineIds []string) ([]MachinePosition, error) @@ -114,4 +129,7 @@ type Client interface { AddExpectedSwitchInfo(info ExpectedSwitchInfo) SetLeakingMachineIds(ids []string) SetLeakingSwitchIds([]string) + SetSwitchRackID(switchID, rackID string) + SetPowerShelfRackID(shelfID, rackID string) + SetRackHostMachineIDs(rackID string, machineIDs []string) } diff --git a/rest-api/flow/internal/service/server_impl.go b/rest-api/flow/internal/service/server_impl.go index fc978e3e5c..10314bdc7f 100644 --- a/rest-api/flow/internal/service/server_impl.go +++ b/rest-api/flow/internal/service/server_impl.go @@ -650,7 +650,8 @@ func (rs *FlowServerImpl) PowerOnRack( req.GetQueueOptions(), req.GetRuleId(), &operations.PowerControlTaskInfo{ - Operation: operations.PowerOperationPowerOn, + Operation: operations.PowerOperationPowerOn, + OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(), }, ) } @@ -670,8 +671,9 @@ func (rs *FlowServerImpl) PowerOffRack( req.GetQueueOptions(), req.GetRuleId(), &operations.PowerControlTaskInfo{ - Operation: op, - Forced: req.GetForced(), + Operation: op, + Forced: req.GetForced(), + OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(), }, ) } @@ -691,8 +693,9 @@ func (rs *FlowServerImpl) PowerResetRack( req.GetQueueOptions(), req.GetRuleId(), &operations.PowerControlTaskInfo{ - Operation: op, - Forced: req.GetForced(), + Operation: op, + Forced: req.GetForced(), + OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(), }, ) } @@ -715,7 +718,8 @@ func (rs *FlowServerImpl) BringUpRack( } info := &operations.BringUpTaskInfo{ - RuleID: protobuf.UUIDStringFrom(req.GetRuleId()), + RuleID: protobuf.UUIDStringFrom(req.GetRuleId()), + OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(), } opReq, err := rs.convertTargetSpecToOperationRequest( targetSpec, req.GetDescription(), info, @@ -1233,10 +1237,11 @@ func (rs *FlowServerImpl) UpgradeFirmware( // Build FirmwareControlTaskInfo info := &operations.FirmwareControlTaskInfo{ - Operation: operations.FirmwareOperationUpgrade, - TargetVersion: req.GetTargetVersion(), - RuleID: protobuf.UUIDStringFrom(req.GetRuleId()), - SubTargets: req.GetSubTargets(), + Operation: operations.FirmwareOperationUpgrade, + TargetVersion: req.GetTargetVersion(), + RuleID: protobuf.UUIDStringFrom(req.GetRuleId()), + SubTargets: req.GetSubTargets(), + OverrideAssignmentCheck: req.GetOverrideAssignmentCheck(), } // Parse optional time parameters for scheduled upgrade diff --git a/rest-api/flow/internal/task/componentmanager/builtin/builtin_test.go b/rest-api/flow/internal/task/componentmanager/builtin/builtin_test.go index 9918270a6b..b37605a3f1 100644 --- a/rest-api/flow/internal/task/componentmanager/builtin/builtin_test.go +++ b/rest-api/flow/internal/task/componentmanager/builtin/builtin_test.go @@ -445,7 +445,7 @@ func TestServiceCatalog(t *testing.T) { name: "nvswitch nvswitchmanager", componentType: devicetypes.ComponentTypeNVSwitch, implementation: nvswitchnsm.ImplementationName, - requiredProviders: []string{nsmprovider.ProviderName}, + requiredProviders: []string{nsmprovider.ProviderName, nicoprovider.ProviderName}, capabilities: capability.CapabilitySet{ capability.CapabilityFirmwareControl, capability.CapabilityFirmwareStatus, diff --git a/rest-api/flow/internal/task/componentmanager/compute/nico/nico.go b/rest-api/flow/internal/task/componentmanager/compute/nico/nico.go index 2e8851559b..07509d93e6 100644 --- a/rest-api/flow/internal/task/componentmanager/compute/nico/nico.go +++ b/rest-api/flow/internal/task/componentmanager/compute/nico/nico.go @@ -42,6 +42,9 @@ type Manager struct { // avoid overwhelming the power delivery system when commanding // multiple compute trays. 0 means no delay. powerDelay time.Duration + // assignment guards mutating operations from running while any target + // machine still has an instance attached (ManagedHostState::Assigned). + assignment *nicoprovider.AssignmentChecker } // New creates a new NICo-based compute Manager instance. @@ -49,6 +52,7 @@ func New(nicoClient nicoapi.Client, powerDelay time.Duration) *Manager { return &Manager{ nicoClient: nicoClient, powerDelay: powerDelay, + assignment: nicoprovider.NewAssignmentChecker(nicoClient, 0, 0), } } @@ -152,6 +156,15 @@ func (m *Manager) PowerControl( return fmt.Errorf("target is invalid: %w", err) } + // Refuse to power-cycle a host that is currently attached to an + // instance. The poll blocks until Core reports the host has left the + // Assigned state, or returns an error at the deadline. The operator + // may set OverrideAssignmentCheck to bypass this gate for supervised + // maintenance; the bypass is logged inside ensureMachinesOperable. + if err := m.ensureMachinesOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + // Map common.PowerOperation to nicoapi.SystemPowerControl var action nicoapi.SystemPowerControl var desiredPowerState nicoapi.PowerState @@ -324,6 +337,15 @@ func (m *Manager) FirmwareControl(ctx context.Context, target common.Target, inf return fmt.Errorf("target is invalid: %w", err) } + // Block firmware upgrade while any target host is still attached to an + // instance: BMC/host firmware updates power-cycle the machine. The + // operator may set OverrideAssignmentCheck to bypass this gate for + // supervised maintenance; the bypass is logged inside + // ensureMachinesOperable. + if err := m.ensureMachinesOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + if len(info.SubTargets) > 0 { // SetFirmwareUpdateTimeWindow + SetMachineAutoUpdate (the path used // here for compute trays) do not expose per-sub-target selection; @@ -734,6 +756,7 @@ func (m *Manager) GetFirmwareStatus(ctx context.Context, target common.Target) ( func (m *Manager) BringUpControl( ctx context.Context, target common.Target, + info operations.BringUpTaskInfo, ) error { log.Debug(). Str("components", target.String()). @@ -747,6 +770,15 @@ func (m *Manager) BringUpControl( return fmt.Errorf("target is invalid: %w", err) } + // Opening the power-on gate can trigger an actual power transition, + // so the same assignment-state safety check that guards PowerControl + // applies here. OverrideAssignmentCheck propagates from the parent + // BringUp request when the operator elects to bypass; the bypass is + // logged inside ensureMachinesOperable. + if err := m.ensureMachinesOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + for _, componentID := range target.ComponentIDs { if err := m.nicoClient.AllowIngestionAndPowerOn( ctx, componentID, "", @@ -817,6 +849,30 @@ func nicoToBringUpState( } } +// ensureMachinesOperable is the per-Manager policy gate for disruptive +// operations on the given machines. The default policy refuses to proceed +// while any target host is still in Core's Assigned/* lifecycle state. +// +// When overrideAssignmentCheck is true the gate is short-circuited and the +// operation runs against the current set of machines unconditionally. The +// override is intended for operator-supervised maintenance windows where +// tenant impact has been acknowledged out-of-band; authorisation is +// enforced upstream and is not re-checked here. A warning is emitted with +// the machine IDs so the bypass is auditable from the worker log alone. +func (m *Manager) ensureMachinesOperable( + ctx context.Context, + machineIDs []string, + overrideAssignmentCheck bool, +) error { + if overrideAssignmentCheck { + log.Warn(). + Strs("machine_ids", machineIDs). + Msg("Assignment safety check bypassed by override_assignment_check on compute operation") + return nil + } + return m.assignment.WaitForMachinesUnassigned(ctx, machineIDs) +} + // isAlreadyInDesiredStateError returns true when NICo reports that the // power option is already set to the requested state (idempotent no-op). func isAlreadyInDesiredStateError(err error) bool { diff --git a/rest-api/flow/internal/task/componentmanager/compute/nico/nico_test.go b/rest-api/flow/internal/task/componentmanager/compute/nico/nico_test.go index 33d47c3827..6bb1489fbf 100644 --- a/rest-api/flow/internal/task/componentmanager/compute/nico/nico_test.go +++ b/rest-api/flow/internal/task/componentmanager/compute/nico/nico_test.go @@ -17,6 +17,7 @@ import ( "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi" pb "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi/gen" cmconfig "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/config" + nicoprovider "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/providers/nico" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/executor/temporalworkflow/common" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/operations" "github.com/NVIDIA/infra-controller-rest/flow/pkg/common/devicetypes" @@ -523,3 +524,125 @@ func TestAllFirmwareUpToDate(t *testing.T) { }) } } + +// newManagerForSafetyTest swaps the long default 30-minute assignment +// timeout for a tight one so the wait loop actually times out within the +// test budget. Tests in this file use the same package, so they can reach +// the unexported assignment field directly. +func newManagerForSafetyTest(t *testing.T, client nicoapi.Client) *Manager { + t.Helper() + m := New(client, 0) + m.assignment = nicoprovider.NewAssignmentChecker(client, 50*time.Millisecond, 10*time.Millisecond) + return m +} + +func TestPowerControl_RefusesAssignedMachine(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "machine-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeCompute, + ComponentIDs: []string{"machine-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOff, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") + assert.Contains(t, err.Error(), "machine-1") +} + +func TestPowerControl_AllowsUnassignedMachine(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "machine-1", State: "Ready"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeCompute, + ComponentIDs: []string{"machine-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOn, + }) + require.NoError(t, err) +} + +func TestBringUpControl_RefusesAssignedMachine(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "machine-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeCompute, + ComponentIDs: []string{"machine-1"}, + } + + err := m.BringUpControl(context.Background(), target, operations.BringUpTaskInfo{}) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") +} + +func TestFirmwareControl_RefusesAssignedMachine(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "machine-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeCompute, + ComponentIDs: []string{"machine-1"}, + } + + err := m.FirmwareControl(context.Background(), target, operations.FirmwareControlTaskInfo{ + Operation: operations.FirmwareOperationUpgrade, + TargetVersion: "1.0.0", + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") +} + +// TestPowerControl_OverrideBypassesAssignmentCheck verifies that the +// operator-controlled OverrideAssignmentCheck flag short-circuits the +// assignment-state gate on PowerControl. The target host is in +// Assigned/* — which would otherwise block the call — yet the operation +// is expected to proceed past the gate. PowerOperationPowerOn is chosen +// because the mock client accepts it without additional fixture setup. +func TestPowerControl_OverrideBypassesAssignmentCheck(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "machine-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeCompute, + ComponentIDs: []string{"machine-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOn, + OverrideAssignmentCheck: true, + }) + require.NoError(t, err) +} + +// TestBringUpControl_OverrideBypassesAssignmentCheck is the BringUp +// counterpart of TestPowerControl_OverrideBypassesAssignmentCheck. +func TestBringUpControl_OverrideBypassesAssignmentCheck(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "machine-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeCompute, + ComponentIDs: []string{"machine-1"}, + } + + err := m.BringUpControl(context.Background(), target, operations.BringUpTaskInfo{ + OverrideAssignmentCheck: true, + }) + require.NoError(t, err) +} diff --git a/rest-api/flow/internal/task/componentmanager/mock/mock.go b/rest-api/flow/internal/task/componentmanager/mock/mock.go index 92b99198c1..c4fc28f770 100644 --- a/rest-api/flow/internal/task/componentmanager/mock/mock.go +++ b/rest-api/flow/internal/task/componentmanager/mock/mock.go @@ -203,10 +203,13 @@ func (m *Manager) FirmwareControl( return nil } -// BringUpControl simulates opening the bring-up gate. +// BringUpControl simulates opening the bring-up gate. The info argument is +// accepted to satisfy the BringUpController interface; the mock ignores its +// contents. func (m *Manager) BringUpControl( ctx context.Context, target common.Target, + _ operations.BringUpTaskInfo, ) error { log.Debug(). Str("component_type", m.componentType.String()). diff --git a/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico.go b/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico.go index 868b673706..d5c9b5adb7 100644 --- a/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico.go +++ b/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico.go @@ -32,12 +32,18 @@ const ( // Manager manages NVLink switch components via the NICo API. type Manager struct { nicoClient nicoapi.Client + // assignment guards power/firmware operations on a switch from running + // while any host on the switch's rack is still attached to an instance. + // A switch reset typically disrupts NVLink traffic for the whole rack, + // so this safety check is rack-scoped rather than component-scoped. + assignment *nicoprovider.AssignmentChecker } // New creates a new NICo-based NVSwitch Manager instance. func New(nicoClient nicoapi.Client) *Manager { return &Manager{ nicoClient: nicoClient, + assignment: nicoprovider.NewAssignmentChecker(nicoClient, 0, 0), } } @@ -123,6 +129,62 @@ func switchIDsProto(ids []string) *pb.SwitchIdList { return &pb.SwitchIdList{Ids: pbIDs} } +// ensureRackOperable is the per-Manager policy gate for disruptive +// operations on the racks that own the given switches. The default policy +// refuses to proceed while any host on the resolved rack(s) is still in +// Core's Assigned/* lifecycle state, because a switch reset disrupts +// NVLink traffic for the entire rack. +// +// When overrideAssignmentCheck is true the gate is short-circuited +// without performing the rack lookup: the operator has acknowledged the +// tenant impact upstream and the rack-resolution gRPC call is no longer +// necessary. A warning is emitted with the switch IDs so the bypass is +// auditable from the worker log alone. +// +// A switch that Core does not associate with a rack is logged and +// skipped: failing closed would block bring-up flows for switches that +// have not yet been ingested into the rack topology. +func (m *Manager) ensureRackOperable( + ctx context.Context, + switchIDs []string, + overrideAssignmentCheck bool, +) error { + if len(switchIDs) == 0 { + return nil + } + + if overrideAssignmentCheck { + log.Warn(). + Strs("switch_ids", switchIDs). + Msg("Assignment safety check bypassed by override_assignment_check on NVSwitch operation") + return nil + } + + rackBySwitch, err := m.nicoClient.FindSwitchRackIDs(ctx, switchIDs) + if err != nil { + return fmt.Errorf("look up rack for switches: %w", err) + } + + rackIDs := make([]string, 0, len(rackBySwitch)) + for _, rid := range rackBySwitch { + rackIDs = append(rackIDs, rid) + } + + var orphan []string + for _, sid := range switchIDs { + if _, ok := rackBySwitch[sid]; !ok { + orphan = append(orphan, sid) + } + } + if len(orphan) > 0 { + log.Warn(). + Strs("switch_ids", orphan). + Msg("NVSwitch has no rack assignment; assignment safety check cannot be applied") + } + + return m.assignment.WaitForRacksUnassigned(ctx, rackIDs) +} + // PowerControl performs power operations on NVLink switches via NICo's // ComponentPowerControl RPC. func (m *Manager) PowerControl( @@ -140,6 +202,10 @@ func (m *Manager) PowerControl( return fmt.Errorf("target is invalid: %w", err) } + if err := m.ensureRackOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + var action pb.SystemPowerControl switch info.Operation { case operations.PowerOperationPowerOn, operations.PowerOperationForcePowerOn: @@ -247,6 +313,10 @@ func (m *Manager) FirmwareControl(ctx context.Context, target common.Target, inf return fmt.Errorf("target is invalid: %w", err) } + if err := m.ensureRackOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + subComponents, err := firmwarecomponents.ParseNICoNVSwitch(info.SubTargets) if err != nil { return err diff --git a/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico_test.go b/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico_test.go index 1665551ca0..5805fedf75 100644 --- a/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico_test.go +++ b/rest-api/flow/internal/task/componentmanager/nvswitch/nico/nico_test.go @@ -7,12 +7,14 @@ import ( "context" "encoding/json" "testing" + "time" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi" pb "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi/gen" + nicoprovider "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/providers/nico" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/executor/temporalworkflow/common" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/operations" "github.com/NVIDIA/infra-controller-rest/flow/pkg/common/devicetypes" @@ -202,6 +204,103 @@ func TestAggregateNICoStatuses(t *testing.T) { } } +// newManagerForSafetyTest swaps the long default 30-minute assignment +// timeout for a tight one so the wait loop actually times out within the +// test budget. Tests in this file use the same package, so they can reach +// the unexported assignment field directly. +func newManagerForSafetyTest(t *testing.T, client nicoapi.Client) *Manager { + t.Helper() + m := New(client) + m.assignment = nicoprovider.NewAssignmentChecker(client, 50*time.Millisecond, 10*time.Millisecond) + return m +} + +func TestPowerControl_RefusesWhenRackHostAssigned(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetSwitchRackID("sw-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeNVSwitch, + ComponentIDs: []string{"sw-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOff, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") + assert.Contains(t, err.Error(), "host-1") +} + +func TestPowerControl_AllowsWhenRackHostsReady(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetSwitchRackID("sw-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Ready"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeNVSwitch, + ComponentIDs: []string{"sw-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOn, + }) + require.NoError(t, err) +} + +func TestFirmwareControl_RefusesWhenRackHostAssigned(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetSwitchRackID("sw-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeNVSwitch, + ComponentIDs: []string{"sw-1"}, + } + + err := m.FirmwareControl(context.Background(), target, operations.FirmwareControlTaskInfo{ + Operation: operations.FirmwareOperationUpgrade, + TargetVersion: "1.0.0", + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") +} + +// TestPowerControl_OverrideBypassesRackAssignmentCheck verifies that +// OverrideAssignmentCheck short-circuits the rack-scoped gate on +// NVSwitch PowerControl. The host on the resolved rack is in +// Assigned/* — which would otherwise block the call — yet the +// operation is expected to proceed past the gate. The switch is left +// without an explicit rack mapping to confirm the override path skips +// the rack lookup entirely. +func TestPowerControl_OverrideBypassesRackAssignmentCheck(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetSwitchRackID("sw-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypeNVSwitch, + ComponentIDs: []string{"sw-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOn, + OverrideAssignmentCheck: true, + }) + require.NoError(t, err) +} + func mustMarshal(t *testing.T, v any) json.RawMessage { t.Helper() data, err := json.Marshal(v) diff --git a/rest-api/flow/internal/task/componentmanager/nvswitch/nvswitchmanager/nvswitchmanager.go b/rest-api/flow/internal/task/componentmanager/nvswitch/nvswitchmanager/nvswitchmanager.go index a0bdfd3d96..634a52ad58 100644 --- a/rest-api/flow/internal/task/componentmanager/nvswitch/nvswitchmanager/nvswitchmanager.go +++ b/rest-api/flow/internal/task/componentmanager/nvswitch/nvswitchmanager/nvswitchmanager.go @@ -10,11 +10,13 @@ import ( "github.com/rs/zerolog/log" + "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi" "github.com/NVIDIA/infra-controller-rest/flow/internal/nsmapi" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/capability" cmcatalog "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/catalog" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/providerapi" + nicoprovider "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/providers/nico" nsmprovider "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/providers/nvswitchmanager" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/executor/temporalworkflow/common" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/operations" @@ -28,14 +30,23 @@ const ( ) // Manager manages NVLink switch components via the NV-Switch Manager gRPC API. +// +// nicoClient is used solely to resolve a switch's rack and the host +// machines on that rack so that the assignment safety check can refuse +// power/firmware operations while any host on the rack is still Assigned. +// Switch power/firmware traffic itself still goes through nsmClient. type Manager struct { - nsmClient nsmapi.Client + nsmClient nsmapi.Client + assignment *nicoprovider.AssignmentChecker + nicoClient nicoapi.Client } // New creates a new NV-Switch Manager-based NVSwitch Manager instance. -func New(nsmClient nsmapi.Client) *Manager { +func New(nsmClient nsmapi.Client, nicoClient nicoapi.Client) *Manager { return &Manager{ - nsmClient: nsmClient, + nsmClient: nsmClient, + nicoClient: nicoClient, + assignment: nicoprovider.NewAssignmentChecker(nicoClient, 0, 0), } } @@ -50,7 +61,15 @@ func Factory(providerRegistry *providerapi.ProviderRegistry) (componentmanager.C return nil, fmt.Errorf("nvswitch/nvswitchmanager requires nvswitchmanager provider: %w", err) } - return New(provider.Client()), nil + nicoProv, err := providerapi.GetTyped[*nicoprovider.Provider]( + providerRegistry, + nicoprovider.ProviderName, + ) + if err != nil { + return nil, fmt.Errorf("nvswitch/nvswitchmanager requires nico provider for rack assignment safety check: %w", err) + } + + return New(provider.Client(), nicoProv.Client()), nil } // Descriptor returns the NV-Switch Manager NVSwitch manager descriptor. @@ -60,7 +79,7 @@ func Descriptor() cmcatalog.Descriptor { Type: devicetypes.ComponentTypeNVSwitch, Implementation: ImplementationName, }, - RequiredProviders: []string{nsmprovider.ProviderName}, + RequiredProviders: []string{nsmprovider.ProviderName, nicoprovider.ProviderName}, Capabilities: capability.CapabilitySet{ capability.CapabilityFirmwareControl, capability.CapabilityFirmwareStatus, @@ -103,6 +122,10 @@ func (m *Manager) PowerControl( return fmt.Errorf("target is invalid: %w", err) } + if err := m.ensureRackOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + action, err := mapPowerOperation(info.Operation) if err != nil { return err @@ -173,6 +196,10 @@ func (m *Manager) FirmwareControl(ctx context.Context, target common.Target, inf return fmt.Errorf("target is invalid: %w", err) } + if err := m.ensureRackOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + subComponents, err := firmwarecomponents.ParseNSMNVSwitch(info.SubTargets) if err != nil { return err @@ -230,6 +257,66 @@ func (m *Manager) GetFirmwareStatus(ctx context.Context, target common.Target) ( return result, nil } +// ensureRackOperable is the per-Manager policy gate for disruptive +// operations on the racks that own the given switches. The default policy +// refuses to proceed while any host on the resolved rack(s) is still in +// Core's Assigned/* lifecycle state, because a switch reset disrupts +// NVLink traffic for the entire rack. +// +// When overrideAssignmentCheck is true the gate is short-circuited +// without performing the rack lookup. The override is intended for +// operator-supervised maintenance windows; authorisation is enforced +// upstream and is not re-checked here. A warning is emitted so the +// bypass is auditable from the worker log alone. +// +// A switch that Core does not associate with a rack is logged and +// skipped: failing closed would block bring-up flows for switches that +// have not yet been ingested into the rack topology. +func (m *Manager) ensureRackOperable( + ctx context.Context, + switchIDs []string, + overrideAssignmentCheck bool, +) error { + if len(switchIDs) == 0 { + return nil + } + + if overrideAssignmentCheck { + log.Warn(). + Strs("switch_ids", switchIDs). + Msg("Assignment safety check bypassed by override_assignment_check on NVSwitch (nvswitchmanager) operation") + return nil + } + + if m.nicoClient == nil || m.assignment == nil { + return fmt.Errorf("nico client is not configured; rack assignment safety check cannot run") + } + + rackBySwitch, err := m.nicoClient.FindSwitchRackIDs(ctx, switchIDs) + if err != nil { + return fmt.Errorf("look up rack for switches: %w", err) + } + + rackIDs := make([]string, 0, len(rackBySwitch)) + for _, rid := range rackBySwitch { + rackIDs = append(rackIDs, rid) + } + + var orphan []string + for _, sid := range switchIDs { + if _, ok := rackBySwitch[sid]; !ok { + orphan = append(orphan, sid) + } + } + if len(orphan) > 0 { + log.Warn(). + Strs("switch_ids", orphan). + Msg("NVSwitch has no rack assignment; assignment safety check cannot be applied") + } + + return m.assignment.WaitForRacksUnassigned(ctx, rackIDs) +} + // aggregateUpdateStatuses examines all sub-component firmware updates for a switch // and produces a single FirmwareUpdateStatus. If any sub-component failed or was // cancelled the overall status is Failed with a message listing each one. diff --git a/rest-api/flow/internal/task/componentmanager/operations.go b/rest-api/flow/internal/task/componentmanager/operations.go index b81ae6bd0e..32fd5b96f4 100644 --- a/rest-api/flow/internal/task/componentmanager/operations.go +++ b/rest-api/flow/internal/task/componentmanager/operations.go @@ -69,8 +69,11 @@ type FirmwareStatusReader interface { // Required descriptor capability: capability.CapabilityBringUpControl. type BringUpController interface { // BringUpControl opens the power-on gate for the target components, allowing - // them to proceed through the bring-up sequence. - BringUpControl(ctx context.Context, target common.Target) error + // them to proceed through the bring-up sequence. The info argument carries + // the parent BringUp task settings — notably OverrideAssignmentCheck — + // because bring-up can power-cycle hosts and must consult the same safety + // gate as PowerControl / FirmwareControl. + BringUpControl(ctx context.Context, target common.Target, info operations.BringUpTaskInfo) error //nolint } // BringUpStatusReader is implemented by component managers that can report diff --git a/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico.go b/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico.go index e0ecd1b9c3..24f2eeec3d 100644 --- a/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico.go +++ b/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico.go @@ -30,10 +30,18 @@ const ( // Manager manages power shelf components via the NICo/NICo component dispatch RPCs. type Manager struct { nicoClient nicoapi.Client + // assignment guards power/firmware operations on a shelf from running + // while any host on the shelf's rack is still attached to an instance. + // PowerShelves feed the entire rack, so toggling one can power-cycle + // every host downstream of it; the check is therefore rack-scoped. + assignment *nicoprovider.AssignmentChecker } func New(nicoClient nicoapi.Client) *Manager { - return &Manager{nicoClient: nicoClient} + return &Manager{ + nicoClient: nicoClient, + assignment: nicoprovider.NewAssignmentChecker(nicoClient, 0, 0), + } } // Factory creates a new Manager from the provided providers. @@ -87,6 +95,61 @@ func powerShelfIDsProto(ids []string) *pb.PowerShelfIdList { return &pb.PowerShelfIdList{Ids: pbIDs} } +// ensureRackOperable is the per-Manager policy gate for disruptive +// operations on the racks that own the given power shelves. The default +// policy refuses to proceed while any host on the resolved rack(s) is +// still in Core's Assigned/* lifecycle state, because a shelf reset +// power-cycles every host downstream of it. +// +// When overrideAssignmentCheck is true the gate is short-circuited +// without performing the rack lookup. The override is intended for +// operator-supervised maintenance windows; authorisation is enforced +// upstream and is not re-checked here. A warning is emitted so the +// bypass is auditable from the worker log alone. +// +// Shelves not associated with a rack in Core are skipped with a warning +// (see the equivalent NVSwitch helper for the reasoning). +func (m *Manager) ensureRackOperable( + ctx context.Context, + shelfIDs []string, + overrideAssignmentCheck bool, +) error { + if len(shelfIDs) == 0 { + return nil + } + + if overrideAssignmentCheck { + log.Warn(). + Strs("power_shelf_ids", shelfIDs). + Msg("Assignment safety check bypassed by override_assignment_check on PowerShelf operation") + return nil + } + + rackByShelf, err := m.nicoClient.FindPowerShelfRackIDs(ctx, shelfIDs) + if err != nil { + return fmt.Errorf("look up rack for power shelves: %w", err) + } + + rackIDs := make([]string, 0, len(rackByShelf)) + for _, rid := range rackByShelf { + rackIDs = append(rackIDs, rid) + } + + var orphan []string + for _, sid := range shelfIDs { + if _, ok := rackByShelf[sid]; !ok { + orphan = append(orphan, sid) + } + } + if len(orphan) > 0 { + log.Warn(). + Strs("power_shelf_ids", orphan). + Msg("PowerShelf has no rack assignment; assignment safety check cannot be applied") + } + + return m.assignment.WaitForRacksUnassigned(ctx, rackIDs) +} + // InjectExpectation registers an expected power shelf with NICo via AddExpectedPowerShelf. // The Info field should contain a JSON-encoded nicoapi.AddExpectedPowerShelfRequest. func (m *Manager) InjectExpectation( @@ -127,6 +190,10 @@ func (m *Manager) PowerControl( return fmt.Errorf("target is invalid: %w", err) } + if err := m.ensureRackOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + var action pb.SystemPowerControl switch info.Operation { case operations.PowerOperationPowerOn, operations.PowerOperationForcePowerOn: @@ -215,6 +282,10 @@ func (m *Manager) FirmwareControl( return fmt.Errorf("target is invalid: %w", err) } + if err := m.ensureRackOperable(ctx, target.ComponentIDs, info.OverrideAssignmentCheck); err != nil { + return fmt.Errorf("refused: %w", err) + } + subComponents, err := firmwarecomponents.ParseNICoPowerShelf(info.SubTargets) if err != nil { return err diff --git a/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico_test.go b/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico_test.go index ef07828b5f..df6095dc70 100644 --- a/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico_test.go +++ b/rest-api/flow/internal/task/componentmanager/powershelf/nico/nico_test.go @@ -7,10 +7,13 @@ import ( "context" "encoding/json" "testing" + "time" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi" + nicoprovider "github.com/NVIDIA/infra-controller-rest/flow/internal/task/componentmanager/providers/nico" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/executor/temporalworkflow/common" "github.com/NVIDIA/infra-controller-rest/flow/internal/task/operations" "github.com/NVIDIA/infra-controller-rest/flow/pkg/common/devicetypes" @@ -119,6 +122,102 @@ func TestGetFirmwareStatus(t *testing.T) { assert.NotNil(t, statuses) } +// newManagerForSafetyTest swaps the long default 30-minute assignment +// timeout for a tight one so the wait loop actually times out within the +// test budget. Tests in this file use the same package, so they can reach +// the unexported assignment field directly. +func newManagerForSafetyTest(t *testing.T, client nicoapi.Client) *Manager { + t.Helper() + m := New(client) + m.assignment = nicoprovider.NewAssignmentChecker(client, 50*time.Millisecond, 10*time.Millisecond) + return m +} + +func TestPowerControl_RefusesWhenRackHostAssigned(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetPowerShelfRackID("ps-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypePowerShelf, + ComponentIDs: []string{"ps-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOff, + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") + assert.Contains(t, err.Error(), "host-1") +} + +func TestPowerControl_AllowsWhenRackHostsReady(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetPowerShelfRackID("ps-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Ready"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypePowerShelf, + ComponentIDs: []string{"ps-1"}, + } + + err := m.PowerControl(context.Background(), target, operations.PowerControlTaskInfo{ + Operation: operations.PowerOperationPowerOn, + }) + require.NoError(t, err) +} + +func TestFirmwareControl_RefusesWhenRackHostAssigned(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetPowerShelfRackID("ps-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypePowerShelf, + ComponentIDs: []string{"ps-1"}, + } + + err := m.FirmwareControl(context.Background(), target, operations.FirmwareControlTaskInfo{ + Operation: operations.FirmwareOperationUpgrade, + TargetVersion: "1.0.0", + }) + require.Error(t, err) + assert.Contains(t, err.Error(), "refused") + assert.Contains(t, err.Error(), "Assigned state") +} + +// TestFirmwareControl_OverrideBypassesRackAssignmentCheck verifies that +// OverrideAssignmentCheck short-circuits the rack-scoped gate on +// PowerShelf FirmwareControl. The host on the resolved rack is in +// Assigned/* — which would otherwise block the call — yet the +// operation is expected to proceed past the gate. +func TestFirmwareControl_OverrideBypassesRackAssignmentCheck(t *testing.T) { + client := nicoapi.NewMockClient() + client.SetPowerShelfRackID("ps-1", "rack-A") + client.SetRackHostMachineIDs("rack-A", []string{"host-1"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "host-1", State: "Assigned/Provisioning"}) + + m := newManagerForSafetyTest(t, client) + target := common.Target{ + Type: devicetypes.ComponentTypePowerShelf, + ComponentIDs: []string{"ps-1"}, + } + + err := m.FirmwareControl(context.Background(), target, operations.FirmwareControlTaskInfo{ + Operation: operations.FirmwareOperationUpgrade, + TargetVersion: "1.0.0", + OverrideAssignmentCheck: true, + }) + require.NoError(t, err) +} + func mustMarshal(t *testing.T, v any) json.RawMessage { t.Helper() data, err := json.Marshal(v) diff --git a/rest-api/flow/internal/task/componentmanager/providers/nico/assignment.go b/rest-api/flow/internal/task/componentmanager/providers/nico/assignment.go new file mode 100644 index 0000000000..584250a014 --- /dev/null +++ b/rest-api/flow/internal/task/componentmanager/providers/nico/assignment.go @@ -0,0 +1,235 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nico + +import ( + "context" + "errors" + "fmt" + "sort" + "strings" + "time" + + "github.com/rs/zerolog/log" + + "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi" +) + +// Default polling parameters for the rack-assignment safety check. They are +// chosen to err on the side of waiting: 30 minutes is long enough to cover a +// running tenant terminate cycle, and 60s keeps the gRPC load on Core low. +const ( + DefaultAssignmentWaitTimeout = 30 * time.Minute + DefaultAssignmentPollInterval = 60 * time.Second + + // assignedStatePrefix matches Core's `ManagedHostState::Assigned/...` + // Display form (e.g. "Assigned/Provisioning", "Assigned/Reprovision/..."). + // It is intentionally a prefix and not Contains, so "PreAssignedMeasuring" + // / "PostAssignedMeasuring" — which are early/late attestation phases + // distinct from "host has an instance attached" — are not flagged. + assignedStatePrefix = "Assigned/" +) + +// AssignmentChecker waits until no compute machines in a given scope are in +// Core's `Assigned/...` state, guarding power and firmware operations from +// running while a tenant is actively using the hardware. +type AssignmentChecker struct { + client nicoapi.Client + timeout time.Duration + pollInterval time.Duration +} + +// NewAssignmentChecker builds an AssignmentChecker with the supplied NICo +// client. Zero or negative timeout/interval values fall back to the package +// defaults so callers can opt in to overrides without having to repeat them. +func NewAssignmentChecker(client nicoapi.Client, timeout, pollInterval time.Duration) *AssignmentChecker { + if timeout <= 0 { + timeout = DefaultAssignmentWaitTimeout + } + if pollInterval <= 0 { + pollInterval = DefaultAssignmentPollInterval + } + return &AssignmentChecker{ + client: client, + timeout: timeout, + pollInterval: pollInterval, + } +} + +// IsAssignedState reports whether the given Machine.state string from Core +// represents the `Assigned` host lifecycle. Exported so callers (e.g. tests) +// can share the canonical predicate. +func IsAssignedState(state string) bool { + return strings.HasPrefix(state, assignedStatePrefix) +} + +// WaitForMachinesUnassigned blocks until every machine in machineIDs has left +// the `Assigned/...` state, or until the configured timeout elapses. An empty +// list is a no-op. A nil client also short-circuits to no-op so unit tests can +// construct managers without a NICo dependency. +func (c *AssignmentChecker) WaitForMachinesUnassigned(ctx context.Context, machineIDs []string) error { + if c == nil || c.client == nil || len(machineIDs) == 0 { + return nil + } + + uniqueIDs := dedupSorted(machineIDs) + + deadline := time.Now().Add(c.timeout) + attempt := 0 + for { + attempt++ + assigned, err := c.findAssigned(ctx, uniqueIDs) + if err != nil { + return fmt.Errorf("assignment check failed: %w", err) + } + if len(assigned) == 0 { + if attempt > 1 { + log.Info(). + Strs("machine_ids", uniqueIDs). + Int("attempts", attempt). + Msg("Machines left Assigned state, proceeding with operation") + } + return nil + } + + if !time.Now().Before(deadline) { + return fmt.Errorf( + "timed out after %s waiting for machines to leave Assigned state: %s", + c.timeout, strings.Join(assigned, ", "), + ) + } + + log.Info(). + Strs("assigned_machine_ids", assigned). + Dur("poll_interval", c.pollInterval). + Time("deadline", deadline). + Msg("Machines still Assigned, deferring operation") + + sleepErr := sleep(ctx, c.pollInterval) + if sleepErr != nil { + return sleepErr + } + } +} + +// WaitForRacksUnassigned resolves each rackID to its host machines and then +// delegates to WaitForMachinesUnassigned. Duplicate rack IDs are coalesced so +// a multi-component target on a single rack only triggers one rack lookup. +func (c *AssignmentChecker) WaitForRacksUnassigned(ctx context.Context, rackIDs []string) error { + if c == nil || c.client == nil || len(rackIDs) == 0 { + return nil + } + + uniqueRacks := dedupSorted(rackIDs) + + allMachines := make([]string, 0) + for _, rackID := range uniqueRacks { + machines, err := c.client.FindHostMachineIdsByRack(ctx, rackID) + if err != nil { + return fmt.Errorf("list host machines for rack %s: %w", rackID, err) + } + allMachines = append(allMachines, machines...) + } + + if len(allMachines) == 0 { + // No host machines on these racks (e.g., switch-only rack or empty + // rack). The safety check is therefore vacuously satisfied; log so + // the absence is visible. + log.Info(). + Strs("rack_ids", uniqueRacks). + Msg("Rack assignment check: no host machines found, skipping wait") + return nil + } + + return c.WaitForMachinesUnassigned(ctx, allMachines) +} + +// findAssigned returns the subset of machineIDs that Core currently reports in +// the `Assigned/...` state. Machines whose state cannot be retrieved are NOT +// treated as Assigned — that would conflate "missing data" with "in use" and +// fail-closed on every transient gRPC blip. The caller logs which IDs returned +// no data so the gap stays visible. +func (c *AssignmentChecker) findAssigned(ctx context.Context, machineIDs []string) ([]string, error) { + details, err := c.client.FindMachinesByIds(ctx, machineIDs) + if err != nil { + return nil, err + } + + byID := make(map[string]nicoapi.MachineDetail, len(details)) + for _, d := range details { + byID[d.MachineID] = d + } + + var assigned []string + var missing []string + for _, id := range machineIDs { + d, ok := byID[id] + if !ok { + missing = append(missing, id) + continue + } + if IsAssignedState(d.State) { + assigned = append(assigned, id) + } + } + + if len(missing) > 0 { + log.Warn(). + Strs("missing_machine_ids", missing). + Msg("Assignment check: Core returned no state for some machines, treating them as unassigned") + } + return assigned, nil +} + +// sleep returns context.Canceled (or context.DeadlineExceeded) immediately if +// the context is cancelled before the duration elapses, instead of sleeping +// the full interval and then noticing. Using time.NewTimer (not time.After) +// avoids leaking the underlying timer if the context wins the race. +func sleep(ctx context.Context, d time.Duration) error { + if d <= 0 { + return nil + } + t := time.NewTimer(d) + defer t.Stop() + select { + case <-ctx.Done(): + return errors.Join(errors.New("aborted while waiting for machines to leave Assigned state"), ctx.Err()) + case <-t.C: + return nil + } +} + +func dedupSorted(in []string) []string { + if len(in) == 0 { + return nil + } + seen := make(map[string]struct{}, len(in)) + out := make([]string, 0, len(in)) + for _, s := range in { + if s == "" { + continue + } + if _, ok := seen[s]; ok { + continue + } + seen[s] = struct{}{} + out = append(out, s) + } + sort.Strings(out) + return out +} diff --git a/rest-api/flow/internal/task/componentmanager/providers/nico/assignment_test.go b/rest-api/flow/internal/task/componentmanager/providers/nico/assignment_test.go new file mode 100644 index 0000000000..23b339ebc7 --- /dev/null +++ b/rest-api/flow/internal/task/componentmanager/providers/nico/assignment_test.go @@ -0,0 +1,144 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package nico + +import ( + "context" + "testing" + "time" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + "github.com/NVIDIA/infra-controller-rest/flow/internal/nicoapi" +) + +func TestIsAssignedState(t *testing.T) { + cases := map[string]bool{ + "Assigned/Provisioning": true, + "Assigned/Reprovision/Init": true, + "Assigned/": true, + "Ready": false, + "": false, + "PreAssignedMeasuring/PollResult": false, + "PostAssignedMeasuring/MeasuredBoot": false, + "HostInitializing/Init": false, + // Defensive: a literal "Assigned" without the trailing slash is not + // a state Core emits today (Display always writes "Assigned/..."). + // Keep this here so the predicate stays strict; flipping it later + // would silently change the contract for the safety check. + "Assigned": false, + } + for input, want := range cases { + assert.Equalf(t, want, IsAssignedState(input), "state=%q", input) + } +} + +func TestWaitForMachinesUnassigned_NoMachines_ShortCircuits(t *testing.T) { + c := NewAssignmentChecker(nicoapi.NewMockClient(), time.Second, 10*time.Millisecond) + require.NoError(t, c.WaitForMachinesUnassigned(context.Background(), nil)) + require.NoError(t, c.WaitForMachinesUnassigned(context.Background(), []string{})) +} + +func TestWaitForMachinesUnassigned_NilClient_ShortCircuits(t *testing.T) { + c := NewAssignmentChecker(nil, time.Second, 10*time.Millisecond) + require.NoError(t, c.WaitForMachinesUnassigned(context.Background(), []string{"m1"})) +} + +func TestWaitForMachinesUnassigned_AlreadyReady(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "m1", State: "Ready"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "m2", State: "HostInitializing/Init"}) + + c := NewAssignmentChecker(client, time.Second, 10*time.Millisecond) + require.NoError(t, c.WaitForMachinesUnassigned(context.Background(), []string{"m1", "m2"})) +} + +func TestWaitForMachinesUnassigned_TimesOutWhileAssigned(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "m1", State: "Assigned/Provisioning"}) + + c := NewAssignmentChecker(client, 50*time.Millisecond, 10*time.Millisecond) + err := c.WaitForMachinesUnassigned(context.Background(), []string{"m1"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "timed out") + assert.Contains(t, err.Error(), "m1") +} + +// TestWaitForMachinesUnassigned_MissingTreatedAsUnassigned encodes the +// deliberate fail-open behaviour: if Core has no record of a machine, we let +// the operation proceed rather than block on indefinite missing data. +func TestWaitForMachinesUnassigned_MissingTreatedAsUnassigned(t *testing.T) { + client := nicoapi.NewMockClient() + // Intentionally do not register "m-missing". + c := NewAssignmentChecker(client, 100*time.Millisecond, 10*time.Millisecond) + require.NoError(t, c.WaitForMachinesUnassigned(context.Background(), []string{"m-missing"})) +} + +func TestWaitForRacksUnassigned_ResolvesMachinesAndPasses(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "m-A1", State: "Ready"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "m-A2", State: "Ready"}) + client.SetRackHostMachineIDs("rack-A", []string{"m-A1", "m-A2"}) + + c := NewAssignmentChecker(client, time.Second, 10*time.Millisecond) + require.NoError(t, c.WaitForRacksUnassigned(context.Background(), []string{"rack-A"})) +} + +func TestWaitForRacksUnassigned_BlocksOnAssignedHost(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "m-A1", State: "Ready"}) + client.AddMachine(nicoapi.MachineDetail{MachineID: "m-A2", State: "Assigned/Provisioning"}) + client.SetRackHostMachineIDs("rack-A", []string{"m-A1", "m-A2"}) + + c := NewAssignmentChecker(client, 50*time.Millisecond, 10*time.Millisecond) + err := c.WaitForRacksUnassigned(context.Background(), []string{"rack-A"}) + require.Error(t, err) + assert.Contains(t, err.Error(), "timed out") + assert.Contains(t, err.Error(), "m-A2") +} + +func TestWaitForRacksUnassigned_EmptyRackPasses(t *testing.T) { + client := nicoapi.NewMockClient() + // Rack-B has no host machines registered (switch-only rack). + c := NewAssignmentChecker(client, time.Second, 10*time.Millisecond) + require.NoError(t, c.WaitForRacksUnassigned(context.Background(), []string{"rack-B"})) +} + +func TestWaitForRacksUnassigned_ContextCancellationStopsPolling(t *testing.T) { + client := nicoapi.NewMockClient() + client.AddMachine(nicoapi.MachineDetail{MachineID: "m1", State: "Assigned/Provisioning"}) + client.SetRackHostMachineIDs("rack-A", []string{"m1"}) + + // Long timeout, but the context is cancelled after the first poll — + // we should return promptly with the context error wrapped, not after + // the full assignment timeout. + c := NewAssignmentChecker(client, time.Hour, 50*time.Millisecond) + + ctx, cancel := context.WithCancel(context.Background()) + go func() { + time.Sleep(20 * time.Millisecond) + cancel() + }() + + start := time.Now() + err := c.WaitForRacksUnassigned(ctx, []string{"rack-A"}) + elapsed := time.Since(start) + require.Error(t, err) + assert.Less(t, elapsed, time.Second, "cancellation should abort the poll quickly") +} diff --git a/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity.go b/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity.go index 282f890883..2be7815e4e 100644 --- a/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity.go +++ b/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity.go @@ -147,17 +147,20 @@ func (a *Activities) GetFirmwareStatus( return &GetFirmwareStatusResult{Statuses: statuses}, nil } -// BringUpControl opens the power-on gate for the target components. +// BringUpControl opens the power-on gate for the target components. The info +// argument carries the parent BringUp task settings (e.g. override of the +// host-assignment safety gate) and is forwarded to the component manager. func (a *Activities) BringUpControl( ctx context.Context, target common.Target, + info operations.BringUpTaskInfo, ) error { controller, err := requireBringUpController(a.registry, target) if err != nil { return err } - return controller.BringUpControl(ctx, target) + return controller.BringUpControl(ctx, target, info) } // GetBringUpStatusResult is the result of GetBringUpStatus activity. diff --git a/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity_test.go b/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity_test.go index f25adce6a4..6afa4a56fa 100644 --- a/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity_test.go +++ b/rest-api/flow/internal/task/executor/temporalworkflow/activity/activity_test.go @@ -138,7 +138,7 @@ func activityCallsForMissingManagerTest( return acts.VerifyFirmwareConsistency(ctx, target) }, "BringUpControl": func() error { - return acts.BringUpControl(ctx, target) + return acts.BringUpControl(ctx, target, operations.BringUpTaskInfo{}) }, "GetBringUpStatus": func() error { _, err := acts.GetBringUpStatus(ctx, target) @@ -193,7 +193,7 @@ func capabilityCheckedActivityCalls( return acts.VerifyFirmwareConsistency(ctx, target) }, "BringUpControl": func(acts *Activities) error { - return acts.BringUpControl(ctx, target) + return acts.BringUpControl(ctx, target, operations.BringUpTaskInfo{}) }, "GetBringUpStatus": func(acts *Activities) error { _, err := acts.GetBringUpStatus(ctx, target) diff --git a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions.go b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions.go index ce2b11502d..2a2bedfd72 100644 --- a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions.go +++ b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions.go @@ -101,6 +101,11 @@ func executeSleepAction(actx actionExecutionContext) error { // must be set in the action config to specify the desired power operation. // When called from the power workflow, operationInfo is passed through // directly (Temporal handles deserialization at the activity boundary). +// +// In the synthesised path, the host-assignment override flag is read from the +// parent task's operationInfo so that a BringUp / Firmware operator who set +// override_assignment_check at the API does not have it silently dropped by +// the sub-action's fresh PowerControlTaskInfo. func executePowerControlAction(actx actionExecutionContext) error { if opParam, ok := actx.config.Parameters[operationrules.ParamOperation]; ok { opStr, _ := opParam.(string) @@ -110,7 +115,10 @@ func executePowerControlAction(actx actionExecutionContext) error { "PowerControl action: unrecognized operation %q", opStr, ) } - info := operations.PowerControlTaskInfo{Operation: op} + info := operations.PowerControlTaskInfo{ + Operation: op, + OverrideAssignmentCheck: extractOverrideAssignmentCheck(actx.operationInfo), + } return executeGenericActivity( actx.workflowContext, activity.NamePowerControl, actx.target, info, ) @@ -199,6 +207,14 @@ func executeFirmwareControlAction(actx actionExecutionContext) error { if fwInfo.Operation == operations.FirmwareOperationUnknown { fwInfo.Operation = operations.FirmwareOperationUpgrade } + // When the firmware action is fired by a BringUp parent, fwInfo is + // synthesised here and does not inherit the parent's + // OverrideAssignmentCheck through the type assertions above. Read it + // directly from the parent task info so the safety-gate decision is + // preserved across the parent / sub-action boundary. + if !fwInfo.OverrideAssignmentCheck { + fwInfo.OverrideAssignmentCheck = extractOverrideAssignmentCheck(actx.operationInfo) + } fwInfo.TargetVersion = extractComponentTargetVersion(fwInfo.TargetVersion, target.Type) @@ -388,10 +404,20 @@ func verifyPowerStatus( } } -// executeBringUpControlAction opens the power-on gate for the target components. +// executeBringUpControlAction opens the power-on gate for the target +// components. The BringUp parent task info is forwarded to the activity so +// that operator-set fields (currently OverrideAssignmentCheck) are honoured +// at the component-manager safety gate. func executeBringUpControlAction(actx actionExecutionContext) error { + info := operations.BringUpTaskInfo{ + OverrideAssignmentCheck: extractOverrideAssignmentCheck(actx.operationInfo), + } + if parent, ok := actx.operationInfo.(*operations.BringUpTaskInfo); ok && parent != nil { + info.RuleID = parent.RuleID + info.OpCode = parent.OpCode + } return workflow.ExecuteActivity( - actx.workflowContext, activity.NameBringUpControl, actx.target, + actx.workflowContext, activity.NameBringUpControl, actx.target, info, ).Get(actx.workflowContext, nil) } @@ -595,6 +621,54 @@ func executeVerifyFirmwareConsistencyAction(actx actionExecutionContext) error { ).Get(actx.workflowContext, nil) } +// extractOverrideAssignmentCheck reads the OverrideAssignmentCheck flag from +// a parent task's operationInfo regardless of which TaskInfo type it is. +// The same JSON tag (override_assignment_check) is used by every TaskInfo +// that opts in, so a JSON round-trip is a type-agnostic fallback that also +// covers the map[string]interface{} form produced by Temporal's child- +// workflow argument serialisation. +// +// Returning false on any error or unrecognised shape is the safe default: +// the assignment safety gate stays in effect when intent is ambiguous. +func extractOverrideAssignmentCheck(operationInfo any) bool { + switch v := operationInfo.(type) { + case nil: + return false + case *operations.PowerControlTaskInfo: + if v == nil { + return false + } + return v.OverrideAssignmentCheck + case operations.PowerControlTaskInfo: + return v.OverrideAssignmentCheck + case *operations.FirmwareControlTaskInfo: + if v == nil { + return false + } + return v.OverrideAssignmentCheck + case operations.FirmwareControlTaskInfo: + return v.OverrideAssignmentCheck + case *operations.BringUpTaskInfo: + if v == nil { + return false + } + return v.OverrideAssignmentCheck + case operations.BringUpTaskInfo: + return v.OverrideAssignmentCheck + } + var probe struct { + OverrideAssignmentCheck bool `json:"override_assignment_check"` + } + data, err := json.Marshal(operationInfo) + if err != nil { + return false + } + if err := json.Unmarshal(data, &probe); err != nil { + return false + } + return probe.OverrideAssignmentCheck +} + // knownComponentTypeKeys are the JSON keys recognised in a layered // TargetVersion object. Used to distinguish the new per-component-type // format from the legacy flat format. diff --git a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions_test.go b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions_test.go new file mode 100644 index 0000000000..56579d832d --- /dev/null +++ b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/actions_test.go @@ -0,0 +1,97 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package workflow + +import ( + "testing" + + "github.com/stretchr/testify/assert" + + "github.com/NVIDIA/infra-controller-rest/flow/internal/task/operations" +) + +// TestExtractOverrideAssignmentCheck covers every branch of the +// extractOverrideAssignmentCheck helper. The helper bridges the parent / +// sub-action boundary inside the rule engine: a parent task carries its +// OverrideAssignmentCheck flag through to dynamically synthesised +// PowerControl, FirmwareControl, and BringUp sub-actions. The helper is +// intentionally type-agnostic — the workflow runtime hands it either a +// concrete TaskInfo, a pointer to one, or a JSON-deserialised +// map[string]any from a child-workflow boundary — so all of those shapes +// must extract the flag, and an unrecognised or nil input must fall back +// to false to keep the safety gate engaged by default. +func TestExtractOverrideAssignmentCheck(t *testing.T) { + t.Run("nil returns false", func(t *testing.T) { + assert.False(t, extractOverrideAssignmentCheck(nil)) + }) + + t.Run("nil typed pointers return false", func(t *testing.T) { + var pc *operations.PowerControlTaskInfo + var fw *operations.FirmwareControlTaskInfo + var bu *operations.BringUpTaskInfo + assert.False(t, extractOverrideAssignmentCheck(pc)) + assert.False(t, extractOverrideAssignmentCheck(fw)) + assert.False(t, extractOverrideAssignmentCheck(bu)) + }) + + t.Run("PowerControlTaskInfo pointer", func(t *testing.T) { + info := &operations.PowerControlTaskInfo{OverrideAssignmentCheck: true} + assert.True(t, extractOverrideAssignmentCheck(info)) + info.OverrideAssignmentCheck = false + assert.False(t, extractOverrideAssignmentCheck(info)) + }) + + t.Run("PowerControlTaskInfo value", func(t *testing.T) { + info := operations.PowerControlTaskInfo{OverrideAssignmentCheck: true} + assert.True(t, extractOverrideAssignmentCheck(info)) + }) + + t.Run("FirmwareControlTaskInfo pointer and value", func(t *testing.T) { + assert.True(t, extractOverrideAssignmentCheck(&operations.FirmwareControlTaskInfo{ + OverrideAssignmentCheck: true, + })) + assert.True(t, extractOverrideAssignmentCheck(operations.FirmwareControlTaskInfo{ + OverrideAssignmentCheck: true, + })) + }) + + t.Run("BringUpTaskInfo pointer and value", func(t *testing.T) { + assert.True(t, extractOverrideAssignmentCheck(&operations.BringUpTaskInfo{ + OverrideAssignmentCheck: true, + })) + assert.True(t, extractOverrideAssignmentCheck(operations.BringUpTaskInfo{ + OverrideAssignmentCheck: true, + })) + }) + + t.Run("map shape from child-workflow JSON round-trip", func(t *testing.T) { + // Temporal serialises child-workflow arguments through JSON; on + // receipt the typed TaskInfo struct degrades to a map[string]any + // keyed by JSON tag. The helper must still recover the flag. + assert.True(t, extractOverrideAssignmentCheck(map[string]any{ + "override_assignment_check": true, + })) + assert.False(t, extractOverrideAssignmentCheck(map[string]any{ + "override_assignment_check": false, + })) + assert.False(t, extractOverrideAssignmentCheck(map[string]any{ + "some_other_key": "value", + })) + }) + + t.Run("unrecognised struct falls through JSON probe", func(t *testing.T) { + // Anonymous struct with the matching JSON tag should still + // be readable via the marshal/unmarshal fallback. + type customInfo struct { + OverrideAssignmentCheck bool `json:"override_assignment_check"` + } + assert.True(t, extractOverrideAssignmentCheck(customInfo{OverrideAssignmentCheck: true})) + }) + + t.Run("non-marshalable value returns false", func(t *testing.T) { + // A channel is not JSON-marshalable; the helper must return + // false rather than panic. + assert.False(t, extractOverrideAssignmentCheck(make(chan int))) + }) +} diff --git a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/bringup_test.go b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/bringup_test.go index 8e5cee4b79..75b1b6534a 100644 --- a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/bringup_test.go +++ b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/bringup_test.go @@ -24,7 +24,7 @@ import ( "github.com/NVIDIA/infra-controller-rest/flow/pkg/common/devicetypes" ) -func mockBringUpControl(ctx context.Context, target common.Target) error { +func mockBringUpControl(_ context.Context, _ common.Target, _ operations.BringUpTaskInfo) error { return nil } @@ -131,7 +131,7 @@ func TestBringUpWorkflow(t *testing.T) { map[string]operations.PowerStatus{ "ps-1": operations.PowerStatusOn, }, nil) - env.OnActivity(activitypkg.NameBringUpControl, mock.Anything, mock.Anything).Return(nil) + env.OnActivity(activitypkg.NameBringUpControl, mock.Anything, mock.Anything, mock.Anything).Return(nil) env.OnActivity(activitypkg.NameGetBringUpStatus, mock.Anything, mock.Anything).Return( &activitypkg.GetBringUpStatusResult{ States: map[string]operations.MachineBringUpState{ diff --git a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/child_workflow_test.go b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/child_workflow_test.go index 96d58a4538..a4d06563bd 100644 --- a/rest-api/flow/internal/task/executor/temporalworkflow/workflow/child_workflow_test.go +++ b/rest-api/flow/internal/task/executor/temporalworkflow/workflow/child_workflow_test.go @@ -355,7 +355,7 @@ func TestGenericComponentStepWorkflow_BringUpAndWait(t *testing.T) { testSuite := &testsuite.WorkflowTestSuite{} env := testSuite.NewTestWorkflowEnvironment() - mockBringUpControl := func(ctx context.Context, target common.Target) error { + mockBringUpControl := func(_ context.Context, _ common.Target, _ operations.BringUpTaskInfo) error { return nil } mockGetBringUpStatus := func(ctx context.Context, target common.Target) (*activitypkg.GetBringUpStatusResult, error) { @@ -367,7 +367,7 @@ func TestGenericComponentStepWorkflow_BringUpAndWait(t *testing.T) { env.RegisterActivityWithOptions(mockGetBringUpStatus, activity.RegisterOptions{Name: activitypkg.NameGetBringUpStatus}) - env.OnActivity(mockBringUpControl, mock.Anything, mock.Anything).Return(nil) + env.OnActivity(mockBringUpControl, mock.Anything, mock.Anything, mock.Anything).Return(nil) env.OnActivity(mockGetBringUpStatus, mock.Anything, mock.Anything).Return( &activitypkg.GetBringUpStatusResult{ States: map[string]operations.MachineBringUpState{ diff --git a/rest-api/flow/internal/task/operations/operations.go b/rest-api/flow/internal/task/operations/operations.go index dc461d8055..9951e47332 100644 --- a/rest-api/flow/internal/task/operations/operations.go +++ b/rest-api/flow/internal/task/operations/operations.go @@ -73,6 +73,13 @@ type PowerControlTaskInfo struct { Operation PowerOperation `json:"operation"` Forced bool `json:"forced"` RuleID string `json:"rule_id,omitempty"` + // OverrideAssignmentCheck, when true, instructs the component manager + // to skip the host-assignment safety gate that normally blocks power + // operations against hardware whose hosts are still in the Assigned/* + // lifecycle state. The bypass is intended for operator-supervised + // maintenance windows and is recorded as a warning log on the worker + // that executes the task; authorisation lives upstream. + OverrideAssignmentCheck bool `json:"override_assignment_check,omitempty"` } func (t *PowerControlTaskInfo) Validate() error { @@ -148,6 +155,13 @@ func (t *InjectExpectationTaskInfo) CodeString() string { type BringUpTaskInfo struct { RuleID string `json:"rule_id,omitempty"` OpCode string `json:"op_code,omitempty"` + // OverrideAssignmentCheck, when true, propagates through the bring-up + // rule's synthesised PowerControl / FirmwareControl / BringUpControl + // sub-actions so they each skip the host-assignment safety gate. + // Equivalent to setting the field on every sub-task; intended for + // operator-supervised maintenance and recorded as a warning log on + // the worker that executes each sub-task. + OverrideAssignmentCheck bool `json:"override_assignment_check,omitempty"` } func (t *BringUpTaskInfo) Validate() error { @@ -213,6 +227,13 @@ type FirmwareControlTaskInfo struct { // these strings to component-manager-specific enums is done in each // FirmwareControl implementation (see pkg/common/firmwarecomponents). SubTargets []string `json:"sub_targets,omitempty"` + // OverrideAssignmentCheck, when true, instructs the component manager + // to skip the host-assignment safety gate that normally blocks + // firmware updates against hardware whose hosts are still in the + // Assigned/* lifecycle state. Intended for operator-supervised + // maintenance windows and recorded as a warning log on the worker + // that executes the task; authorisation lives upstream. + OverrideAssignmentCheck bool `json:"override_assignment_check,omitempty"` } func (t *FirmwareControlTaskInfo) Validate() error { diff --git a/rest-api/flow/pkg/proto/v1/flow.pb.go b/rest-api/flow/pkg/proto/v1/flow.pb.go index a3b661efdd..5e027112a2 100644 --- a/rest-api/flow/pkg/proto/v1/flow.pb.go +++ b/rest-api/flow/pkg/proto/v1/flow.pb.go @@ -3325,9 +3325,17 @@ type UpgradeFirmwareRequest struct { // Names are lowercase. Empty or omitted means update everything in the // bundle (current default behavior). Unknown names are rejected by the // downstream component manager. - SubTargets []string `protobuf:"bytes,8,rep,name=sub_targets,json=subTargets,proto3" json:"sub_targets,omitempty"` - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + SubTargets []string `protobuf:"bytes,8,rep,name=sub_targets,json=subTargets,proto3" json:"sub_targets,omitempty"` + // When true, proceed with the firmware update even if one or more + // target hosts (or, for rack-scoped components, any host on the + // owning rack) are still in the Assigned/* lifecycle state. The flag + // is intended for operator-supervised maintenance windows where the + // tenant impact has been acknowledged out-of-band; setting it bypasses + // the safety gate that would otherwise block disruptive operations + // against tenanted hardware. The bypass is recorded in the server log. + OverrideAssignmentCheck bool `protobuf:"varint,9,opt,name=override_assignment_check,json=overrideAssignmentCheck,proto3" json:"override_assignment_check,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *UpgradeFirmwareRequest) Reset() { @@ -3416,6 +3424,13 @@ func (x *UpgradeFirmwareRequest) GetSubTargets() []string { return nil } +func (x *UpgradeFirmwareRequest) GetOverrideAssignmentCheck() bool { + if x != nil { + return x.OverrideAssignmentCheck + } + return false +} + // GetComponents - retrieves components from local database type GetComponentsRequest struct { state protoimpl.MessageState `protogen:"open.v1"` @@ -4482,13 +4497,19 @@ func (x *QueueOptions) GetQueueTimeoutSeconds() int32 { } type PowerOnRackRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Flexible targeting: rack(s) with optional type filter, or specific components - Description string `protobuf:"bytes,2,opt,name=description,proto3" json:"description,omitempty"` // optional task description - QueueOptions *QueueOptions `protobuf:"bytes,3,opt,name=queue_options,json=queueOptions,proto3,oneof" json:"queue_options,omitempty"` - RuleId *UUID `protobuf:"bytes,4,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Flexible targeting: rack(s) with optional type filter, or specific components + Description string `protobuf:"bytes,2,opt,name=description,proto3" json:"description,omitempty"` // optional task description + QueueOptions *QueueOptions `protobuf:"bytes,3,opt,name=queue_options,json=queueOptions,proto3,oneof" json:"queue_options,omitempty"` + RuleId *UUID `protobuf:"bytes,4,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule + // When true, proceed with the power-on even if one or more target + // hosts (or, for rack-scoped components, any host on the owning rack) + // are still in the Assigned/* lifecycle state. Intended for operator- + // supervised maintenance where tenant impact has been acknowledged + // out-of-band; the bypass is recorded in the server log. + OverrideAssignmentCheck bool `protobuf:"varint,5,opt,name=override_assignment_check,json=overrideAssignmentCheck,proto3" json:"override_assignment_check,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *PowerOnRackRequest) Reset() { @@ -4549,15 +4570,28 @@ func (x *PowerOnRackRequest) GetRuleId() *UUID { return nil } +func (x *PowerOnRackRequest) GetOverrideAssignmentCheck() bool { + if x != nil { + return x.OverrideAssignmentCheck + } + return false +} + type PowerOffRackRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Flexible targeting: rack(s) with optional type filter, or specific components - Forced bool `protobuf:"varint,2,opt,name=forced,proto3" json:"forced,omitempty"` - Description string `protobuf:"bytes,3,opt,name=description,proto3" json:"description,omitempty"` // optional task description - QueueOptions *QueueOptions `protobuf:"bytes,4,opt,name=queue_options,json=queueOptions,proto3,oneof" json:"queue_options,omitempty"` - RuleId *UUID `protobuf:"bytes,5,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Flexible targeting: rack(s) with optional type filter, or specific components + Forced bool `protobuf:"varint,2,opt,name=forced,proto3" json:"forced,omitempty"` + Description string `protobuf:"bytes,3,opt,name=description,proto3" json:"description,omitempty"` // optional task description + QueueOptions *QueueOptions `protobuf:"bytes,4,opt,name=queue_options,json=queueOptions,proto3,oneof" json:"queue_options,omitempty"` + RuleId *UUID `protobuf:"bytes,5,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule + // When true, proceed with the power-off even if one or more target + // hosts (or, for rack-scoped components, any host on the owning rack) + // are still in the Assigned/* lifecycle state. Intended for operator- + // supervised maintenance where tenant impact has been acknowledged + // out-of-band; the bypass is recorded in the server log. + OverrideAssignmentCheck bool `protobuf:"varint,6,opt,name=override_assignment_check,json=overrideAssignmentCheck,proto3" json:"override_assignment_check,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *PowerOffRackRequest) Reset() { @@ -4625,15 +4659,28 @@ func (x *PowerOffRackRequest) GetRuleId() *UUID { return nil } +func (x *PowerOffRackRequest) GetOverrideAssignmentCheck() bool { + if x != nil { + return x.OverrideAssignmentCheck + } + return false +} + type PowerResetRackRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Flexible targeting: rack(s) with optional type filter, or specific components - Forced bool `protobuf:"varint,2,opt,name=forced,proto3" json:"forced,omitempty"` - Description string `protobuf:"bytes,3,opt,name=description,proto3" json:"description,omitempty"` // optional task description - QueueOptions *QueueOptions `protobuf:"bytes,4,opt,name=queue_options,json=queueOptions,proto3,oneof" json:"queue_options,omitempty"` - RuleId *UUID `protobuf:"bytes,5,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Flexible targeting: rack(s) with optional type filter, or specific components + Forced bool `protobuf:"varint,2,opt,name=forced,proto3" json:"forced,omitempty"` + Description string `protobuf:"bytes,3,opt,name=description,proto3" json:"description,omitempty"` // optional task description + QueueOptions *QueueOptions `protobuf:"bytes,4,opt,name=queue_options,json=queueOptions,proto3,oneof" json:"queue_options,omitempty"` + RuleId *UUID `protobuf:"bytes,5,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule + // When true, proceed with the reset even if one or more target hosts + // (or, for rack-scoped components, any host on the owning rack) are + // still in the Assigned/* lifecycle state. Intended for operator- + // supervised maintenance where tenant impact has been acknowledged + // out-of-band; the bypass is recorded in the server log. + OverrideAssignmentCheck bool `protobuf:"varint,6,opt,name=override_assignment_check,json=overrideAssignmentCheck,proto3" json:"override_assignment_check,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *PowerResetRackRequest) Reset() { @@ -4701,13 +4748,26 @@ func (x *PowerResetRackRequest) GetRuleId() *UUID { return nil } +func (x *PowerResetRackRequest) GetOverrideAssignmentCheck() bool { + if x != nil { + return x.OverrideAssignmentCheck + } + return false +} + type BringUpRackRequest struct { - state protoimpl.MessageState `protogen:"open.v1"` - TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Target racks for bring-up - Description string `protobuf:"bytes,2,opt,name=description,proto3" json:"description,omitempty"` // optional task description - RuleId *UUID `protobuf:"bytes,3,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule - unknownFields protoimpl.UnknownFields - sizeCache protoimpl.SizeCache + state protoimpl.MessageState `protogen:"open.v1"` + TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Target racks for bring-up + Description string `protobuf:"bytes,2,opt,name=description,proto3" json:"description,omitempty"` // optional task description + RuleId *UUID `protobuf:"bytes,3,opt,name=rule_id,json=ruleId,proto3,oneof" json:"rule_id,omitempty"` // optional: override rule resolution with a specific rule + // When true, allow the bring-up sequence (which may power-cycle hosts + // and reset rack-scoped components) to proceed even if any host in + // scope is still in the Assigned/* lifecycle state. Intended for + // operator-supervised maintenance where tenant impact has been + // acknowledged out-of-band; the bypass is recorded in the server log. + OverrideAssignmentCheck bool `protobuf:"varint,4,opt,name=override_assignment_check,json=overrideAssignmentCheck,proto3" json:"override_assignment_check,omitempty"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache } func (x *BringUpRackRequest) Reset() { @@ -4761,6 +4821,13 @@ func (x *BringUpRackRequest) GetRuleId() *UUID { return nil } +func (x *BringUpRackRequest) GetOverrideAssignmentCheck() bool { + if x != nil { + return x.OverrideAssignmentCheck + } + return false +} + type IngestRackRequest struct { state protoimpl.MessageState `protogen:"open.v1"` TargetSpec *OperationTargetSpec `protobuf:"bytes,1,opt,name=target_spec,json=targetSpec,proto3" json:"target_spec,omitempty"` // Target racks for ingestion @@ -7872,7 +7939,7 @@ const file_flow_proto_rawDesc = "" + "\x1bGetRacksForNVLDomainRequest\x12B\n" + "\x15nvl_domain_identifier\x18\x01 \x01(\v2\x0e.v1.IdentifierR\x13nvlDomainIdentifier\">\n" + "\x1cGetRacksForNVLDomainResponse\x12\x1e\n" + - "\x05racks\x18\x01 \x03(\v2\b.v1.RackR\x05racks\"\xee\x03\n" + + "\x05racks\x18\x01 \x03(\v2\b.v1.RackR\x05racks\"\xaa\x04\n" + "\x16UpgradeFirmwareRequest\x128\n" + "\vtarget_spec\x18\x01 \x01(\v2\x17.v1.OperationTargetSpecR\n" + "targetSpec\x12*\n" + @@ -7884,7 +7951,8 @@ const file_flow_proto_rawDesc = "" + "\rqueue_options\x18\x06 \x01(\v2\x10.v1.QueueOptionsH\x03R\fqueueOptions\x88\x01\x01\x12&\n" + "\arule_id\x18\a \x01(\v2\b.v1.UUIDH\x04R\x06ruleId\x88\x01\x01\x12\x1f\n" + "\vsub_targets\x18\b \x03(\tR\n" + - "subTargetsB\x11\n" + + "subTargets\x12:\n" + + "\x19override_assignment_check\x18\t \x01(\bR\x17overrideAssignmentCheckB\x11\n" + "\x0f_target_versionB\r\n" + "\v_start_timeB\v\n" + "\t_end_timeB\x10\n" + @@ -7976,41 +8044,45 @@ const file_flow_proto_rawDesc = "" + "\btask_ids\x18\x01 \x03(\v2\b.v1.UUIDR\ataskIds\"\x85\x01\n" + "\fQueueOptions\x12A\n" + "\x11conflict_strategy\x18\x01 \x01(\x0e2\x14.v1.ConflictStrategyR\x10conflictStrategy\x122\n" + - "\x15queue_timeout_seconds\x18\x02 \x01(\x05R\x13queueTimeoutSeconds\"\xf2\x01\n" + + "\x15queue_timeout_seconds\x18\x02 \x01(\x05R\x13queueTimeoutSeconds\"\xae\x02\n" + "\x12PowerOnRackRequest\x128\n" + "\vtarget_spec\x18\x01 \x01(\v2\x17.v1.OperationTargetSpecR\n" + "targetSpec\x12 \n" + "\vdescription\x18\x02 \x01(\tR\vdescription\x12:\n" + "\rqueue_options\x18\x03 \x01(\v2\x10.v1.QueueOptionsH\x00R\fqueueOptions\x88\x01\x01\x12&\n" + - "\arule_id\x18\x04 \x01(\v2\b.v1.UUIDH\x01R\x06ruleId\x88\x01\x01B\x10\n" + + "\arule_id\x18\x04 \x01(\v2\b.v1.UUIDH\x01R\x06ruleId\x88\x01\x01\x12:\n" + + "\x19override_assignment_check\x18\x05 \x01(\bR\x17overrideAssignmentCheckB\x10\n" + "\x0e_queue_optionsB\n" + "\n" + - "\b_rule_id\"\x8b\x02\n" + + "\b_rule_id\"\xc7\x02\n" + "\x13PowerOffRackRequest\x128\n" + "\vtarget_spec\x18\x01 \x01(\v2\x17.v1.OperationTargetSpecR\n" + "targetSpec\x12\x16\n" + "\x06forced\x18\x02 \x01(\bR\x06forced\x12 \n" + "\vdescription\x18\x03 \x01(\tR\vdescription\x12:\n" + "\rqueue_options\x18\x04 \x01(\v2\x10.v1.QueueOptionsH\x00R\fqueueOptions\x88\x01\x01\x12&\n" + - "\arule_id\x18\x05 \x01(\v2\b.v1.UUIDH\x01R\x06ruleId\x88\x01\x01B\x10\n" + + "\arule_id\x18\x05 \x01(\v2\b.v1.UUIDH\x01R\x06ruleId\x88\x01\x01\x12:\n" + + "\x19override_assignment_check\x18\x06 \x01(\bR\x17overrideAssignmentCheckB\x10\n" + "\x0e_queue_optionsB\n" + "\n" + - "\b_rule_id\"\x8d\x02\n" + + "\b_rule_id\"\xc9\x02\n" + "\x15PowerResetRackRequest\x128\n" + "\vtarget_spec\x18\x01 \x01(\v2\x17.v1.OperationTargetSpecR\n" + "targetSpec\x12\x16\n" + "\x06forced\x18\x02 \x01(\bR\x06forced\x12 \n" + "\vdescription\x18\x03 \x01(\tR\vdescription\x12:\n" + "\rqueue_options\x18\x04 \x01(\v2\x10.v1.QueueOptionsH\x00R\fqueueOptions\x88\x01\x01\x12&\n" + - "\arule_id\x18\x05 \x01(\v2\b.v1.UUIDH\x01R\x06ruleId\x88\x01\x01B\x10\n" + + "\arule_id\x18\x05 \x01(\v2\b.v1.UUIDH\x01R\x06ruleId\x88\x01\x01\x12:\n" + + "\x19override_assignment_check\x18\x06 \x01(\bR\x17overrideAssignmentCheckB\x10\n" + "\x0e_queue_optionsB\n" + "\n" + - "\b_rule_id\"\xa4\x01\n" + + "\b_rule_id\"\xe0\x01\n" + "\x12BringUpRackRequest\x128\n" + "\vtarget_spec\x18\x01 \x01(\v2\x17.v1.OperationTargetSpecR\n" + "targetSpec\x12 \n" + "\vdescription\x18\x02 \x01(\tR\vdescription\x12&\n" + - "\arule_id\x18\x03 \x01(\v2\b.v1.UUIDH\x00R\x06ruleId\x88\x01\x01B\n" + + "\arule_id\x18\x03 \x01(\v2\b.v1.UUIDH\x00R\x06ruleId\x88\x01\x01\x12:\n" + + "\x19override_assignment_check\x18\x04 \x01(\bR\x17overrideAssignmentCheckB\n" + "\n" + "\b_rule_id\"\xc9\x01\n" + "\x11IngestRackRequest\x128\n" + diff --git a/rest-api/flow/proto/v1/flow.proto b/rest-api/flow/proto/v1/flow.proto index a00b6d5843..00ffffc82b 100644 --- a/rest-api/flow/proto/v1/flow.proto +++ b/rest-api/flow/proto/v1/flow.proto @@ -443,6 +443,14 @@ message UpgradeFirmwareRequest { // bundle (current default behavior). Unknown names are rejected by the // downstream component manager. repeated string sub_targets = 8; + // When true, proceed with the firmware update even if one or more + // target hosts (or, for rack-scoped components, any host on the + // owning rack) are still in the Assigned/* lifecycle state. The flag + // is intended for operator-supervised maintenance windows where the + // tenant impact has been acknowledged out-of-band; setting it bypasses + // the safety gate that would otherwise block disruptive operations + // against tenanted hardware. The bypass is recorded in the server log. + bool override_assignment_check = 9; } // GetComponents - retrieves components from local database @@ -584,6 +592,12 @@ message PowerOnRackRequest { string description = 2; // optional task description optional QueueOptions queue_options = 3; optional UUID rule_id = 4; // optional: override rule resolution with a specific rule + // When true, proceed with the power-on even if one or more target + // hosts (or, for rack-scoped components, any host on the owning rack) + // are still in the Assigned/* lifecycle state. Intended for operator- + // supervised maintenance where tenant impact has been acknowledged + // out-of-band; the bypass is recorded in the server log. + bool override_assignment_check = 5; } message PowerOffRackRequest { @@ -592,6 +606,12 @@ message PowerOffRackRequest { string description = 3; // optional task description optional QueueOptions queue_options = 4; optional UUID rule_id = 5; // optional: override rule resolution with a specific rule + // When true, proceed with the power-off even if one or more target + // hosts (or, for rack-scoped components, any host on the owning rack) + // are still in the Assigned/* lifecycle state. Intended for operator- + // supervised maintenance where tenant impact has been acknowledged + // out-of-band; the bypass is recorded in the server log. + bool override_assignment_check = 6; } message PowerResetRackRequest { @@ -600,12 +620,24 @@ message PowerResetRackRequest { string description = 3; // optional task description optional QueueOptions queue_options = 4; optional UUID rule_id = 5; // optional: override rule resolution with a specific rule + // When true, proceed with the reset even if one or more target hosts + // (or, for rack-scoped components, any host on the owning rack) are + // still in the Assigned/* lifecycle state. Intended for operator- + // supervised maintenance where tenant impact has been acknowledged + // out-of-band; the bypass is recorded in the server log. + bool override_assignment_check = 6; } message BringUpRackRequest { OperationTargetSpec target_spec = 1; // Target racks for bring-up string description = 2; // optional task description optional UUID rule_id = 3; // optional: override rule resolution with a specific rule + // When true, allow the bring-up sequence (which may power-cycle hosts + // and reset rack-scoped components) to proceed even if any host in + // scope is still in the Assigned/* lifecycle state. Intended for + // operator-supervised maintenance where tenant impact has been + // acknowledged out-of-band; the bypass is recorded in the server log. + bool override_assignment_check = 4; } message IngestRackRequest {