diff --git a/rest-api/flow/internal/db/migrations/20260609170000_rack_external_id.down.sql b/rest-api/flow/internal/db/migrations/20260609170000_rack_external_id.down.sql new file mode 100644 index 0000000000..a4345c7b92 --- /dev/null +++ b/rest-api/flow/internal/db/migrations/20260609170000_rack_external_id.down.sql @@ -0,0 +1,7 @@ +-- SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +-- SPDX-License-Identifier: Apache-2.0 + +DROP INDEX IF EXISTS rack_external_id_idx; + +ALTER TABLE rack + DROP COLUMN IF EXISTS external_id; diff --git a/rest-api/flow/internal/db/migrations/20260609170000_rack_external_id.up.sql b/rest-api/flow/internal/db/migrations/20260609170000_rack_external_id.up.sql new file mode 100644 index 0000000000..d8654b0e7f --- /dev/null +++ b/rest-api/flow/internal/db/migrations/20260609170000_rack_external_id.up.sql @@ -0,0 +1,16 @@ +-- SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +-- SPDX-License-Identifier: Apache-2.0 + +-- Adds the Core-side stable rack identifier (ExpectedRack.rack_id, e.g. +-- "a12") so the new expected-inventory mirror can match Flow racks against +-- Core unambiguously and idempotently. The column is nullable: racks created +-- before the mirror runs (or via the legacy ingestion gRPC) start without it +-- and are adopted on the first sync that finds a Core match by +-- (manufacturer, serial_number). The partial unique index leaves NULL rows +-- unconstrained but rejects duplicate external_id assignments. 
+ALTER TABLE rack + ADD COLUMN external_id TEXT; + +CREATE UNIQUE INDEX rack_external_id_idx + ON rack (external_id) + WHERE external_id IS NOT NULL; diff --git a/rest-api/flow/internal/db/model/rack.go b/rest-api/flow/internal/db/model/rack.go index f3cda0dd74..a4a01cec81 100644 --- a/rest-api/flow/internal/db/model/rack.go +++ b/rest-api/flow/internal/db/model/rack.go @@ -31,13 +31,18 @@ type Rack struct { Description map[string]any `bun:"description,type:jsonb,json_use_number"` Location map[string]any `bun:"location,type:jsonb"` NVLDomainID uuid.UUID `bun:"nvldomain_id,type:uuid"` - Status RackStatus `bun:"status,type:varchar(16),default:'new'"` - CreatedAt time.Time `bun:"created_at,nullzero,notnull,default:current_timestamp"` - UpdatedAt time.Time `bun:"updated_at,nullzero,notnull,default:current_timestamp"` - IngestedAt *time.Time `bun:"ingested_at"` - DeletedAt *time.Time `bun:"deleted_at,soft_delete"` - Components []Component `bun:"rel:has-many,join:id=rack_id"` - NVLDomain *NVLDomain `bun:"rel:belongs-to,join:nvldomain_id=id"` + // ExternalID is Core's operator-supplied stable rack identifier + // (ExpectedRack.rack_id, e.g. "a12") populated by the expected-inventory + // mirror. NULL on racks that the mirror has never adopted (e.g. legacy + // ingestion-gRPC rows on first run). 
+ ExternalID *string `bun:"external_id"` + Status RackStatus `bun:"status,type:varchar(16),default:'new'"` + CreatedAt time.Time `bun:"created_at,nullzero,notnull,default:current_timestamp"` + UpdatedAt time.Time `bun:"updated_at,nullzero,notnull,default:current_timestamp"` + IngestedAt *time.Time `bun:"ingested_at"` + DeletedAt *time.Time `bun:"deleted_at,soft_delete"` + Components []Component `bun:"rel:has-many,join:id=rack_id"` + NVLDomain *NVLDomain `bun:"rel:belongs-to,join:nvldomain_id=id"` } type RackStatus string diff --git a/rest-api/flow/internal/nicoapi/expecteddetails_test.go b/rest-api/flow/internal/nicoapi/expecteddetails_test.go new file mode 100644 index 0000000000..5efd87c157 --- /dev/null +++ b/rest-api/flow/internal/nicoapi/expecteddetails_test.go @@ -0,0 +1,245 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package nicoapi + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + + pb "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi/gen" +) + +func TestExpectedRackDetailFromPb(t *testing.T) { + t.Run("full metadata + rack ids", func(t *testing.T) { + er := &pb.ExpectedRack{ + RackId: &pb.RackId{Id: "a12"}, + RackProfileId: &pb.RackProfileId{Id: "gb200-nvl72"}, + Metadata: &pb.Metadata{ + Name: "Rack A12", + Description: "Building 1, Row 3", + Labels: []*pb.Label{ + labelKV("chassis.manufacturer", "Foxconn"), + labelKV("chassis.serial-number", "SN12345"), + labelKV("location.datacenter", "DC-East"), + }, + }, + } + + got := expectedRackDetailFromPb(er) + + assert.Equal(t, "a12", got.RackID) + assert.Equal(t, "gb200-nvl72", got.RackProfileID) + assert.Equal(t, "Rack A12", got.Name) + assert.Equal(t, "Building 1, Row 3", got.Description) + assert.Equal(t, map[string]string{ + "chassis.manufacturer": "Foxconn", + "chassis.serial-number": "SN12345", + "location.datacenter": 
"DC-East", + }, got.Labels) + }) + + t.Run("missing optional rack_id stays empty", func(t *testing.T) { + er := &pb.ExpectedRack{ + RackProfileId: &pb.RackProfileId{Id: "gb200-nvl72"}, + } + got := expectedRackDetailFromPb(er) + assert.Empty(t, got.RackID) + assert.Equal(t, "gb200-nvl72", got.RackProfileID) + assert.Nil(t, got.Labels) + }) +} + +func TestExpectedMachineDetailFromPb(t *testing.T) { + t.Run("full proto", func(t *testing.T) { + em := &pb.ExpectedMachine{ + Id: &pb.UUID{Value: "11111111-1111-1111-1111-111111111111"}, + BmcMacAddress: "aa:bb:cc:dd:ee:01", + BmcIpAddress: strPtr("10.0.0.1"), + ChassisSerialNumber: "CSN-001", + RackId: &pb.RackId{Id: "a12"}, + Metadata: &pb.Metadata{ + Name: "host-001", + Description: "compute node", + Labels: []*pb.Label{ + labelKV("manufacturer", "Supermicro"), + labelKV("model", "ARS-211GL-NHR"), + labelKV("firmware_version", "1.2.3"), + labelKV("slot_id", "1"), + labelKV("tray_idx", "2"), + labelKV("host_id", "3"), + }, + }, + } + + got := expectedMachineDetailFromPb(em) + + assert.Equal(t, "11111111-1111-1111-1111-111111111111", got.ExpectedMachineID) + assert.Equal(t, "aa:bb:cc:dd:ee:01", got.BMCMACAddress) + assert.Equal(t, "10.0.0.1", got.BMCIPAddress) + assert.Equal(t, "CSN-001", got.ChassisSerialNumber) + assert.Equal(t, "a12", got.RackID) + assert.Equal(t, "host-001", got.Name) + assert.Equal(t, "compute node", got.Description) + require.NotNil(t, got.Labels) + assert.Equal(t, "Supermicro", got.Labels["manufacturer"]) + assert.Equal(t, "1.2.3", got.Labels["firmware_version"]) + assert.Equal(t, "1", got.Labels["slot_id"]) + }) + + t.Run("missing optional fields stay empty", func(t *testing.T) { + em := &pb.ExpectedMachine{ + BmcMacAddress: "aa:bb:cc:dd:ee:02", + ChassisSerialNumber: "CSN-002", + } + got := expectedMachineDetailFromPb(em) + assert.Empty(t, got.ExpectedMachineID) + assert.Empty(t, got.BMCIPAddress) + assert.Empty(t, got.RackID) + assert.Nil(t, got.Labels) + }) +} + +func 
TestExpectedSwitchDetailFromPb(t *testing.T) { + es := &pb.ExpectedSwitch{ + ExpectedSwitchId: &pb.UUID{Value: "22222222-2222-2222-2222-222222222222"}, + BmcMacAddress: "aa:bb:cc:dd:ee:11", + BmcIpAddress: "10.0.0.11", + SwitchSerialNumber: "SSN-001", + RackId: &pb.RackId{Id: "a12"}, + Metadata: &pb.Metadata{ + Name: "switch-001", + Labels: []*pb.Label{ + labelKV("manufacturer", "NVIDIA"), + labelKV("model", "Q3450-LD"), + }, + }, + } + + got := expectedSwitchDetailFromPb(es) + + assert.Equal(t, "22222222-2222-2222-2222-222222222222", got.ExpectedSwitchID) + assert.Equal(t, "aa:bb:cc:dd:ee:11", got.BMCMACAddress) + assert.Equal(t, "10.0.0.11", got.BMCIPAddress) + assert.Equal(t, "SSN-001", got.SwitchSerialNumber) + assert.Equal(t, "a12", got.RackID) + assert.Equal(t, "switch-001", got.Name) + assert.Equal(t, "NVIDIA", got.Labels["manufacturer"]) +} + +func TestExpectedPowerShelfDetailFromPb(t *testing.T) { + eps := &pb.ExpectedPowerShelf{ + ExpectedPowerShelfId: &pb.UUID{Value: "33333333-3333-3333-3333-333333333333"}, + BmcMacAddress: "aa:bb:cc:dd:ee:21", + BmcIpAddress: "10.0.0.21", + ShelfSerialNumber: "PSN-001", + RackId: &pb.RackId{Id: "a12"}, + Metadata: &pb.Metadata{ + Name: "shelf-001", + Labels: []*pb.Label{ + labelKV("manufacturer", "Lite-On"), + }, + }, + } + + got := expectedPowerShelfDetailFromPb(eps) + + assert.Equal(t, "33333333-3333-3333-3333-333333333333", got.ExpectedPowerShelfID) + assert.Equal(t, "aa:bb:cc:dd:ee:21", got.BMCMACAddress) + assert.Equal(t, "10.0.0.21", got.BMCIPAddress) + assert.Equal(t, "PSN-001", got.ShelfSerialNumber) + assert.Equal(t, "a12", got.RackID) + assert.Equal(t, "shelf-001", got.Name) + assert.Equal(t, "Lite-On", got.Labels["manufacturer"]) +} + +func TestMetadataToGoNilSafe(t *testing.T) { + name, desc, labels := metadataToGo(nil) + assert.Empty(t, name) + assert.Empty(t, desc) + assert.Nil(t, labels) +} + +func TestMetadataToGoSkipsValueNilLabels(t *testing.T) { + md := &pb.Metadata{ + Labels: []*pb.Label{ + {Key: 
"with-value", Value: strPtr("v")}, + {Key: "no-value"}, + nil, + }, + } + _, _, labels := metadataToGo(md) + assert.Equal(t, map[string]string{"with-value": "v"}, labels) +} + +func TestMockGetAllExpectedDetailsRoundTrip(t *testing.T) { + ctx := context.Background() + c := NewMockClient() + + c.AddExpectedRackDetail(ExpectedRackDetail{RackID: "a12", RackProfileID: "gb200"}) + c.AddExpectedRackDetail(ExpectedRackDetail{RackID: "b13", RackProfileID: "gb200"}) + c.AddExpectedMachineDetail(ExpectedMachineDetail{ + ExpectedMachineID: "uuid-m1", ChassisSerialNumber: "CSN-1", RackID: "a12", + }) + c.AddExpectedSwitchDetail(ExpectedSwitchDetail{ + ExpectedSwitchID: "uuid-s1", SwitchSerialNumber: "SSN-1", RackID: "a12", + }) + c.AddExpectedPowerShelfDetail(ExpectedPowerShelfDetail{ + ExpectedPowerShelfID: "uuid-p1", ShelfSerialNumber: "PSN-1", RackID: "a12", + }) + + racks, err := c.GetAllExpectedRackDetails(ctx) + require.NoError(t, err) + assert.Len(t, racks, 2) + + machines, err := c.GetAllExpectedMachineDetails(ctx) + require.NoError(t, err) + assert.Len(t, machines, 1) + assert.Equal(t, "uuid-m1", machines[0].ExpectedMachineID) + + switches, err := c.GetAllExpectedSwitchDetails(ctx) + require.NoError(t, err) + assert.Len(t, switches, 1) + + shelves, err := c.GetAllExpectedPowerShelfDetails(ctx) + require.NoError(t, err) + assert.Len(t, shelves, 1) +} + +func TestMockGetAllExpectedDetailsEmptyReturnsNil(t *testing.T) { + ctx := context.Background() + c := NewMockClient() + + for _, fn := range []func() (int, error){ + func() (int, error) { + r, err := c.GetAllExpectedRackDetails(ctx) + return len(r), err + }, + func() (int, error) { + r, err := c.GetAllExpectedMachineDetails(ctx) + return len(r), err + }, + func() (int, error) { + r, err := c.GetAllExpectedSwitchDetails(ctx) + return len(r), err + }, + func() (int, error) { + r, err := c.GetAllExpectedPowerShelfDetails(ctx) + return len(r), err + }, + } { + n, err := fn() + assert.NoError(t, err) + assert.Zero(t, n) + } 
+} + +func labelKV(k, v string) *pb.Label { + val := v + return &pb.Label{Key: k, Value: &val} +} + +func strPtr(s string) *string { return &s } diff --git a/rest-api/flow/internal/nicoapi/grpc.go b/rest-api/flow/internal/nicoapi/grpc.go index e38006a948..0cf0021993 100644 --- a/rest-api/flow/internal/nicoapi/grpc.go +++ b/rest-api/flow/internal/nicoapi/grpc.go @@ -745,6 +745,82 @@ func (c *grpcClient) GetAllExpectedPowerShelvesLinked(ctx context.Context) ([]Li return results, nil } +func (c *grpcClient) GetAllExpectedRackDetails(ctx context.Context) ([]ExpectedRackDetail, error) { + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + resp, err := c.gclient.GetAllExpectedRacks(ctx, &emptypb.Empty{}) + if err != nil { + return nil, fmt.Errorf("failed to get all expected racks: %w", err) + } + rows := resp.GetExpectedRacks() + if len(rows) == 0 { + return nil, nil + } + results := make([]ExpectedRackDetail, 0, len(rows)) + for _, er := range rows { + results = append(results, expectedRackDetailFromPb(er)) + } + return results, nil +} + +func (c *grpcClient) GetAllExpectedMachineDetails(ctx context.Context) ([]ExpectedMachineDetail, error) { + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + resp, err := c.gclient.GetAllExpectedMachines(ctx, &emptypb.Empty{}) + if err != nil { + return nil, fmt.Errorf("failed to get all expected machines: %w", err) + } + rows := resp.GetExpectedMachines() + if len(rows) == 0 { + return nil, nil + } + results := make([]ExpectedMachineDetail, 0, len(rows)) + for _, em := range rows { + results = append(results, expectedMachineDetailFromPb(em)) + } + return results, nil +} + +func (c *grpcClient) GetAllExpectedSwitchDetails(ctx context.Context) ([]ExpectedSwitchDetail, error) { + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + resp, err := c.gclient.GetAllExpectedSwitches(ctx, &emptypb.Empty{}) + if err != nil { + return nil, fmt.Errorf("failed to get all 
expected switches: %w", err) + } + rows := resp.GetExpectedSwitches() + if len(rows) == 0 { + return nil, nil + } + results := make([]ExpectedSwitchDetail, 0, len(rows)) + for _, es := range rows { + results = append(results, expectedSwitchDetailFromPb(es)) + } + return results, nil +} + +func (c *grpcClient) GetAllExpectedPowerShelfDetails(ctx context.Context) ([]ExpectedPowerShelfDetail, error) { + ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) + defer cancel() + + resp, err := c.gclient.GetAllExpectedPowerShelves(ctx, &emptypb.Empty{}) + if err != nil { + return nil, fmt.Errorf("failed to get all expected power shelves: %w", err) + } + rows := resp.GetExpectedPowerShelves() + if len(rows) == 0 { + return nil, nil + } + results := make([]ExpectedPowerShelfDetail, 0, len(rows)) + for _, eps := range rows { + results = append(results, expectedPowerShelfDetailFromPb(eps)) + } + return results, nil +} + func (c *grpcClient) GetDesiredFirmwareVersions(ctx context.Context) ([]*pb.DesiredFirmwareVersionEntry, error) { ctx, cancel := context.WithTimeout(ctx, c.grpcTimeout) defer cancel() @@ -843,3 +919,19 @@ func (c *grpcClient) SetPowerShelfControllerState(shelfID, state string) { func (c *grpcClient) SetRackHostMachineIDs(rackID string, machineIDs []string) { panic("Not a unit test") } + +func (c *grpcClient) AddExpectedRackDetail(detail ExpectedRackDetail) { + panic("Not a unit test") +} + +func (c *grpcClient) AddExpectedMachineDetail(detail ExpectedMachineDetail) { + panic("Not a unit test") +} + +func (c *grpcClient) AddExpectedSwitchDetail(detail ExpectedSwitchDetail) { + panic("Not a unit test") +} + +func (c *grpcClient) AddExpectedPowerShelfDetail(detail ExpectedPowerShelfDetail) { + panic("Not a unit test") +} diff --git a/rest-api/flow/internal/nicoapi/mock.go b/rest-api/flow/internal/nicoapi/mock.go index bffdcd9fad..e3a54e9a1b 100644 --- a/rest-api/flow/internal/nicoapi/mock.go +++ b/rest-api/flow/internal/nicoapi/mock.go @@ -29,6 +29,13 @@ type 
mockClient struct { switchControllerStates map[string]string // switch ID → raw core controller_state powerShelfControllerStates map[string]string // shelf ID → raw core controller_state hostMachinesByRackID map[string][]string + // Detail tables for the GetAllExpected*Details RPCs (Flow's mirror sync). + // Keyed by the natural identifier the test cares about so test helpers can + // overwrite individual entries without rebuilding the whole slice. + expectedRackDetails map[string]ExpectedRackDetail // by RackID + expectedMachineDetails map[string]ExpectedMachineDetail // by ExpectedMachineID (UUID) + expectedSwitchDetails map[string]ExpectedSwitchDetail // by ExpectedSwitchID (UUID) + expectedPowerShelfDetails map[string]ExpectedPowerShelfDetail // by ExpectedPowerShelfID (UUID) } // NewMockClient returns a "GRPC" client that returns mock values so it can be used in unit tests. @@ -43,6 +50,10 @@ func NewMockClient() Client { switchControllerStates: map[string]string{}, powerShelfControllerStates: map[string]string{}, hostMachinesByRackID: map[string][]string{}, + expectedRackDetails: map[string]ExpectedRackDetail{}, + expectedMachineDetails: map[string]ExpectedMachineDetail{}, + expectedSwitchDetails: map[string]ExpectedSwitchDetail{}, + expectedPowerShelfDetails: map[string]ExpectedPowerShelfDetail{}, } } @@ -320,3 +331,72 @@ func (c *mockClient) SetMachineAutoUpdate(_ context.Context, _ string, _ bool) e func (c *mockClient) AddExpectedSwitchInfo(info ExpectedSwitchInfo) { c.expectedSwitches[utils.NormalizeMAC(info.BMCMACAddress)] = info } + +func (c *mockClient) GetAllExpectedRackDetails(_ context.Context) ([]ExpectedRackDetail, error) { + if len(c.expectedRackDetails) == 0 { + return nil, nil + } + out := make([]ExpectedRackDetail, 0, len(c.expectedRackDetails)) + for _, d := range c.expectedRackDetails { + out = append(out, d) + } + return out, nil +} + +func (c *mockClient) GetAllExpectedMachineDetails(_ context.Context) ([]ExpectedMachineDetail, error) { + 
if len(c.expectedMachineDetails) == 0 { + return nil, nil + } + out := make([]ExpectedMachineDetail, 0, len(c.expectedMachineDetails)) + for _, d := range c.expectedMachineDetails { + out = append(out, d) + } + return out, nil +} + +func (c *mockClient) GetAllExpectedSwitchDetails(_ context.Context) ([]ExpectedSwitchDetail, error) { + if len(c.expectedSwitchDetails) == 0 { + return nil, nil + } + out := make([]ExpectedSwitchDetail, 0, len(c.expectedSwitchDetails)) + for _, d := range c.expectedSwitchDetails { + out = append(out, d) + } + return out, nil +} + +func (c *mockClient) GetAllExpectedPowerShelfDetails(_ context.Context) ([]ExpectedPowerShelfDetail, error) { + if len(c.expectedPowerShelfDetails) == 0 { + return nil, nil + } + out := make([]ExpectedPowerShelfDetail, 0, len(c.expectedPowerShelfDetails)) + for _, d := range c.expectedPowerShelfDetails { + out = append(out, d) + } + return out, nil +} + +// AddExpectedRackDetail registers an expected rack for the mock GetAllExpectedRackDetails call. +func (c *mockClient) AddExpectedRackDetail(detail ExpectedRackDetail) { + c.expectedRackDetails[detail.RackID] = detail +} + +// AddExpectedMachineDetail registers an expected machine for the mock +// GetAllExpectedMachineDetails call. Tests that don't care about the +// ExpectedMachineID may leave it empty; the map then uses "" as the key (only +// one such entry will survive). +func (c *mockClient) AddExpectedMachineDetail(detail ExpectedMachineDetail) { + c.expectedMachineDetails[detail.ExpectedMachineID] = detail +} + +// AddExpectedSwitchDetail registers an expected switch for the mock +// GetAllExpectedSwitchDetails call. +func (c *mockClient) AddExpectedSwitchDetail(detail ExpectedSwitchDetail) { + c.expectedSwitchDetails[detail.ExpectedSwitchID] = detail +} + +// AddExpectedPowerShelfDetail registers an expected power shelf for the mock +// GetAllExpectedPowerShelfDetails call. 
+func (c *mockClient) AddExpectedPowerShelfDetail(detail ExpectedPowerShelfDetail) { + c.expectedPowerShelfDetails[detail.ExpectedPowerShelfID] = detail +} diff --git a/rest-api/flow/internal/nicoapi/mod.go b/rest-api/flow/internal/nicoapi/mod.go index 76c4e1ff9c..ba65e1522b 100644 --- a/rest-api/flow/internal/nicoapi/mod.go +++ b/rest-api/flow/internal/nicoapi/mod.go @@ -118,6 +118,29 @@ type Client interface { // created), and the expected power shelf UUID. GetAllExpectedPowerShelvesLinked(ctx context.Context) ([]LinkedExpectedPowerShelf, error) + // GetAllExpectedRackDetails returns every expected rack registered with + // Core. The result is the canonical view: rack_id (operator-supplied + // stable identifier), rack_profile_id, and Metadata (name, description, + // labels including chassis.* / location.*). + GetAllExpectedRackDetails(ctx context.Context) ([]ExpectedRackDetail, error) + + // GetAllExpectedMachineDetails returns every expected machine registered + // with Core, including bmc_mac, chassis_serial_number, rack_id, the + // expected_machine UUID and the full Metadata block. This is the source + // of truth for Flow's expected-inventory sync; do not confuse with + // GetMachines, which returns runtime (discovered) machine state. + GetAllExpectedMachineDetails(ctx context.Context) ([]ExpectedMachineDetail, error) + + // GetAllExpectedSwitchDetails returns every expected switch registered + // with Core as a flat slice carrying the full proto contents. The + // existing GetAllExpectedSwitches (keyed by BMC MAC, thin info) is kept + // for its current callers and intentionally not replaced here. + GetAllExpectedSwitchDetails(ctx context.Context) ([]ExpectedSwitchDetail, error) + + // GetAllExpectedPowerShelfDetails returns every expected power shelf + // registered with Core as a flat slice with the full proto contents. 
+ GetAllExpectedPowerShelfDetails(ctx context.Context) ([]ExpectedPowerShelfDetail, error) + // GetDesiredFirmwareVersions returns a slice of desired firmware version // entries configured in Core. Each entry carries vendor and model fields; // iterate the slice to find matching entries. @@ -144,4 +167,8 @@ type Client interface { SetSwitchControllerState(switchID, state string) SetPowerShelfControllerState(shelfID, state string) SetRackHostMachineIDs(rackID string, machineIDs []string) + AddExpectedRackDetail(detail ExpectedRackDetail) + AddExpectedMachineDetail(detail ExpectedMachineDetail) + AddExpectedSwitchDetail(detail ExpectedSwitchDetail) + AddExpectedPowerShelfDetail(detail ExpectedPowerShelfDetail) } diff --git a/rest-api/flow/internal/nicoapi/model.go b/rest-api/flow/internal/nicoapi/model.go index 9bf4a41802..850ae67e11 100644 --- a/rest-api/flow/internal/nicoapi/model.go +++ b/rest-api/flow/internal/nicoapi/model.go @@ -370,3 +370,147 @@ func bringUpStateFromPb( return BringUpStateNotDiscovered } } + +// ExpectedRackDetail is the canonical expected-rack representation returned by +// GetAllExpectedRacks. RackID is the operator-supplied stable identifier (the +// same string referenced by ExpectedMachine.RackID, ExpectedSwitchDetail.RackID, +// and ExpectedPowerShelfDetail.RackID). Labels carries well-known keys such as +// chassis.manufacturer / chassis.serial-number / chassis.model and location.*. +type ExpectedRackDetail struct { + RackID string + RackProfileID string + Name string + Description string + Labels map[string]string +} + +// ExpectedMachineDetail is the canonical expected-machine representation +// returned by GetAllExpectedMachines. ExpectedMachineID is the Core-side UUID +// for the expected_machines row (distinct from the runtime machine_id assigned +// after discovery). Labels carries the component metadata cloud REST writes +// alongside the typed fields: manufacturer, model, firmware_version, slot_id, +// tray_idx, host_id. 
type ExpectedMachineDetail struct {
	// ExpectedMachineID is filled from the proto id; it stays empty when the
	// proto carries no id.
	ExpectedMachineID string

	// BMCMACAddress is copied from the proto's bmc_mac_address.
	BMCMACAddress string

	// BMCIPAddress is optional on the proto and stays empty when it is unset.
	BMCIPAddress string

	// ChassisSerialNumber is copied from the proto's chassis_serial_number.
	ChassisSerialNumber string

	// RackID is the id of the referenced rack; empty when the proto has no
	// rack reference.
	RackID string

	// Name and Description come from the proto Metadata block.
	Name string

	Description string

	// Labels holds the Metadata labels that carry a value, keyed by label key.
	// It is nil when the proto has no metadata or no labels at all.
	Labels map[string]string
}

// ExpectedSwitchDetail is the flat, full-content expected-switch record that
// GetAllExpectedSwitchDetails produces, one entry per expected switch reported
// by Core. It is distinct from the thinner, BMC-MAC-keyed view returned by the
// client's GetAllExpectedSwitches. ExpectedSwitchID is the Core-side UUID for
// the expected_switches row. Labels carries the same component metadata keys as
// ExpectedMachineDetail.
type ExpectedSwitchDetail struct {
	// ExpectedSwitchID stays empty when the proto carries no
	// expected_switch_id.
	ExpectedSwitchID string

	// BMCMACAddress is copied from the proto's bmc_mac_address.
	BMCMACAddress string

	// BMCIPAddress is copied from the proto's bmc_ip_address.
	BMCIPAddress string

	// SwitchSerialNumber is copied from the proto's switch_serial_number.
	SwitchSerialNumber string

	// RackID is the id of the referenced rack; empty when the proto has no
	// rack reference.
	RackID string

	// Name and Description come from the proto Metadata block.
	Name string

	Description string

	// Labels holds the Metadata labels that carry a value, keyed by label key.
	// It is nil when the proto has no metadata or no labels at all.
	Labels map[string]string
}

// ExpectedPowerShelfDetail is the flat, full-content expected-power-shelf
// record that GetAllExpectedPowerShelfDetails produces, one entry per expected
// power shelf reported by Core. ExpectedPowerShelfID is the Core-side UUID for
// the expected_power_shelves row.
type ExpectedPowerShelfDetail struct {
	// ExpectedPowerShelfID stays empty when the proto carries no
	// expected_power_shelf_id.
	ExpectedPowerShelfID string

	// BMCMACAddress is copied from the proto's bmc_mac_address.
	BMCMACAddress string

	// BMCIPAddress is copied from the proto's bmc_ip_address.
	BMCIPAddress string

	// ShelfSerialNumber is copied from the proto's shelf_serial_number.
	ShelfSerialNumber string

	// RackID is the id of the referenced rack; empty when the proto has no
	// rack reference.
	RackID string

	// Name and Description come from the proto Metadata block.
	Name string

	Description string

	// Labels holds the Metadata labels that carry a value, keyed by label key.
	// It is nil when the proto has no metadata or no labels at all.
	Labels map[string]string
}

// metadataToGo extracts name, description and labels from a proto Metadata
// message into plain Go values. A nil metadata yields zero values and a nil
// labels map.
+func metadataToGo(md *pb.Metadata) (name, description string, labels map[string]string) { + if md == nil { + return "", "", nil + } + name = md.GetName() + description = md.GetDescription() + if pbLabels := md.GetLabels(); len(pbLabels) > 0 { + labels = make(map[string]string, len(pbLabels)) + for _, l := range pbLabels { + if l == nil { + continue + } + if l.Value != nil { + labels[l.GetKey()] = l.GetValue() + } + } + } + return name, description, labels +} + +func expectedRackDetailFromPb(er *pb.ExpectedRack) ExpectedRackDetail { + d := ExpectedRackDetail{ + RackProfileID: er.GetRackProfileId().GetId(), + } + if er.GetRackId() != nil { + d.RackID = er.GetRackId().GetId() + } + d.Name, d.Description, d.Labels = metadataToGo(er.GetMetadata()) + return d +} + +func expectedMachineDetailFromPb(em *pb.ExpectedMachine) ExpectedMachineDetail { + d := ExpectedMachineDetail{ + BMCMACAddress: em.GetBmcMacAddress(), + ChassisSerialNumber: em.GetChassisSerialNumber(), + } + if em.Id != nil { + d.ExpectedMachineID = em.GetId().GetValue() + } + if em.BmcIpAddress != nil { + d.BMCIPAddress = em.GetBmcIpAddress() + } + if em.RackId != nil { + d.RackID = em.GetRackId().GetId() + } + d.Name, d.Description, d.Labels = metadataToGo(em.GetMetadata()) + return d +} + +func expectedSwitchDetailFromPb(es *pb.ExpectedSwitch) ExpectedSwitchDetail { + d := ExpectedSwitchDetail{ + BMCMACAddress: es.GetBmcMacAddress(), + BMCIPAddress: es.GetBmcIpAddress(), + SwitchSerialNumber: es.GetSwitchSerialNumber(), + } + if es.ExpectedSwitchId != nil { + d.ExpectedSwitchID = es.GetExpectedSwitchId().GetValue() + } + if es.RackId != nil { + d.RackID = es.GetRackId().GetId() + } + d.Name, d.Description, d.Labels = metadataToGo(es.GetMetadata()) + return d +} + +func expectedPowerShelfDetailFromPb(eps *pb.ExpectedPowerShelf) ExpectedPowerShelfDetail { + d := ExpectedPowerShelfDetail{ + BMCMACAddress: eps.GetBmcMacAddress(), + BMCIPAddress: eps.GetBmcIpAddress(), + ShelfSerialNumber: 
eps.GetShelfSerialNumber(), + } + if eps.ExpectedPowerShelfId != nil { + d.ExpectedPowerShelfID = eps.GetExpectedPowerShelfId().GetValue() + } + if eps.RackId != nil { + d.RackID = eps.GetRackId().GetId() + } + d.Name, d.Description, d.Labels = metadataToGo(eps.GetMetadata()) + return d +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync.go new file mode 100644 index 0000000000..011a5649ca --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync.go @@ -0,0 +1,214 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + "fmt" + "time" + + "github.com/rs/zerolog/log" + "github.com/uptrace/bun" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" + pb "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi/gen" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/types" +) + +// driftFieldSerialNumber is the canonical drift field name used by both the +// machine and inventory paths when chassis serial mismatches show up. +const driftFieldSerialNumber = "serial_number" + +// runActualSync runs every per-type actual-vs-expected drift detector, +// concatenates their drifts, and logs a per-type "received from Core" +// summary. Each type-specific function handles its own errors internally +// and falls back to nil drifts; one type's RPC failure doesn't suppress the +// others. The returned drifts are not yet persisted — runInventoryOne owns +// the table-replacement transaction. 
+func runActualSync( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, +) []model.ComponentDrift { + var allDrifts []model.ComponentDrift + + computeReceived, machineDrifts := syncMachines(ctx, pool, nicoClient) + allDrifts = append(allDrifts, machineDrifts...) + + switchesReceived, nvSwitchDrifts := syncNVSwitchesNICo(ctx, pool, nicoClient) + allDrifts = append(allDrifts, nvSwitchDrifts...) + + powershelvesReceived, powershelfDrifts := syncPowershelvesNICo(ctx, pool, nicoClient) + allDrifts = append(allDrifts, powershelfDrifts...) + + log.Info(). + Int("compute", computeReceived). + Int("nvswitches", switchesReceived). + Int("powershelves", powershelvesReceived). + Msgf("Inventory received from Core: compute=%d nvswitches=%d powershelves=%d", + computeReceived, switchesReceived, powershelvesReceived) + + return allDrifts +} + +// mapKeys returns the keys of a string-keyed component map in arbitrary +// order. Used by the switch / power-shelf syncs to build the id slice they +// pass to the controller-state RPCs. +func mapKeys(m map[string]*model.Component) []string { + if len(m) == 0 { + return nil + } + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} + +// persistComponentStatuses maps raw core controller_state strings to +// ComponentStatus values via the per-type mapper and writes any deltas to the +// component table. components are keyed by external_id (machineID / switchID / +// shelfID). Entries without a state in statesByID are skipped — missing data +// is not a status reset. 
+func persistComponentStatuses( + ctx context.Context, + pool *cdb.Session, + componentType types.ComponentType, + statesByID map[string]string, + componentsByExternalID map[string]*model.Component, +) { + if len(statesByID) == 0 { + return + } + + var toUpdate []model.Component + for externalID, raw := range statesByID { + comp, ok := componentsByExternalID[externalID] + if !ok { + continue + } + newStatus := nicoapi.MapComponentStatus(componentType, raw) + if comp.Status != nil && comp.Status.Equal(newStatus) { + continue + } + comp.Status = &newStatus + toUpdate = append(toUpdate, *comp) + } + + if len(toUpdate) == 0 { + return + } + if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { + for _, cur := range toUpdate { + if err := cur.SetStatusByComponentID(ctx, tx); err != nil { + return fmt.Errorf("set component status: %w", err) + } + } + return nil + }); err != nil { + log.Error().Msgf("Unable to persist component statuses: %v", err) + } +} + +// applyInventoryToComponents extracts firmware_version and power_state from +// GetComponentInventoryResponse and direct-writes them to the matching +// components. Serial numbers are compared (not overwritten) and returned as +// drift records. componentsByID maps the component_id echoed back in each +// ComponentResult to the DB component. Shared by the switch and power-shelf +// syncs; the machine sync uses pre-fetched MachineDetail directly instead of +// going through GetComponentInventory. 
+func applyInventoryToComponents( + ctx context.Context, + pool *cdb.Session, + resp *pb.GetComponentInventoryResponse, + componentsByID map[string]*model.Component, +) []model.ComponentDrift { + now := time.Now() + var drifts []model.ComponentDrift + + for _, entry := range resp.GetEntries() { + result := entry.GetResult() + if result == nil { + continue + } + comp, ok := componentsByID[result.GetComponentId()] + if !ok { + continue + } + if result.GetStatus() != pb.ComponentManagerStatusCode_COMPONENT_MANAGER_STATUS_CODE_SUCCESS { + log.Warn().Msgf("Component %s: inventory status %s: %s", result.GetComponentId(), result.GetStatus(), result.GetError()) + continue + } + + report := entry.GetReport() + if report == nil { + continue + } + + needsUpdate := false + + // Extract firmware_version from the "BMC image" inventory entry + for _, svc := range report.GetService() { + for _, inv := range svc.GetInventories() { + if inv.GetDescription() == "BMC image" { + if v := inv.GetVersion(); v != "" && comp.FirmwareVersion != v { + comp.FirmwareVersion = v + needsUpdate = true + } + } + } + } + + // Compare serial_number from first Chassis entry (drift, not overwrite) + if chassisList := report.GetChassis(); len(chassisList) > 0 { + if sn := chassisList[0].GetSerialNumber(); sn != "" && comp.SerialNumber != sn { + compID := comp.ID + extID := result.GetComponentId() + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + ExternalID: &extID, + DriftType: model.DriftTypeMismatch, + Diffs: []model.FieldDiff{{ + FieldName: driftFieldSerialNumber, + ExpectedValue: comp.SerialNumber, + ActualValue: sn, + }}, + CheckedAt: now, + }) + } + } + + // Extract power_state from first ComputerSystem entry + if systems := report.GetSystems(); len(systems) > 0 { + ps := computerSystemPowerStateToNICo(systems[0].GetPowerState()) + if comp.PowerState == nil || *comp.PowerState != ps { + comp.PowerState = &ps + needsUpdate = true + } + } + + if needsUpdate { + if err := 
comp.Patch(ctx, pool.DB); err != nil { + log.Error().Msgf("Component %s: unable to write inventory fields: %v", result.GetComponentId(), err) + } + } + } + + return drifts +} + +func computerSystemPowerStateToNICo( + ps pb.ComputerSystemPowerState, +) nicoapi.PowerState { + switch ps { + case pb.ComputerSystemPowerState_On, pb.ComputerSystemPowerState_PoweringOn: + return nicoapi.PowerStateOn + case pb.ComputerSystemPowerState_Off, pb.ComputerSystemPowerState_PoweringOff: + return nicoapi.PowerStateOff + default: + return nicoapi.PowerStateUnknown + } +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_machine.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_machine.go new file mode 100644 index 0000000000..bb857f4535 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_machine.go @@ -0,0 +1,434 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + "fmt" + "time" + + "github.com/rs/zerolog/log" + "github.com/uptrace/bun" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/types" +) + +func isMachineComponentType(t string) bool { + return t == devicetypes.ComponentTypeToString(devicetypes.ComponentTypeCompute) +} + +// --------------------------------------------------------------------------- +// syncMachines: sync machine components against NICo +// --------------------------------------------------------------------------- +// +// NICo API calls (3 round-trips): +// - GetMachines (FindMachineIds + FindMachinesByIds): serial matching, +// firmware_version direct-write, 
and drift comparison data +// - GetPowerStates: power_state direct-write +// - GetMachinePositionInfo: position validation fields for drift comparison +// +// Flow: +// 1. DB: get all machine components +// 2. NICo GetMachines: fetch all machine details (reused for steps 3, 5, and drift) +// 3. Match by serial → direct-write external_id +// 4. NICo GetPowerStates: direct-write power_state +// 5. Direct-write firmware_version (from step 2 data) +// 6. NICo GetMachinePositionInfo: compare validation fields, return drifts +// +// Validation fields (compared for drift): slot_id, tray_index, host_id, serial_number +// Direct-write fields (written to DB, not compared): external_id, power_state, firmware_version +func syncMachines( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, +) (received int, drifts []model.ComponentDrift) { + log.Debug().Msg("Syncing machines...") + + // Step 1: Get all machine components from DB + allComponents, err := model.GetAllComponents(ctx, pool.DB) + if err != nil { + log.Error().Msgf("Unable to retrieve components from db: %v", err) + return 0, nil + } + + var components []model.Component + for _, c := range allComponents { + if isMachineComponentType(c.Type) { + components = append(components, c) + } + } + + if len(components) == 0 { + return 0, nil + } + + // Step 2: Fetch all machine details from NICo + allMachineDetails, err := nicoClient.GetMachines(ctx) + if err != nil { + log.Error().Msgf("Unable to retrieve machine details from NICo: %v", err) + return 0, nil + } + received = len(allMachineDetails) + + detailByID := make(map[string]nicoapi.MachineDetail) + for _, d := range allMachineDetails { + detailByID[d.MachineID] = d + } + + // Step 3: Direct-write external_id by serial matching + syncMachineIDs(ctx, pool, allMachineDetails, components) + + // Re-read components to pick up any external_id updates + allComponents, err = model.GetAllComponents(ctx, pool.DB) + if err != nil { + log.Error().Msgf("Unable to 
re-read components from db after machine ID update: %v", err) + return received, nil + } + components = components[:0] + for _, c := range allComponents { + if isMachineComponentType(c.Type) { + components = append(components, c) + } + } + + // Build lookup maps for matched components + var machineIDs []string + componentsByExternalID := make(map[string]*model.Component) + for i := range components { + comp := &components[i] + if comp.ComponentID != nil && *comp.ComponentID != "" { + machineIDs = append(machineIDs, *comp.ComponentID) + componentsByExternalID[*comp.ComponentID] = comp + } + } + + if len(machineIDs) == 0 { + return received, buildDriftsForUnmatchedComponents(components, allMachineDetails) + } + + // Step 4: Direct-write power_state (requires separate NICo API) + syncPowerStates(ctx, pool, nicoClient, machineIDs, componentsByExternalID) + + // Step 5: Direct-write firmware_version (from pre-fetched details, no extra API call) + syncFirmwareVersions(ctx, pool, detailByID, componentsByExternalID) + + // Step 5b: Direct-write derived ComponentStatus (from pre-fetched detail.State). 
+ syncMachineStatuses(ctx, pool, detailByID, componentsByExternalID) + + // Step 6: Fetch positions and build drift records (requires separate NICo API) + machinePositions, err := nicoClient.GetMachinePositionInfo(ctx, machineIDs) + if err != nil { + log.Error().Msgf("Unable to retrieve machine positions from NICo: %v", err) + return received, nil + } + + positionByID := make(map[string]nicoapi.MachinePosition) + for _, p := range machinePositions { + positionByID[p.MachineID] = p + } + + now := time.Now() + + for i := range components { + comp := &components[i] + + if comp.ComponentID == nil || *comp.ComponentID == "" { + compID := comp.ID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + ExternalID: nil, + DriftType: model.DriftTypeMissingInActual, + Diffs: []model.FieldDiff{}, + CheckedAt: now, + }) + continue + } + + externalID := *comp.ComponentID + detail, foundDetail := detailByID[externalID] + position, foundPosition := positionByID[externalID] + + if !foundDetail { + compID := comp.ID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + ExternalID: &externalID, + DriftType: model.DriftTypeMissingInActual, + Diffs: []model.FieldDiff{}, + CheckedAt: now, + }) + continue + } + + var posPtr *nicoapi.MachinePosition + if foundPosition { + posPtr = &position + } + fieldDiffs := compareMachineFieldsForDrift(comp, detail, posPtr) + if len(fieldDiffs) > 0 { + compID := comp.ID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + ExternalID: &externalID, + DriftType: model.DriftTypeMismatch, + Diffs: fieldDiffs, + CheckedAt: now, + }) + } + } + + // Detect missing_in_expected: machines in NICo but not in local DB + for _, detail := range allMachineDetails { + if _, found := componentsByExternalID[detail.MachineID]; !found { + extID := detail.MachineID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: nil, + ExternalID: &extID, + DriftType: model.DriftTypeMissingInExpected, + Diffs: 
[]model.FieldDiff{}, + CheckedAt: now, + }) + } + } + + log.Info().Msgf("Machine sync: %d drift(s) out of %d component(s)", len(drifts), len(components)) + return received, drifts +} + +// buildDriftsForUnmatchedComponents returns missing_in_actual drifts for all +// components that have no external_id, plus missing_in_expected drifts for +// every NICo machine (since no DB component has an external_id, none can +// match). +func buildDriftsForUnmatchedComponents( + components []model.Component, + allMachineDetails []nicoapi.MachineDetail, +) []model.ComponentDrift { + now := time.Now() + var drifts []model.ComponentDrift + for i := range components { + if components[i].ComponentID == nil || *components[i].ComponentID == "" { + compID := components[i].ID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + DriftType: model.DriftTypeMissingInActual, + Diffs: []model.FieldDiff{}, + CheckedAt: now, + }) + } + } + for _, detail := range allMachineDetails { + extID := detail.MachineID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: nil, + ExternalID: &extID, + DriftType: model.DriftTypeMissingInExpected, + Diffs: []model.FieldDiff{}, + CheckedAt: now, + }) + } + return drifts +} + +// syncMachineIDs matches components by serial number against pre-fetched NICo +// machine details and direct-writes the external_id. 
+func syncMachineIDs( + ctx context.Context, + pool *cdb.Session, + allDetails []nicoapi.MachineDetail, + components []model.Component, +) { + containersBySerial := make(map[string]model.Component) + for _, cur := range components { + containersBySerial[cur.SerialNumber] = cur + } + + var toUpdate []model.Component + for _, cur := range allDetails { + if cur.ChassisSerial == nil { + continue + } + if container, ok := containersBySerial[*cur.ChassisSerial]; ok { + if container.ComponentID == nil || *container.ComponentID != cur.MachineID { + componentID := cur.MachineID + container.ComponentID = &componentID + toUpdate = append(toUpdate, container) + } + } + } + + if len(toUpdate) > 0 { + if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { + for _, cur := range toUpdate { + if err := cur.SetComponentIDBySerial(ctx, tx); err != nil { + return fmt.Errorf("Unable to update machine ID: %w", err) + } + } + return nil + }); err != nil { + log.Error().Msgf("Unable to update components with serial: %v", err) + return + } + + log.Info().Msgf("Updated %d machine ID(s)", len(toUpdate)) + } +} + +// syncPowerStates fetches power states from NICo and direct-writes to component table. 
+func syncPowerStates( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, + machineIDs []string, + componentsByExternalID map[string]*model.Component, +) { + machines, err := nicoClient.GetPowerStates(ctx, machineIDs) + if err != nil { + log.Error().Msgf("Unable to retrieve power states from nico-core-api: %v", err) + return + } + + var toUpdate []model.Component + for _, cur := range machines { + if comp, ok := componentsByExternalID[cur.MachineID]; ok { + if comp.PowerState == nil || *comp.PowerState != cur.PowerState { + powerState := cur.PowerState + comp.PowerState = &powerState + toUpdate = append(toUpdate, *comp) + } + } + } + + if len(toUpdate) > 0 { + if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { + for _, cur := range toUpdate { + if err := cur.SetPowerStateByComponentID(ctx, tx); err != nil { + return fmt.Errorf("Unable to update power state: %w", err) + } + } + return nil + }); err != nil { + log.Error().Msgf("Unable to update components with power state: %v", err) + } + } +} + +// syncFirmwareVersions direct-writes firmware_version from NICo machine details to component table. 
+func syncFirmwareVersions( + ctx context.Context, + pool *cdb.Session, + detailByID map[string]nicoapi.MachineDetail, + componentsByExternalID map[string]*model.Component, +) { + var toUpdate []model.Component + for machineID, detail := range detailByID { + if comp, ok := componentsByExternalID[machineID]; ok { + if detail.FirmwareVersion != "" && comp.FirmwareVersion != detail.FirmwareVersion { + comp.FirmwareVersion = detail.FirmwareVersion + toUpdate = append(toUpdate, *comp) + } + } + } + + if len(toUpdate) > 0 { + if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { + for _, cur := range toUpdate { + if err := cur.SetFirmwareVersionByComponentID(ctx, tx); err != nil { + return fmt.Errorf("unable to update firmware version: %w", err) + } + } + return nil + }); err != nil { + log.Error().Msgf("Unable to update components with firmware version: %v", err) + } + } +} + +// syncMachineStatuses derives a types.ComponentStatus from each machine's +// controller_state (already fetched as detail.State) and direct-writes it to +// the component row. Only rows whose status actually changed are updated. +func syncMachineStatuses( + ctx context.Context, + pool *cdb.Session, + detailByID map[string]nicoapi.MachineDetail, + componentsByExternalID map[string]*model.Component, +) { + statesByID := make(map[string]string, len(detailByID)) + for id, d := range detailByID { + if d.State != "" { + statesByID[id] = d.State + } + } + persistComponentStatuses(ctx, pool, types.ComponentTypeCompute, statesByID, componentsByExternalID) +} + +// compareMachineFieldsForDrift compares validation fields between expected (DB) and actual (NICo). +// Validation fields: slot_id, tray_index, host_id, serial_number. 
+func compareMachineFieldsForDrift( + expected *model.Component, + actual nicoapi.MachineDetail, + position *nicoapi.MachinePosition, +) []model.FieldDiff { + var diffs []model.FieldDiff + + if position != nil { + if position.PhysicalSlotNum != nil && expected.SlotID != int(*position.PhysicalSlotNum) { + diffs = append(diffs, model.FieldDiff{ + FieldName: "slot_id", + ExpectedValue: fmt.Sprintf("%d", expected.SlotID), + ActualValue: fmt.Sprintf("%d", *position.PhysicalSlotNum), + }) + } + if position.ComputeTrayIndex != nil && expected.TrayIndex != int(*position.ComputeTrayIndex) { + diffs = append(diffs, model.FieldDiff{ + FieldName: "tray_index", + ExpectedValue: fmt.Sprintf("%d", expected.TrayIndex), + ActualValue: fmt.Sprintf("%d", *position.ComputeTrayIndex), + }) + } + if position.TopologyID != nil && expected.HostID != int(*position.TopologyID) { + diffs = append(diffs, model.FieldDiff{ + FieldName: "host_id", + ExpectedValue: fmt.Sprintf("%d", expected.HostID), + ActualValue: fmt.Sprintf("%d", *position.TopologyID), + }) + } + } else { + if expected.SlotID != 0 { + diffs = append(diffs, model.FieldDiff{ + FieldName: "slot_id", + ExpectedValue: fmt.Sprintf("%d", expected.SlotID), + ActualValue: "", + }) + } + if expected.TrayIndex != 0 { + diffs = append(diffs, model.FieldDiff{ + FieldName: "tray_index", + ExpectedValue: fmt.Sprintf("%d", expected.TrayIndex), + ActualValue: "", + }) + } + if expected.HostID != 0 { + diffs = append(diffs, model.FieldDiff{ + FieldName: "host_id", + ExpectedValue: fmt.Sprintf("%d", expected.HostID), + ActualValue: "", + }) + } + } + + // Compare serial_number (chassis_serial) + if actual.ChassisSerial != nil && expected.SerialNumber != *actual.ChassisSerial { + diffs = append(diffs, model.FieldDiff{ + FieldName: driftFieldSerialNumber, + ExpectedValue: expected.SerialNumber, + ActualValue: *actual.ChassisSerial, + }) + } + + return diffs +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/drift_test.go 
b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_machine_test.go similarity index 100% rename from rest-api/flow/internal/scheduler/jobs/inventorysync/drift_test.go rename to rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_machine_test.go diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_powershelf.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_powershelf.go new file mode 100644 index 0000000000..5558eb3f56 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_powershelf.go @@ -0,0 +1,163 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + "net" + "time" + + "github.com/rs/zerolog/log" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/common/utils" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" + pb "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi/gen" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/types" +) + +// --------------------------------------------------------------------------- +// syncPowershelvesNICo: sync PowerShelf components via Core (NICo) +// --------------------------------------------------------------------------- +// +// Uses Core's NICo API. Core's PSM backend auto-registers power shelves, so no +// registration step is needed. +// +// NICo API calls (2 round-trips): +// - GetAllExpectedPowerShelvesLinked: discover Core power shelf IDs by PMC MAC +// - GetComponentInventory: get firmware, power state from site explorer +// +// Flow: +// 1. DB: get all PowerShelf components with PMCs +// 2. 
NICo GetAllExpectedPowerShelvesLinked: map PMC MAC → Core PowerShelfId +// 3. Direct-write external_id (Core's PowerShelfId) for matched components +// 4. NICo GetComponentInventory: extract firmware_version, power_state +// 5. Direct-write inventory fields to DB +// 6. Return drifts (missing_in_actual for components without a Core PowerShelfId) +func syncPowershelvesNICo( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, +) (received int, drifts []model.ComponentDrift) { + log.Debug().Msg("Syncing powershelves via NICo...") + + expectedPowershelves, err := model.GetComponentsByType(ctx, pool.DB, devicetypes.ComponentTypePowerShelf) + if err != nil { + log.Error().Msgf("Unable to retrieve powershelf components from db: %v", err) + return 0, nil + } + + if len(expectedPowershelves) == 0 { + return 0, nil + } + + expectedByPmcMac := make(map[string]*model.Component) + for i := range expectedPowershelves { + ps := &expectedPowershelves[i] + if len(ps.BMCs) != 1 { + log.Error().Msgf("Powershelf %s has %d BMCs, expected exactly 1; skipping", ps.SerialNumber, len(ps.BMCs)) + continue + } + pmcMacAddr, err := net.ParseMAC(ps.BMCs[0].MacAddress) + if err != nil || pmcMacAddr == nil { + log.Error().Msgf("Powershelf %s has invalid BMC MAC address %s; skipping", ps.SerialNumber, ps.BMCs[0].MacAddress) + continue + } + expectedByPmcMac[pmcMacAddr.String()] = ps + } + + // ID discovery: map PMC MAC → Core PowerShelfId + linked, err := nicoClient.GetAllExpectedPowerShelvesLinked(ctx) + if err != nil { + log.Error().Msgf("Unable to retrieve linked expected power shelves from NICo: %v", err) + return 0, nil + } + received = len(linked) + + linkedByMac := make(map[string]nicoapi.LinkedExpectedPowerShelf) + for _, leps := range linked { + if leps.BMCMACAddress != "" { + linkedByMac[utils.NormalizeMAC(leps.BMCMACAddress)] = leps + } + } + + // Direct-write external_id for matched components + var shelfIDs []*pb.PowerShelfId + componentsByShelfID := 
make(map[string]*model.Component) + + for pmcMac, ps := range expectedByPmcMac { + leps, found := linkedByMac[pmcMac] + if !found || leps.PowerShelfID == "" { + continue + } + + if ps.ComponentID == nil || *ps.ComponentID != leps.PowerShelfID { + shelfID := leps.PowerShelfID + ps.ComponentID = &shelfID + if err := ps.Patch(ctx, pool.DB); err != nil { + log.Error().Msgf("Powershelf %s (PMC %s): unable to update external_id: %v", ps.ID, pmcMac, err) + continue + } + log.Info().Msgf("Powershelf %s (PMC %s): set external_id to Core PowerShelfId %s", ps.ID, pmcMac, shelfID) + } + + shelfIDs = append(shelfIDs, &pb.PowerShelfId{Id: leps.PowerShelfID}) + componentsByShelfID[leps.PowerShelfID] = ps + } + + // Fetch inventory from Core for all matched power shelves + now := time.Now() + if len(shelfIDs) > 0 { + invResp, err := nicoClient.GetComponentInventory(ctx, &pb.GetComponentInventoryRequest{ + Target: &pb.GetComponentInventoryRequest_PowerShelfIds{ + PowerShelfIds: &pb.PowerShelfIdList{Ids: shelfIDs}, + }, + }) + if err != nil { + log.Error().Msgf("Unable to retrieve powershelf inventory from NICo: %v", err) + } else { + drifts = append(drifts, applyInventoryToComponents(ctx, pool, invResp, componentsByShelfID)...) + } + } + + syncPowershelfStatuses(ctx, pool, nicoClient, componentsByShelfID) + + // Build drifts for components that don't have a Core PowerShelfId yet + for _, ps := range expectedByPmcMac { + if ps.ComponentID == nil || *ps.ComponentID == "" { + compID := ps.ID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + ExternalID: nil, + DriftType: model.DriftTypeMissingInActual, + Diffs: []model.FieldDiff{}, + CheckedAt: now, + }) + } + } + + log.Info().Msgf("Powershelf NICo sync: %d drift(s) out of %d expected", len(drifts), len(expectedPowershelves)) + return received, drifts +} + +// syncPowershelfStatuses is the power-shelf equivalent of syncSwitchStatuses. 
+func syncPowershelfStatuses( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, + componentsByShelfID map[string]*model.Component, +) { + ids := mapKeys(componentsByShelfID) + if len(ids) == 0 { + return + } + statesByID, err := nicoClient.FindPowerShelfControllerStates(ctx, ids) + if err != nil { + log.Error().Msgf("Unable to retrieve power-shelf controller_states from NICo: %v", err) + return + } + persistComponentStatuses(ctx, pool, types.ComponentTypePowerShelf, statesByID, componentsByShelfID) +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_switch.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_switch.go new file mode 100644 index 0000000000..876c850c70 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/actual_sync_switch.go @@ -0,0 +1,164 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + "net" + "time" + + "github.com/rs/zerolog/log" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/common/utils" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" + pb "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi/gen" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/types" +) + +// --------------------------------------------------------------------------- +// syncNVSwitchesNICo: sync NVSwitch components via Core (NICo) +// --------------------------------------------------------------------------- +// +// Uses Core's NICo API. Core's NSM backend auto-registers switches, so no +// registration step is needed. 
+// +// NICo API calls (2 round-trips): +// - GetAllExpectedSwitchesLinked: discover Core switch IDs by BMC MAC +// - GetComponentInventory: get firmware, serial, power state from site explorer +// +// Flow: +// 1. DB: get all NVSwitch components with BMCs +// 2. NICo GetAllExpectedSwitchesLinked: map BMC MAC → Core SwitchId +// 3. Direct-write external_id (Core's SwitchId) for matched components +// 4. NICo GetComponentInventory: extract firmware_version, serial_number, power_state +// 5. Direct-write inventory fields to DB +// 6. Return drifts (missing_in_actual for components without a Core SwitchId) +func syncNVSwitchesNICo( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, +) (received int, drifts []model.ComponentDrift) { + log.Debug().Msg("Syncing NV switches via NICo...") + + expectedSwitches, err := model.GetComponentsByType(ctx, pool.DB, devicetypes.ComponentTypeNVSwitch) + if err != nil { + log.Error().Msgf("Unable to retrieve NVSwitch components from db: %v", err) + return 0, nil + } + + if len(expectedSwitches) == 0 { + return 0, nil + } + + expectedByBmcMac := make(map[string]*model.Component) + for i := range expectedSwitches { + sw := &expectedSwitches[i] + if len(sw.BMCs) != 1 { + log.Error().Msgf("NVSwitch %s has %d BMCs, expected exactly 1; skipping", sw.SerialNumber, len(sw.BMCs)) + continue + } + bmcMacAddr, err := net.ParseMAC(sw.BMCs[0].MacAddress) + if err != nil || bmcMacAddr == nil { + log.Error().Msgf("NVSwitch %s has invalid BMC MAC address %s; skipping", sw.SerialNumber, sw.BMCs[0].MacAddress) + continue + } + expectedByBmcMac[bmcMacAddr.String()] = sw + } + + // ID discovery: map BMC MAC → Core SwitchId + linked, err := nicoClient.GetAllExpectedSwitchesLinked(ctx) + if err != nil { + log.Error().Msgf("Unable to retrieve linked expected switches from NICo: %v", err) + return 0, nil + } + received = len(linked) + + linkedByMac := make(map[string]nicoapi.LinkedExpectedSwitch) + for _, les := range linked { + if 
les.BMCMACAddress != "" { + linkedByMac[utils.NormalizeMAC(les.BMCMACAddress)] = les + } + } + + // Direct-write external_id for matched components + var switchIDs []*pb.SwitchId + componentsBySwitchID := make(map[string]*model.Component) + + for bmcMac, sw := range expectedByBmcMac { + les, found := linkedByMac[bmcMac] + if !found || les.SwitchID == "" { + continue + } + + if sw.ComponentID == nil || *sw.ComponentID != les.SwitchID { + switchID := les.SwitchID + sw.ComponentID = &switchID + if err := sw.Patch(ctx, pool.DB); err != nil { + log.Error().Msgf("NVSwitch %s (BMC %s): unable to update external_id: %v", sw.ID, bmcMac, err) + continue + } + log.Info().Msgf("NVSwitch %s (BMC %s): set external_id to Core SwitchId %s", sw.ID, bmcMac, switchID) + } + + switchIDs = append(switchIDs, &pb.SwitchId{Id: les.SwitchID}) + componentsBySwitchID[les.SwitchID] = sw + } + + // Fetch inventory from Core for all matched switches + now := time.Now() + if len(switchIDs) > 0 { + invResp, err := nicoClient.GetComponentInventory(ctx, &pb.GetComponentInventoryRequest{ + Target: &pb.GetComponentInventoryRequest_SwitchIds{ + SwitchIds: &pb.SwitchIdList{Ids: switchIDs}, + }, + }) + if err != nil { + log.Error().Msgf("Unable to retrieve switch inventory from NICo: %v", err) + } else { + drifts = append(drifts, applyInventoryToComponents(ctx, pool, invResp, componentsBySwitchID)...) 
+ } + } + + syncSwitchStatuses(ctx, pool, nicoClient, componentsBySwitchID) + + // Build drifts for components that don't have a Core SwitchId yet + for _, sw := range expectedByBmcMac { + if sw.ComponentID == nil || *sw.ComponentID == "" { + compID := sw.ID + drifts = append(drifts, model.ComponentDrift{ + ComponentID: &compID, + ExternalID: nil, + DriftType: model.DriftTypeMissingInActual, + Diffs: []model.FieldDiff{}, + CheckedAt: now, + }) + } + } + + log.Info().Msgf("NVSwitch NICo sync: %d drift(s) out of %d expected", len(drifts), len(expectedSwitches)) + return received, drifts +} + +// syncSwitchStatuses fetches controller_state for the matched switches and +// persists the derived ComponentStatus per DB row. +func syncSwitchStatuses( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, + componentsBySwitchID map[string]*model.Component, +) { + ids := mapKeys(componentsBySwitchID) + if len(ids) == 0 { + return + } + statesByID, err := nicoClient.FindSwitchControllerStates(ctx, ids) + if err != nil { + log.Error().Msgf("Unable to retrieve switch controller_states from NICo: %v", err) + return + } + persistComponentStatuses(ctx, pool, types.ComponentTypeNVSwitch, statesByID, componentsBySwitchID) +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror.go new file mode 100644 index 0000000000..e67f8dffe0 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror.go @@ -0,0 +1,148 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + + "github.com/google/uuid" + "github.com/rs/zerolog/log" + "github.com/uptrace/bun" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes" +) + +// mirrorResult summarises one mirror pass for a single resource type. Used for +// structured logging so the operator can tell at a glance whether a sync was +// well-behaved (mostly updates, no surprises) or alarming (large delete +// counts). +type mirrorResult struct { + resource string + pulled int + inserted int + updated int + adopted int + resurrected int + softDeleted int + legacyExempt int + skippedNoIDOrKey int +} + +func (r mirrorResult) log() { + log.Info(). + Str("resource", r.resource). + Int("pulled", r.pulled). + Int("inserted", r.inserted). + Int("updated", r.updated). + Int("adopted", r.adopted). + Int("resurrected", r.resurrected). + Int("soft_deleted", r.softDeleted). + Int("legacy_exempt", r.legacyExempt). + Int("skipped_invalid", r.skippedNoIDOrKey). + Msgf("Expected-inventory mirror: %s", r.resource) +} + +// syncExpectedFromCore pulls Core's expected inventory and reconciles each +// of Flow's tables to mirror it. Racks are reconciled first so a per-cycle +// rack_id → Rack.UUID map is available to resolve every component's +// RackExternalID into the FK Flow needs. Each resource type is independent: +// an RPC failure on machines doesn't stop switches from being reconciled. +// +// Runs immediately before runInventoryOne's drift detection so the drift +// loop sees a Flow inventory that's already aligned with Core's expected +// view. 
+func syncExpectedFromCore( + ctx context.Context, + pool *cdb.Session, + nicoClient nicoapi.Client, +) { + racks, rackOK, rackHasRows := pullExpectedRacks(ctx, nicoClient) + if rackOK { + result := mirrorExpectedRacks(ctx, pool, racks, !rackHasRows) + result.log() + } + + // Build the cross-reference map after rack mirror so component specs + // referencing rack_id strings can resolve them to Flow Rack.UUIDs. Done + // via a fresh DB read instead of returning the map from + // mirrorExpectedRacks so the rack mirror's signature stays focused; the + // read is cheap (rack count is small) and includes adoptions / inserts + // the mirror just made. + rackIDByExtID, err := loadRackIDByExternalID(ctx, pool.DB) + if err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: loading rack external_id map failed; skipping component mirror this cycle") + return + } + + if machines, ok, hasRows := pullExpectedMachines(ctx, nicoClient); ok { + specs := make([]expectedComponentSpec, 0, len(machines)) + for _, m := range machines { + specs = append(specs, machineDetailToSpec(m)) + } + result := mirrorExpectedComponents(ctx, pool, + devicetypes.ComponentTypeToString(devicetypes.ComponentTypeCompute), + specs, rackIDByExtID, !hasRows) + result.log() + } + + if switches, ok, hasRows := pullExpectedSwitches(ctx, nicoClient); ok { + specs := make([]expectedComponentSpec, 0, len(switches)) + for _, s := range switches { + specs = append(specs, switchDetailToSpec(s)) + } + result := mirrorExpectedComponents(ctx, pool, + devicetypes.ComponentTypeToString(devicetypes.ComponentTypeNVSwitch), + specs, rackIDByExtID, !hasRows) + result.log() + } + + if shelves, ok, hasRows := pullExpectedPowerShelves(ctx, nicoClient); ok { + specs := make([]expectedComponentSpec, 0, len(shelves)) + for _, ps := range shelves { + specs = append(specs, powerShelfDetailToSpec(ps)) + } + result := mirrorExpectedComponents(ctx, pool, + 
devicetypes.ComponentTypeToString(devicetypes.ComponentTypePowerShelf), + specs, rackIDByExtID, !hasRows) + result.log() + } +} + +// loadRackIDByExternalID returns a map keyed by rack.external_id (Core's +// rack_id string) of the matching Flow Rack.UUID. Soft-deleted rows are +// excluded because component specs that reference a deleted rack would +// inherit a stale FK; better to skip the component spec with a warn. +func loadRackIDByExternalID(ctx context.Context, idb bun.IDB) (map[string]uuid.UUID, error) { + var rows []struct { + ID uuid.UUID `bun:"id"` + ExternalID *string `bun:"external_id"` + } + if err := idb.NewSelect(). + Model((*model.Rack)(nil)). + Column("id", "external_id"). + Where("external_id IS NOT NULL AND external_id <> ''"). + Scan(ctx, &rows); err != nil { + return nil, err + } + out := make(map[string]uuid.UUID, len(rows)) + for _, r := range rows { + if r.ExternalID != nil && *r.ExternalID != "" { + out[*r.ExternalID] = r.ID + } + } + return out, nil +} + +// rackNaturalKey joins manufacturer and serial number with a NUL byte. NUL +// can't appear inside either component, so this is collision-free without +// having to escape. Used by both the rack and component mirrors to key +// "is this row already known" maps off the shared (manufacturer, serial) +// pair, so it lives here next to the orchestrator rather than in either +// type-specific file. +func rackNaturalKey(manufacturer, serialNumber string) string { + return manufacturer + "\x00" + serialNumber +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_component.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_component.go new file mode 100644 index 0000000000..afbb40075e --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_component.go @@ -0,0 +1,734 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + "database/sql" + "errors" + "fmt" + "strconv" + + "github.com/google/uuid" + "github.com/rs/zerolog/log" + "github.com/uptrace/bun" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" + "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes" +) + +// Well-known label keys cloud REST stuffs onto an expected component's +// Metadata when calling Core. Mirrored here so this package doesn't import the +// cloud REST DB-model crate. Keep in sync with rest-api/db/pkg/db/model/common.go +// (ExpectedComponentLabel* constants). +// +// firmware_version is intentionally absent: that column on Flow's component +// table is owned by the runtime sync (see syncFirmwareVersions in +// inventory.go), which reads what the BMC is actually running. Mirroring an +// "expected" version here would clobber the runtime value every cycle. +const ( + labelComponentManufacturer = "manufacturer" + labelComponentModel = "model" + labelComponentSlotID = "slot_id" + labelComponentTrayIdx = "tray_idx" + labelComponentHostID = "host_id" +) + +// expectedComponentSpec is the normalised view of one Core expected_* row. +// Each Core type (ExpectedMachine / ExpectedSwitch / ExpectedPowerShelf) is +// flattened into this shape so mirrorExpectedComponents is single-typed. +// +// BMC credentials (bmc_username / bmc_password on the Core side) are +// intentionally omitted: those are factory-default creds whose live value is +// kept in Vault after site-explorer's password rotation. Copying the stale +// pre-rotation value into Flow's bmc table would just give a misleading +// fallback and spread secret material across one more store. 
// expectedComponentSpec is the type-agnostic form of one Core expected_* row.
// ExpectedMachine / ExpectedSwitch / ExpectedPowerShelf details are all
// flattened into it so mirrorExpectedComponents only deals with one shape.
//
// Core's BMC credentials (bmc_username / bmc_password) are deliberately not
// carried: they are factory defaults whose live value is kept in Vault after
// site-explorer's password rotation, so copying them would only spread stale
// secret material into another store.
type expectedComponentSpec struct {
	Type           string
	Manufacturer   string
	SerialNumber   string
	Model          string
	Name           string
	SlotID         int
	TrayIndex      int
	HostID         int
	RackExternalID string
	BMC            expectedBMCSpec
	// preserveFields is the set of mirror-managed integer columns whose
	// Core label was present but not an integer. On UPDATE the mirror
	// leaves Flow's stored value alone for these columns rather than
	// writing the zero held in the field above. INSERT has no prior value
	// to keep, so it still writes zero; populateLabelsIntoSpec logs the
	// malformed label in both cases.
	preserveFields map[string]bool
}

// markPreserve flags field as "keep Flow's existing value on update",
// allocating the set on first use.
func (s *expectedComponentSpec) markPreserve(field string) {
	if s.preserveFields != nil {
		s.preserveFields[field] = true
		return
	}
	s.preserveFields = map[string]bool{field: true}
}

// expectedBMCSpec is the BMC portion of a spec: the MAC and IP address Core
// reports for the component's BMC.
type expectedBMCSpec struct {
	MACAddress string
	IPAddress  string
}

// fieldChange is one before/after pair for change logging. Both values are
// already rendered as strings so the log site needs no type switch. The
// field order matters: values of this type are built positionally.
type fieldChange struct {
	field string
	old   string
	new   string
}
+func machineDetailToSpec(d nicoapi.ExpectedMachineDetail) expectedComponentSpec { + s := expectedComponentSpec{ + Type: devicetypes.ComponentTypeToString(devicetypes.ComponentTypeCompute), + SerialNumber: d.ChassisSerialNumber, + Name: d.Name, + RackExternalID: d.RackID, + BMC: expectedBMCSpec{ + MACAddress: d.BMCMACAddress, + IPAddress: d.BMCIPAddress, + }, + } + populateLabelsIntoSpec(&s, d.Labels) + return s +} + +func switchDetailToSpec(d nicoapi.ExpectedSwitchDetail) expectedComponentSpec { + s := expectedComponentSpec{ + Type: devicetypes.ComponentTypeToString(devicetypes.ComponentTypeNVSwitch), + SerialNumber: d.SwitchSerialNumber, + Name: d.Name, + RackExternalID: d.RackID, + BMC: expectedBMCSpec{ + MACAddress: d.BMCMACAddress, + IPAddress: d.BMCIPAddress, + }, + } + populateLabelsIntoSpec(&s, d.Labels) + return s +} + +func powerShelfDetailToSpec(d nicoapi.ExpectedPowerShelfDetail) expectedComponentSpec { + s := expectedComponentSpec{ + Type: devicetypes.ComponentTypeToString(devicetypes.ComponentTypePowerShelf), + SerialNumber: d.ShelfSerialNumber, + Name: d.Name, + RackExternalID: d.RackID, + BMC: expectedBMCSpec{ + MACAddress: d.BMCMACAddress, + IPAddress: d.BMCIPAddress, + }, + } + populateLabelsIntoSpec(&s, d.Labels) + return s +} + +// populateLabelsIntoSpec fills in the label-derived fields on spec. Each int +// label parsed by parseLabelInt that turns out to be non-integer is logged +// and marked in spec.preserveFields so the mirror's update path will keep +// Flow's existing value for that column instead of overwriting it with the +// zero strconv.Atoi left behind. spec.Type must already be set so the warn +// carries the component type for log filtering. 
+func populateLabelsIntoSpec(s *expectedComponentSpec, labels map[string]string) { + s.Manufacturer = labels[labelComponentManufacturer] + s.Model = labels[labelComponentModel] + + for _, lbl := range []struct { + labelKey string + fieldName string + assign func(int) + }{ + {labelComponentSlotID, "slot_id", func(v int) { s.SlotID = v }}, + {labelComponentTrayIdx, "tray_index", func(v int) { s.TrayIndex = v }}, + {labelComponentHostID, "host_id", func(v int) { s.HostID = v }}, + } { + raw := labels[lbl.labelKey] + v, ok := parseLabelInt(raw) + if ok { + lbl.assign(v) + continue + } + s.markPreserve(lbl.fieldName) + log.Warn(). + Str("type", s.Type). + Str("serial", s.SerialNumber). + Str("label", lbl.labelKey). + Str("raw", raw). + Msg("Expected-inventory mirror: Core label is not an integer; preserving Flow's existing value on update (insert path falls back to 0)") + } +} + +// parseLabelInt distinguishes "Core omitted the label" (empty input → 0, +// ok=true) from "Core sent something that isn't an integer" (non-empty +// non-numeric → 0, ok=false). The caller treats the first as Core +// authoritatively saying zero, and the second as a Core-side data bug +// worth logging + falling back on (preserve Flow's value on UPDATE). +func parseLabelInt(raw string) (int, bool) { + if raw == "" { + return 0, true + } + n, err := strconv.Atoi(raw) + if err != nil { + return 0, false + } + return n, true +} + +// pullExpectedMachines / Switches / PowerShelves apply the same two-step +// guard as pullExpectedRacks (see expected_mirror.go for the rationale): +// (1) RPC error short-circuits the type, (2) empty success suppresses the +// delete phase to survive Core blips. 
+ +func pullExpectedMachines(ctx context.Context, c nicoapi.Client) (rows []nicoapi.ExpectedMachineDetail, rpcOK, hasRows bool) { + rows, err := c.GetAllExpectedMachineDetails(ctx) + if err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: pulling expected machines from Core failed; skipping machine mirror this cycle") + return nil, false, false + } + if len(rows) == 0 { + log.Warn().Msg("Expected-inventory mirror: Core returned zero expected machines; skipping machine delete phase this cycle") + return nil, true, false + } + return rows, true, true +} + +func pullExpectedSwitches(ctx context.Context, c nicoapi.Client) (rows []nicoapi.ExpectedSwitchDetail, rpcOK, hasRows bool) { + rows, err := c.GetAllExpectedSwitchDetails(ctx) + if err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: pulling expected switches from Core failed; skipping switch mirror this cycle") + return nil, false, false + } + if len(rows) == 0 { + log.Warn().Msg("Expected-inventory mirror: Core returned zero expected switches; skipping switch delete phase this cycle") + return nil, true, false + } + return rows, true, true +} + +func pullExpectedPowerShelves(ctx context.Context, c nicoapi.Client) (rows []nicoapi.ExpectedPowerShelfDetail, rpcOK, hasRows bool) { + rows, err := c.GetAllExpectedPowerShelfDetails(ctx) + if err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: pulling expected power shelves from Core failed; skipping power-shelf mirror this cycle") + return nil, false, false + } + if len(rows) == 0 { + log.Warn().Msg("Expected-inventory mirror: Core returned zero expected power shelves; skipping power-shelf delete phase this cycle") + return nil, true, false + } + return rows, true, true +} + +// mirrorExpectedComponents reconciles Flow's component table for a single +// component type against the supplied normalised specs. 
Matching key is +// (manufacturer, serial_number), the same unique key Flow's ingestion path +// already enforces; resurrect behaviour is symmetrical to the rack mirror so +// transient Core absence doesn't cause UUID churn. +// +// rackIDByExtID resolves a Core rack_id string (e.g. "a12") to the Flow rack +// UUID. A spec whose RackExternalID can't be resolved is skipped with a warn +// — usually because the rack mirror earlier in the same cycle dropped that +// rack for missing labels. +// +// componentType is the model.Component.Type value the caller is mirroring +// ("Compute" / "NVSwitch" / "PowerShelf"); it gates the per-type DB load and +// the delete scope so machines and switches never interfere with each other. +// +// All writes for one type's reconciliation land in a single transaction. +func mirrorExpectedComponents( + ctx context.Context, + pool *cdb.Session, + componentType string, + specs []expectedComponentSpec, + rackIDByExtID map[string]uuid.UUID, + skipDelete bool, +) mirrorResult { + result := mirrorResult{resource: componentType, pulled: len(specs)} + + existing, err := getAllComponentsByTypeIncludingDeleted(ctx, pool.DB, componentType) + if err != nil { + log.Error().Err(err).Str("type", componentType).Msg("Expected-inventory mirror: loading Flow components failed; skipping component mirror this cycle") + return result + } + + flowBySerial := make(map[string]*model.Component, len(existing)) + for i := range existing { + c := &existing[i] + flowBySerial[rackNaturalKey(c.Manufacturer, c.SerialNumber)] = c + } + + type plan struct { + toInsert []model.Component + toInsertBMCs []model.BMC // parallel to toInsert; component_id filled after insert + toUpdate []model.Component + toUpdateBMCs []bmcOps // one per toUpdate entry (any/all of insert/update/deletes may be set) + toDelete []model.Component + } + var p plan + + seenKeys := make(map[string]struct{}, len(specs)) + + for _, s := range specs { + if !specValid(s) { + log.Warn(). 
+ Str("type", componentType). + Str("serial", s.SerialNumber). + Str("manufacturer", s.Manufacturer). + Str("bmc_mac", s.BMC.MACAddress). + Msg("Expected-inventory mirror: skipping Core expected component missing required identity (manufacturer / serial / BMC MAC)") + result.skippedNoIDOrKey++ + continue + } + + // Track this key as "Core is still reporting it" before we get to + // any of the conditional skips below. The delete phase consults + // seenKeys to decide what to soft-delete; if we forgot to track a + // row whose Core spec we couldn't fully apply (e.g. rack + // resolution failed), the delete phase would wipe the Flow row + // even though Core hadn't dropped it. + key := rackNaturalKey(s.Manufacturer, s.SerialNumber) + if _, dup := seenKeys[key]; dup { + log.Warn(). + Str("type", componentType). + Str("manufacturer", s.Manufacturer). + Str("serial", s.SerialNumber). + Msg("Expected-inventory mirror: Core returned duplicate spec for this component; later occurrence overwrites earlier in this cycle's mirror plan (Cloud REST is producing duplicates)") + } + seenKeys[key] = struct{}{} + + rackID, ok := resolveRackID(s, rackIDByExtID) + if !ok { + // Core references a rack Flow doesn't currently know about + // (rack mirror dropped it this cycle, or Core/Flow have a + // one-cycle skew). Per the mirror contract Core is the source + // of truth and the component is still expected; soft-deleting + // it would lose its UUID. component.rack_id is nullable and + // has no FK, so writing uuid.Nil (NULL) is the documented + // "ingested but not yet assigned to a rack" state and lets + // the rack association heal on a subsequent cycle. + log.Warn(). + Str("type", componentType). + Str("serial", s.SerialNumber). + Str("rack_external_id", s.RackExternalID). 
+ Msg("Expected-inventory mirror: Core's rack_id is not in Flow's rack table; mirroring component with NULL rack_id (rack association will heal next cycle once the rack reappears)") + rackID = uuid.Nil + } + + desired := componentFromSpec(s, rackID) + + if cur, ok := flowBySerial[key]; ok { + candidate := *cur + needUpdate := false + if candidate.DeletedAt != nil { + candidate.DeletedAt = nil + needUpdate = true + result.resurrected++ + log.Info(). + Str("type", componentType). + Str("serial", candidate.SerialNumber). + Str("component_id", candidate.ID.String()). + Msg("Expected-inventory mirror: resurrecting soft-deleted component") + } + + diffs := diffComponentFields(&candidate, &desired, s) + if len(diffs) > 0 { + applyComponentChanges(&candidate, &desired, s) + needUpdate = true + logComponentChanges(componentType, candidate.ID, candidate.SerialNumber, diffs) + } + + bmcOps := planBMCReconciliation(&candidate, s.BMC) + if needUpdate || bmcOps.insert != nil || bmcOps.update != nil || len(bmcOps.deletes) > 0 { + p.toUpdate = append(p.toUpdate, candidate) + p.toUpdateBMCs = append(p.toUpdateBMCs, bmcOps) + } + continue + } + + p.toInsert = append(p.toInsert, desired) + p.toInsertBMCs = append(p.toInsertBMCs, model.BMC{ + MacAddress: s.BMC.MACAddress, + Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), + IPAddress: optionalString(s.BMC.IPAddress), + }) + log.Info(). + Str("type", componentType). + Str("serial", desired.SerialNumber). + Str("manufacturer", desired.Manufacturer). + Msg("Expected-inventory mirror: inserting new component from Core") + } + + for i := range existing { + c := &existing[i] + if c.DeletedAt != nil { + continue + } + if _, seen := seenKeys[rackNaturalKey(c.Manufacturer, c.SerialNumber)]; seen { + continue + } + if skipDelete { + continue + } + p.toDelete = append(p.toDelete, *c) + log.Info(). + Str("type", componentType). + Str("serial", c.SerialNumber). + Str("component_id", c.ID.String()). 
+ Msg("Expected-inventory mirror: soft-deleting component absent from Core") + } + + if len(p.toInsert) == 0 && len(p.toUpdate) == 0 && len(p.toDelete) == 0 { + return result + } + + if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { + for i := range p.toInsert { + if _, err := tx.NewInsert().Model(&p.toInsert[i]).Exec(ctx); err != nil { + return fmt.Errorf("insert component %q: %w", p.toInsert[i].SerialNumber, err) + } + p.toInsertBMCs[i].ComponentID = p.toInsert[i].ID + if err := evictOrphanBMC(ctx, tx, p.toInsertBMCs[i].MacAddress, p.toInsert[i].ID); err != nil { + return err + } + if _, err := tx.NewInsert().Model(&p.toInsertBMCs[i]).Exec(ctx); err != nil { + return fmt.Errorf("insert BMC for component %q: %w", p.toInsert[i].SerialNumber, err) + } + } + for i := range p.toUpdate { + if _, err := tx.NewUpdate().Model(&p.toUpdate[i]).Where("id = ?", p.toUpdate[i].ID).Exec(ctx); err != nil { + return fmt.Errorf("update component %q: %w", p.toUpdate[i].SerialNumber, err) + } + ops := p.toUpdateBMCs[i] + for j := range ops.deletes { + if _, err := tx.NewDelete().Model(&ops.deletes[j]).Where("mac_address = ?", ops.deletes[j].MacAddress).ForceDelete().Exec(ctx); err != nil { + return fmt.Errorf("delete BMC %q: %w", ops.deletes[j].MacAddress, err) + } + } + if ops.insert != nil { + ops.insert.ComponentID = p.toUpdate[i].ID + if err := evictOrphanBMC(ctx, tx, ops.insert.MacAddress, p.toUpdate[i].ID); err != nil { + return err + } + if _, err := tx.NewInsert().Model(ops.insert).Exec(ctx); err != nil { + return fmt.Errorf("insert BMC for component %q: %w", p.toUpdate[i].SerialNumber, err) + } + } + if ops.update != nil { + if _, err := tx.NewUpdate().Model(ops.update).Where("mac_address = ?", ops.update.MacAddress).Exec(ctx); err != nil { + return fmt.Errorf("update BMC %q: %w", ops.update.MacAddress, err) + } + } + } + for i := range p.toDelete { + if _, err := tx.NewDelete().Model(&p.toDelete[i]).Where("id = ?", p.toDelete[i].ID).Exec(ctx); err != 
nil { + return fmt.Errorf("soft-delete component %q: %w", p.toDelete[i].SerialNumber, err) + } + } + return nil + }); err != nil { + log.Error().Err(err).Str("type", componentType).Msg("Expected-inventory mirror: component reconciliation transaction failed; mirror is no-op this cycle") + // Tx rolled back: every per-spec decision logged above represents + // intent, not committed state. Strip success-side counters so the + // summary log line reflects what actually landed. pulled and + // skippedNoIDOrKey survive: pulled is input size; skippedNoIDOrKey + // is decided before we ever opened the tx, so neither is + // invalidated by the rollback. + result.resurrected = 0 + return result + } + + result.inserted = len(p.toInsert) + result.updated = len(p.toUpdate) + result.softDeleted = len(p.toDelete) + return result +} + +// specValid rejects rows missing fields the mirror needs to construct a row +// that both inserts cleanly (Component.Manufacturer / SerialNumber are +// NOT NULL and form a unique index) and reconciles BMC (MAC is BMC PK). +func specValid(s expectedComponentSpec) bool { + return s.Manufacturer != "" && s.SerialNumber != "" && s.BMC.MACAddress != "" +} + +// resolveRackID translates Core's rack_id string into the Flow Rack.ID +// resolved by the rack mirror earlier in the same cycle. An empty +// RackExternalID is allowed and resolves to uuid.Nil — the Component model +// already documents that "uuid.Nil when the component has been ingested but +// is not yet assigned to a rack". A non-empty value that doesn't resolve is +// rejected; risking a foreign-key violation (or worse, silently mis-routing +// a component into the wrong rack) is worse than skipping the row. 
+func resolveRackID(s expectedComponentSpec, rackIDByExtID map[string]uuid.UUID) (uuid.UUID, bool) { + if s.RackExternalID == "" { + return uuid.Nil, true + } + id, ok := rackIDByExtID[s.RackExternalID] + return id, ok +} + +// componentFromSpec builds the model.Component the mirror would insert for +// this spec. Mirror-managed fields only — ComponentID/external_id (runtime +// sync), PowerState (runtime), Status (lifecycle), IngestedAt are all left +// at their zero values so they don't clobber state owned by other code paths +// when this struct is used as the "desired" side of an UPDATE. +func componentFromSpec(s expectedComponentSpec, rackID uuid.UUID) model.Component { + name := s.Name + if name == "" { + // Component.Name is part of the user-visible identity but the table + // doesn't enforce non-empty; matching Flow's lenient default keeps + // inserts safe when Core omits the name. + name = s.SerialNumber + } + return model.Component{ + Name: name, + Type: s.Type, + Manufacturer: s.Manufacturer, + SerialNumber: s.SerialNumber, + Model: s.Model, + SlotID: s.SlotID, + TrayIndex: s.TrayIndex, + HostID: s.HostID, + RackID: rackID, + } +} + +// applyComponentChanges copies mirror-managed fields from desired into +// existing. Identity (Manufacturer/SerialNumber/Type), runtime (ComponentID, +// PowerState, FirmwareVersion), lifecycle (Status, IngestedAt) and audit +// (CreatedAt, UpdatedAt) are intentionally not touched. Fields named in +// spec.preserveFields are also skipped — those are the columns whose Core +// labels were malformed and so should keep Flow's existing value rather +// than be overwritten with the parseLabelInt fallback zero. 
+func applyComponentChanges(existing, desired *model.Component, spec expectedComponentSpec) { + existing.Name = desired.Name + existing.Model = desired.Model + existing.RackID = desired.RackID + if !spec.preserveFields["slot_id"] { + existing.SlotID = desired.SlotID + } + if !spec.preserveFields["tray_index"] { + existing.TrayIndex = desired.TrayIndex + } + if !spec.preserveFields["host_id"] { + existing.HostID = desired.HostID + } +} + +// diffComponentFields returns the per-field deltas the mirror would apply. +// Used both to decide whether an UPDATE is needed and to log what changed. +// Fields the mirror doesn't manage (external_id / status / power_state / +// firmware_version / timestamps) are deliberately omitted; comparing them +// would queue UPDATE rows for state owned by other loops. Fields named in +// spec.preserveFields are also skipped so a malformed Core label can't +// drive a spurious UPDATE that would clobber Flow's value with the +// fallback zero. +func diffComponentFields(existing, desired *model.Component, spec expectedComponentSpec) []fieldChange { + var diffs []fieldChange + if existing.Name != desired.Name { + diffs = append(diffs, fieldChange{"name", existing.Name, desired.Name}) + } + if existing.Model != desired.Model { + diffs = append(diffs, fieldChange{"model", existing.Model, desired.Model}) + } + if !spec.preserveFields["slot_id"] && existing.SlotID != desired.SlotID { + diffs = append(diffs, fieldChange{"slot_id", strconv.Itoa(existing.SlotID), strconv.Itoa(desired.SlotID)}) + } + if !spec.preserveFields["tray_index"] && existing.TrayIndex != desired.TrayIndex { + diffs = append(diffs, fieldChange{"tray_index", strconv.Itoa(existing.TrayIndex), strconv.Itoa(desired.TrayIndex)}) + } + if !spec.preserveFields["host_id"] && existing.HostID != desired.HostID { + diffs = append(diffs, fieldChange{"host_id", strconv.Itoa(existing.HostID), strconv.Itoa(desired.HostID)}) + } + if existing.RackID != desired.RackID { + diffs = append(diffs, 
fieldChange{"rack_id", existing.RackID.String(), desired.RackID.String()}) + } + return diffs +} + +// logComponentChanges emits one INFO line per mirror cycle per component that +// actually changed, listing the fields the mirror is about to write. Done +// before the transaction so the line is preserved even if the tx rolls back +// (caller surfaces the rollback at ERROR separately). +func logComponentChanges(componentType string, id uuid.UUID, serial string, diffs []fieldChange) { + evt := log.Info(). + Str("type", componentType). + Str("component_id", id.String()). + Str("serial", serial) + for _, d := range diffs { + evt = evt.Str("change."+d.field+".old", d.old).Str("change."+d.field+".new", d.new) + } + evt.Msg("Expected-inventory mirror: updating component from Core") +} + +// bmcOps captures the set of writes the mirror plans against the BMC table +// for one component. The deletes slice can be longer than one entry: a +// well-formed component carries exactly one type='Host' BMC, but ingestion +// bugs and data drift can leave several, and the mirror hard-deletes the +// stale ones so Core's "exactly one host BMC" view is enforced. +type bmcOps struct { + insert *model.BMC + update *model.BMC + deletes []model.BMC +} + +// planBMCReconciliation works out the BMC writes needed to make this +// component's host-BMC row match the spec. Only type='Host' BMCs are +// considered: Core's ExpectedMachine describes the host BMC only, the DPU +// BMC's MAC/IP isn't a field there. Non-host rows are left strictly alone; +// they're owned by the ingestion path or runtime discovery. +// +// Cases: +// - No existing host BMC: insert the spec'd one. +// - Exactly one existing host BMC matching the spec MAC: update its IP if +// it drifted. +// - One or more existing host BMCs and at most one matches the spec MAC: +// keep the matching one (IP-drift update if needed); hard-delete every +// non-matching host BMC. 
Multiple host BMCs is an ingestion bug; the +// mirror cleans it up since Core's authoritative view is one. +// - One or more existing host BMCs and none match the spec MAC: hard-delete +// all of them and insert the spec'd one. This is the MAC-change path +// (chassis got a new BMC board) generalised over an existing dirty state. +func planBMCReconciliation(component *model.Component, spec expectedBMCSpec) (ops bmcOps) { + hostType := devicetypes.BMCTypeToString(devicetypes.BMCTypeHost) + want := model.BMC{ + MacAddress: spec.MACAddress, + Type: hostType, + IPAddress: optionalString(spec.IPAddress), + ComponentID: component.ID, + } + + var hosts []model.BMC + for i := range component.BMCs { + if component.BMCs[i].Type == hostType { + hosts = append(hosts, component.BMCs[i]) + } + } + + if len(hosts) > 1 { + log.Warn(). + Str("component_id", component.ID.String()). + Str("serial", component.SerialNumber). + Int("host_bmc_count", len(hosts)). + Msg("Expected-inventory mirror: component has multiple type='Host' BMC rows (Flow data should have at most one); extras will be hard-deleted to match Core") + } + + // Pick the one matching the spec MAC; that's the keeper. + var keeper *model.BMC + for i := range hosts { + if hosts[i].MacAddress == spec.MACAddress { + keeper = &hosts[i] + break + } + } + + if keeper != nil { + for i := range hosts { + if hosts[i].MacAddress == keeper.MacAddress { + continue + } + ops.deletes = append(ops.deletes, hosts[i]) + } + if !equalOptionalString(keeper.IPAddress, want.IPAddress) { + updated := *keeper + updated.IPAddress = want.IPAddress + ops.update = &updated + } + return ops + } + + // No host BMC matches the spec MAC. Insert the new one; hard-delete + // any stale host rows. ComponentID stays the same so downstream FKs + // keep resolving. 
+ ops.insert = &want + for i := range hosts { + ops.deletes = append(ops.deletes, hosts[i]) + } + return ops +} + +// evictOrphanBMC hard-deletes any BMC row whose mac_address collides with +// the one about to be inserted under newOwnerID. The bmc table's PK is the +// MAC alone, so two components claiming the same host MAC would fail the +// INSERT and roll back the whole component-mirror tx. The collision is +// rare — typically a physical BMC card moved to a new chassis without the +// old row being cleaned up — but the failure mode is severe (entire type's +// mirror frozen until manual cleanup), so the mirror clears the orphan and +// logs loudly so operators see the data-corruption signal. +func evictOrphanBMC(ctx context.Context, tx bun.Tx, mac string, newOwnerID uuid.UUID) error { + var orphan model.BMC + err := tx.NewSelect(). + Model(&orphan). + Where("mac_address = ?", mac). + Scan(ctx) + if errors.Is(err, sql.ErrNoRows) { + return nil + } + if err != nil { + return fmt.Errorf("look up potential BMC orphan %q: %w", mac, err) + } + if orphan.ComponentID == newOwnerID { + return nil + } + if _, err := tx.NewDelete(). + Model(&orphan). + Where("mac_address = ?", mac). + ForceDelete(). + Exec(ctx); err != nil { + return fmt.Errorf("evict orphan BMC %q from component %s: %w", mac, orphan.ComponentID, err) + } + log.Warn(). + Str("bmc_mac", mac). + Str("orphan_owner_component_id", orphan.ComponentID.String()). + Str("new_owner_component_id", newOwnerID.String()). 
+ Msg("Expected-inventory mirror: BMC MAC already owned by a different component; evicted to honour Core's claim") + return nil +} + +func optionalString(s string) *string { + if s == "" { + return nil + } + out := s + return &out +} + +func equalOptionalString(a, b *string) bool { + if a == nil && b == nil { + return true + } + if a == nil || b == nil { + return false + } + return *a == *b +} + +// getAllComponentsByTypeIncludingDeleted loads every row of the given type, +// soft-deleted included, with BMCs preloaded so the per-component BMC +// reconciliation in mirrorExpectedComponents can read them without a second +// round-trip per row. The "including deleted" semantics matches the rack +// mirror's getAllRacksIncludingDeleted: it's how the resurrect path knows a +// row exists, and how the delete path avoids double-deleting. +func getAllComponentsByTypeIncludingDeleted(ctx context.Context, idb bun.IDB, componentType string) ([]model.Component, error) { + var components []model.Component + err := idb.NewSelect(). + Model(&components). + Where("type = ?", componentType). + WhereAllWithDeleted(). + Relation("BMCs"). + Scan(ctx) + if err != nil { + return nil, err + } + return components, nil +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_component_test.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_component_test.go new file mode 100644 index 0000000000..967df3bf90 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_component_test.go @@ -0,0 +1,563 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
// SPDX-License-Identifier: Apache-2.0

package inventorysync

import (
	"context"
	"errors"
	"testing"

	"github.com/google/uuid"
	"github.com/stretchr/testify/assert"
	"github.com/stretchr/testify/require"

	"github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model"
	"github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi"
	"github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes"
)

// TestParseLabelInt pins the (value, ok) pair parseLabelInt returns for
// empty, numeric, and malformed label strings.
func TestParseLabelInt(t *testing.T) {
	// Empty input is "Core didn't write this label" — ok=true so callers
	// treat it as Core authoritatively saying zero (the unset default).
	// Non-empty unparsable input is a Core data bug — ok=false so callers
	// can preserve Flow's existing value rather than clobber it with 0.
	for _, tc := range []struct {
		in     string
		want   int
		wantOK bool
	}{
		{"", 0, true},
		{"0", 0, true},
		{"7", 7, true},
		{"-3", -3, true},
		{"abc", 0, false},  // strconv.Atoi rejects non-numeric
		{"3.14", 0, false}, // strconv.Atoi rejects floats
		{" 4 ", 0, false},  // strconv.Atoi rejects whitespace
	} {
		t.Run(tc.in, func(t *testing.T) {
			got, ok := parseLabelInt(tc.in)
			assert.Equal(t, tc.want, got)
			assert.Equal(t, tc.wantOK, ok)
		})
	}
}

// TestPopulateLabelsIntoSpec_MalformedIntMarksPreserve checks that a
// non-numeric slot_id label sets the "slot_id" preserve flag while valid and
// empty labels leave their flags unset, and that the spec's integer fields
// take the parsed (or zero) values.
func TestPopulateLabelsIntoSpec_MalformedIntMarksPreserve(t *testing.T) {
	s := expectedComponentSpec{
		Type:         "Compute",
		SerialNumber: "SN-1",
	}
	populateLabelsIntoSpec(&s, map[string]string{
		labelComponentManufacturer: "Foxconn",
		labelComponentSlotID:       "abc", // malformed
		labelComponentTrayIdx:      "1",
		labelComponentHostID:       "",
	})
	assert.True(t, s.preserveFields["slot_id"], "malformed slot_id label must mark preserve so UPDATE doesn't clobber Flow's existing value with 0")
	assert.False(t, s.preserveFields["tray_index"])
	assert.False(t, s.preserveFields["host_id"], "empty label is Core saying zero, not a malformation")
	assert.Equal(t, 0, s.SlotID, "malformed input still falls back to 0 for the spec field; the preserve flag is what gates the write")
	assert.Equal(t, 1, s.TrayIndex)
	assert.Equal(t, 0, s.HostID)
}

// TestMachineDetailToSpec checks the field-by-field translation of an
// ExpectedMachineDetail (including its labels) into an expectedComponentSpec.
func TestMachineDetailToSpec(t *testing.T) {
	d := nicoapi.ExpectedMachineDetail{
		ExpectedMachineID:   "em-uuid",
		BMCMACAddress:       "aa:bb:cc:dd:ee:ff",
		BMCIPAddress:        "10.0.0.1",
		ChassisSerialNumber: "SN-001",
		RackID:              "a12",
		Name:                "node-001",
		Description:         "compute node",
		Labels: map[string]string{
			labelComponentManufacturer: "Foxconn",
			labelComponentModel:        "MGX-Compute-Gen2",
			labelComponentSlotID:       "5",
			labelComponentTrayIdx:      "1",
			labelComponentHostID:       "3",
		},
	}
	s := machineDetailToSpec(d)

	assert.Equal(t, devicetypes.ComponentTypeToString(devicetypes.ComponentTypeCompute), s.Type)
	assert.Equal(t, "SN-001", s.SerialNumber)
	assert.Equal(t, "Foxconn", s.Manufacturer)
	assert.Equal(t, "MGX-Compute-Gen2", s.Model)
	assert.Equal(t, 5, s.SlotID)
	assert.Equal(t, 1, s.TrayIndex)
	assert.Equal(t, 3, s.HostID)
	assert.Equal(t, "a12", s.RackExternalID)
	assert.Equal(t, "aa:bb:cc:dd:ee:ff", s.BMC.MACAddress)
	assert.Equal(t, "10.0.0.1", s.BMC.IPAddress)
}

// TestSwitchDetailToSpec_TypeIsNVSwitch checks that switchDetailToSpec tags
// the spec with the NVSwitch component type and carries the switch serial.
func TestSwitchDetailToSpec_TypeIsNVSwitch(t *testing.T) {
	s := switchDetailToSpec(nicoapi.ExpectedSwitchDetail{
		SwitchSerialNumber: "SW-1",
		BMCMACAddress:      "00:00:00:00:00:01",
		Labels:             map[string]string{labelComponentManufacturer: "NVIDIA"},
	})
	assert.Equal(t, devicetypes.ComponentTypeToString(devicetypes.ComponentTypeNVSwitch), s.Type)
	assert.Equal(t, "SW-1", s.SerialNumber)
}

// TestPowerShelfDetailToSpec_TypeIsPowerShelf checks that
// powerShelfDetailToSpec tags the spec with the PowerShelf component type and
// carries the shelf serial.
func TestPowerShelfDetailToSpec_TypeIsPowerShelf(t *testing.T) {
	s := powerShelfDetailToSpec(nicoapi.ExpectedPowerShelfDetail{
		ShelfSerialNumber: "PS-1",
		BMCMACAddress:     "00:00:00:00:00:02",
		Labels:            map[string]string{labelComponentManufacturer: "NVIDIA"},
	})
	assert.Equal(t, devicetypes.ComponentTypeToString(devicetypes.ComponentTypePowerShelf), s.Type)
	assert.Equal(t, "PS-1", s.SerialNumber)
}

// TestSpecValid checks that specValid accepts a spec carrying manufacturer,
// serial number, and BMC MAC, and rejects it when any one of them is blank.
func TestSpecValid(t *testing.T) {
	base := expectedComponentSpec{
		Manufacturer: "Foxconn",
		SerialNumber: "SN-1",
		BMC:          expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:ff"},
	}
	assert.True(t, specValid(base), "complete spec should be valid")

	// Each mutation blanks exactly one required field on a copy of base.
	for name, mutate := range map[string]func(*expectedComponentSpec){
		"missing manufacturer": func(s *expectedComponentSpec) { s.Manufacturer = "" },
		"missing serial":       func(s *expectedComponentSpec) { s.SerialNumber = "" },
		"missing bmc mac":      func(s *expectedComponentSpec) { s.BMC.MACAddress = "" },
	} {
		t.Run(name, func(t *testing.T) {
			s := base
			mutate(&s)
			assert.False(t, specValid(s))
		})
	}
}

// TestResolveRackID covers the three outcomes of resolveRackID: empty
// external id, an id present in the rack map, and an id absent from it.
func TestResolveRackID(t *testing.T) {
	rackUUID := uuid.New()
	rackMap := map[string]uuid.UUID{"a12": rackUUID}

	t.Run("empty external id resolves to uuid.Nil with ok=true (component intentionally unassigned)", func(t *testing.T) {
		id, ok := resolveRackID(expectedComponentSpec{RackExternalID: ""}, rackMap)
		assert.True(t, ok)
		assert.Equal(t, uuid.Nil, id)
	})
	t.Run("known external id resolves to the flow rack uuid", func(t *testing.T) {
		id, ok := resolveRackID(expectedComponentSpec{RackExternalID: "a12"}, rackMap)
		assert.True(t, ok)
		assert.Equal(t, rackUUID, id)
	})
	t.Run("unknown external id is rejected so we don't insert with a stale FK", func(t *testing.T) {
		_, ok := resolveRackID(expectedComponentSpec{RackExternalID: "ghost"}, rackMap)
		assert.False(t, ok)
	})
}

// TestComponentFromSpec checks that componentFromSpec copies the spec's
// descriptive fields and the supplied rack id onto the component, and leaves
// FirmwareVersion empty.
func TestComponentFromSpec(t *testing.T) {
	rackID := uuid.New()
	s := expectedComponentSpec{
		Type:         "Compute",
		Manufacturer: "Foxconn",
		SerialNumber: "SN-1",
		Model:        "MGX",
		Name:         "node-1",
		SlotID:       5,
		TrayIndex:    1,
		HostID:       3,
	}
	c := componentFromSpec(s, rackID)
	assert.Equal(t, "node-1", c.Name)
	assert.Equal(t, "Compute", c.Type)
	assert.Equal(t, "Foxconn", c.Manufacturer)
	assert.Equal(t, "SN-1", c.SerialNumber)
	assert.Equal(t, "MGX", c.Model)
	assert.Empty(t, c.FirmwareVersion, "firmware_version is owned by runtime sync, mirror must leave it unset")
	assert.Equal(t, 5, c.SlotID)
	assert.Equal(t, 1, c.TrayIndex)
	assert.Equal(t, 3, c.HostID)
	assert.Equal(t, rackID, c.RackID)
}

// TestComponentFromSpec_NameFallsBackToSerial checks that a spec without a
// Name yields a component named after its serial number.
func TestComponentFromSpec_NameFallsBackToSerial(t *testing.T) {
	s := expectedComponentSpec{Type: "Compute", Manufacturer: "Foxconn", SerialNumber: "SN-1"}
	c := componentFromSpec(s, uuid.Nil)
	assert.Equal(t, "SN-1", c.Name, "empty Core name should fall back to serial so notnull-checks downstream don't trip")
}

// TestDiffComponentFields checks which field differences diffComponentFields
// reports: none for identical components or firmware-only drift, exactly one
// per changed mirrored field, and none for fields flagged in preserveFields.
func TestDiffComponentFields(t *testing.T) {
	rackA := uuid.New()
	rackB := uuid.New()
	// base returns a fresh component each call so subtests never share state.
	base := func() *model.Component {
		return &model.Component{
			Name:      "n",
			Model:     "m",
			SlotID:    1,
			TrayIndex: 2,
			HostID:    3,
			RackID:    rackA,
		}
	}

	t.Run("identical fields produce no diffs", func(t *testing.T) {
		assert.Empty(t, diffComponentFields(base(), base(), expectedComponentSpec{}))
	})

	t.Run("firmware_version drift is ignored (runtime-owned, not mirrored)", func(t *testing.T) {
		desired := base()
		desired.FirmwareVersion = "2.0"
		assert.Empty(t, diffComponentFields(base(), desired, expectedComponentSpec{}))
	})

	// The map key doubles as the expected diff field name.
	for name, mutate := range map[string]func(*model.Component){
		"name":       func(c *model.Component) { c.Name = "n2" },
		"model":      func(c *model.Component) { c.Model = "m2" },
		"slot_id":    func(c *model.Component) { c.SlotID = 9 },
		"tray_index": func(c *model.Component) { c.TrayIndex = 9 },
		"host_id":    func(c *model.Component) { c.HostID = 9 },
		"rack_id":    func(c *model.Component) { c.RackID = rackB },
	} {
		t.Run("change in "+name+" is detected", func(t *testing.T) {
			desired := base()
			mutate(desired)
			diffs := diffComponentFields(base(), desired, expectedComponentSpec{})
			require.Len(t, diffs, 1)
			assert.Equal(t, name, diffs[0].field)
		})
	}

	t.Run("preserved fields don't surface as drift even when desired differs", func(t *testing.T) {
		desired := base()
		desired.SlotID = 9
		desired.TrayIndex = 9
		desired.HostID = 9
		spec := expectedComponentSpec{
			preserveFields: map[string]bool{"slot_id": true, "tray_index": true, "host_id": true},
		}
		assert.Empty(t, diffComponentFields(base(), desired, spec),
			"preserve flags suppress diffs so a malformed Core label can't drive a spurious UPDATE")
	})
}

// TestApplyComponentChanges_DoesNotTouchIdentityOrRuntimeFields checks that
// applyComponentChanges copies Name, Model, and RackID from the desired
// component while leaving Type, Manufacturer, SerialNumber, and ComponentID
// on the existing component as they were.
func TestApplyComponentChanges_DoesNotTouchIdentityOrRuntimeFields(t *testing.T) {
	id := uuid.New()
	rackA := uuid.New()
	rackB := uuid.New()
	extID := "runtime-id"
	existing := &model.Component{
		ID:           id,
		Name:         "old",
		Type:         "Compute",
		Manufacturer: "Foxconn",
		SerialNumber: "SN-1",
		Model:        "old-model",
		RackID:       rackA,
		ComponentID:  &extID, // runtime-owned, must not be touched
	}
	desired := &model.Component{
		Name:   "new",
		Model:  "new-model",
		RackID: rackB,
	}

	applyComponentChanges(existing, desired, expectedComponentSpec{})

	assert.Equal(t, "new", existing.Name)
	assert.Equal(t, "new-model", existing.Model)
	assert.Equal(t, rackB, existing.RackID)
	assert.Equal(t, "Compute", existing.Type, "Type is identity; mirror must not touch")
	assert.Equal(t, "Foxconn", existing.Manufacturer, "Manufacturer is identity")
	assert.Equal(t, "SN-1", existing.SerialNumber, "SerialNumber is identity")
	require.NotNil(t, existing.ComponentID)
	assert.Equal(t, "runtime-id", *existing.ComponentID, "external_id is runtime-owned")
}

// TestApplyComponentChanges_PreservedFieldsKeepFlowValue checks that fields
// flagged in preserveFields keep the existing component's values even when
// the desired component carries zeros.
func TestApplyComponentChanges_PreservedFieldsKeepFlowValue(t *testing.T) {
	existing := &model.Component{
		Name:      "n",
		Model:     "m",
		SlotID:    7, // Flow's existing value — must survive
		TrayIndex: 8,
		HostID:    9,
	}
	desired := &model.Component{
		Name:      "n",
		Model:     "m",
		SlotID:    0, // would-be overwrite from parseLabelInt fallback
		TrayIndex: 0,
		HostID:    0,
	}
	spec := expectedComponentSpec{
		preserveFields: map[string]bool{"slot_id": true, "tray_index": true, "host_id": true},
	}
	applyComponentChanges(existing, desired, spec)
	assert.Equal(t, 7, existing.SlotID, "preserve flag must protect Flow's value from malformed-label fallback zero")
	assert.Equal(t, 8, existing.TrayIndex)
	assert.Equal(t, 9, existing.HostID)
}

// TestPlanBMCReconciliation walks planBMCReconciliation through the
// insert / update / delete plans it must produce for a component's host BMC,
// including components that also carry a DPU-typed BMC row.
func TestPlanBMCReconciliation(t *testing.T) {
	compID := uuid.New()

	t.Run("no existing bmc -> insert", func(t *testing.T) {
		c := &model.Component{ID: compID, BMCs: nil}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:01", IPAddress: "10.0.0.1"})
		require.NotNil(t, ops.insert)
		assert.Nil(t, ops.update)
		assert.Empty(t, ops.deletes)
		assert.Equal(t, "aa:bb:cc:dd:ee:01", ops.insert.MacAddress)
		assert.Equal(t, compID, ops.insert.ComponentID)
		require.NotNil(t, ops.insert.IPAddress)
		assert.Equal(t, "10.0.0.1", *ops.insert.IPAddress)
		assert.Equal(t, devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), ops.insert.Type)
	})

	t.Run("same mac, same ip -> no op", func(t *testing.T) {
		ip := "10.0.0.1"
		c := &model.Component{
			ID:   compID,
			BMCs: []model.BMC{{MacAddress: "aa:bb:cc:dd:ee:01", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip, ComponentID: compID}},
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:01", IPAddress: "10.0.0.1"})
		assert.Nil(t, ops.insert)
		assert.Nil(t, ops.update)
		assert.Empty(t, ops.deletes)
	})

	t.Run("same mac, different ip -> update only", func(t *testing.T) {
		ip := "10.0.0.1"
		c := &model.Component{
			ID:   compID,
			BMCs: []model.BMC{{MacAddress: "aa:bb:cc:dd:ee:01", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip, ComponentID: compID}},
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:01", IPAddress: "10.0.0.2"})
		require.NotNil(t, ops.update)
		assert.Nil(t, ops.insert)
		assert.Empty(t, ops.deletes)
		require.NotNil(t, ops.update.IPAddress)
		assert.Equal(t, "10.0.0.2", *ops.update.IPAddress)
	})

	t.Run("different mac -> delete old + insert new", func(t *testing.T) {
		ip := "10.0.0.1"
		c := &model.Component{
			ID:   compID,
			BMCs: []model.BMC{{MacAddress: "aa:bb:cc:dd:ee:01", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip, ComponentID: compID}},
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:99", IPAddress: "10.0.0.9"})
		require.Len(t, ops.deletes, 1)
		require.NotNil(t, ops.insert)
		assert.Nil(t, ops.update)
		assert.Equal(t, "aa:bb:cc:dd:ee:01", ops.deletes[0].MacAddress)
		assert.Equal(t, "aa:bb:cc:dd:ee:99", ops.insert.MacAddress)
		assert.Equal(t, compID, ops.insert.ComponentID)
	})

	t.Run("multiple host BMCs, keeper matches spec -> delete the extras", func(t *testing.T) {
		ip1 := "10.0.0.1"
		ip2 := "10.0.0.2"
		c := &model.Component{
			ID: compID,
			BMCs: []model.BMC{
				{MacAddress: "aa:bb:cc:dd:ee:01", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip1, ComponentID: compID},
				{MacAddress: "aa:bb:cc:dd:ee:02", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip2, ComponentID: compID},
			},
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:01", IPAddress: "10.0.0.1"})
		assert.Nil(t, ops.insert, "spec MAC matches a keeper, no insert needed")
		assert.Nil(t, ops.update, "IP on the keeper already matches")
		require.Len(t, ops.deletes, 1)
		assert.Equal(t, "aa:bb:cc:dd:ee:02", ops.deletes[0].MacAddress, "stale host BMC gets hard-deleted; Core says exactly one host BMC")
	})

	t.Run("multiple host BMCs, none match spec -> delete all + insert new", func(t *testing.T) {
		ip1 := "10.0.0.1"
		ip2 := "10.0.0.2"
		c := &model.Component{
			ID: compID,
			BMCs: []model.BMC{
				{MacAddress: "aa:bb:cc:dd:ee:01", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip1, ComponentID: compID},
				{MacAddress: "aa:bb:cc:dd:ee:02", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), IPAddress: &ip2, ComponentID: compID},
			},
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:99", IPAddress: "10.0.0.9"})
		require.NotNil(t, ops.insert)
		assert.Equal(t, "aa:bb:cc:dd:ee:99", ops.insert.MacAddress)
		assert.Nil(t, ops.update)
		require.Len(t, ops.deletes, 2, "both stale host BMCs must go before the new one is inserted")
	})

	// The next three tests cover the DPU-coexistence case. A Compute
	// component in Flow can carry both a host BMC and a DPU BMC; bun's
	// .Relation("BMCs") preload has no stable ORDER BY so the DPU row may
	// land at BMCs[0]. The mirror MUST ignore non-host BMCs — Core's
	// ExpectedMachine doesn't describe the DPU BMC at all, so any op
	// the mirror takes against the DPU row would be data loss.

	dpuIP := "10.0.0.50"
	hostIP := "10.0.0.1"
	// withDPUFirst builds a BMC slice with the DPU row ahead of the given
	// host row, reproducing the unfavourable ordering described above.
	withDPUFirst := func(host model.BMC) []model.BMC {
		return []model.BMC{
			{MacAddress: "aa:bb:cc:dd:ee:50", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeDPU), IPAddress: &dpuIP, ComponentID: compID},
			host,
		}
	}

	t.Run("dpu BMC at index 0, no host BMC -> insert host, dpu untouched", func(t *testing.T) {
		c := &model.Component{
			ID: compID,
			BMCs: []model.BMC{
				{MacAddress: "aa:bb:cc:dd:ee:50", Type: devicetypes.BMCTypeToString(devicetypes.BMCTypeDPU), IPAddress: &dpuIP, ComponentID: compID},
			},
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:01", IPAddress: "10.0.0.1"})
		require.NotNil(t, ops.insert)
		assert.Equal(t, "aa:bb:cc:dd:ee:01", ops.insert.MacAddress)
		assert.Equal(t, devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), ops.insert.Type)
		assert.Empty(t, ops.deletes, "dpu BMC must not be deleted; Core has no opinion on it")
		assert.Nil(t, ops.update)
	})

	t.Run("dpu BMC at index 0 + host BMC same MAC -> no op, dpu untouched", func(t *testing.T) {
		c := &model.Component{
			ID: compID,
			BMCs: withDPUFirst(model.BMC{
				MacAddress:  "aa:bb:cc:dd:ee:01",
				Type:        devicetypes.BMCTypeToString(devicetypes.BMCTypeHost),
				IPAddress:   &hostIP,
				ComponentID: compID,
			}),
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:01", IPAddress: "10.0.0.1"})
		assert.Nil(t, ops.insert)
		assert.Nil(t, ops.update)
		assert.Empty(t, ops.deletes)
	})

	t.Run("dpu BMC at index 0 + host BMC with MAC change -> swap host only", func(t *testing.T) {
		c := &model.Component{
			ID: compID,
			BMCs: withDPUFirst(model.BMC{
				MacAddress:  "aa:bb:cc:dd:ee:01",
				Type:        devicetypes.BMCTypeToString(devicetypes.BMCTypeHost),
				IPAddress:   &hostIP,
				ComponentID: compID,
			}),
		}
		ops := planBMCReconciliation(c, expectedBMCSpec{MACAddress: "aa:bb:cc:dd:ee:02", IPAddress: "10.0.0.2"})
		require.Len(t, ops.deletes, 1)
		require.NotNil(t, ops.insert)
		assert.Equal(t, "aa:bb:cc:dd:ee:01", ops.deletes[0].MacAddress, "must delete the host BMC, never the DPU one")
		assert.Equal(t, devicetypes.BMCTypeToString(devicetypes.BMCTypeHost), ops.deletes[0].Type)
		assert.Equal(t, "aa:bb:cc:dd:ee:02", ops.insert.MacAddress)
	})
}

// TestEqualOptionalString checks equalOptionalString across nil/nil,
// equal-by-value pointers, differing values, and one-sided nil.
func TestEqualOptionalString(t *testing.T) {
	s1 := "a"
	s2 := "b"
	s1b := "a" // distinct variable, same content as s1: equality is by value
	assert.True(t, equalOptionalString(nil, nil))
	assert.True(t, equalOptionalString(&s1, &s1b))
	assert.False(t, equalOptionalString(&s1, &s2))
	assert.False(t, equalOptionalString(nil, &s1))
	assert.False(t, equalOptionalString(&s1, nil))
}

// TestOptionalString checks that optionalString maps "" to nil and any other
// string to a pointer holding that string.
func TestOptionalString(t *testing.T) {
	assert.Nil(t, optionalString(""))
	out := optionalString("x")
	require.NotNil(t, out)
	assert.Equal(t, "x", *out)
}

// Pull-guard tests for the 3 component types, mirroring TestPullExpectedRacks.
+ +type errExpectedMachinesClient struct { + nicoapi.Client + err error + rows []nicoapi.ExpectedMachineDetail +} + +func (c *errExpectedMachinesClient) GetAllExpectedMachineDetails(_ context.Context) ([]nicoapi.ExpectedMachineDetail, error) { + if c.err != nil { + return nil, c.err + } + return c.rows, nil +} + +type errExpectedSwitchesClient struct { + nicoapi.Client + err error + rows []nicoapi.ExpectedSwitchDetail +} + +func (c *errExpectedSwitchesClient) GetAllExpectedSwitchDetails(_ context.Context) ([]nicoapi.ExpectedSwitchDetail, error) { + if c.err != nil { + return nil, c.err + } + return c.rows, nil +} + +type errExpectedPowerShelvesClient struct { + nicoapi.Client + err error + rows []nicoapi.ExpectedPowerShelfDetail +} + +func (c *errExpectedPowerShelvesClient) GetAllExpectedPowerShelfDetails(_ context.Context) ([]nicoapi.ExpectedPowerShelfDetail, error) { + if c.err != nil { + return nil, c.err + } + return c.rows, nil +} + +func TestPullExpectedMachines(t *testing.T) { + ctx := context.Background() + t.Run("rpc error -> rpcOK=false", func(t *testing.T) { + _, ok, hasRows := pullExpectedMachines(ctx, &errExpectedMachinesClient{Client: nicoapi.NewMockClient(), err: errors.New("boom")}) + assert.False(t, ok) + assert.False(t, hasRows) + }) + t.Run("empty -> rpcOK=true, hasRows=false", func(t *testing.T) { + _, ok, hasRows := pullExpectedMachines(ctx, &errExpectedMachinesClient{Client: nicoapi.NewMockClient()}) + assert.True(t, ok) + assert.False(t, hasRows) + }) + t.Run("populated -> both true", func(t *testing.T) { + _, ok, hasRows := pullExpectedMachines(ctx, &errExpectedMachinesClient{ + Client: nicoapi.NewMockClient(), + rows: []nicoapi.ExpectedMachineDetail{{ExpectedMachineID: "x"}}, + }) + assert.True(t, ok) + assert.True(t, hasRows) + }) +} + +func TestPullExpectedSwitches(t *testing.T) { + ctx := context.Background() + t.Run("rpc error -> rpcOK=false", func(t *testing.T) { + _, ok, hasRows := pullExpectedSwitches(ctx, 
&errExpectedSwitchesClient{Client: nicoapi.NewMockClient(), err: errors.New("boom")}) + assert.False(t, ok) + assert.False(t, hasRows) + }) + t.Run("empty -> rpcOK=true, hasRows=false", func(t *testing.T) { + _, ok, hasRows := pullExpectedSwitches(ctx, &errExpectedSwitchesClient{Client: nicoapi.NewMockClient()}) + assert.True(t, ok) + assert.False(t, hasRows) + }) +} + +func TestPullExpectedPowerShelves(t *testing.T) { + ctx := context.Background() + t.Run("rpc error -> rpcOK=false", func(t *testing.T) { + _, ok, hasRows := pullExpectedPowerShelves(ctx, &errExpectedPowerShelvesClient{Client: nicoapi.NewMockClient(), err: errors.New("boom")}) + assert.False(t, ok) + assert.False(t, hasRows) + }) + t.Run("empty -> rpcOK=true, hasRows=false", func(t *testing.T) { + _, ok, hasRows := pullExpectedPowerShelves(ctx, &errExpectedPowerShelvesClient{Client: nicoapi.NewMockClient()}) + assert.True(t, ok) + assert.False(t, hasRows) + }) +} diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_rack.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_rack.go new file mode 100644 index 0000000000..76906a90b3 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_rack.go @@ -0,0 +1,467 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "context" + "fmt" + "reflect" + + "github.com/google/uuid" + "github.com/rs/zerolog/log" + "github.com/uptrace/bun" + + cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model" + "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" +) + +// Well-known label keys Core writes on ExpectedRack.metadata.labels. Mirrored +// here so this package doesn't pull in the api-model crate's Rust constants. +// Keep in sync with crates/api-model/src/rack.rs. 
+const ( + labelChassisManufacturer = "chassis.manufacturer" + labelChassisSerialNumber = "chassis.serial-number" + labelChassisModel = "chassis.model" + labelLocationRegion = "location.region" + labelLocationDatacenter = "location.datacenter" + labelLocationRoom = "location.room" + labelLocationPosition = "location.position" +) + +// pullExpectedRacks wraps the nicoapi RPC with the two safety guards the +// mirror needs: +// +// 1. RPC-failure guard. If GetAllExpectedRackDetails errors, return rpcOK +// false so the caller skips reconciliation entirely. Treating an RPC +// error as "Core has no racks" would soft-delete every Flow rack on the +// next pass. +// +// 2. Empty-response guard. If the RPC succeeds but returns zero rows, return +// hasRows false so the caller can apply inserts/updates (no-ops in this +// case) but skip the delete phase. Core sometimes briefly serves an empty +// expected_* table during restarts and schema upgrades; leaving deletes +// to a subsequent run that still sees the table empty makes the mirror +// tolerant of those transients. +func pullExpectedRacks( + ctx context.Context, + nicoClient nicoapi.Client, +) (rows []nicoapi.ExpectedRackDetail, rpcOK bool, hasRows bool) { + rows, err := nicoClient.GetAllExpectedRackDetails(ctx) + if err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: pulling expected racks from Core failed; skipping rack mirror this cycle") + return nil, false, false + } + if len(rows) == 0 { + log.Warn().Msg("Expected-inventory mirror: Core returned zero expected racks; skipping rack delete phase this cycle") + return nil, true, false + } + return rows, true, true +} + +// mirrorExpectedRacks reconciles Flow's rack table against Core's +// expected_racks view. The algorithm is, in order: +// +// 1. Index every Flow rack — including soft-deleted ones — by external_id +// (mirror-owned) and by (manufacturer, serial_number) (the natural key +// shared with Core). 
Including soft-deleted rows is what makes +// resurrection work: a rack that briefly disappeared from Core and came +// back keeps its UUID, and a re-insert would otherwise collide on the +// (manufacturer, serial_number) unique index that the soft-deleted row +// still occupies. +// +// 2. For each Core row, find the matching Flow row preferring external_id +// and falling back to (manufacturer, serial_number) to adopt rows that +// predate the mirror. New rows are inserted. A matched row that's +// currently soft-deleted is resurrected by clearing deleted_at; +// mirror-managed fields are updated alongside on real deltas. +// +// 3. If skipDelete is false, live Flow rows whose external_id is set but no +// longer appear in Core are soft-deleted. Soft-deleted rows that Core +// doesn't report either are left alone (already gone). Rows with a NULL +// external_id (legacy ingestion-gRPC rows the mirror has never adopted) +// are exempted and warn-logged so the operator has a visible signal of +// pending cleanup. +// +// All writes for one pass happen in a single transaction so partial failures +// can't leave the table half-mirrored. 
+func mirrorExpectedRacks( + ctx context.Context, + pool *cdb.Session, + coreRacks []nicoapi.ExpectedRackDetail, + skipDelete bool, +) mirrorResult { + result := mirrorResult{resource: "rack", pulled: len(coreRacks)} + + flowRacks, err := getAllRacksIncludingDeleted(ctx, pool.DB) + if err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: loading Flow racks failed; skipping rack mirror this cycle") + return result + } + + flowByExtID := make(map[string]*model.Rack, len(flowRacks)) + flowBySerial := make(map[string]*model.Rack, len(flowRacks)) + for i := range flowRacks { + r := &flowRacks[i] + if r.ExternalID != nil && *r.ExternalID != "" { + flowByExtID[*r.ExternalID] = r + } + flowBySerial[rackNaturalKey(r.Manufacturer, r.SerialNumber)] = r + } + + type plan struct { + toInsert []model.Rack + toUpdate []model.Rack + toDelete []model.Rack + } + var p plan + + // Tombstones (soft-deleted rows) indexed by name. Used to GC stale rows + // that occupy the full unique rack_name_idx index before INSERT/UPDATE + // statements that would otherwise collide. Same row can be matched at + // most once: gcTombstoneForNameReuse deletes the entry after firing so + // we don't attempt to GC the same tombstone twice in the same cycle. + tombstonesByName := make(map[string]*model.Rack) + for i := range flowRacks { + r := &flowRacks[i] + if r.DeletedAt != nil { + tombstonesByName[r.Name] = r + } + } + + seenExtID := make(map[string]struct{}, len(coreRacks)) + + for _, cr := range coreRacks { + built, ok := buildRackFromCore(cr) + if !ok { + // Required fields (manufacturer / serial) missing in Core's labels; + // inserting would violate NOT NULL or the (manufacturer, serial) + // unique constraint. Skip and let the operator fix the Core data. + log.Warn(). + Str("rack_id", cr.RackID). + Str("name", cr.Name). 
+ Msg("Expected-inventory mirror: skipping Core expected rack missing chassis manufacturer or serial-number labels") + result.skippedNoIDOrKey++ + continue + } + + // Track which Core rack_ids are present so the delete phase can spot + // Flow rows whose external_id is no longer covered. Empty rack_ids + // can't be tracked (would collide on the partial unique index too); + // the warn below makes the operator gap visible. + if cr.RackID != "" { + seenExtID[cr.RackID] = struct{}{} + } else { + log.Warn(). + Str("rack_profile_id", cr.RackProfileID). + Str("name", cr.Name). + Str("manufacturer", built.Manufacturer). + Str("serial", built.SerialNumber). + Msg("Expected-inventory mirror: Core expected rack has no rack_id; rack will be mirrored but components can't reference it") + } + + // Prefer external_id match (already adopted on a previous cycle). + // Empty rack_ids never hit flowByExtID by construction. + if existing, ok := flowByExtID[cr.RackID]; ok && cr.RackID != "" { + candidate := *existing + needUpdate := false + if candidate.DeletedAt != nil { + candidate.DeletedAt = nil + needUpdate = true + result.resurrected++ + } + if patched := rackUpdatedFromCore(&candidate, &built); patched != nil { + candidate = *patched + needUpdate = true + } + if needUpdate { + p.toUpdate = append(p.toUpdate, candidate) + } + continue + } + + // Fall back to natural key (legacy ingestion-gRPC rows the mirror has + // never adopted; adopt by writing external_id alongside any deltas). + // A serial match that's also soft-deleted gets resurrected at the + // same time — see the function-level comment for why this matters. 
+ if existing, ok := flowBySerial[rackNaturalKey(built.Manufacturer, built.SerialNumber)]; ok { + candidate := *existing + candidate.ExternalID = built.ExternalID + if candidate.DeletedAt != nil { + candidate.DeletedAt = nil + result.resurrected++ + } + if patched := rackUpdatedFromCore(&candidate, &built); patched != nil { + candidate = *patched + } + p.toUpdate = append(p.toUpdate, candidate) + result.adopted++ + continue + } + + p.toInsert = append(p.toInsert, built) + } + + // Reconcile the delete side. Already soft-deleted rows are skipped: if + // Core still lists them, the match path above resurrected them; if not, + // they're correctly gone already. Live Flow rows whose external_id is set + // but absent from Core get soft-deleted; legacy (NULL external_id) rows + // are exempted with a warn so the operator notices. + for i := range flowRacks { + r := &flowRacks[i] + if r.DeletedAt != nil { + continue + } + hasExt := r.ExternalID != nil && *r.ExternalID != "" + if hasExt { + if _, present := seenExtID[*r.ExternalID]; present { + continue + } + if skipDelete { + continue + } + p.toDelete = append(p.toDelete, *r) + continue + } + // External_id is NULL — never adopted. Only legacy-warn if the + // (manufacturer, serial) doesn't appear in Core's set either, + // otherwise it'll be picked up by the adoption path above and a + // "future GC" warn would be misleading. + if _, adoptable := flowBySerialInCore(r, coreRacks); !adoptable { + result.legacyExempt++ + log.Warn(). + Str("rack_name", r.Name). + Str("rack_serial", r.SerialNumber). + Str("rack_manufacturer", r.Manufacturer). 
+ Msg("Expected-inventory mirror: legacy Flow rack not present in Core's expected inventory; left in place for now (a follow-up will GC these once all sites have migrated)") + } + } + + if len(p.toInsert) == 0 && len(p.toUpdate) == 0 && len(p.toDelete) == 0 { + return result + } + + if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { + for i := range p.toInsert { + if err := gcTombstoneForNameReuse(ctx, tx, tombstonesByName, p.toInsert[i].Name, uuid.Nil); err != nil { + return err + } + if _, err := tx.NewInsert().Model(&p.toInsert[i]).Exec(ctx); err != nil { + return fmt.Errorf("insert rack %q: %w", p.toInsert[i].Name, err) + } + } + for i := range p.toUpdate { + if err := gcTombstoneForNameReuse(ctx, tx, tombstonesByName, p.toUpdate[i].Name, p.toUpdate[i].ID); err != nil { + return err + } + if _, err := tx.NewUpdate().Model(&p.toUpdate[i]).Where("id = ?", p.toUpdate[i].ID).Exec(ctx); err != nil { + return fmt.Errorf("update rack %q: %w", p.toUpdate[i].Name, err) + } + } + for i := range p.toDelete { + if _, err := tx.NewDelete().Model(&p.toDelete[i]).Where("id = ?", p.toDelete[i].ID).Exec(ctx); err != nil { + return fmt.Errorf("soft-delete rack %q: %w", p.toDelete[i].Name, err) + } + } + return nil + }); err != nil { + log.Error().Err(err).Msg("Expected-inventory mirror: rack reconciliation transaction failed; mirror is no-op this cycle") + // Tx rolled back: per-spec decisions logged above describe intent, + // not committed state. Strip success-side counters so the summary + // log reflects what actually landed (nothing). pulled, + // skippedNoIDOrKey and legacyExempt survive: they're decided + // before the tx opened and aren't invalidated by the rollback. 
+ result.resurrected = 0 + result.adopted = 0 + return result + } + + result.inserted = len(p.toInsert) + result.updated = len(p.toUpdate) + result.softDeleted = len(p.toDelete) + return result +} + +// gcTombstoneForNameReuse hard-deletes a soft-deleted rack that's occupying +// the supplied name so the caller's INSERT or UPDATE doesn't collide on +// rack_name_idx (which is a full unique constraint and so applies to +// tombstones too). excludeID lets the UPDATE path skip the row that's +// itself being resurrected (the tombstone IS that row; deleting it would +// erase what we're about to write). uuid.Nil for INSERT — no exclusion +// needed. The map entry is removed on hit so a later op against the same +// name doesn't replay the same delete. +func gcTombstoneForNameReuse( + ctx context.Context, + tx bun.Tx, + tombstonesByName map[string]*model.Rack, + name string, + excludeID uuid.UUID, +) error { + tomb, ok := tombstonesByName[name] + if !ok || tomb.ID == excludeID { + return nil + } + if _, err := tx.NewDelete().Model(tomb).Where("id = ?", tomb.ID).ForceDelete().Exec(ctx); err != nil { + return fmt.Errorf("GC stale rack tombstone occupying name %q: %w", name, err) + } + delete(tombstonesByName, name) + log.Info(). + Str("rack_name", name). + Str("tombstone_id", tomb.ID.String()). + Str("tombstone_manufacturer", tomb.Manufacturer). + Str("tombstone_serial", tomb.SerialNumber). + Msg("Expected-inventory mirror: GC'd stale rack tombstone to free up name for reuse") + return nil +} + +// getAllRacksIncludingDeleted returns every rack in the Flow DB, soft-deleted +// rows included. The mirror needs the deleted ones so it can (a) resurrect a +// rack that comes back in Core instead of attempting an INSERT that would +// collide on the (manufacturer, serial_number) unique index the tombstone +// still holds, and (b) not double-delete a row that's already gone. 
+func getAllRacksIncludingDeleted(ctx context.Context, idb bun.IDB) ([]model.Rack, error) {
+	var racks []model.Rack
+	// WhereAllWithDeleted lifts bun's soft-delete filter so rows with a
+	// non-NULL deleted_at are returned alongside live ones.
+	if err := idb.NewSelect().Model(&racks).WhereAllWithDeleted().Scan(ctx); err != nil {
+		return nil, err
+	}
+	return racks, nil
+}
+
+// flowBySerialInCore scans Core's racks for one that shares this Flow rack's
+// (manufacturer, serial_number). On the first match it returns that Core
+// rack's RackID (which may itself be empty) and true; otherwise it returns ""
+// and false. Core rows missing either chassis label are skipped because they
+// cannot form a natural key.
+//
+// Used to suppress the "legacy not in Core" warn for rows that the adoption
+// path will pick up on this same cycle.
+func flowBySerialInCore(r *model.Rack, coreRacks []nicoapi.ExpectedRackDetail) (string, bool) {
+	want := rackNaturalKey(r.Manufacturer, r.SerialNumber)
+	for _, cr := range coreRacks {
+		manufacturer := cr.Labels[labelChassisManufacturer]
+		serial := cr.Labels[labelChassisSerialNumber]
+		if manufacturer == "" || serial == "" {
+			continue
+		}
+		if rackNaturalKey(manufacturer, serial) == want {
+			return cr.RackID, true
+		}
+	}
+	return "", false
+}
+
+// buildRackFromCore translates one Core ExpectedRackDetail into the Flow Rack
+// shape the mirror will insert. Returns false (and a zero Rack) if the Core
+// row is missing fields that Flow's rack table requires (manufacturer /
+// serial_number are NOT NULL and form a unique key).
+//
+// Name falls back to RackID, then to "<manufacturer>-<serial>". ExternalID,
+// Description and Location are left nil when Core supplies nothing for them.
+func buildRackFromCore(cr nicoapi.ExpectedRackDetail) (model.Rack, bool) {
+	manufacturer := cr.Labels[labelChassisManufacturer]
+	serial := cr.Labels[labelChassisSerialNumber]
+	if manufacturer == "" || serial == "" {
+		return model.Rack{}, false
+	}
+
+	name := cr.Name
+	if name == "" {
+		// Flow's rack.name is NOT NULL with a unique index. Fall back to
+		// Core's stable rack_id first (operator-meaningful), then to
+		// manufacturer-serial so the row is still insertable when Core has
+		// neither. Operators can always rename later via the existing rack
+		// PATCH path.
+		if cr.RackID != "" {
+			name = cr.RackID
+		} else {
+			name = manufacturer + "-" + serial
+		}
+	}
+
+	r := model.Rack{
+		Name:         name,
+		Manufacturer: manufacturer,
+		SerialNumber: serial,
+	}
+	// Leave ExternalID NULL when Core has no rack_id. Storing an empty
+	// string would still hit the partial unique index (which excludes NULL
+	// but not the empty string), so two such racks would collide.
+	if cr.RackID != "" {
+		extID := cr.RackID
+		r.ExternalID = &extID
+	}
+
+	// Only attach the JSONB maps when they carry at least one key, so an
+	// absent value stays nil instead of becoming an empty object.
+	if desc := rackDescriptionFromLabels(cr.Labels, cr.Description); len(desc) > 0 {
+		r.Description = desc
+	}
+	if loc := rackLocationFromLabels(cr.Labels); len(loc) > 0 {
+		r.Location = loc
+	}
+	return r, true
+}
+
+// rackDescriptionFromLabels extracts the JSONB-bound description fields the
+// existing GetListOfRacks filter knows about (currently just "model") and
+// preserves Core's free-form description text under "text". Returns an empty
+// (non-nil) map when there's nothing to record so the caller can leave
+// Description as SQL NULL.
+func rackDescriptionFromLabels(labels map[string]string, description string) map[string]any {
+	out := map[string]any{}
+	if v := labels[labelChassisModel]; v != "" {
+		out["model"] = v
+	}
+	if description != "" {
+		out["text"] = description
+	}
+	return out
+}
+
+// rackLocationFromLabels extracts the well-known location.* labels into the
+// JSONB Location column. Labels that are absent or empty are omitted. Returns
+// an empty (non-nil) map when none are present.
+func rackLocationFromLabels(labels map[string]string) map[string]any {
+	out := map[string]any{}
+	if v := labels[labelLocationRegion]; v != "" {
+		out["region"] = v
+	}
+	if v := labels[labelLocationDatacenter]; v != "" {
+		out["datacenter"] = v
+	}
+	if v := labels[labelLocationRoom]; v != "" {
+		out["room"] = v
+	}
+	if v := labels[labelLocationPosition]; v != "" {
+		out["position"] = v
+	}
+	return out
+}
+
+// rackUpdatedFromCore returns a copy of `existing` with mirror-managed fields
+// overwritten from `fromCore`. It deliberately does not touch identity
+// (manufacturer / serial_number), lifecycle (status / ingested_at), or fields
+// the mirror has no opinion on (nvldomain_id is out of scope for this PR; the
+// runtime sync owns it).
+//
+// The copy is shallow: map and slice fields of the result still alias the
+// maps/slices held by `existing` or `fromCore`.
+//
+// Returns nil when no patchable field changed so the caller can skip a no-op
+// UPDATE.
+func rackUpdatedFromCore(existing, fromCore *model.Rack) *model.Rack {
+	// sameJSON reports whether two JSONB maps hold the same content.
+	// reflect.DeepEqual alone distinguishes a nil map from an empty non-nil
+	// one, but both mean "nothing recorded" here (buildRackFromCore leaves
+	// empty results nil), so treating them as different would schedule an
+	// UPDATE that changes nothing.
+	sameJSON := func(a, b map[string]any) bool {
+		if len(a) == 0 && len(b) == 0 {
+			return true
+		}
+		return reflect.DeepEqual(a, b)
+	}
+
+	patched := *existing
+	changed := false
+
+	// An empty Core name never clobbers the stored name.
+	if fromCore.Name != "" && existing.Name != fromCore.Name {
+		patched.Name = fromCore.Name
+		changed = true
+	}
+	if !sameJSON(existing.Description, fromCore.Description) {
+		patched.Description = fromCore.Description
+		changed = true
+	}
+	if !sameJSON(existing.Location, fromCore.Location) {
+		patched.Location = fromCore.Location
+		changed = true
+	}
+	// Adopt: existing.ExternalID is unset (nil or empty) and fromCore now
+	// provides a non-empty one. An already-set ExternalID is never replaced.
+	if (existing.ExternalID == nil || *existing.ExternalID == "") && fromCore.ExternalID != nil && *fromCore.ExternalID != "" {
+		patched.ExternalID = fromCore.ExternalID
+		changed = true
+	}
+
+	if !changed {
+		return nil
+	}
+	return &patched
+}
diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_rack_test.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_rack_test.go
new file mode 100644
index 0000000000..abda2a7f74
--- /dev/null
+++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/expected_mirror_rack_test.go
@@ -0,0 +1,307 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package inventorysync
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	"github.com/google/uuid"
+	"github.com/stretchr/testify/assert"
+	"github.com/stretchr/testify/require"
+
+	"github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model"
+	"github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi"
+)
+
+// TestRackNaturalKeyIsCollisionFree asserts that rackNaturalKey produces
+// different keys for ("ab", "cd") and ("abc", "d").
+func TestRackNaturalKeyIsCollisionFree(t *testing.T) {
+	// The classic concatenation bug: "ab"+"cd" and "abc"+"d" collide if you
+	// don't separate with a forbidden byte.
+	a := rackNaturalKey("ab", "cd")
+	b := rackNaturalKey("abc", "d")
+	assert.NotEqual(t, a, b, "manufacturer/serial collisions must be impossible")
+}
+
+// TestBuildRackFromCore table-tests buildRackFromCore: each case supplies one
+// ExpectedRackDetail, the expected ok flag, and an optional assertRow callback
+// that inspects the returned Rack when ok is true.
+func TestBuildRackFromCore(t *testing.T) {
+	tests := []struct {
+		name      string
+		in        nicoapi.ExpectedRackDetail
+		wantOK    bool
+		assertRow func(t *testing.T, r model.Rack)
+	}{
+		{
+			name: "happy path with all labels",
+			in: nicoapi.ExpectedRackDetail{
+				RackID: "a12",
+				Name:   "Rack A12",
+				Labels: map[string]string{
+					labelChassisManufacturer: "Foxconn",
+					labelChassisSerialNumber: "SN-A12",
+					labelChassisModel:        "MGX-Rack-Gen2",
+					labelLocationRegion:      "us-east",
+					labelLocationDatacenter:  "DC1",
+				},
+				Description: "Building 1, Row 3",
+			},
+			wantOK: true,
+			assertRow: func(t *testing.T, r model.Rack) {
+				assert.Equal(t, "Rack A12", r.Name)
+				assert.Equal(t, "Foxconn", r.Manufacturer)
+				assert.Equal(t, "SN-A12", r.SerialNumber)
+				require.NotNil(t, r.ExternalID)
+				assert.Equal(t, "a12", *r.ExternalID)
+				assert.Equal(t, "MGX-Rack-Gen2", r.Description["model"])
+				assert.Equal(t, "Building 1, Row 3", r.Description["text"])
+				assert.Equal(t, "us-east", r.Location["region"])
+				assert.Equal(t, "DC1", r.Location["datacenter"])
+				assert.NotContains(t, r.Location, "room")
+				assert.NotContains(t, r.Location, "position")
+			},
+		},
+		{
+			name: "empty name falls back to rack_id so the NOT NULL/unique name constraint holds",
+			in: nicoapi.ExpectedRackDetail{
+				RackID: "b07",
+				Labels: map[string]string{
+					labelChassisManufacturer: "Foxconn",
+					labelChassisSerialNumber: "SN-B07",
+				},
+			},
+			wantOK: true,
+			assertRow: func(t *testing.T, r model.Rack) {
+				assert.Equal(t, "b07", r.Name)
+			},
+		},
+		{
+			name: "missing manufacturer is unusable",
+			in: nicoapi.ExpectedRackDetail{
+				RackID: "c01",
+				Labels: map[string]string{
+					labelChassisSerialNumber: "SN-C01",
+				},
+			},
+			wantOK: false,
+		},
+		{
+			name: "missing serial is unusable",
+			in: nicoapi.ExpectedRackDetail{
+				RackID: "c02",
+				Labels: map[string]string{
+					labelChassisManufacturer: "Foxconn",
+				},
+			},
+			wantOK: false,
+		},
+		{
+			name: "no description/location labels leaves jsonb columns nil",
+			in: nicoapi.ExpectedRackDetail{
+				RackID: "d05",
+				Name:   "bare",
+				Labels: map[string]string{
+					labelChassisManufacturer: "Foxconn",
+					labelChassisSerialNumber: "SN-D05",
+				},
+			},
+			wantOK: true,
+			assertRow: func(t *testing.T, r model.Rack) {
+				assert.Nil(t, r.Description)
+				assert.Nil(t, r.Location)
+			},
+		},
+		{
+			name: "missing rack_id yields nil ExternalID (NULL in DB) so partial unique index stays clean",
+			in: nicoapi.ExpectedRackDetail{
+				Name: "noext",
+				Labels: map[string]string{
+					labelChassisManufacturer: "Foxconn",
+					labelChassisSerialNumber: "SN-E01",
+				},
+			},
+			wantOK: true,
+			assertRow: func(t *testing.T, r model.Rack) {
+				assert.Nil(t, r.ExternalID)
+				assert.Equal(t, "noext", r.Name)
+			},
+		},
+		{
+			name: "missing both rack_id and name falls back to manufacturer-serial",
+			in: nicoapi.ExpectedRackDetail{
+				Labels: map[string]string{
+					labelChassisManufacturer: "Foxconn",
+					labelChassisSerialNumber: "SN-F01",
+				},
+			},
+			wantOK: true,
+			assertRow: func(t *testing.T, r model.Rack) {
+				assert.Equal(t, "Foxconn-SN-F01", r.Name)
+				assert.Nil(t, r.ExternalID)
+			},
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			got, ok := buildRackFromCore(tc.in)
+			assert.Equal(t, tc.wantOK, ok)
+			// Row assertions only make sense for rows the builder accepted.
+			if ok && tc.assertRow != nil {
+				tc.assertRow(t, got)
+			}
+		})
+	}
+}
+
+// TestRackUpdatedFromCore exercises rackUpdatedFromCore against a shared
+// baseline rack: name, description and location changes and external_id
+// adoption must yield a patched copy; identical input or an empty Core name
+// must yield nil.
+func TestRackUpdatedFromCore(t *testing.T) {
+	id := uuid.New()
+	// base returns a fresh baseline rack on every call so subtests do not
+	// share a pointer.
+	base := func() *model.Rack {
+		return &model.Rack{
+			ID:           id,
+			Name:         "old-name",
+			Manufacturer: "Foxconn",
+			SerialNumber: "SN-1",
+			Description:  map[string]any{"model": "old-model"},
+			Location:     map[string]any{"region": "us-west"},
+		}
+	}
+
+	t.Run("name change produces an update", func(t *testing.T) {
+		existing := base()
+		fromCore := *base()
+		fromCore.Name = "new-name"
+		got := rackUpdatedFromCore(existing, &fromCore)
+		require.NotNil(t, got)
+		assert.Equal(t, "new-name", got.Name)
+	})
+
+	t.Run("identical inputs produce no update", func(t *testing.T) {
+		existing := base()
+		fromCore := *base()
+		assert.Nil(t, rackUpdatedFromCore(existing, &fromCore))
+	})
+
+	t.Run("description swap is detected", func(t *testing.T) {
+		existing := base()
+		fromCore := *base()
+		fromCore.Description = map[string]any{"model": "new-model"}
+		got := rackUpdatedFromCore(existing, &fromCore)
+		require.NotNil(t, got)
+		assert.Equal(t, "new-model", got.Description["model"])
+	})
+
+	t.Run("location swap is detected", func(t *testing.T) {
+		existing := base()
+		fromCore := *base()
+		fromCore.Location = map[string]any{"region": "us-east"}
+		got := rackUpdatedFromCore(existing, &fromCore)
+		require.NotNil(t, got)
+		assert.Equal(t, "us-east", got.Location["region"])
+	})
+
+	t.Run("adoption: existing has nil external_id, core provides one", func(t *testing.T) {
+		existing := base()
+		existing.ExternalID = nil
+		fromCore := *base()
+		ext := "a12"
+		fromCore.ExternalID = &ext
+		got := rackUpdatedFromCore(existing, &fromCore)
+		require.NotNil(t, got)
+		require.NotNil(t, got.ExternalID)
+		assert.Equal(t, "a12", *got.ExternalID)
+	})
+
+	t.Run("empty name in fromCore does not clobber existing name", func(t *testing.T) {
+		existing := base()
+		fromCore := *base()
+		fromCore.Name = ""
+		assert.Nil(t, rackUpdatedFromCore(existing, &fromCore))
+	})
+}
+
+// TestFlowBySerialInCore checks flowBySerialInCore against a two-row Core
+// fixture: a full (manufacturer, serial) match returns the Core RackID, an
+// unknown serial returns false, and a Core row lacking the manufacturer label
+// never matches.
+func TestFlowBySerialInCore(t *testing.T) {
+	core := []nicoapi.ExpectedRackDetail{
+		{
+			RackID: "a12",
+			Labels: map[string]string{
+				labelChassisManufacturer: "Foxconn",
+				labelChassisSerialNumber: "SN-A12",
+			},
+		},
+		{
+			RackID: "b07",
+			Labels: map[string]string{
+				// Missing manufacturer; should not match anything.
+				labelChassisSerialNumber: "SN-B07",
+			},
+		},
+	}
+
+	t.Run("matches by (manufacturer, serial)", func(t *testing.T) {
+		flow := &model.Rack{Manufacturer: "Foxconn", SerialNumber: "SN-A12"}
+		ext, ok := flowBySerialInCore(flow, core)
+		assert.True(t, ok)
+		assert.Equal(t, "a12", ext)
+	})
+
+	t.Run("no match returns false", func(t *testing.T) {
+		flow := &model.Rack{Manufacturer: "Foxconn", SerialNumber: "SN-ZZ"}
+		_, ok := flowBySerialInCore(flow, core)
+		assert.False(t, ok)
+	})
+
+	t.Run("core row without manufacturer is ignored", func(t *testing.T) {
+		flow := &model.Rack{Manufacturer: "Foxconn", SerialNumber: "SN-B07"}
+		_, ok := flowBySerialInCore(flow, core)
+		assert.False(t, ok)
+	})
+}
+
+// errExpectedRacksClient is a tiny test wrapper around the production mock
+// that overrides GetAllExpectedRackDetails to inject either an RPC error or a
+// custom row set. It satisfies the same nicoapi.Client interface so it slots
+// straight into pullExpectedRacks without touching the production mock.
+type errExpectedRacksClient struct {
+	nicoapi.Client
+	err  error
+	rows []nicoapi.ExpectedRackDetail
+}
+
+// GetAllExpectedRackDetails returns the injected error when one is set,
+// otherwise the injected rows (nil when none were configured).
+func (c *errExpectedRacksClient) GetAllExpectedRackDetails(_ context.Context) ([]nicoapi.ExpectedRackDetail, error) {
+	if c.err != nil {
+		return nil, c.err
+	}
+	return c.rows, nil
+}
+
+// TestPullExpectedRacks checks the (rows, rpcOK, hasRows) triple returned by
+// pullExpectedRacks for an RPC error, an empty response, and a populated
+// response.
+func TestPullExpectedRacks(t *testing.T) {
+	ctx := context.Background()
+
+	t.Run("rpc error returns rpcOK=false so caller skips the type", func(t *testing.T) {
+		c := &errExpectedRacksClient{Client: nicoapi.NewMockClient(), err: errors.New("boom")}
+		rows, rpcOK, hasRows := pullExpectedRacks(ctx, c)
+		assert.Nil(t, rows)
+		assert.False(t, rpcOK)
+		assert.False(t, hasRows)
+	})
+
+	t.Run("empty response returns rpcOK=true, hasRows=false so caller skips delete only", func(t *testing.T) {
+		c := &errExpectedRacksClient{Client: nicoapi.NewMockClient()}
+		rows, rpcOK, hasRows := pullExpectedRacks(ctx, c)
+		assert.Nil(t, rows)
+		assert.True(t, rpcOK)
+		assert.False(t, hasRows)
+	})
+
+	t.Run("populated response returns both flags true", func(t *testing.T) {
+		c := &errExpectedRacksClient{
+			Client: nicoapi.NewMockClient(),
+			rows: []nicoapi.ExpectedRackDetail{
+				{RackID: "a12"},
+			},
+		}
+		rows, rpcOK, hasRows := pullExpectedRacks(ctx, c)
+		assert.Len(t, rows, 1)
+		assert.True(t, rpcOK)
+		assert.True(t, hasRows)
+	})
+}
diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory.go
index 657a05b1cd..c08c37ee16 100644
--- a/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory.go
+++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory.go
@@ -5,912 +5,51 @@ package inventorysync
 
 import (
 	"context"
-	"fmt"
-	"net"
-	"time"
 
 	"github.com/rs/zerolog/log"
 	"github.com/uptrace/bun"
 
 	cdb "github.com/NVIDIA/infra-controller/rest-api/db/pkg/db"
-	"github.com/NVIDIA/infra-controller/rest-api/flow/internal/common/utils"
 	"github.com/NVIDIA/infra-controller/rest-api/flow/internal/db/model"
"github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi" - pb "github.com/NVIDIA/infra-controller/rest-api/flow/internal/nicoapi/gen" - "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/common/devicetypes" - "github.com/NVIDIA/infra-controller/rest-api/flow/pkg/types" ) -const driftFieldSerialNumber = "serial_number" - -// runInventoryOne is a single iteration for RunInventory. -// It syncs each resource type against Core (NICo), collects all drifts, and -// persists them in one shot. +// runInventoryOne is a single iteration of the inventory sync job. Order: +// +// 1. syncExpectedFromCore mirrors Core's expected inventory into Flow's +// rack / component tables (the "expected" half of the package — see +// expected_mirror*.go). Gated by expectedSyncEnabled; when false the +// step is skipped entirely and Flow's existing ingestion path is the +// sole writer to rack / component. +// 2. runActualSync reconciles each component type against Core's runtime +// view and returns one combined drift set (the "actual" half — see +// actual_sync*.go). +// 3. The drift set replaces the whole component_drift table atomically so +// stale rows from previous runs can't linger. +// +// Errors are handled inside each step: any per-type RPC failure is logged +// and that type's drifts are skipped, but the rest of the cycle continues. +// A persistence failure is also logged rather than propagated — the +// scheduler retries on the next trigger. func runInventoryOne( ctx context.Context, pool *cdb.Session, nicoClient nicoapi.Client, + expectedSyncEnabled bool, ) { - var allDrifts []model.ComponentDrift - - computeReceived, machineDrifts := syncMachines(ctx, pool, nicoClient) - allDrifts = append(allDrifts, machineDrifts...) - - switchesReceived, nvSwitchDrifts := syncNVSwitchesNICo(ctx, pool, nicoClient) - allDrifts = append(allDrifts, nvSwitchDrifts...) 
- - powershelvesReceived, powershelfDrifts := syncPowershelvesNICo(ctx, pool, nicoClient) - allDrifts = append(allDrifts, powershelfDrifts...) - - log.Info(). - Int("compute", computeReceived). - Int("nvswitches", switchesReceived). - Int("powershelves", powershelvesReceived). - Msgf("Inventory received from Core: compute=%d nvswitches=%d powershelves=%d", - computeReceived, switchesReceived, powershelvesReceived) - - // Persist all drifts atomically (replace entire table) - if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { - return model.ReplaceAllDrifts(ctx, tx, allDrifts) - }); err != nil { - log.Error().Msgf("Unable to persist drift records: %v", err) + if expectedSyncEnabled { + syncExpectedFromCore(ctx, pool, nicoClient) } else { - log.Info().Msgf("Drift detection complete: %d drift(s) detected", len(allDrifts)) - } -} - -func isMachineComponentType(t string) bool { - return t == devicetypes.ComponentTypeToString(devicetypes.ComponentTypeCompute) -} - -// --------------------------------------------------------------------------- -// syncMachines: sync machine components against NICo -// --------------------------------------------------------------------------- -// -// NICo API calls (3 round-trips): -// - GetMachines (FindMachineIds + FindMachinesByIds): serial matching, -// firmware_version direct-write, and drift comparison data -// - GetPowerStates: power_state direct-write -// - GetMachinePositionInfo: position validation fields for drift comparison -// -// Flow: -// 1. DB: get all machine components -// 2. NICo GetMachines: fetch all machine details (reused for steps 3, 5, and drift) -// 3. Match by serial → direct-write external_id -// 4. NICo GetPowerStates: direct-write power_state -// 5. Direct-write firmware_version (from step 2 data) -// 6. 
NICo GetMachinePositionInfo: compare validation fields, return drifts -// -// Validation fields (compared for drift): slot_id, tray_index, host_id, serial_number -// Direct-write fields (written to DB, not compared): external_id, power_state, firmware_version -func syncMachines( - ctx context.Context, - pool *cdb.Session, - nicoClient nicoapi.Client, -) (received int, drifts []model.ComponentDrift) { - log.Debug().Msg("Syncing machines...") - - // Step 1: Get all machine components from DB - allComponents, err := model.GetAllComponents(ctx, pool.DB) - if err != nil { - log.Error().Msgf("Unable to retrieve components from db: %v", err) - return 0, nil - } - - var components []model.Component - for _, c := range allComponents { - if isMachineComponentType(c.Type) { - components = append(components, c) - } - } - - if len(components) == 0 { - return 0, nil - } - - // Step 2: Fetch all machine details from NICo - allMachineDetails, err := nicoClient.GetMachines(ctx) - if err != nil { - log.Error().Msgf("Unable to retrieve machine details from NICo: %v", err) - return 0, nil - } - received = len(allMachineDetails) - - detailByID := make(map[string]nicoapi.MachineDetail) - for _, d := range allMachineDetails { - detailByID[d.MachineID] = d - } - - // Step 3: Direct-write external_id by serial matching - syncMachineIDs(ctx, pool, allMachineDetails, components) - - // Re-read components to pick up any external_id updates - allComponents, err = model.GetAllComponents(ctx, pool.DB) - if err != nil { - log.Error().Msgf("Unable to re-read components from db after machine ID update: %v", err) - return received, nil - } - components = components[:0] - for _, c := range allComponents { - if isMachineComponentType(c.Type) { - components = append(components, c) - } - } - - // Build lookup maps for matched components - var machineIDs []string - componentsByExternalID := make(map[string]*model.Component) - for i := range components { - comp := &components[i] - if comp.ComponentID != 
nil && *comp.ComponentID != "" { - machineIDs = append(machineIDs, *comp.ComponentID) - componentsByExternalID[*comp.ComponentID] = comp - } - } - - if len(machineIDs) == 0 { - return received, buildDriftsForUnmatchedComponents(components, allMachineDetails) - } - - // Step 4: Direct-write power_state (requires separate NICo API) - syncPowerStates(ctx, pool, nicoClient, machineIDs, componentsByExternalID) - - // Step 5: Direct-write firmware_version (from pre-fetched details, no extra API call) - syncFirmwareVersions(ctx, pool, detailByID, componentsByExternalID) - - // Step 5b: Direct-write derived ComponentStatus (from pre-fetched detail.State). - syncMachineStatuses(ctx, pool, detailByID, componentsByExternalID) - - // Step 6: Fetch positions and build drift records (requires separate NICo API) - machinePositions, err := nicoClient.GetMachinePositionInfo(ctx, machineIDs) - if err != nil { - log.Error().Msgf("Unable to retrieve machine positions from NICo: %v", err) - return received, nil - } - - positionByID := make(map[string]nicoapi.MachinePosition) - for _, p := range machinePositions { - positionByID[p.MachineID] = p - } - - now := time.Now() - - for i := range components { - comp := &components[i] - - if comp.ComponentID == nil || *comp.ComponentID == "" { - compID := comp.ID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - ExternalID: nil, - DriftType: model.DriftTypeMissingInActual, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - continue - } - - externalID := *comp.ComponentID - detail, foundDetail := detailByID[externalID] - position, foundPosition := positionByID[externalID] - - if !foundDetail { - compID := comp.ID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - ExternalID: &externalID, - DriftType: model.DriftTypeMissingInActual, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - continue - } - - var posPtr *nicoapi.MachinePosition - if foundPosition { - posPtr = &position - } - fieldDiffs 
:= compareMachineFieldsForDrift(comp, detail, posPtr) - if len(fieldDiffs) > 0 { - compID := comp.ID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - ExternalID: &externalID, - DriftType: model.DriftTypeMismatch, - Diffs: fieldDiffs, - CheckedAt: now, - }) - } - } - - // Detect missing_in_expected: machines in NICo but not in local DB - for _, detail := range allMachineDetails { - if _, found := componentsByExternalID[detail.MachineID]; !found { - extID := detail.MachineID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: nil, - ExternalID: &extID, - DriftType: model.DriftTypeMissingInExpected, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - } - } - - log.Info().Msgf("Machine sync: %d drift(s) out of %d component(s)", len(drifts), len(components)) - return received, drifts -} - -// buildDriftsForUnmatchedComponents returns missing_in_actual drifts for all -// components that have no external_id, plus missing_in_expected drifts for -// every NICo machine (since no DB component has an external_id, none can -// match). 
-func buildDriftsForUnmatchedComponents( - components []model.Component, - allMachineDetails []nicoapi.MachineDetail, -) []model.ComponentDrift { - now := time.Now() - var drifts []model.ComponentDrift - for i := range components { - if components[i].ComponentID == nil || *components[i].ComponentID == "" { - compID := components[i].ID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - DriftType: model.DriftTypeMissingInActual, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - } - } - for _, detail := range allMachineDetails { - extID := detail.MachineID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: nil, - ExternalID: &extID, - DriftType: model.DriftTypeMissingInExpected, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - } - return drifts -} - -// syncMachineIDs matches components by serial number against pre-fetched NICo -// machine details and direct-writes the external_id. -func syncMachineIDs( - ctx context.Context, - pool *cdb.Session, - allDetails []nicoapi.MachineDetail, - components []model.Component, -) { - containersBySerial := make(map[string]model.Component) - for _, cur := range components { - containersBySerial[cur.SerialNumber] = cur - } - - var toUpdate []model.Component - for _, cur := range allDetails { - if cur.ChassisSerial == nil { - continue - } - if container, ok := containersBySerial[*cur.ChassisSerial]; ok { - if container.ComponentID == nil || *container.ComponentID != cur.MachineID { - componentID := cur.MachineID - container.ComponentID = &componentID - toUpdate = append(toUpdate, container) - } - } - } - - if len(toUpdate) > 0 { - if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { - for _, cur := range toUpdate { - if err := cur.SetComponentIDBySerial(ctx, tx); err != nil { - return fmt.Errorf("Unable to update machine ID: %w", err) - } - } - return nil - }); err != nil { - log.Error().Msgf("Unable to update components with serial: %v", err) - return - } - - 
log.Info().Msgf("Updated %d machine ID(s)", len(toUpdate)) - } -} - -// syncPowerStates fetches power states from NICo and direct-writes to component table. -func syncPowerStates( - ctx context.Context, - pool *cdb.Session, - nicoClient nicoapi.Client, - machineIDs []string, - componentsByExternalID map[string]*model.Component, -) { - machines, err := nicoClient.GetPowerStates(ctx, machineIDs) - if err != nil { - log.Error().Msgf("Unable to retrieve power states from nico-core-api: %v", err) - return - } - - var toUpdate []model.Component - for _, cur := range machines { - if comp, ok := componentsByExternalID[cur.MachineID]; ok { - if comp.PowerState == nil || *comp.PowerState != cur.PowerState { - powerState := cur.PowerState - comp.PowerState = &powerState - toUpdate = append(toUpdate, *comp) - } - } - } - - if len(toUpdate) > 0 { - if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { - for _, cur := range toUpdate { - if err := cur.SetPowerStateByComponentID(ctx, tx); err != nil { - return fmt.Errorf("Unable to update power state: %w", err) - } - } - return nil - }); err != nil { - log.Error().Msgf("Unable to update components with power state: %v", err) - } - } -} - -// syncFirmwareVersions direct-writes firmware_version from NICo machine details to component table. 
-func syncFirmwareVersions( - ctx context.Context, - pool *cdb.Session, - detailByID map[string]nicoapi.MachineDetail, - componentsByExternalID map[string]*model.Component, -) { - var toUpdate []model.Component - for machineID, detail := range detailByID { - if comp, ok := componentsByExternalID[machineID]; ok { - if detail.FirmwareVersion != "" && comp.FirmwareVersion != detail.FirmwareVersion { - comp.FirmwareVersion = detail.FirmwareVersion - toUpdate = append(toUpdate, *comp) - } - } - } - - if len(toUpdate) > 0 { - if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { - for _, cur := range toUpdate { - if err := cur.SetFirmwareVersionByComponentID(ctx, tx); err != nil { - return fmt.Errorf("unable to update firmware version: %w", err) - } - } - return nil - }); err != nil { - log.Error().Msgf("Unable to update components with firmware version: %v", err) - } - } -} - -// syncMachineStatuses derives a types.ComponentStatus from each machine's -// controller_state (already fetched as detail.State) and direct-writes it to -// the component row. Only rows whose status actually changed are updated. -func syncMachineStatuses( - ctx context.Context, - pool *cdb.Session, - detailByID map[string]nicoapi.MachineDetail, - componentsByExternalID map[string]*model.Component, -) { - statesByID := make(map[string]string, len(detailByID)) - for id, d := range detailByID { - if d.State != "" { - statesByID[id] = d.State - } - } - persistComponentStatuses(ctx, pool, types.ComponentTypeCompute, statesByID, componentsByExternalID) -} - -// syncSwitchStatuses fetches controller_state for the matched switches and -// persists the derived ComponentStatus per DB row. 
-func syncSwitchStatuses( - ctx context.Context, - pool *cdb.Session, - nicoClient nicoapi.Client, - componentsBySwitchID map[string]*model.Component, -) { - ids := mapKeys(componentsBySwitchID) - if len(ids) == 0 { - return - } - statesByID, err := nicoClient.FindSwitchControllerStates(ctx, ids) - if err != nil { - log.Error().Msgf("Unable to retrieve switch controller_states from NICo: %v", err) - return + log.Debug().Msgf("Expected-inventory mirror: skipped this cycle (gate %s is off)", envExpectedSyncEnabled) } - persistComponentStatuses(ctx, pool, types.ComponentTypeNVSwitch, statesByID, componentsBySwitchID) -} - -// syncPowershelfStatuses is the power-shelf equivalent of syncSwitchStatuses. -func syncPowershelfStatuses( - ctx context.Context, - pool *cdb.Session, - nicoClient nicoapi.Client, - componentsByShelfID map[string]*model.Component, -) { - ids := mapKeys(componentsByShelfID) - if len(ids) == 0 { - return - } - statesByID, err := nicoClient.FindPowerShelfControllerStates(ctx, ids) - if err != nil { - log.Error().Msgf("Unable to retrieve power-shelf controller_states from NICo: %v", err) - return - } - persistComponentStatuses(ctx, pool, types.ComponentTypePowerShelf, statesByID, componentsByShelfID) -} - -func mapKeys(m map[string]*model.Component) []string { - if len(m) == 0 { - return nil - } - out := make([]string, 0, len(m)) - for k := range m { - out = append(out, k) - } - return out -} -// persistComponentStatuses maps raw core controller_state strings to -// ComponentStatus values via the per-type mapper and writes any deltas to the -// component table. components are keyed by external_id (machineID / switchID / -// shelfID). Entries without a state in statesByID are skipped — missing data -// is not a status reset. 
-func persistComponentStatuses( - ctx context.Context, - pool *cdb.Session, - componentType types.ComponentType, - statesByID map[string]string, - componentsByExternalID map[string]*model.Component, -) { - if len(statesByID) == 0 { - return - } + drifts := runActualSync(ctx, pool, nicoClient) - var toUpdate []model.Component - for externalID, raw := range statesByID { - comp, ok := componentsByExternalID[externalID] - if !ok { - continue - } - newStatus := nicoapi.MapComponentStatus(componentType, raw) - if comp.Status != nil && comp.Status.Equal(newStatus) { - continue - } - comp.Status = &newStatus - toUpdate = append(toUpdate, *comp) - } - - if len(toUpdate) == 0 { - return - } if err := pool.RunInTx(ctx, func(ctx context.Context, tx bun.Tx) error { - for _, cur := range toUpdate { - if err := cur.SetStatusByComponentID(ctx, tx); err != nil { - return fmt.Errorf("set component status: %w", err) - } - } - return nil + return model.ReplaceAllDrifts(ctx, tx, drifts) }); err != nil { - log.Error().Msgf("Unable to persist component statuses: %v", err) - } -} - -// compareMachineFieldsForDrift compares validation fields between expected (DB) and actual (NICo). -// Validation fields: slot_id, tray_index, host_id, serial_number. 
-func compareMachineFieldsForDrift( - expected *model.Component, - actual nicoapi.MachineDetail, - position *nicoapi.MachinePosition, -) []model.FieldDiff { - var diffs []model.FieldDiff - - if position != nil { - if position.PhysicalSlotNum != nil && expected.SlotID != int(*position.PhysicalSlotNum) { - diffs = append(diffs, model.FieldDiff{ - FieldName: "slot_id", - ExpectedValue: fmt.Sprintf("%d", expected.SlotID), - ActualValue: fmt.Sprintf("%d", *position.PhysicalSlotNum), - }) - } - if position.ComputeTrayIndex != nil && expected.TrayIndex != int(*position.ComputeTrayIndex) { - diffs = append(diffs, model.FieldDiff{ - FieldName: "tray_index", - ExpectedValue: fmt.Sprintf("%d", expected.TrayIndex), - ActualValue: fmt.Sprintf("%d", *position.ComputeTrayIndex), - }) - } - if position.TopologyID != nil && expected.HostID != int(*position.TopologyID) { - diffs = append(diffs, model.FieldDiff{ - FieldName: "host_id", - ExpectedValue: fmt.Sprintf("%d", expected.HostID), - ActualValue: fmt.Sprintf("%d", *position.TopologyID), - }) - } + log.Error().Msgf("Unable to persist drift records: %v", err) } else { - if expected.SlotID != 0 { - diffs = append(diffs, model.FieldDiff{ - FieldName: "slot_id", - ExpectedValue: fmt.Sprintf("%d", expected.SlotID), - ActualValue: "", - }) - } - if expected.TrayIndex != 0 { - diffs = append(diffs, model.FieldDiff{ - FieldName: "tray_index", - ExpectedValue: fmt.Sprintf("%d", expected.TrayIndex), - ActualValue: "", - }) - } - if expected.HostID != 0 { - diffs = append(diffs, model.FieldDiff{ - FieldName: "host_id", - ExpectedValue: fmt.Sprintf("%d", expected.HostID), - ActualValue: "", - }) - } - } - - // Compare serial_number (chassis_serial) - if actual.ChassisSerial != nil && expected.SerialNumber != *actual.ChassisSerial { - diffs = append(diffs, model.FieldDiff{ - FieldName: driftFieldSerialNumber, - ExpectedValue: expected.SerialNumber, - ActualValue: *actual.ChassisSerial, - }) - } - - return diffs -} - -// 
--------------------------------------------------------------------------- -// syncNVSwitchesNICo: sync NVSwitch components via Core (NICo) -// --------------------------------------------------------------------------- -// -// Uses Core's NICo API. Core's NSM backend auto-registers switches, so no -// registration step is needed. -// -// NICo API calls (2 round-trips): -// - GetAllExpectedSwitchesLinked: discover Core switch IDs by BMC MAC -// - GetComponentInventory: get firmware, serial, power state from site explorer -// -// Flow: -// 1. DB: get all NVSwitch components with BMCs -// 2. NICo GetAllExpectedSwitchesLinked: map BMC MAC → Core SwitchId -// 3. Direct-write external_id (Core's SwitchId) for matched components -// 4. NICo GetComponentInventory: extract firmware_version, serial_number, power_state -// 5. Direct-write inventory fields to DB -// 6. Return drifts (missing_in_actual for components without a Core SwitchId) -func syncNVSwitchesNICo( - ctx context.Context, - pool *cdb.Session, - nicoClient nicoapi.Client, -) (received int, drifts []model.ComponentDrift) { - log.Debug().Msg("Syncing NV switches via NICo...") - - expectedSwitches, err := model.GetComponentsByType(ctx, pool.DB, devicetypes.ComponentTypeNVSwitch) - if err != nil { - log.Error().Msgf("Unable to retrieve NVSwitch components from db: %v", err) - return 0, nil - } - - if len(expectedSwitches) == 0 { - return 0, nil - } - - expectedByBmcMac := make(map[string]*model.Component) - for i := range expectedSwitches { - sw := &expectedSwitches[i] - if len(sw.BMCs) != 1 { - log.Error().Msgf("NVSwitch %s has %d BMCs, expected exactly 1; skipping", sw.SerialNumber, len(sw.BMCs)) - continue - } - bmcMacAddr, err := net.ParseMAC(sw.BMCs[0].MacAddress) - if err != nil || bmcMacAddr == nil { - log.Error().Msgf("NVSwitch %s has invalid BMC MAC address %s; skipping", sw.SerialNumber, sw.BMCs[0].MacAddress) - continue - } - expectedByBmcMac[bmcMacAddr.String()] = sw - } - - // ID discovery: map BMC 
MAC → Core SwitchId - linked, err := nicoClient.GetAllExpectedSwitchesLinked(ctx) - if err != nil { - log.Error().Msgf("Unable to retrieve linked expected switches from NICo: %v", err) - return 0, nil - } - received = len(linked) - - linkedByMac := make(map[string]nicoapi.LinkedExpectedSwitch) - for _, les := range linked { - if les.BMCMACAddress != "" { - linkedByMac[utils.NormalizeMAC(les.BMCMACAddress)] = les - } - } - - // Direct-write external_id for matched components - var switchIDs []*pb.SwitchId - componentsBySwitchID := make(map[string]*model.Component) - - for bmcMac, sw := range expectedByBmcMac { - les, found := linkedByMac[bmcMac] - if !found || les.SwitchID == "" { - continue - } - - if sw.ComponentID == nil || *sw.ComponentID != les.SwitchID { - switchID := les.SwitchID - sw.ComponentID = &switchID - if err := sw.Patch(ctx, pool.DB); err != nil { - log.Error().Msgf("NVSwitch %s (BMC %s): unable to update external_id: %v", sw.ID, bmcMac, err) - continue - } - log.Info().Msgf("NVSwitch %s (BMC %s): set external_id to Core SwitchId %s", sw.ID, bmcMac, switchID) - } - - switchIDs = append(switchIDs, &pb.SwitchId{Id: les.SwitchID}) - componentsBySwitchID[les.SwitchID] = sw - } - - // Fetch inventory from Core for all matched switches - now := time.Now() - if len(switchIDs) > 0 { - invResp, err := nicoClient.GetComponentInventory(ctx, &pb.GetComponentInventoryRequest{ - Target: &pb.GetComponentInventoryRequest_SwitchIds{ - SwitchIds: &pb.SwitchIdList{Ids: switchIDs}, - }, - }) - if err != nil { - log.Error().Msgf("Unable to retrieve switch inventory from NICo: %v", err) - } else { - drifts = append(drifts, applyInventoryToComponents(ctx, pool, invResp, componentsBySwitchID)...) 
- } - } - - syncSwitchStatuses(ctx, pool, nicoClient, componentsBySwitchID) - - // Build drifts for components that don't have a Core SwitchId yet - for _, sw := range expectedByBmcMac { - if sw.ComponentID == nil || *sw.ComponentID == "" { - compID := sw.ID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - ExternalID: nil, - DriftType: model.DriftTypeMissingInActual, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - } - } - - log.Info().Msgf("NVSwitch NICo sync: %d drift(s) out of %d expected", len(drifts), len(expectedSwitches)) - return received, drifts -} - -// --------------------------------------------------------------------------- -// syncPowershelvesNICo: sync PowerShelf components via Core (NICo) -// --------------------------------------------------------------------------- -// -// Uses Core's NICo API. Core's PSM backend auto-registers power shelves, so no -// registration step is needed. -// -// NICo API calls (2 round-trips): -// - GetAllExpectedPowerShelvesLinked: discover Core power shelf IDs by PMC MAC -// - GetComponentInventory: get firmware, power state from site explorer -// -// Flow: -// 1. DB: get all PowerShelf components with PMCs -// 2. NICo GetAllExpectedPowerShelvesLinked: map PMC MAC → Core PowerShelfId -// 3. Direct-write external_id (Core's PowerShelfId) for matched components -// 4. NICo GetComponentInventory: extract firmware_version, power_state -// 5. Direct-write inventory fields to DB -// 6. 
Return drifts (missing_in_actual for components without a Core PowerShelfId) -func syncPowershelvesNICo( - ctx context.Context, - pool *cdb.Session, - nicoClient nicoapi.Client, -) (received int, drifts []model.ComponentDrift) { - log.Debug().Msg("Syncing powershelves via NICo...") - - expectedPowershelves, err := model.GetComponentsByType(ctx, pool.DB, devicetypes.ComponentTypePowerShelf) - if err != nil { - log.Error().Msgf("Unable to retrieve powershelf components from db: %v", err) - return 0, nil - } - - if len(expectedPowershelves) == 0 { - return 0, nil - } - - expectedByPmcMac := make(map[string]*model.Component) - for i := range expectedPowershelves { - ps := &expectedPowershelves[i] - if len(ps.BMCs) != 1 { - log.Error().Msgf("Powershelf %s has %d BMCs, expected exactly 1; skipping", ps.SerialNumber, len(ps.BMCs)) - continue - } - pmcMacAddr, err := net.ParseMAC(ps.BMCs[0].MacAddress) - if err != nil || pmcMacAddr == nil { - log.Error().Msgf("Powershelf %s has invalid BMC MAC address %s; skipping", ps.SerialNumber, ps.BMCs[0].MacAddress) - continue - } - expectedByPmcMac[pmcMacAddr.String()] = ps - } - - // ID discovery: map PMC MAC → Core PowerShelfId - linked, err := nicoClient.GetAllExpectedPowerShelvesLinked(ctx) - if err != nil { - log.Error().Msgf("Unable to retrieve linked expected power shelves from NICo: %v", err) - return 0, nil - } - received = len(linked) - - linkedByMac := make(map[string]nicoapi.LinkedExpectedPowerShelf) - for _, leps := range linked { - if leps.BMCMACAddress != "" { - linkedByMac[utils.NormalizeMAC(leps.BMCMACAddress)] = leps - } - } - - // Direct-write external_id for matched components - var shelfIDs []*pb.PowerShelfId - componentsByShelfID := make(map[string]*model.Component) - - for pmcMac, ps := range expectedByPmcMac { - leps, found := linkedByMac[pmcMac] - if !found || leps.PowerShelfID == "" { - continue - } - - if ps.ComponentID == nil || *ps.ComponentID != leps.PowerShelfID { - shelfID := leps.PowerShelfID - 
ps.ComponentID = &shelfID - if err := ps.Patch(ctx, pool.DB); err != nil { - log.Error().Msgf("Powershelf %s (PMC %s): unable to update external_id: %v", ps.ID, pmcMac, err) - continue - } - log.Info().Msgf("Powershelf %s (PMC %s): set external_id to Core PowerShelfId %s", ps.ID, pmcMac, shelfID) - } - - shelfIDs = append(shelfIDs, &pb.PowerShelfId{Id: leps.PowerShelfID}) - componentsByShelfID[leps.PowerShelfID] = ps - } - - // Fetch inventory from Core for all matched power shelves - now := time.Now() - if len(shelfIDs) > 0 { - invResp, err := nicoClient.GetComponentInventory(ctx, &pb.GetComponentInventoryRequest{ - Target: &pb.GetComponentInventoryRequest_PowerShelfIds{ - PowerShelfIds: &pb.PowerShelfIdList{Ids: shelfIDs}, - }, - }) - if err != nil { - log.Error().Msgf("Unable to retrieve powershelf inventory from NICo: %v", err) - } else { - drifts = append(drifts, applyInventoryToComponents(ctx, pool, invResp, componentsByShelfID)...) - } - } - - syncPowershelfStatuses(ctx, pool, nicoClient, componentsByShelfID) - - // Build drifts for components that don't have a Core PowerShelfId yet - for _, ps := range expectedByPmcMac { - if ps.ComponentID == nil || *ps.ComponentID == "" { - compID := ps.ID - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - ExternalID: nil, - DriftType: model.DriftTypeMissingInActual, - Diffs: []model.FieldDiff{}, - CheckedAt: now, - }) - } - } - - log.Info().Msgf("Powershelf NICo sync: %d drift(s) out of %d expected", len(drifts), len(expectedPowershelves)) - return received, drifts -} - -// applyInventoryToComponents extracts firmware_version and power_state from -// GetComponentInventoryResponse and direct-writes them to the matching -// components. Serial numbers are compared (not overwritten) and returned as -// drift records. componentsByID maps the component_id echoed back in each -// ComponentResult to the DB component. 
-func applyInventoryToComponents( - ctx context.Context, - pool *cdb.Session, - resp *pb.GetComponentInventoryResponse, - componentsByID map[string]*model.Component, -) []model.ComponentDrift { - now := time.Now() - var drifts []model.ComponentDrift - - for _, entry := range resp.GetEntries() { - result := entry.GetResult() - if result == nil { - continue - } - comp, ok := componentsByID[result.GetComponentId()] - if !ok { - continue - } - if result.GetStatus() != pb.ComponentManagerStatusCode_COMPONENT_MANAGER_STATUS_CODE_SUCCESS { - log.Warn().Msgf("Component %s: inventory status %s: %s", result.GetComponentId(), result.GetStatus(), result.GetError()) - continue - } - - report := entry.GetReport() - if report == nil { - continue - } - - needsUpdate := false - - // Extract firmware_version from the "BMC image" inventory entry - for _, svc := range report.GetService() { - for _, inv := range svc.GetInventories() { - if inv.GetDescription() == "BMC image" { - if v := inv.GetVersion(); v != "" && comp.FirmwareVersion != v { - comp.FirmwareVersion = v - needsUpdate = true - } - } - } - } - - // Compare serial_number from first Chassis entry (drift, not overwrite) - if chassisList := report.GetChassis(); len(chassisList) > 0 { - if sn := chassisList[0].GetSerialNumber(); sn != "" && comp.SerialNumber != sn { - compID := comp.ID - extID := result.GetComponentId() - drifts = append(drifts, model.ComponentDrift{ - ComponentID: &compID, - ExternalID: &extID, - DriftType: model.DriftTypeMismatch, - Diffs: []model.FieldDiff{{ - FieldName: driftFieldSerialNumber, - ExpectedValue: comp.SerialNumber, - ActualValue: sn, - }}, - CheckedAt: now, - }) - } - } - - // Extract power_state from first ComputerSystem entry - if systems := report.GetSystems(); len(systems) > 0 { - ps := computerSystemPowerStateToNICo(systems[0].GetPowerState()) - if comp.PowerState == nil || *comp.PowerState != ps { - comp.PowerState = &ps - needsUpdate = true - } - } - - if needsUpdate { - if err := 
comp.Patch(ctx, pool.DB); err != nil { - log.Error().Msgf("Component %s: unable to write inventory fields: %v", result.GetComponentId(), err) - } - } - } - - return drifts -} - -func computerSystemPowerStateToNICo( - ps pb.ComputerSystemPowerState, -) nicoapi.PowerState { - switch ps { - case pb.ComputerSystemPowerState_On, pb.ComputerSystemPowerState_PoweringOn: - return nicoapi.PowerStateOn - case pb.ComputerSystemPowerState_Off, pb.ComputerSystemPowerState_PoweringOff: - return nicoapi.PowerStateOff - default: - return nicoapi.PowerStateUnknown + log.Info().Msgf("Drift detection complete: %d drift(s) detected", len(drifts)) } } diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory_test.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory_test.go index a1cd4c2f51..35a698ec5c 100644 --- a/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory_test.go +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/inventory_test.go @@ -61,7 +61,11 @@ func TestInventory(t *testing.T) { err = c.Create(ctx, pool.DB) assert.Nil(t, err) - runInventoryOne(ctx, pool, grpcMock) + // Pass expectedSyncEnabled=true so the mirror step runs against the + // empty mock (no-op via the safety guard) — matches the cycle layout + // production will have once the feature gate is flipped on, without + // changing what this test asserts about actual-sync outputs. 
+ runInventoryOne(ctx, pool, grpcMock, true) rows, err := pool.DB.Query("SELECT serial_number, power_state FROM component;") assert.NotNil(t, rows) @@ -127,7 +131,7 @@ func TestSyncFirmwareVersion(t *testing.T) { err = c2.Create(ctx, pool.DB) assert.Nil(t, err) - runInventoryOne(ctx, pool, grpcMock) + runInventoryOne(ctx, pool, grpcMock, true) var updated1 model.Component err = pool.DB.NewSelect().Model(&updated1).Where("id = ?", c1.ID).Scan(ctx) diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/job.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/job.go index c4622ec474..c663f814de 100644 --- a/rest-api/flow/internal/scheduler/jobs/inventorysync/job.go +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/job.go @@ -6,6 +6,8 @@ package inventorysync import ( "context" "fmt" + "os" + "strconv" "github.com/rs/zerolog/log" @@ -17,11 +19,37 @@ import ( nicoprovider "github.com/NVIDIA/infra-controller/rest-api/flow/internal/task/componentmanager/providers/nico" //nolint ) +// envExpectedSyncEnabled gates the expected-inventory mirror that runs at +// the start of each inventory cycle (see expected_mirror*.go). Default is +// "off": Flow keeps using its existing ingestion path until an operator +// opts in. Accepted truthy values are anything strconv.ParseBool accepts +// (1, t, T, true, True, TRUE, ...). An unset, empty, or unparseable value +// resolves to disabled — the conservative default given the mirror writes +// directly to the rack / component tables. +const envExpectedSyncEnabled = "FLOW_EXPECTED_INVENTORY_SYNC_ENABLED" + // Job implements scheduler.Job for the inventory sync task. 
type Job struct { - dbConf *cdb.Config - nicoClient nicoapi.Client - pool *cdb.Session + dbConf *cdb.Config + nicoClient nicoapi.Client + pool *cdb.Session + expectedSyncEnabled bool +} + +func readExpectedSyncEnabled() bool { + raw := os.Getenv(envExpectedSyncEnabled) + if raw == "" { + return false + } + enabled, err := strconv.ParseBool(raw) + if err != nil { + log.Warn(). + Str("env", envExpectedSyncEnabled). + Str("raw", raw). + Msg("Expected-inventory mirror toggle: env var value is not a boolean (use 1/0/true/false); treating as disabled") + return false + } + return enabled } // New constructs an inventory sync Job using clients sourced from the provider @@ -69,10 +97,17 @@ func New( // the component manager so jobs receive ready-to-use domain clients // instead of low-level provider handles. + expectedSyncEnabled := readExpectedSyncEnabled() + log.Info(). + Bool("enabled", expectedSyncEnabled). + Str("env", envExpectedSyncEnabled). + Msg("Expected-inventory mirror: feature gate resolved at job construction") + return &Job{ - dbConf: dbConf, - nicoClient: nicoProvider.Client(), - pool: pool, + dbConf: dbConf, + nicoClient: nicoProvider.Client(), + pool: pool, + expectedSyncEnabled: expectedSyncEnabled, }, nil } @@ -85,6 +120,6 @@ func (j *Job) Name() string { return "inventory-sync" } // error is also logged rather than propagated. A failed iteration is not // fatal — the scheduler will simply retry on the next trigger fire. 
func (j *Job) Run(ctx context.Context, _ types.Event) error { - runInventoryOne(ctx, j.pool, j.nicoClient) + runInventoryOne(ctx, j.pool, j.nicoClient, j.expectedSyncEnabled) return nil } diff --git a/rest-api/flow/internal/scheduler/jobs/inventorysync/job_test.go b/rest-api/flow/internal/scheduler/jobs/inventorysync/job_test.go new file mode 100644 index 0000000000..69aee67960 --- /dev/null +++ b/rest-api/flow/internal/scheduler/jobs/inventorysync/job_test.go @@ -0,0 +1,39 @@ +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +package inventorysync + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestReadExpectedSyncEnabled(t *testing.T) { + // The mirror writes to rack / component tables, so the default has to + // be off — an operator should have to opt in explicitly. These cases + // pin both the truthy / falsy ParseBool grammar and the + // "unparseable / unset is conservatively off" guarantee. + for _, tc := range []struct { + raw string + want bool + }{ + {"", false}, // unset env var + {"true", true}, // canonical truthy + {"True", true}, // ParseBool accepts mixed case + {"TRUE", true}, // and upper case + {"1", true}, // and 1 + {"t", true}, // and t + {"false", false}, + {"0", false}, + {"f", false}, + {"on", false}, // ParseBool rejects on/off; treated as disabled with warn + {"yes", false}, + {"garbage", false}, + } { + t.Run(tc.raw, func(t *testing.T) { + t.Setenv(envExpectedSyncEnabled, tc.raw) + assert.Equal(t, tc.want, readExpectedSyncEnabled()) + }) + } +}