diff --git a/crates/api-db/src/machine.rs b/crates/api-db/src/machine.rs index 09b73d0868..d5f41d21a3 100644 --- a/crates/api-db/src/machine.rs +++ b/crates/api-db/src/machine.rs @@ -26,6 +26,7 @@ use carbide_uuid::dpa_interface::DpaInterfaceId; use carbide_uuid::instance_type::InstanceTypeId; use carbide_uuid::machine::{MachineId, MachineType}; use carbide_uuid::machine_validation::MachineValidationId; +use carbide_uuid::rack::RackId; use chrono::{DateTime, Utc}; use config_version::{ConfigVersion, Versioned}; use health_report::{HealthReport, HealthReportApplyMode}; @@ -2583,6 +2584,52 @@ impl<'r> FromRow<'r, PgRow> for _HealthReportWrapper { } } +/// RMS identity for a compute tray machine: the machine ID (used as the RMS +/// node_id), the BMC IP address, the BMC MAC address, and the rack_id. +#[derive(Debug, sqlx::FromRow)] +pub struct MachineRmsIdentity { + pub id: String, + pub bmc_ip: IpAddr, + pub bmc_mac_address: MacAddress, + pub rack_id: Option, +} + +/// Look up RMS identities (node_id, rack_id) for compute tray machines by their +/// BMC IP addresses. +pub async fn find_rms_identities_by_bmc_ips( + db: impl crate::db_read::DbReader<'_>, + bmc_ips: &[IpAddr], +) -> DatabaseResult> { + let ip_strings: Vec = bmc_ips.iter().map(ToString::to_string).collect(); + let sql = r#" + SELECT m.id::text, mia.address AS bmc_ip, mi.mac_address AS bmc_mac_address, m.rack_id + FROM machines m + JOIN machine_interfaces mi + ON mi.machine_id = m.id + AND mi.interface_type = 'Bmc' + JOIN machine_interface_addresses mia + ON mia.interface_id = mi.id + WHERE host(mia.address) = ANY($1) + "#; + + let rows: Vec = sqlx::query_as(sql) + .bind(ip_strings) + .fetch_all(db) + .await + .map_err(|err| DatabaseError::new("machine::find_rms_identities_by_bmc_ips", err))?; + + let mut seen = std::collections::HashSet::with_capacity(rows.len()); + for row in &rows { + if !seen.insert(row.bmc_ip) { + return Err(DatabaseError::internal(format!( + "duplicate machine RMS identity mapping for bmc_ip={}", + row.bmc_ip + ))); + } + } + Ok(rows) +} + pub fn count_healthy_unhealthy_host_machines( all_machines: &HashMap, ) -> (i32, i32) { diff --git a/crates/component-manager/src/component_manager.rs b/crates/component-manager/src/component_manager.rs index 8096be2b1a..d4e082eaa7 100644 --- a/crates/component-manager/src/component_manager.rs +++ b/crates/component-manager/src/component_manager.rs @@ -145,11 +145,22 @@ pub async fn build_component_manager( }; let compute_tray: Arc = match config.compute_tray_backend { - // TODO: implement ComputeTrayManager for RmsBackend Backend::Rms => { - return Err(ComponentManagerError::InvalidArgument( - "compute_tray_backend 'rms' is not yet supported".into(), - )); + let client = rms_client.clone().ok_or_else(|| { + ComponentManagerError::InvalidArgument( + "compute_tray_backend is 'rms' but RMS client is not configured".into(), + ) + })?; + let db = db.clone().ok_or_else(|| { + ComponentManagerError::InvalidArgument( + "compute_tray_backend is 'rms' but database pool is not configured".into(), + ) + })?; + Arc::new(crate::rms::RmsBackend::new( + client, + rms_switch_system_image_client.clone(), + db, + )) } Backend::Core => { let pool = redfish_pool.ok_or_else(|| { diff --git a/crates/component-manager/src/rms.rs b/crates/component-manager/src/rms.rs index 1154a29dc5..ac3dbb29dc 100644 --- a/crates/component-manager/src/rms.rs +++ b/crates/component-manager/src/rms.rs @@ -16,6 +16,7 @@ */ use std::collections::HashMap; +use std::net::IpAddr; use std::sync::{Arc, Mutex}; use forge_secrets::credentials::Credentials; @@ -23,11 +24,15 @@ use librms::protos::rack_manager as rms; use librms::{RackManagerError, RmsApi}; use mac_address::MacAddress; use model::component_manager::{ - FirmwareState, NvSwitchComponent, PowerAction, PowerShelfComponent, + ComputeTrayComponent, FirmwareState, NvSwitchComponent, PowerAction, PowerShelfComponent, }; use sqlx::PgPool; use tracing::instrument; +use crate::compute_tray_manager::{ + Backend as ComputeTrayBackend, ComputeTrayEndpoint, ComputeTrayFirmwareUpdateStatus, + ComputeTrayManager, ComputeTrayResult, +}; use crate::error::ComponentManagerError; use crate::nv_switch_manager::{ NvSwitchManager, SwitchComponentResult, SwitchEndpoint, SwitchFirmwareUpdateStatus, @@ -64,6 +69,7 @@ const RMS_SWITCH_SYSTEM_IMAGE_SOFTWARE_TYPE: &str = "prod"; const RMS_FIRMWARE_OBJECT_HARDWARE_TYPE: &str = "any"; const RMS_NOAUTH_ACCESS_TOKEN: &str = "NOAUTH"; const RMS_SWITCH_NODE_TYPE: rms::NodeType = rms::NodeType::Switch; +const RMS_COMPUTE_NODE_TYPE: rms::NodeType = rms::NodeType::Compute; pub struct RmsBackend { client: Arc, @@ -144,6 +150,45 @@ async fn resolve_power_shelf_identities( Ok(map) } +/// Resolved RMS identity for a compute tray, keyed by BMC IP. +struct ComputeTrayRmsIdentity { + identity: RmsIdentity, + bmc_mac: MacAddress, +} + +/// Resolve compute tray BMC IP addresses to RMS identities via the api-db layer. +async fn resolve_compute_tray_identities( + db: &PgPool, + bmc_ips: &[IpAddr], +) -> Result, ComponentManagerError> { + let rows = db::machine::find_rms_identities_by_bmc_ips(db, bmc_ips) + .await + .map_err(|e| { + ComponentManagerError::Internal(format!( + "failed to resolve compute tray RMS identities: {e}" + )) + })?; + + let mut map = HashMap::with_capacity(rows.len()); + for row in rows { + let Some(rack_id) = row.rack_id else { + tracing::warn!(bmc_ip = %row.bmc_ip, "compute tray has no rack_id, skipping"); + continue; + }; + map.insert( + row.bmc_ip, + ComputeTrayRmsIdentity { + identity: RmsIdentity { + node_id: row.id, + rack_id: rack_id.to_string(), + }, + bmc_mac: row.bmc_mac_address, + }, + ); + } + Ok(map) +} + /// Resolve switch MAC addresses to RMS identities via the api-db layer. async fn resolve_switch_identities( db: &PgPool, @@ -624,6 +669,10 @@ async fn list_firmware_object_ids( /// switches. Mirrors the value used by `crate::rack::firmware_update`. const SWITCH_BMC_PORT: u32 = 443; +/// Default BMC HTTPS port used when populating `rms::Endpoint` for compute +/// trays. +const COMPUTE_TRAY_BMC_PORT: u32 = 443; + fn credentials_to_rms(creds: &Credentials) -> rms::Credentials { let Credentials::UsernamePassword { username, password } = creds; rms::Credentials { @@ -871,6 +920,43 @@ fn switch_firmware_object_component_filters(components: &[NvSwitchComponent]) -> .collect() } +fn compute_tray_firmware_object_component_filters( + components: &[ComputeTrayComponent], +) -> Vec { + if components.is_empty() { + Vec::new() + } else { + components + .iter() + .map(|component| component.to_string()) + .collect() + } +} + +/// Build the `rms::NodeInfo` describing a compute tray for inclusion in an +/// RMS batch request. Compute trays expose only a BMC endpoint. +fn build_compute_tray_node_info( + ep: &ComputeTrayEndpoint, + identity: &RmsIdentity, + bmc_mac: MacAddress, +) -> rms::NodeInfo { + rms::NodeInfo { + node_id: identity.node_id.clone(), + rack_id: identity.rack_id.clone(), + r#type: Some(RMS_COMPUTE_NODE_TYPE as i32), + bmc_endpoint: Some(rms::Endpoint { + interface: Some(rms::NetworkInterface { + ip_address: ep.bmc_ip.to_string(), + mac_address: bmc_mac.to_string(), + }), + port: COMPUTE_TRAY_BMC_PORT, + credentials: Some(credentials_to_rms(&ep.bmc_credentials)), + dangerously_accept_invalid_certs: true, + }), + host_endpoint: None, + } +} + fn summarize_firmware_object_apply_response( response: rms::ApplyFirmwareObjectResponse, node_id: &str, @@ -1437,14 +1523,263 @@ impl NvSwitchManager for RmsBackend { } } +#[async_trait::async_trait] +impl ComputeTrayManager for RmsBackend { + fn name(&self) -> &str { + "rms" + } + + fn backend(&self) -> ComputeTrayBackend { + ComputeTrayBackend::Rms + } + + #[instrument(skip(self), fields(backend = "rms"))] + async fn power_control( + &self, + endpoints: &[ComputeTrayEndpoint], + action: PowerAction, + ) -> Result, ComponentManagerError> { + let bmc_ips: Vec = endpoints.iter().map(|ep| ep.bmc_ip).collect(); + let ids = resolve_compute_tray_identities(&self.db, &bmc_ips).await?; + let operation = to_rms_power_operation(action); + let mut results = Vec::with_capacity(endpoints.len()); + + for ep in endpoints { + let Some(identity) = ids.get(&ep.bmc_ip) else { + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success: false, + error: Some("could not resolve RMS identity from database".into()), + }); + continue; + }; + + let device = build_compute_tray_node_info(ep, &identity.identity, identity.bmc_mac); + let request = rms::BatchSetPowerStateRequest { + nodes: Some(rms::NodeSet { + nodes: vec![device], + }), + operation, + }; + + match self.client.batch_set_power_state(request).await { + Ok(response) => { + let (success, error) = + summarize_power_batch(response.response.unwrap_or_default()); + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success, + error, + }); + } + Err(e) => { + tracing::warn!( + bmc_ip = %ep.bmc_ip, + error = %e, + "RMS power control failed for compute tray" + ); + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success: false, + error: Some(e.to_string()), + }); + } + } + } + + Ok(results) + } + + #[instrument(skip(self, target_version, options), fields(backend = "rms", force_update = options.force_update))] + async fn update_firmware( + &self, + endpoints: &[ComputeTrayEndpoint], + target_version: &str, + components: &[ComputeTrayComponent], + options: &FirmwareUpdateOptions, + ) -> Result, ComponentManagerError> { + let bmc_ips: Vec = endpoints.iter().map(|ep| ep.bmc_ip).collect(); + let ids = resolve_compute_tray_identities(&self.db, &bmc_ips).await?; + let component_filters = compute_tray_firmware_object_component_filters(components); + let mut results = Vec::with_capacity(endpoints.len()); + + for ep in endpoints { + let Some(identity) = ids.get(&ep.bmc_ip) else { + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success: false, + error: Some("could not resolve RMS identity from database".into()), + }); + continue; + }; + + let device = build_compute_tray_node_info(ep, &identity.identity, identity.bmc_mac); + let request = match apply_firmware_object_request( + device, + &identity.identity, + target_version, + options, + RMS_COMPUTE_NODE_TYPE, + component_filters.clone(), + ) { + Ok(request) => request, + Err(e) => { + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success: false, + error: Some(e.to_string()), + }); + continue; + } + }; + + match self.client.apply_firmware_object(request).await { + Ok(response) => { + let (success, error, job_id) = summarize_firmware_object_apply_response( + response, + &identity.identity.node_id, + ); + + if success { + if let Some(job_id) = job_id { + self.firmware_jobs.lock().unwrap().insert( + identity.bmc_mac, + vec![RmsTrackedFirmwareJob::FirmwareObject(job_id)], + ); + } else { + self.firmware_jobs.lock().unwrap().remove(&identity.bmc_mac); + } + } else { + self.firmware_jobs.lock().unwrap().remove(&identity.bmc_mac); + } + + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success, + error, + }); + } + Err(e) => { + tracing::warn!( + bmc_ip = %ep.bmc_ip, + error = %e, + "RMS firmware update failed for compute tray" + ); + results.push(ComputeTrayResult { + bmc_ip: ep.bmc_ip, + success: false, + error: Some(e.to_string()), + }); + } + } + } + + Ok(results) + } + + #[instrument(skip(self), fields(backend = "rms"))] + async fn get_firmware_status( + &self, + endpoints: &[ComputeTrayEndpoint], + ) -> Result, ComponentManagerError> { + let bmc_ips: Vec = endpoints.iter().map(|ep| ep.bmc_ip).collect(); + let ids = resolve_compute_tray_identities(&self.db, &bmc_ips).await?; + + let endpoint_jobs: Vec<(IpAddr, Option)> = { + let jobs = self.firmware_jobs.lock().unwrap(); + endpoints + .iter() + .map(|ep| { + let job_id = ids.get(&ep.bmc_ip).and_then(|identity| { + jobs.get(&identity.bmc_mac).and_then(|jobs| { + jobs.iter().find_map(|job| match job { + RmsTrackedFirmwareJob::FirmwareObject(job_id) => { + Some(job_id.clone()) + } + RmsTrackedFirmwareJob::SwitchSystemImage { .. } => None, + }) + }) + }); + (ep.bmc_ip, job_id) + }) + .collect() + }; + + let mut statuses = Vec::with_capacity(endpoints.len()); + + for (bmc_ip, job_id) in &endpoint_jobs { + let Some(job_id) = job_id else { + statuses.push(ComputeTrayFirmwareUpdateStatus { + bmc_ip: *bmc_ip, + state: FirmwareState::Unknown, + target_version: String::new(), + error: Some("no firmware job tracked for this compute tray".into()), + }); + continue; + }; + + let request = rms::GetFirmwareJobStatusRequest { + job_id: job_id.clone(), + }; + + match self.client.get_firmware_job_status(request).await { + Ok(response) => { + let status_success = response.status == rms::ReturnCode::Success as i32; + let state = if status_success { + map_rms_firmware_job_state(response.job_state) + } else { + FirmwareState::Unknown + }; + let error = if response.error_message.is_empty() { + (!status_success).then(|| { + format!("RMS could not report status for firmware job {job_id}") + }) + } else { + Some(response.error_message) + }; + statuses.push(ComputeTrayFirmwareUpdateStatus { + bmc_ip: *bmc_ip, + state, + target_version: String::new(), + error, + }); + } + Err(e) => { + tracing::warn!( + bmc_ip = %bmc_ip, + job_id = %job_id, + error = %e, + "RMS firmware job status query failed" + ); + statuses.push(ComputeTrayFirmwareUpdateStatus { + bmc_ip: *bmc_ip, + state: FirmwareState::Unknown, + target_version: String::new(), + error: Some(e.to_string()), + }); + } + } + } + + Ok(statuses) + } + + #[instrument(skip(self), fields(backend = "rms"))] + async fn list_firmware_bundles(&self) -> Result, ComponentManagerError> { + list_firmware_object_ids(self.client.as_ref()).await + } +} + #[cfg(test)] mod tests { use api_test_helper::mock_rms::MockRmsApi; + use carbide_uuid::machine::MachineId; use carbide_uuid::power_shelf::PowerShelfId; use carbide_uuid::rack::RackId; use carbide_uuid::switch::SwitchId; use super::*; + use crate::compute_tray_manager::{ComputeTrayManager, ComputeTrayVendor}; use crate::power_shelf_manager::PowerShelfVendor; #[async_trait::async_trait] @@ -1457,7 +1792,8 @@ mod tests { } } use crate::test_support::{ - PS_MAC_1, PS_MAC_2, SW_MAC_1, SW_MAC_2, UNKNOWN_MAC, seed_test_data, + CT_IP_1, CT_IP_2, CT_MAC_1, CT_MAC_2, PS_MAC_1, PS_MAC_2, SW_MAC_1, SW_MAC_2, UNKNOWN_MAC, + seed_machine, seed_test_data, }; // ---- Mapping unit tests ---- @@ -1636,6 +1972,18 @@ mod tests { assert!(switch_firmware_object_component_filters(&[]).is_empty()); } + #[test] + fn compute_tray_component_filters_map_to_rms_names() { + assert_eq!( + compute_tray_firmware_object_component_filters(&[ + ComputeTrayComponent::Bmc, + ComputeTrayComponent::Bios, + ]), + vec!["BMC".to_owned(), "BIOS".to_owned()] + ); + assert!(compute_tray_firmware_object_component_filters(&[]).is_empty()); + } + #[test] fn firmware_update_missing_batch_response_is_failure() { let response = rms::ApplyFirmwareObjectResponse { @@ -1705,6 +2053,41 @@ mod tests { (mock, backend, rack_id, ps1, ps2, sw1, sw2) } + async fn make_compute_tray_backend( + pool: &sqlx::PgPool, + ) -> (Arc, RmsBackend, RackId, MachineId, MachineId) { + let mut txn = pool.begin().await.unwrap(); + let rack_id = RackId::new(uuid::Uuid::new_v4().to_string()); + db::rack::create( + &mut txn, + &rack_id, + None, + &model::rack::RackConfig::default(), + None, + ) + .await + .expect("failed to create rack"); + let ct1 = seed_machine(&mut txn, CT_MAC_1, CT_IP_1, "CT-001", &rack_id).await; + let ct2 = seed_machine(&mut txn, CT_MAC_2, CT_IP_2, "CT-002", &rack_id).await; + txn.commit().await.unwrap(); + + let mock = Arc::new(MockRmsApi::new()); + let backend = RmsBackend::new(mock.clone(), Some(mock.clone()), pool.clone()); + (mock, backend, rack_id, ct1, ct2) + } + + fn make_ct_endpoint(bmc_ip: &str) -> ComputeTrayEndpoint { + use forge_secrets::credentials::Credentials; + ComputeTrayEndpoint { + vendor: ComputeTrayVendor::Nvidia, + bmc_ip: bmc_ip.parse().unwrap(), + bmc_credentials: Credentials::UsernamePassword { + username: "admin".into(), + password: "pass".into(), + }, + } + } + fn firmware_update_options() -> FirmwareUpdateOptions { FirmwareUpdateOptions { access_token: Some("token".to_owned()), @@ -1887,15 +2270,15 @@ mod tests { .await; let eps = vec![make_ps_endpoint(PS_MAC_1), make_ps_endpoint(PS_MAC_2)]; - let results = backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + let results = PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); assert!(results[0].success); assert!(results[1].success); @@ -1940,15 +2323,15 @@ mod tests { .await; let eps = vec![make_ps_endpoint(PS_MAC_1)]; - let results = backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc, PowerShelfComponent::Psu], - &firmware_update_options(), - ) - .await - .unwrap(); + let results = PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc, PowerShelfComponent::Psu], + &firmware_update_options(), + ) + .await + .unwrap(); assert!(results[0].success); @@ -1967,15 +2350,15 @@ mod tests { .await; let eps = vec![make_ps_endpoint(PS_MAC_1)]; - let results = backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + let results = PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); assert!(!results[0].success); assert_eq!(results[0].error.as_deref(), Some("bad firmware file")); @@ -1991,30 +2374,30 @@ mod tests { "job-old", ))) .await; - backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); mock.enqueue_apply_firmware_object(Ok(MockRmsApi::firmware_object_apply_fail( &ps1.to_string(), "bad firmware file", ))) .await; - backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); let jobs = backend.firmware_jobs.lock().unwrap(); assert!(!jobs.contains_key(&PS_MAC_1.parse::().unwrap())); @@ -2030,15 +2413,15 @@ mod tests { ))) .await; let eps = vec![make_ps_endpoint(PS_MAC_1)]; - backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); mock.enqueue_get_firmware_job_status(Ok(MockRmsApi::firmware_job_status_ok( rms::FirmwareJobState::Running, @@ -2085,15 +2468,15 @@ mod tests { ))) .await; let eps = vec![make_ps_endpoint(PS_MAC_1)]; - backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); mock.enqueue_get_firmware_job_status(Ok(MockRmsApi::firmware_job_status_ok( rms::FirmwareJobState::Completed, @@ -2116,15 +2499,15 @@ mod tests { ))) .await; let eps = vec![make_ps_endpoint(PS_MAC_1)]; - backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); mock.enqueue_get_firmware_job_status(Ok(rms::GetFirmwareJobStatusResponse { status: rms::ReturnCode::Success as i32, @@ -2151,15 +2534,15 @@ mod tests { ))) .await; let eps = vec![make_ps_endpoint(PS_MAC_1)]; - backend - .update_firmware( - &eps, - r#"{"Id":"fw-json"}"#, - &[PowerShelfComponent::Pmc], - &firmware_update_options(), - ) - .await - .unwrap(); + PowerShelfManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[PowerShelfComponent::Pmc], + &firmware_update_options(), + ) + .await + .unwrap(); mock.enqueue_get_firmware_job_status(Ok(rms::GetFirmwareJobStatusResponse { status: rms::ReturnCode::Failure as i32, @@ -2634,8 +3017,116 @@ mod tests { })) .await; - let bundles = backend.list_firmware_bundles().await.unwrap(); + let bundles = NvSwitchManager::list_firmware_bundles(&backend) + .await + .unwrap(); assert!(bundles.is_empty()); } + + // ---- ComputeTrayManager tests ---- + + #[carbide_macros::sqlx_test] + async fn ct_power_control_success(pool: sqlx::PgPool) { + let (mock, backend, rack_id, ct1, ct2) = make_compute_tray_backend(&pool).await; + mock.enqueue_batch_set_power_state(Ok(MockRmsApi::batch_set_power_state_ok( + &ct1.to_string(), + ))) + .await; + mock.enqueue_batch_set_power_state(Ok(MockRmsApi::batch_set_power_state_ok( + &ct2.to_string(), + ))) + .await; + + let eps = vec![make_ct_endpoint(CT_IP_1), make_ct_endpoint(CT_IP_2)]; + let results = ComputeTrayManager::power_control(&backend, &eps, PowerAction::On) + .await + .unwrap(); + + assert_eq!(results.len(), 2); + assert!(results[0].success); + assert!(results[1].success); + + let calls = mock.batch_set_power_state_calls().await; + assert_eq!(calls.len(), 2); + assert_eq!(calls[0].operation, rms::PowerOperation::On as i32); + let dev0 = &calls[0].nodes.as_ref().unwrap().nodes[0]; + assert_eq!(dev0.node_id, ct1.to_string()); + assert_eq!(dev0.rack_id, rack_id.to_string()); + assert_eq!(dev0.r#type, Some(rms::NodeType::Compute as i32)); + assert!(dev0.bmc_endpoint.is_some()); + assert!(dev0.host_endpoint.is_none()); + } + + #[carbide_macros::sqlx_test] + async fn ct_update_firmware_success(pool: sqlx::PgPool) { + let (mock, backend, rack_id, ct1, _) = make_compute_tray_backend(&pool).await; + mock.enqueue_apply_firmware_object(Ok(MockRmsApi::firmware_object_apply_ok( + &ct1.to_string(), + "ct-job-1", + ))) + .await; + + let eps = vec![make_ct_endpoint(CT_IP_1)]; + let results = ComputeTrayManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[ComputeTrayComponent::Bmc], + &firmware_update_options(), + ) + .await + .unwrap(); + + assert!(results[0].success); + + let calls = mock.apply_firmware_object_calls().await; + assert_eq!(calls.len(), 1); + assert_eq!(calls[0].rack_id, rack_id.to_string()); + let filters = component_filters_for(&calls[0], rms::NodeType::Compute); + assert_eq!(filters, &["BMC".to_owned()]); + let dev0 = &calls[0].nodes.as_ref().unwrap().nodes[0]; + assert_eq!(dev0.r#type, Some(rms::NodeType::Compute as i32)); + + let jobs = backend.firmware_jobs.lock().unwrap(); + assert_eq!( + jobs.get(&CT_MAC_1.parse::().unwrap()), + Some(&vec![RmsTrackedFirmwareJob::FirmwareObject( + "ct-job-1".to_string() + )]) + ); + } + + #[carbide_macros::sqlx_test] + async fn ct_firmware_status_tracks_job(pool: sqlx::PgPool) { + let (mock, backend, _, ct1, _) = make_compute_tray_backend(&pool).await; + mock.enqueue_apply_firmware_object(Ok(MockRmsApi::firmware_object_apply_ok( + &ct1.to_string(), + "ct-job-status", + ))) + .await; + + let eps = vec![make_ct_endpoint(CT_IP_1)]; + ComputeTrayManager::update_firmware( + &backend, + &eps, + r#"{"Id":"fw-json"}"#, + &[ComputeTrayComponent::Bmc], + &firmware_update_options(), + ) + .await + .unwrap(); + + mock.enqueue_get_firmware_job_status(Ok(MockRmsApi::firmware_job_status_ok( + rms::FirmwareJobState::Completed, + ))) + .await; + + let statuses = ComputeTrayManager::get_firmware_status(&backend, &eps) + .await + .unwrap(); + + assert_eq!(statuses[0].state, FirmwareState::Completed); + assert!(statuses[0].error.is_none()); + } } diff --git a/crates/component-manager/src/test_support.rs b/crates/component-manager/src/test_support.rs index f8992c0c16..72840e5b6d 100644 --- a/crates/component-manager/src/test_support.rs +++ b/crates/component-manager/src/test_support.rs @@ -1,12 +1,18 @@ // SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 +use std::net::IpAddr; + +use carbide_uuid::machine::{MachineId, MachineIdSource, MachineInterfaceId, MachineType}; +use carbide_uuid::network::NetworkSegmentId; use carbide_uuid::power_shelf::{PowerShelfId, PowerShelfIdSource, PowerShelfType}; use carbide_uuid::rack::RackId; use carbide_uuid::switch::{SwitchId, SwitchIdSource, SwitchType}; use mac_address::MacAddress; +use model::expected_machine::{ExpectedMachine, ExpectedMachineData}; use model::expected_power_shelf::ExpectedPowerShelf; use model::expected_switch::ExpectedSwitch; +use model::machine::ManagedHostState; use model::metadata::Metadata; use model::power_shelf::{NewPowerShelf, PowerShelfConfig}; use model::rack::{RackConfig, RackState}; @@ -17,6 +23,10 @@ pub(crate) const PS_MAC_1: &str = "AA:BB:CC:DD:EE:01"; pub(crate) const PS_MAC_2: &str = "AA:BB:CC:DD:EE:02"; pub(crate) const SW_MAC_1: &str = "AA:BB:CC:DD:FF:01"; pub(crate) const SW_MAC_2: &str = "AA:BB:CC:DD:FF:02"; +pub(crate) const CT_MAC_1: &str = "AA:BB:CC:DD:CC:01"; +pub(crate) const CT_MAC_2: &str = "AA:BB:CC:DD:CC:02"; +pub(crate) const CT_IP_1: &str = "10.0.1.1"; +pub(crate) const CT_IP_2: &str = "10.0.1.2"; pub(crate) const UNKNOWN_MAC: &str = "FF:FF:FF:FF:FF:FF"; pub(crate) fn test_power_shelf_id(label: &str) -> PowerShelfId { @@ -30,6 +40,17 @@ pub(crate) fn test_power_shelf_id(label: &str) -> PowerShelfId { ) } +pub(crate) fn test_machine_id(label: &str) -> MachineId { + let mut hash = [0u8; 32]; + let bytes = label.as_bytes(); + hash[..bytes.len().min(32)].copy_from_slice(&bytes[..bytes.len().min(32)]); + MachineId::new( + MachineIdSource::ProductBoardChassisSerial, + hash, + MachineType::Host, + ) +} + pub(crate) fn test_switch_id(label: &str) -> SwitchId { let mut hash = [0u8; 32]; let bytes = label.as_bytes(); @@ -170,3 +191,112 @@ pub(crate) async fn seed_switch( sw_id } + +/// Ensure an admin network segment exists for component-manager sqlx tests. +/// Migrated template DBs used by `#[carbide_macros::sqlx_test]` do not seed +/// network segments, but compute tray RMS tests need BMC interface rows. +async fn ensure_admin_network_segment(txn: &mut sqlx::PgConnection) -> NetworkSegmentId { + if let Ok(segments) = db::network_segment::admin(&mut *txn).await { + if let Some(segment) = segments.into_iter().next() { + return segment.id; + } + } + + let segment_id: NetworkSegmentId = sqlx::query_scalar( + "INSERT INTO network_segments (name, version, network_segment_type) \ + VALUES ($1, 'V1-T0', 'admin') RETURNING id", + ) + .bind(format!("cm-test-admin-{}", uuid::Uuid::new_v4())) + .fetch_one(&mut *txn) + .await + .expect("failed to create admin network segment"); + + sqlx::query( + "INSERT INTO network_prefixes (segment_id, prefix, gateway, num_reserved) \ + VALUES ($1, '10.0.0.0/8'::cidr, '10.0.0.1'::inet, 0)", + ) + .bind(segment_id) + .execute(&mut *txn) + .await + .expect("failed to create admin network prefix"); + + segment_id +} + +async fn seed_bmc_interface( + txn: &mut sqlx::PgConnection, + segment_id: NetworkSegmentId, + machine_id: &MachineId, + mac: MacAddress, + bmc_ip: IpAddr, + hostname: &str, +) { + let interface_id: MachineInterfaceId = sqlx::query_scalar( + "INSERT INTO machine_interfaces \ + (segment_id, mac_address, primary_interface, hostname, machine_id, interface_type, association_type) \ + VALUES ($1, $2, false, $3, $4, 'Bmc', 'Machine') \ + RETURNING id", + ) + .bind(segment_id) + .bind(mac) + .bind(hostname) + .bind(machine_id.to_string()) + .fetch_one(&mut *txn) + .await + .expect("failed to create BMC machine interface"); + + sqlx::query( + "INSERT INTO machine_interface_addresses (interface_id, address, allocation_type) \ + VALUES ($1, $2, 'static')", + ) + .bind(interface_id) + .bind(bmc_ip) + .execute(&mut *txn) + .await + .expect("failed to create BMC machine interface address"); +} + +pub(crate) async fn seed_machine( + txn: &mut sqlx::PgConnection, + mac: &str, + bmc_ip: &str, + label: &str, + rack_id: &RackId, +) -> MachineId { + let machine_id = test_machine_id(label); + let mac: MacAddress = mac.parse().unwrap(); + let bmc_ip: IpAddr = bmc_ip.parse().unwrap(); + let expected_data = ExpectedMachineData { + serial_number: label.to_owned(), + rack_id: Some(rack_id.clone()), + bmc_ip_address: Some(bmc_ip), + ..Default::default() + }; + + db::expected_machine::create( + &mut *txn, + ExpectedMachine { + id: None, + bmc_mac_address: mac, + data: expected_data.clone(), + }, + ) + .await + .expect("failed to create expected machine"); + + db::machine::create( + &mut *txn, + None, + &machine_id, + ManagedHostState::Ready, + Some(&expected_data), + 2, + ) + .await + .expect("failed to create machine"); + + let segment_id = ensure_admin_network_segment(&mut *txn).await; + seed_bmc_interface(&mut *txn, segment_id, &machine_id, mac, bmc_ip, label).await; + + machine_id +}