Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions crates/api-core/src/cfg/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ applicable.
| `initial_dpu_agent_upgrade_policy` | `Option<AgentUpgradePolicyChoice>` | — | Policy for nico-dpu-agent upgrades. Also settable via `nico-admin-cli`. |
| `max_concurrent_machine_updates` | `Option<i32>` | — | **Deprecated.** Use `machine_updater` instead. |
| `machine_update_run_interval` | `Option<u64>` | — | Interval (seconds) at which the machine update manager checks for updates. |
| `retained_boot_interface_window` | `Option<Duration>` | — (forever) | How long a retained boot interface pair (`retained_boot_interfaces` table) stays applicable after its `machine_interfaces` row was deleted. Unset retains forever; set a window (e.g. `30d`) so a MAC reappearing on different hardware doesn't inherit an obsolete Redfish interface id. |
| `site_explorer` | `SiteExplorerConfig` | *(see below)* | SiteExplorer hardware discovery settings (see [SiteExplorerConfig](#siteexplorerconfig)). |
| `nvue_enabled` | `bool` | `true` | DPU agent uses NVUE for config instead of writing files directly. |
| `vpc_peering_policy` | `Option<VpcPeeringPolicy>` | — | Policy for VPC peering based on network virtualization type at creation time. |
Expand Down
32 changes: 31 additions & 1 deletion crates/api-core/src/cfg/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ use carbide_preingestion_manager::PreingestionManagerConfig;
use carbide_rack_controller::config::{RackValidationConfig, RmsConfig};
use carbide_site_explorer::config::SiteExplorerConfig;
use carbide_state_controller_common::config::StateControllerConfig;
use carbide_utils::config::{as_duration, as_std_duration};
use carbide_utils::config::{as_duration, as_option_duration, as_std_duration};
use chrono::Duration;
use db::host_naming::HostNamingStrategyKind;
use duration_str::{deserialize_duration, deserialize_duration_chrono};
Expand All @@ -64,6 +64,20 @@ use serde::{Deserialize, Deserializer, Serialize};
pub(crate) const DEFAULT_DPU_NUM_OF_VFS: u32 = 16;
pub(crate) const MAX_DPU_NUM_OF_VFS: u32 = 126;

/// Parses an optional duration ("30d", "12h", ...; absent = `None`) into
/// `Option<chrono::Duration>`. Hand-rolled because `duration_str` deprecated
/// its own Option variant -- we do NOT use the deprecated function.
fn deserialize_option_duration_chrono<'de, D>(
deserializer: D,
) -> Result<Option<chrono::Duration>, D::Error>
where
D: serde::Deserializer<'de>,
{
Option::<String>::deserialize(deserializer)?
.map(|value| duration_str::parse_chrono(&value).map_err(serde::de::Error::custom))
.transpose()
}

/// nico-api configuration file content
#[derive(Clone, Debug, Deserialize, Serialize)]
pub struct CarbideConfig {
Expand Down Expand Up @@ -246,6 +260,19 @@ pub struct CarbideConfig {
/// The interval at which the machine update manager checks for machine updates in seconds.
pub machine_update_run_interval: Option<u64>,

/// How long a retained boot interface pair (see the
/// `retained_boot_interfaces` table) stays applicable after its
/// `machine_interfaces` row was deleted. The default (`None`) retains
/// forever: if the machine eventually comes back, the pair is waiting.
/// Set a window (e.g. "30d") to keep a MAC that reappears on different
/// hardware from inheriting an obsolete Redfish interface id.
#[serde(
default,
deserialize_with = "deserialize_option_duration_chrono",
serialize_with = "as_option_duration"
)]
pub retained_boot_interface_window: Option<chrono::Duration>,

/// SiteExplorer related configuration
#[serde(default)]
pub site_explorer: SiteExplorerConfig,
Expand Down Expand Up @@ -2690,6 +2717,7 @@ mod tests {
assert_eq!(
config.site_explorer,
SiteExplorerConfig {
retained_boot_interface_window: None,
enabled: Arc::new(false.into()),
run_interval: std::time::Duration::from_secs(120),
concurrent_explorations: 10,
Expand Down Expand Up @@ -2881,6 +2909,7 @@ mod tests {
assert_eq!(
config.site_explorer,
SiteExplorerConfig {
retained_boot_interface_window: None,
enabled: Arc::new(true.into()),
run_interval: std::time::Duration::from_secs(100),
concurrent_explorations: 30,
Expand Down Expand Up @@ -3207,6 +3236,7 @@ mod tests {
assert_eq!(
config.site_explorer,
SiteExplorerConfig {
retained_boot_interface_window: None,
enabled: Arc::new(false.into()),
run_interval: std::time::Duration::from_secs(100),
concurrent_explorations: 10,
Expand Down
17 changes: 14 additions & 3 deletions crates/api-core/src/dhcp/discover.rs
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@ pub async fn discover_dhcp(
&mut txn,
&expected_interface,
relay_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
Some(expected_interface.machine_id)
Expand Down Expand Up @@ -269,7 +270,10 @@ pub async fn discover_dhcp(
// machine interface (and machine interface address) for it,
// creating one if needed.
db::machine_interface::preallocate_machine_interface(
&mut txn, parsed_mac, fixed_ip,
&mut txn,
parsed_mac,
fixed_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}
Expand All @@ -286,7 +290,10 @@ pub async fn discover_dhcp(
// InterfaceType::Bmc (and primary=false). Races against
// site-explorer's reconciliation pass are handled inside preallocate.
db::machine_interface::preallocate_bmc_machine_interface(
&mut txn, parsed_mac, bmc_ip,
&mut txn,
parsed_mac,
bmc_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
} else if let Some(s) =
Expand All @@ -303,7 +310,10 @@ pub async fn discover_dhcp(
// Races against site-explorer's reconciliation pass are handled
// inside preallocate.
db::machine_interface::preallocate_machine_interface(
&mut txn, parsed_mac, nvos_ip,
&mut txn,
parsed_mac,
nvos_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}
Expand All @@ -319,6 +329,7 @@ pub async fn discover_dhcp(
std::slice::from_ref(&parsed_relay),
host_nic,
is_primary_nic,
api.runtime_config.retained_boot_interface_window,
)
.await?;

Expand Down
53 changes: 46 additions & 7 deletions crates/api-core/src/handlers/expected_machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -224,13 +224,25 @@ pub(crate) async fn update(

// Update BMC interface if bmc_ip_address is set.
if let Some(bmc_ip) = machine.data.bmc_ip_address {
update_preallocated_machine_interface(&mut txn, machine.bmc_mac_address, bmc_ip).await?;
update_preallocated_machine_interface(
&mut txn,
machine.bmc_mac_address,
bmc_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}

// Update/create machine interfaces for host NICs with fixed IPs.
for nic in &machine.data.host_nics {
if let Some(ip) = nic.fixed_ip {
update_preallocated_machine_interface(&mut txn, nic.mac_address, ip).await?;
update_preallocated_machine_interface(
&mut txn,
nic.mac_address,
ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}
}

Expand Down Expand Up @@ -426,6 +438,7 @@ async fn update_expected_machine(
machine: rpc::ExpectedMachine,
id: Uuid,
parsed_mac: MacAddress,
retained_window: Option<chrono::Duration>,
) -> Result<(), CarbideError> {
let data: ExpectedMachineData = machine.try_into()?;

Expand All @@ -436,8 +449,13 @@ async fn update_expected_machine(
};

if let Some(bmc_ip) = expected_machine.data.bmc_ip_address {
update_preallocated_machine_interface(txn, expected_machine.bmc_mac_address, bmc_ip)
.await?;
update_preallocated_machine_interface(
txn,
expected_machine.bmc_mac_address,
bmc_ip,
retained_window,
)
.await?;
}

db::expected_machine::update(txn, &expected_machine).await?;
Expand Down Expand Up @@ -491,10 +509,13 @@ async fn apply_operation(
machine: rpc::ExpectedMachine,
id: Uuid,
parsed_mac: MacAddress,
retained_window: Option<chrono::Duration>,
) -> Result<(), CarbideError> {
match op {
BatchOperation::Create => create_expected_machine(txn, machine, id, parsed_mac).await,
BatchOperation::Update => update_expected_machine(txn, machine, id, parsed_mac).await,
BatchOperation::Update => {
update_expected_machine(txn, machine, id, parsed_mac, retained_window).await
}
}
}

Expand Down Expand Up @@ -542,7 +563,16 @@ async fn process_batch_operations(
}
};

match apply_operation(op, txn.as_pgconn(), machine, id, parsed_mac).await {
match apply_operation(
op,
txn.as_pgconn(),
machine,
id,
parsed_mac,
api.runtime_config.retained_boot_interface_window,
)
.await
{
Ok(_) => match txn.commit().await {
Ok(_) => results.push(build_success_result(machine_for_result)),
Err(e) => {
Expand Down Expand Up @@ -574,7 +604,16 @@ async fn process_batch_operations(
value: id.to_string(),
});

if let Err(e) = apply_operation(op, txn.as_pgconn(), machine, id, parsed_mac).await {
if let Err(e) = apply_operation(
op,
txn.as_pgconn(),
machine,
id,
parsed_mac,
api.runtime_config.retained_boot_interface_window,
)
.await
{
let _ = txn.rollback().await;
return Err(e);
}
Expand Down
9 changes: 7 additions & 2 deletions crates/api-core/src/handlers/expected_power_shelf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,13 @@ pub async fn update_expected_power_shelf(
})?;

if let Some(bmc_ip) = power_shelf.bmc_ip_address {
update_preallocated_machine_interface(&mut txn, power_shelf.bmc_mac_address, bmc_ip)
.await?;
update_preallocated_machine_interface(
&mut txn,
power_shelf.bmc_mac_address,
bmc_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}

db_expected_power_shelf::update(&mut txn, &power_shelf)
Expand Down
16 changes: 14 additions & 2 deletions crates/api-core/src/handlers/expected_switch.rs
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,24 @@ pub async fn update_expected_switch(
})?;

if let Some(bmc_ip) = switch.bmc_ip_address {
update_preallocated_machine_interface(&mut txn, switch.bmc_mac_address, bmc_ip).await?;
update_preallocated_machine_interface(
&mut txn,
switch.bmc_mac_address,
bmc_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}
if let Some(nvos_ip) = switch.nvos_ip_address {
// Pairing already validated above; nvos_mac_addresses has exactly one entry.
let nvos_mac = switch.nvos_mac_addresses[0];
update_preallocated_machine_interface(&mut txn, nvos_mac, nvos_ip).await?;
update_preallocated_machine_interface(
&mut txn,
nvos_mac,
nvos_ip,
api.runtime_config.retained_boot_interface_window,
)
.await?;
}

db_expected_switch::update(&mut txn, &switch)
Expand Down
3 changes: 3 additions & 0 deletions crates/api-core/src/handlers/machine.rs
Original file line number Diff line number Diff line change
Expand Up @@ -596,6 +596,9 @@ pub(crate) async fn admin_force_delete_machine(

if request.delete_interfaces {
for interface in &machine.interfaces {
// The delete retains each row's boot interface pair in
// `retained_boot_interfaces`, so a re-ingested machine
// recovers its boot target before its first DHCP.
db::machine_interface::delete(&interface.id, &mut txn).await?;
}
response.host_interfaces_deleted = true;
Expand Down
1 change: 1 addition & 0 deletions crates/api-core/src/handlers/machine_discovery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -301,6 +301,7 @@ pub(crate) async fn discover_machine(
&mut txn,
Some(&hardware_info),
&machine_id,
api.runtime_config.retained_boot_interface_window,
)
.await?;

Expand Down
2 changes: 2 additions & 0 deletions crates/api-core/src/handlers/machine_interface_address.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ pub async fn update_preallocated_machine_interface(
txn: &mut sqlx::PgConnection,
bmc_mac_address: MacAddress,
bmc_ip: std::net::IpAddr,
retained_window: Option<chrono::Duration>,
) -> Result<(), CarbideError> {
let existing = db::machine_interface::find_by_mac_address(&mut *txn, bmc_mac_address).await?;

Expand Down Expand Up @@ -98,6 +99,7 @@ pub async fn update_preallocated_machine_interface(
&bmc_mac_address,
true,
AddressSelectionStrategy::StaticAddress(bmc_ip),
retained_window,
)
.await?;

Expand Down
20 changes: 19 additions & 1 deletion crates/api-core/src/setup.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1357,9 +1357,27 @@ pub async fn initialize_and_start_controllers<'a>(
.start(join_set, cancel_token.clone())?;
}

let site_explorer_config = {
let mut config = carbide_config.site_explorer.clone();
// `retained_boot_interface_window` is a single top-level knob
// (retention spans DHCP, deletion, and ingestion -- it isn't a
// site-explorer feature). Site-explorer's copy is `#[serde(skip)]`,
// so it can't be set under `[site_explorer]`; this hand-off is the
// only way the value gets in, sparing a constructor parameter
// through `SiteExplorer::new` and every test fixture.
config.retained_boot_interface_window = carbide_config.retained_boot_interface_window;
if let Some(window) = config.retained_boot_interface_window {
tracing::info!(
window_seconds = window.num_seconds(),
"retained_boot_interface_window configured; retained boot interface \
records expire instead of waiting forever"
);
}
config
};
SiteExplorer::new(
db_pool.clone(),
carbide_config.site_explorer.clone(),
site_explorer_config,
meter.clone(),
bmc_explorer.clone(),
Arc::new(carbide_config.get_firmware_config()),
Expand Down
1 change: 1 addition & 0 deletions crates/api-core/src/test_support/default_config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ pub fn get() -> CarbideConfig {
initial_dpu_agent_upgrade_policy: None,
max_concurrent_machine_updates: None,
machine_update_run_interval: Some(1),
retained_boot_interface_window: None,
site_explorer: SiteExplorerConfig {
enabled: Arc::new(false.into()),
run_interval: std::time::Duration::from_secs(0),
Expand Down
28 changes: 28 additions & 0 deletions crates/api-core/src/tests/common/api_fixtures/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1028,6 +1028,33 @@ pub async fn create_test_env(db_pool: sqlx::PgPool) -> TestEnv {
create_test_env_with_overrides(db_pool, Default::default()).await
}

/// Builds a `TestEnv` whose site prefixes are the fixture admin and
/// host-inband gateway networks, then creates the host-inband network
/// segment on it -- the standard setup for zero-DPU / NicMode ingestion
/// tests.
///
/// All other overrides are left at `TestEnvOverrides::default()`.
pub async fn create_test_env_with_host_inband(db_pool: sqlx::PgPool) -> TestEnv {
    // Derive each site prefix from its fixture gateway (network address + prefix length).
    let admin_prefix = IpNetwork::new(
        network_segment::FIXTURE_ADMIN_NETWORK_SEGMENT_GATEWAY.network(),
        network_segment::FIXTURE_ADMIN_NETWORK_SEGMENT_GATEWAY.prefix(),
    )
    .unwrap();
    let host_inband_prefix = IpNetwork::new(
        network_segment::FIXTURE_HOST_INBAND_NETWORK_SEGMENT_GATEWAY.network(),
        network_segment::FIXTURE_HOST_INBAND_NETWORK_SEGMENT_GATEWAY.prefix(),
    )
    .unwrap();

    let overrides = TestEnvOverrides {
        site_prefixes: Some(vec![admin_prefix, host_inband_prefix]),
        ..Default::default()
    };

    let env = create_test_env_with_overrides(db_pool, overrides).await;

    // The segment is created after the environment exists, against its API handle.
    network_segment::create_host_inband_network_segment(&env.api, None).await;
    env
}

#[derive(Debug, Default)]
pub struct VerifierSimImpl {
should_fail_parsing: Arc<AtomicBool>,
Expand Down Expand Up @@ -1495,6 +1522,7 @@ pub async fn create_test_env_with_overrides(
db_pool.clone(),
SiteExplorerConfig {
enabled: Arc::new(true.into()),
retained_boot_interface_window: None,
// run_interval shouldn't matter, this should not be run(), we only trigger intervals manually.
run_interval: Duration::seconds(0).to_std().unwrap(),
concurrent_explorations: 100,
Expand Down
Loading
Loading