From 36435ae36fb5a8d14700df64bac53b7ed68b51e7 Mon Sep 17 00:00:00 2001 From: Adnan Dosani Date: Tue, 9 Jun 2026 22:40:58 -0700 Subject: [PATCH 1/3] feat(logging): add a component field to log metadata MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `component` field to NICo's structured (logfmt) log lines, so logs can be filtered by the emitting component in centralized logging instead of grepping free-text messages. Resolves #2049. `component` is one flat value per line: the binary by default, or — for a line emitted from an in-process subsystem of nico-api (e.g. Site Explorer) — that subsystem. Values: nico-api — API handlers, DB, startup: anything not in a subsystem below ├── site-explorer ├── machine_state_controller ├── switch_controller ├── rack_controller ├── power_shelf_controller ├── network_segments_controller ├── vpc_prefix_controller ├── ib_partition_controller └── attestation_controller nico-bmc-proxy nico-dhcp nico-dsx-exchange-consumer nico-fmds nico-hardware-health nico-rvs nico-test-artifact-cache nico-dpu-agent nico-scout Set per binary via `logfmt::layer().with_component(...)`; subsystems set it on their root span and nested lines inherit it. The existing `controller=` field is unchanged and coexists. Details in docs/observability/logging.md. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Adnan Dosani --- crates/agent/src/ethernet_virtualization.rs | 2 +- crates/agent/src/main.rs | 2 +- crates/agent/src/tests/full.rs | 2 +- .../agent/src/tests/test_network_monitor.rs | 2 +- crates/agent/src/tests/upgrade.rs | 2 +- crates/api-core/src/logging/setup.rs | 4 +- crates/bmc-proxy/src/setup.rs | 6 +- crates/dhcp-server/src/main.rs | 6 +- crates/dsx-exchange-consumer/src/main.rs | 6 +- crates/fmds/src/main.rs | 2 +- crates/health/src/main.rs | 2 +- crates/host-support/src/lib.rs | 14 +- crates/logfmt/src/lib.rs | 212 +++++++++++++++++- crates/rvs/src/bin/carbide-rvs.rs | 2 +- crates/rvs/src/bin/test-artifact-cache.rs | 2 +- crates/scout/src/main.rs | 2 +- crates/site-explorer/src/lib.rs | 1 + .../src/controller/periodic_enqueuer.rs | 1 + .../src/controller/processor.rs | 1 + docs/index.yml | 2 + docs/observability/logging.md | 98 ++++++++ 21 files changed, 351 insertions(+), 20 deletions(-) create mode 100644 docs/observability/logging.md diff --git a/crates/agent/src/ethernet_virtualization.rs b/crates/agent/src/ethernet_virtualization.rs index 30db817e98..5cbc8c342b 100644 --- a/crates/agent/src/ethernet_virtualization.rs +++ b/crates/agent/src/ethernet_virtualization.rs @@ -1886,7 +1886,7 @@ mod tests { use crate::{HBNDeviceNames, dhcp, nvue}; #[ctor::ctor(unsafe)] fn setup() { - carbide_host_support::init_logging().unwrap(); + carbide_host_support::init_logging("nico-dpu-agent").unwrap(); } #[test] diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index e4219e1187..c63c4d6cd4 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -18,7 +18,7 @@ use std::time::Duration; fn main() -> eyre::Result<()> { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; // We need a multi-threaded runtime since background threads will queue work // on it, and the foreground thread might not be blocked onto the runtime diff --git a/crates/agent/src/tests/full.rs b/crates/agent/src/tests/full.rs index 4b1922d3e2..90f9dd3f9e 100644 --- a/crates/agent/src/tests/full.rs +++ b/crates/agent/src/tests/full.rs @@ -276,7 +276,7 @@ async fn run_common_parts( virtualization_type: VpcVirtualizationType, test_metadata_service: bool, ) -> eyre::Result { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; let state: Arc> = Arc::new(Mutex::new(Default::default())); state.lock().await.virtualization_type = virtualization_type; diff --git a/crates/agent/src/tests/test_network_monitor.rs b/crates/agent/src/tests/test_network_monitor.rs index 413048b0e5..251bdf99f3 100644 --- a/crates/agent/src/tests/test_network_monitor.rs +++ b/crates/agent/src/tests/test_network_monitor.rs @@ -52,7 +52,7 @@ struct State { #[tokio::test] pub async fn test_network_monitor() -> eyre::Result<()> { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; let state: Arc> = Arc::new(Mutex::new(Default::default())); diff --git a/crates/agent/src/tests/upgrade.rs b/crates/agent/src/tests/upgrade.rs index d735b6498a..27b39923d3 100644 --- a/crates/agent/src/tests/upgrade.rs +++ b/crates/agent/src/tests/upgrade.rs @@ -29,7 +29,7 @@ const ROOT_CERT_PATH: &str = "dev/certs/forge_developer_local_only_root_cert_pem #[tokio::test] async fn test_upgrade_check() -> eyre::Result<()> { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; unsafe { env::set_var("DISABLE_TLS_ENFORCEMENT", "true"); diff --git a/crates/api-core/src/logging/setup.rs b/crates/api-core/src/logging/setup.rs index dcb8a0c0fa..45e44884fe 100644 --- a/crates/api-core/src/logging/setup.rs +++ b/crates/api-core/src/logging/setup.rs @@ -104,7 +104,9 @@ pub fn setup_logging( let (logfmt_stdout_filter, logfmt_stdout_reload_handle) = reload::Layer::new(initial_log_filter.clone()); - let logfmt_stdout_formatter = logfmt::layer().with_event_fields(extra_logfmt_event_fields); + let logfmt_stdout_formatter = logfmt::layer() + .with_component("nico-api") + .with_event_fields(extra_logfmt_event_fields); let spancount_layer = spancounter::layer(); let spancount_reader = spancount_layer.reader(); diff --git a/crates/bmc-proxy/src/setup.rs b/crates/bmc-proxy/src/setup.rs index 11c6b12a1f..46b1ec6e30 100644 --- a/crates/bmc-proxy/src/setup.rs +++ b/crates/bmc-proxy/src/setup.rs @@ -49,7 +49,11 @@ pub fn setup_logging(debug: bool) -> SetupResult<()> { ); tracing_subscriber::registry() - .with(logfmt::layer().with_filter(log_filter)) + .with( + logfmt::layer() + .with_component("nico-bmc-proxy") + .with_filter(log_filter), + ) .try_init()?; tracing::info!("current log level: {}", LevelFilter::current()); diff --git a/crates/dhcp-server/src/main.rs b/crates/dhcp-server/src/main.rs index ba3a85da44..6ac12632f1 100644 --- a/crates/dhcp-server/src/main.rs +++ b/crates/dhcp-server/src/main.rs @@ -221,7 +221,11 @@ fn setup_tracing() -> Result<(), Box> { .add_directive("hickory_proto=info".parse().unwrap()); tracing_subscriber::registry() - .with(logfmt::layer().with_filter(env_filter)) + .with( + logfmt::layer() + .with_component("nico-dhcp") + .with_filter(env_filter), + ) .try_init()?; Ok(()) } diff --git a/crates/dsx-exchange-consumer/src/main.rs b/crates/dsx-exchange-consumer/src/main.rs index 8e59bdeba8..d20692e341 100644 --- a/crates/dsx-exchange-consumer/src/main.rs +++ b/crates/dsx-exchange-consumer/src/main.rs @@ -31,7 +31,11 @@ async fn main() -> Result<(), DsxConsumerError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer().with_filter(env_filter)) + .with( + logfmt::layer() + .with_component("nico-dsx-exchange-consumer") + .with_filter(env_filter), + ) .try_init() .map_err(|e| DsxConsumerError::Config(e.to_string()))?; diff --git a/crates/fmds/src/main.rs b/crates/fmds/src/main.rs index 202235b86b..6791048689 100644 --- a/crates/fmds/src/main.rs +++ b/crates/fmds/src/main.rs @@ -47,7 +47,7 @@ pub fn subscriber() -> impl SubscriberInitExt { .add_directive("hickory_resolver::name_server=info".parse().unwrap()) .add_directive("hickory_proto=info".parse().unwrap()) .add_directive("netlink_proto=warn".parse().unwrap()); - let stdout_formatter = logfmt::layer(); + let stdout_formatter = logfmt::layer().with_component("nico-fmds"); Box::new(tracing_subscriber::registry().with(stdout_formatter.with_filter(env_filter))) } diff --git a/crates/health/src/main.rs b/crates/health/src/main.rs index 3fb08094c9..f9a807cdca 100644 --- a/crates/health/src/main.rs +++ b/crates/health/src/main.rs @@ -31,7 +31,7 @@ async fn main() -> Result<(), HealthError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer()) + .with(logfmt::layer().with_component("nico-hardware-health")) .with(env_filter) .init(); diff --git a/crates/host-support/src/lib.rs b/crates/host-support/src/lib.rs index b3329b85af..a926a8594c 100644 --- a/crates/host-support/src/lib.rs +++ b/crates/host-support/src/lib.rs @@ -35,9 +35,13 @@ static LOG_SETUP: Once = Once::new(); /// Initialize global logging output to STDOUT. Applies to all threads. /// Use `export RUST_LOG=trace|debug|info|warn|error` to change log level. -pub fn init_logging() -> eyre::Result<()> { +/// +/// `component` tags every log line with `component=` (e.g. `nico-scout`, +/// `nico-dpu-agent`) so logs can be filtered by the emitting binary. It must be +/// passed by the caller because this setup is shared across binaries. +pub fn init_logging(component: &str) -> eyre::Result<()> { LOG_SETUP.call_once(|| { - subscriber() + subscriber(component) .try_init() .expect("tracing_subscriber setup failed"); }); @@ -47,9 +51,9 @@ pub fn init_logging() -> eyre::Result<()> { // A logging subscriber for use on the current thread. // Usually you want `init_logging()` instead. // -// Usage: `let guard = subscriber().set_default()` +// Usage: `let guard = subscriber("nico-scout").set_default()` // Subscriber is unregistered when guard is dropped. -pub fn subscriber() -> impl SubscriberInitExt { +pub fn subscriber(component: &str) -> impl SubscriberInitExt { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() @@ -64,6 +68,6 @@ pub fn subscriber() -> impl SubscriberInitExt { .add_directive("hickory_resolver::name_server=info".parse().unwrap()) .add_directive("hickory_proto=info".parse().unwrap()) .add_directive("netlink_proto=warn".parse().unwrap()); - let stdout_formatter = logfmt::layer(); + let stdout_formatter = logfmt::layer().with_component(component); Box::new(tracing_subscriber::registry().with(stdout_formatter.with_filter(env_filter))) } diff --git a/crates/logfmt/src/lib.rs b/crates/logfmt/src/lib.rs index fec9bacd6d..7b7418dc4d 100644 --- a/crates/logfmt/src/lib.rs +++ b/crates/logfmt/src/lib.rs @@ -40,6 +40,7 @@ where emit_span_logs: true, write_end_time: false, extra_event_fields: Vec::new(), + default_component: None, } } @@ -57,6 +58,12 @@ pub struct LogFmtLayer { emit_span_logs: bool, write_end_time: bool, extra_event_fields: Vec, + /// The per-binary default `component` value, emitted as `component=` + /// right after the `level=` field on every event and span line (e.g. + /// `nico-api`). A span may override it for lines within its scope by setting + /// its own `component` attribute; see `resolve_component`. `None` (the default + /// until `with_component` is called) emits no `component` field. + default_component: Option, } impl LogFmtLayer @@ -94,6 +101,37 @@ where ..self } } + + /// Set the per-binary default `component`, emitted as `component=` + /// right after `level=` on every event and span line (e.g. `nico-api`). + /// + /// This identifies the emitting service so logs can be filtered by component + /// in centralized logging. A span may override it for the lines within its + /// scope by setting its own `component` attribute (see `resolve_component`). + pub fn with_component(self, component: impl Into) -> Self { + Self { + default_component: Some(component.into()), + ..self + } + } + + /// Resolve the `component` for a log line: the nearest enclosing span that + /// set a `component` attribute (walking innermost -> outermost), else the + /// per-binary default from `with_component`. `None` if neither is present. + fn resolve_component(&self, span: Option<&SpanRef<'_, S>>) -> Option { + if let Some(span) = span { + for span in span.scope() { + let ext = span.extensions(); + if let Some(value) = ext + .get::() + .and_then(|data| data.attributes.get("component")) + { + return Some(value.clone()); + } + } + } + self.default_component.clone() + } } struct Timing { @@ -210,9 +248,17 @@ where current_span: Option<&SpanRef<'a, R>>, out: &mut Vec, extra_event_fields: &[String], + component: Option<&str>, ) -> Result<(), std::io::Error> { write!(out, "level={} ", event.metadata().level())?; + // `component` (the nearest enclosing span's value, else the per-binary + // default) is emitted right after `level=` so logs can be filtered by + // the emitting component. Absent when unset, so output is unchanged. + if let Some(component) = component { + write!(out, "{} ", kvp("component", component))?; + } + // TODO: More metadata? if let Some(current_span) = current_span { @@ -257,6 +303,7 @@ where // A temporary format buffer is used because accessing stdout can be expensive // The format buffer is kept around as a threadlocal variable to reduce allocations + let component = self.resolve_component(current_span.as_ref()); FORMAT_BUFFER.with(|fbuf| { let format_buffer: &mut Vec = &mut fbuf.borrow_mut(); if let Ok(()) = write_event_data( @@ -264,6 +311,7 @@ where current_span.as_ref(), format_buffer, &self.extra_event_fields, + component.as_deref(), ) { let mut writer = (self.make_writer)(); let _ = writer.write_all(format_buffer); @@ -275,6 +323,10 @@ where fn on_close(&self, id: span::Id, ctx: Context<'_, S>) { let span = ctx.span(&id).expect("Span not found, this is a bug"); + // Resolve `component` before taking the mutable extensions lock below + // (resolve_component reads extensions immutably while walking the scope). + let component = self.resolve_component(Some(&span)); + let mut extensions = span.extensions_mut(); let Some(mut data) = extensions.remove::() else { @@ -332,9 +384,18 @@ where span_metadata: &tracing::Metadata, mut data: LogFmtData, out: &mut Vec, + component: Option<&str>, ) -> Result<(), std::io::Error> { write!(out, "level=SPAN")?; + // `component` is emitted right after `level=SPAN`, mirroring event + // lines. Remove it from the span's own attributes first so it is not + // emitted twice (a span may set `component` directly, e.g. site-explorer). + data.attributes.remove("component"); + if let Some(component) = component { + write!(out, " {}", kvp("component", component))?; + } + // Start writing the span_id and span_name for consistency if let Some(value) = data.attributes.remove("span_id") { write!(out, " {}", kvp("span_id", value))?; @@ -356,7 +417,9 @@ where // The format buffer is kept around as a threadlocal variable to reduce allocations FORMAT_BUFFER.with(|fbuf| { let format_buffer: &mut Vec = &mut fbuf.borrow_mut(); - if let Ok(()) = write_span_data(span.metadata(), data, format_buffer) { + if let Ok(()) = + write_span_data(span.metadata(), data, format_buffer, component.as_deref()) + { let mut writer = (self.make_writer)(); let _ = writer.write_all(format_buffer); } @@ -849,6 +912,153 @@ mod tests { ); } + #[test] + fn test_component_default_on_event_and_span() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_component("nico-api"); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let span = tracing::span!(tracing::Level::INFO, "test_span", span_id = "s1234",); + let _entered = span.enter(); + tracing::info!("inside"); + drop(_entered); + drop(span); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + assert_eq!(lines.len(), 2, "got {}: {:?}", lines.len(), lines); + // The per-binary default is emitted right after `level=` on both lines. + assert!( + lines[0].starts_with(r#"level=INFO component=nico-api span_id=s1234 msg=inside "#), + "Line is: {}", + lines[0] + ); + assert!( + lines[1] + .starts_with(r#"level=SPAN component=nico-api span_id=s1234 span_name=test_span "#), + "Line is: {}", + lines[1] + ); + } + + #[test] + fn test_component_span_overrides_default_once() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_component("nico-api"); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let span = tracing::span!( + tracing::Level::INFO, + "explore_site", + span_id = "s1", + component = "site-explorer", + ); + let _entered = span.enter(); + tracing::info!("inside"); + drop(_entered); + drop(span); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + assert_eq!(lines.len(), 2, "got {}: {:?}", lines.len(), lines); + // The span value overrides the default, and `component` appears once. + assert!( + lines[0].starts_with(r#"level=INFO component=site-explorer span_id=s1 msg=inside "#), + "Line is: {}", + lines[0] + ); + assert!(!lines[0].contains("nico-api"), "default leaked: {}", lines[0]); + assert_eq!( + lines[0].matches("component=").count(), + 1, + "duplicate component key: {}", + lines[0] + ); + assert!( + lines[1].starts_with( + r#"level=SPAN component=site-explorer span_id=s1 span_name=explore_site "# + ), + "Line is: {}", + lines[1] + ); + assert_eq!( + lines[1].matches("component=").count(), + 1, + "duplicate component key: {}", + lines[1] + ); + } + + #[test] + fn test_component_inherited_by_nested_span() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_component("nico-api"); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // Root span sets `component`; the nested child span does not. + let root = tracing::span!( + tracing::Level::INFO, + "explore_site", + span_id = "root", + component = "site-explorer", + ); + let _root_entered = root.enter(); + let child = tracing::span!(tracing::Level::INFO, "probe_endpoint", span_id = "child"); + let _child_entered = child.enter(); + tracing::warn!("redfish_timeout"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + // The event fires in the child span, which set no `component`; it must + // inherit `site-explorer` by walking up to the root span. + let event_line = lines + .iter() + .find(|l| l.contains("msg=redfish_timeout")) + .expect("event line not found"); + assert!( + event_line.contains("component=site-explorer"), + "nested event did not inherit component: {event_line}" + ); + assert!( + !event_line.contains("component=nico-api"), + "nested event fell back to default: {event_line}" + ); + } + + #[test] + fn test_no_component_when_unset() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer().with_writer(Arc::new(move || Box::new(cloned_writer.clone()))); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + tracing::warn!("e"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + // With no default and no span override, no `component` field is emitted, + // guaranteeing byte-identical output for the existing snapshot tests. + assert!( + lines[0].starts_with(r#"level=WARN msg=e "#), + "Line is: {}", + lines[0] + ); + assert!( + !lines[0].contains("component="), + "unexpected component: {}", + lines[0] + ); + } + #[test] fn test_object_id_in_event_logs() { let writer = TestWriter::new(); diff --git a/crates/rvs/src/bin/carbide-rvs.rs b/crates/rvs/src/bin/carbide-rvs.rs index 8caa2cde24..31f133ce40 100644 --- a/crates/rvs/src/bin/carbide-rvs.rs +++ b/crates/rvs/src/bin/carbide-rvs.rs @@ -53,7 +53,7 @@ async fn main() -> Result<(), RvsError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer()) + .with(logfmt::layer().with_component("nico-rvs")) .with(env_filter) .init(); diff --git a/crates/rvs/src/bin/test-artifact-cache.rs b/crates/rvs/src/bin/test-artifact-cache.rs index 018796f1a4..d2e08e1953 100644 --- a/crates/rvs/src/bin/test-artifact-cache.rs +++ b/crates/rvs/src/bin/test-artifact-cache.rs @@ -73,7 +73,7 @@ async fn main() -> Result<(), RvsError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer()) + .with(logfmt::layer().with_component("nico-test-artifact-cache")) .with(env_filter) .init(); diff --git a/crates/scout/src/main.rs b/crates/scout/src/main.rs index d38a12f0f4..61a3db6090 100644 --- a/crates/scout/src/main.rs +++ b/crates/scout/src/main.rs @@ -95,7 +95,7 @@ async fn main() -> Result<(), eyre::Report> { check_if_running_in_qemu().await; - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-scout")?; tracing::info!("Running as {}...{}", config.mode, config.version); diff --git a/crates/site-explorer/src/lib.rs b/crates/site-explorer/src/lib.rs index 7f2b3e5e64..f5672bec37 100644 --- a/crates/site-explorer/src/lib.rs +++ b/crates/site-explorer/src/lib.rs @@ -404,6 +404,7 @@ impl SiteExplorer { "explore_site", span_id, carbide.trace_root = true, + component = "site-explorer", otel.status_code = tracing::field::Empty, otel.status_message = tracing::field::Empty, created_machines = tracing::field::Empty, diff --git a/crates/state-controller/src/controller/periodic_enqueuer.rs b/crates/state-controller/src/controller/periodic_enqueuer.rs index 017b0f43a3..47858c8d61 100644 --- a/crates/state-controller/src/controller/periodic_enqueuer.rs +++ b/crates/state-controller/src/controller/periodic_enqueuer.rs @@ -118,6 +118,7 @@ impl PeriodicEnqueuer { span_id, carbide.trace_root = true, controller = IO::LOG_SPAN_CONTROLLER_NAME, + component = IO::LOG_SPAN_CONTROLLER_NAME, iteration_id = tracing::field::Empty, otel.status_code = tracing::field::Empty, otel.status_message = tracing::field::Empty, diff --git a/crates/state-controller/src/controller/processor.rs b/crates/state-controller/src/controller/processor.rs index 9edace5210..315fbe233e 100644 --- a/crates/state-controller/src/controller/processor.rs +++ b/crates/state-controller/src/controller/processor.rs @@ -213,6 +213,7 @@ impl StateProcessor { span_id, carbide.trace_root = true, controller = IO::LOG_SPAN_CONTROLLER_NAME, + component = IO::LOG_SPAN_CONTROLLER_NAME, ); async { diff --git a/docs/index.yml b/docs/index.yml index 596900e8c4..6f234cffc2 100644 --- a/docs/index.yml +++ b/docs/index.yml @@ -194,6 +194,8 @@ navigation: path: observability/core_metrics.md - page: Traces path: observability/tracing.md + - page: Logging + path: observability/logging.md - section: Playbooks contents: - page: Azure OIDC for Infra Controller Web UI diff --git a/docs/observability/logging.md b/docs/observability/logging.md new file mode 100644 index 0000000000..a41f6d34f4 --- /dev/null +++ b/docs/observability/logging.md @@ -0,0 +1,98 @@ +# NICo Logging + +How NICo services emit logs, the structured fields each line carries, and how to set log levels. + +--- + +## TL;DR + +- **Most** NICo services log in **[logfmt](https://brandur.org/logfmt)** (`key=value` pairs) to **stdout**; + a few use other formats (e.g. `nico-dns` emits JSON — see [Coverage](#coverage)). +- Every **logfmt** line begins with `level=` and carries a **`component=`** field identifying the emitting + component, so logs can be filtered by component instead of grepping the message text. +- Log verbosity is controlled by `RUST_LOG` (default `info`). +- For distributed **tracing** (spans exported via OTLP), see [Traces](tracing.md). + +## Log format + +There are two kinds of line. + +**Events** — one per log call: +``` +level=INFO component=nico-api span_id=0x4f… msg="…" location="crates/api-core/src/…" +``` + +**Spans** — emitted when a unit of work closes (`level=SPAN`), carrying the span name, id, its attributes, +and timing: +``` +level=SPAN component=site-explorer span_id=0xf7… span_name=explore_site timing_elapsed_us=… timing_busy_ns=… timing_idle_ns=… +``` + +Common keys: `level`, `component`, `msg`, `location` (`file:line`); `span_id` when the line is inside a +span; plus any structured fields the call site adds (e.g. `controller=`, `object_id=`). + +## Log level + +Verbosity is an [`EnvFilter`](https://docs.rs/tracing-subscriber) directive set from `RUST_LOG`, defaulting +to `info`: +``` +RUST_LOG=debug +RUST_LOG=info,carbide_site_explorer=debug # raise one module +``` +`nico-api` can also adjust its filter at runtime via `nico-admin-cli set log-filter`. + +## The `component` field + +On logfmt lines, the `component` field is set to one of the following: + +``` +nico-api — API handlers, DB, startup: anything not in a subsystem below +├── site-explorer +├── machine_state_controller +├── switch_controller +├── rack_controller +├── power_shelf_controller +├── network_segments_controller +├── vpc_prefix_controller +├── ib_partition_controller +└── attestation_controller +nico-bmc-proxy +nico-dhcp +nico-dsx-exchange-consumer +nico-fmds +nico-hardware-health +nico-rvs +nico-test-artifact-cache +nico-dpu-agent +nico-scout +``` + +State-controller lines also carry a `controller=` field with the same value. + +### Adding it to new code + +- **New binary** — set the default when building the `logfmt` layer: + `logfmt::layer().with_component("nico-my-service")`. Host-side binaries (agent/scout) pass it through + `carbide_host_support::init_logging("nico-my-service")`. A binary that omits this emits no `component`. +- **New in-process subsystem of `nico-api`** — add a `component` field to the subsystem's root span: + `tracing::span!(parent: None, Level::INFO, "my_subsystem", component = "my-subsystem", /* … */)`. Nested + instrumented functions and spawned tasks carried with `.instrument(...)` / `.in_current_span()` inherit it. + +> **A line can carry more than one `component=`.** The logging layer always adds the emitting +> service/subsystem `component` (above). Independently, a call site may include its *own* `component` field +> for domain data — e.g. `nico-hardware-health`'s "Firmware info event" sets `component= component>` (BMC/BIOS/…). Such lines then show two `component=` keys: the layer's first, the call site's +> after. For new code, prefer a specific field name (e.g. `firmware_component`) to avoid the ambiguity. + +### Coverage + +Binaries that do not use the `logfmt` layer carry no `component`: `nico-dns` (emits JSON), `nico-pxe` +(hand-rolled formatter), `nico-ssh-console`, `nico-dpu-otel-agent`. CLI/dev tools and mocks are out of scope. + +## Querying + +Parse the logfmt line and filter on the field in whatever store the logs land in. For example, with logql +(Loki): +```logql +{namespace="nico-system"} | logfmt | component="site-explorer" +``` From e23d9760605ad989d7e57ccf62a3b0b145245ca9 Mon Sep 17 00:00:00 2001 From: Adnan Dosani Date: Wed, 10 Jun 2026 07:47:55 -0700 Subject: [PATCH 2/3] fix(logging): rename firmware-event field to firmware_component MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Renames the `component` field on nico-hardware-health's "Firmware info event" to `firmware_component`, so the logging `component` stays unambiguous when filtering by it — the event's own firmware value no longer collides with the layer-added emitting-component value. Per PR review. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Adnan Dosani --- crates/health/src/sink/tracing.rs | 2 +- docs/observability/logging.md | 8 +++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/crates/health/src/sink/tracing.rs b/crates/health/src/sink/tracing.rs index f253b486e0..b1aff54784 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -72,7 +72,7 @@ impl DataSink for TracingSink { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, - component = %info.component, + firmware_component = %info.component, version = %info.version, "Firmware info event" ); diff --git a/docs/observability/logging.md b/docs/observability/logging.md index a41f6d34f4..992c94c7b0 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -78,11 +78,9 @@ State-controller lines also carry a `controller=` field with the same valu `tracing::span!(parent: None, Level::INFO, "my_subsystem", component = "my-subsystem", /* … */)`. Nested instrumented functions and spawned tasks carried with `.instrument(...)` / `.in_current_span()` inherit it. -> **A line can carry more than one `component=`.** The logging layer always adds the emitting -> service/subsystem `component` (above). Independently, a call site may include its *own* `component` field -> for domain data — e.g. `nico-hardware-health`'s "Firmware info event" sets `component= component>` (BMC/BIOS/…). Such lines then show two `component=` keys: the layer's first, the call site's -> after. For new code, prefer a specific field name (e.g. `firmware_component`) to avoid the ambiguity. +> **`component` is reserved for the emitting component** — don't reuse it as an event/span field for +> unrelated data. Give domain values their own key (e.g. `nico-hardware-health`'s firmware-inventory events +> use `firmware_component`, not `component`). ### Coverage From 8232784fc422b85ce135e4877724cb422da94387 Mon Sep 17 00:00:00 2001 From: Adnan Dosani <258082943+adnandnv@users.noreply.github.com> Date: Wed, 10 Jun 2026 21:13:54 -0700 Subject: [PATCH 3/3] refactor(logging): unify event fields into an inheritable EventField API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reworks the logfmt layer's component support (#2049) into a single EventField type — a field name with an optional default — configured through one with_event_fields builder. This replaces the earlier with_component setter and folds the separate default/extra field paths into one API. Each configured field is resolved once, when a span is created: the span's own value if it set one, else the value inherited from its parent span. Child spans therefore inherit fields like `component` (and nico-api's `object_id`) with no ancestor walk at log time — event and span lines just read the span's resolved value, falling back to the field's configured default or omitting it. Update all logfmt call sites to with_event_fields and refresh the logging docs. Co-Authored-By: Claude Opus 4.8 Signed-off-by: Adnan Dosani <258082943+adnandnv@users.noreply.github.com> --- crates/api-core/src/logging/setup.rs | 10 +- crates/bmc-proxy/src/setup.rs | 5 +- crates/dhcp-server/src/main.rs | 2 +- crates/dsx-exchange-consumer/src/main.rs | 5 +- crates/fmds/src/main.rs | 3 +- crates/health/src/main.rs | 7 +- crates/host-support/src/lib.rs | 3 +- crates/logfmt/src/lib.rs | 609 +++++++++++++++++----- crates/rvs/src/bin/carbide-rvs.rs | 5 +- crates/rvs/src/bin/test-artifact-cache.rs | 7 +- docs/observability/logging.md | 9 +- 11 files changed, 519 insertions(+), 146 deletions(-) diff --git a/crates/api-core/src/logging/setup.rs b/crates/api-core/src/logging/setup.rs index 45e44884fe..20e64e704b 100644 --- a/crates/api-core/src/logging/setup.rs +++ b/crates/api-core/src/logging/setup.rs @@ -104,9 +104,13 @@ pub fn setup_logging( let (logfmt_stdout_filter, logfmt_stdout_reload_handle) = reload::Layer::new(initial_log_filter.clone()); - let logfmt_stdout_formatter = logfmt::layer() - .with_component("nico-api") - .with_event_fields(extra_logfmt_event_fields); + let mut event_fields = vec![logfmt::EventField::with_default("component", "nico-api")]; + event_fields.extend( + extra_logfmt_event_fields + .into_iter() + .map(logfmt::EventField::new), + ); + let logfmt_stdout_formatter = logfmt::layer().with_event_fields(event_fields); let spancount_layer = spancounter::layer(); let spancount_reader = spancount_layer.reader(); diff --git a/crates/bmc-proxy/src/setup.rs b/crates/bmc-proxy/src/setup.rs index 46b1ec6e30..3ed0e0c66b 100644 --- a/crates/bmc-proxy/src/setup.rs +++ b/crates/bmc-proxy/src/setup.rs @@ -51,7 +51,10 @@ pub fn setup_logging(debug: bool) -> SetupResult<()> { tracing_subscriber::registry() .with( logfmt::layer() - .with_component("nico-bmc-proxy") + .with_event_fields([logfmt::EventField::with_default( + "component", + "nico-bmc-proxy", + )]) .with_filter(log_filter), ) .try_init()?; diff --git a/crates/dhcp-server/src/main.rs b/crates/dhcp-server/src/main.rs index 6ac12632f1..b2528bd105 100644 --- a/crates/dhcp-server/src/main.rs +++ b/crates/dhcp-server/src/main.rs @@ -223,7 +223,7 @@ fn setup_tracing() -> Result<(), Box> { tracing_subscriber::registry() .with( logfmt::layer() - .with_component("nico-dhcp") + .with_event_fields([logfmt::EventField::with_default("component", "nico-dhcp")]) .with_filter(env_filter), ) .try_init()?; diff --git a/crates/dsx-exchange-consumer/src/main.rs b/crates/dsx-exchange-consumer/src/main.rs index d20692e341..c303e78e11 100644 --- a/crates/dsx-exchange-consumer/src/main.rs +++ b/crates/dsx-exchange-consumer/src/main.rs @@ -33,7 +33,10 @@ async fn main() -> Result<(), DsxConsumerError> { tracing_subscriber::registry() .with( logfmt::layer() - .with_component("nico-dsx-exchange-consumer") + .with_event_fields([logfmt::EventField::with_default( + "component", + "nico-dsx-exchange-consumer", + )]) .with_filter(env_filter), ) .try_init() diff --git a/crates/fmds/src/main.rs b/crates/fmds/src/main.rs index 6791048689..88a2211869 100644 --- a/crates/fmds/src/main.rs +++ b/crates/fmds/src/main.rs @@ -47,7 +47,8 @@ pub fn subscriber() -> impl SubscriberInitExt { .add_directive("hickory_resolver::name_server=info".parse().unwrap()) .add_directive("hickory_proto=info".parse().unwrap()) .add_directive("netlink_proto=warn".parse().unwrap()); - let stdout_formatter = logfmt::layer().with_component("nico-fmds"); + let stdout_formatter = logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", "nico-fmds")]); Box::new(tracing_subscriber::registry().with(stdout_formatter.with_filter(env_filter))) } diff --git a/crates/health/src/main.rs b/crates/health/src/main.rs index f9a807cdca..96aa7f5392 100644 --- a/crates/health/src/main.rs +++ b/crates/health/src/main.rs @@ -31,7 +31,12 @@ async fn main() -> Result<(), HealthError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer().with_component("nico-hardware-health")) + .with( + logfmt::layer().with_event_fields([logfmt::EventField::with_default( + "component", + "nico-hardware-health", + )]), + ) .with(env_filter) .init(); diff --git a/crates/host-support/src/lib.rs b/crates/host-support/src/lib.rs index a926a8594c..3e14d587b9 100644 --- a/crates/host-support/src/lib.rs +++ b/crates/host-support/src/lib.rs @@ -68,6 +68,7 @@ pub fn subscriber(component: &str) -> impl SubscriberInitExt { .add_directive("hickory_resolver::name_server=info".parse().unwrap()) .add_directive("hickory_proto=info".parse().unwrap()) .add_directive("netlink_proto=warn".parse().unwrap()); - let stdout_formatter = logfmt::layer().with_component(component); + let stdout_formatter = logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", component)]); Box::new(tracing_subscriber::registry().with(stdout_formatter.with_filter(env_filter))) } diff --git a/crates/logfmt/src/lib.rs b/crates/logfmt/src/lib.rs index 7b7418dc4d..fd5931596b 100644 --- a/crates/logfmt/src/lib.rs +++ b/crates/logfmt/src/lib.rs @@ -27,7 +27,7 @@ use tracing::span::{self, Attributes}; use tracing::{Event, Subscriber}; use tracing_subscriber::Layer; use tracing_subscriber::layer::Context; -use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use tracing_subscriber::registry::LookupSpan; /// Construct a new `LogFmtLayer` pub fn layer() -> LogFmtLayer @@ -39,8 +39,37 @@ where make_writer: Arc::new(|| Box::new(std::io::stdout())), emit_span_logs: true, write_end_time: false, - extra_event_fields: Vec::new(), - default_component: None, + event_fields: Vec::new(), + } +} + +/// A span attribute surfaced on event and `level=SPAN` lines. +/// +/// On each line the rendered value is, in precedence order: the value the line's +/// own span set for `name`; else the value inherited from a parent span; else the +/// configured default, if any; else the field is omitted. So a field built with +/// [`EventField::with_default`] renders on every line, and one built with +/// [`EventField::new`] renders only where a span set or inherited it. +pub struct EventField { + name: String, + default: Option, +} + +impl EventField { + /// Surfaced only when a span sets or inherits it (no fallback). + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + default: None, + } + } + + /// Surfaced on every line, falling back to `default` when no span set it. + pub fn with_default(name: impl Into, default: impl Into) -> Self { + Self { + name: name.into(), + default: Some(default.into()), + } } } @@ -57,13 +86,9 @@ pub struct LogFmtLayer { make_writer: Arc Box + Send + Sync>, emit_span_logs: bool, write_end_time: bool, - extra_event_fields: Vec, - /// The per-binary default `component` value, emitted as `component=` - /// right after the `level=` field on every event and span line (e.g. - /// `nico-api`). A span may override it for lines within its scope by setting - /// its own `component` attribute; see `resolve_component`. `None` (the default - /// until `with_component` is called) emits no `component` field. - default_component: Option, + /// Span attributes surfaced on event and span lines, each with an optional + /// fallback. See [`EventField`]. Set via `with_event_fields`. + event_fields: Vec, } impl LogFmtLayer @@ -94,43 +119,46 @@ where } } - /// Add extra span attribute keys to emit in event logs - pub fn with_event_fields(self, extra_event_fields: Vec) -> Self { + /// Surface span attributes on event and `level=SPAN` lines. Each [`EventField`] + /// renders with its span's own value, else an inherited parent value, else its + /// default (if any). Calling this replaces the field set (it assigns, it does + /// not append); duplicate names are dropped (the first wins) so a name is never + /// emitted twice. + pub fn with_event_fields(self, fields: impl IntoIterator) -> Self { + let mut seen = std::collections::HashSet::new(); Self { - extra_event_fields, + event_fields: fields + .into_iter() + .filter(|field| seen.insert(field.name.clone())) + .collect(), ..self } } - /// Set the per-binary default `component`, emitted as `component=` - /// right after `level=` on every event and span line (e.g. `nico-api`). - /// - /// This identifies the emitting service so logs can be filtered by component - /// in centralized logging. A span may override it for the lines within its - /// scope by setting its own `component` attribute (see `resolve_component`). - pub fn with_component(self, component: impl Into) -> Self { - Self { - default_component: Some(component.into()), - ..self - } + /// Iterate the names of every configured event field. Used to decide which of + /// a span's attributes are inherited by its child spans. + fn configured_field_names(&self) -> impl Iterator { + self.event_fields.iter().map(|field| field.name.as_str()) } - /// Resolve the `component` for a log line: the nearest enclosing span that - /// set a `component` attribute (walking innermost -> outermost), else the - /// per-binary default from `with_component`. `None` if neither is present. - fn resolve_component(&self, span: Option<&SpanRef<'_, S>>) -> Option { - if let Some(span) = span { - for span in span.scope() { - let ext = span.extensions(); - if let Some(value) = ext - .get::() - .and_then(|data| data.attributes.get("component")) - { - return Some(value.clone()); - } - } - } - self.default_component.clone() + /// Resolve every configured event field for a line from a span's + /// `resolved_event_fields` (its own or inherited value), applying each field's + /// fallback. A field with a default always yields a value; one without is + /// included only when the span set or inherited it. + fn resolve_event_fields( + &self, + resolved: Option<&BTreeMap>, + ) -> Vec<(String, String)> { + self.event_fields + .iter() + .filter_map(|field| { + let value = resolved + .and_then(|fields| fields.get(&field.name)) + .cloned() + .or_else(|| field.default.clone()); + value.map(|value| (field.name.clone(), value)) + }) + .collect() } } @@ -164,6 +192,22 @@ struct LogFmtData { /// Formatted Span attributes /// This is a BTreeMap to guarantee order when formatting attributes: BTreeMap, + /// The resolved value of each configured event field for this span (one entry + /// per `EventField` set on the layer that this span or an ancestor set). Kept + /// separate from `attributes` because it also holds values inherited from + /// ancestor spans, which the span never recorded itself. + /// + /// Computed when the span is created (`on_new_span`): this span's own + /// attribute if it set one, otherwise the value inherited from its parent + /// span. Refreshed in `on_record` if the span sets a value later. Event and + /// span lines read this map directly. A name is absent when neither this span + /// nor any of its parent spans set it; the configured default (or omission) is + /// applied at write time. + /// + /// Inheritance is captured at child-creation time: a parent that sets a field + /// (via `record`) after a child span already exists does not propagate to that + /// child. + resolved_event_fields: BTreeMap, } impl LogFmtData { @@ -172,6 +216,7 @@ impl LogFmtData { timing: Timing::new(), suppressed: false, attributes: BTreeMap::new(), + resolved_event_fields: BTreeMap::new(), } } @@ -198,6 +243,44 @@ where let mut visitor = SpanAttributeVisitor { data: &mut data }; attrs.record(&mut visitor); + // Resolve every configured field's value now, when the span is + // created, so log lines can read it directly. + // + // Resolve the parent span before locking the new span's extensions. + // `attrs.parent()` gives an explicit parent (`span!(parent: ..)`); when + // absent and the span is not a root, the parent is the contextual current + // span. Copying the parent's already-resolved `resolved_event_fields` (a + // small map — only the configured fields) is what lets a field set on a + // parent span be inherited here. + let parent = if let Some(parent_id) = attrs.parent() { + ctx.span(parent_id) + } else if attrs.is_root() { + None + } else { + ctx.lookup_current() + }; + let parent_resolved = parent.as_ref().map(|p| { + p.extensions() + .get::() + .map(|d| d.resolved_event_fields.clone()) + .unwrap_or_default() + }); + + for name in self.configured_field_names() { + if let Some(value) = data.attributes.get(name) { + // This span set the field itself -> its own value wins. + data.resolved_event_fields + .insert(name.to_string(), value.clone()); + } else if let Some(value) = parent_resolved.as_ref().and_then(|fields| fields.get(name)) + { + // Inherit the parent's already-resolved value. + data.resolved_event_fields + .insert(name.to_string(), value.clone()); + } + // Otherwise leave it unset; the default (or omission) is applied at + // write time. + } + let span = ctx.span(id).expect("Span not found, this is a bug"); let mut extensions = span.extensions_mut(); extensions.insert(data); @@ -232,50 +315,62 @@ where let mut extensions = span.extensions_mut(); if let Some(data) = extensions.get_mut::() { values.record(&mut SpanAttributeVisitor { data }); + + // A value recorded after span creation (e.g. a field declared + // `tracing::field::Empty` and filled in via `span.record`) must update + // the span's resolved value. A span's own value always wins over an + // inherited one, so re-sync each configured field that now has a + // concrete attribute value. + for name in self.configured_field_names() { + if let Some(value) = data.attributes.get(name).cloned() { + data.resolved_event_fields.insert(name.to_string(), value); + } + } } } fn on_follows_from(&self, _id: &span::Id, _follows: &span::Id, _ctx: Context) {} fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { - // If the event doesn't happen in the scope of a Span, we don't log some span data let current_span = ctx.lookup_current(); + // Resolve the event fields and read `span_id` under a single extensions + // borrow (no extra clone of the span's resolved map). Outside any span the + // fields fall back to their defaults (or are omitted) and there is no + // `span_id`. + let (event_fields, span_id) = match current_span.as_ref() { + Some(span) => { + let ext = span.extensions(); + let data = ext + .get::() + .expect("Unable to find LogFmtData in extensions; this is a bug"); + let fields = self.resolve_event_fields(Some(&data.resolved_event_fields)); + let span_id = data.attributes.get("span_id").cloned(); + (fields, span_id) + } + None => (self.resolve_event_fields(None), None), + }; + /// This formatting subfunction exists so that we can use the ? operator /// on write! results - fn write_event_data<'a, R: LookupSpan<'a>>( + fn write_event_data( event: &Event<'_>, - current_span: Option<&SpanRef<'a, R>>, out: &mut Vec, - extra_event_fields: &[String], - component: Option<&str>, + event_fields: &[(String, String)], + span_id: Option<&str>, ) -> Result<(), std::io::Error> { write!(out, "level={} ", event.metadata().level())?; - // `component` (the nearest enclosing span's value, else the per-binary - // default) is emitted right after `level=` so logs can be filtered by - // the emitting component. Absent when unset, so output is unchanged. - if let Some(component) = component { - write!(out, "{} ", kvp("component", component))?; + // Configured event fields (the span's own or inherited value, else a + // default) are emitted right after `level=`, so every line carries + // them — including events emitted outside any span. + for (name, value) in event_fields { + write!(out, "{} ", kvp(name, value))?; } - // TODO: More metadata? - - if let Some(current_span) = current_span { - // Get the span_id to correlate the event with the actual span - let ext = current_span.extensions(); - let data = ext - .get::() - .expect("Unable to find LogFmtData in extensions; this is a bug"); - - if let Some(span_id) = data.attributes.get("span_id") { - write!(out, "{} ", kvp("span_id", span_id))?; - } - for key in extra_event_fields { - if let Some(value) = data.attributes.get(key) { - write!(out, "{} ", kvp(key, value))?; - } - } + // span_id correlates the event with the span it was logged in. + if let Some(span_id) = span_id { + write!(out, "{} ", kvp("span_id", span_id))?; } let mut visitor = FieldVisitor { @@ -303,16 +398,9 @@ where // A temporary format buffer is used because accessing stdout can be expensive // The format buffer is kept around as a threadlocal variable to reduce allocations - let component = self.resolve_component(current_span.as_ref()); FORMAT_BUFFER.with(|fbuf| { let format_buffer: &mut Vec = &mut fbuf.borrow_mut(); - if let Ok(()) = write_event_data( - event, - current_span.as_ref(), - format_buffer, - &self.extra_event_fields, - component.as_deref(), - ) { + if write_event_data(event, format_buffer, &event_fields, span_id.as_deref()).is_ok() { let mut writer = (self.make_writer)(); let _ = writer.write_all(format_buffer); } @@ -323,10 +411,6 @@ where fn on_close(&self, id: span::Id, ctx: Context<'_, S>) { let span = ctx.span(&id).expect("Span not found, this is a bug"); - // Resolve `component` before taking the mutable extensions lock below - // (resolve_component reads extensions immutably while walking the scope). - let component = self.resolve_component(Some(&span)); - let mut extensions = span.extensions_mut(); let Some(mut data) = extensions.remove::() else { @@ -346,6 +430,9 @@ where return; } + // Resolve from this span's own resolved values. + let event_fields = self.resolve_event_fields(Some(&data.resolved_event_fields)); + data.attributes.insert( "timing_start_time".to_string(), format!("{:?}", data.timing.start_time), @@ -384,16 +471,16 @@ where span_metadata: &tracing::Metadata, mut data: LogFmtData, out: &mut Vec, - component: Option<&str>, + event_fields: &[(String, String)], ) -> Result<(), std::io::Error> { write!(out, "level=SPAN")?; - // `component` is emitted right after `level=SPAN`, mirroring event - // lines. Remove it from the span's own attributes first so it is not - // emitted twice (a span may set `component` directly, e.g. site-explorer). - data.attributes.remove("component"); - if let Some(component) = component { - write!(out, " {}", kvp("component", component))?; + // Resolved event fields are emitted right after `level=SPAN`, mirroring + // event lines. Remove each from the span's own attributes first so it + // is not emitted twice (a span may set the attribute directly). + for (name, value) in event_fields { + data.attributes.remove(name); + write!(out, " {}", kvp(name, value))?; } // Start writing the span_id and span_name for consistency @@ -417,9 +504,7 @@ where // The format buffer is kept around as a threadlocal variable to reduce allocations FORMAT_BUFFER.with(|fbuf| { let format_buffer: &mut Vec = &mut fbuf.borrow_mut(); - if let Ok(()) = - write_span_data(span.metadata(), data, format_buffer, component.as_deref()) - { + if let Ok(()) = write_span_data(span.metadata(), data, format_buffer, &event_fields) { let mut writer = (self.make_writer)(); let _ = writer.write_all(format_buffer); } @@ -913,12 +998,12 @@ mod tests { } #[test] - fn test_component_default_on_event_and_span() { + fn test_default_field_on_event_and_span() { let writer = TestWriter::new(); let cloned_writer = writer.clone(); let layer = layer() .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) - .with_component("nico-api"); + .with_event_fields([EventField::with_default("service", "service-a")]); let _subscriber = tracing_subscriber::registry().with(layer).set_default(); @@ -930,35 +1015,35 @@ mod tests { let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); assert_eq!(lines.len(), 2, "got {}: {:?}", lines.len(), lines); - // The per-binary default is emitted right after `level=` on both lines. + // The default is emitted right after `level=` on both lines. assert!( - lines[0].starts_with(r#"level=INFO component=nico-api span_id=s1234 msg=inside "#), + lines[0].starts_with(r#"level=INFO service=service-a span_id=s1234 msg=inside "#), "Line is: {}", lines[0] ); assert!( lines[1] - .starts_with(r#"level=SPAN component=nico-api span_id=s1234 span_name=test_span "#), + .starts_with(r#"level=SPAN service=service-a span_id=s1234 span_name=test_span "#), "Line is: {}", lines[1] ); } #[test] - fn test_component_span_overrides_default_once() { + fn test_span_value_overrides_default_once() { let writer = TestWriter::new(); let cloned_writer = writer.clone(); let layer = layer() .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) - .with_component("nico-api"); + .with_event_fields([EventField::with_default("service", "service-a")]); let _subscriber = tracing_subscriber::registry().with(layer).set_default(); let span = tracing::span!( tracing::Level::INFO, - "explore_site", + "outer", span_id = "s1", - component = "site-explorer", + service = "service-b", ); let _entered = span.enter(); tracing::info!("inside"); @@ -967,75 +1052,77 @@ mod tests { let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); assert_eq!(lines.len(), 2, "got {}: {:?}", lines.len(), lines); - // The span value overrides the default, and `component` appears once. + // The span value overrides the default, and `service` appears once. assert!( - lines[0].starts_with(r#"level=INFO component=site-explorer span_id=s1 msg=inside "#), + lines[0].starts_with(r#"level=INFO service=service-b span_id=s1 msg=inside "#), "Line is: {}", lines[0] ); - assert!(!lines[0].contains("nico-api"), "default leaked: {}", lines[0]); + assert!( + !lines[0].contains("service-a"), + "default leaked: {}", + lines[0] + ); assert_eq!( - lines[0].matches("component=").count(), + lines[0].matches("service=").count(), 1, - "duplicate component key: {}", + "duplicate service key: {}", lines[0] ); assert!( - lines[1].starts_with( - r#"level=SPAN component=site-explorer span_id=s1 span_name=explore_site "# - ), + lines[1].starts_with(r#"level=SPAN service=service-b span_id=s1 span_name=outer "#), "Line is: {}", lines[1] ); assert_eq!( - lines[1].matches("component=").count(), + lines[1].matches("service=").count(), 1, - "duplicate component key: {}", + "duplicate service key: {}", lines[1] ); } #[test] - fn test_component_inherited_by_nested_span() { + fn test_default_field_inherited_by_nested_span() { let writer = TestWriter::new(); let cloned_writer = writer.clone(); let layer = layer() .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) - .with_component("nico-api"); + .with_event_fields([EventField::with_default("service", "service-a")]); let _subscriber = tracing_subscriber::registry().with(layer).set_default(); - // Root span sets `component`; the nested child span does not. + // Root span sets `service`; the nested child span does not. let root = tracing::span!( tracing::Level::INFO, - "explore_site", + "outer", span_id = "root", - component = "site-explorer", + service = "service-b", ); let _root_entered = root.enter(); - let child = tracing::span!(tracing::Level::INFO, "probe_endpoint", span_id = "child"); + let child = tracing::span!(tracing::Level::INFO, "inner", span_id = "child"); let _child_entered = child.enter(); - tracing::warn!("redfish_timeout"); + tracing::warn!("inner_event"); let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); - // The event fires in the child span, which set no `component`; it must - // inherit `site-explorer` by walking up to the root span. + // The event fires in the child span, which set no `service`; it must + // inherit `service-b` from the root span. let event_line = lines .iter() - .find(|l| l.contains("msg=redfish_timeout")) + .find(|l| l.contains("msg=inner_event")) .expect("event line not found"); assert!( - event_line.contains("component=site-explorer"), - "nested event did not inherit component: {event_line}" + event_line.contains("service=service-b"), + "nested event did not inherit service: {event_line}" ); assert!( - !event_line.contains("component=nico-api"), + !event_line.contains("service=service-a"), "nested event fell back to default: {event_line}" ); } #[test] - fn test_no_component_when_unset() { + fn test_no_field_when_unset() { let writer = TestWriter::new(); let cloned_writer = writer.clone(); let layer = layer().with_writer(Arc::new(move || Box::new(cloned_writer.clone()))); @@ -1045,7 +1132,7 @@ mod tests { tracing::warn!("e"); let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); - // With no default and no span override, no `component` field is emitted, + // With no default and no span override, no `service` field is emitted, // guaranteeing byte-identical output for the existing snapshot tests. assert!( lines[0].starts_with(r#"level=WARN msg=e "#), @@ -1053,19 +1140,19 @@ mod tests { lines[0] ); assert!( - !lines[0].contains("component="), - "unexpected component: {}", + !lines[0].contains("service="), + "unexpected service: {}", lines[0] ); } #[test] - fn test_object_id_in_event_logs() { + fn test_extra_field_on_event_line() { let writer = TestWriter::new(); let cloned_writer = writer.clone(); let layer = layer() .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) - .with_event_fields(vec!["object_id".to_string()]); + .with_event_fields([EventField::new("request_id")]); let _subscriber = tracing_subscriber::registry().with(layer).set_default(); @@ -1073,11 +1160,11 @@ mod tests { tracing::Level::INFO, "test_span", span_id = "s1234", - object_id = "machine-abc-123", + request_id = "req-abc-123", ); let _entered = span.enter(); - tracing::info!("event inside span with object_id"); + tracing::info!("event inside span with request_id"); drop(_entered); drop(span); @@ -1090,11 +1177,271 @@ mod tests { lines.len(), lines ); - // Event should include both span_id and object_id + // Event should include both span_id and request_id assert!( - lines[0].starts_with(r#"level=INFO span_id=s1234 object_id=machine-abc-123 msg="event inside span with object_id" location=""#), + lines[0].starts_with(r#"level=INFO request_id=req-abc-123 span_id=s1234 msg="event inside span with request_id" location=""#), "Line is: {}", lines[0] ); } + + #[test] + fn test_extra_field_inherited_from_parent_span() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::new("request_id")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // Parent sets `request_id`; the nested child does not -> the child's event + // inherits it from the parent span. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + request_id = "req-1", + ); + let _root = root.enter(); + let child = tracing::span!(tracing::Level::INFO, "inner", span_id = "child"); + let _child = child.enter(); + tracing::info!("inner_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=inner_event")) + .expect("event line not found"); + assert!( + event_line.contains("request_id=req-1"), + "nested event did not inherit request_id: {event_line}" + ); + } + + #[test] + fn test_extra_field_child_value_overrides_parent() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::new("request_id")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // Both parent and child set `request_id` -> the child's own value wins + // over the inherited parent value; a parent never replaces a child's value. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + request_id = "req-parent", + ); + let _root = root.enter(); + let child = tracing::span!( + tracing::Level::INFO, + "inner", + span_id = "child", + request_id = "req-child", + ); + let _child = child.enter(); + tracing::info!("inner_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=inner_event")) + .expect("event line not found"); + assert!( + event_line.contains("request_id=req-child"), + "child's own request_id should win: {event_line}" + ); + assert!( + !event_line.contains("req-parent"), + "parent value leaked: {event_line}" + ); + } + + #[test] + fn test_extra_field_on_span_line() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::new("request_id")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // A plain extra field is emitted on span-close lines too (not just event + // lines): the span that sets it carries it on its own close line, and a + // nested span that doesn't set it inherits it onto its close line. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + request_id = "req-1", + ); + let _root = root.enter(); + let child = tracing::span!(tracing::Level::INFO, "inner", span_id = "child"); + drop(child); // child's SPAN line + drop(_root); + drop(root); // outer's SPAN line + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let child_span = lines + .iter() + .find(|l| l.contains("span_name=inner")) + .expect("child span line not found"); + let outer_span = lines + .iter() + .find(|l| l.contains("span_name=outer")) + .expect("outer span line not found"); + // Inherited onto the child's close line, present on the outer's, once each. + assert!( + child_span.contains("request_id=req-1"), + "child span line missing inherited request_id: {child_span}" + ); + assert_eq!( + child_span.matches("request_id=").count(), + 1, + "duplicate request_id on child span line: {child_span}" + ); + assert!( + outer_span.contains("request_id=req-1"), + "outer span line missing request_id: {outer_span}" + ); + assert_eq!( + outer_span.matches("request_id=").count(), + 1, + "duplicate request_id on outer span line: {outer_span}" + ); + } + + // --- event-field inheritance edge cases --- + + #[test] + fn test_explicit_parent_inheritance() { + // Inheritance must follow an explicit `parent:` link, not just the + // contextual current span. Here the child is created with an explicit + // parent that is NOT entered, so contextual lookup would miss it. + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let parent = tracing::span!( + tracing::Level::INFO, + "parent", + span_id = "p", + service = "service-b", + ); + // Note: parent is never entered. The child names it explicitly. + let child = tracing::span!(parent: &parent, tracing::Level::INFO, "child", span_id = "c"); + let _entered = child.enter(); + tracing::info!("explicit_parent_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=explicit_parent_event")) + .expect("event line not found"); + assert!( + event_line.contains("service=service-b"), + "explicit parent value not inherited: {event_line}" + ); + assert!( + !event_line.contains("service-a"), + "fell back to default instead of inheriting explicit parent: {event_line}" + ); + } + + #[test] + fn test_empty_field_recorded_after_creation_overrides_inherited() { + // A child declares the field as `Empty` (so it has no own value at + // creation and inherits the parent's resolved value), then fills it in + // via `span.record`. `on_record` must refresh the resolved value so the + // span's own value wins over the inherited one. + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let root = tracing::span!( + tracing::Level::INFO, + "root", + span_id = "root", + service = "parent-comp", + ); + let _root = root.enter(); + let child = tracing::span!( + tracing::Level::INFO, + "child", + span_id = "child", + service = tracing::field::Empty, + ); + let _child = child.enter(); + // Before record: should inherit `parent-comp`. + tracing::info!("before_record"); + child.record("service", "child-comp"); + // After record: the span's own value wins. + tracing::info!("after_record"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let before = lines + .iter() + .find(|l| l.contains("msg=before_record")) + .expect("before line not found"); + let after = lines + .iter() + .find(|l| l.contains("msg=after_record")) + .expect("after line not found"); + assert!( + before.contains("service=parent-comp"), + "child with Empty field did not inherit parent before record: {before}" + ); + assert!( + after.contains("service=child-comp"), + "on_record did not refresh resolved value: {after}" + ); + } + + #[test] + fn test_deeply_nested_inheritance() { + // Only the outermost span sets the field; deeply nested descendants must + // still see it. Each level copies its parent's already-resolved value, so + // the value propagates down through every nested span. + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let s1 = tracing::span!(tracing::Level::INFO, "s1", service = "deep-comp"); + let _e1 = s1.enter(); + let s2 = tracing::span!(tracing::Level::INFO, "s2"); + let _e2 = s2.enter(); + let s3 = tracing::span!(tracing::Level::INFO, "s3"); + let _e3 = s3.enter(); + let s4 = tracing::span!(tracing::Level::INFO, "s4"); + let _e4 = s4.enter(); + tracing::info!("deep_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=deep_event")) + .expect("event line not found"); + assert!( + event_line.contains("service=deep-comp"), + "deeply nested span did not inherit service: {event_line}" + ); + } } diff --git a/crates/rvs/src/bin/carbide-rvs.rs b/crates/rvs/src/bin/carbide-rvs.rs index 31f133ce40..e731d433a4 100644 --- a/crates/rvs/src/bin/carbide-rvs.rs +++ b/crates/rvs/src/bin/carbide-rvs.rs @@ -53,7 +53,10 @@ async fn main() -> Result<(), RvsError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer().with_component("nico-rvs")) + .with( + logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", "nico-rvs")]), + ) .with(env_filter) .init(); diff --git a/crates/rvs/src/bin/test-artifact-cache.rs b/crates/rvs/src/bin/test-artifact-cache.rs index d2e08e1953..45e0284e1c 100644 --- a/crates/rvs/src/bin/test-artifact-cache.rs +++ b/crates/rvs/src/bin/test-artifact-cache.rs @@ -73,7 +73,12 @@ async fn main() -> Result<(), RvsError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer().with_component("nico-test-artifact-cache")) + .with( + logfmt::layer().with_event_fields([logfmt::EventField::with_default( + "component", + "nico-test-artifact-cache", + )]), + ) .with(env_filter) .init(); diff --git a/docs/observability/logging.md b/docs/observability/logging.md index 992c94c7b0..abb074c0a1 100644 --- a/docs/observability/logging.md +++ b/docs/observability/logging.md @@ -43,7 +43,7 @@ RUST_LOG=info,carbide_site_explorer=debug # raise one module ## The `component` field -On logfmt lines, the `component` field is set to one of the following: +On logfmt lines, NICo sets the `component` field to one of the following: ``` nico-api — API handlers, DB, startup: anything not in a subsystem below @@ -72,13 +72,14 @@ State-controller lines also carry a `controller=` field with the same valu ### Adding it to new code - **New binary** — set the default when building the `logfmt` layer: - `logfmt::layer().with_component("nico-my-service")`. Host-side binaries (agent/scout) pass it through - `carbide_host_support::init_logging("nico-my-service")`. A binary that omits this emits no `component`. + `logfmt::layer().with_event_fields([logfmt::EventField::with_default("component", "nico-my-service")])` + — or pass the name to `carbide_host_support::init_logging("nico-my-service")` if the binary uses that + helper. A binary that omits this sets no default `component`. - **New in-process subsystem of `nico-api`** — add a `component` field to the subsystem's root span: `tracing::span!(parent: None, Level::INFO, "my_subsystem", component = "my-subsystem", /* … */)`. Nested instrumented functions and spawned tasks carried with `.instrument(...)` / `.in_current_span()` inherit it. -> **`component` is reserved for the emitting component** — don't reuse it as an event/span field for +> **By convention, NICo uses the `component` key for the emitting component** — don't reuse the key for > unrelated data. Give domain values their own key (e.g. `nico-hardware-health`'s firmware-inventory events > use `firmware_component`, not `component`).