diff --git a/crates/agent/src/ethernet_virtualization.rs b/crates/agent/src/ethernet_virtualization.rs index 30db817e98..5cbc8c342b 100644 --- a/crates/agent/src/ethernet_virtualization.rs +++ b/crates/agent/src/ethernet_virtualization.rs @@ -1886,7 +1886,7 @@ mod tests { use crate::{HBNDeviceNames, dhcp, nvue}; #[ctor::ctor(unsafe)] fn setup() { - carbide_host_support::init_logging().unwrap(); + carbide_host_support::init_logging("nico-dpu-agent").unwrap(); } #[test] diff --git a/crates/agent/src/main.rs b/crates/agent/src/main.rs index e4219e1187..c63c4d6cd4 100644 --- a/crates/agent/src/main.rs +++ b/crates/agent/src/main.rs @@ -18,7 +18,7 @@ use std::time::Duration; fn main() -> eyre::Result<()> { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; // We need a multi-threaded runtime since background threads will queue work // on it, and the foreground thread might not be blocked onto the runtime diff --git a/crates/agent/src/tests/full.rs b/crates/agent/src/tests/full.rs index 4b1922d3e2..90f9dd3f9e 100644 --- a/crates/agent/src/tests/full.rs +++ b/crates/agent/src/tests/full.rs @@ -276,7 +276,7 @@ async fn run_common_parts( virtualization_type: VpcVirtualizationType, test_metadata_service: bool, ) -> eyre::Result { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; let state: Arc> = Arc::new(Mutex::new(Default::default())); state.lock().await.virtualization_type = virtualization_type; diff --git a/crates/agent/src/tests/test_network_monitor.rs b/crates/agent/src/tests/test_network_monitor.rs index 413048b0e5..251bdf99f3 100644 --- a/crates/agent/src/tests/test_network_monitor.rs +++ b/crates/agent/src/tests/test_network_monitor.rs @@ -52,7 +52,7 @@ struct State { #[tokio::test] pub async fn test_network_monitor() -> eyre::Result<()> { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; let state: Arc> = Arc::new(Mutex::new(Default::default())); diff --git a/crates/agent/src/tests/upgrade.rs b/crates/agent/src/tests/upgrade.rs index d735b6498a..27b39923d3 100644 --- a/crates/agent/src/tests/upgrade.rs +++ b/crates/agent/src/tests/upgrade.rs @@ -29,7 +29,7 @@ const ROOT_CERT_PATH: &str = "dev/certs/forge_developer_local_only_root_cert_pem #[tokio::test] async fn test_upgrade_check() -> eyre::Result<()> { - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-dpu-agent")?; unsafe { env::set_var("DISABLE_TLS_ENFORCEMENT", "true"); diff --git a/crates/api-core/src/logging/setup.rs b/crates/api-core/src/logging/setup.rs index dcb8a0c0fa..20e64e704b 100644 --- a/crates/api-core/src/logging/setup.rs +++ b/crates/api-core/src/logging/setup.rs @@ -104,7 +104,13 @@ pub fn setup_logging( let (logfmt_stdout_filter, logfmt_stdout_reload_handle) = reload::Layer::new(initial_log_filter.clone()); - let logfmt_stdout_formatter = logfmt::layer().with_event_fields(extra_logfmt_event_fields); + let mut event_fields = vec![logfmt::EventField::with_default("component", "nico-api")]; + event_fields.extend( + extra_logfmt_event_fields + .into_iter() + .map(logfmt::EventField::new), + ); + let logfmt_stdout_formatter = logfmt::layer().with_event_fields(event_fields); let spancount_layer = spancounter::layer(); let spancount_reader = spancount_layer.reader(); diff --git a/crates/bmc-proxy/src/setup.rs b/crates/bmc-proxy/src/setup.rs index 11c6b12a1f..3ed0e0c66b 100644 --- a/crates/bmc-proxy/src/setup.rs +++ b/crates/bmc-proxy/src/setup.rs @@ -49,7 +49,14 @@ pub fn setup_logging(debug: bool) -> SetupResult<()> { ); tracing_subscriber::registry() - .with(logfmt::layer().with_filter(log_filter)) + .with( + logfmt::layer() + .with_event_fields([logfmt::EventField::with_default( + "component", + "nico-bmc-proxy", + )]) + .with_filter(log_filter), + ) .try_init()?; tracing::info!("current log level: {}", LevelFilter::current()); diff --git a/crates/dhcp-server/src/main.rs b/crates/dhcp-server/src/main.rs index ba3a85da44..b2528bd105 100644 --- a/crates/dhcp-server/src/main.rs +++ b/crates/dhcp-server/src/main.rs @@ -221,7 +221,11 @@ fn setup_tracing() -> Result<(), Box> { .add_directive("hickory_proto=info".parse().unwrap()); tracing_subscriber::registry() - .with(logfmt::layer().with_filter(env_filter)) + .with( + logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", "nico-dhcp")]) + .with_filter(env_filter), + ) .try_init()?; Ok(()) } diff --git a/crates/dsx-exchange-consumer/src/main.rs b/crates/dsx-exchange-consumer/src/main.rs index 8e59bdeba8..c303e78e11 100644 --- a/crates/dsx-exchange-consumer/src/main.rs +++ b/crates/dsx-exchange-consumer/src/main.rs @@ -31,7 +31,14 @@ async fn main() -> Result<(), DsxConsumerError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer().with_filter(env_filter)) + .with( + logfmt::layer() + .with_event_fields([logfmt::EventField::with_default( + "component", + "nico-dsx-exchange-consumer", + )]) + .with_filter(env_filter), + ) .try_init() .map_err(|e| DsxConsumerError::Config(e.to_string()))?; diff --git a/crates/fmds/src/main.rs b/crates/fmds/src/main.rs index 202235b86b..88a2211869 100644 --- a/crates/fmds/src/main.rs +++ b/crates/fmds/src/main.rs @@ -47,7 +47,8 @@ pub fn subscriber() -> impl SubscriberInitExt { .add_directive("hickory_resolver::name_server=info".parse().unwrap()) .add_directive("hickory_proto=info".parse().unwrap()) .add_directive("netlink_proto=warn".parse().unwrap()); - let stdout_formatter = logfmt::layer(); + let stdout_formatter = logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", "nico-fmds")]); Box::new(tracing_subscriber::registry().with(stdout_formatter.with_filter(env_filter))) } diff --git a/crates/health/src/main.rs b/crates/health/src/main.rs index 3fb08094c9..96aa7f5392 100644 --- a/crates/health/src/main.rs +++ b/crates/health/src/main.rs @@ -31,7 +31,12 @@ async fn main() -> Result<(), HealthError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer()) + .with( + logfmt::layer().with_event_fields([logfmt::EventField::with_default( + "component", + "nico-hardware-health", + )]), + ) .with(env_filter) .init(); diff --git a/crates/health/src/sink/tracing.rs b/crates/health/src/sink/tracing.rs index f253b486e0..b1aff54784 100644 --- a/crates/health/src/sink/tracing.rs +++ b/crates/health/src/sink/tracing.rs @@ -72,7 +72,7 @@ impl DataSink for TracingSink { tracing::info!( endpoint = %context.endpoint_key(), collector = %context.collector_type, - component = %info.component, + firmware_component = %info.component, version = %info.version, "Firmware info event" ); diff --git a/crates/host-support/src/lib.rs b/crates/host-support/src/lib.rs index b3329b85af..3e14d587b9 100644 --- a/crates/host-support/src/lib.rs +++ b/crates/host-support/src/lib.rs @@ -35,9 +35,13 @@ static LOG_SETUP: Once = Once::new(); /// Initialize global logging output to STDOUT. Applies to all threads. /// Use `export RUST_LOG=trace|debug|info|warn|error` to change log level. -pub fn init_logging() -> eyre::Result<()> { +/// +/// `component` tags every log line with `component=` (e.g. `nico-scout`, +/// `nico-dpu-agent`) so logs can be filtered by the emitting binary. It must be +/// passed by the caller because this setup is shared across binaries. +pub fn init_logging(component: &str) -> eyre::Result<()> { LOG_SETUP.call_once(|| { - subscriber() + subscriber(component) .try_init() .expect("tracing_subscriber setup failed"); }); @@ -47,9 +51,9 @@ pub fn init_logging() -> eyre::Result<()> { // A logging subscriber for use on the current thread. // Usually you want `init_logging()` instead. // -// Usage: `let guard = subscriber().set_default()` +// Usage: `let guard = subscriber("nico-scout").set_default()` // Subscriber is unregistered when guard is dropped. -pub fn subscriber() -> impl SubscriberInitExt { +pub fn subscriber(component: &str) -> impl SubscriberInitExt { let env_filter = EnvFilter::builder() .with_default_directive(LevelFilter::INFO.into()) .from_env_lossy() @@ -64,6 +68,7 @@ pub fn subscriber() -> impl SubscriberInitExt { .add_directive("hickory_resolver::name_server=info".parse().unwrap()) .add_directive("hickory_proto=info".parse().unwrap()) .add_directive("netlink_proto=warn".parse().unwrap()); - let stdout_formatter = logfmt::layer(); + let stdout_formatter = logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", component)]); Box::new(tracing_subscriber::registry().with(stdout_formatter.with_filter(env_filter))) } diff --git a/crates/logfmt/src/lib.rs b/crates/logfmt/src/lib.rs index fec9bacd6d..fd5931596b 100644 --- a/crates/logfmt/src/lib.rs +++ b/crates/logfmt/src/lib.rs @@ -27,7 +27,7 @@ use tracing::span::{self, Attributes}; use tracing::{Event, Subscriber}; use tracing_subscriber::Layer; use tracing_subscriber::layer::Context; -use tracing_subscriber::registry::{LookupSpan, SpanRef}; +use tracing_subscriber::registry::LookupSpan; /// Construct a new `LogFmtLayer` pub fn layer() -> LogFmtLayer @@ -39,7 +39,37 @@ where make_writer: Arc::new(|| Box::new(std::io::stdout())), emit_span_logs: true, write_end_time: false, - extra_event_fields: Vec::new(), + event_fields: Vec::new(), + } +} + +/// A span attribute surfaced on event and `level=SPAN` lines. +/// +/// On each line the rendered value is, in precedence order: the value the line's +/// own span set for `name`; else the value inherited from a parent span; else the +/// configured default, if any; else the field is omitted. So a field built with +/// [`EventField::with_default`] renders on every line, and one built with +/// [`EventField::new`] renders only where a span set or inherited it. +pub struct EventField { + name: String, + default: Option, +} + +impl EventField { + /// Surfaced only when a span sets or inherits it (no fallback). + pub fn new(name: impl Into) -> Self { + Self { + name: name.into(), + default: None, + } + } + + /// Surfaced on every line, falling back to `default` when no span set it. + pub fn with_default(name: impl Into, default: impl Into) -> Self { + Self { + name: name.into(), + default: Some(default.into()), + } } } @@ -56,7 +86,9 @@ pub struct LogFmtLayer { make_writer: Arc Box + Send + Sync>, emit_span_logs: bool, write_end_time: bool, - extra_event_fields: Vec, + /// Span attributes surfaced on event and span lines, each with an optional + /// fallback. See [`EventField`]. Set via `with_event_fields`. + event_fields: Vec, } impl LogFmtLayer @@ -87,13 +119,47 @@ where } } - /// Add extra span attribute keys to emit in event logs - pub fn with_event_fields(self, extra_event_fields: Vec) -> Self { + /// Surface span attributes on event and `level=SPAN` lines. Each [`EventField`] + /// renders with its span's own value, else an inherited parent value, else its + /// default (if any). Calling this replaces the field set (it assigns, it does + /// not append); duplicate names are dropped (the first wins) so a name is never + /// emitted twice. + pub fn with_event_fields(self, fields: impl IntoIterator) -> Self { + let mut seen = std::collections::HashSet::new(); Self { - extra_event_fields, + event_fields: fields + .into_iter() + .filter(|field| seen.insert(field.name.clone())) + .collect(), ..self } } + + /// Iterate the names of every configured event field. Used to decide which of + /// a span's attributes are inherited by its child spans. + fn configured_field_names(&self) -> impl Iterator { + self.event_fields.iter().map(|field| field.name.as_str()) + } + + /// Resolve every configured event field for a line from a span's + /// `resolved_event_fields` (its own or inherited value), applying each field's + /// fallback. A field with a default always yields a value; one without is + /// included only when the span set or inherited it. + fn resolve_event_fields( + &self, + resolved: Option<&BTreeMap>, + ) -> Vec<(String, String)> { + self.event_fields + .iter() + .filter_map(|field| { + let value = resolved + .and_then(|fields| fields.get(&field.name)) + .cloned() + .or_else(|| field.default.clone()); + value.map(|value| (field.name.clone(), value)) + }) + .collect() + } } struct Timing { @@ -126,6 +192,22 @@ struct LogFmtData { /// Formatted Span attributes /// This is a BTreeMap to guarantee order when formatting attributes: BTreeMap, + /// The resolved value of each configured event field for this span (one entry + /// per `EventField` set on the layer that this span or an ancestor set). Kept + /// separate from `attributes` because it also holds values inherited from + /// ancestor spans, which the span never recorded itself. + /// + /// Computed when the span is created (`on_new_span`): this span's own + /// attribute if it set one, otherwise the value inherited from its parent + /// span. Refreshed in `on_record` if the span sets a value later. Event and + /// span lines read this map directly. A name is absent when neither this span + /// nor any of its parent spans set it; the configured default (or omission) is + /// applied at write time. + /// + /// Inheritance is captured at child-creation time: a parent that sets a field + /// (via `record`) after a child span already exists does not propagate to that + /// child. + resolved_event_fields: BTreeMap, } impl LogFmtData { @@ -134,6 +216,7 @@ impl LogFmtData { timing: Timing::new(), suppressed: false, attributes: BTreeMap::new(), + resolved_event_fields: BTreeMap::new(), } } @@ -160,6 +243,44 @@ where let mut visitor = SpanAttributeVisitor { data: &mut data }; attrs.record(&mut visitor); + // Resolve every configured field's value now, when the span is + // created, so log lines can read it directly. + // + // Resolve the parent span before locking the new span's extensions. + // `attrs.parent()` gives an explicit parent (`span!(parent: ..)`); when + // absent and the span is not a root, the parent is the contextual current + // span. Copying the parent's already-resolved `resolved_event_fields` (a + // small map — only the configured fields) is what lets a field set on a + // parent span be inherited here. + let parent = if let Some(parent_id) = attrs.parent() { + ctx.span(parent_id) + } else if attrs.is_root() { + None + } else { + ctx.lookup_current() + }; + let parent_resolved = parent.as_ref().map(|p| { + p.extensions() + .get::() + .map(|d| d.resolved_event_fields.clone()) + .unwrap_or_default() + }); + + for name in self.configured_field_names() { + if let Some(value) = data.attributes.get(name) { + // This span set the field itself -> its own value wins. + data.resolved_event_fields + .insert(name.to_string(), value.clone()); + } else if let Some(value) = parent_resolved.as_ref().and_then(|fields| fields.get(name)) + { + // Inherit the parent's already-resolved value. + data.resolved_event_fields + .insert(name.to_string(), value.clone()); + } + // Otherwise leave it unset; the default (or omission) is applied at + // write time. + } + let span = ctx.span(id).expect("Span not found, this is a bug"); let mut extensions = span.extensions_mut(); extensions.insert(data); @@ -194,42 +315,62 @@ where let mut extensions = span.extensions_mut(); if let Some(data) = extensions.get_mut::() { values.record(&mut SpanAttributeVisitor { data }); + + // A value recorded after span creation (e.g. a field declared + // `tracing::field::Empty` and filled in via `span.record`) must update + // the span's resolved value. A span's own value always wins over an + // inherited one, so re-sync each configured field that now has a + // concrete attribute value. + for name in self.configured_field_names() { + if let Some(value) = data.attributes.get(name).cloned() { + data.resolved_event_fields.insert(name.to_string(), value); + } + } } } fn on_follows_from(&self, _id: &span::Id, _follows: &span::Id, _ctx: Context) {} fn on_event(&self, event: &Event<'_>, ctx: Context<'_, S>) { - // If the event doesn't happen in the scope of a Span, we don't log some span data let current_span = ctx.lookup_current(); + // Resolve the event fields and read `span_id` under a single extensions + // borrow (no extra clone of the span's resolved map). Outside any span the + // fields fall back to their defaults (or are omitted) and there is no + // `span_id`. + let (event_fields, span_id) = match current_span.as_ref() { + Some(span) => { + let ext = span.extensions(); + let data = ext + .get::() + .expect("Unable to find LogFmtData in extensions; this is a bug"); + let fields = self.resolve_event_fields(Some(&data.resolved_event_fields)); + let span_id = data.attributes.get("span_id").cloned(); + (fields, span_id) + } + None => (self.resolve_event_fields(None), None), + }; + /// This formatting subfunction exists so that we can use the ? operator /// on write! results - fn write_event_data<'a, R: LookupSpan<'a>>( + fn write_event_data( event: &Event<'_>, - current_span: Option<&SpanRef<'a, R>>, out: &mut Vec, - extra_event_fields: &[String], + event_fields: &[(String, String)], + span_id: Option<&str>, ) -> Result<(), std::io::Error> { write!(out, "level={} ", event.metadata().level())?; - // TODO: More metadata? - - if let Some(current_span) = current_span { - // Get the span_id to correlate the event with the actual span - let ext = current_span.extensions(); - let data = ext - .get::() - .expect("Unable to find LogFmtData in extensions; this is a bug"); + // Configured event fields (the span's own or inherited value, else a + // default) are emitted right after `level=`, so every line carries + // them — including events emitted outside any span. + for (name, value) in event_fields { + write!(out, "{} ", kvp(name, value))?; + } - if let Some(span_id) = data.attributes.get("span_id") { - write!(out, "{} ", kvp("span_id", span_id))?; - } - for key in extra_event_fields { - if let Some(value) = data.attributes.get(key) { - write!(out, "{} ", kvp(key, value))?; - } - } + // span_id correlates the event with the span it was logged in. + if let Some(span_id) = span_id { + write!(out, "{} ", kvp("span_id", span_id))?; } let mut visitor = FieldVisitor { @@ -259,12 +400,7 @@ where // The format buffer is kept around as a threadlocal variable to reduce allocations FORMAT_BUFFER.with(|fbuf| { let format_buffer: &mut Vec = &mut fbuf.borrow_mut(); - if let Ok(()) = write_event_data( - event, - current_span.as_ref(), - format_buffer, - &self.extra_event_fields, - ) { + if write_event_data(event, format_buffer, &event_fields, span_id.as_deref()).is_ok() { let mut writer = (self.make_writer)(); let _ = writer.write_all(format_buffer); } @@ -294,6 +430,9 @@ where return; } + // Resolve from this span's own resolved values. + let event_fields = self.resolve_event_fields(Some(&data.resolved_event_fields)); + data.attributes.insert( "timing_start_time".to_string(), format!("{:?}", data.timing.start_time), @@ -332,9 +471,18 @@ where span_metadata: &tracing::Metadata, mut data: LogFmtData, out: &mut Vec, + event_fields: &[(String, String)], ) -> Result<(), std::io::Error> { write!(out, "level=SPAN")?; + // Resolved event fields are emitted right after `level=SPAN`, mirroring + // event lines. Remove each from the span's own attributes first so it + // is not emitted twice (a span may set the attribute directly). + for (name, value) in event_fields { + data.attributes.remove(name); + write!(out, " {}", kvp(name, value))?; + } + // Start writing the span_id and span_name for consistency if let Some(value) = data.attributes.remove("span_id") { write!(out, " {}", kvp("span_id", value))?; @@ -356,7 +504,7 @@ where // The format buffer is kept around as a threadlocal variable to reduce allocations FORMAT_BUFFER.with(|fbuf| { let format_buffer: &mut Vec = &mut fbuf.borrow_mut(); - if let Ok(()) = write_span_data(span.metadata(), data, format_buffer) { + if let Ok(()) = write_span_data(span.metadata(), data, format_buffer, &event_fields) { let mut writer = (self.make_writer)(); let _ = writer.write_all(format_buffer); } @@ -850,12 +998,161 @@ mod tests { } #[test] - fn test_object_id_in_event_logs() { + fn test_default_field_on_event_and_span() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let span = tracing::span!(tracing::Level::INFO, "test_span", span_id = "s1234",); + let _entered = span.enter(); + tracing::info!("inside"); + drop(_entered); + drop(span); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + assert_eq!(lines.len(), 2, "got {}: {:?}", lines.len(), lines); + // The default is emitted right after `level=` on both lines. + assert!( + lines[0].starts_with(r#"level=INFO service=service-a span_id=s1234 msg=inside "#), + "Line is: {}", + lines[0] + ); + assert!( + lines[1] + .starts_with(r#"level=SPAN service=service-a span_id=s1234 span_name=test_span "#), + "Line is: {}", + lines[1] + ); + } + + #[test] + fn test_span_value_overrides_default_once() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let span = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "s1", + service = "service-b", + ); + let _entered = span.enter(); + tracing::info!("inside"); + drop(_entered); + drop(span); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + assert_eq!(lines.len(), 2, "got {}: {:?}", lines.len(), lines); + // The span value overrides the default, and `service` appears once. + assert!( + lines[0].starts_with(r#"level=INFO service=service-b span_id=s1 msg=inside "#), + "Line is: {}", + lines[0] + ); + assert!( + !lines[0].contains("service-a"), + "default leaked: {}", + lines[0] + ); + assert_eq!( + lines[0].matches("service=").count(), + 1, + "duplicate service key: {}", + lines[0] + ); + assert!( + lines[1].starts_with(r#"level=SPAN service=service-b span_id=s1 span_name=outer "#), + "Line is: {}", + lines[1] + ); + assert_eq!( + lines[1].matches("service=").count(), + 1, + "duplicate service key: {}", + lines[1] + ); + } + + #[test] + fn test_default_field_inherited_by_nested_span() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // Root span sets `service`; the nested child span does not. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + service = "service-b", + ); + let _root_entered = root.enter(); + let child = tracing::span!(tracing::Level::INFO, "inner", span_id = "child"); + let _child_entered = child.enter(); + tracing::warn!("inner_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + // The event fires in the child span, which set no `service`; it must + // inherit `service-b` from the root span. + let event_line = lines + .iter() + .find(|l| l.contains("msg=inner_event")) + .expect("event line not found"); + assert!( + event_line.contains("service=service-b"), + "nested event did not inherit service: {event_line}" + ); + assert!( + !event_line.contains("service=service-a"), + "nested event fell back to default: {event_line}" + ); + } + + #[test] + fn test_no_field_when_unset() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer().with_writer(Arc::new(move || Box::new(cloned_writer.clone()))); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + tracing::warn!("e"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + // With no default and no span override, no `service` field is emitted, + // guaranteeing byte-identical output for the existing snapshot tests. + assert!( + lines[0].starts_with(r#"level=WARN msg=e "#), + "Line is: {}", + lines[0] + ); + assert!( + !lines[0].contains("service="), + "unexpected service: {}", + lines[0] + ); + } + + #[test] + fn test_extra_field_on_event_line() { let writer = TestWriter::new(); let cloned_writer = writer.clone(); let layer = layer() .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) - .with_event_fields(vec!["object_id".to_string()]); + .with_event_fields([EventField::new("request_id")]); let _subscriber = tracing_subscriber::registry().with(layer).set_default(); @@ -863,11 +1160,11 @@ mod tests { tracing::Level::INFO, "test_span", span_id = "s1234", - object_id = "machine-abc-123", + request_id = "req-abc-123", ); let _entered = span.enter(); - tracing::info!("event inside span with object_id"); + tracing::info!("event inside span with request_id"); drop(_entered); drop(span); @@ -880,11 +1177,271 @@ mod tests { lines.len(), lines ); - // Event should include both span_id and object_id + // Event should include both span_id and request_id assert!( - lines[0].starts_with(r#"level=INFO span_id=s1234 object_id=machine-abc-123 msg="event inside span with object_id" location=""#), + lines[0].starts_with(r#"level=INFO request_id=req-abc-123 span_id=s1234 msg="event inside span with request_id" location=""#), "Line is: {}", lines[0] ); } + + #[test] + fn test_extra_field_inherited_from_parent_span() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::new("request_id")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // Parent sets `request_id`; the nested child does not -> the child's event + // inherits it from the parent span. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + request_id = "req-1", + ); + let _root = root.enter(); + let child = tracing::span!(tracing::Level::INFO, "inner", span_id = "child"); + let _child = child.enter(); + tracing::info!("inner_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=inner_event")) + .expect("event line not found"); + assert!( + event_line.contains("request_id=req-1"), + "nested event did not inherit request_id: {event_line}" + ); + } + + #[test] + fn test_extra_field_child_value_overrides_parent() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::new("request_id")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // Both parent and child set `request_id` -> the child's own value wins + // over the inherited parent value; a parent never replaces a child's value. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + request_id = "req-parent", + ); + let _root = root.enter(); + let child = tracing::span!( + tracing::Level::INFO, + "inner", + span_id = "child", + request_id = "req-child", + ); + let _child = child.enter(); + tracing::info!("inner_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=inner_event")) + .expect("event line not found"); + assert!( + event_line.contains("request_id=req-child"), + "child's own request_id should win: {event_line}" + ); + assert!( + !event_line.contains("req-parent"), + "parent value leaked: {event_line}" + ); + } + + #[test] + fn test_extra_field_on_span_line() { + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::new("request_id")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + // A plain extra field is emitted on span-close lines too (not just event + // lines): the span that sets it carries it on its own close line, and a + // nested span that doesn't set it inherits it onto its close line. + let root = tracing::span!( + tracing::Level::INFO, + "outer", + span_id = "root", + request_id = "req-1", + ); + let _root = root.enter(); + let child = tracing::span!(tracing::Level::INFO, "inner", span_id = "child"); + drop(child); // child's SPAN line + drop(_root); + drop(root); // outer's SPAN line + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let child_span = lines + .iter() + .find(|l| l.contains("span_name=inner")) + .expect("child span line not found"); + let outer_span = lines + .iter() + .find(|l| l.contains("span_name=outer")) + .expect("outer span line not found"); + // Inherited onto the child's close line, present on the outer's, once each. + assert!( + child_span.contains("request_id=req-1"), + "child span line missing inherited request_id: {child_span}" + ); + assert_eq!( + child_span.matches("request_id=").count(), + 1, + "duplicate request_id on child span line: {child_span}" + ); + assert!( + outer_span.contains("request_id=req-1"), + "outer span line missing request_id: {outer_span}" + ); + assert_eq!( + outer_span.matches("request_id=").count(), + 1, + "duplicate request_id on outer span line: {outer_span}" + ); + } + + // --- event-field inheritance edge cases --- + + #[test] + fn test_explicit_parent_inheritance() { + // Inheritance must follow an explicit `parent:` link, not just the + // contextual current span. Here the child is created with an explicit + // parent that is NOT entered, so contextual lookup would miss it. + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let parent = tracing::span!( + tracing::Level::INFO, + "parent", + span_id = "p", + service = "service-b", + ); + // Note: parent is never entered. The child names it explicitly. + let child = tracing::span!(parent: &parent, tracing::Level::INFO, "child", span_id = "c"); + let _entered = child.enter(); + tracing::info!("explicit_parent_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=explicit_parent_event")) + .expect("event line not found"); + assert!( + event_line.contains("service=service-b"), + "explicit parent value not inherited: {event_line}" + ); + assert!( + !event_line.contains("service-a"), + "fell back to default instead of inheriting explicit parent: {event_line}" + ); + } + + #[test] + fn test_empty_field_recorded_after_creation_overrides_inherited() { + // A child declares the field as `Empty` (so it has no own value at + // creation and inherits the parent's resolved value), then fills it in + // via `span.record`. `on_record` must refresh the resolved value so the + // span's own value wins over the inherited one. + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let root = tracing::span!( + tracing::Level::INFO, + "root", + span_id = "root", + service = "parent-comp", + ); + let _root = root.enter(); + let child = tracing::span!( + tracing::Level::INFO, + "child", + span_id = "child", + service = tracing::field::Empty, + ); + let _child = child.enter(); + // Before record: should inherit `parent-comp`. + tracing::info!("before_record"); + child.record("service", "child-comp"); + // After record: the span's own value wins. + tracing::info!("after_record"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let before = lines + .iter() + .find(|l| l.contains("msg=before_record")) + .expect("before line not found"); + let after = lines + .iter() + .find(|l| l.contains("msg=after_record")) + .expect("after line not found"); + assert!( + before.contains("service=parent-comp"), + "child with Empty field did not inherit parent before record: {before}" + ); + assert!( + after.contains("service=child-comp"), + "on_record did not refresh resolved value: {after}" + ); + } + + #[test] + fn test_deeply_nested_inheritance() { + // Only the outermost span sets the field; deeply nested descendants must + // still see it. Each level copies its parent's already-resolved value, so + // the value propagates down through every nested span. + let writer = TestWriter::new(); + let cloned_writer = writer.clone(); + let layer = layer() + .with_writer(Arc::new(move || Box::new(cloned_writer.clone()))) + .with_event_fields([EventField::with_default("service", "service-a")]); + + let _subscriber = tracing_subscriber::registry().with(layer).set_default(); + + let s1 = tracing::span!(tracing::Level::INFO, "s1", service = "deep-comp"); + let _e1 = s1.enter(); + let s2 = tracing::span!(tracing::Level::INFO, "s2"); + let _e2 = s2.enter(); + let s3 = tracing::span!(tracing::Level::INFO, "s3"); + let _e3 = s3.enter(); + let s4 = tracing::span!(tracing::Level::INFO, "s4"); + let _e4 = s4.enter(); + tracing::info!("deep_event"); + + let lines: Vec = writer.text().lines().map(ToString::to_string).collect(); + let event_line = lines + .iter() + .find(|l| l.contains("msg=deep_event")) + .expect("event line not found"); + assert!( + event_line.contains("service=deep-comp"), + "deeply nested span did not inherit service: {event_line}" + ); + } } diff --git a/crates/rvs/src/bin/carbide-rvs.rs b/crates/rvs/src/bin/carbide-rvs.rs index 8caa2cde24..e731d433a4 100644 --- a/crates/rvs/src/bin/carbide-rvs.rs +++ b/crates/rvs/src/bin/carbide-rvs.rs @@ -53,7 +53,10 @@ async fn main() -> Result<(), RvsError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer()) + .with( + logfmt::layer() + .with_event_fields([logfmt::EventField::with_default("component", "nico-rvs")]), + ) .with(env_filter) .init(); diff --git a/crates/rvs/src/bin/test-artifact-cache.rs b/crates/rvs/src/bin/test-artifact-cache.rs index 018796f1a4..45e0284e1c 100644 --- a/crates/rvs/src/bin/test-artifact-cache.rs +++ b/crates/rvs/src/bin/test-artifact-cache.rs @@ -73,7 +73,12 @@ async fn main() -> Result<(), RvsError> { .from_env_lossy(); tracing_subscriber::registry() - .with(logfmt::layer()) + .with( + logfmt::layer().with_event_fields([logfmt::EventField::with_default( + "component", + "nico-test-artifact-cache", + )]), + ) .with(env_filter) .init(); diff --git a/crates/scout/src/main.rs b/crates/scout/src/main.rs index 61626a5a63..695b2563d5 100644 --- a/crates/scout/src/main.rs +++ b/crates/scout/src/main.rs @@ -95,7 +95,7 @@ async fn main() -> Result<(), eyre::Report> { check_if_running_in_qemu().await; - carbide_host_support::init_logging()?; + carbide_host_support::init_logging("nico-scout")?; tracing::info!("Running as {}...{}", config.mode, config.version); diff --git a/crates/site-explorer/src/lib.rs b/crates/site-explorer/src/lib.rs index 18a3d135dd..a44e6b3021 100644 --- a/crates/site-explorer/src/lib.rs +++ b/crates/site-explorer/src/lib.rs @@ -410,6 +410,7 @@ impl SiteExplorer { "explore_site", span_id, carbide.trace_root = true, + component = "site-explorer", otel.status_code = tracing::field::Empty, otel.status_message = tracing::field::Empty, created_machines = tracing::field::Empty, diff --git a/crates/state-controller/src/controller/periodic_enqueuer.rs b/crates/state-controller/src/controller/periodic_enqueuer.rs index 017b0f43a3..47858c8d61 100644 --- a/crates/state-controller/src/controller/periodic_enqueuer.rs +++ b/crates/state-controller/src/controller/periodic_enqueuer.rs @@ -118,6 +118,7 @@ impl PeriodicEnqueuer { span_id, carbide.trace_root = true, controller = IO::LOG_SPAN_CONTROLLER_NAME, + component = IO::LOG_SPAN_CONTROLLER_NAME, iteration_id = tracing::field::Empty, otel.status_code = tracing::field::Empty, otel.status_message = tracing::field::Empty, diff --git a/crates/state-controller/src/controller/processor.rs b/crates/state-controller/src/controller/processor.rs index 9edace5210..315fbe233e 100644 --- a/crates/state-controller/src/controller/processor.rs +++ b/crates/state-controller/src/controller/processor.rs @@ -213,6 +213,7 @@ impl StateProcessor { span_id, carbide.trace_root = true, controller = IO::LOG_SPAN_CONTROLLER_NAME, + component = IO::LOG_SPAN_CONTROLLER_NAME, ); async { diff --git a/docs/index.yml b/docs/index.yml index 5e4e6f8355..661a58da3f 100644 --- a/docs/index.yml +++ b/docs/index.yml @@ -194,6 +194,8 @@ navigation: path: observability/core_metrics.md - page: Traces path: observability/tracing.md + - page: Logging + path: observability/logging.md - page: NICo Debug WebUI path: operations/debug_webui.md - section: Playbooks diff --git a/docs/observability/logging.md b/docs/observability/logging.md new file mode 100644 index 0000000000..abb074c0a1 --- /dev/null +++ b/docs/observability/logging.md @@ -0,0 +1,97 @@ +# NICo Logging + +How NICo services emit logs, the structured fields each line carries, and how to set log levels. + +--- + +## TL;DR + +- **Most** NICo services log in **[logfmt](https://brandur.org/logfmt)** (`key=value` pairs) to **stdout**; + a few use other formats (e.g. `nico-dns` emits JSON — see [Coverage](#coverage)). +- Every **logfmt** line begins with `level=` and carries a **`component=`** field identifying the emitting + component, so logs can be filtered by component instead of grepping the message text. +- Log verbosity is controlled by `RUST_LOG` (default `info`). +- For distributed **tracing** (spans exported via OTLP), see [Traces](tracing.md). + +## Log format + +There are two kinds of line. + +**Events** — one per log call: +``` +level=INFO component=nico-api span_id=0x4f… msg="…" location="crates/api-core/src/…" +``` + +**Spans** — emitted when a unit of work closes (`level=SPAN`), carrying the span name, id, its attributes, +and timing: +``` +level=SPAN component=site-explorer span_id=0xf7… span_name=explore_site timing_elapsed_us=… timing_busy_ns=… timing_idle_ns=… +``` + +Common keys: `level`, `component`, `msg`, `location` (`file:line`); `span_id` when the line is inside a +span; plus any structured fields the call site adds (e.g. `controller=`, `object_id=`). + +## Log level + +Verbosity is an [`EnvFilter`](https://docs.rs/tracing-subscriber) directive set from `RUST_LOG`, defaulting +to `info`: +``` +RUST_LOG=debug +RUST_LOG=info,carbide_site_explorer=debug # raise one module +``` +`nico-api` can also adjust its filter at runtime via `nico-admin-cli set log-filter`. + +## The `component` field + +On logfmt lines, NICo sets the `component` field to one of the following: + +``` +nico-api — API handlers, DB, startup: anything not in a subsystem below +├── site-explorer +├── machine_state_controller +├── switch_controller +├── rack_controller +├── power_shelf_controller +├── network_segments_controller +├── vpc_prefix_controller +├── ib_partition_controller +└── attestation_controller +nico-bmc-proxy +nico-dhcp +nico-dsx-exchange-consumer +nico-fmds +nico-hardware-health +nico-rvs +nico-test-artifact-cache +nico-dpu-agent +nico-scout +``` + +State-controller lines also carry a `controller=` field with the same value. + +### Adding it to new code + +- **New binary** — set the default when building the `logfmt` layer: + `logfmt::layer().with_event_fields([logfmt::EventField::with_default("component", "nico-my-service")])` + — or pass the name to `carbide_host_support::init_logging("nico-my-service")` if the binary uses that + helper. A binary that omits this sets no default `component`. +- **New in-process subsystem of `nico-api`** — add a `component` field to the subsystem's root span: + `tracing::span!(parent: None, Level::INFO, "my_subsystem", component = "my-subsystem", /* … */)`. Nested + instrumented functions and spawned tasks carried with `.instrument(...)` / `.in_current_span()` inherit it. + +> **By convention, NICo uses the `component` key for the emitting component** — don't reuse the key for +> unrelated data. Give domain values their own key (e.g. `nico-hardware-health`'s firmware-inventory events +> use `firmware_component`, not `component`). + +### Coverage + +Binaries that do not use the `logfmt` layer carry no `component`: `nico-dns` (emits JSON), `nico-pxe` +(hand-rolled formatter), `nico-ssh-console`, `nico-dpu-otel-agent`. CLI/dev tools and mocks are out of scope. + +## Querying + +Parse the logfmt line and filter on the field in whatever store the logs land in. For example, with logql +(Loki): +```logql +{namespace="nico-system"} | logfmt | component="site-explorer" +```