From 0a4b1bcce30eedc45d1fb1ff631745b7dedb04ae Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Fri, 29 May 2026 15:12:47 -0400 Subject: [PATCH 1/2] fix(metrics): emit OOM metric when max_memory_used == memory_size, with per-request dedup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Customer report (#1237): a Node.js Lambda that hit its memory limit (Memory Size 192 MB / Max Memory Used 192 MB, Status: timeout) did not emit aws.lambda.enhanced.out_of_memory because none of the existing detection paths matched. The Node runtime did not log "JavaScript heap out of memory" (V8 spent its time in GC instead of declaring an OOM), and PlatformRuntimeDone reported no error_type — just a wall-clock timeout — so the log-string and Runtime.OutOfMemory paths both stayed silent. Drop the provided.al* restriction on the PlatformReport equality check so any runtime emits OOM when max_memory_used_mb == memory_size_mb. To avoid double-counting against the two pre-existing paths (some invocations satisfy both equality and Runtime.OutOfMemory at the same time), add a per-Context oom_emitted flag. All three detection paths now funnel through Processor::try_increment_oom_metric, which checks the flag, sets it on first emission, and is a no-op on subsequent calls for the same request_id. The flag lives with the per-invocation Context and is cleared automatically when on_platform_report removes the context. Plumbing: Event::OutOfMemory now carries an Option<String> request_id (the log-path detector reads it from the logs processor's invocation_context.request_id, set on PlatformStart and cleared on PlatformRuntimeDone). When request_id is None — only realistic in Managed Instance mode, where extensions cannot subscribe to INVOKE — the helper falls back to a best-effort emit without dedup.
Tests cover three scenarios: same request_id emits exactly once, two distinct request_ids each emit, and the equality path still fires (regression coverage for the dropped provided.al* check). Co-Authored-By: Claude Opus 4.7 (1M context) --- bottlecap/src/bin/bottlecap/main.rs | 7 +- bottlecap/src/event_bus/mod.rs | 8 +- bottlecap/src/lifecycle/invocation/context.rs | 7 + .../src/lifecycle/invocation/processor.rs | 185 ++++++++++++++++-- .../lifecycle/invocation/processor_service.rs | 15 +- bottlecap/src/logs/lambda/processor.rs | 38 +++- bottlecap/src/metrics/enhanced/lambda.rs | 12 +- 7 files changed, 245 insertions(+), 27 deletions(-) diff --git a/bottlecap/src/bin/bottlecap/main.rs b/bottlecap/src/bin/bottlecap/main.rs index 3dcc13bb2..a41a3f51f 100644 --- a/bottlecap/src/bin/bottlecap/main.rs +++ b/bottlecap/src/bin/bottlecap/main.rs @@ -841,9 +841,12 @@ async fn handle_event_bus_event( stats_concentrator: StatsConcentratorHandle, ) -> Option { match event { - Event::OutOfMemory(event_timestamp) => { + Event::OutOfMemory { + request_id, + timestamp, + } => { if let Err(e) = invocation_processor_handle - .on_out_of_memory_error(event_timestamp) + .on_out_of_memory_error(request_id, timestamp) .await { error!("Failed to send out of memory error to processor: {}", e); diff --git a/bottlecap/src/event_bus/mod.rs b/bottlecap/src/event_bus/mod.rs index 0ea20969e..0be3a86ca 100644 --- a/bottlecap/src/event_bus/mod.rs +++ b/bottlecap/src/event_bus/mod.rs @@ -7,7 +7,13 @@ mod constants; #[derive(Debug)] pub enum Event { Telemetry(TelemetryEvent), - OutOfMemory(i64), + OutOfMemory { + /// Lambda `request_id` of the invocation the OOM belongs to, when known. + /// Used by the invocation processor to dedupe against other OOM detection + /// paths (`PlatformRuntimeDone` `error_type`, `PlatformReport` memory equality). 
+ request_id: Option, + timestamp: i64, + }, Tombstone, } diff --git a/bottlecap/src/lifecycle/invocation/context.rs b/bottlecap/src/lifecycle/invocation/context.rs index 04894f9c6..3aef5e4bf 100644 --- a/bottlecap/src/lifecycle/invocation/context.rs +++ b/bottlecap/src/lifecycle/invocation/context.rs @@ -43,6 +43,12 @@ pub struct Context { /// tracing. /// pub extracted_span_context: Option, + /// Whether the `aws.lambda.enhanced.out_of_memory` metric has already been + /// emitted for this invocation. Multiple detection paths can fire for the + /// same OOM (runtime log, `Runtime.OutOfMemory` `error_type` in + /// `PlatformRuntimeDone`, `max_memory_used == memory_size` in `PlatformReport`); + /// this flag dedupes them. + pub oom_emitted: bool, } /// Struct containing the information needed to reparent a span. @@ -94,6 +100,7 @@ impl Default for Context { snapstart_restore_span: None, tracer_span: None, extracted_span_context: None, + oom_emitted: false, } } } diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index ec1af99ab..c77c224a8 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -508,7 +508,7 @@ impl Processor { debug!( "Invocation Processor | PlatformRuntimeDone | Got Runtime.OutOfMemory. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } } @@ -909,25 +909,25 @@ impl Processor { /// Handles `OnDemand` mode platform report processing. /// - /// Processes OnDemand-specific metrics including OOM detection for provided.al runtimes - /// and post-runtime duration calculation. + /// Processes OnDemand-specific metrics including OOM detection by memory-size + /// equality and post-runtime duration calculation. 
fn handle_ondemand_report( &mut self, request_id: &String, metrics: OnDemandReportMetrics, timestamp: i64, ) { - // For provided.al runtimes, if the last invocation hit the memory limit, increment the OOM metric. - // We do this for provided.al runtimes because we didn't find another way to detect this under provided.al. - // We don't do this for other runtimes to avoid double counting. - if let Some(runtime) = &self.runtime - && runtime.starts_with("provided.al") - && metrics.max_memory_used_mb == metrics.memory_size_mb - { + // If the invocation hit the memory limit, increment the OOM metric. This catches + // OOM-induced failures that don't surface through a runtime-specific log line or a + // `Runtime.OutOfMemory` error_type — most notably the suppressed-init / timeout-at-cap + // pattern reported in datadog-lambda-extension#1237 (Node) and the historical + // provided.al case. Dedup against the other two detection paths is handled by + // `Context::oom_emitted`, which `try_increment_oom_metric` checks and sets. + if metrics.max_memory_used_mb == metrics.memory_size_mb { debug!( "Invocation Processor | PlatformReport | Last invocation hit memory limit. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } // Calculate and set post-runtime duration if context is available @@ -1395,7 +1395,34 @@ impl Processor { Some(error_tags) } - pub fn on_out_of_memory_error(&mut self, timestamp: i64) { + pub fn on_out_of_memory_error(&mut self, request_id: Option<&String>, timestamp: i64) { + self.try_increment_oom_metric(request_id, timestamp); + } + + /// Increments the OOM enhanced metric exactly once per `request_id`. + /// + /// Several detection paths can fire for the same invocation: + /// 1. A runtime-specific OOM log line (logs processor → `Event::OutOfMemory`) + /// 2. `error_type == "Runtime.OutOfMemory"` in `PlatformRuntimeDone` + /// 3. 
`max_memory_used_mb == memory_size_mb` in `PlatformReport` + /// + /// To avoid double-counting, the per-invocation `Context::oom_emitted` flag is + /// set on the first emission. Subsequent emissions for the same `request_id` are + /// skipped. If `request_id` is `None` (log path saw the OOM outside an active + /// invocation window) or no context is found, we emit best-effort without dedup. + fn try_increment_oom_metric(&mut self, request_id: Option<&String>, timestamp: i64) { + if let Some(rid) = request_id + && let Some(ctx) = self.context_buffer.get_mut(rid) + { + if ctx.oom_emitted { + debug!( + "Invocation Processor | OOM metric already emitted for request_id {}, skipping", + rid + ); + return; + } + ctx.oom_emitted = true; + } self.enhanced_metrics.increment_oom_metric(timestamp); } @@ -2445,4 +2472,138 @@ mod tests { "pre-existing _dd.appsec.enabled value must not be overwritten" ); } + + /// Two OOM signals for the same `request_id` increment the metric exactly once. + /// Exercises the `Context::oom_emitted` dedup flag. + #[tokio::test] + async fn test_try_increment_oom_metric_dedupes_same_request_id() { + let mut p = setup(); + // Insert the context directly so we don't go through `on_invoke_event`, which + // would populate dynamic tags (`cold_start:true`) and complicate the query. 
+ let request_id = String::from("req-dedup"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&request_id), now); + p.on_out_of_memory_error(Some(&request_id), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted at least once"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 1.0).abs() < f64::EPSILON, + "OOM sum must be 1.0 (deduped), got {sum}" + ); + + // And the context flag should now reflect that we emitted. + assert!( + p.context_buffer + .get(&request_id) + .expect("context") + .oom_emitted, + "oom_emitted flag must be set after the first emission" + ); + } + + /// OOM signals for different `request_id`s each emit a metric — dedup is scoped + /// per request, not globally. 
+ #[tokio::test] + async fn test_try_increment_oom_metric_distinct_request_ids_emit_separately() { + let mut p = setup(); + let req1 = String::from("req-a"); + let req2 = String::from("req-b"); + p.context_buffer.start_context(&req1, Span::default()); + p.context_buffer.start_context(&req2, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&req1), now); + p.on_out_of_memory_error(Some(&req2), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 2.0).abs() < f64::EPSILON, + "OOM sum must be 2.0 (one per request_id), got {sum}" + ); + } + + /// Regression: the `max_memory_used_mb == memory_size_mb` path used to be gated + /// on `runtime.starts_with("provided.al")`. After generalising the rule to all + /// runtimes (with dedup via `Context::oom_emitted`), the equality case must + /// still emit OOM. 
+ #[tokio::test] + async fn test_handle_ondemand_report_emits_oom_on_memory_equality() { + let mut p = setup(); + let request_id = String::from("req-eq"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + let metrics = OnDemandReportMetrics { + duration_ms: 100.0, + billed_duration_ms: 100, + memory_size_mb: 1024, + max_memory_used_mb: 1024, + init_duration_ms: None, + restore_duration_ms: None, + }; + p.handle_ondemand_report(&request_id, metrics, now); + + let ts = (now / 10) * 10; + assert!( + p.enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts + ) + .await + .unwrap() + .is_some(), + "OOM must be emitted when max_memory_used_mb == memory_size_mb" + ); + } } diff --git a/bottlecap/src/lifecycle/invocation/processor_service.rs b/bottlecap/src/lifecycle/invocation/processor_service.rs index a41a95b26..61c48b479 100644 --- a/bottlecap/src/lifecycle/invocation/processor_service.rs +++ b/bottlecap/src/lifecycle/invocation/processor_service.rs @@ -118,6 +118,7 @@ pub enum ProcessorCommand { execution_status: Option, }, OnOutOfMemoryError { + request_id: Option, timestamp: i64, }, OnShutdownEvent, @@ -407,10 +408,14 @@ impl InvocationProcessorHandle { pub async fn on_out_of_memory_error( &self, + request_id: Option, timestamp: i64, ) -> Result<(), mpsc::error::SendError> { self.sender - .send(ProcessorCommand::OnOutOfMemoryError { timestamp }) + .send(ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + }) .await } @@ -632,8 +637,12 @@ impl InvocationProcessorService { ) .await; } - ProcessorCommand::OnOutOfMemoryError { timestamp } => { - self.processor.on_out_of_memory_error(timestamp); + ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + } => { + self.processor + .on_out_of_memory_error(request_id.as_ref(), 
timestamp); } ProcessorCommand::OnShutdownEvent => { self.processor.on_shutdown_event(); diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 643e32854..9c922ac69 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -163,6 +163,32 @@ impl LambdaProcessor { } } + /// Returns the `request_id` of the currently-active invocation, if known. + /// Set by `PlatformStart`, cleared by `PlatformRuntimeDone` / `PlatformReport`. + /// + /// Returns `None` when: + /// - **Managed Instance mode**: extensions cannot subscribe to the `INVOKE` event, + /// so `platform.start` is not delivered and this slot is never populated. OOM logs + /// parsed in MI mode are therefore always tagged `None`. The synthesized + /// `PlatformRuntimeDone` produced by `handle_managed_instance_report` does carry a + /// real `request_id`, so dedup still works for that path. Worst case is a thin + /// double-count window if a runtime emits both an OOM log line and + /// `error_type = Runtime.OutOfMemory` for the same invocation — not observed in + /// practice today. + /// - **Pre-`PlatformStart` init crash**: a FATAL OOM log emitted by init code can + /// arrive before `PlatformStart` (or with no `PlatformStart` at all, if init + /// fails outright). In the no-`PlatformStart` case no other detection path fires, + /// so no double-count. + /// - **Late log race**: a FATAL log parsed after `PlatformRuntimeDone` clears the + /// slot. By then the context has been removed, so no double-count. 
+ fn current_request_id(&self) -> Option { + if self.invocation_context.request_id.is_empty() { + None + } else { + Some(self.invocation_context.request_id.clone()) + } + } + #[allow(clippy::too_many_lines)] async fn get_message(&mut self, event: TelemetryEvent) -> Result> { let copy = event.clone(); @@ -194,7 +220,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } @@ -206,7 +235,7 @@ impl LambdaProcessor { event.time.timestamp_millis(), None, ); - // If the message is logged from the durable execution SDK, + // If the message is logged from the durable execution SDK, // set durable execution id and name as log attributes. if let Some((exec_id, exec_name)) = durable_ctx { msg.lambda.durable_execution_id = Some(exec_id); @@ -227,7 +256,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. 
Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } diff --git a/bottlecap/src/metrics/enhanced/lambda.rs b/bottlecap/src/metrics/enhanced/lambda.rs index abed7d5b9..67535967e 100644 --- a/bottlecap/src/metrics/enhanced/lambda.rs +++ b/bottlecap/src/metrics/enhanced/lambda.rs @@ -91,12 +91,12 @@ impl Lambda { self.increment_metric(constants::TIMEOUTS_METRIC, timestamp); } - // This function is called in three cases: - // 1. Runtime-specific OOM error (can happen in .NET, Node.js and Java as far as we know) - // 2. PlatformRuntimeDone event reports "error_type: Runtime.OutOfMemory" (can happen in Ruby and Python as far as we know) - // 3. PlatformReport event reports "max_memory_used_mb == memory_size_mb" (can happen in many runtimes, but - // we only call increment_oom_metric() for provided.al runtimes) - // This is our best effort to cover different cases without double counting. We can adjust this if we find more cases. + // Callers should generally go through `Processor::try_increment_oom_metric`, which + // dedupes by `request_id` so the same invocation isn't counted multiple times when + // more than one detection path fires. The three paths are: + // 1. Runtime-specific OOM log line (.NET, Node, Java, Go, Ruby, Python) + // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Ruby, Python; Node as of 2026-05) + // 3. 
PlatformReport with max_memory_used_mb == memory_size_mb (all runtimes) pub fn increment_oom_metric(&self, timestamp: i64) { self.increment_metric(constants::OUT_OF_MEMORY_METRIC, timestamp); } From 5a833ac9f95dfe96300b28347d79b7bda1ac9401 Mon Sep 17 00:00:00 2001 From: Yiming Luo <10097700+lym953@users.noreply.github.com> Date: Fri, 29 May 2026 15:52:52 -0400 Subject: [PATCH 2/2] test(integration): add cross-runtime OOM test suite MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a new `oom` integration-test suite that exercises the OOM dedup change (Context::oom_emitted, #1241) end-to-end across every supported runtime. Each lambda intentionally allocates until it OOMs; the test asserts aws.lambda.enhanced.out_of_memory increments by exactly one data point per function over the invocation window — which fails if the dedup flag stops working and two detection paths emit for the same invocation. New lambda apps under integration-tests/lambda/: - oom-node-v8-heap : exercises log-line path (JavaScript heap OOM) - oom-node-sigkill : exercises PlatformRuntimeDone Runtime.OutOfMemory path - oom-python : MemoryError — log path AND PlatformRuntimeDone path both fire, so dedup is necessary for count==1 - oom-ruby : NoMemoryError — same dual-path coverage as Python - oom-java : OutOfMemoryError (log-line path) - oom-dotnet : OutOfMemoryException (log-line path) - oom-go : fatal: runtime: out of memory — log path AND PlatformReport memory-equality path both fire Framework additions: - Ruby and Go runtime/layer helpers in lib/util.ts (Ruby tracer layer; Go has no tracer layer — extension layer alone covers the test). - Oom CDK stack registered in bin/app.ts. - build-ruby.sh (zip-as-is for now; Gemfile build stubbed) and build-go.sh (Docker cross-compile to ARM64 Linux, bootstrap binary). - Pipeline template additions for the two new build stages and oom suite registration in test-suites.yaml. 
- getMetricCount() + OUT_OF_MEMORY_METRIC in tests/utils/datadog.ts. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitlab/datasources/test-suites.yaml | 1 + .gitlab/templates/pipeline.yaml.tpl | 38 +++ integration-tests/bin/app.ts | 4 + .../lambda/oom-dotnet/Function.cs | 25 ++ .../lambda/oom-dotnet/Function.csproj | 14 ++ integration-tests/lambda/oom-go/go.mod | 5 + integration-tests/lambda/oom-go/main.go | 23 ++ integration-tests/lambda/oom-java/pom.xml | 50 ++++ .../src/main/java/example/Handler.java | 24 ++ .../lambda/oom-node-sigkill/index.mjs | 13 ++ .../lambda/oom-node-v8-heap/index.mjs | 10 + .../lambda/oom-python/lambda_function.py | 12 + .../lambda/oom-ruby/lambda_function.rb | 13 ++ integration-tests/lib/stacks/oom.ts | 216 ++++++++++++++++++ integration-tests/lib/util.ts | 12 + integration-tests/scripts/build-go.sh | 123 ++++++++++ integration-tests/scripts/build-ruby.sh | 88 +++++++ integration-tests/scripts/local_deploy.sh | 2 + integration-tests/tests/oom.test.ts | 90 ++++++++ integration-tests/tests/utils/datadog.ts | 37 +++ 20 files changed, 800 insertions(+) create mode 100644 integration-tests/lambda/oom-dotnet/Function.cs create mode 100644 integration-tests/lambda/oom-dotnet/Function.csproj create mode 100644 integration-tests/lambda/oom-go/go.mod create mode 100644 integration-tests/lambda/oom-go/main.go create mode 100644 integration-tests/lambda/oom-java/pom.xml create mode 100644 integration-tests/lambda/oom-java/src/main/java/example/Handler.java create mode 100644 integration-tests/lambda/oom-node-sigkill/index.mjs create mode 100644 integration-tests/lambda/oom-node-v8-heap/index.mjs create mode 100644 integration-tests/lambda/oom-python/lambda_function.py create mode 100644 integration-tests/lambda/oom-ruby/lambda_function.rb create mode 100644 integration-tests/lib/stacks/oom.ts create mode 100755 integration-tests/scripts/build-go.sh create mode 100755 integration-tests/scripts/build-ruby.sh create mode 100644 
integration-tests/tests/oom.test.ts diff --git a/.gitlab/datasources/test-suites.yaml b/.gitlab/datasources/test-suites.yaml index 257b1ba04..b6d82c369 100644 --- a/.gitlab/datasources/test-suites.yaml +++ b/.gitlab/datasources/test-suites.yaml @@ -4,3 +4,4 @@ test_suites: - name: snapstart - name: lmi - name: auth + - name: oom diff --git a/.gitlab/templates/pipeline.yaml.tpl b/.gitlab/templates/pipeline.yaml.tpl index 60788606b..a87bfaa14 100644 --- a/.gitlab/templates/pipeline.yaml.tpl +++ b/.gitlab/templates/pipeline.yaml.tpl @@ -505,6 +505,40 @@ build node lambdas: - cd integration-tests - ./scripts/build-node.sh +build ruby lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/*.rb + script: + - cd integration-tests + - ./scripts/build-ruby.sh + +build go lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + cache: + key: go-mod-cache-${CI_COMMIT_REF_SLUG} + paths: + - integration-tests/.cache/go-mod/ + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/bin/bootstrap + script: + - cd integration-tests + - ./scripts/build-go.sh + # Integration Tests - Publish arm64 layer with integration test prefix publish integration layer (arm64): stage: integration-tests @@ -581,12 +615,16 @@ integration-suite: - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas dependencies: - publish integration layer (arm64) - build java lambdas - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas variables: IDENTIFIER: ${CI_COMMIT_SHORT_SHA} AWS_DEFAULT_REGION: us-east-1 diff --git a/integration-tests/bin/app.ts b/integration-tests/bin/app.ts index 
d822e6cac..affce4270 100644 --- a/integration-tests/bin/app.ts +++ b/integration-tests/bin/app.ts @@ -6,6 +6,7 @@ import {Otlp} from '../lib/stacks/otlp'; import {Snapstart} from '../lib/stacks/snapstart'; import {LambdaManagedInstancesStack} from '../lib/stacks/lmi'; import {AuthStack} from '../lib/stacks/auth'; +import {Oom} from '../lib/stacks/oom'; import {AuthRoleStack} from '../lib/auth-role'; import {ACCOUNT, getIdentifier, REGION} from '../config'; import {CapacityProviderStack} from "../lib/capacity-provider"; @@ -40,6 +41,9 @@ const stacks = [ new AuthStack(app, `integ-${identifier}-auth`, { env, }), + new Oom(app, `integ-${identifier}-oom`, { + env, + }), ] // Tag all stacks so we can easily clean them up diff --git a/integration-tests/lambda/oom-dotnet/Function.cs b/integration-tests/lambda/oom-dotnet/Function.cs new file mode 100644 index 000000000..b5c861493 --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.cs @@ -0,0 +1,25 @@ +using Amazon.Lambda.Core; +using System.Collections.Generic; +using System.Text.Json; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace Function +{ + /// + /// OOM reproducer for .NET. Allocates and retains 10 MB byte arrays in a list + /// until the CLR throws System.OutOfMemoryException. Bottlecap's runtime-specific + /// log-line detection matches "OutOfMemoryException". 
+ /// + public class Handler + { + public Dictionary FunctionHandler(JsonElement input, ILambdaContext context) + { + var data = new List(); + while (true) + { + data.Add(new byte[10 * 1024 * 1024]); + } + } + } +} diff --git a/integration-tests/lambda/oom-dotnet/Function.csproj b/integration-tests/lambda/oom-dotnet/Function.csproj new file mode 100644 index 000000000..2dfcbac5f --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.csproj @@ -0,0 +1,14 @@ + + + net8.0 + enable + enable + true + Lambda + true + + + + + + diff --git a/integration-tests/lambda/oom-go/go.mod b/integration-tests/lambda/oom-go/go.mod new file mode 100644 index 000000000..a73b6d85d --- /dev/null +++ b/integration-tests/lambda/oom-go/go.mod @@ -0,0 +1,5 @@ +module oom-go + +go 1.22 + +require github.com/aws/aws-lambda-go v1.49.0 diff --git a/integration-tests/lambda/oom-go/main.go b/integration-tests/lambda/oom-go/main.go new file mode 100644 index 000000000..99821b0ad --- /dev/null +++ b/integration-tests/lambda/oom-go/main.go @@ -0,0 +1,23 @@ +// OOM reproducer for Go. +// Allocates and retains 10 MB byte slices in a slice header until the Go +// runtime aborts with "fatal error: runtime: out of memory". Bottlecap's +// runtime-specific log-line detection matches that fatal-error message. +// Without that detection (and historically for provided.al runtimes), the +// equality path in PlatformReport (max_memory_used_mb == memory_size_mb) also +// fires. The per-Context dedup flag ensures the metric increments only once. 
+package main + +import ( + "github.com/aws/aws-lambda-go/lambda" +) + +func handler() error { + var data [][]byte + for { + data = append(data, make([]byte, 10*1024*1024)) + } +} + +func main() { + lambda.Start(handler) +} diff --git a/integration-tests/lambda/oom-java/pom.xml b/integration-tests/lambda/oom-java/pom.xml new file mode 100644 index 000000000..1ead70ea0 --- /dev/null +++ b/integration-tests/lambda/oom-java/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + example + oom-java-lambda + 1.0.0 + jar + + OOM Java Lambda + Java Lambda function that triggers OutOfMemoryError for integration tests + + + 21 + 21 + UTF-8 + + + + + com.amazonaws + aws-lambda-java-core + 1.4.0 + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + function + false + + + + + + + diff --git a/integration-tests/lambda/oom-java/src/main/java/example/Handler.java b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java new file mode 100644 index 000000000..92edb9c18 --- /dev/null +++ b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java @@ -0,0 +1,24 @@ +package example; + +import com.amazonaws.services.lambda.runtime.Context; +import com.amazonaws.services.lambda.runtime.RequestHandler; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * OOM reproducer for Java. Allocates and retains 10 MB byte arrays in a list + * until the JVM throws java.lang.OutOfMemoryError: Java heap space. + * Bottlecap's runtime-specific log-line detection matches + * "java.lang.OutOfMemoryError". 
+ */ +public class Handler implements RequestHandler, Map> { + + @Override + public Map handleRequest(Map event, Context context) { + List data = new ArrayList<>(); + while (true) { + data.add(new byte[10 * 1024 * 1024]); + } + } +} diff --git a/integration-tests/lambda/oom-node-sigkill/index.mjs b/integration-tests/lambda/oom-node-sigkill/index.mjs new file mode 100644 index 000000000..d6b245f36 --- /dev/null +++ b/integration-tests/lambda/oom-node-sigkill/index.mjs @@ -0,0 +1,13 @@ +// OOM reproducer: off-heap Buffer growth → kernel SIGKILL. +// Buffer.allocUnsafe(>8KB) goes through V8's ArrayBuffer allocator (external +// memory) and bypasses --max-old-space-size, so RSS grows until the cgroup +// limit triggers a kernel SIGKILL. Lambda surfaces this as PlatformRuntimeDone +// with error_type=Runtime.OutOfMemory — bottlecap's path 2 detection. +export const handler = async () => { + const bufs = []; + while (true) { + const b = Buffer.allocUnsafe(20 * 1024 * 1024); + b.fill(0); + bufs.push(b); + } +}; diff --git a/integration-tests/lambda/oom-node-v8-heap/index.mjs b/integration-tests/lambda/oom-node-v8-heap/index.mjs new file mode 100644 index 000000000..fb4e71c6f --- /dev/null +++ b/integration-tests/lambda/oom-node-v8-heap/index.mjs @@ -0,0 +1,10 @@ +// OOM reproducer: classic V8 heap exhaustion. Allocates retained strings in a +// loop until V8 hits its --max-old-space-size cap and prints +// "FATAL ERROR: ... JavaScript heap out of memory". Exercises bottlecap's +// runtime-specific log-line OOM detection path. +export const handler = async () => { + const arr = []; + while (true) { + arr.push('x'.repeat(10 * 1024 * 1024)); + } +}; diff --git a/integration-tests/lambda/oom-python/lambda_function.py b/integration-tests/lambda/oom-python/lambda_function.py new file mode 100644 index 000000000..12aa196ed --- /dev/null +++ b/integration-tests/lambda/oom-python/lambda_function.py @@ -0,0 +1,12 @@ +# OOM reproducer for Python. 
+# Allocates and retains 10 MB strings in a list until CPython raises +# MemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "MemoryError". Both bottlecap detection paths fire — the dedup flag is +# what makes the OOM metric emit exactly once. + + +def handler(event, context): + data = [] + while True: + data.append("x" * (10 * 1024 * 1024)) diff --git a/integration-tests/lambda/oom-ruby/lambda_function.rb b/integration-tests/lambda/oom-ruby/lambda_function.rb new file mode 100644 index 000000000..674a70086 --- /dev/null +++ b/integration-tests/lambda/oom-ruby/lambda_function.rb @@ -0,0 +1,13 @@ +# OOM reproducer for Ruby. +# Allocates and retains 10 MB strings in an array until Ruby raises +# NoMemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "failed to allocate memory (NoMemoryError)". Both bottlecap detection +# paths fire — the dedup flag is what makes the OOM metric emit exactly once. + +def handler(event:, context:) + data = [] + loop do + data << ("x" * (10 * 1024 * 1024)) + end +end diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts new file mode 100644 index 000000000..dc7c6314e --- /dev/null +++ b/integration-tests/lib/stacks/oom.ts @@ -0,0 +1,216 @@ +import * as cdk from 'aws-cdk-lib'; +import * as lambda from 'aws-cdk-lib/aws-lambda'; +import { Construct } from 'constructs'; +import { + createLogGroup, + defaultDatadogEnvVariables, + defaultDatadogSecretPolicy, + getExtensionLayer, + getDefaultNodeLayer, + getDefaultPythonLayer, + getDefaultJavaLayer, + getDefaultDotnetLayer, + getDefaultRubyLayer, + defaultNodeRuntime, + defaultPythonRuntime, + defaultJavaRuntime, + defaultDotnetRuntime, + defaultRubyRuntime, + defaultGoRuntime, +} from '../util'; + +/** + * OOM cross-runtime test stack. 
+ * + * Deploys one Lambda per OOM "shape" so the bottlecap dedup change + * (Context::oom_emitted + try_increment_oom_metric, covering issue #1237) + * can be exercised end-to-end across every supported runtime. Each function + * intentionally allocates until it OOMs; the test then asserts the + * `aws.lambda.enhanced.out_of_memory` metric increments by exactly 1. + * + * The detection paths exercised per case: + * - oom-node-v8-heap : log-line match `JavaScript heap out of memory` + * - oom-node-sigkill : PlatformRuntimeDone `error_type=Runtime.OutOfMemory` + * - oom-python : log line `MemoryError` + PlatformRuntimeDone (dedup) + * - oom-ruby : log line `NoMemoryError` + PlatformRuntimeDone (dedup) + * - oom-java : log line `java.lang.OutOfMemoryError` + * - oom-dotnet : log line `OutOfMemoryException` + * - oom-go : log line `fatal error: runtime: out of memory` + * + PlatformReport memory equality (dedup) + * + * Each function is configured with low memory (192 MB) and a short timeout + * (30 s) so the OOM fires quickly during the integration-test run. + */ +export class Oom extends cdk.Stack { + constructor(scope: Construct, id: string, props: cdk.StackProps) { + super(scope, id, props); + + const extensionLayer = getExtensionLayer(this); + const nodeLayer = getDefaultNodeLayer(this); + const pythonLayer = getDefaultPythonLayer(this); + const javaLayer = getDefaultJavaLayer(this); + const dotnetLayer = getDefaultDotnetLayer(this); + const rubyLayer = getDefaultRubyLayer(this); + + const oomMemorySize = 192; + const oomTimeout = cdk.Duration.seconds(30); + + // Node case A — V8 heap exhaustion (log-line path). 
+ const nodeV8FunctionName = `${id}-node-v8-heap-lambda`; + const nodeV8Function = new lambda.Function(this, nodeV8FunctionName, { + runtime: defaultNodeRuntime, + architecture: lambda.Architecture.ARM_64, + handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-node-v8-heap'), + functionName: nodeV8FunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: nodeV8FunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'index.handler', + // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error + // before the kernel SIGKILLs the process. + NODE_OPTIONS: '--max-old-space-size=128', + }, + logGroup: createLogGroup(this, nodeV8FunctionName), + }); + nodeV8Function.addToRolePolicy(defaultDatadogSecretPolicy); + nodeV8Function.addLayers(extensionLayer); + nodeV8Function.addLayers(nodeLayer); + + // Node case B — off-heap Buffer / kernel SIGKILL (PlatformRuntimeDone path). + const nodeSigkillFunctionName = `${id}-node-sigkill-lambda`; + const nodeSigkillFunction = new lambda.Function(this, nodeSigkillFunctionName, { + runtime: defaultNodeRuntime, + architecture: lambda.Architecture.ARM_64, + handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-node-sigkill'), + functionName: nodeSigkillFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: nodeSigkillFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'index.handler', + }, + logGroup: createLogGroup(this, nodeSigkillFunctionName), + }); + nodeSigkillFunction.addToRolePolicy(defaultDatadogSecretPolicy); + nodeSigkillFunction.addLayers(extensionLayer); + nodeSigkillFunction.addLayers(nodeLayer); + + // Python — MemoryError; log path and PlatformRuntimeDone path both fire. 
+ const pythonFunctionName = `${id}-python-lambda`; + const pythonFunction = new lambda.Function(this, pythonFunctionName, { + runtime: defaultPythonRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda.handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-python'), + functionName: pythonFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: pythonFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + }, + logGroup: createLogGroup(this, pythonFunctionName), + }); + pythonFunction.addToRolePolicy(defaultDatadogSecretPolicy); + pythonFunction.addLayers(extensionLayer); + pythonFunction.addLayers(pythonLayer); + + // Ruby — NoMemoryError; log path and PlatformRuntimeDone path both fire. + const rubyFunctionName = `${id}-ruby-lambda`; + const rubyFunction = new lambda.Function(this, rubyFunctionName, { + runtime: defaultRubyRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda_rb.handler', + code: lambda.Code.fromAsset('./lambda/oom-ruby'), + functionName: rubyFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: rubyFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + }, + logGroup: createLogGroup(this, rubyFunctionName), + }); + rubyFunction.addToRolePolicy(defaultDatadogSecretPolicy); + rubyFunction.addLayers(extensionLayer); + rubyFunction.addLayers(rubyLayer); + + // Java — OutOfMemoryError (log-line path). 
+ const javaFunctionName = `${id}-java-lambda`; + const javaFunction = new lambda.Function(this, javaFunctionName, { + runtime: defaultJavaRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'example.Handler::handleRequest', + code: lambda.Code.fromAsset('./lambda/oom-java/target/function.jar'), + functionName: javaFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: javaFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + DD_TRACE_ENABLED: 'true', + }, + logGroup: createLogGroup(this, javaFunctionName), + }); + javaFunction.addToRolePolicy(defaultDatadogSecretPolicy); + javaFunction.addLayers(extensionLayer); + javaFunction.addLayers(javaLayer); + + // .NET — OutOfMemoryException (log-line path). + const dotnetFunctionName = `${id}-dotnet-lambda`; + const dotnetFunction = new lambda.Function(this, dotnetFunctionName, { + runtime: defaultDotnetRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'Function::Function.Handler::FunctionHandler', + code: lambda.Code.fromAsset('./lambda/oom-dotnet/bin/function.zip'), + functionName: dotnetFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: dotnetFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + }, + logGroup: createLogGroup(this, dotnetFunctionName), + }); + dotnetFunction.addToRolePolicy(defaultDatadogSecretPolicy); + dotnetFunction.addLayers(extensionLayer); + dotnetFunction.addLayers(dotnetLayer); + + // Go — runtime fatal error + PlatformReport memory equality (dedup). + // Go runs on the custom runtime, so the binary itself is the handler. 
+ const goFunctionName = `${id}-go-lambda`; + const goFunction = new lambda.Function(this, goFunctionName, { + runtime: defaultGoRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'bootstrap', + code: lambda.Code.fromAsset('./lambda/oom-go/bin'), + functionName: goFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: goFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + }, + logGroup: createLogGroup(this, goFunctionName), + }); + goFunction.addToRolePolicy(defaultDatadogSecretPolicy); + goFunction.addLayers(extensionLayer); + // Go has no tracer layer — the Datadog tracer for Go is a Go module imported + // into the function source. The extension layer alone is enough for the + // enhanced metrics this test asserts on. + } +} diff --git a/integration-tests/lib/util.ts b/integration-tests/lib/util.ts index dd8309789..24fe04164 100644 --- a/integration-tests/lib/util.ts +++ b/integration-tests/lib/util.ts @@ -13,11 +13,15 @@ export const defaultNodeRuntime = lambda.Runtime.NODEJS_24_X; export const defaultPythonRuntime = lambda.Runtime.PYTHON_3_13; export const defaultJavaRuntime = lambda.Runtime.JAVA_21; export const defaultDotnetRuntime = lambda.Runtime.DOTNET_8; +export const defaultRubyRuntime = lambda.Runtime.RUBY_3_4; +// Go runs on the custom runtime; the Datadog tracer is a Go module, not a layer. 
+export const defaultGoRuntime = lambda.Runtime.PROVIDED_AL2023; export const defaultNodeLayerArn = process.env.NODE_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Node24-x:132'; export const defaultPythonLayerArn = process.env.PYTHON_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Python313-ARM:117'; export const defaultJavaLayerArn = process.env.JAVA_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-java:25'; export const defaultDotnetLayerArn = process.env.DOTNET_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-dotnet-ARM:23'; +export const defaultRubyLayerArn = process.env.RUBY_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Ruby3-4-ARM:9'; export const defaultDatadogEnvVariables = { DD_API_KEY_SECRET_ARN: datadogSecretArn, @@ -87,6 +91,14 @@ export const getDefaultDotnetLayer = (scope: Construct) => { ); }; +export const getDefaultRubyLayer = (scope: Construct) => { + return LayerVersion.fromLayerVersionArn( + scope, + 'DatadogRubyLayer', + defaultRubyLayerArn + ); +}; + export const capacityProviderArn = `arn:aws:lambda:${REGION}:${ACCOUNT}:capacity-provider:integ-default-capacity-provider-cp`; export function setCapacityProvider(lambdaFunction: lambda.Function) { diff --git a/integration-tests/scripts/build-go.sh b/integration-tests/scripts/build-go.sh new file mode 100755 index 000000000..8f24bc45c --- /dev/null +++ b/integration-tests/scripts/build-go.sh @@ -0,0 +1,123 @@ +#!/bin/bash +set -e + +# Reusable script to cross-compile Go Lambda functions for ARM64 Linux. +# Outputs a binary named `bootstrap` (required by the AWS Lambda custom runtime +# provided.al2023) under /bin/. 
+# +# Usage: +# ./build-go.sh # Build all Go Lambda functions +# ./build-go.sh # Build a specific Lambda function + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda" + +build_go_lambda() { + local LAMBDA_DIR="$1" + local FUNCTION_NAME=$(basename "$LAMBDA_DIR") + + if [ ! -d "$LAMBDA_DIR" ]; then + echo "Error: Directory not found: $LAMBDA_DIR" + return 1 + fi + + if [ ! -f "$LAMBDA_DIR/go.mod" ]; then + echo "Error: go.mod not found in $LAMBDA_DIR" + return 1 + fi + + echo "Building Go Lambda: $FUNCTION_NAME" + + if ! command -v docker &> /dev/null; then + echo "Error: Docker is not installed or not in PATH" + return 1 + fi + + # Clean previous build (idempotent). + rm -rf "$LAMBDA_DIR/bin" + mkdir -p "$LAMBDA_DIR/bin" + + # Module cache: reuse the host's $GOPATH/pkg/mod when running locally; + # use a project-local cache in CI so it can be cached between jobs. + if [ -n "$CI" ]; then + GO_MOD_CACHE="$SCRIPT_DIR/../.cache/go-mod" + mkdir -p "$GO_MOD_CACHE" + else + GO_MOD_CACHE="${GOPATH:-$HOME/go}/pkg/mod" + mkdir -p "$GO_MOD_CACHE" + fi + + # Cross-compile to ARM64 Linux inside the official Go image. + # CGO is disabled so the binary runs on the provided.al2023 base image + # without a libc mismatch. + docker run --rm --platform linux/arm64 \ + -v "$LAMBDA_DIR":/workspace \ + -v "$GO_MOD_CACHE":/go/pkg/mod \ + -w /workspace \ + -e GOOS=linux \ + -e GOARCH=arm64 \ + -e CGO_ENABLED=0 \ + public.ecr.aws/docker/library/golang:1.22-bookworm \ + sh -c "go mod tidy && go build -o bin/bootstrap ." + + if [ ! 
-f "$LAMBDA_DIR/bin/bootstrap" ]; then + echo "✗ Build failed: bin/bootstrap not produced" + return 1 + fi + + echo "✓ Build complete: $LAMBDA_DIR/bin/bootstrap" + return 0 +} + +if [ -z "$1" ]; then + echo "==========================================" + echo "Building all Go Lambda functions" + echo "==========================================" + echo "" + + FOUND_GO=0 + FAILED_BUILDS=() + + for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do + if [ ! -d "$LAMBDA_PATH" ]; then + continue + fi + + FUNCTION_NAME=$(basename "$LAMBDA_PATH") + + # Match directories whose suffix is `-go` or whose name is exactly `go`. + if [[ "$FUNCTION_NAME" == *"-go" || "$FUNCTION_NAME" == "go" ]]; then + FOUND_GO=1 + echo "----------------------------------------" + if build_go_lambda "$LAMBDA_PATH"; then + echo "✓ $FUNCTION_NAME built successfully" + else + echo "✗ $FUNCTION_NAME failed" + FAILED_BUILDS+=("$FUNCTION_NAME") + fi + echo "" + fi + done + + if [ $FOUND_GO -eq 0 ]; then + echo "No Go Lambda functions found (looking for directories ending in -go)" + exit 0 + fi + + if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then + echo "✓ All Go Lambda builds completed successfully!" + exit 0 + fi + + echo "✗ ${#FAILED_BUILDS[@]} Go Lambda build(s) failed:" + for failed in "${FAILED_BUILDS[@]}"; do + echo " - $failed" + done + exit 1 +else + LAMBDA_DIR="$1" + if [[ "$LAMBDA_DIR" != /* ]]; then + LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR" + fi + build_go_lambda "$LAMBDA_DIR" +fi diff --git a/integration-tests/scripts/build-ruby.sh b/integration-tests/scripts/build-ruby.sh new file mode 100755 index 000000000..0ca36064d --- /dev/null +++ b/integration-tests/scripts/build-ruby.sh @@ -0,0 +1,88 @@ +#!/bin/bash +set -e + +# Reusable script to build Ruby Lambda functions. +# For simple Ruby Lambdas with no gem dependencies, this just packages the +# source as-is — the runtime + Datadog tracer layer provide everything needed. 
+# If the function gains a Gemfile, this script grows a bundle install step +# in a Docker container (mirroring build-python.sh / build-node.sh). +# +# Usage: +# ./build-ruby.sh # Build all Ruby Lambda functions +# ./build-ruby.sh # Build a specific Lambda function + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda" + +build_ruby_lambda() { + local LAMBDA_DIR="$1" + local FUNCTION_NAME=$(basename "$LAMBDA_DIR") + + if [ ! -d "$LAMBDA_DIR" ]; then + echo "Error: Directory not found: $LAMBDA_DIR" + return 1 + fi + + echo "Building Ruby Lambda: $FUNCTION_NAME" + + if [ ! -f "$LAMBDA_DIR/Gemfile" ]; then + echo "ℹ No Gemfile found — source files are deployed as-is" + return 0 + fi + + echo "Error: Gemfile-based Ruby builds are not implemented yet" >&2 + echo " Add a Dockerised \`bundle install\` step to this script when needed." >&2 + return 1 +} + +if [ -z "$1" ]; then + echo "==========================================" + echo "Building all Ruby Lambda functions" + echo "==========================================" + echo "" + + FOUND_RUBY=0 + FAILED_BUILDS=() + + for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do + if [ ! -d "$LAMBDA_PATH" ]; then + continue + fi + + FUNCTION_NAME=$(basename "$LAMBDA_PATH") + + if [[ "$FUNCTION_NAME" == *"ruby"* ]]; then + FOUND_RUBY=1 + echo "----------------------------------------" + if build_ruby_lambda "$LAMBDA_PATH"; then + echo "✓ $FUNCTION_NAME built successfully" + else + echo "✗ $FUNCTION_NAME failed" + FAILED_BUILDS+=("$FUNCTION_NAME") + fi + echo "" + fi + done + + if [ $FOUND_RUBY -eq 0 ]; then + echo "No Ruby Lambda functions found (looking for directories with 'ruby' in name)" + exit 0 + fi + + if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then + echo "✓ All Ruby Lambda builds completed successfully!" 
+ exit 0 + fi + + echo "✗ ${#FAILED_BUILDS[@]} Ruby Lambda build(s) failed:" + for failed in "${FAILED_BUILDS[@]}"; do + echo " - $failed" + done + exit 1 +else + LAMBDA_DIR="$1" + if [[ "$LAMBDA_DIR" != /* ]]; then + LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR" + fi + build_ruby_lambda "$LAMBDA_DIR" +fi diff --git a/integration-tests/scripts/local_deploy.sh b/integration-tests/scripts/local_deploy.sh index b432261da..451b81cf6 100755 --- a/integration-tests/scripts/local_deploy.sh +++ b/integration-tests/scripts/local_deploy.sh @@ -43,6 +43,8 @@ echo "Building all Lambda functions in parallel..." "$SCRIPT_DIR/build-dotnet.sh" & "$SCRIPT_DIR/build-python.sh" & "$SCRIPT_DIR/build-node.sh" & +"$SCRIPT_DIR/build-ruby.sh" & +"$SCRIPT_DIR/build-go.sh" & wait echo "All Lambda builds complete" diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts new file mode 100644 index 000000000..5e33e4c8e --- /dev/null +++ b/integration-tests/tests/oom.test.ts @@ -0,0 +1,90 @@ +import { invokeLambda } from './utils/lambda'; +import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog'; +import { DEFAULT_DATADOG_INDEXING_WAIT_MS } from '../config'; +import { getIdentifier } from '../config'; + +/** + * Cross-runtime OOM test. + * + * Each function is intentionally configured to OOM on its first invocation. + * Bottlecap has three detection paths that can fire for the same invocation + * (runtime-specific log line, `Runtime.OutOfMemory` `error_type` in + * `PlatformRuntimeDone`, `max_memory_used_mb == memory_size_mb` in + * `PlatformReport`); the `Context::oom_emitted` flag introduced for #1237 + * dedupes them so the metric increments exactly once per invocation. + * + * The Python/Ruby/Go cases are particularly meaningful regressions because + * they trigger more than one detection path naturally — if dedup is broken, + * those counts go to 2. 
+ */
+const identifier = getIdentifier();
+const stackName = `integ-${identifier}-oom`;
+
+interface OomCase {
+  runtime: string;
+  functionName: string;
+}
+
+const cases: OomCase[] = [
+  { runtime: 'node-v8-heap', functionName: `${stackName}-node-v8-heap-lambda` },
+  { runtime: 'node-sigkill', functionName: `${stackName}-node-sigkill-lambda` },
+  { runtime: 'python', functionName: `${stackName}-python-lambda` },
+  { runtime: 'ruby', functionName: `${stackName}-ruby-lambda` },
+  { runtime: 'java', functionName: `${stackName}-java-lambda` },
+  { runtime: 'dotnet', functionName: `${stackName}-dotnet-lambda` },
+  { runtime: 'go', functionName: `${stackName}-go-lambda` },
+];
+
+async function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+describe('OOM Integration Tests', () => {
+  let countsByRuntime: Record<string, number>;
+  let windowStart: number;
+  let windowEnd: number;
+
+  // Invoke every function once, wait for Datadog to ingest, then query once
+  // for each. Keeping invocations and the query inside `beforeAll` lets each
+  // per-runtime test below assert against the same data set.
+  beforeAll(async () => {
+    windowStart = Date.now();
+
+    await Promise.all(
+      cases.map((c) =>
+        invokeLambda(c.functionName).catch((err) => {
+          // OOM functions usually succeed at the Invoke API layer (the function
+          // is run, just crashes), so a thrown error here is unexpected
+          // infrastructure failure rather than the OOM itself. Re-throw so the
+          // test surfaces it.
+          throw new Error(`Invoke failed for ${c.functionName}: ${err}`);
+        }),
+      ),
+    );
+
+    await sleep(DEFAULT_DATADOG_INDEXING_WAIT_MS);
+    windowEnd = Date.now();
+
+    const results = await Promise.all(
+      cases.map(async (c) => ({
+        runtime: c.runtime,
+        count: await getMetricCount(
+          OUT_OF_MEMORY_METRIC,
+          c.functionName,
+          windowStart,
+          windowEnd,
+        ),
+      })),
+    );
+
+    countsByRuntime = Object.fromEntries(results.map((r) => [r.runtime, r.count]));
+    console.log('OOM counts by runtime:', countsByRuntime);
+  }, 10 * 60 * 1000);
+
+  describe.each(cases)('$runtime runtime', ({ runtime }) => {
+    it('should emit exactly one out_of_memory metric for one OOM invocation', () => {
+      const count = countsByRuntime[runtime];
+      expect(count).toBe(1);
+    });
+  });
+});
diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index ed3768ea0..e25225235 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -91,6 +91,8 @@ export const DURATION_METRICS = [
   'aws.lambda.enhanced.init_duration',
 ];
 
+export const OUT_OF_MEMORY_METRIC = 'aws.lambda.enhanced.out_of_memory';
+
 export type EnhancedMetrics = Record;
 
 export interface MetricPoint {
@@ -289,6 +291,41 @@ export async function getEnhancedMetrics(
   return metrics;
 }
 
+/**
+ * Returns the total emission count of a counter / distribution enhanced metric
+ * for a single function over the given window, by summing all data-point
+ * values returned by Datadog. Used by oom.test.ts to assert that
+ * `aws.lambda.enhanced.out_of_memory` increments exactly once per invocation —
+ * verifying the per-Context `oom_emitted` dedup flag introduced for #1237.
+ */
+export async function getMetricCount(
+  metricName: string,
+  functionName: string,
+  fromTime: number,
+  toTime: number,
+): Promise<number> {
+  const baseFunctionName = getServiceName(functionName).toLowerCase();
+  const query = `sum:${metricName}{functionname:${baseFunctionName}}.as_count()`;
+
+  console.log(`Querying metric count: ${query}`);
+
+  const response = await datadogClient.get('/api/v1/query', {
+    params: {
+      query,
+      from: Math.floor(fromTime / 1000),
+      to: Math.floor(toTime / 1000),
+    },
+  });
+
+  const series = response.data.series || [];
+  if (series.length === 0) {
+    return 0;
+  }
+
+  const pointlist: [number, number | null][] = series[0].pointlist || [];
+  return pointlist.reduce((acc, [, value]) => acc + (value || 0), 0);
+}
+
 async function getMetrics(
   metricName: string,
   functionName: string,