diff --git a/.gitlab/datasources/test-suites.yaml b/.gitlab/datasources/test-suites.yaml index 257b1ba04..b6d82c369 100644 --- a/.gitlab/datasources/test-suites.yaml +++ b/.gitlab/datasources/test-suites.yaml @@ -4,3 +4,4 @@ test_suites: - name: snapstart - name: lmi - name: auth + - name: oom diff --git a/.gitlab/templates/pipeline.yaml.tpl b/.gitlab/templates/pipeline.yaml.tpl index 60788606b..a87bfaa14 100644 --- a/.gitlab/templates/pipeline.yaml.tpl +++ b/.gitlab/templates/pipeline.yaml.tpl @@ -505,6 +505,40 @@ build node lambdas: - cd integration-tests - ./scripts/build-node.sh +build ruby lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/*.rb + script: + - cd integration-tests + - ./scripts/build-ruby.sh + +build go lambdas: + stage: integration-tests + image: registry.ddbuild.io/images/docker:27.3.1 + tags: ["docker-in-docker:arm64"] + rules: + - when: on_success + needs: [] + cache: + key: go-mod-cache-${CI_COMMIT_REF_SLUG} + paths: + - integration-tests/.cache/go-mod/ + artifacts: + expire_in: 1 hour + paths: + - integration-tests/lambda/*/bin/bootstrap + script: + - cd integration-tests + - ./scripts/build-go.sh + # Integration Tests - Publish arm64 layer with integration test prefix publish integration layer (arm64): stage: integration-tests @@ -581,12 +615,16 @@ integration-suite: - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas dependencies: - publish integration layer (arm64) - build java lambdas - build dotnet lambdas - build python lambdas - build node lambdas + - build ruby lambdas + - build go lambdas variables: IDENTIFIER: ${CI_COMMIT_SHORT_SHA} AWS_DEFAULT_REGION: us-east-1 diff --git a/bottlecap/src/bin/bottlecap/main.rs b/bottlecap/src/bin/bottlecap/main.rs index 3dcc13bb2..a41a3f51f 100644 --- 
a/bottlecap/src/bin/bottlecap/main.rs +++ b/bottlecap/src/bin/bottlecap/main.rs @@ -841,9 +841,12 @@ async fn handle_event_bus_event( stats_concentrator: StatsConcentratorHandle, ) -> Option { match event { - Event::OutOfMemory(event_timestamp) => { + Event::OutOfMemory { + request_id, + timestamp, + } => { if let Err(e) = invocation_processor_handle - .on_out_of_memory_error(event_timestamp) + .on_out_of_memory_error(request_id, timestamp) .await { error!("Failed to send out of memory error to processor: {}", e); diff --git a/bottlecap/src/event_bus/mod.rs b/bottlecap/src/event_bus/mod.rs index 0ea20969e..0be3a86ca 100644 --- a/bottlecap/src/event_bus/mod.rs +++ b/bottlecap/src/event_bus/mod.rs @@ -7,7 +7,13 @@ mod constants; #[derive(Debug)] pub enum Event { Telemetry(TelemetryEvent), - OutOfMemory(i64), + OutOfMemory { + /// Lambda `request_id` of the invocation the OOM belongs to, when known. + /// Used by the invocation processor to dedupe against other OOM detection + /// paths (`PlatformRuntimeDone` `error_type`, `PlatformReport` memory equality). + request_id: Option, + timestamp: i64, + }, Tombstone, } diff --git a/bottlecap/src/lifecycle/invocation/context.rs b/bottlecap/src/lifecycle/invocation/context.rs index 04894f9c6..3aef5e4bf 100644 --- a/bottlecap/src/lifecycle/invocation/context.rs +++ b/bottlecap/src/lifecycle/invocation/context.rs @@ -43,6 +43,12 @@ pub struct Context { /// tracing. /// pub extracted_span_context: Option, + /// Whether the `aws.lambda.enhanced.out_of_memory` metric has already been + /// emitted for this invocation. Multiple detection paths can fire for the + /// same OOM (runtime log, `Runtime.OutOfMemory` `error_type` in + /// `PlatformRuntimeDone`, `max_memory_used == memory_size` in `PlatformReport`); + /// this flag dedupes them. + pub oom_emitted: bool, } /// Struct containing the information needed to reparent a span. 
@@ -94,6 +100,7 @@ impl Default for Context { snapstart_restore_span: None, tracer_span: None, extracted_span_context: None, + oom_emitted: false, } } } diff --git a/bottlecap/src/lifecycle/invocation/processor.rs b/bottlecap/src/lifecycle/invocation/processor.rs index ec1af99ab..c77c224a8 100644 --- a/bottlecap/src/lifecycle/invocation/processor.rs +++ b/bottlecap/src/lifecycle/invocation/processor.rs @@ -508,7 +508,7 @@ impl Processor { debug!( "Invocation Processor | PlatformRuntimeDone | Got Runtime.OutOfMemory. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } } @@ -909,25 +909,25 @@ impl Processor { /// Handles `OnDemand` mode platform report processing. /// - /// Processes OnDemand-specific metrics including OOM detection for provided.al runtimes - /// and post-runtime duration calculation. + /// Processes OnDemand-specific metrics including OOM detection by memory-size + /// equality and post-runtime duration calculation. fn handle_ondemand_report( &mut self, request_id: &String, metrics: OnDemandReportMetrics, timestamp: i64, ) { - // For provided.al runtimes, if the last invocation hit the memory limit, increment the OOM metric. - // We do this for provided.al runtimes because we didn't find another way to detect this under provided.al. - // We don't do this for other runtimes to avoid double counting. - if let Some(runtime) = &self.runtime - && runtime.starts_with("provided.al") - && metrics.max_memory_used_mb == metrics.memory_size_mb - { + // If the invocation hit the memory limit, increment the OOM metric. This catches + // OOM-induced failures that don't surface through a runtime-specific log line or a + // `Runtime.OutOfMemory` error_type — most notably the suppressed-init / timeout-at-cap + // pattern reported in datadog-lambda-extension#1237 (Node) and the historical + // provided.al case. 
Dedup against the other two detection paths is handled by + // `Context::oom_emitted`, which `try_increment_oom_metric` checks and sets. + if metrics.max_memory_used_mb == metrics.memory_size_mb { debug!( "Invocation Processor | PlatformReport | Last invocation hit memory limit. Incrementing OOM metric." ); - self.enhanced_metrics.increment_oom_metric(timestamp); + self.try_increment_oom_metric(Some(request_id), timestamp); } // Calculate and set post-runtime duration if context is available @@ -1395,7 +1395,34 @@ impl Processor { Some(error_tags) } - pub fn on_out_of_memory_error(&mut self, timestamp: i64) { + pub fn on_out_of_memory_error(&mut self, request_id: Option<&String>, timestamp: i64) { + self.try_increment_oom_metric(request_id, timestamp); + } + + /// Increments the OOM enhanced metric exactly once per `request_id`. + /// + /// Several detection paths can fire for the same invocation: + /// 1. A runtime-specific OOM log line (logs processor → `Event::OutOfMemory`) + /// 2. `error_type == "Runtime.OutOfMemory"` in `PlatformRuntimeDone` + /// 3. `max_memory_used_mb == memory_size_mb` in `PlatformReport` + /// + /// To avoid double-counting, the per-invocation `Context::oom_emitted` flag is + /// set on the first emission. Subsequent emissions for the same `request_id` are + /// skipped. If `request_id` is `None` (log path saw the OOM outside an active + /// invocation window) or no context is found, we emit best-effort without dedup. 
+ fn try_increment_oom_metric(&mut self, request_id: Option<&String>, timestamp: i64) { + if let Some(rid) = request_id + && let Some(ctx) = self.context_buffer.get_mut(rid) + { + if ctx.oom_emitted { + debug!( + "Invocation Processor | OOM metric already emitted for request_id {}, skipping", + rid + ); + return; + } + ctx.oom_emitted = true; + } self.enhanced_metrics.increment_oom_metric(timestamp); } @@ -2445,4 +2472,138 @@ mod tests { "pre-existing _dd.appsec.enabled value must not be overwritten" ); } + + /// Two OOM signals for the same `request_id` increment the metric exactly once. + /// Exercises the `Context::oom_emitted` dedup flag. + #[tokio::test] + async fn test_try_increment_oom_metric_dedupes_same_request_id() { + let mut p = setup(); + // Insert the context directly so we don't go through `on_invoke_event`, which + // would populate dynamic tags (`cold_start:true`) and complicate the query. + let request_id = String::from("req-dedup"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&request_id), now); + p.on_out_of_memory_error(Some(&request_id), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted at least once"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 1.0).abs() < f64::EPSILON, + "OOM sum must be 1.0 (deduped), got {sum}" + ); + + // And the context flag should now reflect that we emitted. 
+ assert!( + p.context_buffer + .get(&request_id) + .expect("context") + .oom_emitted, + "oom_emitted flag must be set after the first emission" + ); + } + + /// OOM signals for different `request_id`s each emit a metric — dedup is scoped + /// per request, not globally. + #[tokio::test] + async fn test_try_increment_oom_metric_distinct_request_ids_emit_separately() { + let mut p = setup(); + let req1 = String::from("req-a"); + let req2 = String::from("req-b"); + p.context_buffer.start_context(&req1, Span::default()); + p.context_buffer.start_context(&req2, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + p.on_out_of_memory_error(Some(&req1), now); + p.on_out_of_memory_error(Some(&req2), now); + + let ts = (now / 10) * 10; + let entry = p + .enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts, + ) + .await + .unwrap() + .expect("OOM metric must be emitted"); + + let sketch = entry.value.get_sketch().expect("distribution sketch"); + let sum = sketch.sum().expect("sketch sum"); + assert!( + (sum - 2.0).abs() < f64::EPSILON, + "OOM sum must be 2.0 (one per request_id), got {sum}" + ); + } + + /// Regression: the `max_memory_used_mb == memory_size_mb` path used to be gated + /// on `runtime.starts_with("provided.al")`. After generalising the rule to all + /// runtimes (with dedup via `Context::oom_emitted`), the equality case must + /// still emit OOM. 
+ #[tokio::test] + async fn test_handle_ondemand_report_emits_oom_on_memory_equality() { + let mut p = setup(); + let request_id = String::from("req-eq"); + p.context_buffer.start_context(&request_id, Span::default()); + + let now: i64 = std::time::UNIX_EPOCH + .elapsed() + .expect("clock") + .as_secs() + .try_into() + .unwrap_or_default(); + + let metrics = OnDemandReportMetrics { + duration_ms: 100.0, + billed_duration_ms: 100, + memory_size_mb: 1024, + max_memory_used_mb: 1024, + init_duration_ms: None, + restore_duration_ms: None, + }; + p.handle_ondemand_report(&request_id, metrics, now); + + let ts = (now / 10) * 10; + assert!( + p.enhanced_metrics + .aggr_handle + .get_entry_by_id( + crate::metrics::enhanced::constants::OUT_OF_MEMORY_METRIC.into(), + None, + ts + ) + .await + .unwrap() + .is_some(), + "OOM must be emitted when max_memory_used_mb == memory_size_mb" + ); + } } diff --git a/bottlecap/src/lifecycle/invocation/processor_service.rs b/bottlecap/src/lifecycle/invocation/processor_service.rs index a41a95b26..61c48b479 100644 --- a/bottlecap/src/lifecycle/invocation/processor_service.rs +++ b/bottlecap/src/lifecycle/invocation/processor_service.rs @@ -118,6 +118,7 @@ pub enum ProcessorCommand { execution_status: Option, }, OnOutOfMemoryError { + request_id: Option, timestamp: i64, }, OnShutdownEvent, @@ -407,10 +408,14 @@ impl InvocationProcessorHandle { pub async fn on_out_of_memory_error( &self, + request_id: Option, timestamp: i64, ) -> Result<(), mpsc::error::SendError> { self.sender - .send(ProcessorCommand::OnOutOfMemoryError { timestamp }) + .send(ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + }) .await } @@ -632,8 +637,12 @@ impl InvocationProcessorService { ) .await; } - ProcessorCommand::OnOutOfMemoryError { timestamp } => { - self.processor.on_out_of_memory_error(timestamp); + ProcessorCommand::OnOutOfMemoryError { + request_id, + timestamp, + } => { + self.processor + .on_out_of_memory_error(request_id.as_ref(), 
timestamp); } ProcessorCommand::OnShutdownEvent => { self.processor.on_shutdown_event(); diff --git a/bottlecap/src/logs/lambda/processor.rs b/bottlecap/src/logs/lambda/processor.rs index 643e32854..9c922ac69 100644 --- a/bottlecap/src/logs/lambda/processor.rs +++ b/bottlecap/src/logs/lambda/processor.rs @@ -163,6 +163,32 @@ impl LambdaProcessor { } } + /// Returns the `request_id` of the currently-active invocation, if known. + /// Set by `PlatformStart`, cleared by `PlatformRuntimeDone` / `PlatformReport`. + /// + /// Returns `None` when: + /// - **Managed Instance mode**: extensions cannot subscribe to the `INVOKE` event, + /// so `platform.start` is not delivered and this slot is never populated. OOM logs + /// parsed in MI mode are therefore always tagged `None`. The synthesized + /// `PlatformRuntimeDone` produced by `handle_managed_instance_report` does carry a + /// real `request_id`, so dedup still works for that path. Worst case is a thin + /// double-count window if a runtime emits both an OOM log line and + /// `error_type = Runtime.OutOfMemory` for the same invocation — not observed in + /// practice today. + /// - **Pre-`PlatformStart` init crash**: a FATAL OOM log emitted by init code can + /// arrive before `PlatformStart` (or with no `PlatformStart` at all, if init + /// fails outright). In the no-`PlatformStart` case no other detection path fires, + /// so no double-count. + /// - **Late log race**: a FATAL log parsed after `PlatformRuntimeDone` clears the + /// slot. By then the context has been removed, so no double-count. 
+ fn current_request_id(&self) -> Option { + if self.invocation_context.request_id.is_empty() { + None + } else { + Some(self.invocation_context.request_id.clone()) + } + } + #[allow(clippy::too_many_lines)] async fn get_message(&mut self, event: TelemetryEvent) -> Result> { let copy = event.clone(); @@ -194,7 +220,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } @@ -206,7 +235,7 @@ impl LambdaProcessor { event.time.timestamp_millis(), None, ); - // If the message is logged from the durable execution SDK, + // If the message is logged from the durable execution SDK, // set durable execution id and name as log attributes. if let Some((exec_id, exec_name)) = durable_ctx { msg.lambda.durable_execution_id = Some(exec_id); @@ -227,7 +256,10 @@ impl LambdaProcessor { if let Some(message) = message { if is_oom_error(&message) { debug!("LOGS | Got a runtime-specific OOM error. 
Incrementing OOM metric."); - if let Err(e) = self.event_bus.send(Event::OutOfMemory(event.time.timestamp())).await { + if let Err(e) = self.event_bus.send(Event::OutOfMemory { + request_id: self.current_request_id(), + timestamp: event.time.timestamp(), + }).await { error!("LOGS | Failed to send OOM event to the main event bus: {e}"); } } diff --git a/bottlecap/src/metrics/enhanced/lambda.rs b/bottlecap/src/metrics/enhanced/lambda.rs index abed7d5b9..67535967e 100644 --- a/bottlecap/src/metrics/enhanced/lambda.rs +++ b/bottlecap/src/metrics/enhanced/lambda.rs @@ -91,12 +91,12 @@ impl Lambda { self.increment_metric(constants::TIMEOUTS_METRIC, timestamp); } - // This function is called in three cases: - // 1. Runtime-specific OOM error (can happen in .NET, Node.js and Java as far as we know) - // 2. PlatformRuntimeDone event reports "error_type: Runtime.OutOfMemory" (can happen in Ruby and Python as far as we know) - // 3. PlatformReport event reports "max_memory_used_mb == memory_size_mb" (can happen in many runtimes, but - // we only call increment_oom_metric() for provided.al runtimes) - // This is our best effort to cover different cases without double counting. We can adjust this if we find more cases. + // Callers should generally go through `Processor::try_increment_oom_metric`, which + // dedupes by `request_id` so the same invocation isn't counted multiple times when + // more than one detection path fires. The three paths are: + // 1. Runtime-specific OOM log line (.NET, Node, Java, Go, Ruby, Python) + // 2. PlatformRuntimeDone with error_type == "Runtime.OutOfMemory" (Ruby, Python; Node as of 2026-05) + // 3. 
PlatformReport with max_memory_used_mb == memory_size_mb (all runtimes) pub fn increment_oom_metric(&self, timestamp: i64) { self.increment_metric(constants::OUT_OF_MEMORY_METRIC, timestamp); } diff --git a/integration-tests/bin/app.ts b/integration-tests/bin/app.ts index d822e6cac..affce4270 100644 --- a/integration-tests/bin/app.ts +++ b/integration-tests/bin/app.ts @@ -6,6 +6,7 @@ import {Otlp} from '../lib/stacks/otlp'; import {Snapstart} from '../lib/stacks/snapstart'; import {LambdaManagedInstancesStack} from '../lib/stacks/lmi'; import {AuthStack} from '../lib/stacks/auth'; +import {Oom} from '../lib/stacks/oom'; import {AuthRoleStack} from '../lib/auth-role'; import {ACCOUNT, getIdentifier, REGION} from '../config'; import {CapacityProviderStack} from "../lib/capacity-provider"; @@ -40,6 +41,9 @@ const stacks = [ new AuthStack(app, `integ-${identifier}-auth`, { env, }), + new Oom(app, `integ-${identifier}-oom`, { + env, + }), ] // Tag all stacks so we can easily clean them up diff --git a/integration-tests/lambda/oom-dotnet/Function.cs b/integration-tests/lambda/oom-dotnet/Function.cs new file mode 100644 index 000000000..b5c861493 --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.cs @@ -0,0 +1,25 @@ +using Amazon.Lambda.Core; +using System.Collections.Generic; +using System.Text.Json; + +[assembly: LambdaSerializer(typeof(Amazon.Lambda.Serialization.SystemTextJson.DefaultLambdaJsonSerializer))] + +namespace Function +{ + /// + /// OOM reproducer for .NET. Allocates and retains 10 MB byte arrays in a list + /// until the CLR throws System.OutOfMemoryException. Bottlecap's runtime-specific + /// log-line detection matches "OutOfMemoryException". 
+ /// + public class Handler + { + public Dictionary FunctionHandler(JsonElement input, ILambdaContext context) + { + var data = new List(); + while (true) + { + data.Add(new byte[10 * 1024 * 1024]); + } + } + } +} diff --git a/integration-tests/lambda/oom-dotnet/Function.csproj b/integration-tests/lambda/oom-dotnet/Function.csproj new file mode 100644 index 000000000..2dfcbac5f --- /dev/null +++ b/integration-tests/lambda/oom-dotnet/Function.csproj @@ -0,0 +1,14 @@ + + + net8.0 + enable + enable + true + Lambda + true + + + + + + diff --git a/integration-tests/lambda/oom-go/go.mod b/integration-tests/lambda/oom-go/go.mod new file mode 100644 index 000000000..a73b6d85d --- /dev/null +++ b/integration-tests/lambda/oom-go/go.mod @@ -0,0 +1,5 @@ +module oom-go + +go 1.22 + +require github.com/aws/aws-lambda-go v1.49.0 diff --git a/integration-tests/lambda/oom-go/main.go b/integration-tests/lambda/oom-go/main.go new file mode 100644 index 000000000..99821b0ad --- /dev/null +++ b/integration-tests/lambda/oom-go/main.go @@ -0,0 +1,23 @@ +// OOM reproducer for Go. +// Allocates and retains 10 MB byte slices in a slice header until the Go +// runtime aborts with "fatal error: runtime: out of memory". Bottlecap's +// runtime-specific log-line detection matches that fatal-error message. +// Without that detection (and historically for provided.al runtimes), the +// equality path in PlatformReport (max_memory_used_mb == memory_size_mb) also +// fires. The per-Context dedup flag ensures the metric increments only once. 
+package main + +import ( + "github.com/aws/aws-lambda-go/lambda" +) + +func handler() error { + var data [][]byte + for { + data = append(data, make([]byte, 10*1024*1024)) + } +} + +func main() { + lambda.Start(handler) +} diff --git a/integration-tests/lambda/oom-java/pom.xml b/integration-tests/lambda/oom-java/pom.xml new file mode 100644 index 000000000..1ead70ea0 --- /dev/null +++ b/integration-tests/lambda/oom-java/pom.xml @@ -0,0 +1,50 @@ + + + 4.0.0 + + example + oom-java-lambda + 1.0.0 + jar + + OOM Java Lambda + Java Lambda function that triggers OutOfMemoryError for integration tests + + + 21 + 21 + UTF-8 + + + + + com.amazonaws + aws-lambda-java-core + 1.4.0 + + + + + + + org.apache.maven.plugins + maven-shade-plugin + 3.5.0 + + + package + + shade + + + function + false + + + + + + + diff --git a/integration-tests/lambda/oom-java/src/main/java/example/Handler.java b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java new file mode 100644 index 000000000..92edb9c18 --- /dev/null +++ b/integration-tests/lambda/oom-java/src/main/java/example/Handler.java @@ -0,0 +1,24 @@ +package example; + +import com.amazonaws.services.lambda.runtime.Context; +import com.amazonaws.services.lambda.runtime.RequestHandler; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; + +/** + * OOM reproducer for Java. Allocates and retains 10 MB byte arrays in a list + * until the JVM throws java.lang.OutOfMemoryError: Java heap space. + * Bottlecap's runtime-specific log-line detection matches + * "java.lang.OutOfMemoryError". 
+ */ +public class Handler implements RequestHandler, Map> { + + @Override + public Map handleRequest(Map event, Context context) { + List data = new ArrayList<>(); + while (true) { + data.add(new byte[10 * 1024 * 1024]); + } + } +} diff --git a/integration-tests/lambda/oom-node-sigkill/index.mjs b/integration-tests/lambda/oom-node-sigkill/index.mjs new file mode 100644 index 000000000..d6b245f36 --- /dev/null +++ b/integration-tests/lambda/oom-node-sigkill/index.mjs @@ -0,0 +1,13 @@ +// OOM reproducer: off-heap Buffer growth → kernel SIGKILL. +// Buffer.allocUnsafe(>8KB) goes through V8's ArrayBuffer allocator (external +// memory) and bypasses --max-old-space-size, so RSS grows until the cgroup +// limit triggers a kernel SIGKILL. Lambda surfaces this as PlatformRuntimeDone +// with error_type=Runtime.OutOfMemory — bottlecap's path 2 detection. +export const handler = async () => { + const bufs = []; + while (true) { + const b = Buffer.allocUnsafe(20 * 1024 * 1024); + b.fill(0); + bufs.push(b); + } +}; diff --git a/integration-tests/lambda/oom-node-v8-heap/index.mjs b/integration-tests/lambda/oom-node-v8-heap/index.mjs new file mode 100644 index 000000000..fb4e71c6f --- /dev/null +++ b/integration-tests/lambda/oom-node-v8-heap/index.mjs @@ -0,0 +1,10 @@ +// OOM reproducer: classic V8 heap exhaustion. Allocates retained strings in a +// loop until V8 hits its --max-old-space-size cap and prints +// "FATAL ERROR: ... JavaScript heap out of memory". Exercises bottlecap's +// runtime-specific log-line OOM detection path. +export const handler = async () => { + const arr = []; + while (true) { + arr.push('x'.repeat(10 * 1024 * 1024)); + } +}; diff --git a/integration-tests/lambda/oom-python/lambda_function.py b/integration-tests/lambda/oom-python/lambda_function.py new file mode 100644 index 000000000..12aa196ed --- /dev/null +++ b/integration-tests/lambda/oom-python/lambda_function.py @@ -0,0 +1,12 @@ +# OOM reproducer for Python. 
+# Allocates and retains 10 MB strings in a list until CPython raises +# MemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "MemoryError". Both bottlecap detection paths fire — the dedup flag is +# what makes the OOM metric emit exactly once. + + +def handler(event, context): + data = [] + while True: + data.append("x" * (10 * 1024 * 1024)) diff --git a/integration-tests/lambda/oom-ruby/lambda_function.rb b/integration-tests/lambda/oom-ruby/lambda_function.rb new file mode 100644 index 000000000..674a70086 --- /dev/null +++ b/integration-tests/lambda/oom-ruby/lambda_function.rb @@ -0,0 +1,13 @@ +# OOM reproducer for Ruby. +# Allocates and retains 10 MB strings in an array until Ruby raises +# NoMemoryError. Lambda surfaces this as PlatformRuntimeDone with +# error_type=Runtime.OutOfMemory; the function log line also contains +# "failed to allocate memory (NoMemoryError)". Both bottlecap detection +# paths fire — the dedup flag is what makes the OOM metric emit exactly once. + +def handler(event:, context:) + data = [] + loop do + data << ("x" * (10 * 1024 * 1024)) + end +end diff --git a/integration-tests/lib/stacks/oom.ts b/integration-tests/lib/stacks/oom.ts new file mode 100644 index 000000000..dc7c6314e --- /dev/null +++ b/integration-tests/lib/stacks/oom.ts @@ -0,0 +1,216 @@ +import * as cdk from 'aws-cdk-lib'; +import * as lambda from 'aws-cdk-lib/aws-lambda'; +import { Construct } from 'constructs'; +import { + createLogGroup, + defaultDatadogEnvVariables, + defaultDatadogSecretPolicy, + getExtensionLayer, + getDefaultNodeLayer, + getDefaultPythonLayer, + getDefaultJavaLayer, + getDefaultDotnetLayer, + getDefaultRubyLayer, + defaultNodeRuntime, + defaultPythonRuntime, + defaultJavaRuntime, + defaultDotnetRuntime, + defaultRubyRuntime, + defaultGoRuntime, +} from '../util'; + +/** + * OOM cross-runtime test stack. 
+ * + * Deploys one Lambda per OOM "shape" so the bottlecap dedup change + * (Context::oom_emitted + try_increment_oom_metric, covering issue #1237) + * can be exercised end-to-end across every supported runtime. Each function + * intentionally allocates until it OOMs; the test then asserts the + * `aws.lambda.enhanced.out_of_memory` metric increments by exactly 1. + * + * The detection paths exercised per case: + * - oom-node-v8-heap : log-line match `JavaScript heap out of memory` + * - oom-node-sigkill : PlatformRuntimeDone `error_type=Runtime.OutOfMemory` + * - oom-python : log line `MemoryError` + PlatformRuntimeDone (dedup) + * - oom-ruby : log line `NoMemoryError` + PlatformRuntimeDone (dedup) + * - oom-java : log line `java.lang.OutOfMemoryError` + * - oom-dotnet : log line `OutOfMemoryException` + * - oom-go : log line `fatal error: runtime: out of memory` + * + PlatformReport memory equality (dedup) + * + * Each function is configured with low memory (192 MB) and a short timeout + * (30 s) so the OOM fires quickly during the integration-test run. + */ +export class Oom extends cdk.Stack { + constructor(scope: Construct, id: string, props: cdk.StackProps) { + super(scope, id, props); + + const extensionLayer = getExtensionLayer(this); + const nodeLayer = getDefaultNodeLayer(this); + const pythonLayer = getDefaultPythonLayer(this); + const javaLayer = getDefaultJavaLayer(this); + const dotnetLayer = getDefaultDotnetLayer(this); + const rubyLayer = getDefaultRubyLayer(this); + + const oomMemorySize = 192; + const oomTimeout = cdk.Duration.seconds(30); + + // Node case A — V8 heap exhaustion (log-line path). 
+ const nodeV8FunctionName = `${id}-node-v8-heap-lambda`; + const nodeV8Function = new lambda.Function(this, nodeV8FunctionName, { + runtime: defaultNodeRuntime, + architecture: lambda.Architecture.ARM_64, + handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-node-v8-heap'), + functionName: nodeV8FunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: nodeV8FunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'index.handler', + // Cap V8 heap below the Lambda memory cap so V8 throws its OOM error + // before the kernel SIGKILLs the process. + NODE_OPTIONS: '--max-old-space-size=128', + }, + logGroup: createLogGroup(this, nodeV8FunctionName), + }); + nodeV8Function.addToRolePolicy(defaultDatadogSecretPolicy); + nodeV8Function.addLayers(extensionLayer); + nodeV8Function.addLayers(nodeLayer); + + // Node case B — off-heap Buffer / kernel SIGKILL (PlatformRuntimeDone path). + const nodeSigkillFunctionName = `${id}-node-sigkill-lambda`; + const nodeSigkillFunction = new lambda.Function(this, nodeSigkillFunctionName, { + runtime: defaultNodeRuntime, + architecture: lambda.Architecture.ARM_64, + handler: '/opt/nodejs/node_modules/datadog-lambda-js/handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-node-sigkill'), + functionName: nodeSigkillFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: nodeSigkillFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'index.handler', + }, + logGroup: createLogGroup(this, nodeSigkillFunctionName), + }); + nodeSigkillFunction.addToRolePolicy(defaultDatadogSecretPolicy); + nodeSigkillFunction.addLayers(extensionLayer); + nodeSigkillFunction.addLayers(nodeLayer); + + // Python — MemoryError; log path and PlatformRuntimeDone path both fire. 
+ const pythonFunctionName = `${id}-python-lambda`; + const pythonFunction = new lambda.Function(this, pythonFunctionName, { + runtime: defaultPythonRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda.handler.handler', + code: lambda.Code.fromAsset('./lambda/oom-python'), + functionName: pythonFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: pythonFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + }, + logGroup: createLogGroup(this, pythonFunctionName), + }); + pythonFunction.addToRolePolicy(defaultDatadogSecretPolicy); + pythonFunction.addLayers(extensionLayer); + pythonFunction.addLayers(pythonLayer); + + // Ruby — NoMemoryError; log path and PlatformRuntimeDone path both fire. + const rubyFunctionName = `${id}-ruby-lambda`; + const rubyFunction = new lambda.Function(this, rubyFunctionName, { + runtime: defaultRubyRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'datadog_lambda_rb.handler', + code: lambda.Code.fromAsset('./lambda/oom-ruby'), + functionName: rubyFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: rubyFunctionName, + DD_TRACE_ENABLED: 'true', + DD_LAMBDA_HANDLER: 'lambda_function.handler', + }, + logGroup: createLogGroup(this, rubyFunctionName), + }); + rubyFunction.addToRolePolicy(defaultDatadogSecretPolicy); + rubyFunction.addLayers(extensionLayer); + rubyFunction.addLayers(rubyLayer); + + // Java — OutOfMemoryError (log-line path). 
+ const javaFunctionName = `${id}-java-lambda`; + const javaFunction = new lambda.Function(this, javaFunctionName, { + runtime: defaultJavaRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'example.Handler::handleRequest', + code: lambda.Code.fromAsset('./lambda/oom-java/target/function.jar'), + functionName: javaFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: javaFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + DD_TRACE_ENABLED: 'true', + }, + logGroup: createLogGroup(this, javaFunctionName), + }); + javaFunction.addToRolePolicy(defaultDatadogSecretPolicy); + javaFunction.addLayers(extensionLayer); + javaFunction.addLayers(javaLayer); + + // .NET — OutOfMemoryException (log-line path). + const dotnetFunctionName = `${id}-dotnet-lambda`; + const dotnetFunction = new lambda.Function(this, dotnetFunctionName, { + runtime: defaultDotnetRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'Function::Function.Handler::FunctionHandler', + code: lambda.Code.fromAsset('./lambda/oom-dotnet/bin/function.zip'), + functionName: dotnetFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: dotnetFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + }, + logGroup: createLogGroup(this, dotnetFunctionName), + }); + dotnetFunction.addToRolePolicy(defaultDatadogSecretPolicy); + dotnetFunction.addLayers(extensionLayer); + dotnetFunction.addLayers(dotnetLayer); + + // Go — runtime fatal error + PlatformReport memory equality (dedup). + // Go runs on the custom runtime, so the binary itself is the handler. 
+ const goFunctionName = `${id}-go-lambda`; + const goFunction = new lambda.Function(this, goFunctionName, { + runtime: defaultGoRuntime, + architecture: lambda.Architecture.ARM_64, + handler: 'bootstrap', + code: lambda.Code.fromAsset('./lambda/oom-go/bin'), + functionName: goFunctionName, + timeout: oomTimeout, + memorySize: oomMemorySize, + environment: { + ...defaultDatadogEnvVariables, + DD_SERVICE: goFunctionName, + AWS_LAMBDA_EXEC_WRAPPER: '/opt/datadog_wrapper', + }, + logGroup: createLogGroup(this, goFunctionName), + }); + goFunction.addToRolePolicy(defaultDatadogSecretPolicy); + goFunction.addLayers(extensionLayer); + // Go has no tracer layer — the Datadog tracer for Go is a Go module imported + // into the function source. The extension layer alone is enough for the + // enhanced metrics this test asserts on. + } +} diff --git a/integration-tests/lib/util.ts b/integration-tests/lib/util.ts index dd8309789..24fe04164 100644 --- a/integration-tests/lib/util.ts +++ b/integration-tests/lib/util.ts @@ -13,11 +13,15 @@ export const defaultNodeRuntime = lambda.Runtime.NODEJS_24_X; export const defaultPythonRuntime = lambda.Runtime.PYTHON_3_13; export const defaultJavaRuntime = lambda.Runtime.JAVA_21; export const defaultDotnetRuntime = lambda.Runtime.DOTNET_8; +export const defaultRubyRuntime = lambda.Runtime.RUBY_3_4; +// Go runs on the custom runtime; the Datadog tracer is a Go module, not a layer. 
+export const defaultGoRuntime = lambda.Runtime.PROVIDED_AL2023;
 
 export const defaultNodeLayerArn = process.env.NODE_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Node24-x:132';
 export const defaultPythonLayerArn = process.env.PYTHON_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Python313-ARM:117';
 export const defaultJavaLayerArn = process.env.JAVA_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-java:25';
 export const defaultDotnetLayerArn = process.env.DOTNET_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:dd-trace-dotnet-ARM:23';
+export const defaultRubyLayerArn = process.env.RUBY_TRACER_LAYER_ARN || 'arn:aws:lambda:us-east-1:464622532012:layer:Datadog-Ruby3-4-ARM:9';
 
 export const defaultDatadogEnvVariables = {
   DD_API_KEY_SECRET_ARN: datadogSecretArn,
@@ -87,6 +91,14 @@ export const getDefaultDotnetLayer = (scope: Construct) => {
   );
 };
 
+export const getDefaultRubyLayer = (scope: Construct) => {
+  return LayerVersion.fromLayerVersionArn(
+    scope,
+    'DatadogRubyLayer',
+    defaultRubyLayerArn
+  );
+};
+
 export const capacityProviderArn = `arn:aws:lambda:${REGION}:${ACCOUNT}:capacity-provider:integ-default-capacity-provider-cp`;
 
 export function setCapacityProvider(lambdaFunction: lambda.Function) {
diff --git a/integration-tests/scripts/build-go.sh b/integration-tests/scripts/build-go.sh
new file mode 100755
index 000000000..8f24bc45c
--- /dev/null
+++ b/integration-tests/scripts/build-go.sh
@@ -0,0 +1,138 @@
+#!/bin/bash
+set -e
+
+# Reusable script to cross-compile Go Lambda functions for ARM64 Linux.
+# Outputs a binary named `bootstrap` (required by the AWS Lambda custom runtime
+# provided.al2023) under <lambda-dir>/bin/.
+#
+# Usage:
+#   ./build-go.sh                # Build all Go Lambda functions
+#   ./build-go.sh <lambda-dir>   # Build a specific Lambda function
+#
+# A relative <lambda-dir> is resolved against the parent of this script's
+# directory.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda"
+
+# Cross-compile a single Go Lambda into <lambda-dir>/bin/bootstrap.
+#
+# Arguments:
+#   $1 - Lambda source directory; must exist and contain go.mod.
+# Returns:
+#   0 when bin/bootstrap was produced, 1 otherwise. Failure diagnostics go to
+#   stderr so they stay visible when stdout is redirected.
+build_go_lambda() {
+    local LAMBDA_DIR="$1"
+    # Declare before assigning: `local var=$(cmd)` would hide cmd's exit status.
+    local FUNCTION_NAME
+    FUNCTION_NAME=$(basename "$LAMBDA_DIR")
+    # Keep the cache path function-scoped instead of leaking a global.
+    local GO_MOD_CACHE
+
+    if [ ! -d "$LAMBDA_DIR" ]; then
+        echo "Error: Directory not found: $LAMBDA_DIR" >&2
+        return 1
+    fi
+
+    if [ ! -f "$LAMBDA_DIR/go.mod" ]; then
+        echo "Error: go.mod not found in $LAMBDA_DIR" >&2
+        return 1
+    fi
+
+    echo "Building Go Lambda: $FUNCTION_NAME"
+
+    if ! command -v docker &> /dev/null; then
+        echo "Error: Docker is not installed or not in PATH" >&2
+        return 1
+    fi
+
+    # Clean previous build (idempotent).
+    rm -rf "$LAMBDA_DIR/bin"
+    mkdir -p "$LAMBDA_DIR/bin"
+
+    # Module cache: reuse the host's $GOPATH/pkg/mod when running locally;
+    # use a project-local cache in CI so it can be cached between jobs.
+    if [ -n "$CI" ]; then
+        GO_MOD_CACHE="$SCRIPT_DIR/../.cache/go-mod"
+    else
+        GO_MOD_CACHE="${GOPATH:-$HOME/go}/pkg/mod"
+    fi
+    mkdir -p "$GO_MOD_CACHE"
+
+    # Cross-compile to ARM64 Linux inside the official Go image.
+    # CGO is disabled so the binary runs on the provided.al2023 base image
+    # without a libc mismatch.
+    docker run --rm --platform linux/arm64 \
+        -v "$LAMBDA_DIR":/workspace \
+        -v "$GO_MOD_CACHE":/go/pkg/mod \
+        -w /workspace \
+        -e GOOS=linux \
+        -e GOARCH=arm64 \
+        -e CGO_ENABLED=0 \
+        public.ecr.aws/docker/library/golang:1.22-bookworm \
+        sh -c "go mod tidy && go build -o bin/bootstrap ."
+
+    # When called from an `if`, `set -e` is suspended inside this function, so
+    # verify the artifact explicitly rather than relying on docker's status.
+    if [ ! -f "$LAMBDA_DIR/bin/bootstrap" ]; then
+        echo "✗ Build failed: bin/bootstrap not produced" >&2
+        return 1
+    fi
+
+    echo "✓ Build complete: $LAMBDA_DIR/bin/bootstrap"
+    return 0
+}
+
+if [ -z "$1" ]; then
+    echo "=========================================="
+    echo "Building all Go Lambda functions"
+    echo "=========================================="
+    echo ""
+
+    FOUND_GO=0
+    FAILED_BUILDS=()
+
+    for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do
+        if [ ! -d "$LAMBDA_PATH" ]; then
+            continue
+        fi
+
+        FUNCTION_NAME=$(basename "$LAMBDA_PATH")
+
+        # Match directories whose suffix is `-go` or whose name is exactly `go`.
+        if [[ "$FUNCTION_NAME" == *"-go" || "$FUNCTION_NAME" == "go" ]]; then
+            FOUND_GO=1
+            echo "----------------------------------------"
+            if build_go_lambda "$LAMBDA_PATH"; then
+                echo "✓ $FUNCTION_NAME built successfully"
+            else
+                echo "✗ $FUNCTION_NAME failed"
+                FAILED_BUILDS+=("$FUNCTION_NAME")
+            fi
+            echo ""
+        fi
+    done
+
+    if [ "$FOUND_GO" -eq 0 ]; then
+        echo "No Go Lambda functions found (looking for directories ending in -go)"
+        exit 0
+    fi
+
+    if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then
+        echo "✓ All Go Lambda builds completed successfully!"
+        exit 0
+    fi
+
+    echo "✗ ${#FAILED_BUILDS[@]} Go Lambda build(s) failed:"
+    for failed in "${FAILED_BUILDS[@]}"; do
+        echo " - $failed"
+    done
+    exit 1
+else
+    LAMBDA_DIR="$1"
+    if [[ "$LAMBDA_DIR" != /* ]]; then
+        LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR"
+    fi
+    build_go_lambda "$LAMBDA_DIR"
+fi
diff --git a/integration-tests/scripts/build-ruby.sh b/integration-tests/scripts/build-ruby.sh
new file mode 100755
index 000000000..0ca36064d
--- /dev/null
+++ b/integration-tests/scripts/build-ruby.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+set -e
+
+# Reusable script to build Ruby Lambda functions.
+# For simple Ruby Lambdas with no gem dependencies, this just packages the
+# source as-is — the runtime + Datadog tracer layer provide everything needed.
+# If the function gains a Gemfile, this script grows a bundle install step
+# in a Docker container (mirroring build-python.sh / build-node.sh).
+#
+# Usage:
+#   ./build-ruby.sh                # Build all Ruby Lambda functions
+#   ./build-ruby.sh <lambda-dir>   # Build a specific Lambda function
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+LAMBDA_BASE_DIR="$SCRIPT_DIR/../lambda"
+
+# Check the Ruby Lambda in $1: returns 0 if deployable as-is, 1 otherwise.
+build_ruby_lambda() {
+    local LAMBDA_DIR="$1" FUNCTION_NAME
+    FUNCTION_NAME=$(basename "$LAMBDA_DIR") # assigned apart from `local` so its exit status is not masked
+
+    if [ ! -d "$LAMBDA_DIR" ]; then
+        echo "Error: Directory not found: $LAMBDA_DIR" >&2
+        return 1
+    fi
+
+    echo "Building Ruby Lambda: $FUNCTION_NAME"
+    if [ ! -f "$LAMBDA_DIR/Gemfile" ]; then
+        echo "ℹ No Gemfile found — source files are deployed as-is"
+        return 0
+    fi
+
+    echo "Error: Gemfile-based Ruby builds are not implemented yet" >&2
+    echo " Add a Dockerised \`bundle install\` step to this script when needed." >&2
+    return 1
+}
+
+if [ -z "$1" ]; then
+    echo "=========================================="
+    echo "Building all Ruby Lambda functions"
+    echo "=========================================="
+    echo ""
+
+    FOUND_RUBY=0
+    FAILED_BUILDS=()
+
+    for LAMBDA_PATH in "$LAMBDA_BASE_DIR"/*; do
+        if [ ! -d "$LAMBDA_PATH" ]; then
+            continue
+        fi
+
+        FUNCTION_NAME=$(basename "$LAMBDA_PATH")
+
+        if [[ "$FUNCTION_NAME" == *"ruby"* ]]; then
+            FOUND_RUBY=1
+            echo "----------------------------------------"
+            if build_ruby_lambda "$LAMBDA_PATH"; then
+                echo "✓ $FUNCTION_NAME built successfully"
+            else
+                echo "✗ $FUNCTION_NAME failed"
+                FAILED_BUILDS+=("$FUNCTION_NAME")
+            fi
+            echo ""
+        fi
+    done
+
+    if [ "$FOUND_RUBY" -eq 0 ]; then
+        echo "No Ruby Lambda functions found (looking for directories with 'ruby' in name)"
+        exit 0
+    fi
+
+    if [ ${#FAILED_BUILDS[@]} -eq 0 ]; then
+        echo "✓ All Ruby Lambda builds completed successfully!"
+        exit 0
+    fi
+
+    echo "✗ ${#FAILED_BUILDS[@]} Ruby Lambda build(s) failed:"
+    for failed in "${FAILED_BUILDS[@]}"; do
+        echo " - $failed"
+    done
+    exit 1
+else
+    LAMBDA_DIR="$1"
+    if [[ "$LAMBDA_DIR" != /* ]]; then
+        LAMBDA_DIR="$(cd "$SCRIPT_DIR/.." && pwd)/$LAMBDA_DIR"
+    fi
+    build_ruby_lambda "$LAMBDA_DIR"
+fi
diff --git a/integration-tests/scripts/local_deploy.sh b/integration-tests/scripts/local_deploy.sh
index b432261da..451b81cf6 100755
--- a/integration-tests/scripts/local_deploy.sh
+++ b/integration-tests/scripts/local_deploy.sh
@@ -43,6 +43,8 @@ echo "Building all Lambda functions in parallel..."
 "$SCRIPT_DIR/build-dotnet.sh" &
 "$SCRIPT_DIR/build-python.sh" &
 "$SCRIPT_DIR/build-node.sh" &
+"$SCRIPT_DIR/build-ruby.sh" &
+"$SCRIPT_DIR/build-go.sh" &
 wait
 
 echo "All Lambda builds complete"
diff --git a/integration-tests/tests/oom.test.ts b/integration-tests/tests/oom.test.ts
new file mode 100644
index 000000000..5e33e4c8e
--- /dev/null
+++ b/integration-tests/tests/oom.test.ts
@@ -0,0 +1,90 @@
+import { invokeLambda } from './utils/lambda';
+import { getMetricCount, OUT_OF_MEMORY_METRIC } from './utils/datadog';
+import { DEFAULT_DATADOG_INDEXING_WAIT_MS } from '../config';
+import { getIdentifier } from '../config';
+
+/**
+ * Cross-runtime OOM test.
+ *
+ * Each function is intentionally configured to OOM on its first invocation.
+ * Bottlecap has three detection paths that can fire for the same invocation
+ * (runtime-specific log line, `Runtime.OutOfMemory` `error_type` in
+ * `PlatformRuntimeDone`, `max_memory_used_mb == memory_size_mb` in
+ * `PlatformReport`); the `Context::oom_emitted` flag introduced for #1237
+ * dedupes them so the metric increments exactly once per invocation.
+ *
+ * The Python/Ruby/Go cases are particularly meaningful regressions because
+ * they trigger more than one detection path naturally — if dedup is broken,
+ * those counts go to 2.
+ */
+const identifier = getIdentifier();
+const stackName = `integ-${identifier}-oom`;
+
+interface OomCase {
+  runtime: string;
+  functionName: string;
+}
+
+const cases: OomCase[] = [
+  { runtime: 'node-v8-heap', functionName: `${stackName}-node-v8-heap-lambda` },
+  { runtime: 'node-sigkill', functionName: `${stackName}-node-sigkill-lambda` },
+  { runtime: 'python', functionName: `${stackName}-python-lambda` },
+  { runtime: 'ruby', functionName: `${stackName}-ruby-lambda` },
+  { runtime: 'java', functionName: `${stackName}-java-lambda` },
+  { runtime: 'dotnet', functionName: `${stackName}-dotnet-lambda` },
+  { runtime: 'go', functionName: `${stackName}-go-lambda` },
+];
+
+async function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms)); // resolves once the timer fires
+}
+
+describe('OOM Integration Tests', () => {
+  let countsByRuntime: Record<string, number>; // runtime label -> summed metric value
+  let windowStart: number; // epoch ms, taken just before the invocations
+  let windowEnd: number; // epoch ms, taken after the indexing wait
+
+  // Invoke every function once, wait for Datadog to ingest, then query once
+  // for each. Keeping invocations and the query inside `beforeAll` lets each
+  // per-runtime test below assert against the same data set.
+  beforeAll(async () => {
+    windowStart = Date.now();
+
+    await Promise.all(
+      cases.map((c) =>
+        invokeLambda(c.functionName).catch((err) => {
+          // OOM functions usually succeed at the Invoke API layer (the function
+          // is run, just crashes), so a thrown error here is unexpected
+          // infrastructure failure rather than the OOM itself. Re-throw so the
+          // test surfaces it.
+          throw new Error(`Invoke failed for ${c.functionName}: ${err}`);
+        }),
+      ),
+    );
+
+    await sleep(DEFAULT_DATADOG_INDEXING_WAIT_MS);
+    windowEnd = Date.now();
+
+    const results = await Promise.all(
+      cases.map(async (c) => ({
+        runtime: c.runtime,
+        count: await getMetricCount(
+          OUT_OF_MEMORY_METRIC,
+          c.functionName,
+          windowStart,
+          windowEnd,
+        ),
+      })),
+    );
+
+    countsByRuntime = Object.fromEntries(results.map((r) => [r.runtime, r.count]));
+    console.log('OOM counts by runtime:', countsByRuntime);
+  }, 10 * 60 * 1000); // hook timeout: 10 minutes
+
+  describe.each(cases)('$runtime runtime', ({ runtime }) => {
+    it('should emit exactly one out_of_memory metric for one OOM invocation', () => {
+      const count = countsByRuntime[runtime];
+      expect(count).toBe(1);
+    });
+  });
+});
diff --git a/integration-tests/tests/utils/datadog.ts b/integration-tests/tests/utils/datadog.ts
index ed3768ea0..e25225235 100644
--- a/integration-tests/tests/utils/datadog.ts
+++ b/integration-tests/tests/utils/datadog.ts
@@ -91,6 +91,8 @@ export const DURATION_METRICS = [
   'aws.lambda.enhanced.init_duration',
 ];
 
+export const OUT_OF_MEMORY_METRIC = 'aws.lambda.enhanced.out_of_memory';
+
 export type EnhancedMetrics = Record;
 
 export interface MetricPoint {
@@ -289,6 +291,41 @@ export async function getEnhancedMetrics(
   return metrics;
 }
 
+/**
+ * Returns the total emission count of a counter / distribution enhanced metric
+ * for a single function over the given window, by summing all data-point
+ * values returned by Datadog. Used by oom.test.ts to assert that
+ * `aws.lambda.enhanced.out_of_memory` increments exactly once per invocation —
+ * verifying the per-Context `oom_emitted` dedup flag introduced for #1237.
+ */
+export async function getMetricCount(
+  metricName: string,
+  functionName: string,
+  fromTime: number,
+  toTime: number,
+): Promise<number> {
+  const baseFunctionName = getServiceName(functionName).toLowerCase();
+  const query = `sum:${metricName}{functionname:${baseFunctionName}}.as_count()`;
+
+  console.log(`Querying metric count: ${query}`);
+
+  const response = await datadogClient.get('/api/v1/query', {
+    params: {
+      query,
+      from: Math.floor(fromTime / 1000), // callers pass epoch ms; sent as whole seconds
+      to: Math.floor(toTime / 1000),
+    },
+  });
+
+  const series = response.data.series || [];
+  if (series.length === 0) {
+    return 0; // no series returned: report a zero count
+  }
+
+  const pointlist: [number, number][] = series[0].pointlist || [];
+  return pointlist.reduce((acc, [, value]) => acc + (value || 0), 0); // falsy values add 0
+}
+
 async function getMetrics(
   metricName: string,
   functionName: string,