From 21ac66986db70d4db99fd44dd681cb32c454c3e2 Mon Sep 17 00:00:00 2001 From: alexanderkirtzel Date: Wed, 3 Jun 2026 15:38:30 +0200 Subject: [PATCH 01/10] scoped generic --- .changeset/scoped-generic-data-attribute.md | 10 ++ packages/collector/src/constants.ts | 1 + packages/collector/src/types/collector.ts | 1 + .../mcps/source-browser/src/resources/docs.ts | 18 ++- .../browser/src/__tests__/html/walker.html | 99 ++++++++++++++ .../browser/src/__tests__/shadowDom.test.ts | 1 + .../browser/src/__tests__/trigger.test.ts | 1 + .../browser/src/__tests__/walker.test.ts | 124 ++++++++++++++++++ packages/web/sources/browser/src/walker.ts | 12 ++ .../web/browser/tagging/html-attributes.mdx | 42 +++++- 10 files changed, 306 insertions(+), 3 deletions(-) create mode 100644 .changeset/scoped-generic-data-attribute.md diff --git a/.changeset/scoped-generic-data-attribute.md b/.changeset/scoped-generic-data-attribute.md new file mode 100644 index 000000000..dda489a26 --- /dev/null +++ b/.changeset/scoped-generic-data-attribute.md @@ -0,0 +1,10 @@ +--- +'@walkeros/web-source-browser': minor +'@walkeros/mcp-source-browser': patch +--- + +Add the `data-elb_` scoped generic attribute. It carries the same `key:value` +properties as the blanket `data-elb-` generic, but only events whose triggered +element is nested below the `data-elb_` element receive them. Use `data-elb-` +for properties every trigger in an entity should carry, and `data-elb_` when +only triggers within a specific branch should. diff --git a/packages/collector/src/constants.ts b/packages/collector/src/constants.ts index ea5b97c44..dedc32a9e 100644 --- a/packages/collector/src/constants.ts +++ b/packages/collector/src/constants.ts @@ -18,6 +18,7 @@ export const Commands: Record = { Prefix: 'data-elb', Ready: 'ready', Run: 'run', + Scoped: '_', Session: 'session', Shutdown: 'shutdown', User: 'user', diff --git a/packages/collector/src/types/collector.ts b/packages/collector/src/types/collector.ts index d6754c856..afd5bdced 100644 --- a/packages/collector/src/types/collector.ts +++ b/packages/collector/src/types/collector.ts @@ -32,6 +32,7 @@ export type CommandTypes = | 'Prefix' | 'Ready' | 'Run' + | 'Scoped' | 'Session' | 'Shutdown' | 'User' diff --git a/packages/mcps/source-browser/src/resources/docs.ts b/packages/mcps/source-browser/src/resources/docs.ts index 7c0990f9d..0014b796c 100644 --- a/packages/mcps/source-browser/src/resources/docs.ts +++ b/packages/mcps/source-browser/src/resources/docs.ts @@ -163,6 +163,22 @@ Add the \`[]\` suffix to a property's name, such as \`size[]:m\`. It will genera Leave the entity name empty (only \`data-elb-\`) to add the property to any related entity. Explicitly named properties are preferred over generic ones. +### Scoped generic properties + +Use \`data-elb_\` (trailing underscore, no dash) to add a generic property that reaches only the triggers nested below it. It uses the same \`key:value\` payload as \`data-elb-\` (including \`[]\` arrays and \`#dynamic\` values), but it is collected only while bubbling up from the triggered element, so sibling triggers outside its branch never receive it. A scoped value closer to the trigger wins over a value set farther up the tree. + +\`\`\`html +
+
+ +
+
+ +
+\`\`\` + +The first button gets \`{ name: 'A', color: 'red', size: 'L' }\`; the second button gets \`{ name: 'A', color: 'red' }\` (no \`size\`). + ## Globals Properties that apply to **all events on a page**. Define them anywhere using the \`data-elbglobals\` attribute. Globals are collected once, right before the first event. @@ -203,7 +219,7 @@ A \`data-elb\` entity within another \`data-elb\` entity is called a nested enti ## Reserved attributes -\`data-elb\`, \`data-elbaction\`, \`data-elbactions\`, \`data-elbcontext\`, \`data-elbglobals\`, and \`data-elblink\` are reserved attributes. \`data-elb-*\` attributes may be arbitrary combinations based on the related entity name. +\`data-elb\`, \`data-elbaction\`, \`data-elbactions\`, \`data-elbcontext\`, \`data-elbglobals\`, and \`data-elblink\` are reserved attributes. \`data-elb-*\` attributes may be arbitrary combinations based on the related entity name. \`data-elb_\` is a reserved path-scoped generic that uses the same value syntax as \`data-elb-\` but only reaches triggers nested below it. `; const TAGGER_DOCS = `# Tagger diff --git a/packages/web/sources/browser/src/__tests__/html/walker.html b/packages/web/sources/browser/src/__tests__/html/walker.html index b5895a6f4..ea987f84c 100644 --- a/packages/web/sources/browser/src/__tests__/html/walker.html +++ b/packages/web/sources/browser/src/__tests__/html/walker.html @@ -139,6 +139,105 @@ + +
+
+ +
+
+ +
+ +
+ +
+ +
+ +
+ +
+
+ +
+
+ +
+
+ +
+
+
+ +
+
+
+ +
+
+
+ +
+
+
+ +
+ +
+ +
+ +
+ +
+
+ +
+ +
+
+
+ +
+
+
+ +
+ + + +
+
+ +
+
+ +
+
+
+ +
+
+
+ +
+ + +
+
`} + language="html" +/> + +Clicking the first button bubbles up through the `data-elb_` element, so it +receives `size:L` along with the blanket `color:red`. Clicking the second +button never passes through that element, so it has no `size`. + + + +A scoped value on an element closer to the trigger wins over a value set +farther up the tree, including an explicit entity property on a higher element. + ## Globals There might be properties that don't belong to just one event but to **all From c27d3c1d0520caf16ae0d4fb33e66d79d976d792 Mon Sep 17 00:00:00 2001 From: alexanderkirtzel Date: Wed, 3 Jun 2026 16:34:25 +0200 Subject: [PATCH 02/10] batch n cache --- .changeset/config-batch-enabler.md | 13 + .changeset/request-cache-serialization.md | 10 + .../src/__tests__/batch-all.test.ts | 63 ++++ apps/quickstart/src/batch-all.ts | 51 +++ .../__tests__/cache-codec-roundtrip.test.ts | 316 ++++++++++++++++++ .../src/__tests__/destination.test.ts | 179 ++++++++++ .../collector/src/__tests__/shutdown.test.ts | 15 + packages/collector/src/destination.ts | 22 +- packages/collector/src/shutdown.ts | 31 +- packages/core/src/__tests__/cache.test.ts | 159 ++++++++- packages/core/src/cache.ts | 106 +++++- packages/core/src/schemas/destination.ts | 2 +- packages/core/src/types/destination.ts | 14 +- packages/core/src/types/mapping.ts | 11 +- .../gcp/src/bigquery/__tests__/index.test.ts | 42 +++ .../gcp/src/bigquery/pushBatch.ts | 7 +- .../src/__tests__/cache-roundtrip.test.ts | 170 ++++++++++ .../stores/fs/src/__tests__/store.test.ts | 26 ++ packages/server/stores/fs/src/store.ts | 9 +- .../stores/gcs/src/__tests__/store.test.ts | 49 +++ .../stores/s3/src/__tests__/store.test.ts | 43 +++ .../web/destinations/api/src/index.test.ts | 43 ++- packages/web/destinations/api/src/index.ts | 4 +- .../SKILL.md | 13 +- .../walkeros-understanding-mapping/SKILL.md | 18 +- website/docs/destinations/api/server.mdx | 26 ++ website/docs/destinations/code.mdx | 34 +- website/docs/destinations/server/gcp.mdx | 7 +- website/docs/mapping/rule.mdx | 14 +- .../web/browser/tagging/html-attributes.mdx | 5 +- website/docs/stores/cache.mdx | 8 + website/docs/stores/server/fs.mdx | 4 +- website/docs/stores/server/sheets.mdx | 1 + 33 files changed, 1448 insertions(+), 67 deletions(-) create mode 100644 .changeset/config-batch-enabler.md create mode 100644 .changeset/request-cache-serialization.md create mode 100644 apps/quickstart/src/__tests__/batch-all.test.ts create mode 100644 apps/quickstart/src/batch-all.ts create mode 100644 packages/collector/src/__tests__/cache-codec-roundtrip.test.ts create mode 100644 packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts diff --git a/.changeset/config-batch-enabler.md b/.changeset/config-batch-enabler.md new file mode 100644 index 000000000..495e71d86 --- /dev/null +++ b/.changeset/config-batch-enabler.md @@ -0,0 +1,13 @@ +--- +'@walkeros/collector': minor +--- + +Destinations now batch every event when you set `config.batch`, with no `'* *'` +wildcard mapping rule needed. A bare number sets the debounce wait; +`{ wait, size, age }` tunes the window. Rule-level `batch` still overrides per +event type, and pending batches now flush on shutdown. + +Migration: if you previously set `config.batch` alongside a single non-wildcard +rule `batch`, `config.batch` only capped that rule before; it now batches all of +the destination's events. To batch only specific events, drop `config.batch` and +set `batch` on those rules. diff --git a/.changeset/request-cache-serialization.md b/.changeset/request-cache-serialization.md new file mode 100644 index 000000000..d68387e68 --- /dev/null +++ b/.changeset/request-cache-serialization.md @@ -0,0 +1,10 @@ +--- +'@walkeros/core': patch +'@walkeros/server-store-fs': patch +--- + +Request caching now persists structured HTTP responses (including binary bodies) +to any store backend and honors TTL. Previously, caching a response to a +filesystem, S3, or GCS store could crash the process or never populate, and +entries never expired. Cached values now round-trip safely and expire correctly +instead of serving stale content after a redeploy. diff --git a/apps/quickstart/src/__tests__/batch-all.test.ts b/apps/quickstart/src/__tests__/batch-all.test.ts new file mode 100644 index 000000000..11c5f2ae7 --- /dev/null +++ b/apps/quickstart/src/__tests__/batch-all.test.ts @@ -0,0 +1,63 @@ +import type { SendWebOptions } from '@walkeros/web-core'; +import type { WalkerOS } from '@walkeros/core'; +import { startFlow } from '@walkeros/collector'; +import destinationAPI from '@walkeros/web-destination-api'; + +describe('walkerOS Batch-All Example', () => { + beforeEach(() => jest.useFakeTimers()); + afterEach(() => jest.useRealTimers()); + + test('batches every event into one delivery with config.batch', async () => { + const mockSendWeb = jest.fn( + (url: string, body: string, options: SendWebOptions) => { + // console.log('πŸ“‘ API batch:', { url, body: JSON.parse(body), options }); + }, + ); + + // config.batch enables batch-all: every event flows into one shared + // buffer, no mapping wildcard rule needed. + const { elb } = await startFlow({ + destinations: { + api: { + code: destinationAPI, + config: { + settings: { + url: 'https://analytics.example.com/events', + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + }, + batch: { + wait: 1000, + size: 50, + }, + }, + env: { + sendWeb: mockSendWeb, + }, + }, + }, + }); + + // Push two distinct events + await elb('page view'); + await elb('product add'); + + // Nothing delivered yet: the events sit in the shared batch buffer + expect(mockSendWeb).not.toHaveBeenCalled(); + + // Advance past the debounce window to flush the buffer + jest.advanceTimersByTime(1100); + + // ONE delivery carrying BOTH events as a single array + expect(mockSendWeb).toHaveBeenCalledTimes(1); + const [calledUrl, calledBody] = mockSendWeb.mock.calls[0]; + expect(calledUrl).toBe('https://analytics.example.com/events'); + + const payload = JSON.parse(calledBody) as WalkerOS.Event[]; + expect(payload).toHaveLength(2); + expect(payload.map((event) => event.name)).toEqual([ + 'page view', + 'product add', + ]); + }); +}); diff --git a/apps/quickstart/src/batch-all.ts b/apps/quickstart/src/batch-all.ts new file mode 100644 index 000000000..37cb052d7 --- /dev/null +++ b/apps/quickstart/src/batch-all.ts @@ -0,0 +1,51 @@ +import type { WalkerOS } from '@walkeros/core'; +import { startFlow } from '@walkeros/collector'; +import { destinationAPI } from '@walkeros/web-destination-api'; + +/** + * Batch every event with config.batch (no mapping rule needed) + * + * Setting config.batch on a destination batches ALL of its events into one + * shared buffer. There is no need for a `mapping: { '*': { '*': { batch } } }` + * wildcard rule. The buffer flushes after `wait` milliseconds of quiet or once + * it reaches `size` events, whichever comes first. A rule-level `batch` still + * splits that entity-action into its own buffer and overrides per field. + */ +export async function setupBatchAll(): Promise<{ + collector: unknown; + elb: WalkerOS.Elb; +}> { + const { collector, elb } = await startFlow({ + destinations: { + // API destination batches every event with no per-rule mapping + api: { + code: destinationAPI, + config: { + settings: { + url: 'https://analytics.example.com/events', + method: 'POST', + headers: { + 'Content-Type': 'application/json', + }, + }, + // Enables batch-all: one shared buffer for every event, + // flushed after 1s of quiet or once 50 events accumulate. + batch: { + wait: 1000, + size: 50, + }, + }, + env: { + // Mock sendWeb function + sendWeb: (url: unknown, body: unknown, options: unknown) => { + console.log('API batch:', { url, body, options }); + }, + }, + }, + }, + }); + + return { collector, elb }; +} + +export default setupBatchAll; diff --git a/packages/collector/src/__tests__/cache-codec-roundtrip.test.ts b/packages/collector/src/__tests__/cache-codec-roundtrip.test.ts new file mode 100644 index 000000000..7f52f51d3 --- /dev/null +++ b/packages/collector/src/__tests__/cache-codec-roundtrip.test.ts @@ -0,0 +1,316 @@ +import { startFlow } from '..'; +import { + buildCacheContext, + checkCache, + compileCache, + decodeCacheValue, + encodeCacheValue, + storeCache, +} from '@walkeros/core'; +import { createCacheStore } from '../cache-store'; +import type { Store, Transformer, WalkerOS } from '@walkeros/core'; + +/** + * Regression sweep for the cache-boundary serialization codec + * (`encodeCacheValue` / `decodeCacheValue` in `@walkeros/core`). + * + * The request-cache has three kinds of callers that store DIFFERENT + * payloads β€” all must survive a MISSβ†’HIT round-trip: + * - transformer.ts storeCache: a processed event object + * - destination.ts storeCache: a push result (`SendResponse` shape) or `true` + * - source.ts: RespondOptions + the `true` sentinel (covered elsewhere) + * + * These tests route the cache through a BYTE-ONLY store (mirroring the fs + * store's `set` guard: Buffer or string only) so a HIT is only possible if + * the codec genuinely serialized the value to bytes and decoded it back. + * A plain Map store would pass even without the codec, so it would not + * prove serialization. The `__cache` (in-memory, by-reference) path is + * exercised separately below. + */ + +interface ByteStoreCounters { + rejected: number; +} + +/** + * A store that persists ONLY Buffer or string values, like the production + * fs/s3/gcs stores. Any other `set` payload throws β€” this is what forces + * the cache codec to do real serialization for transformer/destination + * cache writes. + */ +function createByteStore( + data: Map, + counters: ByteStoreCounters, +): Store.Init { + return (context) => ({ + type: 'byte-only', + config: context.config as Store.Config, + get: async (key: string) => data.get(key), + set: async (key: string, value: unknown) => { + if (!Buffer.isBuffer(value) && typeof value !== 'string') { + counters.rejected++; + throw new Error( + `byte store persists Buffer or string values only, received ${typeof value}`, + ); + } + data.set(key, value); + }, + delete: async (key: string) => { + data.delete(key); + }, + }); +} + +describe('cache codec round-trip (request-cache callers)', () => { + // A. Transformer event-object round-trip through a byte-only store. + it('round-trips a processed event object through a byte-only cache store', async () => { + const data = new Map(); + const counters: ByteStoreCounters = { rejected: 0 }; + let enricherCalls = 0; + const destinationEvents: WalkerOS.DeepPartialEvent[] = []; + + // Representative processed event with nested + array + primitive shapes. + const enrichedEvent: WalkerOS.DeepPartialEvent = { + name: 'page view', + data: { id: '/x', count: 3, items: ['a', 'b'] }, + context: { stage: ['checkout', 1] }, + nested: [{ entity: 'product', data: { id: 'p1' } }], + }; + + const { elb } = await startFlow({ + stores: { + byteCache: { code: createByteStore(data, counters) }, + }, + transformers: { + enricher: { + code: async (ctx): Promise => ({ + type: 'enricher', + config: ctx.config, + push() { + enricherCalls++; + return { event: enrichedEvent }; + }, + }), + cache: { + store: 'byteCache', + rules: [{ key: ['event.name'], ttl: 60 }], + }, + }, + }, + destinations: { + spy: { + before: 'enricher', + code: { + type: 'spy', + config: {}, + push: async (event: WalkerOS.DeepPartialEvent) => { + destinationEvents.push(event); + }, + }, + }, + }, + }); + + // First push: MISS β€” enricher runs, processed event is encoded to bytes. + await elb({ name: 'page view', data: {} }); + expect(enricherCalls).toBe(1); + expect(counters.rejected).toBe(0); // codec produced a Buffer, store accepted it + expect(destinationEvents).toHaveLength(1); + + // The byte store now holds a Buffer (the encoded envelope), not a raw object. + const stored = [...data.values()][0]; + expect(Buffer.isBuffer(stored)).toBe(true); + + // Second push same name: HIT β€” enricher skipped, event decoded from bytes. + destinationEvents.length = 0; + await elb({ name: 'page view', data: {} }); + expect(enricherCalls).toBe(1); // cached, not re-run + + // Structural round-trip: the decoded event matches the original exactly. + expect(destinationEvents).toHaveLength(1); + expect(destinationEvents[0]).toEqual(enrichedEvent); + expect(destinationEvents[0].data).toEqual({ + id: '/x', + count: 3, + items: ['a', 'b'], + }); + expect(destinationEvents[0].context).toEqual({ stage: ['checkout', 1] }); + }); + + // B. Destination push-result round-trip through a byte-only store. + it('round-trips a destination push result (SendResponse) through a byte-only cache store', async () => { + const data = new Map(); + const counters: ByteStoreCounters = { rejected: 0 }; + let pushCount = 0; + + // Canonical SendResponse: { ok, data, error? }. `data` is JSON-safe + // (matches sendServer, which JSON.parses the response body or keeps the + // raw string), `error` is always a string. No Date/Error/function. + const pushResult = { ok: true, data: { id: 'resp-1', items: [1, 2, 3] } }; + + const { elb } = await startFlow({ + stores: { + byteCache: { code: createByteStore(data, counters) }, + }, + destinations: { + api: { + code: { + type: 'api', + config: {}, + push: async () => { + pushCount++; + return pushResult; + }, + }, + cache: { + store: 'byteCache', + rules: [{ key: ['event.name'], ttl: 60 }], + }, + }, + }, + }); + + // First push: MISS β€” destination push runs, result encoded to bytes. + await elb({ name: 'page view', data: {} }); + expect(pushCount).toBe(1); + expect(counters.rejected).toBe(0); // result serialized to a Buffer + + const stored = [...data.values()][0]; + expect(Buffer.isBuffer(stored)).toBe(true); + + // Second push: HIT β€” push skipped (deduplicated). The cached result is + // never re-emitted downstream, but it must DECODE without error so the + // HIT path resolves. We assert the stored bytes decode back to the + // original push result structurally. + await elb({ name: 'page view', data: {} }); + expect(pushCount).toBe(1); // deduplicated via cached result + }); + + // B (codec unit-level): prove a SendResponse round-trips byte-for-byte + // through the public codec, including a nested Buffer in `data`. + it('codec round-trips a SendResponse with a nested Buffer in data', () => { + const result = { + ok: false, + data: { raw: Buffer.from('body-bytes'), nested: { n: 1 } }, + error: '500 Internal Server Error', + }; + const encoded = encodeCacheValue(result, 60_000); + expect(Buffer.isBuffer(encoded)).toBe(true); + const decoded = decodeCacheValue(encoded); + expect(decoded).toBeDefined(); + if (!decoded || 'expired' in decoded) throw new Error('unexpected decode'); + expect(decoded.value).toEqual(result); + const value = decoded.value; + if ( + value === null || + typeof value !== 'object' || + !('data' in value) || + value.data === null || + typeof value.data !== 'object' || + !('raw' in value.data) + ) { + throw new Error('decoded shape mismatch'); + } + expect(Buffer.isBuffer(value.data.raw)).toBe(true); + }); + + // C. In-memory `__cache` (default) round-trips a Buffer body on HIT. + // The default store keeps the encoded Buffer by reference; `decodeCacheValue` + // inside `checkCache` restores the original Buffer body. This mirrors the + // source.ts caller, which caches RespondOptions whose `body` is a Buffer. + it('round-trips a RespondOptions Buffer body through the default in-memory __cache on HIT', async () => { + const store = createCacheStore(); + const compiled = compileCache({ + rules: [{ key: ['event.name'], ttl: 60 }], + }); + const context = buildCacheContext({}, { name: 'page view' }); + + // Cold: MISS β€” store does not yet hold the key. + const miss = await checkCache(compiled, store, context); + expect(miss?.status).toBe('MISS'); + if (!miss || miss.status !== 'MISS') throw new Error('expected MISS'); + + // Cache a RespondOptions-shaped value with a Buffer body (the source path). + const respondOptions = { + status: 200, + headers: { 'content-type': 'application/json' }, + body: Buffer.from('cached-body-bytes'), + }; + storeCache(store, miss.key, respondOptions, miss.rule.ttl); + + // The default __cache holds the encoded envelope Buffer by reference. + const rawStored = store.get(miss.key); + expect(Buffer.isBuffer(rawStored)).toBe(true); + + // Warm: HIT β€” checkCache decodes the envelope and restores the structure, + // including the Buffer body as a real Buffer (not a base64 string). + const hit = await checkCache(compiled, store, context); + expect(hit?.status).toBe('HIT'); + if (!hit || hit.status !== 'HIT') throw new Error('expected HIT'); + + const value = hit.value; + if ( + value === null || + typeof value !== 'object' || + !('body' in value) || + !('status' in value) + ) { + throw new Error('decoded RespondOptions shape mismatch'); + } + expect(value.status).toBe(200); + expect(Buffer.isBuffer(value.body)).toBe(true); + if (!Buffer.isBuffer(value.body)) throw new Error('body not a Buffer'); + expect(value.body.toString()).toBe('cached-body-bytes'); + expect(value).toEqual(respondOptions); + }); + + // E. State-to-byte-store limitation: `applyState` writes payloads DIRECTLY + // (no codec), so a `state[set]` of a plain object to a byte-only store + // throws at the store and the fail-open state handler swallows it. The + // object is NOT persisted. This documents that fs/s3/gcs persist + // Buffer/string only; structured state payloads need an object-capable + // store like the default in-memory `__cache`. + it('state[set] of a plain object to a byte-only store is rejected (codec bypassed)', async () => { + const data = new Map(); + const counters: ByteStoreCounters = { rejected: 0 }; + + const { elb } = await startFlow({ + stores: { + byteStore: { code: createByteStore(data, counters) }, + }, + transformers: { + writer: { + code: async (ctx): Promise => ({ + type: 'writer', + config: ctx.config, + push(event) { + return { event }; + }, + }), + // state[set] writes a plain OBJECT payload straight to the store. + state: [ + { + mode: 'set', + store: 'byteStore', + key: { value: 'profile' }, + value: { map: { id: 'event.data.id' } }, + }, + ], + }, + }, + destinations: { + spy: { + before: 'writer', + code: { type: 'spy', config: {}, push: async () => {} }, + }, + }, + }); + + await elb({ name: 'page view', data: { id: 'u-1' } }); + + // The byte store rejected the object payload (no codec on the state path) + // and nothing was persisted; the failure was fail-open (no throw to caller). + expect(counters.rejected).toBe(1); + expect(data.size).toBe(0); + }); +}); diff --git a/packages/collector/src/__tests__/destination.test.ts b/packages/collector/src/__tests__/destination.test.ts index d22aa5e1d..342ff8f3e 100644 --- a/packages/collector/src/__tests__/destination.test.ts +++ b/packages/collector/src/__tests__/destination.test.ts @@ -812,6 +812,185 @@ describe('Destination', () => { }); }); + describe('config.batch enables batch-all', () => { + beforeEach(() => jest.useFakeTimers()); + afterEach(() => jest.useRealTimers()); + + it('batches all events when only config.batch is set (no mapping rule)', async () => { + const captured: { events: WalkerOS.Events; key: string }[] = []; + const mockPushBatch = jest.fn((batch) => + captured.push({ events: [...batch.events], key: batch.key }), + ); + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + pushBatch: mockPushBatch, + config: { init: true, batch: { wait: 50, size: 20 } }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + + await elb('page view'); + await elb('product add'); + jest.advanceTimersByTime(100); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); + expect(captured[0].events).toHaveLength(2); + expect(mockPush).not.toHaveBeenCalled(); + }); + + it('keeps batching OFF when neither config.batch nor a rule batch is set', async () => { + const mockPushBatch = jest.fn(); + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + pushBatch: mockPushBatch, + config: { init: true }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + + await elb('page view'); + jest.advanceTimersByTime(100); + + expect(mockPush).toHaveBeenCalledTimes(1); + expect(mockPushBatch).not.toHaveBeenCalled(); + }); + + it('treats a bare-number config.batch as the wait window (legacy form)', async () => { + const mockPushBatch = jest.fn(); + const dest: Destination.Instance = { + pushBatch: mockPushBatch, + push: jest.fn(), + config: { init: true, batch: 50 }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); + expect(mockPushBatch).not.toHaveBeenCalled(); // debounce not yet fired + jest.advanceTimersByTime(60); + expect(mockPushBatch).toHaveBeenCalledTimes(1); + }); + + it('does not push or pushBatch when config.mock is set alongside config.batch', async () => { + const mockPushBatch = jest.fn(); + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + pushBatch: mockPushBatch, + config: { init: true, batch: { wait: 50 }, mock: { ok: true } }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); + jest.advanceTimersByTime(100); + expect(mockPush).not.toHaveBeenCalled(); + expect(mockPushBatch).not.toHaveBeenCalled(); + }); + + it('does not enqueue into a config.batch buffer when consent is denied', async () => { + const mockPushBatch = jest.fn(); + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + pushBatch: mockPushBatch, + config: { + init: true, + batch: { wait: 50 }, + consent: { marketing: true }, + }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); + jest.advanceTimersByTime(100); + expect(mockPush).not.toHaveBeenCalled(); + expect(mockPushBatch).not.toHaveBeenCalled(); + }); + }); + + describe('config.batch shared default buffer', () => { + beforeEach(() => jest.useFakeTimers()); + afterEach(() => jest.useRealTimers()); + + it('routes two data-only rules into one shared default buffer under config.batch', async () => { + const captured: { count: number; key: string }[] = []; + const mockPushBatch = jest.fn((batch) => + captured.push({ count: batch.events.length, key: batch.key }), + ); + const dest: Destination.Instance = { + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + batch: { wait: 50, size: 20 }, + mapping: { + page: { view: { data: { map: { foo: { value: 'bar' } } } } }, + product: { add: { data: { map: { foo: { value: 'baz' } } } } }, + }, + }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); + await elb('product add'); + jest.advanceTimersByTime(100); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); // ONE buffer, one flush + expect(captured[0].count).toBe(2); + }); + + it('passes rule: undefined to pushBatch for the config.batch default buffer', async () => { + let seenRule: unknown = 'unset'; + const dest: Destination.Instance = { + push: jest.fn(), + pushBatch: jest.fn((_batch, ctx) => { + seenRule = ctx.rule; + }), + config: { init: true, batch: { wait: 50 } }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); + jest.advanceTimersByTime(60); + expect(seenRule).toBeUndefined(); + }); + + it('does not collide config.batch default buffer with an explicit wildcard batch rule', async () => { + const keys: string[] = []; + const dest: Destination.Instance = { + push: jest.fn(), + pushBatch: jest.fn((batch) => keys.push(batch.key)), + config: { + init: true, + batch: { wait: 50 }, + mapping: { '*': { '*': { batch: { wait: 50, size: 1 } } } }, // own rule buffer + }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); // matches wildcard rule -> its own '* *' buffer, size 1 flush + jest.advanceTimersByTime(60); + expect(keys).toContain('* *'); + expect(keys).not.toContain(' batch-all'); + }); + }); + + describe('config.batch rule-level override', () => { + beforeEach(() => jest.useFakeTimers()); + afterEach(() => jest.useRealTimers()); + + it('rule-level batch overrides config.batch per field for its own buffer', async () => { + const flushes: { key: string; size: number }[] = []; + const dest: Destination.Instance = { + push: jest.fn(), + pushBatch: jest.fn((batch) => + flushes.push({ key: batch.key, size: batch.events.length }), + ), + config: { + init: true, + batch: { wait: 60_000, size: 50 }, // default: big wait, size 50 + mapping: { order: { complete: { batch: { size: 1 } } } }, // override size only + }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('order complete'); // own buffer, size 1 -> flush immediately, inherits wait from config + expect(flushes).toEqual([{ key: 'order complete', size: 1 }]); + }); + }); + describe('auto-generated id charset', () => { test('auto-generated destination id uses lowercase alpha charset (a-z, length 5)', async () => { const autoIdDestination: Destination.Instance = { diff --git a/packages/collector/src/__tests__/shutdown.test.ts b/packages/collector/src/__tests__/shutdown.test.ts index 28d027fce..684bcf1d9 100644 --- a/packages/collector/src/__tests__/shutdown.test.ts +++ b/packages/collector/src/__tests__/shutdown.test.ts @@ -105,6 +105,21 @@ describe('shutdown command', () => { await elb('walker shutdown'); }); + it('flushes pending destination batches on shutdown', async () => { + const mockPushBatch = jest.fn(); + const dest: Destination.Instance = { + push: jest.fn(), + pushBatch: mockPushBatch, + config: { init: true, batch: { wait: 60_000 } }, // wait long; only shutdown can flush + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + await elb('page view'); + expect(mockPushBatch).not.toHaveBeenCalled(); // still buffered + + await elb('walker shutdown'); + expect(mockPushBatch).toHaveBeenCalledTimes(1); + }); + it('respects shutdown order: sources before destinations before transformers', async () => { const order: string[] = []; const { collector, elb } = await startFlow({}); diff --git a/packages/collector/src/destination.ts b/packages/collector/src/destination.ts index 50fe0e5b4..af2557953 100644 --- a/packages/collector/src/destination.ts +++ b/packages/collector/src/destination.ts @@ -63,6 +63,16 @@ function isBatchedResult(value: unknown): boolean { return value === BATCHED_RESULT; } +/** + * Reserved buffer key for the destination-wide default batch created by + * `config.batch`. A produced mappingKey is `" "` with both + * segments guaranteed non-empty (getMappingEvent returns no key when either + * is empty, packages/core/src/mapping.ts:24-25), so no event-derived key can + * begin with a space. The leading space here therefore never collides with a + * rule buffer or the `'* *'` fallback. + */ +const BATCH_ALL_KEY = ' batch-all'; + /** * Normalize a batch config value to the canonical `{ wait?, size?, age? }` * shape. A bare number is treated as `wait` (legacy form). `undefined` @@ -938,10 +948,13 @@ export async function destinationPush( } const eventMapping = processed.mapping; - const mappingKey = processed.mappingKey || '* *'; + const ruleHasBatch = Boolean(eventMapping?.batch); + const mappingKey = ruleHasBatch + ? processed.mappingKey || '* *' + : BATCH_ALL_KEY; if ( - eventMapping?.batch && + (ruleHasBatch || config.batch) && destination.pushBatch && config.mock === undefined ) { @@ -960,7 +973,7 @@ export async function destinationPush( // Resolve scheduling options: mapping-level overrides destination-level. // Mapping rule `batch` (number form = wait; object = wait/size/age). // Destination config `batch` (number = wait; object = wait/size/age). - const ruleOpts = normalizeBatchOptions(eventMapping.batch); + const ruleOpts = normalizeBatchOptions(eventMapping?.batch); const destOpts = normalizeBatchOptions(config.batch); const wait = ruleOpts.wait ?? @@ -998,7 +1011,7 @@ export async function destinationPush( id: destId, config, data: undefined, - rule: rep.rule, + rule: batchState.isDefault ? undefined : rep.rule, ingest: rep.ingest!, env: { ...baseEnv, @@ -1115,6 +1128,7 @@ export async function destinationPush( destination.batches[mappingKey] = { batched, + isDefault: !ruleHasBatch, // created via config.batch, not a rule's own batch batchFn: () => { void scheduler(); }, diff --git a/packages/collector/src/shutdown.ts b/packages/collector/src/shutdown.ts index 955f1dea7..ac8a366bd 100644 --- a/packages/collector/src/shutdown.ts +++ b/packages/collector/src/shutdown.ts @@ -14,7 +14,8 @@ export async function destroyAllSteps( // Phase 1: Sources (stop intake) await destroyStepGroup(collector.sources, 'source', logger); - // Phase 2: Destinations (close connections) + // Phase 2: Destinations, flush pending batches first, then close connections. + await flushDestinationBatches(collector.destinations, logger); await destroyStepGroup(collector.destinations, 'destination', logger); // Phase 3: Transformers (release caches) @@ -24,6 +25,34 @@ export async function destroyAllSteps( await destroyStepGroup(collector.stores, 'store', logger); } +async function flushDestinationBatches( + destinations: Collector.Instance['destinations'], + rootLogger: Logger.Instance, +): Promise { + const promises = Object.entries(destinations).flatMap(([id, dest]) => { + const batches = dest.batches; + if (!batches) return []; + const logger = rootLogger.scope(dest.type || 'destination'); + return Object.values(batches).map(async (b) => { + try { + await Promise.race([ + b.flush(), + new Promise((_, reject) => + setTimeout( + () => + reject(new Error(`destination '${id}' batch flush timed out`)), + STEP_TIMEOUT, + ), + ), + ]); + } catch (err) { + logger.error(`destination '${id}' batch flush failed: ${err}`); + } + }); + }); + await Promise.allSettled(promises); +} + async function destroyStepGroup< T extends { config: unknown; env?: unknown; type?: string }, >( diff --git a/packages/core/src/__tests__/cache.test.ts b/packages/core/src/__tests__/cache.test.ts index 20057fc44..0e7763331 100644 --- a/packages/core/src/__tests__/cache.test.ts +++ b/packages/core/src/__tests__/cache.test.ts @@ -4,12 +4,15 @@ import { storeCache, applyUpdate, buildCacheContext, + encodeCacheValue, + decodeCacheValue, } from '../cache'; import { createMockCollector, createMockStore, createAsyncMockStore, } from './helpers/mocks'; +import type { Store } from '../types'; describe('compileCache', () => { it('compiles cache rules with matchers', () => { @@ -112,6 +115,9 @@ describe('checkCache', () => { }); it('returns HIT when store has entry', async () => { + // Covers decodeCacheValue's live-value tolerance branch (a store returning + // a non-encoded live value), NOT the production HIT path; that is covered + // by the byte-store round-trip tests in 'request-cache codec wiring'. const store = createMockStore(); store._data.set('express:GET:/api/data', { body: 'cached' }); @@ -209,6 +215,9 @@ describe('checkCache', () => { }); it('awaits async store.get for HIT path', async () => { + // Covers decodeCacheValue's live-value tolerance branch (a store returning + // a non-encoded live value), NOT the production HIT path; that is covered + // by the byte-store round-trip tests in 'request-cache codec wiring'. const store = createAsyncMockStore(); store._data.set('express:GET:/api/data', { body: 'cached' }); @@ -247,13 +256,18 @@ describe('checkCache', () => { }); describe('storeCache', () => { - it('stores value with TTL in milliseconds', () => { + it('stores an encoded Buffer with TTL in milliseconds', () => { const store = createMockStore(); const setSpy = jest.spyOn(store, 'set'); storeCache(store, 'mykey', { data: 'value' }, 300); - expect(setSpy).toHaveBeenCalledWith('mykey', { data: 'value' }, 300000); + expect(setSpy).toHaveBeenCalledTimes(1); + const [key, encoded, ttlMs] = setSpy.mock.calls[0]; + expect(key).toBe('mykey'); + expect(ttlMs).toBe(300000); + expect(Buffer.isBuffer(encoded)).toBe(true); + expect(decodeCacheValue(encoded)).toEqual({ value: { data: 'value' } }); }); }); @@ -360,3 +374,144 @@ describe('buildCacheContext', () => { expect('event' in ctx).toBe(false); }); }); + +describe('cache value codec', () => { + it('round-trips a RespondOptions object with a Buffer body', () => { + const value = { + status: 200, + headers: { 'Content-Type': 'application/javascript' }, + body: Buffer.from('var walkerOS = 1;'), + }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ value }); + }); + + it('round-trips the boolean sentinel', () => { + expect(decodeCacheValue(encodeCacheValue(true))).toEqual({ value: true }); + }); + + it('round-trips a processed event object (transformer cache payload)', () => { + const event = { + name: 'page view', + data: { id: '/x', count: 3 }, + nested: [], + }; + expect(decodeCacheValue(encodeCacheValue(event))).toEqual({ value: event }); + }); + + it('does not coerce a user object shaped like the marker', () => { + const value = { note: { __walkeros_cache__: 'not-a-real-tag', d: 'x' } }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ value }); + }); + + it('does not mistake a user object for the envelope', () => { + const value = { __walkeros_cache_v__: 'user-data', other: 1 }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ value }); + }); + + it('reports an expired entry as expired', () => { + expect(decodeCacheValue(encodeCacheValue('x', -1))).toEqual({ + expired: true, + }); + }); + + it('treats undefined (missing) as undefined', () => { + expect(decodeCacheValue(undefined)).toBeUndefined(); + }); + + it('rehydrates a Buffer nested inside an escaped marker object', () => { + const value = { + __walkeros_cache__: 'user-supplied', + items: [{ body: Buffer.from('payload') }], + }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ value }); + }); + + it('decodes corrupt/non-envelope bytes as a miss', () => { + expect(decodeCacheValue(Buffer.from([0x00, 0x01, 0x02]))).toBeUndefined(); + expect(decodeCacheValue(Buffer.from('not json'))).toBeUndefined(); + expect(decodeCacheValue(Buffer.from('{"x":1}'))).toBeUndefined(); + }); +}); + +describe('request-cache codec wiring', () => { + function byteStore(): Store.Instance { + const map = new Map(); + return { + type: 'mem-bytes', + config: { settings: {} }, + get: (k) => map.get(k), + set: (k, v) => { + if (!Buffer.isBuffer(v)) throw new Error('byte store needs a Buffer'); + map.set(k, v); + }, + delete: (k) => void map.delete(k), + }; + } + + it('round-trips a Buffer-bodied response through a byte store (MISS then HIT)', async () => { + const store = byteStore(); + const compiled = compileCache({ + stop: true, + store: 'cache', + rules: [{ key: ['ingest.method', 'ingest.path'], ttl: 3600 }], + }); + const ctx = { ingest: { method: 'GET', path: '/walker.js' } }; + + const miss = await checkCache(compiled, store, ctx); + expect(miss?.status).toBe('MISS'); + + const response = { + status: 200, + headers: { 'Content-Type': 'application/javascript' }, + body: Buffer.from('var walkerOS = 1;'), + }; + storeCache(store, miss!.key, response, miss!.rule.ttl); + const hit = await checkCache(compiled, store, ctx); + expect(hit?.status).toBe('HIT'); + expect(hit?.value).toEqual(response); + }); + + function asyncByteStore(): Store.Instance { + const map = new Map(); + return { + type: 'mem-bytes-async', + config: { settings: {} }, + get: async (k) => map.get(k), + set: async (k, v) => { + if (!Buffer.isBuffer(v)) throw new Error('byte store needs a Buffer'); + map.set(k, v); + }, + delete: async (k) => void map.delete(k), + }; + } + + it('decodes an encoded Buffer to a HIT through an async byte store', async () => { + const store = asyncByteStore(); + const compiled = compileCache({ + stop: true, + store: 'cache', + rules: [{ key: ['ingest.path'], ttl: 3600 }], + }); + const ctx = { ingest: { path: '/walker.js' } }; + const miss = await checkCache(compiled, store, ctx); + const response = { status: 200, headers: {}, body: Buffer.from('js') }; + await store.set(miss!.key, encodeCacheValue(response, 3600_000), 3600_000); + const hit = await checkCache(compiled, store, ctx); + expect(hit?.status).toBe('HIT'); + expect(hit?.value).toEqual(response); + }); + + it('expired entry reads as MISS and is purged', async () => { + const store = byteStore(); + const compiled = compileCache({ + stop: true, + store: 'cache', + rules: [{ key: ['ingest.path'], ttl: -1 }], + }); + const ctx = { ingest: { path: '/x' } }; + const m = await checkCache(compiled, store, ctx); + storeCache(store, m!.key, { status: 200 }, m!.rule.ttl); + const again = await checkCache(compiled, store, ctx); + expect(again?.status).toBe('MISS'); + }); +}); diff --git a/packages/core/src/cache.ts b/packages/core/src/cache.ts index 75cbe73b2..75fb2b9c7 100644 --- a/packages/core/src/cache.ts +++ b/packages/core/src/cache.ts @@ -80,13 +80,19 @@ export async function checkCache( // Promise`). Always await so a Promise return from an // async store (Redis, fs, the cache wrapper) never lands in the HIT // path as the cached "value". - const cached = await store.get(namespacedKey); + const decoded = decodeCacheValue(await store.get(namespacedKey)); - if (cached !== undefined) { - return { status: 'HIT', key: namespacedKey, value: cached, rule }; + if (decoded === undefined) + return { status: 'MISS', key: namespacedKey, rule }; + if ('expired' in decoded) { + try { + await store.delete(namespacedKey); + } catch { + /* best-effort purge; degrade to MISS rather than throw into the request */ + } + return { status: 'MISS', key: namespacedKey, rule }; } - - return { status: 'MISS', key: namespacedKey, rule }; + return { status: 'HIT', key: namespacedKey, value: decoded.value, rule }; } export function storeCache( @@ -95,7 +101,95 @@ export function storeCache( value: unknown, ttlSeconds: number, ): void { - store.set(key, value, ttlSeconds * 1000); + // exp inside the value = uniform expiry for stores that ignore the ttl arg (fs/s3/gcs); the ttl arg lets honoring stores (in-memory) evict instead of retaining until read. + store.set(key, encodeCacheValue(value, ttlSeconds * 1000), ttlSeconds * 1000); +} + +// --- Cache value codec ------------------------------------------------------- +// Serializes a cached value (HTTP RespondOptions, sentinels, events, results) +// to a Buffer so any byte/string store can persist it, and restores it on read. +// Buffer fields are tagged base64 behind a reserved marker; user objects that +// happen to carry the marker are escaped, so NO payload can be corrupted. An +// `exp` timestamp gives every backend uniform TTL even when the store ignores +// the ttl argument. Envelope keys are namespaced so a user object with a literal +// `__walkeros_cache_v__` key is never mistaken for an envelope. + +const CACHE_MARKER = '__walkeros_cache__'; +const ENVELOPE_VALUE = '__walkeros_cache_v__'; +const ENVELOPE_EXP = '__walkeros_cache_exp__'; + +function isRecord(value: unknown): value is Record { + return value !== null && typeof value === 'object' && !Array.isArray(value); +} + +// Precondition: cache values are acyclic JSON-shaped data (RespondOptions / +// events / push results / sentinel); Buffer is the only non-JSON leaf handled. +function toSerializable(value: unknown): unknown { + if (Buffer.isBuffer(value)) + return { [CACHE_MARKER]: 'buffer', d: value.toString('base64') }; + if (Array.isArray(value)) return value.map(toSerializable); + if (isRecord(value)) { + if (CACHE_MARKER in value) { + const inner: Record = {}; + for (const [k, v] of Object.entries(value)) inner[k] = toSerializable(v); + return { [CACHE_MARKER]: 'escape', d: inner }; + } + const out: Record = {}; + for (const [k, v] of Object.entries(value)) out[k] = toSerializable(v); + return out; + } + return value; +} + +function fromSerializable(value: unknown): unknown { + if (isRecord(value)) { + if (CACHE_MARKER in value) { + const tag = value[CACHE_MARKER]; + const data = value.d; + if (tag === 'buffer' && typeof data === 'string') + return Buffer.from(data, 'base64'); + if (tag === 'escape' && isRecord(data)) { + const out: Record = {}; + for (const [k, v] of Object.entries(data)) out[k] = fromSerializable(v); + return out; + } + } + const out: Record = {}; + for (const [k, v] of Object.entries(value)) out[k] = fromSerializable(v); + return out; + } + if (Array.isArray(value)) return value.map(fromSerializable); + return value; +} + +export function encodeCacheValue(value: unknown, ttlMs?: number): Buffer { + const envelope: Record = { + [ENVELOPE_VALUE]: toSerializable(value), + }; + if (ttlMs !== undefined) envelope[ENVELOPE_EXP] = Date.now() + ttlMs; + return Buffer.from(JSON.stringify(envelope)); +} + +export function decodeCacheValue( + raw: unknown, +): { value: unknown } | { expired: true } | undefined { + if (raw === undefined) return undefined; + const text = Buffer.isBuffer(raw) + ? raw.toString('utf8') + : typeof raw === 'string' + ? raw + : undefined; + if (text === undefined) return { value: raw }; + let parsed: unknown; + try { + parsed = JSON.parse(text); + } catch { + return undefined; + } + if (!isRecord(parsed) || !(ENVELOPE_VALUE in parsed)) return undefined; + const exp = parsed[ENVELOPE_EXP]; + if (typeof exp === 'number' && Date.now() > exp) return { expired: true }; + return { value: fromSerializable(parsed[ENVELOPE_VALUE]) }; } export async function applyUpdate( diff --git a/packages/core/src/schemas/destination.ts b/packages/core/src/schemas/destination.ts index 6d08f2aaa..e71b68f01 100644 --- a/packages/core/src/schemas/destination.ts +++ b/packages/core/src/schemas/destination.ts @@ -171,7 +171,7 @@ export const ConfigSchema = z ]) .optional() .describe( - 'Batch scheduling: bare number is the debounce wait window (legacy); object form supports wait (debounce ms), size (count cap, default 1000), age (max ms since first entry, default 30000).', + "Enables batching for all of this destination's events into one shared default buffer; a mapping rule's own batch splits that entity-action into its own buffer and overrides per field. Bare number is the debounce wait window; object form supports wait (debounce ms), size (count cap, default 1000), age (max ms since first entry, default 30000).", ), }) .meta({ diff --git a/packages/core/src/types/destination.ts b/packages/core/src/types/destination.ts index 2d7fbfe18..555f85710 100644 --- a/packages/core/src/types/destination.ts +++ b/packages/core/src/types/destination.ts @@ -140,9 +140,11 @@ export interface Config { */ dlqMax?: number; /** - * Per-destination batch upper bounds. Supplements the mapping-level - * `batch` setting on individual rules. A bare number is treated as - * the debounce `wait` window (legacy compat). + * Enables batching for ALL of this destination's events into one shared + * default buffer. A mapping rule's own `batch` splits that entity-action + * into its own buffer and overrides per field (`rule ?? config ?? default`). + * Batching stays off when neither is set. A bare number is treated as the + * debounce `wait` window. * * - `wait`: debounce window in ms; the timer resets on every push. * - `size`: hard count cap; flush immediately when entries reach this number. Default 1000 when batching is enabled. @@ -271,6 +273,12 @@ export interface Batch { export interface BatchRegistry { [mappingKey: string]: { batched: Batch; + /** + * Marks a buffer created by `config.batch` (the destination-wide default + * batch) rather than a mapping rule's own `batch`. The default buffer is + * heterogeneous, so its flush reports `rule: undefined` to consumers. + */ + isDefault?: boolean; batchFn: () => void; /** * Force a flush of the current batch immediately. Used by shutdown diff --git a/packages/core/src/types/mapping.ts b/packages/core/src/types/mapping.ts index 8fe47183f..5c94d2b0c 100644 --- a/packages/core/src/types/mapping.ts +++ b/packages/core/src/types/mapping.ts @@ -22,12 +22,11 @@ export interface Rules { export interface Rule { /** - * Batch scheduling for this event mapping. A bare number is the - * debounce `wait` window (legacy form). Object form supports `wait` - * (debounce ms), `size` (count cap), and `age` (max ms since first - * entry). Destination-level `config.batch` provides upper bounds - * applied at scheduler creation; mapping-level values override - * destination-level for matched events. + * Batch scheduling for this event mapping. When present, it splits this + * event type into its own buffer, overriding `config.batch` per field + * (`rule ?? config ?? default`). A bare number is the debounce `wait` + * window. Object form supports `wait` (debounce ms), `size` (count cap), + * and `age` (max ms since first entry). */ batch?: number | Destination.BatchOptions; condition?: Condition; // Added condition diff --git a/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts b/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts index 2b23c33fb..80d2d294b 100644 --- a/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts +++ b/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts @@ -21,6 +21,7 @@ import { } from '@walkeros/core'; import type { MockLogger } from '@walkeros/core'; import * as examples from '../examples'; +import { eventToRow } from '../eventToRow'; import { openWriter } from '../writer'; jest.mock('@google-cloud/bigquery'); @@ -330,6 +331,47 @@ describe('Server Destination BigQuery', () => { expect(rowsArg).toHaveLength(3); }); + test('uses entry data when present and eventToRow otherwise per entry', async () => { + if (!destination.pushBatch) throw new Error('pushBatch missing'); + + const config = await buildConfig(); + __resetMockCalls(); + + const e0 = createEvent(); + const e1 = createEvent(); + const mappedRow = { custom: 'mapped-row' }; + const events = [e0, e1]; + // Entry 1 carries mapped data, entry 0 does not. The derived `data` array + // is compacted (only defined entries), so `data[0]` is the mapped row and + // `data[1]` is undefined -- misaligned with `events`. Reading `entries` + // keeps each event's data correct. + const data = [mappedRow]; + const entries = [{ event: e0 }, { event: e1, data: mappedRow }]; + + destination.pushBatch( + { key: 'k', events, data, entries }, + createMockContext({ + config, + env: testEnv, + logger: createMockLogger(), + id: 'test-bq', + }), + ); + + await new Promise((r) => setImmediate(r)); + + const calls = __getMockCalls(); + const append = calls.find((c) => c.method === 'appendRows'); + expect(append).toBeDefined(); + if (!append) return; + const rowsArg = append.args[0]; + expect(Array.isArray(rowsArg)).toBe(true); + if (!Array.isArray(rowsArg)) return; + expect(rowsArg).toHaveLength(2); + expect(rowsArg[0]).toEqual(eventToRow(e0)); + expect(rowsArg[1]).toEqual(mappedRow); + }); + test('logs partial failures without throwing', async () => { if (!destination.pushBatch) throw new Error('pushBatch missing'); diff --git a/packages/server/destinations/gcp/src/bigquery/pushBatch.ts b/packages/server/destinations/gcp/src/bigquery/pushBatch.ts index b48e21f27..2db09e157 100644 --- a/packages/server/destinations/gcp/src/bigquery/pushBatch.ts +++ b/packages/server/destinations/gcp/src/bigquery/pushBatch.ts @@ -26,10 +26,9 @@ export const pushBatch: PushBatchFn = (batch, { config, logger }) => { return; } - const rows = batch.events.map((event, i) => { - const data = batch.data[i]; - return isObject(data) ? data : eventToRow(event); - }); + const rows = batch.entries.map((e) => + isObject(e.data) ? e.data : eventToRow(e.event), + ); if (rows.length === 0) return; diff --git a/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts b/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts new file mode 100644 index 000000000..decca3a22 --- /dev/null +++ b/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts @@ -0,0 +1,170 @@ +import { startFlow } from '@walkeros/collector'; +import { Source } from '@walkeros/core'; +import type { Destination, RespondFn } from '@walkeros/core'; +import type { Request, Response } from 'express'; +import { sourceExpress } from '../index'; +import type { Types as ExpressTypes } from '../types'; + +/** + * End-to-end proof for the request-cache round-trip at the express boundary. + * + * The production crash: a GET to a cached path (e.g. `/walker.js`) responds + * with a Buffer body. Storing that structured HTTP response + * (`{ body: Buffer, headers, status }`) to the byte-backed cache crashed; on + * the next request it must round-trip and be served from cache with the exact + * bytes, headers, and status preserved, plus `X-Cache: HIT` applied by the + * cache `update` rule. + * + * Harness mirrors `concurrent-requests.test.ts` (real `startFlow` + a live + * express source driven through plain-object mock req/res) combined with the + * `cache` + `update` config shape from the collector's + * `source-cache-integration.test.ts`. + */ +// Destination env carrying the per-scope `respond` the collector injects into +// each step. Typing it here lets the destination read `ctx.env.respond` +// without a cast. +type ResponderTypes = Destination.Types< + unknown, + unknown, + { respond?: RespondFn } +>; + +describe('Express source cache round-trip', () => { + it('stores a Buffer response on MISS and serves it on HIT with bytes, headers, status intact', async () => { + const ASSET_BODY = Buffer.from('/* walker.js bundle bytes */', 'utf8'); + const destinationCalls: string[] = []; + + // Destination responds with a Buffer body + Content-Type + status 200. + // The express GET default (transparent GIF) is idempotent, so this + // first-call wins and seeds the cache with a structured HTTP response. + const assetDestination: Destination.Instance = { + type: 'asset', + config: {}, + push: async (_event, ctx) => { + destinationCalls.push('asset'); + ctx.env?.respond?.({ + body: ASSET_BODY, + status: 200, + headers: { 'Content-Type': 'application/javascript' }, + }); + }, + }; + + const { collector } = await startFlow({ + consent: { functional: true }, + sources: { + express: { + code: sourceExpress, + // No port: drive the handler in-process via the source's `push`. + config: { + settings: { paths: ['/walker.js'] }, + ingest: { + map: { + method: { key: 'method' }, + path: { key: 'url' }, + }, + }, + }, + cache: { + stop: true, + rules: [ + { + match: { key: 'ingest.method', operator: 'eq', value: 'GET' }, + key: ['ingest.method', 'ingest.path'], + ttl: 300, + update: { + 'headers.X-Cache': { key: 'cache.status' }, + }, + }, + ], + }, + }, + }, + destinations: { + asset: { code: assetDestination }, + }, + }); + + const expressSource = Source.getSource(collector, 'express'); + + type Capture = { + status: number; + body: unknown; + headers: Record; + }; + + // `name=page view` gives the pushed event a valid name so it reaches the + // destination (the GET handler parses the query string into the event via + // requestToData). The full url is the cache key and is identical across + // both requests, so the second is a HIT. + const mockRequest = (): Request => + ({ + method: 'GET', + url: '/walker.js?name=page%20view', + headers: {}, + get: () => undefined, + // Express Request/Response are framework interfaces too large to mock + // structurally; this confined test-double cast is the sanctioned + // express-harness boundary (cf. concurrent-requests.test.ts). + }) as unknown as Request; + + const mockResponse = () => { + const captures: Capture[] = []; + let status = 200; + const headers: Record = {}; + const res = { + status: (code: number) => { + status = code; + return res; + }, + set: (key: string, value: string) => { + headers[key] = value; + return res; + }, + send: (body?: unknown) => { + captures.push({ status, body, headers: { ...headers } }); + return res; + }, + json: (body: unknown) => { + captures.push({ status, body, headers: { ...headers } }); + return res; + }, + }; + // Express Request/Response are framework interfaces too large to mock + // structurally; this confined test-double cast is the sanctioned + // express-harness boundary (cf. concurrent-requests.test.ts). + return { res: res as unknown as Response, captures }; + }; + + // Asserts the captured response is the cached asset: a Buffer body equal + // to the original bytes, status 200, Content-Type preserved. + const expectAsset = (capture: Capture, xCache: string) => { + expect(capture.status).toBe(200); + expect(Buffer.isBuffer(capture.body)).toBe(true); + if (!Buffer.isBuffer(capture.body)) + throw new Error('body was not a Buffer'); + expect(capture.body.equals(ASSET_BODY)).toBe(true); + expect(capture.headers['Content-Type']).toBe('application/javascript'); + expect(capture.headers['X-Cache']).toBe(xCache); + }; + + // First request: MISS β€” pipeline runs, destination responds with the + // Buffer asset, cache stores the structured response. + const first = mockResponse(); + await expressSource.push(mockRequest(), first.res); + + expect(destinationCalls).toEqual(['asset']); + expect(first.captures).toHaveLength(1); + expectAsset(first.captures[0], 'MISS'); + + // Second request: HIT β€” pipeline skipped, response served from cache. + // The Buffer body round-trips equal, headers + status preserved, X-Cache + // flipped to HIT, and the destination is NOT called again. + const second = mockResponse(); + await expressSource.push(mockRequest(), second.res); + + expect(destinationCalls).toEqual(['asset']); // destination not re-run + expect(second.captures).toHaveLength(1); + expectAsset(second.captures[0], 'HIT'); + }); +}); diff --git a/packages/server/stores/fs/src/__tests__/store.test.ts b/packages/server/stores/fs/src/__tests__/store.test.ts index bf47333e8..558cc6230 100644 --- a/packages/server/stores/fs/src/__tests__/store.test.ts +++ b/packages/server/stores/fs/src/__tests__/store.test.ts @@ -87,6 +87,32 @@ describe('FsStore', () => { }); }); + describe('raw byte persistence', () => { + it('writes a Buffer raw and get returns a Buffer with the same bytes', async () => { + const store = await createStore(tmpDir); + const bytes = Buffer.from('var x = 1;'); + await store.set('buf', bytes); + const result = await store.get('buf'); + if (!Buffer.isBuffer(result)) throw new Error('expected Buffer'); + expect(result.equals(bytes)).toBe(true); + }); + + it('writes a string raw and get returns a Buffer with the same bytes', async () => { + const store = await createStore(tmpDir); + await store.set('str', 'hello world'); + const result = await store.get('str'); + if (!Buffer.isBuffer(result)) throw new Error('expected Buffer'); + expect(result.toString()).toBe('hello world'); + }); + + it('rejects a non-Buffer/string value with a clear error', async () => { + const store = await createStore(tmpDir); + await expect(store.set('x', { a: 1 })).rejects.toThrow( + /Buffer or string/, + ); + }); + }); + describe('delete', () => { it('should remove a file', async () => { await fs.writeFile(path.join(tmpDir, 'remove.txt'), 'bye'); diff --git a/packages/server/stores/fs/src/store.ts b/packages/server/stores/fs/src/store.ts index bbbb2a76a..593e90332 100644 --- a/packages/server/stores/fs/src/store.ts +++ b/packages/server/stores/fs/src/store.ts @@ -29,7 +29,7 @@ export const storeFsInit: Store.Init> = ( type: 'fs', config: context.config as Store.Config>, - async get(key: string): Promise { + async get(key: string): Promise { const filePath = resolvePath(key); if (!filePath) return undefined; try { @@ -42,8 +42,13 @@ export const storeFsInit: Store.Init> = ( async set(key: string, value: unknown): Promise { const filePath = resolvePath(key); if (!filePath) return; + if (!Buffer.isBuffer(value) && typeof value !== 'string') { + throw new Error( + `fs store persists Buffer or string values only, received ${typeof value}`, + ); + } await fs.mkdir(path.dirname(filePath), { recursive: true }); - await fs.writeFile(filePath, value as Buffer); + await fs.writeFile(filePath, value); }, async delete(key: string): Promise { diff --git a/packages/server/stores/gcs/src/__tests__/store.test.ts b/packages/server/stores/gcs/src/__tests__/store.test.ts index 980d7eb10..713f46d07 100644 --- a/packages/server/stores/gcs/src/__tests__/store.test.ts +++ b/packages/server/stores/gcs/src/__tests__/store.test.ts @@ -189,6 +189,55 @@ describe('storeGcsInit', () => { }); }); + describe('codec round-trip', () => { + // Characterization test: locks in that an arbitrary Buffer (such as the + // base64-JSON payload the request cache codec produces) survives a + // set -> get cycle byte-for-byte through the GCS HTTP client. + it('round-trips an arbitrary Buffer byte-identically', async () => { + const original = Buffer.from( + JSON.stringify({ + __walkeros_cache_v__: { + status: 200, + body: { + __walkeros_cache__: 'buffer', + d: Buffer.from([0, 1, 2, 255, 254, 0x7f, 0x80]).toString( + 'base64', + ), + }, + }, + }), + ); + + // Capture the body bytes uploaded on set, then replay them as the GET + // response's arrayBuffer so the test mirrors a real backing store. + let storedBody: ArrayBuffer | undefined; + mockFetch.mockImplementation((_url: string, init?: RequestInit) => { + if (init?.method === 'POST' && init.body instanceof Uint8Array) { + // Copy into a fresh ArrayBuffer-backed view so the stored bytes are + // independent of the upload view's (possibly SharedArrayBuffer) + // backing store. + const view = init.body; + const copy = new ArrayBuffer(view.byteLength); + new Uint8Array(copy).set(view); + storedBody = copy; + return Promise.resolve({ ok: true }); + } + // GET (download) path + return Promise.resolve({ + ok: true, + arrayBuffer: () => Promise.resolve(storedBody ?? new ArrayBuffer(0)), + }); + }); + + const store = await createStore(); + await store.set('cache/entry', original); + const result = await store.get('cache/entry'); + + expect(result).toBeInstanceOf(Buffer); + expect((result as Buffer).equals(original)).toBe(true); + }); + }); + describe('delete', () => { it('should delete an object', async () => { mockFetch.mockResolvedValueOnce({ ok: true }); diff --git a/packages/server/stores/s3/src/__tests__/store.test.ts b/packages/server/stores/s3/src/__tests__/store.test.ts index ed5ab4bb5..6c33422ae 100644 --- a/packages/server/stores/s3/src/__tests__/store.test.ts +++ b/packages/server/stores/s3/src/__tests__/store.test.ts @@ -174,6 +174,49 @@ describe('storeS3Init', () => { }); }); + describe('codec round-trip', () => { + // Characterization test: locks in that an arbitrary Buffer (such as the + // base64-JSON payload the request cache codec produces) survives a + // set -> get cycle byte-for-byte through the s3mini client. + it('round-trips an arbitrary Buffer byte-identically', async () => { + const original = Buffer.from( + JSON.stringify({ + __walkeros_cache_v__: { + status: 200, + body: { + __walkeros_cache__: 'buffer', + d: Buffer.from([0, 1, 2, 255, 254, 0x7f, 0x80]).toString( + 'base64', + ), + }, + }, + }), + ); + + // Capture the bytes handed to the client on set, then replay them as + // the client's get response so the test mirrors a real backing store. + let stored: Buffer | undefined; + mockPutObject.mockImplementation(async (_key: string, value: Buffer) => { + stored = value; + return { ok: true }; + }); + mockGetObjectArrayBuffer.mockImplementation(async () => { + if (!stored) return null; + return stored.buffer.slice( + stored.byteOffset, + stored.byteOffset + stored.byteLength, + ); + }); + + const store = await createStore(); + await store.set('cache/entry', original); + const result = await store.get('cache/entry'); + + expect(result).toBeInstanceOf(Buffer); + expect((result as Buffer).equals(original)).toBe(true); + }); + }); + describe('delete', () => { it('should delete an object', async () => { mockDeleteObject.mockResolvedValue(true); diff --git a/packages/web/destinations/api/src/index.test.ts b/packages/web/destinations/api/src/index.test.ts index f42d3c568..828dca9cc 100644 --- a/packages/web/destinations/api/src/index.test.ts +++ b/packages/web/destinations/api/src/index.test.ts @@ -209,14 +209,43 @@ describe('Destination API', () => { expect(JSON.parse(calledData)).toEqual(events); }); - test('prefers batch.data over batch.events', () => { - const events = [getEvent('product view')]; - const mappedData = [{ custom: 'data1' }, { custom: 'data2' }]; + test('sends entry data when present, raw event otherwise', () => { + const e0 = getEvent('product view'); + const e1 = getEvent('product click'); + const mapped = { custom: 'data1' }; const batch = { key: 'product view', - events, - data: mappedData, - entries: events.map((event) => ({ event })), + events: [e0, e1], + data: [mapped], + entries: [{ event: e0, data: mapped }, { event: e1 }], + }; + + destination.pushBatch!( + batch, + createMockContext({ + config: { settings: { url } }, + env: testEnv, + logger: mockLogger, + id: 'test-api', + }), + ); + + expect(mockSendWeb).toHaveBeenCalledTimes(1); + const [, calledData] = mockSendWeb.mock.calls[0]; + expect(JSON.parse(calledData)).toEqual([mapped, e1]); + }); + + test('delivers every entry when some have mapped data and some do not', () => { + const e0 = getEvent('product view'); + const e1 = getEvent('product click'); + const mapped = { mapped: 1 }; + const batch = { + key: 'product view', + events: [e0, e1], // derived back-compat view + data: [mapped], // compacted, length 1 -- the bug condition + // Entry 1 has the mapped data; the compacted `data[0]` would otherwise + // cross-assign it to e0 and drop e1. + entries: [{ event: e0 }, { event: e1, data: mapped }], }; destination.pushBatch!( @@ -231,7 +260,7 @@ describe('Destination API', () => { expect(mockSendWeb).toHaveBeenCalledTimes(1); const [, calledData] = mockSendWeb.mock.calls[0]; - expect(JSON.parse(calledData)).toEqual(mappedData); + expect(JSON.parse(calledData)).toEqual([e0, mapped]); }); test('applies transform to each batch item', () => { diff --git a/packages/web/destinations/api/src/index.ts b/packages/web/destinations/api/src/index.ts index d3ea6d4ce..b856d9e5d 100644 --- a/packages/web/destinations/api/src/index.ts +++ b/packages/web/destinations/api/src/index.ts @@ -52,7 +52,9 @@ export const destinationAPI: Destination = { return; } - const items = batch.data.length ? batch.data : batch.events; + const items = batch.entries.map((e) => + isDefined(e.data) ? e.data : e.event, + ); // Apply transform to each item if defined, then stringify array const payload = transform diff --git a/skills/walkeros-understanding-destinations/SKILL.md b/skills/walkeros-understanding-destinations/SKILL.md index 73ef00fb8..58ecb1c66 100644 --- a/skills/walkeros-understanding-destinations/SKILL.md +++ b/skills/walkeros-understanding-destinations/SKILL.md @@ -107,10 +107,10 @@ evictions) and `?.dlq` (DLQ evictions). Build the key with `stepId()` from ## Batch scheduling -When a mapping rule sets `batch` and the destination implements `pushBatch`, -events are buffered and delivered in groups. The configuration shape is -`batch?: number | { wait?, size?, age? }` at both the mapping-rule layer and the -destination-config layer. +Set `config.batch` on a destination (with `pushBatch` implemented) to batch +**all** of its events into one shared buffer. No `'* *'` wildcard mapping rule +is needed. The configuration shape is `batch?: number | { wait?, size?, age? }` +at both the destination-config layer and the mapping-rule layer. - `wait` (ms): debounce window. The timer resets on every push. Legacy form `batch: 1000` is shorthand for `{ wait: 1000 }`. @@ -118,7 +118,10 @@ destination-config layer. - `age` (ms): hard age cap since the first entry of the current window. Default `30000`. Prevents debounce starvation under sustained load. -Mapping-level values override destination-level for matched events. +A mapping rule's own `batch` splits that entity-action into its own buffer and +overrides `config.batch` per field (`rule ?? config ?? default`). To batch only +specific events, omit `config.batch` and set `batch` on those rules. Pending +batches flush on shutdown. ### Per-event metadata diff --git a/skills/walkeros-understanding-mapping/SKILL.md b/skills/walkeros-understanding-mapping/SKILL.md index a2b606318..d8c21da09 100644 --- a/skills/walkeros-understanding-mapping/SKILL.md +++ b/skills/walkeros-understanding-mapping/SKILL.md @@ -338,11 +338,23 @@ mapping: { ### Batch Processing +To batch **all** of a destination's events, set `config.batch` (no mapping rule +needed): + +```typescript +config: { + batch: { size: 5 }, // Flush after every 5 events (a bare number is the wait window in ms) +} +``` + +A rule-level `batch` batches only that entity-action into its own buffer and +overrides `config.batch` per field: + ```typescript mapping: { - '*': { - '*': { - batch: 5, // Send after 5 events + order: { + complete: { + batch: { wait: 1000 }, // Only order complete batches, with a 1s debounce } } } diff --git a/website/docs/destinations/api/server.mdx b/website/docs/destinations/api/server.mdx index 0cd38dc54..2d9b7b9cd 100644 --- a/website/docs/destinations/api/server.mdx +++ b/website/docs/destinations/api/server.mdx @@ -117,6 +117,32 @@ await startFlow({ language="typescript" /> +### With batching + +Set `config.batch` to send every event in one shared batch instead of one +request per event. A bare number is the debounce `wait` window; an object tunes +`wait`, `size`, and `age`. No `'* *'` wildcard mapping rule is needed. + + + ## Use cases ### Webhook integration diff --git a/website/docs/destinations/code.mdx b/website/docs/destinations/code.mdx index d528c6a31..c6693924b 100644 --- a/website/docs/destinations/code.mdx +++ b/website/docs/destinations/code.mdx @@ -137,9 +137,9 @@ Each code string has access to specific variables: #### Batch scheduling -A destination can cap how large or how old a batch may grow before it is -flushed. Configure via `config.batch` (bare number = legacy debounce -`wait` window): +Set `config.batch` to batch **all** of a destination's events into one shared +buffer. No `'* *'` wildcard mapping rule is needed. A bare number is the +debounce `wait` window; an object tunes `wait`, `size`, and `age`: ```ts { @@ -160,6 +160,26 @@ Without `size`/`age` the batch can grow unbounded under sustained load (the debounce timer keeps resetting). Defaults are conservative; raise them only when you understand your destination's batch-size limits. +**Batch all vs. batch selectively.** With `config.batch` set, every event of +the destination joins the shared default buffer, including events matched by +data-only mapping rules. A mapping rule's own `batch` splits that entity-action +into its own buffer and overrides `config.batch` per field +(`rule ?? config ?? default`). To batch only specific events, omit +`config.batch` and set `batch` on those rules: + +```ts +{ + config: { + // no config.batch: batching stays off for everything else + mapping: { + order: { complete: { batch: { wait: 1000, size: 100 } } }, + }, + }, +} +``` + +Pending batches flush automatically on shutdown, so buffered events are not lost. + #### Failure handling If `pushBatch` throws (or returns a rejected Promise), the entire batch @@ -320,13 +340,7 @@ Use mapping to override the default push code for specific events: }) \`, }, - mapping: { - '*': { - '*': { - batch: 1000, // Batch events with 1 second debounce - }, - }, - }, + batch: 1000, // Batch all events with 1 second debounce }, }, }, diff --git a/website/docs/destinations/server/gcp.mdx b/website/docs/destinations/server/gcp.mdx index cd3872cf8..ff9c8d712 100644 --- a/website/docs/destinations/server/gcp.mdx +++ b/website/docs/destinations/server/gcp.mdx @@ -240,9 +240,10 @@ ingestion. This replaces the legacy `tabledata.insertAll` path. - **Cost**: $25/TB after the 2 TiB/month free tier (vs ~$50/TB for the legacy path). Most low-volume deployments fit entirely in the free tier. -- **Batching**: `pushBatch` is implemented. Set the collector's `batch: ` - mapping setting to flush all events in a window as a single `appendRows` - call. +- **Batching**: `pushBatch` is implemented. Set `config.batch` on the + destination (a bare number is the debounce `wait` in ms) to flush all events + in a window as a single `appendRows` call. No `'* *'` wildcard mapping rule is + needed. :::caution EXPERIMENTAL SDK The upstream `@google-cloud/bigquery-storage` package self-marks as diff --git a/website/docs/mapping/rule.mdx b/website/docs/mapping/rule.mdx index 9d54f9ea5..e1c3f6852 100644 --- a/website/docs/mapping/rule.mdx +++ b/website/docs/mapping/rule.mdx @@ -84,12 +84,14 @@ object `{ wait?, size?, age? }` for finer control: - `size`: hard count cap. Flushes immediately at this many events. - `age` (ms): hard age cap since the first entry of the current window. -Per-destination upper bounds (`config.batch` on the destination) apply -when the mapping rule omits a field; mapping-level values override -destination-level for matched events. Defaults when batching is enabled: -`size: 1000`, `age: 30000`. `wait` is a hint, not a hard delay: when -events keep arriving faster than `wait`, the `size` and `age` caps are -what force the flush. +A rule-level `batch` splits this entity-action into its own buffer. When the +destination also sets `config.batch`, the rule's values override +destination-level per field (`rule ?? config ?? default`); fields the rule omits +fall back to `config.batch`. Setting `config.batch` alone batches **all** of the +destination's events into one shared buffer with no rule needed. Defaults when +batching is enabled: `size: 1000`, `age: 30000`. `wait` is a hint, not a hard +delay: when events keep arriving faster than `wait`, the `size` and `age` caps +are what force the flush. ## Conditional rules diff --git a/website/docs/sources/web/browser/tagging/html-attributes.mdx b/website/docs/sources/web/browser/tagging/html-attributes.mdx index 023f0d677..f2780eb3d 100644 --- a/website/docs/sources/web/browser/tagging/html-attributes.mdx +++ b/website/docs/sources/web/browser/tagging/html-attributes.mdx @@ -254,8 +254,9 @@ parent element per ID. :::note -`data-elb`, `data-elbaction`, `data-elbcontext`, `data-elbglobals`, and -`data-elblink` are reserved attributes, whereas `data-elb-*` attributes may be +`data-elb`, `data-elbaction`, `data-elbactions`, `data-elbcontext`, +`data-elbglobals`, and `data-elblink` are reserved attributes, whereas +`data-elb-*` attributes may be arbitrary combinations based on the related entity name. `data-elb_` is a reserved path-scoped generic that uses the same value syntax as `data-elb-` but only reaches triggers nested below it. Actions and properties can be set diff --git a/website/docs/stores/cache.mdx b/website/docs/stores/cache.mdx index 7a95b9929..59001100b 100644 --- a/website/docs/stores/cache.mdx +++ b/website/docs/stores/cache.mdx @@ -14,6 +14,14 @@ The wrapping is transparent: a transformer wired to `$store.crm` does not know w Store-level cache is different from the [event-level Cache](../collector/cache.mdx) configured on sources, transformers, and destinations. Event cache short-circuits the pipeline on hit. Store cache memoizes `get`/`set`/`delete` calls on a single store. Both can be used in the same flow. ::: +## Request-cache values and TTL + +Request-cache values are serialized by the cache layer before they reach the backing store, so any store backend (filesystem, S3, GCS, in-memory) can hold structured HTTP responses, not just the in-memory tier. Cached entries honor their TTL: an expired entry is treated as a miss and re-fetched from the backing. + +Cached values must be JSON-safe plus `Buffer` (a `Date` becomes a string, an `Error` loses its fields), which is fine for HTTP responses, events, and push results. + +The filesystem, S3, and GCS stores persist `Buffer` or `string` values only, so structured `state` payloads (which bypass the cache layer) need an object-capable store such as the default in-memory cache. + ## Minimal example Enable the built-in in-memory tier (`__cache`) on any store by setting `cache` on the store declaration. No extra store needed: diff --git a/website/docs/stores/server/fs.mdx b/website/docs/stores/server/fs.mdx index 3aa6146c1..85888ce44 100644 --- a/website/docs/stores/server/fs.mdx +++ b/website/docs/stores/server/fs.mdx @@ -76,7 +76,9 @@ await store.delete('old-file.txt'); // void`} language="typescript" /> -`get()` returns `Buffer` for compatibility with the file transformer, which uses `content instanceof Buffer` for Content-Length calculation. +`get()` returns `Buffer` for raw files, for compatibility with the file transformer, which uses `content instanceof Buffer` for Content-Length calculation. + +The fs store is a byte/string store. `get()` returns a `Buffer`; `set()` accepts a `Buffer` or `string` and writes it raw, and rejects other value types with a clear error. Served assets stay intact byte-for-byte. Structured request-cache values are serialized by the cache layer in `@walkeros/core` before they reach the store, so the store never sees a non-`Buffer`, non-`string` value when backing a request cache. ## File serving pattern diff --git a/website/docs/stores/server/sheets.mdx b/website/docs/stores/server/sheets.mdx index 6a8415bb6..338d17b1c 100644 --- a/website/docs/stores/server/sheets.mdx +++ b/website/docs/stores/server/sheets.mdx @@ -186,3 +186,4 @@ Each value is JSON-stringified into one cell (the `value` column). Reads JSON-pa - **Single-cell value shape.** Multi-column structured rows are out of scope, ship a richer schema in a later phase if customers ask. - **No drift detection on header content.** If an operator manually edits row 1, the next `walkeros setup store.` overwrites it without warning. - **No transactional updates.** `set` is two HTTP calls (read index, write cell). Concurrent writers can interleave. +- **Not a request-cache backend.** The Sheets store is a key-value state store, not a supported request-cache backend. Cache large or binary HTTP responses in a filesystem, S3, or GCS store instead. From 76d32c1124d9f0a6ad6529defdf903949fb95873 Mon Sep 17 00:00:00 2001 From: alexanderkirtzel Date: Thu, 4 Jun 2026 08:38:00 +0200 Subject: [PATCH 03/10] batch dlq --- .changeset/batch-outcome-shutdown-fixes.md | 12 ++ packages/cli/openapi/spec.json | 31 +++- packages/cli/src/types/api.gen.d.ts | 19 +- .../destination-batch-correctness.test.ts | 106 ++++++++++++ .../src/__tests__/destination.test.ts | 162 +++++++++++++++--- .../collector/src/__tests__/shutdown.test.ts | 121 ++++++++++++- packages/collector/src/destination.ts | 5 +- packages/collector/src/shutdown.ts | 22 ++- packages/collector/src/types/code.ts | 2 +- packages/core/src/cache.ts | 3 + packages/core/src/types/destination.ts | 2 +- .../@google-cloud/bigquery-storage.ts | 16 ++ .../__tests__/bigquery-storage-mock.d.ts | 1 + .../gcp/src/bigquery/__tests__/index.test.ts | 104 ++++++++--- .../gcp/src/bigquery/pushBatch.ts | 103 ++++++----- website/docs/destinations/code.mdx | 5 +- website/docs/destinations/server/gcp.mdx | 4 +- 17 files changed, 600 insertions(+), 118 deletions(-) create mode 100644 .changeset/batch-outcome-shutdown-fixes.md diff --git a/.changeset/batch-outcome-shutdown-fixes.md b/.changeset/batch-outcome-shutdown-fixes.md new file mode 100644 index 000000000..9f1887276 --- /dev/null +++ b/.changeset/batch-outcome-shutdown-fixes.md @@ -0,0 +1,12 @@ +--- +'@walkeros/core': patch +'@walkeros/collector': patch +'@walkeros/server-destination-gcp': patch +--- + +Batched destination delivery now reports failures. A batch push that fails +(including BigQuery row errors) is routed to the dead-letter buffer and counted +as failed instead of being silently dropped, and graceful shutdown waits for +in-flight batches to finish. Also fixes a shutdown timer that could delay +process exit, and makes a zero millisecond batch wait (`batch: 0`) correctly +enable batching. diff --git a/packages/cli/openapi/spec.json b/packages/cli/openapi/spec.json index 547e38463..6bf9974f8 100644 --- a/packages/cli/openapi/spec.json +++ b/packages/cli/openapi/spec.json @@ -1959,6 +1959,29 @@ }, "required": ["name"] }, + "ProjectDetailResponse": { + "type": "object", + "properties": { + "id": { + "type": "string", + "pattern": "^proj_[a-zA-Z0-9_-]+$", + "example": "proj_x7y8z9" + }, + "name": { + "type": "string" + }, + "siteUrl": { + "type": ["string", "null"], + "format": "uri", + "example": "https://example.com" + }, + "role": { + "type": "string", + "enum": ["owner", "admin", "member", "deployer", "viewer"] + } + }, + "required": ["id", "name", "role"] + }, "UpdateProjectRequest": { "type": "object", "properties": { @@ -1966,6 +1989,12 @@ "type": "string", "minLength": 1, "maxLength": 255 + }, + "siteUrl": { + "type": ["string", "null"], + "maxLength": 2048, + "format": "uri", + "example": "https://example.com" } } }, @@ -2997,7 +3026,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Project" + "$ref": "#/components/schemas/ProjectDetailResponse" } } } diff --git a/packages/cli/src/types/api.gen.d.ts b/packages/cli/src/types/api.gen.d.ts index 2bf228ba1..ca549fc8c 100644 --- a/packages/cli/src/types/api.gen.d.ts +++ b/packages/cli/src/types/api.gen.d.ts @@ -573,7 +573,7 @@ export interface paths { [name: string]: unknown; }; content: { - 'application/json': components['schemas']['Project']; + 'application/json': components['schemas']['ProjectDetailResponse']; }; }; /** @description Unauthorized */ @@ -4584,8 +4584,25 @@ export interface components { CreateProjectRequest: { name: string; }; + ProjectDetailResponse: { + /** @example proj_x7y8z9 */ + id: string; + name: string; + /** + * Format: uri + * @example https://example.com + */ + siteUrl?: string | null; + /** @enum {string} */ + role: 'owner' | 'admin' | 'member' | 'deployer' | 'viewer'; + }; UpdateProjectRequest: { name?: string; + /** + * Format: uri + * @example https://example.com + */ + siteUrl?: string | null; }; ListMembersResponse: { members: components['schemas']['Member'][]; diff --git a/packages/collector/src/__tests__/destination-batch-correctness.test.ts b/packages/collector/src/__tests__/destination-batch-correctness.test.ts index aec9ee1d1..9cac16da4 100644 --- a/packages/collector/src/__tests__/destination-batch-correctness.test.ts +++ b/packages/collector/src/__tests__/destination-batch-correctness.test.ts @@ -223,6 +223,112 @@ describe('Destination batch correctness (PROD-004)', () => { expect(recorded[0]).toBe(5); }); + it('shutdown waits for an in-flight async batch append', async () => { + let resolveAppend: (() => void) | undefined; + let settled = false; + const mockPushBatch = jest.fn( + () => + new Promise((resolve) => { + resolveAppend = () => { + settled = true; + resolve(); + }; + }), + ); + + const dest: Destination.Instance = { + type: 'async-batch', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + mapping: { '*': { '*': { batch: 60_000 } } }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 3; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + // Start the flush but do not await it yet: the async pushBatch is in-flight. + let flushResolved = false; + const flushPromise = collector.destinations['d'] + .batches!['* *'].flush() + .then(() => { + flushResolved = true; + }); + + // Let the pushBatch call begin and the await chain reach the pending promise. + await jest.advanceTimersByTimeAsync(0); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); + // flush() must not resolve while the append promise is still pending. + expect(settled).toBe(false); + expect(flushResolved).toBe(false); + + // Resolve the in-flight append; now flush() should settle. + resolveAppend!(); + await flushPromise; + + expect(settled).toBe(true); + expect(flushResolved).toBe(true); + }); + + it('a rejected async pushBatch routes the batch to the DLQ and increments failed, not out', async () => { + const mockPushBatch = jest.fn(() => + Promise.reject(new Error('async batch boom')), + ); + + const dest: Destination.Instance = { + type: 'async-rejecting', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + mapping: { '*': { '*': { batch: 1 } } }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 5; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + await collector.destinations['d'].batches!['* *'].flush(); + + // Whole batch routed to DLQ as [event, error]. + const dlq = collector.destinations['d'].dlq; + expect(dlq).toBeDefined(); + expect(dlq!.length).toBe(5); + expect(dlq![0][0].id).toBe('e-0'); + const firstError = dlq![0][1]; + expect(firstError).toBeInstanceOf(Error); + if (!(firstError instanceof Error)) throw new Error('expected Error'); + expect(firstError.message).toBe('async batch boom'); + + const destStatus = collector.status.destinations['d']; + // failed incremented by full batch size; success counters untouched. + expect(destStatus.failed).toBe(5); + expect(destStatus.count).toBe(0); + }); + it('routes batch failure to DLQ without leaking unhandled rejections', async () => { const unhandled: unknown[] = []; const onUnhandled = (err: unknown): void => { diff --git a/packages/collector/src/__tests__/destination.test.ts b/packages/collector/src/__tests__/destination.test.ts index 342ff8f3e..c5266d50e 100644 --- a/packages/collector/src/__tests__/destination.test.ts +++ b/packages/collector/src/__tests__/destination.test.ts @@ -818,9 +818,9 @@ describe('Destination', () => { it('batches all events when only config.batch is set (no mapping rule)', async () => { const captured: { events: WalkerOS.Events; key: string }[] = []; - const mockPushBatch = jest.fn((batch) => - captured.push({ events: [...batch.events], key: batch.key }), - ); + const mockPushBatch = jest.fn((batch) => { + captured.push({ events: [...batch.events], key: batch.key }); + }); const mockPush = jest.fn(); const dest: Destination.Instance = { push: mockPush, @@ -884,8 +884,11 @@ describe('Destination', () => { expect(mockPushBatch).not.toHaveBeenCalled(); }); - it('does not enqueue into a config.batch buffer when consent is denied', async () => { - const mockPushBatch = jest.fn(); + it('gates a denied event into queuePush, then replays it through pushBatch on consent grant', async () => { + const captured: { events: WalkerOS.Events; key: string }[] = []; + const mockPushBatch = jest.fn((batch) => { + captured.push({ events: [...batch.events], key: batch.key }); + }); const mockPush = jest.fn(); const dest: Destination.Instance = { push: mockPush, @@ -896,11 +899,111 @@ describe('Destination', () => { consent: { marketing: true }, }, }; - const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + const { collector, elb } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + // Consent is denied (no marketing): event must be gated, not dropped. await elb('page view'); jest.advanceTimersByTime(100); expect(mockPush).not.toHaveBeenCalled(); expect(mockPushBatch).not.toHaveBeenCalled(); + + // The denied event must be parked in the destination's consent-denied + // queue (queuePush on the live instance), proving gate-not-drop. + const live = collector.destinations.d; + expect(live.queuePush).toHaveLength(1); + expect(live.queuePush?.[0].name).toBe('page view'); + + // Grant consent: the gated event must replay through the batch buffer. + await elb('walker consent', { marketing: true }); + jest.advanceTimersByTime(100); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); + expect(captured[0].events).toHaveLength(1); + expect(captured[0].events[0].name).toBe('page view'); + expect(captured[0].key).toBe(' batch-all'); + expect(mockPush).not.toHaveBeenCalled(); + // Queue drained after replay. + expect(collector.destinations.d.queuePush).toHaveLength(0); + }); + }); + + describe('batch: 0 is a real wait value, not "disabled"', () => { + beforeEach(() => jest.useFakeTimers()); + afterEach(() => jest.useRealTimers()); + + it('routes a batch: 0 rule to its own buffer, not the default', async () => { + const captured: { key: string; rule: unknown }[] = []; + const mockPushBatch = jest.fn((batch, ctx) => { + captured.push({ key: batch.key, rule: ctx.rule }); + }); + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + pushBatch: mockPushBatch, + config: { + init: true, + mapping: { + page: { + view: { batch: 0 }, + }, + }, + }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + + await elb('page view'); + // wait 0 still needs a tick to flush the debounce. + jest.advanceTimersByTime(1); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); + expect(captured[0].key).toBe('page view'); // rule's own key, not ' batch-all' + expect(captured[0].key).not.toBe(' batch-all'); + expect(captured[0].rule).toBeDefined(); // rule batch -> rule passed through + expect(mockPush).not.toHaveBeenCalled(); + }); + + it('enables batching when config.batch is 0', async () => { + const captured: { key: string }[] = []; + const mockPushBatch = jest.fn((batch) => { + captured.push({ key: batch.key }); + }); + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + pushBatch: mockPushBatch, + config: { init: true, batch: 0 }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + + await elb('page view'); + jest.advanceTimersByTime(1); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); + expect(captured[0].key).toBe(' batch-all'); // default buffer + expect(mockPush).not.toHaveBeenCalled(); + }); + + it('batch: 0 with no pushBatch still falls back to synchronous push', async () => { + const mockPush = jest.fn(); + const dest: Destination.Instance = { + push: mockPush, + config: { + init: true, + mapping: { + page: { + view: { batch: 0 }, + }, + }, + }, + }; + const { elb } = await startFlow({ destinations: { d: { code: dest } } }); + + await elb('page view'); + jest.advanceTimersByTime(1); + + expect(mockPush).toHaveBeenCalledTimes(1); }); }); @@ -910,9 +1013,9 @@ describe('Destination', () => { it('routes two data-only rules into one shared default buffer under config.batch', async () => { const captured: { count: number; key: string }[] = []; - const mockPushBatch = jest.fn((batch) => - captured.push({ count: batch.events.length, key: batch.key }), - ); + const mockPushBatch = jest.fn((batch) => { + captured.push({ count: batch.events.length, key: batch.key }); + }); const dest: Destination.Instance = { push: jest.fn(), pushBatch: mockPushBatch, @@ -949,22 +1052,39 @@ describe('Destination', () => { expect(seenRule).toBeUndefined(); }); - it('does not collide config.batch default buffer with an explicit wildcard batch rule', async () => { - const keys: string[] = []; + it('keeps the config.batch default buffer separate from a rule-owned buffer', async () => { + const flushes: { key: string; names: string[] }[] = []; const dest: Destination.Instance = { push: jest.fn(), - pushBatch: jest.fn((batch) => keys.push(batch.key)), + pushBatch: jest.fn((batch) => { + flushes.push({ + key: batch.key, + names: batch.events.map((e: WalkerOS.Event) => e.name), + }); + }), config: { init: true, - batch: { wait: 50 }, - mapping: { '*': { '*': { batch: { wait: 50, size: 1 } } } }, // own rule buffer + batch: { wait: 50, size: 20 }, // default buffer for unmapped events + // Non-wildcard rule with its own batch -> its own buffer. + mapping: { order: { complete: { batch: { wait: 50, size: 1 } } } }, }, }; const { elb } = await startFlow({ destinations: { d: { code: dest } } }); - await elb('page view'); // matches wildcard rule -> its own '* *' buffer, size 1 flush - jest.advanceTimersByTime(60); - expect(keys).toContain('* *'); - expect(keys).not.toContain(' batch-all'); + await elb('order complete'); // matches rule -> 'order complete' buffer + await elb('page view'); // unmapped -> config.batch default ' batch-all' buffer + jest.advanceTimersByTime(100); + + // Both buffers flush, with distinct keys. + const keys = flushes.map((f) => f.key); + expect(keys).toContain('order complete'); + expect(keys).toContain(' batch-all'); + expect(new Set(keys).size).toBe(keys.length); // no duplicate keys + + // No cross-contamination: each buffer flushes only its own event. + const ruleFlush = flushes.find((f) => f.key === 'order complete'); + const defaultFlush = flushes.find((f) => f.key === ' batch-all'); + expect(ruleFlush?.names).toEqual(['order complete']); + expect(defaultFlush?.names).toEqual(['page view']); }); }); @@ -976,9 +1096,9 @@ describe('Destination', () => { const flushes: { key: string; size: number }[] = []; const dest: Destination.Instance = { push: jest.fn(), - pushBatch: jest.fn((batch) => - flushes.push({ key: batch.key, size: batch.events.length }), - ), + pushBatch: jest.fn((batch) => { + flushes.push({ key: batch.key, size: batch.events.length }); + }), config: { init: true, batch: { wait: 60_000, size: 50 }, // default: big wait, size 50 diff --git a/packages/collector/src/__tests__/shutdown.test.ts b/packages/collector/src/__tests__/shutdown.test.ts index 684bcf1d9..fee4f5c2a 100644 --- a/packages/collector/src/__tests__/shutdown.test.ts +++ b/packages/collector/src/__tests__/shutdown.test.ts @@ -1,6 +1,34 @@ -import type { Destination, Source, Transformer } from '@walkeros/core'; +import type { Destination, Logger, Source, Transformer } from '@walkeros/core'; +import { Level } from '@walkeros/core'; import { startFlow } from '..'; +// Build a single-entry batch registry whose flush() behaviour is supplied by +// the test, so we can drive the fast-resolve and hang paths deterministically. +function makeBatches( + flush: () => Promise, +): NonNullable { + const batched: Destination.Instance['batches'] = {}; + return { + default: { + batched: { key: 'default', entries: [], events: [], data: [] }, + batchFn: () => {}, + flush, + }, + } satisfies typeof batched; +} + +// Capture only ERROR-level log messages via a custom logger handler. +function makeErrorCapture(): { + messages: string[]; + handler: Logger.Handler; +} { + const messages: string[] = []; + const handler: Logger.Handler = (level, message) => { + if (level === Level.ERROR) messages.push(message); + }; + return { messages, handler }; +} + describe('shutdown command', () => { it('calls destroy on all destinations', async () => { const destroyFn = jest.fn(); @@ -156,4 +184,95 @@ describe('shutdown command', () => { expect(order).toEqual(['source', 'destination', 'transformer']); }); + + describe('timeout guards do not leak timers', () => { + const STEP_TIMEOUT = 5000; + + beforeEach(() => { + jest.useFakeTimers(); + }); + + afterEach(() => { + jest.clearAllTimers(); + jest.useRealTimers(); + }); + + it('clears the flush timeout when batch flush resolves fast', async () => { + const { collector, elb } = await startFlow({}); + + const dest: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'test-dest', + batches: makeBatches(() => Promise.resolve()), + }; + collector.destinations['fast'] = dest; + + await elb('walker shutdown'); + + expect(jest.getTimerCount()).toBe(0); + }); + + it('clears the destroy timeout when destroy resolves fast', async () => { + const { collector, elb } = await startFlow({}); + + const dest: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'test-dest', + destroy: jest.fn(() => Promise.resolve()), + }; + collector.destinations['fast'] = dest; + + await elb('walker shutdown'); + + expect(jest.getTimerCount()).toBe(0); + }); + + it('times out and is caught when batch flush hangs', async () => { + const { messages, handler } = makeErrorCapture(); + const { collector, elb } = await startFlow({ logger: { handler } }); + + const dest: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'test-dest', + batches: makeBatches(() => new Promise(() => {})), + }; + collector.destinations['hang'] = dest; + + const shutdown = elb('walker shutdown'); + await jest.advanceTimersByTimeAsync(STEP_TIMEOUT); + await shutdown; // resolves, does not hang + + expect( + messages.some((m) => + /destination 'hang' batch flush timed out/.test(m), + ), + ).toBe(true); + expect(jest.getTimerCount()).toBe(0); + }); + + it('times out and is caught when destroy hangs', async () => { + const { messages, handler } = makeErrorCapture(); + const { collector, elb } = await startFlow({ logger: { handler } }); + + const dest: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'test-dest', + destroy: jest.fn(() => new Promise(() => {})), + }; + collector.destinations['hang'] = dest; + + const shutdown = elb('walker shutdown'); + await jest.advanceTimersByTimeAsync(STEP_TIMEOUT); + await shutdown; // resolves, does not hang + + expect( + messages.some((m) => /destination 'hang' destroy timed out/.test(m)), + ).toBe(true); + expect(jest.getTimerCount()).toBe(0); + }); + }); }); diff --git a/packages/collector/src/destination.ts b/packages/collector/src/destination.ts index af2557953..e22b56ebf 100644 --- a/packages/collector/src/destination.ts +++ b/packages/collector/src/destination.ts @@ -948,13 +948,14 @@ export async function destinationPush( } const eventMapping = processed.mapping; - const ruleHasBatch = Boolean(eventMapping?.batch); + // Presence, not truthiness: batch: 0 is a valid 0 ms wait, not "disabled". + const ruleHasBatch = eventMapping?.batch !== undefined; const mappingKey = ruleHasBatch ? processed.mappingKey || '* *' : BATCH_ALL_KEY; if ( - (ruleHasBatch || config.batch) && + (ruleHasBatch || config.batch !== undefined) && destination.pushBatch && config.mock === undefined ) { diff --git a/packages/collector/src/shutdown.ts b/packages/collector/src/shutdown.ts index ac8a366bd..339d72865 100644 --- a/packages/collector/src/shutdown.ts +++ b/packages/collector/src/shutdown.ts @@ -34,19 +34,22 @@ async function flushDestinationBatches( if (!batches) return []; const logger = rootLogger.scope(dest.type || 'destination'); return Object.values(batches).map(async (b) => { + let timer: ReturnType | undefined; try { await Promise.race([ b.flush(), - new Promise((_, reject) => - setTimeout( + new Promise((_, reject) => { + timer = setTimeout( () => reject(new Error(`destination '${id}' batch flush timed out`)), STEP_TIMEOUT, - ), - ), + ); + }), ]); } catch (err) { logger.error(`destination '${id}' batch flush failed: ${err}`); + } finally { + if (timer) clearTimeout(timer); } }); }); @@ -81,18 +84,21 @@ async function destroyStepGroup< logger, }; + let timer: ReturnType | undefined; try { await Promise.race([ destroy(context), - new Promise((_, reject) => - setTimeout( + new Promise((_, reject) => { + timer = setTimeout( () => reject(new Error(`${label} '${id}' destroy timed out`)), STEP_TIMEOUT, - ), - ), + ); + }), ]); } catch (err) { logger.error(`${label} '${id}' destroy failed: ${err}`); + } finally { + if (timer) clearTimeout(timer); } }); diff --git a/packages/collector/src/types/code.ts b/packages/collector/src/types/code.ts index 242f79ae5..1b6204ef4 100644 --- a/packages/collector/src/types/code.ts +++ b/packages/collector/src/types/code.ts @@ -25,4 +25,4 @@ export type PushFn = (event: WalkerOS.Event, context: PushContext) => void; export type PushBatchFn = ( batch: Destination.Batch, context: PushBatchContext, -) => void; +) => WalkerOS.PromiseOrValue; diff --git a/packages/core/src/cache.ts b/packages/core/src/cache.ts index 75fb2b9c7..a7fd76d9c 100644 --- a/packages/core/src/cache.ts +++ b/packages/core/src/cache.ts @@ -153,6 +153,9 @@ function fromSerializable(value: unknown): unknown { for (const [k, v] of Object.entries(data)) out[k] = fromSerializable(v); return out; } + // Unrecognized tag (not 'buffer'/'escape'): fall through to default object + // traversal so a user object that merely carries the marker key, or a value + // from a future tag version, is preserved verbatim rather than coerced. } const out: Record = {}; for (const [k, v] of Object.entries(value)) out[k] = fromSerializable(v); diff --git a/packages/core/src/types/destination.ts b/packages/core/src/types/destination.ts index 555f85710..b69cd9f98 100644 --- a/packages/core/src/types/destination.ts +++ b/packages/core/src/types/destination.ts @@ -239,7 +239,7 @@ export type PushFn = ( export type PushBatchFn = ( batch: Batch>, context: PushBatchContext, -) => void; +) => WalkerOS.PromiseOrValue; export type PushEvent = { event: WalkerOS.Event; diff --git a/packages/server/destinations/gcp/src/bigquery/__mocks__/@google-cloud/bigquery-storage.ts b/packages/server/destinations/gcp/src/bigquery/__mocks__/@google-cloud/bigquery-storage.ts index e3792c985..85c78f648 100644 --- a/packages/server/destinations/gcp/src/bigquery/__mocks__/@google-cloud/bigquery-storage.ts +++ b/packages/server/destinations/gcp/src/bigquery/__mocks__/@google-cloud/bigquery-storage.ts @@ -7,6 +7,7 @@ interface MockRowError { } let nextAppendRowErrors: MockRowError[] | null = null; +let nextAppendThrow: unknown = null; let nextCreateStreamConnectionError: unknown = null; class MockJSONWriter { @@ -15,6 +16,11 @@ class MockJSONWriter { } appendRows(rows: unknown[], offsetValue?: unknown) { calls.push({ method: 'appendRows', args: [rows, offsetValue] }); + const queuedThrow = nextAppendThrow; + if (queuedThrow !== null) { + nextAppendThrow = null; + throw queuedThrow; + } const queuedErrors = nextAppendRowErrors; nextAppendRowErrors = null; return { @@ -115,6 +121,7 @@ function __getMockCalls() { function __resetMockCalls() { calls.length = 0; nextAppendRowErrors = null; + nextAppendThrow = null; nextCreateStreamConnectionError = null; } @@ -126,6 +133,14 @@ function __setNextAppendRowErrors(errors: MockRowError[]): void { nextAppendRowErrors = errors; } +/** + * Test-only: make the next appendRows() throw synchronously, simulating a + * Storage Write API call failure. Auto-resets after one use. + */ +function __setNextAppendThrow(err: unknown): void { + nextAppendThrow = err; +} + /** * Test-only: queue an error for the next createStreamConnection() call so * tests can simulate openWriter failures (NotFound, INVALID_ARGUMENT, etc.). @@ -142,5 +157,6 @@ export { __getMockCalls, __resetMockCalls, __setNextAppendRowErrors, + __setNextAppendThrow, __setNextOpenWriterError, }; diff --git a/packages/server/destinations/gcp/src/bigquery/__tests__/bigquery-storage-mock.d.ts b/packages/server/destinations/gcp/src/bigquery/__tests__/bigquery-storage-mock.d.ts index 54da83986..cd35b9fa5 100644 --- a/packages/server/destinations/gcp/src/bigquery/__tests__/bigquery-storage-mock.d.ts +++ b/packages/server/destinations/gcp/src/bigquery/__tests__/bigquery-storage-mock.d.ts @@ -4,6 +4,7 @@ declare module '@google-cloud/bigquery-storage' { export function __setNextAppendRowErrors( errors: Array<{ index: number; code: number; message: string }>, ): void; + export function __setNextAppendThrow(err: unknown): void; export function __setNextOpenWriterError(err: unknown): void; } diff --git a/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts b/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts index 80d2d294b..ed46c2bc1 100644 --- a/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts +++ b/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts @@ -11,6 +11,7 @@ import { __getMockCalls, __resetMockCalls, __setNextAppendRowErrors, + __setNextAppendThrow, __setNextOpenWriterError, } from '@google-cloud/bigquery-storage'; import { @@ -307,7 +308,7 @@ describe('Server Destination BigQuery', () => { const data: Array = events.map(() => undefined); const entries = events.map((event) => ({ event })); - destination.pushBatch( + await destination.pushBatch( { key: 'k', events, data, entries }, createMockContext({ config, @@ -317,10 +318,6 @@ describe('Server Destination BigQuery', () => { }), ); - // pushBatch is synchronous and uses an IIFE for the async appendRows. - // Wait a tick to let the IIFE complete. - await new Promise((r) => setImmediate(r)); - const calls = __getMockCalls(); const append = calls.find((c) => c.method === 'appendRows'); expect(append).toBeDefined(); @@ -348,7 +345,7 @@ describe('Server Destination BigQuery', () => { const data = [mappedRow]; const entries = [{ event: e0 }, { event: e1, data: mappedRow }]; - destination.pushBatch( + await destination.pushBatch( { key: 'k', events, data, entries }, createMockContext({ config, @@ -358,8 +355,6 @@ describe('Server Destination BigQuery', () => { }), ); - await new Promise((r) => setImmediate(r)); - const calls = __getMockCalls(); const append = calls.find((c) => c.method === 'appendRows'); expect(append).toBeDefined(); @@ -372,7 +367,7 @@ describe('Server Destination BigQuery', () => { expect(rowsArg[1]).toEqual(mappedRow); }); - test('logs partial failures without throwing', async () => { + test('rejects when the batch has row errors', async () => { if (!destination.pushBatch) throw new Error('pushBatch missing'); const config = await buildConfig(); @@ -386,23 +381,24 @@ describe('Server Destination BigQuery', () => { const data: Array = events.map(() => undefined); const entries = events.map((event) => ({ event })); - // Synchronous call must not throw. - expect(() => - destination.pushBatch!( - { key: 'k', events, data, entries }, - createMockContext({ - config, - env: testEnv, - logger, - id: 'test-bq', - }), - ), - ).not.toThrow(); - - // Wait for the IIFE async work to complete. - await new Promise((r) => setImmediate(r)); + // The batch path now propagates row errors so the collector flush + // boundary routes the batch to the DLQ. + const pending = destination.pushBatch!( + { key: 'k', events, data, entries }, + createMockContext({ + config, + env: testEnv, + logger, + id: 'test-bq', + }), + ); + // Assert the summary framing, not just the row substring, so a future + // simplification of the thrown message is caught. + await expect(pending).rejects.toThrow('invalid row'); + await expect(pending).rejects.toThrow(/of 3 rows/); + await expect(pending).rejects.toThrow(/code=3/); - // INFO summary with ok/failed counts. + // INFO summary with ok/failed counts for operator visibility. expect(logger.info).toHaveBeenCalledWith( expect.any(String), expect.objectContaining({ ok: 2, failed: 1 }), @@ -417,7 +413,63 @@ describe('Server Destination BigQuery', () => { message: 'invalid row', }), ); - expect(logger.error).toHaveBeenCalledTimes(1); + }); + + test('rejects when appendRows throws', async () => { + if (!destination.pushBatch) throw new Error('pushBatch missing'); + + const config = await buildConfig(); + __resetMockCalls(); + + __setNextAppendThrow(new Error('append boom')); + + const logger = createMockLogger(); + const events = [createEvent(), createEvent()]; + const data: Array = events.map(() => undefined); + const entries = events.map((event) => ({ event })); + + await expect( + destination.pushBatch!( + { key: 'k', events, data, entries }, + createMockContext({ + config, + env: testEnv, + logger, + id: 'test-bq', + }), + ), + ).rejects.toThrow('append boom'); + }); + + test('rejects when init() has not run (writer missing)', async () => { + if (!destination.pushBatch) throw new Error('pushBatch missing'); + + // Misconfigured destination: settings present but writer absent because + // init() never ran. The batch path must surface this like single-push. + const settings: Settings = { + client: new BigQuery({ projectId }), + projectId, + datasetId, + tableId, + location: 'EU', + }; + const config: Config = { settings }; + + const logger = createMockLogger(); + const events = [createEvent()]; + const entries = events.map((event) => ({ event })); + + await expect( + destination.pushBatch!( + { key: 'k', events, data: [undefined], entries }, + createMockContext({ + config, + env: testEnv, + logger, + id: 'test-bq', + }), + ), + ).rejects.toThrow('writer is missing'); }); }); }); diff --git a/packages/server/destinations/gcp/src/bigquery/pushBatch.ts b/packages/server/destinations/gcp/src/bigquery/pushBatch.ts index 2db09e157..799708e21 100644 --- a/packages/server/destinations/gcp/src/bigquery/pushBatch.ts +++ b/packages/server/destinations/gcp/src/bigquery/pushBatch.ts @@ -5,26 +5,18 @@ import { eventToRow } from './eventToRow'; /** * Batched push using a single appendRows call. * - * Returns void (per PushBatchFn signature). Throwing inside an async body - * would produce an unhandled rejection in the collector's debounced batch path - * and silently drop batches. So: log, do not throw. + * Errors (row errors or a failed append call) propagate to the collector's + * batch flush boundary, which routes the whole batch to the DLQ and increments + * the failed counter. */ -export const pushBatch: PushBatchFn = (batch, { config, logger }) => { +export const pushBatch: PushBatchFn = async (batch, { config, logger }) => { const settings = config.settings; - if (!settings) { - logger.error('pushBatch: settings missing, init() not run'); - return; - } + if (!settings) return logger.throw('settings missing, init() not run'); const { writer, datasetId, tableId } = settings; - if (!writer) { - logger.error('pushBatch: writer missing, init() not run'); - return; - } - if (!datasetId || !tableId) { - logger.error('pushBatch: datasetId/tableId missing'); - return; - } + if (!writer) return logger.throw('writer is missing, init() not run'); + if (!datasetId) return logger.throw('datasetId is missing'); + if (!tableId) return logger.throw('tableId is missing'); const rows = batch.entries.map((e) => isObject(e.data) ? e.data : eventToRow(e.event), @@ -32,47 +24,50 @@ export const pushBatch: PushBatchFn = (batch, { config, logger }) => { if (rows.length === 0) return; - // Fire-and-forget the async call. PushBatchFn returns void by contract. - // Wrap in IIFE so we can await internally and log all outcomes without leaking rejections. - void (async () => { - try { - logger.debug('Calling BigQuery Storage Write API (batch)', { - dataset: datasetId, - table: tableId, + try { + logger.debug('Calling BigQuery Storage Write API (batch)', { + dataset: datasetId, + table: tableId, + rowCount: rows.length, + }); + const pending = writer.appendRows(rows); + const result = await pending.getResult(); + + if (result.rowErrors && result.rowErrors.length > 0) { + const failed = result.rowErrors.length; + // Operator-visible summary at INFO, each row error at ERROR. + logger.info('BigQuery batch had partial failures', { + ok: rows.length - failed, + failed, rowCount: rows.length, }); - const pending = writer.appendRows(rows); - const result = await pending.getResult(); - - if (result.rowErrors && result.rowErrors.length > 0) { - // Partial failure (per Resolved Decision #5): log a summary at INFO - // for operator visibility, then each row error at ERROR. Never throw. - const failed = result.rowErrors.length; - logger.info('BigQuery batch had partial failures', { - ok: rows.length - failed, - failed, - rowCount: rows.length, + for (const re of result.rowErrors) { + logger.error('BigQuery row append failed', { + index: re.index, + code: re.code, + message: re.message, }); - for (const re of result.rowErrors) { - logger.error('BigQuery row append failed', { - index: re.index, - code: re.code, - message: re.message, - }); - } - return; } - - // Full success: keep at DEBUG to avoid noise on every batch. - logger.debug('BigQuery batch append ok', { - ok: rows.length, - failed: 0, - offset: result.appendResult?.offset?.value, - }); - } catch (err) { - logger.error('BigQuery batch append threw', { - error: err instanceof Error ? err.message : String(err), - }); + // Throw on any partial failure: the collector flush boundary is atomic + // per batch, so the WHOLE batch (succeeded rows included) routes to the + // DLQ and counts as failed. Precise per-row accounting would require a + // separate PushBatchFn->collector contract change. + const first = result.rowErrors[0]; + throw new Error( + `BigQuery batch append failed: ${failed} of ${rows.length} rows, first error code=${first.code} message=${first.message}`, + ); } - })(); + + // Full success: keep at DEBUG to avoid noise on every batch. + logger.debug('BigQuery batch append ok', { + ok: rows.length, + failed: 0, + offset: result.appendResult?.offset?.value, + }); + } catch (err) { + logger.error('BigQuery batch append threw', { + error: err instanceof Error ? err.message : String(err), + }); + throw err; + } }; diff --git a/website/docs/destinations/code.mdx b/website/docs/destinations/code.mdx index c6693924b..ec0a71f12 100644 --- a/website/docs/destinations/code.mdx +++ b/website/docs/destinations/code.mdx @@ -187,7 +187,10 @@ is routed to the destination's `dlq` (dead-letter queue) and `status.destinations[id].failed` is incremented by the batch size. Per-item retry logic is the destination SDK's responsibility (BigQuery, Kafka, HubSpot each have their own backoff semantics). The collector -never drops batch failures silently. +never drops batch failures silently. Batched delivery is asynchronous, +so the originating `elb()` call's result cannot reflect a later +batch-flush outcome; failures appear in the destination's `failed` count, +dead-letter buffer, and logs. ### on diff --git a/website/docs/destinations/server/gcp.mdx b/website/docs/destinations/server/gcp.mdx index ff9c8d712..ac0ce7bf8 100644 --- a/website/docs/destinations/server/gcp.mdx +++ b/website/docs/destinations/server/gcp.mdx @@ -243,7 +243,9 @@ ingestion. This replaces the legacy `tabledata.insertAll` path. - **Batching**: `pushBatch` is implemented. Set `config.batch` on the destination (a bare number is the debounce `wait` in ms) to flush all events in a window as a single `appendRows` call. No `'* *'` wildcard mapping rule is - needed. + needed. The batch path awaits the write and surfaces row errors: a failed + append fails the batch, which routes the events to the dead-letter buffer and + the `failed` count rather than being logged and ignored. :::caution EXPERIMENTAL SDK The upstream `@google-cloud/bigquery-storage` package self-marks as From e40b2f1f28e82bddc08e8dc22e615fff5eb09d70 Mon Sep 17 00:00:00 2001 From: alexanderkirtzel Date: Thu, 4 Jun 2026 09:37:32 +0200 Subject: [PATCH 04/10] types --- .../cli/src/__tests__/helpers/msw-handlers.ts | 41 +++++++++++++++---- .../api/api-projects.integration.test.ts | 10 +++-- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/packages/cli/src/__tests__/helpers/msw-handlers.ts b/packages/cli/src/__tests__/helpers/msw-handlers.ts index 3682c7f71..6d595ab71 100644 --- a/packages/cli/src/__tests__/helpers/msw-handlers.ts +++ b/packages/cli/src/__tests__/helpers/msw-handlers.ts @@ -1,11 +1,19 @@ import { http, HttpResponse } from 'msw'; import type { paths } from '../../types/api.gen.js'; -// Extract response types from the spec +// Extract response types from the spec. The project endpoints return three +// distinct shapes: the list returns the full record, get-by-id returns a +// narrower detail view, and create returns only the freshly written fields. type ProjectsResponse = paths['/api/projects']['get']['responses']['200']['content']['application/json']; -type ProjectResponse = +type CreateProjectResponse = + paths['/api/projects']['post']['responses']['201']['content']['application/json']; +type ProjectDetailResponse = paths['/api/projects/{projectId}']['get']['responses']['200']['content']['application/json']; +type UpdateProjectResponse = + paths['/api/projects/{projectId}']['patch']['responses']['200']['content']['application/json']; +// The full project record, as carried by the list endpoint. +type Project = ProjectsResponse['projects'][number]; type FlowsResponse = paths['/api/projects/{projectId}/flows']['get']['responses']['200']['content']['application/json']; type FlowResponse = @@ -13,7 +21,8 @@ type FlowResponse = type WhoamiResponse = paths['/api/auth/whoami']['get']['responses']['200']['content']['application/json']; -// If the API spec changes these shapes, TypeScript will error HERE +// If the API spec changes these shapes, TypeScript will error HERE. +// Full record returned by the list endpoint. export const mockProject = { id: 'proj_test123', name: 'Test Project', @@ -24,7 +33,22 @@ export const mockProject = { flowCount: 0, deploymentCount: 0, isDemo: false, -} satisfies ProjectResponse; +} satisfies Project; + +// Narrower view returned by GET /api/projects/:projectId. +export const mockProjectDetail = { + id: mockProject.id, + name: mockProject.name, + siteUrl: null, + role: mockProject.role, +} satisfies ProjectDetailResponse; + +// POST /api/projects returns only the freshly written fields. +export const mockCreatedProject = { + id: mockProject.id, + name: mockProject.name, + createdAt: mockProject.createdAt, +} satisfies CreateProjectResponse; export const mockFlow = { id: 'flow_test456', @@ -61,19 +85,22 @@ export const handlers = [ http.post('*/api/projects', async ({ request }) => { const body = (await request.json()) as { name: string }; return HttpResponse.json( - { ...mockProject, name: body.name } satisfies ProjectResponse, + { + ...mockCreatedProject, + name: body.name, + } satisfies CreateProjectResponse, { status: 201 }, ); }), http.get('*/api/projects/:projectId', () => { - return HttpResponse.json(mockProject satisfies ProjectResponse); + return HttpResponse.json(mockProjectDetail satisfies ProjectDetailResponse); }), http.patch('*/api/projects/:projectId', async ({ request }) => { const body = (await request.json()) as { name?: string }; return HttpResponse.json({ ...mockProject, ...(body.name && { name: body.name }), - } satisfies ProjectResponse); + } satisfies UpdateProjectResponse); }), http.delete('*/api/projects/:projectId', () => { return new HttpResponse(null, { status: 204 }); diff --git a/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts b/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts index 4d3b537fa..a6f7dee38 100644 --- a/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts +++ b/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts @@ -26,7 +26,11 @@ import { updateProject, deleteProject, } from '../../../commands/projects/index.js'; -import { mockProject } from '../../helpers/msw-handlers.js'; +import { + mockProject, + mockProjectDetail, + mockCreatedProject, +} from '../../helpers/msw-handlers.js'; describe('projects (integration via MSW)', () => { it('listProjects returns typed project list', async () => { @@ -40,12 +44,12 @@ describe('projects (integration via MSW)', () => { it('getProject returns a single project', async () => { const result = await getProject({ projectId: 'proj_test123' }); - expect(result).toEqual(mockProject); + expect(result).toEqual(mockProjectDetail); }); it('createProject sends name and returns project', async () => { const result = await createProject({ name: 'New Project' }); - expect(result).toEqual({ ...mockProject, name: 'New Project' }); + expect(result).toEqual({ ...mockCreatedProject, name: 'New Project' }); }); it('updateProject patches and returns updated project', async () => { From c1d53e574d8a037f944585291bd37e8225ff0495 Mon Sep 17 00:00:00 2001 From: alexanderkirtzel Date: Thu, 4 Jun 2026 13:15:11 +0200 Subject: [PATCH 05/10] request caching --- .changeset/request-cache-serialization.md | 11 +- packages/cli/eslint.config.js | 30 ++ packages/cli/openapi/spec.json | 21 +- .../cli/src/__tests__/helpers/msw-handlers.ts | 5 +- .../api/api-projects.integration.test.ts | 6 +- .../bundle/browser-dev-externals.test.ts | 100 ++++ .../integration/bundle/size-budget.test.ts | 15 +- .../bundle/skeleton-contract.test.ts | 202 ++++++++ .../unit/bundle/bundler-codegen.test.ts | 68 ++- .../src/__tests__/unit/bundle/targets.test.ts | 8 +- packages/cli/src/commands/bundle/bundler.ts | 133 +++-- packages/cli/src/commands/bundle/targets.ts | 19 +- .../cli/src/commands/push/flow-context.ts | 8 +- packages/cli/src/commands/push/index.ts | 108 +++- packages/cli/src/types/api.gen.d.ts | 12 +- .../destination-batch-correctness.test.ts | 474 +++++++++++++++++- .../collector/src/__tests__/shutdown.test.ts | 106 +++- packages/collector/src/collector.ts | 1 + packages/collector/src/destination.ts | 135 +++-- packages/collector/src/handle.ts | 8 +- packages/collector/src/types/code.ts | 2 +- packages/core/src/__tests__/cache.test.ts | 56 +++ packages/core/src/cache.ts | 32 +- packages/core/src/invocations.ts | 11 +- packages/core/src/types/collector.ts | 8 + packages/core/src/types/destination.ts | 42 +- .../gcp/src/bigquery/__tests__/index.test.ts | 26 +- .../gcp/src/bigquery/pushBatch.ts | 33 +- .../src/__tests__/cache-roundtrip.test.ts | 133 +++++ .../stores/fs/src/__tests__/store.test.ts | 55 ++ .../stores/gcs/src/__tests__/store.test.ts | 12 + .../stores/s3/src/__tests__/store.test.ts | 12 + .../stores/sheets/src/__tests__/store.test.ts | 23 +- packages/server/stores/sheets/src/store.ts | 11 + .../browser/src/__tests__/html/walker.html | 11 + .../browser/src/__tests__/walker.test.ts | 22 + website/docs/stores/cache.mdx | 4 +- website/docs/stores/server/sheets.mdx | 2 +- 38 files changed, 1806 insertions(+), 159 deletions(-) create mode 100644 packages/cli/eslint.config.js create mode 100644 packages/cli/src/__tests__/integration/bundle/browser-dev-externals.test.ts create mode 100644 packages/cli/src/__tests__/integration/bundle/skeleton-contract.test.ts diff --git a/.changeset/request-cache-serialization.md b/.changeset/request-cache-serialization.md index d68387e68..dbb37d8fe 100644 --- a/.changeset/request-cache-serialization.md +++ b/.changeset/request-cache-serialization.md @@ -3,8 +3,9 @@ '@walkeros/server-store-fs': patch --- -Request caching now persists structured HTTP responses (including binary bodies) -to any store backend and honors TTL. Previously, caching a response to a -filesystem, S3, or GCS store could crash the process or never populate, and -entries never expired. Cached values now round-trip safely and expire correctly -instead of serving stale content after a redeploy. +Request caching now persists structured HTTP responses, including binary bodies +(`Buffer`, `Uint8Array`, `ArrayBuffer`), to byte/string store backends +(filesystem, S3, GCS, in-memory) and honors TTL. Previously, caching a response +could crash the process or never populate, and entries never expired. Cached +values now round-trip safely (binary bodies decode back as a `Buffer`) and +expire correctly instead of serving stale content after a redeploy. diff --git a/packages/cli/eslint.config.js b/packages/cli/eslint.config.js new file mode 100644 index 000000000..2a45cc5a9 --- /dev/null +++ b/packages/cli/eslint.config.js @@ -0,0 +1,30 @@ +import baseConfig from '@walkeros/config/eslint'; + +export default [ + ...baseConfig, + { + // Guard the bundle codegen against regressing the lazy /dev registry. + // The skeleton must register a package's ./dev surface as a lazy thunk + // (`() => import('/dev')`) so the deploy wrap can DCE it. A static + // `import * as ... from '/dev'` cannot be tree-shaken and leaks the + // dev graph (zod schemas) into production bundles. A non-literal dynamic + // import specifier would also defeat static analysis and esbuild's DCE. + files: ['src/commands/bundle/bundler.ts'], + rules: { + 'no-restricted-syntax': [ + 'error', + { + selector: + 'ImportDeclaration[source.value=/\\/dev$/]:has(ImportNamespaceSpecifier)', + message: + "Do not statically `import * as` from a '/dev' subpath in the bundle codegen: it cannot be tree-shaken out of the deploy wrap. Emit a lazy `() => import('/dev')` registry entry instead.", + }, + { + selector: 'ImportExpression > .source:not(Literal)', + message: + 'Dynamic import() in the bundle codegen must use a literal specifier so esbuild can statically analyse and DCE it.', + }, + ], + }, + }, +]; diff --git a/packages/cli/openapi/spec.json b/packages/cli/openapi/spec.json index 6bf9974f8..3b1a032c9 100644 --- a/packages/cli/openapi/spec.json +++ b/packages/cli/openapi/spec.json @@ -770,6 +770,25 @@ }, "required": ["id", "name", "createdAt"] }, + "UpdateProjectResponse": { + "type": "object", + "properties": { + "id": { + "type": "string", + "pattern": "^proj_[a-zA-Z0-9_-]+$", + "example": "proj_x7y8z9" + }, + "name": { + "type": "string" + }, + "updatedAt": { + "type": "string", + "format": "date-time", + "example": "2026-01-26T14:30:00.000Z" + } + }, + "required": ["id", "name", "updatedAt"] + }, "DeploymentSummary": { "type": "object", "properties": { @@ -3084,7 +3103,7 @@ "content": { "application/json": { "schema": { - "$ref": "#/components/schemas/Project" + "$ref": "#/components/schemas/UpdateProjectResponse" } } } diff --git a/packages/cli/src/__tests__/helpers/msw-handlers.ts b/packages/cli/src/__tests__/helpers/msw-handlers.ts index 6d595ab71..e483ad0af 100644 --- a/packages/cli/src/__tests__/helpers/msw-handlers.ts +++ b/packages/cli/src/__tests__/helpers/msw-handlers.ts @@ -98,8 +98,9 @@ export const handlers = [ http.patch('*/api/projects/:projectId', async ({ request }) => { const body = (await request.json()) as { name?: string }; return HttpResponse.json({ - ...mockProject, - ...(body.name && { name: body.name }), + id: mockProject.id, + name: body.name ?? mockProject.name, + updatedAt: mockProject.updatedAt, } satisfies UpdateProjectResponse); }), http.delete('*/api/projects/:projectId', () => { diff --git a/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts b/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts index a6f7dee38..a26d7a1c2 100644 --- a/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts +++ b/packages/cli/src/__tests__/integration/api/api-projects.integration.test.ts @@ -57,7 +57,11 @@ describe('projects (integration via MSW)', () => { projectId: 'proj_test123', name: 'Renamed', }); - expect(result).toEqual({ ...mockProject, name: 'Renamed' }); + expect(result).toEqual({ + id: mockProject.id, + name: 'Renamed', + updatedAt: mockProject.updatedAt, + }); }); it('deleteProject returns success', async () => { diff --git a/packages/cli/src/__tests__/integration/bundle/browser-dev-externals.test.ts b/packages/cli/src/__tests__/integration/bundle/browser-dev-externals.test.ts new file mode 100644 index 000000000..b871f20e3 --- /dev/null +++ b/packages/cli/src/__tests__/integration/bundle/browser-dev-externals.test.ts @@ -0,0 +1,100 @@ +/** + * Integration Test: browser skeleton externalizes each `/dev` + * + * Proves the option-A externalization: a browser skeleton build (withDev / + * skipWrapper) keeps the lazy `/dev` registry a literal `import('/dev')` + * instead of inlining the /dev graph (zod schemas, zodToSchema, etc.). The + * deploy wrap DCEs the unreferenced registry, while simulate/preview resolve + * the dev module on demand. + */ + +import fs from 'fs-extra'; +import os from 'os'; +import path from 'path'; +import { bundleCore } from '../../../commands/bundle/bundler.js'; +import { loadBundleConfig } from '../../../config/loader.js'; +import { createCLILogger } from '../../../core/cli-logger.js'; +import type { Logger } from '@walkeros/core'; + +// @walkeros/web-source-browser genuinely exposes ./dev. `zodToSchema` lives in +// its dev module only (not in the runtime entry), so it is a clean sentinel +// for whether the /dev graph got inlined. +const browserFlow = { + version: 4, + flows: { + default: { + sources: { + browser: { + package: '@walkeros/web-source-browser', + }, + }, + destinations: { + demo: { + package: '@walkeros/destination-demo', + }, + }, + config: { + platform: 'web', + bundle: { + packages: { + '@walkeros/collector': { + version: 'latest', + imports: ['startFlow'], + }, + '@walkeros/web-source-browser': { version: 'latest' }, + '@walkeros/destination-demo': { version: 'latest' }, + }, + }, + }, + }, + }, +}; + +describe('browser skeleton /dev externals', () => { + let tmpDir: string; + let logger: Logger.Instance; + + beforeEach(async () => { + tmpDir = path.join( + os.tmpdir(), + `browser-dev-externals-${Date.now()}-${Math.random().toString(36).slice(2)}`, + ); + await fs.ensureDir(tmpDir); + logger = createCLILogger({ silent: true }); + jest.spyOn(console, 'log').mockImplementation(() => {}); + }); + + afterEach(async () => { + await fs.remove(tmpDir); + jest.restoreAllMocks(); + }); + + it('keeps `/dev` external+literal and does NOT inline the /dev graph', async () => { + const { flowSettings, buildOptions } = loadBundleConfig(browserFlow, { + configPath: path.join(tmpDir, 'flow.json'), + }); + + // Browser ESM skeleton (skipWrapper -> withDev). Disable cache + minify so + // we can grep the raw stage-1 text deterministically. + buildOptions.output = path.join(tmpDir, 'skeleton.mjs'); + buildOptions.platform = 'browser'; + buildOptions.format = 'esm'; + buildOptions.skipWrapper = true; + buildOptions.withDev = true; + buildOptions.cache = false; + buildOptions.minify = false; + + await bundleCore(flowSettings, buildOptions, logger); + + const output = await fs.readFile(buildOptions.output, 'utf-8'); + + // The lazy registry survives as a literal dynamic import of the /dev + // subpath (esbuild normalizes the quote style, so match quote-agnostic). + expect(output).toMatch( + /import\(["']@walkeros\/web-source-browser\/dev["']\)/, + ); + + // The /dev body is NOT inlined: a dev-only symbol must be absent. + expect(output).not.toContain('zodToSchema'); + }, 60000); +}); diff --git a/packages/cli/src/__tests__/integration/bundle/size-budget.test.ts b/packages/cli/src/__tests__/integration/bundle/size-budget.test.ts index 52f0739df..17d629fe6 100644 --- a/packages/cli/src/__tests__/integration/bundle/size-budget.test.ts +++ b/packages/cli/src/__tests__/integration/bundle/size-budget.test.ts @@ -4,7 +4,10 @@ * Guards against two regressions at once: * 1. Size blow-up: the cdn IIFE must stay within the budget. * 2. Dev/zod leakage: the cdn and cdn-skeleton outputs must be free of any - * runtime-schema / zod markers that would indicate dev imports snuck in. + * runtime-schema / zod markers that would indicate the /dev graph got + * inlined. The cdn-skeleton legitimately carries the lazy `/dev` registry + * as a literal `import('/dev')`, but that subpath stays external, so + * the zod/schema body must not appear inline. * * Baseline: CDN direct IIFE = 70,560 bytes. Budget = baseline Γ— 1.10. */ @@ -70,7 +73,7 @@ describe('CDN bundle size budget', () => { expect(text).not.toMatch(/@walkeros\/[\w-]+\/dev/); }, 120000); - it('cdn-skeleton target is clean of the same markers', async () => { + it('cdn-skeleton carries the lazy /dev registry without inlining the zod graph', async () => { const out = join(tmpDir, 'skel.mjs'); await bundle(MINIMAL_FLOW, { target: 'cdn-skeleton', @@ -79,9 +82,15 @@ describe('CDN bundle size budget', () => { }); const text = await readFile(out, 'utf8'); + // The /dev subpath stays external, so the zod/schema body is NOT inlined. expect(text).not.toMatch(/\b_zod\b/); expect(text).not.toContain('toJSONSchema'); - expect(text).not.toMatch(/@walkeros\/[\w-]+\/dev/); + + // The lazy registry survives as a literal `import('/dev')`; the deploy + // wrap DCEs it. The browser skeleton externalizes the subpath so the /dev + // graph never inlines. + expect(text).toContain('__devExports'); + expect(text).toMatch(/import\(["']@walkeros\/[\w-]+\/dev["']\)/); }, 120000); it('simulate target DOES include dev schemas (regression guard)', async () => { diff --git a/packages/cli/src/__tests__/integration/bundle/skeleton-contract.test.ts b/packages/cli/src/__tests__/integration/bundle/skeleton-contract.test.ts new file mode 100644 index 000000000..c52793af0 --- /dev/null +++ b/packages/cli/src/__tests__/integration/bundle/skeleton-contract.test.ts @@ -0,0 +1,202 @@ +/** + * KEYSTONE anti-drift contract test: ONE skeleton, BOTH contracts. + * + * The original bug existed because "dev-free for deploy" and "dev-available + * for simulate" were two separate artifacts built under two separate + * assumptions that drifted apart. The unification makes a single skeleton + * serve both paths. This test asserts BOTH properties off the SAME single + * skeleton build so they can never drift again: + * + * 1. Simulate contract: the skeleton exposes a lazy `__devExports` registry + * whose thunks resolve each package's `/dev` module on demand, so the + * simulate/preview path can introspect schemas and examples. + * 2. Deploy purity: running the real deploy wrap (`wrapSkeleton`) on that + * SAME skeleton file produces output with zero dev-only zod graph. + * + * Do NOT split this into two independent builds. Two builds re-create the + * drift surface inside the test suite, the very failure mode this guards. + * + * Sentinel discipline: + * - Deploy purity uses ONLY dev-only zod sentinels (`zodToSchema`, + * `toJSONSchema`, `_zod`, `ZodObject`). These live solely in the `/dev` + * zod graph. + * - `createTrigger` is a legitimate RUNTIME export of the browser source and + * appears in production bundles. It is NOT a dev-leak marker, so it is used + * ONLY for the simulate-resolution assertion, never for purity. + */ + +import fs from 'fs-extra'; +import path from 'path'; +import { pathToFileURL } from 'url'; +import { bundleCore } from '../../../commands/bundle/bundler.js'; +import { wrapSkeleton } from '../../../commands/bundle/wrap.js'; +import { loadBundleConfig } from '../../../config/loader.js'; +import { createCLILogger } from '../../../core/cli-logger.js'; +import type { Logger } from '@walkeros/core'; + +// Skeletons are written inside the CLI package tree (not os.tmpdir) so that the +// runtime dynamic `import('/dev')` inside the skeleton resolves bare +// `@walkeros/*` specifiers against the workspace node_modules, exactly as it +// would when the simulate path imports a real skeleton. +const cliRoot = path.resolve(__dirname, '../../../..'); +const workDir = path.join(cliRoot, '.tmp/skeleton-contract'); + +// Dev-only zod-graph sentinels. Each lives ONLY in a package's `/dev` module +// (the schema/zod graph), never in the runtime entry. Their presence in a +// wrapped deploy bundle means the /dev graph leaked. +const DEV_ZOD_SENTINELS = ['zodToSchema', 'toJSONSchema', '_zod', 'ZodObject']; + +interface PlatformCase { + name: string; + platform: 'browser' | 'node'; + pkg: string; + flow: unknown; +} + +const cases: PlatformCase[] = [ + { + name: 'web/browser', + platform: 'browser', + pkg: '@walkeros/web-source-browser', + flow: { + version: 4, + flows: { + default: { + sources: { browser: { package: '@walkeros/web-source-browser' } }, + destinations: { demo: { package: '@walkeros/destination-demo' } }, + config: { + platform: 'web', + bundle: { + packages: { + '@walkeros/collector': { + version: 'latest', + imports: ['startFlow'], + }, + '@walkeros/web-source-browser': { version: 'latest' }, + '@walkeros/destination-demo': { version: 'latest' }, + }, + }, + }, + }, + }, + }, + }, + { + name: 'server/node', + platform: 'node', + pkg: '@walkeros/server-source-fetch', + flow: { + version: 4, + flows: { + default: { + sources: { fetch: { package: '@walkeros/server-source-fetch' } }, + destinations: { demo: { package: '@walkeros/destination-demo' } }, + config: { + platform: 'server', + bundle: { + packages: { + '@walkeros/collector': { + version: 'latest', + imports: ['startFlow'], + }, + '@walkeros/server-source-fetch': { version: 'latest' }, + '@walkeros/destination-demo': { version: 'latest' }, + }, + }, + }, + }, + }, + }, + }, +]; + +describe('skeleton contract (one build, both contracts)', () => { + let logger: Logger.Instance; + + beforeAll(async () => { + await fs.ensureDir(workDir); + }); + + beforeEach(() => { + logger = createCLILogger({ silent: true }); + jest.spyOn(console, 'log').mockImplementation(() => {}); + }); + + afterEach(() => { + jest.restoreAllMocks(); + }); + + afterAll(async () => { + await fs.remove(workDir).catch(() => {}); + }); + + describe.each(cases)('$name', ({ platform, pkg, flow }) => { + it('serves simulate (lazy /dev resolves) AND deploy (dev-free) off ONE skeleton build', async () => { + const caseDir = path.join(workDir, platform); + await fs.ensureDir(caseDir); + const skeletonPath = path.join(caseDir, 'skeleton.mjs'); + + // ── ONE build: the unified skeleton (skipWrapper, withDev). ────────── + const { flowSettings, buildOptions } = loadBundleConfig(flow, { + configPath: path.join(caseDir, 'flow.json'), + }); + buildOptions.output = skeletonPath; + buildOptions.platform = platform; + buildOptions.format = 'esm'; + buildOptions.skipWrapper = true; + buildOptions.withDev = true; + buildOptions.cache = false; + buildOptions.minify = false; + + await bundleCore(flowSettings, buildOptions, logger); + + // ── Contract 1: simulate resolution off that build. ───────────────── + // The skeleton externalizes `/dev` as a literal lazy import, so + // the dev graph is not inlined but stays resolvable on demand. + const skeletonText = await fs.readFile(skeletonPath, 'utf-8'); + expect(skeletonText).toContain('__devExports'); + expect(skeletonText).toMatch( + new RegExp(`import\\(["']${pkg.replace(/[/\\]/g, '\\$&')}/dev["']\\)`), + ); + + const skeletonModule = await import(pathToFileURL(skeletonPath).href); + const registry = skeletonModule.__devExports; + expect(typeof registry[pkg]).toBe('function'); + + const devModule = await registry[pkg](); + expect(typeof devModule.examples.createTrigger).toBe('function'); + + // ── Contract 2: deploy purity off the SAME skeleton file. ─────────── + // Run the real deploy wrap (stage-2) on the skeleton we just built and + // resolved. The unreferenced lazy registry must be DCE'd, leaving zero + // dev-only zod graph in the wrapped output. + const wrappedPath = path.join(caseDir, 'wrapped.js'); + await wrapSkeleton({ + skeletonPath, + platform, + outputPath: wrappedPath, + minify: false, + ...(platform === 'browser' + ? { windowCollector: 'collector', windowElb: 'elb' } + : {}), + }); + + const wrappedText = await fs.readFile(wrappedPath, 'utf-8'); + for (const sentinel of DEV_ZOD_SENTINELS) { + expect(wrappedText).not.toContain(sentinel); + } + // No `/dev` import path may survive into the deploy bundle. + expect(wrappedText).not.toMatch(/@walkeros\/[\w-]+\/dev/); + + // If a sourcemap was emitted, grep it for the same sentinels: a leaked + // dev symbol can hide in mappings/sources even when absent from code. + const mapPath = `${wrappedPath}.map`; + if (await fs.pathExists(mapPath)) { + const mapText = await fs.readFile(mapPath, 'utf-8'); + for (const sentinel of DEV_ZOD_SENTINELS) { + expect(mapText).not.toContain(sentinel); + } + } + }, 120000); + }); +}); diff --git a/packages/cli/src/__tests__/unit/bundle/bundler-codegen.test.ts b/packages/cli/src/__tests__/unit/bundle/bundler-codegen.test.ts index a42e9bffe..287ffd3a5 100644 --- a/packages/cli/src/__tests__/unit/bundle/bundler-codegen.test.ts +++ b/packages/cli/src/__tests__/unit/bundle/bundler-codegen.test.ts @@ -22,6 +22,7 @@ import { generateWrapEntryServer, } from '../../../commands/bundle/bundler.js'; import { loadBundleConfig } from '../../../config/index.js'; +import path from 'path'; import type { Flow } from '@walkeros/core'; import type { BuildOptions } from '../../../types/bundle.js'; @@ -387,6 +388,69 @@ describe('Implicit Collector', () => { }); }); +describe('lazy /dev registry', () => { + // Resolve the real @walkeros/web-source-browser package so its package.json + // ./dev export is read by the codegen (it genuinely exposes ./dev). + const browserPkgPath = path.resolve( + __dirname, + '../../../../../web/sources/browser', + ); + + const flowSettings: Flow = { + config: { + platform: 'web', + bundle: { + packages: { + '@walkeros/web-source-browser': {}, + }, + }, + }, + sources: { + browser: { + package: '@walkeros/web-source-browser', + }, + }, + destinations: {}, + }; + + const buildOptions = { + platform: 'browser', + format: 'esm', + // withDev is resolved from skipWrapper when not set explicitly. + skipWrapper: true, + packages: { + '@walkeros/web-source-browser': {}, + }, + output: './dist/bundle.js', + code: '', + }; + + it('emits a lazy dynamic-import registry entry for a /dev-exposing package', async () => { + const { codeEntry } = await createEntryPoint( + flowSettings, + buildOptions as BuildOptions, + new Map([['@walkeros/web-source-browser', browserPkgPath]]), + ); + + expect(codeEntry).toContain('export const __devExports = {'); + expect(codeEntry).toContain( + "'@walkeros/web-source-browser': () => import('@walkeros/web-source-browser/dev')", + ); + }); + + it('emits NO static namespace dev import', async () => { + const { codeEntry } = await createEntryPoint( + flowSettings, + buildOptions as BuildOptions, + new Map([['@walkeros/web-source-browser', browserPkgPath]]), + ); + + // A static `import * as ... from '/dev'` cannot be tree-shaken out of + // the deploy wrap; the registry must use a lazy thunk instead. + expect(codeEntry).not.toMatch(/import \* as [^\n]*\/dev'/); + }); +}); + describe('detectStepPackages', () => { it('detects transformer packages from flow config', () => { const flowSettings: Flow = { @@ -1130,13 +1194,13 @@ describe('bundle() target resolution', () => { expect(buildOptions.withDev).toBe(false); }); - it('target="cdn-skeleton" resolves to skipWrapper=true, withDev=false', async () => { + it('target="cdn-skeleton" resolves to skipWrapper=true, withDev=true', async () => { const { bundle } = await import('../../../commands/bundle/index.js'); await bundle(MINIMAL_FLOW, { silent: true, target: 'cdn-skeleton' }); expect(warnSpy).not.toHaveBeenCalled(); const buildOptions = bundleCoreMock.mock.calls[0][1]; expect(buildOptions.skipWrapper).toBe(true); - expect(buildOptions.withDev).toBe(false); + expect(buildOptions.withDev).toBe(true); }); it('target="simulate" resolves to skipWrapper=true, withDev=true', async () => { diff --git a/packages/cli/src/__tests__/unit/bundle/targets.test.ts b/packages/cli/src/__tests__/unit/bundle/targets.test.ts index 386934485..d38c8b39a 100644 --- a/packages/cli/src/__tests__/unit/bundle/targets.test.ts +++ b/packages/cli/src/__tests__/unit/bundle/targets.test.ts @@ -13,19 +13,19 @@ describe('BundleTarget presets', () => { }); }); - it('cdn-skeleton: ESM skeleton for later IIFE wrap, no dev imports', () => { + it('cdn-skeleton: ESM skeleton for later IIFE wrap, carries lazy dev registry', () => { expect(resolveTarget('cdn-skeleton')).toEqual({ skipWrapper: true, - withDev: false, + withDev: true, platform: 'browser', injectEnv: false, // injected during wrapSkeleton stage 2, not here }); }); - it('runner: server skeleton for Scaleway function runtime, no dev imports', () => { + it('runner: server skeleton for Scaleway function runtime, carries lazy dev registry', () => { expect(resolveTarget('runner')).toEqual({ skipWrapper: true, - withDev: false, + withDev: true, platform: 'node', injectEnv: false, }); diff --git a/packages/cli/src/commands/bundle/bundler.ts b/packages/cli/src/commands/bundle/bundler.ts index 0d2b0850b..b5606cdad 100644 --- a/packages/cli/src/commands/bundle/bundler.ts +++ b/packages/cli/src/commands/bundle/bundler.ts @@ -510,11 +510,8 @@ export async function bundleCore( // Step 4: Create split entry point (code skeleton + data payload) logger.debug('Creating entry point'); - const { codeEntry, dataPayload, hasFlow } = await createEntryPoint( - flowSettings, - buildOptions, - packagePaths, - ); + const { codeEntry, dataPayload, hasFlow, devPackages } = + await createEntryPoint(flowSettings, buildOptions, packagePaths); // outputPath was resolved at the top of bundleCore (alongside the stale // cleanup paths). Just ensure its directory exists. @@ -530,7 +527,13 @@ export async function bundleCore( // dependency-version signal in `generateCacheKeyContent`). Reuse it here // so a transitive version bump busts both cache layers consistently. const codeKeyInputs: CodeCacheKeyInputs = { - externals: new Set(), + // Browser skeletons externalize each `/dev` so the lazy registry is + // not inlined; record it so the L2 slot reflects the externalization. + externals: new Set( + buildOptions.platform === 'browser' + ? devPackages.map((p) => `${p}/dev`) + : [], + ), platform: buildOptions.platform === 'node' ? 'node' : 'browser', target: resolveTarget(buildOptions), nodeMajor: parseInt(process.versions.node.split('.')[0], 10), @@ -570,6 +573,7 @@ export async function bundleCore( packagePaths, logger, expectedTopLevelPackages, + devPackages, ); try { @@ -816,6 +820,7 @@ function createEsbuildOptions( packagePaths: Map, logger: Logger.Instance, stepPackageExternals: string[] = [], + devPackages: string[] = [], ): esbuild.BuildOptions { // Don't use aliases - they cause esbuild to bundle even external packages // Instead, use absWorkingDir to point to temp directory where node_modules is @@ -853,8 +858,19 @@ function createEsbuildOptions( 'process.env.NODE_ENV': '"production"', global: 'globalThis', }; - // For browser bundles, let users handle Node.js built-ins as needed - baseOptions.external = buildOptions.external || []; + // For browser bundles, let users handle Node.js built-ins as needed. + // When the skeleton carries the lazy /dev registry (withDev), keep each + // `/dev` external so the registry stays a literal `import('/dev')` + // instead of inlining the /dev graph (zod schemas etc.) into the skeleton. + // The deploy wrap then DCEs the unreferenced registry to zero bytes, while + // simulate/preview can still resolve the dev module on demand. `devPackages` + // is empty for the finished IIFE (`cdn`, withDev:false), so this is a no-op + // there. + const devSubpathExternals = devPackages.map((p) => `${p}/dev`); + baseOptions.external = [ + ...(buildOptions.external || []), + ...devSubpathExternals, + ]; } else if (buildOptions.platform === 'node') { // Node builtins are always external. Step packages (sources, destinations, // transformers, stores resolved by pacote) are also externalized so the @@ -1081,6 +1097,10 @@ export function detectNamedImports( interface ImportGenerationResult { importStatements: string[]; devExportEntries: string[]; + // Packages that received a lazy `/dev` registry entry. Exposed so a later + // browser-build step can externalize the same `/dev` specifiers without + // recomputing the set (preventing drift between the registry and externals). + devPackages: string[]; } /** @@ -1165,36 +1185,50 @@ async function generateImportStatements( // Examples are no longer auto-imported - simulator loads them dynamically } - // Generate /dev imports for packages that expose a ./dev export. - // Only emitted when withDev is true (i.e., skipWrapper bundles consumed by - // push/simulate/flow-context). Production IIFE bundles skip this entirely β€” - // otherwise the dev graph (zod schemas, etc.) leaks into walker.js because - // stage 2 esbuild cannot tree-shake transitive imports out of the already- - // concatenated stage 1 file. - const devExportEntries: string[] = []; - if (withDev) { - for (const packageName of usedPackages) { - const localPath = packagePaths.get(packageName); - if (!localPath) continue; + // Register the /dev surface for packages that expose a ./dev export as a + // lazy dynamic import: `'': () => import('/dev')`. The specifier + // stays a literal so a node host can resolve it and esbuild can statically + // analyse it. Because the registry entry is an unreferenced thunk, the + // deploy wrap DCEs the whole /dev graph (zod schemas, etc.) to zero bytes, + // while simulate/push await the thunk to pull the dev module in on demand. + // The same dev-package set is reused for browser-build externals via + // computeDevPackages, so the registry and externals can never drift apart. + const devPackages = withDev + ? await computeDevPackages(usedPackages, packagePaths) + : []; + const devExportEntries = devPackages.map( + (packageName) => `'${packageName}': () => import('${packageName}/dev')`, + ); - try { - const pkgJsonPath = path.join(localPath, 'package.json'); - const pkgJson = await fs.readJSON(pkgJsonPath); - const exports = pkgJson.exports; - if (exports && typeof exports === 'object' && './dev' in exports) { - const varName = `__dev_${packageNameToVariable(packageName)}`; - importStatements.push( - `import * as ${varName} from '${packageName}/dev';`, - ); - devExportEntries.push(`'${packageName}': ${varName}`); - } - } catch { - // Package doesn't have a readable package.json β€” skip gracefully + return { importStatements, devExportEntries, devPackages }; +} + +/** + * Resolves which of the given packages expose a `./dev` subpath export. + * Single source of truth for the lazy `/dev` registry codegen and for the + * browser build's `/dev` externals, keeping the two in lockstep. + */ +export async function computeDevPackages( + usedPackages: Iterable, + packagePaths: Map, +): Promise { + const devPackages: string[] = []; + for (const packageName of usedPackages) { + const localPath = packagePaths.get(packageName); + if (!localPath) continue; + + try { + const pkgJsonPath = path.join(localPath, 'package.json'); + const pkgJson = await fs.readJSON(pkgJsonPath); + const exports = pkgJson.exports; + if (exports && typeof exports === 'object' && './dev' in exports) { + devPackages.push(packageName); } + } catch { + // Package doesn't have a readable package.json, skip gracefully } } - - return { importStatements, devExportEntries }; + return devPackages; } const VALID_JS_IDENTIFIER = /^[a-zA-Z_$][a-zA-Z0-9_$]*$/; @@ -1275,7 +1309,12 @@ export async function createEntryPoint( flowSettings: Flow, buildOptions: BuildOptions, packagePaths: Map, -): Promise<{ codeEntry: string; dataPayload: string; hasFlow: boolean }> { +): Promise<{ + codeEntry: string; + dataPayload: string; + hasFlow: boolean; + devPackages: string[]; +}> { // Detect packages used by all step types const sourcePackages = detectStepPackages(flowSettings, 'sources'); const destinationPackages = detectStepPackages(flowSettings, 'destinations'); @@ -1306,16 +1345,17 @@ export async function createEntryPoint( buildOptions.withDev !== undefined ? buildOptions.withDev === true : buildOptions.skipWrapper === true; - const { importStatements, devExportEntries } = await generateImportStatements( - buildOptions.packages, - destinationPackages, - sourcePackages, - transformerPackages, - storePackages, - namedImports, - packagePaths, - withDev, - ); + const { importStatements, devExportEntries, devPackages } = + await generateImportStatements( + buildOptions.packages, + destinationPackages, + sourcePackages, + transformerPackages, + storePackages, + namedImports, + packagePaths, + withDev, + ); const importsCode = importStatements.join('\n'); const hasFlow = @@ -1333,6 +1373,7 @@ export async function createEntryPoint( codeEntry: importsCode ? `${importsCode}\n\n${userCode}` : userCode, dataPayload: '{}', hasFlow: false, + devPackages, }; } @@ -1359,7 +1400,7 @@ export async function createEntryPoint( ? `${importsCode}\n\n${fullModule}` : fullModule; - return { codeEntry, dataPayload, hasFlow: true }; + return { codeEntry, dataPayload, hasFlow: true, devPackages }; } interface EsbuildError { diff --git a/packages/cli/src/commands/bundle/targets.ts b/packages/cli/src/commands/bundle/targets.ts index b3bf4e15f..ecf42017b 100644 --- a/packages/cli/src/commands/bundle/targets.ts +++ b/packages/cli/src/commands/bundle/targets.ts @@ -16,7 +16,14 @@ export type BundleTarget = export interface TargetPreset { /** If true, emit ESM skeleton. If false, emit IIFE via generateWebEntry. */ skipWrapper: boolean; - /** If true, include @walkeros/*\/dev imports for schema validation. */ + /** + * If true, the skeleton carries the lazy `/dev` registry + * (`'': () => import('/dev')`) so simulate/push can await schemas + * on demand. Every skeleton target sets this true: the registry is an + * unreferenced thunk, so the deploy wrap DCEs the whole /dev graph to zero + * bytes. Only the finished IIFE (`cdn`) sets this false, since it has no + * registry and needs none. + */ withDev: boolean; /** Runtime platform β€” controls env injection and Node/browser codegen. */ platform: 'browser' | 'node'; @@ -38,16 +45,20 @@ export const BUNDLE_TARGETS: Readonly< }), // skipWrapper emits the introspectable skeleton (not a finished bundle), // consumed by simulate/preview/deploy; node keeps step packages external for - // on-disk resolution (see the skeleton branch in bundler.ts). + // on-disk resolution (see the skeleton branch in bundler.ts). Every skeleton + // carries the lazy /dev registry (withDev:true); it is an unreferenced thunk + // the deploy wrap DCEs, so a skeleton with the registry is safe to deploy. + // The browser skeleton additionally externalizes each `/dev` so the + // registry stays a literal `import()` rather than inlining the /dev graph. 'cdn-skeleton': Object.freeze({ skipWrapper: true, - withDev: false, + withDev: true, platform: 'browser', injectEnv: false, }), runner: Object.freeze({ skipWrapper: true, - withDev: false, + withDev: true, platform: 'node', injectEnv: false, }), diff --git a/packages/cli/src/commands/push/flow-context.ts b/packages/cli/src/commands/push/flow-context.ts index af184bde9..3392bebd1 100644 --- a/packages/cli/src/commands/push/flow-context.ts +++ b/packages/cli/src/commands/push/flow-context.ts @@ -41,7 +41,13 @@ export interface FlowModule { // eslint-disable-next-line @typescript-eslint/no-explicit-any startFlow: (config: unknown) => Promise; __configData?: unknown; - __devExports?: Record; + /** + * Registry of lazy /dev loaders, one per package that exposes a `./dev` + * export. Each entry is a thunk that dynamically imports the dev module on + * demand, so the deploy wrap can DCE the unreferenced /dev graph to zero + * bytes. Read sites must call and await the thunk, then narrow the result. + */ + __devExports?: Record Promise>; } /** diff --git a/packages/cli/src/commands/push/index.ts b/packages/cli/src/commands/push/index.ts index 5517fcdb6..4f97c6eb3 100644 --- a/packages/cli/src/commands/push/index.ts +++ b/packages/cli/src/commands/push/index.ts @@ -48,6 +48,87 @@ function isString(value: unknown): value is string { return typeof value === 'string'; } +/** + * The trigger instance a source's `createTrigger` resolves to. `trigger` + * fires a (type, options) trigger; the returned function takes the captured + * content. `flow.collector.command` lets the simulator request a shutdown. + * The bundle carries no compile-time types, so the simulator pins only the + * members it actually reads and keeps the rest opaque. + */ +interface TriggerInstance { + trigger: ( + type?: string, + options?: unknown, + ) => (content?: unknown) => Promise; + flow?: { + collector?: { + command?: (command: string) => Promise; + }; + }; +} + +/** + * Signature of a source package's `createTrigger`, narrowed off the awaited + * `/dev` module. The simulator only needs it to be callable and resolve to a + * `TriggerInstance`; the bundle carries no compile-time types so the + * parameters stay opaque. + */ +type CreateTrigger = (...args: unknown[]) => Promise; + +/** + * Type predicate narrowing an opaque value to a callable. Used instead of a + * cast so `getCreateTrigger` can return a precisely typed function without + * the banned `Function` type or an `as` assertion. + */ +function isCallable(value: unknown): value is CreateTrigger { + return typeof value === 'function'; +} + +/** + * Narrow the awaited `/dev` module of a source package down to its + * `examples.createTrigger`, validating each hop with `in`/`typeof` instead of + * a cast. Returns `undefined` if any hop is missing or the wrong shape. + */ +function getCreateTrigger(devModule: unknown): CreateTrigger | undefined { + if (!isRecord(devModule) || !('examples' in devModule)) return undefined; + const examples = devModule.examples; + if (!isRecord(examples) || !('createTrigger' in examples)) return undefined; + const createTrigger = examples.createTrigger; + return isCallable(createTrigger) ? createTrigger : undefined; +} + +/** + * Shape of a destination package's simulation `/dev` env, narrowed off the + * awaited `/dev` module. `push` is the env object handed to the destination; + * `simulation` lists `call:` markers the wrapper should track. + */ +interface DevEnv { + push?: Record; + simulation?: string[]; +} + +/** + * Narrow the awaited `/dev` module of a destination package down to its + * `examples.env`, validating each hop with `in`/`typeof` instead of a cast. + * Returns `undefined` if any hop is missing or the wrong shape. + */ +function getDevEnv(devModule: unknown): DevEnv | undefined { + if (!isRecord(devModule) || !('examples' in devModule)) return undefined; + const examples = devModule.examples; + if (!isRecord(examples) || !('env' in examples)) return undefined; + const env = examples.env; + if (!isRecord(env)) return undefined; + const result: DevEnv = {}; + if ('push' in env && isRecord(env.push)) result.push = env.push; + if ('simulation' in env && isStringArray(env.simulation)) + result.simulation = env.simulation; + return result; +} + +function isStringArray(value: unknown): value is string[] { + return Array.isArray(value) && value.every(isString); +} + /** * Walk a transformer chain via static `.next` links starting at `startId`. * Mirrors the collector's internal `walkChain` for the case where the @@ -618,11 +699,12 @@ export async function simulateSource( networkCalls, }, async (module) => { - // Look up createTrigger from __devExports (bundled /dev export) - const devExports = module.__devExports?.[sourceConfig!.package!] as - | { examples?: { createTrigger?: Function } } - | undefined; - const createTrigger = devExports?.examples?.createTrigger; + // Look up createTrigger from the lazy __devExports registry: await the + // thunk to pull the /dev module in, then narrow without a cast. + const loadDev = module.__devExports?.[sourceConfig!.package!]; + const devModule = + typeof loadDev === 'function' ? await loadDev() : undefined; + const createTrigger = getCreateTrigger(devModule); if (!createTrigger) { throw new Error( `Source package "${sourceConfig!.package}" has no createTrigger in /dev export`, @@ -1035,17 +1117,11 @@ export async function simulateDestination( }> = []; if (destPkg?.package) { - const devExports = module.__devExports?.[destPkg.package] as - | { - examples?: { - env?: { - push?: Record; - simulation?: string[]; - }; - }; - } - | undefined; - const devEnv = devExports?.examples?.env; + // Await the lazy __devExports thunk, then narrow without a cast. + const loadDev = module.__devExports?.[destPkg.package]; + const devModule = + typeof loadDev === 'function' ? await loadDev() : undefined; + const devEnv = getDevEnv(devModule); if (devEnv?.push) { const destinations = flowConfig.destinations as Record< diff --git a/packages/cli/src/types/api.gen.d.ts b/packages/cli/src/types/api.gen.d.ts index ca549fc8c..41f7f1a72 100644 --- a/packages/cli/src/types/api.gen.d.ts +++ b/packages/cli/src/types/api.gen.d.ts @@ -676,7 +676,7 @@ export interface paths { [name: string]: unknown; }; content: { - 'application/json': components['schemas']['Project']; + 'application/json': components['schemas']['UpdateProjectResponse']; }; }; /** @description Validation error */ @@ -4131,6 +4131,16 @@ export interface components { */ createdAt: string; }; + UpdateProjectResponse: { + /** @example proj_x7y8z9 */ + id: string; + name: string; + /** + * Format: date-time + * @example 2026-01-26T14:30:00.000Z + */ + updatedAt: string; + }; DeploymentSummary: { /** @example dep_a1b2c3d4 */ id: string; diff --git a/packages/collector/src/__tests__/destination-batch-correctness.test.ts b/packages/collector/src/__tests__/destination-batch-correctness.test.ts index 9cac16da4..035385b1d 100644 --- a/packages/collector/src/__tests__/destination-batch-correctness.test.ts +++ b/packages/collector/src/__tests__/destination-batch-correctness.test.ts @@ -12,10 +12,22 @@ * timer. With size/age caps now landed on `debounce`, the batch * flushes deterministically. */ -import type { Destination, WalkerOS } from '@walkeros/core'; -import { createEvent, createIngest } from '@walkeros/core'; +import type { Destination, Logger, WalkerOS } from '@walkeros/core'; +import { createEvent, createIngest, Level } from '@walkeros/core'; import { pushToDestinations, startFlow } from '..'; +// Capture WARN-level log messages (+ their context) via a custom handler. +function makeWarnCapture(): { + warnings: Array<{ message: string; context: Logger.LogContext }>; + handler: Logger.Handler; +} { + const warnings: Array<{ message: string; context: Logger.LogContext }> = []; + const handler: Logger.Handler = (level, message, context) => { + if (level === Level.WARN) warnings.push({ message, context }); + }; + return { warnings, handler }; +} + describe('Destination batch correctness (PROD-004)', () => { beforeEach(() => { jest.useFakeTimers(); @@ -283,6 +295,90 @@ describe('Destination batch correctness (PROD-004)', () => { expect(flushResolved).toBe(true); }); + it('shutdown awaits a batch the size cap auto-fired while pushBatch is still in flight', async () => { + // Regression: the debounce SIZE cap fires `flushBatch` autonomously, so by + // the time shutdown calls flush() the buffer is already drained + // (lastArgs/pending empty). flush() used to early-return undefined and let + // teardown race the still-in-flight append, dropping the delivery. + type Deferred = { promise: Promise; resolve: () => void }; + function defer(): Deferred { + let resolve: () => void = () => {}; + const promise = new Promise((res) => { + resolve = res; + }); + return { promise, resolve }; + } + + const appendGate = defer(); + let appendSettled = false; + const seenSizes: number[] = []; + const mockPushBatch = jest.fn(async (batch: Destination.Batch) => { + seenSizes.push(batch.entries.length); + await appendGate.promise; + appendSettled = true; + }); + + let destroyed = false; + const dest: Destination.Instance = { + type: 'async-autofire', + push: jest.fn(), + pushBatch: mockPushBatch, + destroy: () => { + destroyed = true; + }, + config: { + init: true, + // size cap of 3 + a long wait: the 3rd push fires flushBatch on its own, + // shutdown never gets to drive the fire itself. + batch: { wait: 60_000, size: 3 }, + mapping: { '*': { '*': { batch: 1 } } }, + }, + }; + + const { collector, elb } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 3; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + // Let the autonomous fire begin and the append reach its gate. + await jest.advanceTimersByTimeAsync(0); + expect(mockPushBatch).toHaveBeenCalledTimes(1); + expect(seenSizes[0]).toBe(3); + expect(appendSettled).toBe(false); + + // Graceful shutdown while the append is still in flight. + let shutdownDone = false; + const shutdown = elb('walker shutdown').then(() => { + shutdownDone = true; + }); + + // Shutdown must NOT complete (and must NOT destroy the destination) while + // the in-flight append is unresolved. + await jest.advanceTimersByTimeAsync(0); + expect(shutdownDone).toBe(false); + expect(destroyed).toBe(false); + expect(appendSettled).toBe(false); + + // Resolve the append; shutdown should now settle and only then destroy. + appendGate.resolve(); + await shutdown; + + expect(appendSettled).toBe(true); + expect(shutdownDone).toBe(true); + expect(destroyed).toBe(true); + // Delivery counted, nothing dropped or failed. + expect(collector.status.destinations['d'].count).toBe(3); + expect(collector.status.destinations['d'].failed).toBe(0); + }); + it('a rejected async pushBatch routes the batch to the DLQ and increments failed, not out', async () => { const mockPushBatch = jest.fn(() => Promise.reject(new Error('async batch boom')), @@ -329,6 +425,380 @@ describe('Destination batch correctness (PROD-004)', () => { expect(destStatus.count).toBe(0); }); + it('caps the DLQ at dlqMax on a whole-batch failure, counts the overflow dropped, and warns once', async () => { + const { warnings, handler } = makeWarnCapture(); + const mockPushBatch = jest.fn(() => { + throw new Error('whole batch boom'); + }); + + const dest: Destination.Instance = { + type: 'overflowing', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + // dlqMax smaller than the 5-entry batch: 3 entries must be dropped. + dlqMax: 2, + mapping: { '*': { '*': { batch: 1 } } }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + // Default log level is ERROR; raise to WARN so the overflow warn surfaces. + logger: { level: 'WARN', handler }, + }); + + for (let i = 0; i < 5; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + await collector.destinations['d'].batches!['* *'].flush(); + + // Only dlqMax (2) entries retained; dropOldest keeps the newest two. + const dlq = collector.destinations['d'].dlq; + expect(dlq).toBeDefined(); + expect(dlq!.length).toBe(2); + expect(dlq![0][0].id).toBe('e-3'); + expect(dlq![1][0].id).toBe('e-4'); + + // Drop counter records the 3 evicted entries under the dlq buffer. + const destStepId = Object.keys(collector.status.dropped).find((id) => + id.includes('destination'), + ); + expect(destStepId).toBeDefined(); + expect(collector.status.dropped[destStepId!].dlq).toBe(3); + + // Whole batch counted as failed regardless of DLQ eviction. + expect(collector.status.destinations['d'].failed).toBe(5); + expect(collector.status.destinations['d'].dlqSize).toBe(2); + + // Overflow warn fired exactly once, with the cap + cumulative drop count. + const overflowWarns = warnings.filter((w) => + /destination\.dlq overflow/.test(w.message), + ); + expect(overflowWarns.length).toBe(1); + expect(overflowWarns[0].context.buffer).toBe('dlq'); + expect(overflowWarns[0].context.cap).toBe(2); + expect(overflowWarns[0].context.droppedCount).toBe(3); + }); + + it('a returned BatchOutcome DLQs only the failed entries and counts the rest delivered', async () => { + // pushBatch returns a per-row outcome: entry index 1 of 3 failed. + const mockPushBatch = jest.fn( + (_batch: Destination.Batch): Destination.BatchOutcome => ({ + failed: [{ index: 1, error: new Error('row 1 invalid') }], + }), + ); + + const dest: Destination.Instance = { + type: 'partial-outcome', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + mapping: { '*': { '*': { batch: 1 } } }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 3; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + await collector.destinations['d'].batches!['* *'].flush(); + + // Only the single failed entry is on the DLQ, with its own per-row error. + const dlq = collector.destinations['d'].dlq; + expect(dlq).toBeDefined(); + expect(dlq!.length).toBe(1); + expect(dlq![0][0].id).toBe('e-1'); + const rowError = dlq![0][1]; + expect(rowError).toBeInstanceOf(Error); + if (!(rowError instanceof Error)) throw new Error('expected Error'); + expect(rowError.message).toBe('row 1 invalid'); + + const destStatus = collector.status.destinations['d']; + // 1 failed, 2 delivered. + expect(destStatus.failed).toBe(1); + expect(destStatus.count).toBe(2); + }); + + it('a BatchOutcome with empty failed counts the whole batch delivered', async () => { + const mockPushBatch = jest.fn( + (_batch: Destination.Batch): Destination.BatchOutcome => ({ + failed: [], + }), + ); + + const dest: Destination.Instance = { + type: 'empty-outcome', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + mapping: { '*': { '*': { batch: 1 } } }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 3; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + await collector.destinations['d'].batches!['* *'].flush(); + + expect(collector.destinations['d'].dlq?.length ?? 0).toBe(0); + const destStatus = collector.status.destinations['d']; + expect(destStatus.failed).toBe(0); + expect(destStatus.count).toBe(3); + }); + + it('a BatchOutcome failure carries a generic error to the DLQ when no per-row error is given', async () => { + const mockPushBatch = jest.fn( + (_batch: Destination.Batch): Destination.BatchOutcome => ({ + failed: [{ index: 0 }], + }), + ); + + const dest: Destination.Instance = { + type: 'outcome-no-error', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + mapping: { '*': { '*': { batch: 1 } } }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + await pushToDestinations( + collector, + makeEvent(0), + { ingest: createIngest('req-0') }, + collector.destinations, + ); + + await collector.destinations['d'].batches!['* *'].flush(); + + const dlq = collector.destinations['d'].dlq; + expect(dlq?.length).toBe(1); + expect(dlq![0][0].id).toBe('e-0'); + // A real error object lands on the DLQ even without a per-row error. + expect(dlq![0][1]).toBeInstanceOf(Error); + + const destStatus = collector.status.destinations['d']; + expect(destStatus.failed).toBe(1); + expect(destStatus.count).toBe(0); + }); + + // --- config.batch enabler routing (BATCH_ALL_KEY) --------------------- + // + // These three pin the untested routing branches of the `config.batch` + // enabler: a destination-wide default buffer keyed by the reserved + // BATCH_ALL_KEY (' batch-all', leading space). They are characterization + // tests asserting the CURRENT behavior of destination.ts. + + it('(e) config.batch + a wildcard "* *" rule with batch routes to the "* *" buffer, never " batch-all", and rule bounds win', async () => { + // Both config.batch (bounds) AND a wildcard rule with its own batch are set. + // ruleHasBatch is true for every event (the "* *" rule matches all), so + // mappingKey = processed.mappingKey ("* *"), NOT the reserved BATCH_ALL_KEY. + // The rule's size (2) overrides config.batch's size (50): the buffer must + // flush at 2 entries, proving config.batch only supplies bounds fallback. + const batchSizes: number[] = []; + const mockPushBatch = jest.fn((batch: Destination.Batch) => { + batchSizes.push(batch.entries.length); + }); + + const dest: Destination.Instance = { + type: 'wildcard-and-config', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + // config.batch bounds: high wait, size 50 (would never flush at 4). + batch: { wait: 60_000, size: 50 }, + mapping: { + // wildcard rule carries its OWN batch with a much smaller size cap. + '*': { '*': { batch: { wait: 60_000, size: 2 } } }, + }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + // 4 events: with rule size=2 winning, the buffer flushes twice (2 + 2). + // If config.batch's size=50 had won instead, zero flushes would occur. + for (let i = 0; i < 4; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + // Rule bounds (size 2) drove two synchronous flushes without timer advance. + expect(mockPushBatch).toHaveBeenCalledTimes(2); + expect(batchSizes).toEqual([2, 2]); + + // Routing landed on the "* *" rule buffer, and the reserved default buffer + // was NEVER created. + const batches = collector.destinations['d'].batches; + expect(batches).toBeDefined(); + expect(Object.keys(batches!)).toEqual(['* *']); + expect(batches!['* *']).toBeDefined(); + expect(batches![' batch-all']).toBeUndefined(); + // The "* *" buffer is a rule buffer, not the config.batch default. + expect(batches!['* *'].isDefault).toBe(false); + }); + + it('(d) config.batch + a matched non-wildcard rule with no batch routes to " batch-all" (rule undefined), still applying the rule data transform', async () => { + // config.batch is set, plus a non-wildcard rule `page.view` that MATCHES + // the event but has NO batch of its own (only a `data` transform). + // For those events ruleHasBatch is false, so mappingKey = BATCH_ALL_KEY + // (' batch-all'), the flush-context rule is undefined (isDefault buffer), + // yet processed.data from the matched rule is still applied per entry. + let capturedRule: Destination.PushBatchContext['rule'] | 'unset' = 'unset'; + const capturedData: Destination.Data[] = []; + const mockPushBatch = jest.fn( + ( + batch: Destination.Batch, + context: Destination.PushBatchContext, + ) => { + capturedRule = context.rule; + for (const value of batch.data) capturedData.push(value); + }, + ); + + const dest: Destination.Instance = { + type: 'config-batch-data-rule', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + // config.batch enables the destination-wide default buffer. + batch: { wait: 60_000 }, + mapping: { + // Non-wildcard rule that matches "page view"; data transform, no batch. + page: { view: { data: { map: { foo: { value: 'bar' } } } } }, + }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 3; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + // The only buffer is the reserved default; no rule buffer was created. + const batches = collector.destinations['d'].batches; + expect(batches).toBeDefined(); + expect(Object.keys(batches!)).toEqual([' batch-all']); + expect(batches![' batch-all'].isDefault).toBe(true); + + // Flush the default buffer (long wait, so no timer fires on its own). + await batches![' batch-all'].flush(); + + expect(mockPushBatch).toHaveBeenCalledTimes(1); + // isDefault buffer => flush-context rule is undefined, NOT the matched rule. + expect(capturedRule).toBeUndefined(); + // The matched rule's data transform was still applied to every batched event. + expect(capturedData).toEqual([ + { foo: 'bar' }, + { foo: 'bar' }, + { foo: 'bar' }, + ]); + }); + + it('(config.batch-only) a rejecting pushBatch on the " batch-all" default buffer routes the whole batch to DLQ and increments failed', async () => { + // config.batch set, NO rule batch: events land in the reserved default + // buffer (' batch-all'). A plain rejecting pushBatch (throw, not a + // BatchOutcome) must whole-batch-DLQ and count `failed`, proving DLQ + // accounting works for the DEFAULT buffer, not only rule buffers. + const mockPushBatch = jest.fn(() => { + throw new Error('default buffer boom'); + }); + + const dest: Destination.Instance = { + type: 'config-batch-default-dlq', + push: jest.fn(), + pushBatch: mockPushBatch, + config: { + init: true, + // config.batch only: no mapping rule carries batch. + batch: { wait: 60_000 }, + }, + }; + + const { collector } = await startFlow({ + destinations: { d: { code: dest } }, + }); + + for (let i = 0; i < 4; i++) { + await pushToDestinations( + collector, + makeEvent(i), + { ingest: createIngest(`req-${i}`) }, + collector.destinations, + ); + } + + const batches = collector.destinations['d'].batches; + expect(batches).toBeDefined(); + expect(Object.keys(batches!)).toEqual([' batch-all']); + + await batches![' batch-all'].flush(); + + // Whole batch routed to the DLQ as [event, error]. + const dlq = collector.destinations['d'].dlq; + expect(dlq).toBeDefined(); + expect(dlq!.length).toBe(4); + expect(dlq![0][0].id).toBe('e-0'); + const firstError = dlq![0][1]; + expect(firstError).toBeInstanceOf(Error); + if (!(firstError instanceof Error)) throw new Error('expected Error'); + expect(firstError.message).toBe('default buffer boom'); + + const destStatus = collector.status.destinations['d']; + expect(destStatus.failed).toBe(4); + expect(destStatus.count).toBe(0); + }); + it('routes batch failure to DLQ without leaking unhandled rejections', async () => { const unhandled: unknown[] = []; const onUnhandled = (err: unknown): void => { diff --git a/packages/collector/src/__tests__/shutdown.test.ts b/packages/collector/src/__tests__/shutdown.test.ts index fee4f5c2a..02223bcfb 100644 --- a/packages/collector/src/__tests__/shutdown.test.ts +++ b/packages/collector/src/__tests__/shutdown.test.ts @@ -1,4 +1,10 @@ -import type { Destination, Logger, Source, Transformer } from '@walkeros/core'; +import type { + Destination, + Logger, + Source, + Store, + Transformer, +} from '@walkeros/core'; import { Level } from '@walkeros/core'; import { startFlow } from '..'; @@ -148,6 +154,36 @@ describe('shutdown command', () => { expect(mockPushBatch).toHaveBeenCalledTimes(1); }); + it('is idempotent: a second shutdown does not re-destroy or throw', async () => { + const destDestroy = jest.fn(); + const storeDestroy = jest.fn(); + const { collector, elb } = await startFlow({}); + + const dest: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'test-dest', + destroy: destDestroy, + }; + collector.destinations['test'] = dest; + const store: Store.Instance = { + type: 'memory', + config: {}, + get: jest.fn(), + set: jest.fn(), + delete: jest.fn(), + destroy: storeDestroy, + }; + collector.stores['kv'] = store; + + await elb('walker shutdown'); + // Second shutdown must be a clean no-op: never throws, never re-destroys. + await expect(elb('walker shutdown')).resolves.toBeDefined(); + + expect(destDestroy).toHaveBeenCalledTimes(1); + expect(storeDestroy).toHaveBeenCalledTimes(1); + }); + it('respects shutdown order: sources before destinations before transformers', async () => { const order: string[] = []; const { collector, elb } = await startFlow({}); @@ -274,5 +310,73 @@ describe('shutdown command', () => { ).toBe(true); expect(jest.getTimerCount()).toBe(0); }); + + it('isolates a slow destination: fast one flushes, slow one times out independently', async () => { + const { messages, handler } = makeErrorCapture(); + const { collector, elb } = await startFlow({ logger: { handler } }); + + let fastFlushed = false; + const fast: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'fast', + batches: makeBatches(async () => { + fastFlushed = true; + }), + }; + const slow: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'slow', + batches: makeBatches(() => new Promise(() => {})), // never resolves + }; + collector.destinations['fast'] = fast; + collector.destinations['slow'] = slow; + + const shutdown = elb('walker shutdown'); + // Drive the slow destination to its timeout; the fast one resolved already. + await jest.advanceTimersByTimeAsync(STEP_TIMEOUT); + await shutdown; // resolves even though one destination hung + + // Fast destination completed its flush within budget. + expect(fastFlushed).toBe(true); + // Slow destination timed out independently, without blocking the fast one. + expect( + messages.some((m) => + /destination 'slow' batch flush timed out/.test(m), + ), + ).toBe(true); + expect( + messages.some((m) => + /destination 'fast' batch flush timed out/.test(m), + ), + ).toBe(false); + expect(jest.getTimerCount()).toBe(0); + }); + + it('empty-batch shutdown is a clean no-op for a destination with zero buffered entries', async () => { + const { messages, handler } = makeErrorCapture(); + const { collector, elb } = await startFlow({ logger: { handler } }); + + // Batch registry present, but the single batch has zero buffered entries + // and a flush that resolves immediately (nothing to send). + let flushCalls = 0; + const dest: Destination.Instance = { + config: {}, + push: jest.fn(), + type: 'empty', + batches: makeBatches(async () => { + flushCalls++; + }), + }; + collector.destinations['empty'] = dest; + + await expect(elb('walker shutdown')).resolves.toBeDefined(); + + // Flush ran and resolved cleanly; no error logged, no timer leaked. + expect(flushCalls).toBe(1); + expect(messages).toEqual([]); + expect(jest.getTimerCount()).toBe(0); + }); }); }); diff --git a/packages/collector/src/collector.ts b/packages/collector/src/collector.ts index 6a7360121..d1049de27 100644 --- a/packages/collector/src/collector.ts +++ b/packages/collector/src/collector.ts @@ -69,6 +69,7 @@ export async function collector( user: initConfig.user || {}, sources: {}, pending: { destinations: {} }, + hasShutdown: false, seenEvents: new Set(), push: undefined as unknown as Collector.PushFn, // Placeholder, will be set below command: undefined as unknown as Collector.CommandFn, // Placeholder, will be set below diff --git a/packages/collector/src/destination.ts b/packages/collector/src/destination.ts index e22b56ebf..543bff2d0 100644 --- a/packages/collector/src/destination.ts +++ b/packages/collector/src/destination.ts @@ -63,6 +63,17 @@ function isBatchedResult(value: unknown): boolean { return value === BATCHED_RESULT; } +/** + * Narrows a `pushBatch` return value to a {@link Destination.BatchOutcome}. + * A destination returning `void` (the historical contract) yields whole-batch + * semantics and never enters the partial-failure path. + */ +function isBatchOutcome(value: unknown): value is Destination.BatchOutcome { + return ( + isObject(value) && Array.isArray((value as { failed?: unknown }).failed) + ); +} + /** * Reserved buffer key for the destination-wide default batch created by * `config.batch`. A produced mappingKey is `" "` with both @@ -1036,8 +1047,52 @@ export async function destinationPush( flushState.batch = { size: snapshot.entries.length, index: 0 }; emitStep(collector, flushState); - let succeeded = true; - await tryCatchAsync( + // Routes a set of [event, error] pairs to this destination's DLQ, + // applying the bound and emitting the overflow warning once. Shared by + // the whole-batch failure path (pushBatch threw) and the partial-row + // failure path (pushBatch returned a BatchOutcome). + const routeToDlq = (failures: Array<[WalkerOS.Event, unknown]>) => { + const dlq = (destination.dlq = destination.dlq || []); + const dlqBound = { + max: destination.config.dlqMax ?? DEFAULT_DLQ_MAX, + }; + let totalDropped = 0; + for (const pair of failures) { + const r = pushBounded(dlq, pair, dlqBound); + totalDropped += r.dropped; + } + if (totalDropped > 0) { + const droppedCount = bumpDropped( + collector.status, + stepId('destination', destIdResolved), + 'dlq', + totalDropped, + ); + warnOverflowOnce( + dlq, + destLogger, + 'destination.dlq overflow; oldest entries dropped', + { + buffer: 'dlq', + destination: destIdResolved, + cap: dlqBound.max, + droppedCount, + }, + ); + } else if (dlq.length < dlqBound.max) { + resetOverflowFlag(dlq); + } + destStatus.failed += failures.length; + destStatus.dlqSize = dlq.length; + collector.status.failed += failures.length; + }; + + // Number of entries treated as delivered after the flush settles. + // Whole-batch success (void) = all; whole-batch failure (throw) = 0; + // partial outcome = total minus the entries reported failed. + let succeededCount = snapshot.entries.length; + + const outcome = await tryCatchAsync( useHooks( destination.pushBatch!, 'DestinationPushBatch', @@ -1045,7 +1100,7 @@ export async function destinationPush( collector.logger, ), (err) => { - succeeded = false; + succeededCount = 0; const errFinished = Date.now(); const batchErrState = buildBaseState(collector, { stepId: stepId('destination', destId), @@ -1064,40 +1119,8 @@ export async function destinationPush( index: 0, }; emitStep(collector, batchErrState); - // Defect 2 fix: route the entire batch to DLQ on failure. - const dlq = (destination.dlq = destination.dlq || []); - const dlqBound = { - max: destination.config.dlqMax ?? DEFAULT_DLQ_MAX, - }; - let totalDropped = 0; - for (const entry of snapshot.entries) { - const r = pushBounded(dlq, [entry.event, err], dlqBound); - totalDropped += r.dropped; - } - if (totalDropped > 0) { - const droppedCount = bumpDropped( - collector.status, - stepId('destination', destIdResolved), - 'dlq', - totalDropped, - ); - warnOverflowOnce( - dlq, - destLogger, - 'destination.dlq overflow; oldest entries dropped', - { - buffer: 'dlq', - destination: destIdResolved, - cap: dlqBound.max, - droppedCount, - }, - ); - } else if (dlq.length < dlqBound.max) { - resetOverflowFlag(dlq); - } - destStatus.failed += snapshot.entries.length; - destStatus.dlqSize = dlq.length; - collector.status.failed += snapshot.entries.length; + // Route the entire batch to DLQ on a thrown/whole-batch failure. + routeToDlq(snapshot.entries.map((entry) => [entry.event, err])); destLogger.error('Push batch failed', { error: err instanceof Error ? err.message : String(err), entries: snapshot.entries.length, @@ -1106,6 +1129,38 @@ export async function destinationPush( }, )(snapshot, batchContext); + // Partial-failure path: pushBatch resolved a BatchOutcome listing the + // entries that did not succeed. DLQ and fail-count only those; the rest + // are delivered. Out-of-range indices are ignored defensively. + if (isBatchOutcome(outcome) && outcome.failed.length > 0) { + const failedPairs: Array<[WalkerOS.Event, unknown]> = []; + const seen = new Set(); + for (const failure of outcome.failed) { + const entry = snapshot.entries[failure.index]; + if (!entry || seen.has(failure.index)) continue; + seen.add(failure.index); + failedPairs.push([ + entry.event, + failure.error ?? + new Error( + `Push batch entry ${failure.index} failed (no per-row error provided)`, + ), + ]); + } + if (failedPairs.length > 0) { + routeToDlq(failedPairs); + succeededCount = Math.max( + 0, + snapshot.entries.length - failedPairs.length, + ); + destLogger.error('Push batch partial failure', { + failed: failedPairs.length, + delivered: succeededCount, + entries: snapshot.entries.length, + }); + } + } + destLogger.debug('push batch done'); // Decrement in-flight regardless of outcome. @@ -1114,10 +1169,10 @@ export async function destinationPush( (destStatus.inFlightBatch ?? 0) - snapshot.entries.length, ); - if (succeeded) { - destStatus.count += snapshot.entries.length; + if (succeededCount > 0) { + destStatus.count += succeededCount; destStatus.lastAt = Date.now(); - collector.status.out += snapshot.entries.length; + collector.status.out += succeededCount; } }; diff --git a/packages/collector/src/handle.ts b/packages/collector/src/handle.ts index 8ebf05b7d..4de00a95f 100644 --- a/packages/collector/src/handle.ts +++ b/packages/collector/src/handle.ts @@ -158,7 +158,13 @@ export async function commonHandleCommand( break; case Const.Commands.Shutdown: - await destroyAllSteps(collector); + // Idempotency guard: re-entrancy must not re-run destroyAllSteps and + // double-close writers, destinations, or stores. The first shutdown + // sets the flag and runs the full sequence; a second command no-ops. + if (!collector.hasShutdown) { + collector.hasShutdown = true; + await destroyAllSteps(collector); + } break; case Const.Commands.User: diff --git a/packages/collector/src/types/code.ts b/packages/collector/src/types/code.ts index 1b6204ef4..f5cfa677a 100644 --- a/packages/collector/src/types/code.ts +++ b/packages/collector/src/types/code.ts @@ -25,4 +25,4 @@ export type PushFn = (event: WalkerOS.Event, context: PushContext) => void; export type PushBatchFn = ( batch: Destination.Batch, context: PushBatchContext, -) => WalkerOS.PromiseOrValue; +) => WalkerOS.PromiseOrValue; diff --git a/packages/core/src/__tests__/cache.test.ts b/packages/core/src/__tests__/cache.test.ts index 0e7763331..18eb13d8f 100644 --- a/packages/core/src/__tests__/cache.test.ts +++ b/packages/core/src/__tests__/cache.test.ts @@ -269,6 +269,31 @@ describe('storeCache', () => { expect(Buffer.isBuffer(encoded)).toBe(true); expect(decodeCacheValue(encoded)).toEqual({ value: { data: 'value' } }); }); + + it('does not throw or leak an unhandled rejection when an async set rejects', async () => { + const store: Store.Instance = { + type: 'rejecting-mock', + config: {}, + get: async () => undefined, + set: () => Promise.reject(new Error('backend down')), + delete: async () => undefined, + }; + + const rejections: unknown[] = []; + const onRejection = (reason: unknown) => rejections.push(reason); + process.on('unhandledRejection', onRejection); + try { + // Must not throw synchronously: the HTTP response is already sent. + expect(() => storeCache(store, 'k', { status: 200 }, 60)).not.toThrow(); + // Flush the microtask queue (timers are faked in this suite, so a + // real macrotask never fires). The rejection rides a real Promise, so + // microtask turns are enough for an escaped rejection to surface. + for (let i = 0; i < 5; i++) await Promise.resolve(); + expect(rejections).toHaveLength(0); + } finally { + process.off('unhandledRejection', onRejection); + } + }); }); describe('applyUpdate', () => { @@ -431,6 +456,37 @@ describe('cache value codec', () => { expect(decodeCacheValue(Buffer.from('not json'))).toBeUndefined(); expect(decodeCacheValue(Buffer.from('{"x":1}'))).toBeUndefined(); }); + + it('round-trips a Uint8Array body to a byte-identical Buffer', () => { + const value = { status: 200, body: new Uint8Array([0xff, 0x00, 0x80]) }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ + value: { status: 200, body: Buffer.from([0xff, 0x00, 0x80]) }, + }); + }); + + it('round-trips a Uint8Array view honoring byteOffset/length', () => { + const full = new Uint8Array([0x01, 0xff, 0x00, 0x80, 0x02]); + const view = full.subarray(1, 4); // [0xff, 0x00, 0x80] + const value = { body: view }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ + value: { body: Buffer.from([0xff, 0x00, 0x80]) }, + }); + }); + + it('round-trips an ArrayBuffer body to a byte-identical Buffer', () => { + const source = new Uint8Array([0xde, 0xad, 0xbe, 0xef]); + const value = { body: source.buffer }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ + value: { body: Buffer.from([0xde, 0xad, 0xbe, 0xef]) }, + }); + }); + + it('round-trips an empty Uint8Array to an empty Buffer', () => { + const value = { body: new Uint8Array([]) }; + expect(decodeCacheValue(encodeCacheValue(value))).toEqual({ + value: { body: Buffer.alloc(0) }, + }); + }); }); describe('request-cache codec wiring', () => { diff --git a/packages/core/src/cache.ts b/packages/core/src/cache.ts index a7fd76d9c..6781cccea 100644 --- a/packages/core/src/cache.ts +++ b/packages/core/src/cache.ts @@ -102,7 +102,17 @@ export function storeCache( ttlSeconds: number, ): void { // exp inside the value = uniform expiry for stores that ignore the ttl arg (fs/s3/gcs); the ttl arg lets honoring stores (in-memory) evict instead of retaining until read. - store.set(key, encodeCacheValue(value, ttlSeconds * 1000), ttlSeconds * 1000); + // `Store.SetFn` returns `void | Promise`. The write is fire-and-forget + // (the HTTP response is sent before it lands), so an async store that rejects + // (network error, EACCES) would otherwise surface as an unhandled rejection + // and crash the process. Swallow it: a failed cache persist must never crash + // the request path. Matches the best-effort silent catch on the checkCache purge. + const result = store.set( + key, + encodeCacheValue(value, ttlSeconds * 1000), + ttlSeconds * 1000, + ); + if (result instanceof Promise) result.catch(() => {}); } // --- Cache value codec ------------------------------------------------------- @@ -123,10 +133,28 @@ function isRecord(value: unknown): value is Record { } // Precondition: cache values are acyclic JSON-shaped data (RespondOptions / -// events / push results / sentinel); Buffer is the only non-JSON leaf handled. +// events / push results / sentinel); binary leaves are the only non-JSON +// values handled. Node `Buffer`, any `Uint8Array` (e.g. a Fetch-sourced body), +// and a raw `ArrayBuffer` all normalize to the same base64 'buffer' tag and +// therefore all decode back as a Node `Buffer` (see fromSerializable). A +// `Buffer` IS a `Uint8Array`, so the `Buffer.isBuffer` check stays first; the +// `Uint8Array` branch then handles non-Buffer typed-array views by slicing on +// byteOffset/byteLength so a subarray view contributes only its own bytes. function toSerializable(value: unknown): unknown { if (Buffer.isBuffer(value)) return { [CACHE_MARKER]: 'buffer', d: value.toString('base64') }; + if (value instanceof Uint8Array) + return { + [CACHE_MARKER]: 'buffer', + d: Buffer.from(value.buffer, value.byteOffset, value.byteLength).toString( + 'base64', + ), + }; + if (value instanceof ArrayBuffer) + return { + [CACHE_MARKER]: 'buffer', + d: Buffer.from(value).toString('base64'), + }; if (Array.isArray(value)) return value.map(toSerializable); if (isRecord(value)) { if (CACHE_MARKER in value) { diff --git a/packages/core/src/invocations.ts b/packages/core/src/invocations.ts index dd4560a58..55d8fd567 100644 --- a/packages/core/src/invocations.ts +++ b/packages/core/src/invocations.ts @@ -151,7 +151,13 @@ export function debounce

( debounced.flush = (): Promise => { if (!lastArgs && pending.length === 0) { - return Promise.resolve(undefined); + // Nothing buffered, but an autonomous fire (wait/age/size cap) may have + // already drained the window and left its `fn` return in `result`. For a + // batch flush whose `fn` is async, that promise can still be in flight. + // Return it so a caller like graceful shutdown awaits the in-flight work + // instead of racing teardown (e.g. closing a writer mid-append). When + // the last fire was sync/settled, awaiting a resolved value is a no-op. + return Promise.resolve(result); } return new Promise((resolve) => { pending.push(resolve); @@ -163,6 +169,9 @@ export function debounce

( const settle = pending; pending = []; reset(); + // Drop the last fire's retained return so a later empty flush() cannot + // surface a stale, cancelled-context result instead of undefined. + result = undefined; settle.forEach((r) => r(undefined)); }; diff --git a/packages/core/src/types/collector.ts b/packages/core/src/types/collector.ts index 2b7c2d36d..a1417e984 100644 --- a/packages/core/src/types/collector.ts +++ b/packages/core/src/types/collector.ts @@ -326,6 +326,14 @@ export interface Instance { pending: { destinations: Destination.InitDestinations; }; + /** + * Set true on the first `shutdown` command, guarding re-entrancy: a second + * `walker shutdown` must not re-run `destroyAllSteps` and double-close + * writers, destinations, or stores. Subsequent shutdown commands no-op. + * Initialized to `false` by the collector factory; absence (`undefined`) + * is treated as "not yet shut down", so the first shutdown still runs. + */ + hasShutdown?: boolean; /** * Every event type ever broadcast through `onApply` (state, lifecycle, and * arbitrary). Lets a `require:[]` gate be satisfied by the current diff --git a/packages/core/src/types/destination.ts b/packages/core/src/types/destination.ts index b69cd9f98..11893dc42 100644 --- a/packages/core/src/types/destination.ts +++ b/packages/core/src/types/destination.ts @@ -236,10 +236,50 @@ export type PushFn = ( context: PushContext, ) => WalkerOS.PromiseOrValue; +/** + * Per-row outcome for a single failed batch entry. + * + * `index` is the position into the flushed batch's `entries` array (the same + * order `flushBatch` iterates), so the collector can route exactly that entry + * to the DLQ. `error` is the per-row failure carried into the DLQ pair; when + * omitted the collector substitutes a generic batch error. + */ +export interface BatchFailure { + index: number; + error?: unknown; +} + +/** + * Partial-failure result of a `pushBatch` call. + * + * `failed` lists the entries (by index into the flushed batch's `entries` + * array) that did NOT succeed. Every other entry is treated as delivered. + */ +export interface BatchOutcome { + failed: BatchFailure[]; +} + +/** + * Pushes a batch of events to a destination. + * + * Return contract (backward-compatible, additive): + * - Resolve `void` (the historical contract): the WHOLE batch succeeded. The + * collector counts every entry as delivered. + * - Throw / reject: the WHOLE batch failed. The collector routes every entry + * to the DLQ and increments `failed` by the batch size. Unchanged. + * - Resolve a `BatchOutcome`: PARTIAL failure. Only the entries listed in + * `outcome.failed` (by `index` into the flushed `entries` array) are routed + * to the DLQ and counted as failed, each with its own `error`. The remaining + * entries are counted as delivered. This lets a destination report row-level + * failures without forcing already-succeeded rows onto the DLQ, which would + * duplicate them on a later DLQ retry. + * + * Indices in `failed` must line up with the order of `batch.entries`. + */ export type PushBatchFn = ( batch: Batch>, context: PushBatchContext, -) => WalkerOS.PromiseOrValue; +) => WalkerOS.PromiseOrValue; export type PushEvent = { event: WalkerOS.Event; diff --git a/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts b/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts index ed46c2bc1..bb9e42915 100644 --- a/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts +++ b/packages/server/destinations/gcp/src/bigquery/__tests__/index.test.ts @@ -367,7 +367,7 @@ describe('Server Destination BigQuery', () => { expect(rowsArg[1]).toEqual(mappedRow); }); - test('rejects when the batch has row errors', async () => { + test('returns a per-row outcome on partial row errors instead of throwing', async () => { if (!destination.pushBatch) throw new Error('pushBatch missing'); const config = await buildConfig(); @@ -381,9 +381,10 @@ describe('Server Destination BigQuery', () => { const data: Array = events.map(() => undefined); const entries = events.map((event) => ({ event })); - // The batch path now propagates row errors so the collector flush - // boundary routes the batch to the DLQ. - const pending = destination.pushBatch!( + // Partial row errors no longer throw: the destination reports the failed + // rows via a BatchOutcome so the collector DLQs only those rows and counts + // the succeeded rows as delivered (avoids duplicates on DLQ retry). + const outcome = await destination.pushBatch!( { key: 'k', events, data, entries }, createMockContext({ config, @@ -392,11 +393,18 @@ describe('Server Destination BigQuery', () => { id: 'test-bq', }), ); - // Assert the summary framing, not just the row substring, so a future - // simplification of the thrown message is caught. - await expect(pending).rejects.toThrow('invalid row'); - await expect(pending).rejects.toThrow(/of 3 rows/); - await expect(pending).rejects.toThrow(/code=3/); + + expect(outcome).toEqual({ + failed: [ + { + index: 0, + error: expect.objectContaining({ + code: 3, + message: 'invalid row', + }), + }, + ], + }); // INFO summary with ok/failed counts for operator visibility. expect(logger.info).toHaveBeenCalledWith( diff --git a/packages/server/destinations/gcp/src/bigquery/pushBatch.ts b/packages/server/destinations/gcp/src/bigquery/pushBatch.ts index 799708e21..f1dc2e10a 100644 --- a/packages/server/destinations/gcp/src/bigquery/pushBatch.ts +++ b/packages/server/destinations/gcp/src/bigquery/pushBatch.ts @@ -5,9 +5,12 @@ import { eventToRow } from './eventToRow'; /** * Batched push using a single appendRows call. * - * Errors (row errors or a failed append call) propagate to the collector's - * batch flush boundary, which routes the whole batch to the DLQ and increments - * the failed counter. + * Per-row failures (BigQuery's `rowErrors`) are reported back to the collector + * as a BatchOutcome identifying the failed entries by index. The collector then + * DLQs only those rows and counts the rest as delivered, so a later DLQ retry + * does not re-write the already-succeeded rows (no duplicates). A whole-append + * failure (the appendRows call itself rejects, or the writer is missing) still + * throws, routing the entire batch to the DLQ. */ export const pushBatch: PushBatchFn = async (batch, { config, logger }) => { const settings = config.settings; @@ -48,14 +51,22 @@ export const pushBatch: PushBatchFn = async (batch, { config, logger }) => { message: re.message, }); } - // Throw on any partial failure: the collector flush boundary is atomic - // per batch, so the WHOLE batch (succeeded rows included) routes to the - // DLQ and counts as failed. Precise per-row accounting would require a - // separate PushBatchFn->collector contract change. - const first = result.rowErrors[0]; - throw new Error( - `BigQuery batch append failed: ${failed} of ${rows.length} rows, first error code=${first.code} message=${first.message}`, - ); + // Report per-row failures so the collector DLQs only the failed rows and + // counts the succeeded rows as delivered. `re.index` is the row's + // position in `rows`, which is built 1:1 from `batch.entries`, so it maps + // directly to the collector's entry index. The per-row error preserves + // the original BigQuery `code` and `message` for debugging from the DLQ. + // The Storage Write SDK types `index` as a protobuf int64 (number, string, + // or Long) and `message` as a nullable string, so both are normalized to + // the strict BatchFailure shape here. + const failures = result.rowErrors.map((re) => ({ + index: Number(re.index), + error: Object.assign(new Error(re.message ?? 'BigQuery row error'), { + code: re.code, + }), + })); + + return { failed: failures }; } // Full success: keep at DEBUG to avoid noise on every batch. diff --git a/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts b/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts index decca3a22..de47e4c7b 100644 --- a/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts +++ b/packages/server/sources/express/src/__tests__/cache-roundtrip.test.ts @@ -167,4 +167,137 @@ describe('Express source cache round-trip', () => { expect(second.captures).toHaveLength(1); expectAsset(second.captures[0], 'HIT'); }); + + // Regression lock: two simultaneous MISSes for the same cache key both write + // without crashing, and a subsequent read returns a coherent HIT. This proves + // concurrent same-key cache writes are tolerated (last-writer-wins on a + // byte-backed store) and that the stored value still round-trips intact. + it('handles concurrent same-key MISSes without crashing and serves a coherent HIT afterwards', async () => { + const ASSET_BODY = Buffer.from('/* concurrent walker.js bytes */', 'utf8'); + const destinationCalls: string[] = []; + + const assetDestination: Destination.Instance = { + type: 'asset', + config: {}, + push: async (_event, ctx) => { + destinationCalls.push('asset'); + ctx.env?.respond?.({ + body: ASSET_BODY, + status: 200, + headers: { 'Content-Type': 'application/javascript' }, + }); + }, + }; + + const { collector } = await startFlow({ + consent: { functional: true }, + sources: { + express: { + code: sourceExpress, + config: { + settings: { paths: ['/walker.js'] }, + ingest: { + map: { + method: { key: 'method' }, + path: { key: 'url' }, + }, + }, + }, + cache: { + stop: true, + rules: [ + { + match: { key: 'ingest.method', operator: 'eq', value: 'GET' }, + key: ['ingest.method', 'ingest.path'], + ttl: 300, + update: { + 'headers.X-Cache': { key: 'cache.status' }, + }, + }, + ], + }, + }, + }, + destinations: { + asset: { code: assetDestination }, + }, + }); + + const expressSource = Source.getSource(collector, 'express'); + + type Capture = { + status: number; + body: unknown; + headers: Record; + }; + + const mockRequest = (): Request => + ({ + method: 'GET', + url: '/walker.js?name=page%20view', + headers: {}, + get: () => undefined, + }) as unknown as Request; + + const mockResponse = () => { + const captures: Capture[] = []; + let status = 200; + const headers: Record = {}; + const res = { + status: (code: number) => { + status = code; + return res; + }, + set: (key: string, value: string) => { + headers[key] = value; + return res; + }, + send: (body?: unknown) => { + captures.push({ status, body, headers: { ...headers } }); + return res; + }, + json: (body: unknown) => { + captures.push({ status, body, headers: { ...headers } }); + return res; + }, + }; + return { res: res as unknown as Response, captures }; + }; + + const expectAsset = (capture: Capture) => { + expect(capture.status).toBe(200); + expect(Buffer.isBuffer(capture.body)).toBe(true); + if (!Buffer.isBuffer(capture.body)) + throw new Error('body was not a Buffer'); + expect(capture.body.equals(ASSET_BODY)).toBe(true); + expect(capture.headers['Content-Type']).toBe('application/javascript'); + }; + + // Two simultaneous MISSes for the same key. Both run the pipeline (the + // cache is empty when each starts) and both write the structured response. + // Neither must crash, and both must serve the correct bytes/status. + const a = mockResponse(); + const b = mockResponse(); + await Promise.all([ + expressSource.push(mockRequest(), a.res), + expressSource.push(mockRequest(), b.res), + ]); + + expect(a.captures).toHaveLength(1); + expect(b.captures).toHaveLength(1); + expectAsset(a.captures[0]); + expectAsset(b.captures[0]); + + const missCount = destinationCalls.length; + + // A subsequent read returns a coherent HIT served from the value written by + // the concurrent writers; the destination is not invoked again. + const third = mockResponse(); + await expressSource.push(mockRequest(), third.res); + + expect(destinationCalls).toHaveLength(missCount); // no new pipeline run + expect(third.captures).toHaveLength(1); + expectAsset(third.captures[0]); + expect(third.captures[0].headers['X-Cache']).toBe('HIT'); + }); }); diff --git a/packages/server/stores/fs/src/__tests__/store.test.ts b/packages/server/stores/fs/src/__tests__/store.test.ts index 558cc6230..fb7696e3f 100644 --- a/packages/server/stores/fs/src/__tests__/store.test.ts +++ b/packages/server/stores/fs/src/__tests__/store.test.ts @@ -2,6 +2,7 @@ import * as fs from 'fs/promises'; import * as path from 'path'; import * as os from 'os'; import type { Collector, Logger } from '@walkeros/core'; +import { encodeCacheValue, decodeCacheValue } from '@walkeros/core'; import { storeFsInit } from '../store'; describe('FsStore', () => { @@ -113,6 +114,60 @@ describe('FsStore', () => { }); }); + describe('TTL expiry round-trip', () => { + // End-to-end lock: an encoded cache value written to the fs store reads + // back as a HIT before its TTL, and as expired afterwards. The codec + // stamps an absolute `Date.now() + ttlMs` expiry, so the clock is + // controlled with fake timers around the synchronous codec calls. The fs + // I/O itself runs under real time (writes/reads happen between the + // synchronous encode and decode steps), so no real sleep is needed. + const VALUE = { status: 200, body: 'cached-payload' }; + const TTL_MS = 60_000; + const BASE = new Date('2026-06-04T00:00:00.000Z').getTime(); + + afterEach(() => { + jest.useRealTimers(); + }); + + it('reads back a HIT within the TTL', async () => { + const store = await createStore(tmpDir); + + jest.useFakeTimers(); + jest.setSystemTime(BASE); + const encoded = encodeCacheValue(VALUE, TTL_MS); + jest.useRealTimers(); + + await store.set('cache/entry', encoded); + const raw = await store.get('cache/entry'); + + jest.useFakeTimers(); + jest.setSystemTime(BASE + TTL_MS - 1); + const decoded = decodeCacheValue(raw); + jest.useRealTimers(); + + expect(decoded).toEqual({ value: VALUE }); + }); + + it('reads back as expired once the TTL has passed', async () => { + const store = await createStore(tmpDir); + + jest.useFakeTimers(); + jest.setSystemTime(BASE); + const encoded = encodeCacheValue(VALUE, TTL_MS); + jest.useRealTimers(); + + await store.set('cache/entry', encoded); + const raw = await store.get('cache/entry'); + + jest.useFakeTimers(); + jest.setSystemTime(BASE + TTL_MS + 1); + const decoded = decodeCacheValue(raw); + jest.useRealTimers(); + + expect(decoded).toEqual({ expired: true }); + }); + }); + describe('delete', () => { it('should remove a file', async () => { await fs.writeFile(path.join(tmpDir, 'remove.txt'), 'bye'); diff --git a/packages/server/stores/gcs/src/__tests__/store.test.ts b/packages/server/stores/gcs/src/__tests__/store.test.ts index 713f46d07..783159b45 100644 --- a/packages/server/stores/gcs/src/__tests__/store.test.ts +++ b/packages/server/stores/gcs/src/__tests__/store.test.ts @@ -187,6 +187,18 @@ describe('storeGcsInit', () => { await store.set('../evil.txt', Buffer.from('bad')); expect(mockFetch).not.toHaveBeenCalled(); }); + + // Regression lock: the request-cache codec hands stores an encoded Buffer. + // A non-Buffer value must be rejected with a clear error rather than being + // silently POSTed as a malformed body. + it('rejects a non-Buffer value with a clear error', async () => { + const store = await createStore(); + const bad: unknown = { a: 1 }; + await expect(store.set('x', bad)).rejects.toThrow( + /value must be a Buffer/, + ); + expect(mockFetch).not.toHaveBeenCalled(); + }); }); describe('codec round-trip', () => { diff --git a/packages/server/stores/s3/src/__tests__/store.test.ts b/packages/server/stores/s3/src/__tests__/store.test.ts index 6c33422ae..a793e4ae3 100644 --- a/packages/server/stores/s3/src/__tests__/store.test.ts +++ b/packages/server/stores/s3/src/__tests__/store.test.ts @@ -172,6 +172,18 @@ describe('storeS3Init', () => { await store.set('../evil.txt', Buffer.from('bad')); expect(mockPutObject).not.toHaveBeenCalled(); }); + + // Regression lock: the request-cache codec hands stores an encoded Buffer. + // A non-Buffer value must be rejected with a clear error rather than being + // silently uploaded as a malformed object. + it('rejects a non-Buffer value with a clear error', async () => { + const store = await createStore(); + const bad: unknown = { a: 1 }; + await expect(store.set('x', bad)).rejects.toThrow( + /value must be a Buffer/, + ); + expect(mockPutObject).not.toHaveBeenCalled(); + }); }); describe('codec round-trip', () => { diff --git a/packages/server/stores/sheets/src/__tests__/store.test.ts b/packages/server/stores/sheets/src/__tests__/store.test.ts index 13b3cb8e2..b13a11ac8 100644 --- a/packages/server/stores/sheets/src/__tests__/store.test.ts +++ b/packages/server/stores/sheets/src/__tests__/store.test.ts @@ -2,7 +2,11 @@ jest.mock('../auth', () => ({ createTokenProvider: jest.fn(() => jest.fn().mockResolvedValue('mock-token')), })); -import { createMockContext, createMockLogger } from '@walkeros/core'; +import { + createMockContext, + createMockLogger, + encodeCacheValue, +} from '@walkeros/core'; import type { Store } from '@walkeros/core'; import type { SheetsStoreSettings } from '../types'; import { storeSheetsInit, __resetSpreadsheetExistenceCache } from '../store'; @@ -254,6 +258,23 @@ describe('storeSheetsInit', () => { } }); + it('rejects a Buffer value (request-cache wiring is not supported)', async () => { + const { restore } = installFetch([ + probeOk(), + jsonResponse({ values: [['alice']] }), + ]); + try { + const store = await storeSheetsInit(createCtx()); + const encoded = encodeCacheValue({ status: 200 }, 1000); + + await expect(store.set('req-cache-key', encoded)).rejects.toThrow( + /Sheets store cannot be used as a request cache/, + ); + } finally { + restore(); + } + }); + it('updates the existing cell for a known key', async () => { const { calls, restore } = installFetch([ probeOk(), diff --git a/packages/server/stores/sheets/src/store.ts b/packages/server/stores/sheets/src/store.ts index dab682ac7..20c2a5e1a 100644 --- a/packages/server/stores/sheets/src/store.ts +++ b/packages/server/stores/sheets/src/store.ts @@ -311,6 +311,17 @@ export const storeSheetsInit: Store.Init = async (context) => { }, async set(key: string, value: unknown): Promise { + // The request-cache codec hands stores an encoded Buffer (via + // encodeCacheValue). Sheets would JSON.stringify it into + // {"type":"Buffer","data":[...]} and return that object as a HIT with no + // TTL check on read, serving silent stale garbage. Sheets is not a request-cache + // backend; reject the Buffer case loudly. Non-Buffer JSON values are + // unaffected. + if (Buffer.isBuffer(value)) { + throw new Error( + 'Sheets store cannot be used as a request cache; use fs, s3, gcs, or an in-memory store', + ); + } const serialized = JSON.stringify(value); const existingRow = keyToRow.get(key); if (existingRow === undefined) { diff --git a/packages/web/sources/browser/src/__tests__/html/walker.html b/packages/web/sources/browser/src/__tests__/html/walker.html index ea987f84c..84e8d2df5 100644 --- a/packages/web/sources/browser/src/__tests__/html/walker.html +++ b/packages/web/sources/browser/src/__tests__/html/walker.html @@ -208,6 +208,17 @@

+
+
+
+ +
+
+
+
+ +
+