diff --git a/docs/env-vars.md b/docs/env-vars.md index 9c58c0d5..21fca772 100644 --- a/docs/env-vars.md +++ b/docs/env-vars.md @@ -84,6 +84,15 @@ Bounded ring-buffer reports written under `mirror-logs/`. Reports never include - `LML_API_KEY` — Bearer token sent on every LML request. Must match LML's `LML_API_KEY`. Optional in dev; required in production once LML's `LML_REQUIRE_AUTH` is flipped to `true`. Injected at the single `lmlFetch` chokepoint in `@wxyc/lml-client` (`shared/lml-client/src/index.ts`). - `LML_CLIENT_MAX_CONCURRENT` — Maximum concurrent in-flight `/api/v1/lookup` calls; gates BS's fan-out at the chokepoint so back-pressure surfaces on the BS side instead of queueing inside LML. Mirrors LML's `discogs_max_concurrent` (default `5`). Set lower in production to leave headroom for other LML callers (request-o-matic, tubafrenzy). - `LML_CLIENT_RATE_PER_MIN` — Token-bucket refill rate (and capacity) for `/api/v1/lookup` calls per minute. Mirrors LML's `discogs_rate_limit` (default `50`). Tune downward in production to leave headroom for other LML callers. + +### Backfill LML rate gating (`jobs/flowsheet-metadata-backfill`) + +Stricter ceilings for the historical-drain cron, since one in-flight LML call held for the full per-call timeout saturates LML's serialized Discogs fan-out and starves real-time iOS/dj-site clients (BS#994 / BS#995). The first two are read at module load by `jobs/flowsheet-metadata-backfill/lml-limiter.ts:createLmlLimiter`; the third by `jobs/flowsheet-metadata-backfill/lml-fetch.ts`. All three are positive integers; non-positive or unparseable values fall back to the default with a `console.warn`. Mutating `process.env` after first import does NOT reconfigure the singletons — restart the container to change a value. + +- `BACKFILL_LML_MAX_CONCURRENT` (default `1`) — Maximum concurrent in-flight backfill `/api/v1/lookup` calls. Tighter than runtime `LML_CLIENT_MAX_CONCURRENT=5` because the cron has no human-facing latency budget; serializing keeps blast radius bounded. The semaphore is belt-and-suspenders defense in case the orchestrator ever becomes concurrent. +- `BACKFILL_LML_RATE_PER_MIN` (default `20`) — Token-bucket refill rate (and capacity) for backfill LML calls per minute. Tighter than runtime `LML_CLIENT_RATE_PER_MIN=50` to leave headroom for real-time traffic. +- `BACKFILL_LML_PER_CALL_TIMEOUT_MS` (default `8000`) — Per-call abort budget. Tighter than `@wxyc/lml-client`'s 30 s runtime default because the cron's per-row hold time is what saturates LML's Discogs fan-out even at concurrency=1 (BS#994 follow-up, retro 2026-05-23). Cold-tail rows that exceed this stay `metadata_attempt_at IS NULL` and are retried on the next pass when LML's cache is warmer / once WXYC/library-metadata-lookup#338 lands. Mirrors BS#992's per-caller `timeoutMs` pattern for the rotation picker. + - `DISCOGS_API_KEY`, `DISCOGS_API_SECRET` — Served to dj-site via `/config/secrets` endpoint. Not used by the backend itself (Discogs access goes through LML). - `SPOTIFY_CLIENT_ID`, `SPOTIFY_CLIENT_SECRET` diff --git a/jobs/flowsheet-metadata-backfill/lml-fetch.ts b/jobs/flowsheet-metadata-backfill/lml-fetch.ts index 71e8dcad..ee0f07ad 100644 --- a/jobs/flowsheet-metadata-backfill/lml-fetch.ts +++ b/jobs/flowsheet-metadata-backfill/lml-fetch.ts @@ -3,10 +3,18 @@ * (#638 / #641). * * Delegates to `@wxyc/lml-client.lookupMetadata` (the shared HTTP + - * Sentry-instrumentation chokepoint introduced in BS#887) and injects the - * backfill's own `defaultLmlLimiter` so this surface gets its stricter - * BACKFILL_LML_* rate ceiling instead of the runtime path's - * LML_CLIENT_* defaults (BS#995 / BS#994). + * Sentry-instrumentation chokepoint introduced in BS#887) and injects: + * - the backfill's own `defaultLmlLimiter` so this surface gets its stricter + * BACKFILL_LML_* rate ceiling instead of the runtime path's + * LML_CLIENT_* defaults (BS#995 / BS#994), and + * - a tighter per-call abort budget (`BACKFILL_LML_PER_CALL_TIMEOUT_MS`, + * default 8000 ms) so cold-tail rows that LML can't resolve quickly + * don't hold one of LML's serialized Discogs fan-out slots for the + * runtime path's 30 s (BS#994 follow-up, retro 2026-05-23). Rows that + * exceed the budget stay `metadata_attempt_at IS NULL` and are + * retried on the next pass when LML's cache is warmer / once LML#338 + * lands. Pattern mirrors BS#992's per-caller timeout for the rotation + * picker. * * The third parameter is named `track` (not `song`) to match the orchestrator's * `EnrichRow.track_title` field. It's plumbed through to LML's `body.song` by @@ -18,5 +26,18 @@ import { lookupMetadata as sharedLookupMetadata, type LookupResponse } from '@wx import { defaultLmlLimiter } from './lml-limiter.js'; +const envInt = (name: string, fallback: number): number => { + const raw = process.env[name]; + if (raw === undefined || raw === '') return fallback; + // Number(raw) (not parseInt) so partial-parse strings like "8000banana" + // surface as NaN and get rejected instead of silently coercing. + const parsed = Number(raw); + if (Number.isFinite(parsed) && parsed > 0) return parsed; + console.warn(`lml-fetch: ${name}=${raw} is invalid (must be positive number); using fallback ${fallback}`); + return fallback; +}; + +const TIMEOUT_MS = envInt('BACKFILL_LML_PER_CALL_TIMEOUT_MS', 8000); + export const lookupMetadata = (artist: string, album?: string, track?: string): Promise => - sharedLookupMetadata(artist, album, track, { limiter: defaultLmlLimiter }); + sharedLookupMetadata(artist, album, track, { limiter: defaultLmlLimiter, timeoutMs: TIMEOUT_MS }); diff --git a/tests/unit/jobs/flowsheet-metadata-backfill/lml-fetch.test.ts b/tests/unit/jobs/flowsheet-metadata-backfill/lml-fetch.test.ts new file mode 100644 index 00000000..c1c07b03 --- /dev/null +++ b/tests/unit/jobs/flowsheet-metadata-backfill/lml-fetch.test.ts @@ -0,0 +1,115 @@ +// Unit tests for the backfill's per-call LML timeout (BS#994 follow-up). +// +// Verifies that `lookupMetadata` passes a tighter timeoutMs through to +// `@wxyc/lml-client` than the runtime path's 30s default — the cron's +// per-row hold time is what saturates LML's serialized Discogs fan-out +// even at concurrency=1 (PR #1001's static gate alone isn't sufficient). + +describe('jobs/flowsheet-metadata-backfill/lml-fetch (BS#994 timeout tighten)', () => { + const originalEnv = process.env; + + beforeEach(() => { + jest.resetModules(); + process.env = { ...originalEnv }; + }); + + afterEach(() => { + process.env = originalEnv; + jest.restoreAllMocks(); + }); + + const loadModule = async ( + mockLookup: jest.Mock + ): Promise => { + // Mock the local limiter module — we only care about the args passed to + // sharedLookupMetadata. Avoids needing to fully stub all of @wxyc/lml-client's + // exports (Semaphore, TokenBucket, createLmlLimiter) that lml-limiter.ts + // pulls in at module-load. + jest.doMock('../../../../jobs/flowsheet-metadata-backfill/lml-limiter.js', () => ({ + defaultLmlLimiter: { run: jest.fn() }, + })); + jest.doMock('@wxyc/lml-client', () => ({ + lookupMetadata: mockLookup, + })); + // Module evaluates env at load — must doMock + import after env setup. + return import('../../../../jobs/flowsheet-metadata-backfill/lml-fetch.js'); + }; + + it('passes default 8000ms timeoutMs when BACKFILL_LML_PER_CALL_TIMEOUT_MS is unset', async () => { + delete process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS; + const mockLookup = jest.fn().mockResolvedValue({ matches: [] }); + + const { lookupMetadata } = await loadModule(mockLookup); + await lookupMetadata('Juana Molina', 'DOGA', 'la paradoja'); + + expect(mockLookup).toHaveBeenCalledWith( + 'Juana Molina', + 'DOGA', + 'la paradoja', + expect.objectContaining({ timeoutMs: 8000 }) + ); + }); + + it('reads BACKFILL_LML_PER_CALL_TIMEOUT_MS from env when set', async () => { + process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS = '5000'; + const mockLookup = jest.fn().mockResolvedValue({ matches: [] }); + + const { lookupMetadata } = await loadModule(mockLookup); + await lookupMetadata('Jessica Pratt'); + + expect(mockLookup).toHaveBeenCalledWith( + 'Jessica Pratt', + undefined, + undefined, + expect.objectContaining({ timeoutMs: 5000 }) + ); + }); + + it('falls back to 8000ms on non-positive or unparseable env values, with warn', async () => { + process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS = '0'; + const warn = jest.spyOn(console, 'warn').mockImplementation(() => {}); + const mockLookup = jest.fn().mockResolvedValue({ matches: [] }); + + const { lookupMetadata } = await loadModule(mockLookup); + await lookupMetadata('Stereolab'); + + expect(mockLookup).toHaveBeenCalledWith( + 'Stereolab', + undefined, + undefined, + expect.objectContaining({ timeoutMs: 8000 }) + ); + expect(warn).toHaveBeenCalledWith(expect.stringContaining('BACKFILL_LML_PER_CALL_TIMEOUT_MS=0')); + }); + + it('rejects partial-parse strings like "8000banana" (no silent coercion)', async () => { + process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS = '8000banana'; + const warn = jest.spyOn(console, 'warn').mockImplementation(() => {}); + const mockLookup = jest.fn().mockResolvedValue({ matches: [] }); + + const { lookupMetadata } = await loadModule(mockLookup); + await lookupMetadata('Cat Power'); + + // "8000banana" → NaN under Number(), not 8000 under parseInt; falls back. + expect(mockLookup).toHaveBeenCalledWith( + 'Cat Power', + undefined, + undefined, + expect.objectContaining({ timeoutMs: 8000 }) + ); + expect(warn).toHaveBeenCalledWith(expect.stringContaining('BACKFILL_LML_PER_CALL_TIMEOUT_MS=8000banana')); + }); + + it('passes the backfill limiter through alongside the timeout', async () => { + delete process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS; + const mockLookup = jest.fn().mockResolvedValue({ matches: [] }); + + const { lookupMetadata } = await loadModule(mockLookup); + await lookupMetadata('Chuquimamani-Condori', 'Edits', 'Call Your Name'); + + const callArgs = mockLookup.mock.calls[0][3]; + expect(callArgs).toHaveProperty('limiter'); + expect(callArgs.limiter).toBeDefined(); + expect(callArgs).toHaveProperty('timeoutMs', 8000); + }); +});