WXYC · jakebromberg · May 23, 2026 · May 23, 2026 · May 23, 2026
diff --git a/docs/env-vars.md b/docs/env-vars.md
@@ -84,6 +84,15 @@ Bounded ring-buffer reports written under `mirror-logs/`. Reports never include
 - `LML_API_KEY` — Bearer token sent on every LML request. Must match LML's `LML_API_KEY`. Optional in dev; required in production once LML's `LML_REQUIRE_AUTH` is flipped to `true`. Injected at the single `lmlFetch` chokepoint in `@wxyc/lml-client` (`shared/lml-client/src/index.ts`).
 - `LML_CLIENT_MAX_CONCURRENT` — Maximum concurrent in-flight `/api/v1/lookup` calls; gates BS's fan-out at the chokepoint so back-pressure surfaces on the BS side instead of queueing inside LML. Mirrors LML's `discogs_max_concurrent` (default `5`). Set lower in production to leave headroom for other LML callers (request-o-matic, tubafrenzy).
 - `LML_CLIENT_RATE_PER_MIN` — Token-bucket refill rate (and capacity) for `/api/v1/lookup` calls per minute. Mirrors LML's `discogs_rate_limit` (default `50`). Tune downward in production to leave headroom for other LML callers.
+
+### Backfill LML rate gating (`jobs/flowsheet-metadata-backfill`)
+
+Stricter ceilings for the historical-drain cron, since one in-flight LML call held for the full per-call timeout saturates LML's serialized Discogs fan-out and starves real-time iOS/dj-site clients (BS#994 / BS#995). The first two are read at module load by `jobs/flowsheet-metadata-backfill/lml-limiter.ts:createLmlLimiter`; the third by `jobs/flowsheet-metadata-backfill/lml-fetch.ts`. All three are positive integers; non-positive or unparseable values fall back to the default with a `console.warn`. Mutating `process.env` after first import does NOT reconfigure the singletons — restart the container to change a value.
+
+- `BACKFILL_LML_MAX_CONCURRENT` (default `1`) — Maximum concurrent in-flight backfill `/api/v1/lookup` calls. Tighter than runtime `LML_CLIENT_MAX_CONCURRENT=5` because the cron has no human-facing latency budget; serializing keeps blast radius bounded. The semaphore is belt-and-suspenders defense in case the orchestrator ever becomes concurrent.
+- `BACKFILL_LML_RATE_PER_MIN` (default `20`) — Token-bucket refill rate (and capacity) for backfill LML calls per minute. Tighter than runtime `LML_CLIENT_RATE_PER_MIN=50` to leave headroom for real-time traffic.
+- `BACKFILL_LML_PER_CALL_TIMEOUT_MS` (default `8000`) — Per-call abort budget. Tighter than `@wxyc/lml-client`'s 30 s runtime default because the cron's per-row hold time is what saturates LML's Discogs fan-out even at concurrency=1 (BS#994 follow-up, retro 2026-05-23). Cold-tail rows that exceed this stay `metadata_attempt_at IS NULL` and are retried on the next pass when LML's cache is warmer / once WXYC/library-metadata-lookup#338 lands. Mirrors BS#992's per-caller `timeoutMs` pattern for the rotation picker.
+
 - `DISCOGS_API_KEY`, `DISCOGS_API_SECRET` — Served to dj-site via `/config/secrets` endpoint. Not used by the backend itself (Discogs access goes through LML).
 - `SPOTIFY_CLIENT_ID`, `SPOTIFY_CLIENT_SECRET`
 

diff --git a/jobs/flowsheet-metadata-backfill/lml-fetch.ts b/jobs/flowsheet-metadata-backfill/lml-fetch.ts
@@ -3,10 +3,18 @@
  * (#638 / #641).
  *
  * Delegates to `@wxyc/lml-client.lookupMetadata` (the shared HTTP +
- * Sentry-instrumentation chokepoint introduced in BS#887) and injects the
- * backfill's own `defaultLmlLimiter` so this surface gets its stricter
- * BACKFILL_LML_* rate ceiling instead of the runtime path's
- * LML_CLIENT_* defaults (BS#995 / BS#994).
+ * Sentry-instrumentation chokepoint introduced in BS#887) and injects:
+ *   - the backfill's own `defaultLmlLimiter` so this surface gets its stricter
+ *     BACKFILL_LML_* rate ceiling instead of the runtime path's
+ *     LML_CLIENT_* defaults (BS#995 / BS#994), and
+ *   - a tighter per-call abort budget (`BACKFILL_LML_PER_CALL_TIMEOUT_MS`,
+ *     default 8000 ms) so cold-tail rows that LML can't resolve quickly
+ *     don't hold one of LML's serialized Discogs fan-out slots for the
+ *     runtime path's 30 s (BS#994 follow-up, retro 2026-05-23). Rows that
+ *     exceed the budget stay `metadata_attempt_at IS NULL` and are
+ *     retried on the next pass when LML's cache is warmer / once LML#338
+ *     lands. Pattern mirrors BS#992's per-caller timeout for the rotation
+ *     picker.
  *
  * The third parameter is named `track` (not `song`) to match the orchestrator's
  * `EnrichRow.track_title` field. It's plumbed through to LML's `body.song` by
@@ -18,5 +26,18 @@ import { lookupMetadata as sharedLookupMetadata, type LookupResponse } from '@wx
 
 import { defaultLmlLimiter } from './lml-limiter.js';
 
+const envInt = (name: string, fallback: number): number => {
+  const raw = process.env[name];
+  if (raw === undefined || raw === '') return fallback;
+  // Number(raw) (not parseInt) so partial-parse strings like "8000banana"
+  // surface as NaN and get rejected instead of silently coercing.
+  const parsed = Number(raw);
+  if (Number.isFinite(parsed) && parsed > 0) return parsed;
+  console.warn(`lml-fetch: ${name}=${raw} is invalid (must be positive number); using fallback ${fallback}`);
+  return fallback;
+};
+
+const TIMEOUT_MS = envInt('BACKFILL_LML_PER_CALL_TIMEOUT_MS', 8000);
+
 export const lookupMetadata = (artist: string, album?: string, track?: string): Promise<LookupResponse> =>
-  sharedLookupMetadata(artist, album, track, { limiter: defaultLmlLimiter });
+  sharedLookupMetadata(artist, album, track, { limiter: defaultLmlLimiter, timeoutMs: TIMEOUT_MS });
diff --git a/tests/unit/jobs/flowsheet-metadata-backfill/lml-fetch.test.ts b/tests/unit/jobs/flowsheet-metadata-backfill/lml-fetch.test.ts
@@ -0,0 +1,115 @@
+// Unit tests for the backfill's per-call LML timeout (BS#994 follow-up).
+//
+// Verifies that `lookupMetadata` passes a tighter timeoutMs through to
+// `@wxyc/lml-client` than the runtime path's 30s default — the cron's
+// per-row hold time is what saturates LML's serialized Discogs fan-out
+// even at concurrency=1 (PR #1001's static gate alone isn't sufficient).
+
+describe('jobs/flowsheet-metadata-backfill/lml-fetch (BS#994 timeout tighten)', () => {
+  const originalEnv = process.env;
+
+  beforeEach(() => {
+    jest.resetModules();
+    process.env = { ...originalEnv };
+  });
+
+  afterEach(() => {
+    process.env = originalEnv;
+    jest.restoreAllMocks();
+  });
+
+  const loadModule = async (
+    mockLookup: jest.Mock
+  ): Promise<typeof import('../../../../jobs/flowsheet-metadata-backfill/lml-fetch.js')> => {
+    // Mock the local limiter module — we only care about the args passed to
+    // sharedLookupMetadata. Avoids needing to fully stub all of @wxyc/lml-client's
+    // exports (Semaphore, TokenBucket, createLmlLimiter) that lml-limiter.ts
+    // pulls in at module-load.
+    jest.doMock('../../../../jobs/flowsheet-metadata-backfill/lml-limiter.js', () => ({
+      defaultLmlLimiter: { run: jest.fn() },
+    }));
+    jest.doMock('@wxyc/lml-client', () => ({
+      lookupMetadata: mockLookup,
+    }));
+    // Module evaluates env at load — must doMock + import after env setup.
+    return import('../../../../jobs/flowsheet-metadata-backfill/lml-fetch.js');
+  };
+
+  it('passes default 8000ms timeoutMs when BACKFILL_LML_PER_CALL_TIMEOUT_MS is unset', async () => {
+    delete process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS;
+    const mockLookup = jest.fn().mockResolvedValue({ matches: [] });
+
+    const { lookupMetadata } = await loadModule(mockLookup);
+    await lookupMetadata('Juana Molina', 'DOGA', 'la paradoja');
+
+    expect(mockLookup).toHaveBeenCalledWith(
+      'Juana Molina',
+      'DOGA',
+      'la paradoja',
+      expect.objectContaining({ timeoutMs: 8000 })
+    );
+  });
+
+  it('reads BACKFILL_LML_PER_CALL_TIMEOUT_MS from env when set', async () => {
+    process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS = '5000';
+    const mockLookup = jest.fn().mockResolvedValue({ matches: [] });
+
+    const { lookupMetadata } = await loadModule(mockLookup);
+    await lookupMetadata('Jessica Pratt');
+
+    expect(mockLookup).toHaveBeenCalledWith(
+      'Jessica Pratt',
+      undefined,
+      undefined,
+      expect.objectContaining({ timeoutMs: 5000 })
+    );
+  });
+
+  it('falls back to 8000ms on non-positive or unparseable env values, with warn', async () => {
+    process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS = '0';
+    const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
+    const mockLookup = jest.fn().mockResolvedValue({ matches: [] });
+
+    const { lookupMetadata } = await loadModule(mockLookup);
+    await lookupMetadata('Stereolab');
+
+    expect(mockLookup).toHaveBeenCalledWith(
+      'Stereolab',
+      undefined,
+      undefined,
+      expect.objectContaining({ timeoutMs: 8000 })
+    );
+    expect(warn).toHaveBeenCalledWith(expect.stringContaining('BACKFILL_LML_PER_CALL_TIMEOUT_MS=0'));
+  });
+
+  it('rejects partial-parse strings like "8000banana" (no silent coercion)', async () => {
+    process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS = '8000banana';
+    const warn = jest.spyOn(console, 'warn').mockImplementation(() => {});
+    const mockLookup = jest.fn().mockResolvedValue({ matches: [] });
+
+    const { lookupMetadata } = await loadModule(mockLookup);
+    await lookupMetadata('Cat Power');
+
+    // "8000banana" → NaN under Number(), not 8000 under parseInt; falls back.
+    expect(mockLookup).toHaveBeenCalledWith(
+      'Cat Power',
+      undefined,
+      undefined,
+      expect.objectContaining({ timeoutMs: 8000 })
+    );
+    expect(warn).toHaveBeenCalledWith(expect.stringContaining('BACKFILL_LML_PER_CALL_TIMEOUT_MS=8000banana'));
+  });
+
+  it('passes the backfill limiter through alongside the timeout', async () => {
+    delete process.env.BACKFILL_LML_PER_CALL_TIMEOUT_MS;
+    const mockLookup = jest.fn().mockResolvedValue({ matches: [] });
+
+    const { lookupMetadata } = await loadModule(mockLookup);
+    await lookupMetadata('Chuquimamani-Condori', 'Edits', 'Call Your Name');
+
+    const callArgs = mockLookup.mock.calls[0][3];
+    expect(callArgs).toHaveProperty('limiter');
+    expect(callArgs.limiter).toBeDefined();
+    expect(callArgs).toHaveProperty('timeoutMs', 8000);
+  });
+});