From d8766e3016c033987d5ce77469d567325759309e Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 26 Jun 2026 13:24:48 +0900 Subject: [PATCH 1/2] fix(review-workload): retry provider gate ENOENT lost mid-acquire (#253) acquireProviderWorkloadGate won the gate via mkdirSync then wrote owner.json; a concurrent release/reclaim removing the gate dir between those two steps threw ENOENT, which the outer catch re-threw (only EEXIST was retried). It escaped acquireProviderWorkloadLease and the Grok reviewer catch-all mislabeled the loser scope_failed instead of provider_workload_blocked, flaking the concurrent-Grok smoke (6 !== 7). Treat ENOENT (gate vanished mid-acquire) as a deadline-bounded retry alongside EEXIST, and move the deadline check above the reclaim-continue so the retry path can never hot-loop. Any other error still throws (fail-loud). Synced to all plugin/relay copies. Adds a deterministic fail-on-revert regression injecting the exact mkdir->owner-write ENOENT race. Co-Authored-By: Claude Opus 4.8 --- plugins/agy/scripts/lib/review-workload.mjs | 4 +- .../scripts/lib/review-workload.mjs | 4 +- .../claude/scripts/lib/review-workload.mjs | 4 +- .../gemini/scripts/lib/review-workload.mjs | 4 +- plugins/grok/scripts/lib/review-workload.mjs | 4 +- plugins/kimi/scripts/lib/review-workload.mjs | 4 +- .../relay-agy/scripts/lib/review-workload.mjs | 4 +- .../scripts/lib/review-workload.mjs | 4 +- .../scripts/lib/review-workload.mjs | 4 +- .../scripts/lib/review-workload.mjs | 4 +- scripts/lib/review-workload.mjs | 4 +- tests/unit/review-workload.test.mjs | 64 +++++++++++++++++++ 12 files changed, 86 insertions(+), 22 deletions(-) diff --git a/plugins/agy/scripts/lib/review-workload.mjs b/plugins/agy/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/plugins/agy/scripts/lib/review-workload.mjs +++ b/plugins/agy/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/api-reviewers/scripts/lib/review-workload.mjs b/plugins/api-reviewers/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/plugins/api-reviewers/scripts/lib/review-workload.mjs +++ b/plugins/api-reviewers/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/claude/scripts/lib/review-workload.mjs b/plugins/claude/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/plugins/claude/scripts/lib/review-workload.mjs +++ b/plugins/claude/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/gemini/scripts/lib/review-workload.mjs b/plugins/gemini/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/plugins/gemini/scripts/lib/review-workload.mjs +++ b/plugins/gemini/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/grok/scripts/lib/review-workload.mjs b/plugins/grok/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/plugins/grok/scripts/lib/review-workload.mjs +++ b/plugins/grok/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/kimi/scripts/lib/review-workload.mjs b/plugins/kimi/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/plugins/kimi/scripts/lib/review-workload.mjs +++ b/plugins/kimi/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-agy/scripts/lib/review-workload.mjs b/relay/relay-agy/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/relay/relay-agy/scripts/lib/review-workload.mjs +++ b/relay/relay-agy/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-gemini/scripts/lib/review-workload.mjs b/relay/relay-gemini/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/relay/relay-gemini/scripts/lib/review-workload.mjs +++ b/relay/relay-gemini/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-grok/scripts/lib/review-workload.mjs b/relay/relay-grok/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/relay/relay-grok/scripts/lib/review-workload.mjs +++ b/relay/relay-grok/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-kimi/scripts/lib/review-workload.mjs b/relay/relay-kimi/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/relay/relay-kimi/scripts/lib/review-workload.mjs +++ b/relay/relay-kimi/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/scripts/lib/review-workload.mjs b/scripts/lib/review-workload.mjs index fcb5454d..aa2b937d 100644 --- a/scripts/lib/review-workload.mjs +++ b/scripts/lib/review-workload.mjs @@ -211,14 +211,14 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/tests/unit/review-workload.test.mjs b/tests/unit/review-workload.test.mjs index 018e809e..a2c7bd55 100644 --- a/tests/unit/review-workload.test.mjs +++ b/tests/unit/review-workload.test.mjs @@ -3,6 +3,7 @@ import assert from "node:assert/strict"; import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; +import { pathToFileURL } from "node:url"; import { PROVIDER_WORKLOAD_BLOCKED_CODE, @@ -93,6 +94,69 @@ test("provider workload lease serializes stale reclaim before removing inactive "stale reclaim must not unlink the lock path outside a serialized compare-and-retry section"); }); +test("provider workload gate retries when owner write loses the gate directory race", async () => { + const { root, env } = tempEnv(); + const moduleRoot = mkdtempSync(join(tmpdir(), "provider-workload-race-module-")); + try { + const source = readFileSync(new URL("../../scripts/lib/review-workload.mjs", import.meta.url), "utf8"); + const shimPath = join(moduleRoot, "fs-race-shim.mjs"); + const modulePath = join(moduleRoot, "review-workload-race.mjs"); + writeFileSync(shimPath, ` +import * as fs from "node:fs"; + +export const linkSync = fs.linkSync; +export const lstatSync = fs.lstatSync; +export const readFileSync = fs.readFileSync; +export const renameSync = fs.renameSync; +export const rmSync = fs.rmSync; +export const unlinkSync = fs.unlinkSync; + +let pendingGateDir = null; +let injected = false; + +export function mkdirSync(path, options) { + const result = fs.mkdirSync(path, options); + if (!injected && String(path).endsWith(".json.gate")) pendingGateDir = String(path); + return result; +} + +export function writeFileSync(path, data, options) { + if (!injected && pendingGateDir && String(path) === \`\${pendingGateDir}/owner.json\`) { + injected = true; + fs.rmSync(pendingGateDir, { recursive: true, force: true }); + const error = new Error("injected owner write ENOENT"); + error.code = "ENOENT"; + throw error; + } + return fs.writeFileSync(path, data, options); +} +`, "utf8"); + writeFileSync( + modulePath, + source.replace( + 'from "node:fs";', + `from ${JSON.stringify(pathToFileURL(shimPath).href)};`, + ), + "utf8", + ); + + const workload = await import(pathToFileURL(modulePath).href); + const acquired = workload.acquireProviderWorkloadLease({ + provider: "grok", + jobId: "race-job", + cwd: "/tmp/race", + sourceBearing: true, + env, + }); + assert.equal(acquired.ok, true); + assert.equal(JSON.parse(readFileSync(join(root, "grok.json"), "utf8")).job_id, "race-job"); + workload.releaseProviderWorkloadLease(acquired.lease); + } finally { + rmSync(root, { recursive: true, force: true }); + rmSync(moduleRoot, { recursive: true, force: true }); + } +}); + test("provider workload lease release unregisters exit cleanup listener", () => { const { root, env } = tempEnv(); const before = process.listenerCount("exit"); From 0f4ebfefc2e4dc17cb9e9f062820fd8cb5156d96 Mon Sep 17 00:00:00 2001 From: Test User Date: Fri, 26 Jun 2026 13:51:24 +0900 Subject: [PATCH 2/2] fix(review-workload): re-establish vanished lock root instead of spinning (#253) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Review follow-up (gemini HIGH): catching ENOENT broadly in the gate-race fix could 100%-CPU hot-loop until the deadline when the gate's PARENT (lock root) is missing — mkdirSync(gateDir) throws ENOENT (syscall "mkdir"), tryReclaim's renameSync also ENOENTs and returns true, so the loop continues with no sleep and the next mkdir fails identically. Deadline-bounded, but a multi-second CPU burn in the lock-root-deleted degraded state (concurrent teardown / tmp cleaner). Distinguish structural ENOENT (syscall "mkdir") from the mid-acquire race ENOENT (owner-file open): re-establish the lock root and sleep-pace, so a vanished root self-heals on the next iteration and can never spin. The race path is unchanged. Adds a deterministic fail-on-revert regression injecting the exact missing-root mkdir ENOENT. Co-Authored-By: Claude Opus 4.8 --- plugins/agy/scripts/lib/review-workload.mjs | 12 ++++ .../scripts/lib/review-workload.mjs | 12 ++++ .../claude/scripts/lib/review-workload.mjs | 12 ++++ .../gemini/scripts/lib/review-workload.mjs | 12 ++++ plugins/grok/scripts/lib/review-workload.mjs | 12 ++++ plugins/kimi/scripts/lib/review-workload.mjs | 12 ++++ .../relay-agy/scripts/lib/review-workload.mjs | 12 ++++ .../scripts/lib/review-workload.mjs | 12 ++++ .../scripts/lib/review-workload.mjs | 12 ++++ .../scripts/lib/review-workload.mjs | 12 ++++ scripts/lib/review-workload.mjs | 12 ++++ tests/unit/review-workload.test.mjs | 60 +++++++++++++++++++ 12 files changed, 192 insertions(+) diff --git a/plugins/agy/scripts/lib/review-workload.mjs b/plugins/agy/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/plugins/agy/scripts/lib/review-workload.mjs +++ b/plugins/agy/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/plugins/api-reviewers/scripts/lib/review-workload.mjs b/plugins/api-reviewers/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/plugins/api-reviewers/scripts/lib/review-workload.mjs +++ b/plugins/api-reviewers/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/plugins/claude/scripts/lib/review-workload.mjs b/plugins/claude/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/plugins/claude/scripts/lib/review-workload.mjs +++ b/plugins/claude/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/plugins/gemini/scripts/lib/review-workload.mjs b/plugins/gemini/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/plugins/gemini/scripts/lib/review-workload.mjs +++ b/plugins/gemini/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/plugins/grok/scripts/lib/review-workload.mjs b/plugins/grok/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/plugins/grok/scripts/lib/review-workload.mjs +++ b/plugins/grok/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/plugins/kimi/scripts/lib/review-workload.mjs b/plugins/kimi/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/plugins/kimi/scripts/lib/review-workload.mjs +++ b/plugins/kimi/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/relay/relay-agy/scripts/lib/review-workload.mjs b/relay/relay-agy/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/relay/relay-agy/scripts/lib/review-workload.mjs +++ b/relay/relay-agy/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/relay/relay-gemini/scripts/lib/review-workload.mjs b/relay/relay-gemini/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/relay/relay-gemini/scripts/lib/review-workload.mjs +++ b/relay/relay-gemini/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/relay/relay-grok/scripts/lib/review-workload.mjs b/relay/relay-grok/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/relay/relay-grok/scripts/lib/review-workload.mjs +++ b/relay/relay-grok/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/relay/relay-kimi/scripts/lib/review-workload.mjs b/relay/relay-kimi/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/relay/relay-kimi/scripts/lib/review-workload.mjs +++ b/relay/relay-kimi/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/scripts/lib/review-workload.mjs b/scripts/lib/review-workload.mjs index aa2b937d..d5d0b16b 100644 --- a/scripts/lib/review-workload.mjs +++ b/scripts/lib/review-workload.mjs @@ -218,6 +218,18 @@ function acquireProviderWorkloadGate(file, env) { holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } diff --git a/tests/unit/review-workload.test.mjs b/tests/unit/review-workload.test.mjs index a2c7bd55..e28e8536 100644 --- a/tests/unit/review-workload.test.mjs +++ b/tests/unit/review-workload.test.mjs @@ -157,6 +157,66 @@ export function writeFileSync(path, data, options) { } }); +test("provider workload gate recovers when the lock root vanishes mid-acquire instead of hot-looping", async () => { + const { root, env } = tempEnv(); + const raceEnv = { ...env, RELAY_PROVIDER_WORKLOAD_GATE_TIMEOUT_MS: "750" }; + const moduleRoot = mkdtempSync(join(tmpdir(), "provider-workload-root-vanish-module-")); + try { + const source = readFileSync(new URL("../../scripts/lib/review-workload.mjs", import.meta.url), "utf8"); + const shimPath = join(moduleRoot, "fs-root-vanish-shim.mjs"); + const modulePath = join(moduleRoot, "review-workload-root-vanish.mjs"); + writeFileSync(shimPath, ` +import * as fs from "node:fs"; + +export const linkSync = fs.linkSync; +export const lstatSync = fs.lstatSync; +export const readFileSync = fs.readFileSync; +export const renameSync = fs.renameSync; +export const rmSync = fs.rmSync; +export const unlinkSync = fs.unlinkSync; +export const writeFileSync = fs.writeFileSync; + +let injected = false; + +export function mkdirSync(path, options) { + const p = String(path); + if (!injected && p.endsWith(".json.gate")) { + injected = true; + // Concurrent teardown removes the whole lock root exactly as we try to create + // the gate dir: the gate mkdir then fails ENOENT because its parent is gone. + fs.rmSync(p.slice(0, p.lastIndexOf("/")), { recursive: true, force: true }); + const error = new Error("injected mkdir ENOENT (lock root vanished)"); + error.code = "ENOENT"; + error.syscall = "mkdir"; + throw error; + } + return fs.mkdirSync(path, options); +} +`, "utf8"); + writeFileSync( + modulePath, + source.replace('from "node:fs";', `from ${JSON.stringify(pathToFileURL(shimPath).href)};`), + "utf8", + ); + + const workload = await import(pathToFileURL(modulePath).href); + const acquired = workload.acquireProviderWorkloadLease({ + provider: "grok", + jobId: "root-vanish-job", + cwd: "/tmp/root-vanish", + sourceBearing: true, + env: raceEnv, + }); + assert.equal(acquired.ok, true, + "must re-establish the vanished lock root and acquire, not spin the catch loop to the deadline"); + assert.equal(JSON.parse(readFileSync(join(root, "grok.json"), "utf8")).job_id, "root-vanish-job"); + workload.releaseProviderWorkloadLease(acquired.lease); + } finally { + rmSync(root, { recursive: true, force: true }); + rmSync(moduleRoot, { recursive: true, force: true }); + } +}); + test("provider workload lease release unregisters exit cleanup listener", () => { const { root, env } = tempEnv(); const before = process.listenerCount("exit");