diff --git a/plugins/agy/scripts/lib/review-workload.mjs b/plugins/agy/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/plugins/agy/scripts/lib/review-workload.mjs +++ b/plugins/agy/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/api-reviewers/scripts/lib/review-workload.mjs b/plugins/api-reviewers/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/plugins/api-reviewers/scripts/lib/review-workload.mjs +++ b/plugins/api-reviewers/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/claude/scripts/lib/review-workload.mjs b/plugins/claude/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/plugins/claude/scripts/lib/review-workload.mjs +++ b/plugins/claude/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/gemini/scripts/lib/review-workload.mjs b/plugins/gemini/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/plugins/gemini/scripts/lib/review-workload.mjs +++ b/plugins/gemini/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/grok/scripts/lib/review-workload.mjs b/plugins/grok/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/plugins/grok/scripts/lib/review-workload.mjs +++ b/plugins/grok/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/plugins/kimi/scripts/lib/review-workload.mjs b/plugins/kimi/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/plugins/kimi/scripts/lib/review-workload.mjs +++ b/plugins/kimi/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-agy/scripts/lib/review-workload.mjs b/relay/relay-agy/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/relay/relay-agy/scripts/lib/review-workload.mjs +++ b/relay/relay-agy/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-gemini/scripts/lib/review-workload.mjs b/relay/relay-gemini/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/relay/relay-gemini/scripts/lib/review-workload.mjs +++ b/relay/relay-gemini/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-grok/scripts/lib/review-workload.mjs b/relay/relay-grok/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/relay/relay-grok/scripts/lib/review-workload.mjs +++ b/relay/relay-grok/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/relay/relay-kimi/scripts/lib/review-workload.mjs b/relay/relay-kimi/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/relay/relay-kimi/scripts/lib/review-workload.mjs +++ b/relay/relay-kimi/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/scripts/lib/review-workload.mjs b/scripts/lib/review-workload.mjs index fcb5454d..d5d0b16b 100644 --- a/scripts/lib/review-workload.mjs +++ b/scripts/lib/review-workload.mjs @@ -211,14 +211,26 @@ function acquireProviderWorkloadGate(file, env) { release: () => releaseProviderWorkloadGate(gateDir, token), }); } catch (error) { - if (error?.code !== "EEXIST") throw error; - if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; + if (error?.code !== "EEXIST" && error?.code !== "ENOENT") throw error; if (Date.now() >= deadline) { return Object.freeze({ ok: false, holder: parseGateOwner(readGateOwnerRaw(gateDir)), }); } + // ENOENT from mkdirSync itself means the lock root (gateDir's parent) vanished + // under us — a concurrent teardown, not the gate-dir mid-acquire race. Reclaim + // would loop with no progress (renameSync also ENOENTs → true → continue), so + // re-establish the parent and sleep-pace: a persistently-missing root degrades + // to a bounded poll, never a CPU hot-loop. The race ENOENT instead comes from + // the owner-file open (syscall !== "mkdir") and falls through to reclaim, where + // the next mkdir succeeds. + if (error?.code === "ENOENT" && error?.syscall === "mkdir") { + try { mkdirSync(lockRoot(env), { recursive: true, mode: 0o700 }); } catch { /* best effort */ } + sleepSync(GATE_POLL_MS); + continue; + } + if (tryReclaimProviderWorkloadGate(gateDir, env)) continue; sleepSync(GATE_POLL_MS); } } diff --git a/tests/unit/review-workload.test.mjs b/tests/unit/review-workload.test.mjs index 018e809e..e28e8536 100644 --- a/tests/unit/review-workload.test.mjs +++ b/tests/unit/review-workload.test.mjs @@ -3,6 +3,7 @@ import assert from "node:assert/strict"; import { mkdtempSync, readFileSync, rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; +import { pathToFileURL } from "node:url"; import { PROVIDER_WORKLOAD_BLOCKED_CODE, @@ -93,6 +94,129 @@ test("provider workload lease serializes stale reclaim before removing inactive "stale reclaim must not unlink the lock path outside a serialized compare-and-retry section"); }); +test("provider workload gate retries when owner write loses the gate directory race", async () => { + const { root, env } = tempEnv(); + const moduleRoot = mkdtempSync(join(tmpdir(), "provider-workload-race-module-")); + try { + const source = readFileSync(new URL("../../scripts/lib/review-workload.mjs", import.meta.url), "utf8"); + const shimPath = join(moduleRoot, "fs-race-shim.mjs"); + const modulePath = join(moduleRoot, "review-workload-race.mjs"); + writeFileSync(shimPath, ` +import * as fs from "node:fs"; + +export const linkSync = fs.linkSync; +export const lstatSync = fs.lstatSync; +export const readFileSync = fs.readFileSync; +export const renameSync = fs.renameSync; +export const rmSync = fs.rmSync; +export const unlinkSync = fs.unlinkSync; + +let pendingGateDir = null; +let injected = false; + +export function mkdirSync(path, options) { + const result = fs.mkdirSync(path, options); + if (!injected && String(path).endsWith(".json.gate")) pendingGateDir = String(path); + return result; +} + +export function writeFileSync(path, data, options) { + if (!injected && pendingGateDir && String(path) === \`\${pendingGateDir}/owner.json\`) { + injected = true; + fs.rmSync(pendingGateDir, { recursive: true, force: true }); + const error = new Error("injected owner write ENOENT"); + error.code = "ENOENT"; + throw error; + } + return fs.writeFileSync(path, data, options); +} +`, "utf8"); + writeFileSync( + modulePath, + source.replace( + 'from "node:fs";', + `from ${JSON.stringify(pathToFileURL(shimPath).href)};`, + ), + "utf8", + ); + + const workload = await import(pathToFileURL(modulePath).href); + const acquired = workload.acquireProviderWorkloadLease({ + provider: "grok", + jobId: "race-job", + cwd: "/tmp/race", + sourceBearing: true, + env, + }); + assert.equal(acquired.ok, true); + assert.equal(JSON.parse(readFileSync(join(root, "grok.json"), "utf8")).job_id, "race-job"); + workload.releaseProviderWorkloadLease(acquired.lease); + } finally { + rmSync(root, { recursive: true, force: true }); + rmSync(moduleRoot, { recursive: true, force: true }); + } +}); + +test("provider workload gate recovers when the lock root vanishes mid-acquire instead of hot-looping", async () => { + const { root, env } = tempEnv(); + const raceEnv = { ...env, RELAY_PROVIDER_WORKLOAD_GATE_TIMEOUT_MS: "750" }; + const moduleRoot = mkdtempSync(join(tmpdir(), "provider-workload-root-vanish-module-")); + try { + const source = readFileSync(new URL("../../scripts/lib/review-workload.mjs", import.meta.url), "utf8"); + const shimPath = join(moduleRoot, "fs-root-vanish-shim.mjs"); + const modulePath = join(moduleRoot, "review-workload-root-vanish.mjs"); + writeFileSync(shimPath, ` +import * as fs from "node:fs"; + +export const linkSync = fs.linkSync; +export const lstatSync = fs.lstatSync; +export const readFileSync = fs.readFileSync; +export const renameSync = fs.renameSync; +export const rmSync = fs.rmSync; +export const unlinkSync = fs.unlinkSync; +export const writeFileSync = fs.writeFileSync; + +let injected = false; + +export function mkdirSync(path, options) { + const p = String(path); + if (!injected && p.endsWith(".json.gate")) { + injected = true; + // Concurrent teardown removes the whole lock root exactly as we try to create + // the gate dir: the gate mkdir then fails ENOENT because its parent is gone. + fs.rmSync(p.slice(0, p.lastIndexOf("/")), { recursive: true, force: true }); + const error = new Error("injected mkdir ENOENT (lock root vanished)"); + error.code = "ENOENT"; + error.syscall = "mkdir"; + throw error; + } + return fs.mkdirSync(path, options); +} +`, "utf8"); + writeFileSync( + modulePath, + source.replace('from "node:fs";', `from ${JSON.stringify(pathToFileURL(shimPath).href)};`), + "utf8", + ); + + const workload = await import(pathToFileURL(modulePath).href); + const acquired = workload.acquireProviderWorkloadLease({ + provider: "grok", + jobId: "root-vanish-job", + cwd: "/tmp/root-vanish", + sourceBearing: true, + env: raceEnv, + }); + assert.equal(acquired.ok, true, + "must re-establish the vanished lock root and acquire, not spin the catch loop to the deadline"); + assert.equal(JSON.parse(readFileSync(join(root, "grok.json"), "utf8")).job_id, "root-vanish-job"); + workload.releaseProviderWorkloadLease(acquired.lease); + } finally { + rmSync(root, { recursive: true, force: true }); + rmSync(moduleRoot, { recursive: true, force: true }); + } +}); + test("provider workload lease release unregisters exit cleanup listener", () => { const { root, env } = tempEnv(); const before = process.listenerCount("exit");