From bceddbeb1a2db2c2e0f36096931b359f7c1ddf42 Mon Sep 17 00:00:00 2001 From: Aryan Iyappan Date: Sun, 7 Jun 2026 18:59:19 +0530 Subject: [PATCH] feat(harness): git workflow automation and steer/review hardening - Auto feature branches from protected refs via harness-git-branch - Scoped commits with --only-path and git QA assertions - Headless QA finalize writes git-workflow.yaml and smoke commits - Fast steer completion, lite review preflight, inline repair - ADR 0057 and expanded tests (363 passing) Co-authored-by: pi-mono <261679550+pi-mono@users.noreply.github.com> --- .agents/skills/harness-git-commit/SKILL.md | 15 +- .pi/agents/harness/running/executor.md | 2 + .pi/extensions/harness-debate-tools.ts | 25 +- .pi/extensions/harness-live-widget.ts | 28 +- .pi/extensions/harness-run-context.ts | 334 +++++++-- .pi/extensions/policy-gate.ts | 9 +- .pi/extensions/review-integrity.ts | 29 + .pi/harness/agents.manifest.json | 4 +- .../docs/adrs/0044-harness-steer-loop.md | 8 +- .../docs/adrs/0057-fast-steer-completion.md | 48 ++ .pi/harness/docs/adrs/README.md | 2 + .pi/harness/docs/practice-map.md | 16 +- .../specs/adversary-report.schema.json | 13 + .../specs/harness-run-context.schema.json | 3 + .pi/harness/specs/repair-brief.schema.json | 24 +- .pi/harness/specs/review-outcome.schema.json | 12 + .pi/harness/specs/steer-state.schema.json | 7 + .pi/lib/ask-user/index.ts | 5 +- .pi/lib/harness-execute-postwork.ts | 110 +++ .pi/lib/harness-git-branch.d.mts | 30 + .pi/lib/harness-git-branch.mjs | 223 ++++++ .pi/lib/harness-git-qa.d.mts | 7 + .pi/lib/harness-git-qa.mjs | 58 ++ .pi/lib/harness-lite-review-precheck.ts | 56 ++ .pi/lib/harness-remediation.ts | 257 +++++++ .pi/lib/harness-repair-brief.ts | 58 +- .pi/lib/harness-run-context.ts | 502 ++++++++++++-- .pi/lib/harness-subagent-precheck.ts | 13 +- .pi/lib/harness-ui-state.ts | 115 +++- .pi/lib/plan-headless-ux.ts | 163 ++++- .pi/prompts/harness-auto.md | 2 +- .pi/prompts/harness-review.md | 22 +- 
.pi/prompts/harness-run.md | 13 + .pi/prompts/harness-steer.md | 37 +- .pi/scripts/README.md | 2 + .pi/scripts/harness-adversary-repro-pack.mjs | 147 ++++ .pi/scripts/harness-git-branch.mjs | 69 ++ .pi/scripts/harness-git-commit.mjs | 10 + .pi/scripts/harness-git-qa-assert.mjs | 32 + .pi/scripts/harness-inline-repair.mjs | 78 +++ .pi/scripts/harness-review-preflight.mjs | 112 +++ .pi/scripts/harness-steer-hygiene.d.mts | 7 + .pi/scripts/harness-steer-hygiene.mjs | 135 ++++ .pi/scripts/harness-steer-qa-seed.mjs | 141 ++++ .pi/scripts/harness-verify.mjs | 17 + test/harness-git-branch.test.mjs | 95 +++ test/harness-live-widget-status.test.ts | 402 +++++++++++ test/harness-remediation.test.mjs | 144 ++++ test/harness-repair-brief.test.mjs | 50 ++ test/harness-review-preflight.test.mjs | 36 + test/harness-run-context-postrun.test.mjs | 648 ++++++++++++++++++ test/harness-subagent-precheck-disk.test.mjs | 78 +++ test/plan-headless-git-qa.test.mjs | 138 ++++ test/plan-headless-ux.test.mjs | 60 ++ vendor/pi-subagents/src/subagents.ts | 33 +- 55 files changed, 4493 insertions(+), 191 deletions(-) create mode 100644 .pi/harness/docs/adrs/0057-fast-steer-completion.md create mode 100644 .pi/lib/harness-execute-postwork.ts create mode 100644 .pi/lib/harness-git-branch.d.mts create mode 100644 .pi/lib/harness-git-branch.mjs create mode 100644 .pi/lib/harness-git-qa.d.mts create mode 100644 .pi/lib/harness-git-qa.mjs create mode 100644 .pi/lib/harness-lite-review-precheck.ts create mode 100644 .pi/scripts/harness-adversary-repro-pack.mjs create mode 100644 .pi/scripts/harness-git-branch.mjs create mode 100644 .pi/scripts/harness-git-qa-assert.mjs create mode 100644 .pi/scripts/harness-inline-repair.mjs create mode 100644 .pi/scripts/harness-review-preflight.mjs create mode 100644 .pi/scripts/harness-steer-hygiene.d.mts create mode 100644 .pi/scripts/harness-steer-hygiene.mjs create mode 100644 .pi/scripts/harness-steer-qa-seed.mjs create mode 100644 
test/harness-git-branch.test.mjs create mode 100644 test/harness-remediation.test.mjs create mode 100644 test/harness-repair-brief.test.mjs create mode 100644 test/harness-review-preflight.test.mjs create mode 100644 test/harness-subagent-precheck-disk.test.mjs create mode 100644 test/plan-headless-git-qa.test.mjs diff --git a/.agents/skills/harness-git-commit/SKILL.md b/.agents/skills/harness-git-commit/SKILL.md index 94670f9a..3d7aafaa 100644 --- a/.agents/skills/harness-git-commit/SKILL.md +++ b/.agents/skills/harness-git-commit/SKILL.md @@ -27,8 +27,13 @@ Deterministic commits via bundled CLI. Config merges **project** `.pi/auto-commi ```bash node "$UP_PKG/.pi/scripts/harness-auto-commit-bootstrap.mjs" ``` -3. Stage files: `git add …` (CLI does not stage). -4. Commit via CLI (examples): +3. When `branch.strategy` is `auto-feature-branch`, ensure working branch before execute: + ```bash + node "$UP_PKG/.pi/scripts/harness-git-branch.mjs" \ + --run-id "" --run-dir "" --project-root "" + ``` +4. Stage files: `git add …` (CLI does not stage). +5. Commit via CLI (examples): ```bash # Conventional subject from config template node "$UP_PKG/.pi/scripts/harness-git-commit.mjs" \ @@ -41,6 +46,10 @@ Deterministic commits via bundled CLI. Config merges **project** `.pi/auto-commi # Amend: preserve body, ensure trailer node "$UP_PKG/.pi/scripts/harness-git-commit.mjs" --amend --message "$(git log -1 --format=%B)" + # Scoped commit (ignores other staged paths) + node "$UP_PKG/.pi/scripts/harness-git-commit.mjs" \ + --only-path "path/to/file.ts" --type chore --scope harness --subject "scoped change" + # Preview only node "$UP_PKG/.pi/scripts/harness-git-commit.mjs" --dry-run --subject "preview" ``` @@ -55,6 +64,8 @@ Deterministic commits via bundled CLI. 
Config merges **project** `.pi/auto-commi | `message.coAuthorTrailer` | `Co-authored-by: {login} <{email}>` | | `coAuthor.login` / `coAuthor.email` | Attribution (project overrides package) | | `coAuthor.required` | When false, skip trailer (default true) | +| `branch.strategy` | `auto-feature-branch` creates `harness/` from protected branches | +| `branch.protected` | Branch globs that trigger auto feature branch (default main/master/release/*) | Edit project file to change format or co-author for external repos. diff --git a/.pi/agents/harness/running/executor.md b/.pi/agents/harness/running/executor.md index 7725a288..a764fa05 100644 --- a/.pi/agents/harness/running/executor.md +++ b/.pi/agents/harness/running/executor.md @@ -14,6 +14,8 @@ Implement the approved plan with surgical diffs and strict scope control. The pa When spawn context sets `mode: repair`, read `repair_brief_path` (typically `artifacts/repair-brief.yaml`). Fix only what the brief lists — failed acceptance checks, `fix_directives`, and `priority_lake_ids`. Directives prefixed `[sentrux:…]` come from `artifacts/sentrux-repair-plan.yaml` (merged by the parent); treat them as structural fixes before widening scope. Optional context: `artifacts/sentrux-diagnostics.json` for hotspot ordering only — do not re-run Sentrux CLI unless the brief asks. Do **not** widen scope beyond `plan_packet_path`. Set `repair_attempt` in handoff metadata when the schema allows. +**Repro gate:** When `must_pass_before_handoff: true`, run every `repro_commands` entry from the brief (shell-safe commands only) before `submit_executor_handoff`. Record outcomes in `validation_summary`. If a step is non-shell (`repro_skipped`), document why and still run `verification_commands` when listed. + ## Process 1. Read the approved `PlanPacket` at `plan_packet_path` from spawn context; extract allowed scope before any mutation. 
Approval is recorded in `run-context.yaml` (`plan_ready: true`) and subprocess policy bootstrap — not as a field inside `plan-packet.yaml`. diff --git a/.pi/extensions/harness-debate-tools.ts b/.pi/extensions/harness-debate-tools.ts index 14a5c8d8..f65c8ec1 100644 --- a/.pi/extensions/harness-debate-tools.ts +++ b/.pi/extensions/harness-debate-tools.ts @@ -168,16 +168,25 @@ function registerHarnessDebateHandler1(pi: ExtensionAPI) { if (applied.length === 0) return; const status = await getPlanDebateRoundStatus(rd, lastRound, runId); + const nextBody = [ + "**Debate lane auto-applied from subagent output**", + ...applied, + "", + status.next_tool + ? `**Required next tool (do not stop with prose only):** ${status.next_tool}` + : "Check harness_debate_round_status for this round.", + ].join("\n"); + if (isHarnessNonInteractive()) { + pi.appendEntry("harness-debate-next-step", { + applied, + status, + recorded_at: new Date().toISOString(), + }); + return; + } pi.sendMessage({ customType: "harness-debate-next-step", - content: [ - "**Debate lane auto-applied from subagent output**", - ...applied, - "", - status.next_tool - ? 
`**Required next tool (do not stop with prose only):** ${status.next_tool}` - : "Check harness_debate_round_status for this round.", - ].join("\n"), + content: nextBody, display: true, details: { applied, status }, }); diff --git a/.pi/extensions/harness-live-widget.ts b/.pi/extensions/harness-live-widget.ts index 960f25fd..92b911cc 100644 --- a/.pi/extensions/harness-live-widget.ts +++ b/.pi/extensions/harness-live-widget.ts @@ -3,7 +3,12 @@ import type { ExtensionContext, } from "@earendil-works/pi-coding-agent"; import { isHarnessProjectEnabled } from "../lib/harness-project-config.js"; -import { evaluateCrossSessionResume } from "../lib/harness-run-context.js"; +import { + evaluateCrossSessionResume, + hasConfirmedClearAfterLatestRunContext, + isRunIdTombstonedByConfirmedHarnessClear, + runIdFromCrossSessionResumeCommand, +} from "../lib/harness-run-context.js"; import { buildHarnessProgressStatusLine } from "../lib/harness-subagent-progress.js"; import { deriveHarnessStatusHint, @@ -308,7 +313,7 @@ export default function harnessLiveWidget(pi: ExtensionAPI) { }); pi.events.on("harness-run-context:updated", () => { - stateStore.setCrossSessionResumeCommand(null); + stateStore.acknowledgeRunContextUpdated(); if (mountCtx) scheduleRefresh(mountCtx); }); @@ -327,9 +332,28 @@ export default function harnessLiveWidget(pi: ExtensionAPI) { : null; const cmd = typeof data?.resume_command === "string" ? data.resume_command : null; + if (mountCtx) { + const entries = mountCtx.sessionManager.getEntries(); + const runId = runIdFromCrossSessionResumeCommand(cmd); + if ( + hasConfirmedClearAfterLatestRunContext(entries) || + (runId + ? 
isRunIdTombstonedByConfirmedHarnessClear(entries, runId) + : false) + ) { + stateStore.clearActiveRunState(entries.length); + scheduleRefresh(mountCtx); + return; + } + } stateStore.setCrossSessionResumeCommand(cmd); if (mountCtx) scheduleRefresh(mountCtx); }); + pi.events.on("harness-runs-cleared", () => { + const entryCount = mountCtx?.sessionManager.getEntries().length ?? 0; + stateStore.clearActiveRunState(entryCount); + if (mountCtx) scheduleRefresh(mountCtx); + }); pi.events.on("harness-project-enabled:changed", (payload: unknown) => { const data = diff --git a/.pi/extensions/harness-run-context.ts b/.pi/extensions/harness-run-context.ts index cb0f5753..6f046b5d 100644 --- a/.pi/extensions/harness-run-context.ts +++ b/.pi/extensions/harness-run-context.ts @@ -25,19 +25,28 @@ import { import { runAskUser } from "../lib/ask-user/index.js"; import { isHarnessNonInteractive } from "../lib/ask-user/policy.js"; import { claimHarnessGovernanceLoad } from "../lib/extension-load-guard.js"; +import { + executePostWorkEnabled, + formatExecutorHandoffBrief, + runExecutePostWork, +} from "../lib/harness-execute-postwork.js"; +import { + ensureHarnessGitBranch, + writeGitWorkflowArtifact, +} from "../lib/harness-git-branch.mjs"; import { getHarnessPackageRoot } from "../lib/harness-paths.js"; import { buildPhaseCompletedPayload, phaseTerminalArtifact, } from "../lib/harness-phase-telemetry.js"; import { captureHarnessEvent } from "../lib/harness-posthog.js"; +import { steerBurstAllowed } from "../lib/harness-remediation.js"; import { blockingHarnessAutoCommandReason, blockingReviewCommandReason, blockingRunCommandReason, blockingSteerCommandReason, buildHarnessClearManifest, - canonicalPlanPath, claimRunOwnership, createFreshRunContext, criticalPathWorkItemIdsFromPlanPacket, @@ -57,16 +66,19 @@ import { type HarnessRunContext, type HarnessTurnEntry, harnessAutoTasksDiffer, + hasConfirmedClearAfterLatestRunContext, hasHarnessAbortSignal, hasPlanUserApproval, 
indexOfLastPlanCommand, inferHarnessPhase, + invalidateEvalVerdictAfterRepair, isAmendPlanAllowed, isHarnessBootstrapPrompt, isNewTaskPlanBlocked, isPlanApprovalAskUser, isPlanPhaseScopedWrite, isStaleActiveRunPointer, + isSteerBurstArgs, loadProjectActiveRun, loadRunContextFromDisk, nextStepAfterOutcome, @@ -78,8 +90,11 @@ import { parseHarnessUseRunArgs, parsePlanApprovalFromMessage, planPacketSummary, + readAdversaryReportFromRun, + readEvalVerdictFromRun, readExecutorHandoffFromRun, readPlanPacketFromPath, + readRepairBriefFromRun, readReviewOutcomeFromRun, reconcileReviewRouting, reconcileStaleExecuteCompletion, @@ -92,6 +107,7 @@ import { resolveHarnessRunPostAgentState, resolveHarnessRunWriteTarget, resolveRemediationClassForRun, + resolveSteerEntryEffects, saveProjectActiveRun, saveRunContextToDisk, sessionHasResumePromptForRun, @@ -100,6 +116,7 @@ import { steerMaxAttemptsFromEnv, syncPlanLastOutcomeFromTaskClarification, syncPlanReadyFromDisk, + updateSteerStateOnEntry, userVisiblePromptSlice, validatePlanOverridePath, validatePlanPacket, @@ -117,6 +134,7 @@ import { isReviewRoundYamlWriteAllowed } from "../lib/plan-debate-write-guard.js import { endHeadlessHarnessPrintSession, maybeForceHeadlessPlanProgress, + maybeHeadlessGitQaFinalizeOnRun, maybeHeadlessQaAutoExecuteSmoke, seedHeadlessTaskClarificationIfNeeded, shouldEndHeadlessHarnessPrintSession, @@ -411,7 +429,7 @@ function hydrateFromSession(entries: unknown[]): HarnessRunContext | null { } async function hydrateFromDisk( - sessionId: string, + _sessionId: string, projectRoot: string, entries: unknown[], ): Promise { @@ -419,37 +437,19 @@ async function hydrateFromDisk( if (fromSession) { return reconcileStaleExecuteCompletion(projectRoot, fromSession, entries); } + if (hasConfirmedClearAfterLatestRunContext(entries)) return null; const pointer = await loadProjectActiveRun(projectRoot); if (!pointer || isStaleActiveRunPointer(pointer, projectRoot)) return null; const disk = await 
loadRunContextFromDisk(pointer.run_id, projectRoot); - if (disk) { - const clar = await syncPlanLastOutcomeFromTaskClarification( - projectRoot, - disk, - ); - const planSynced = await syncPlanReadyFromDisk(projectRoot, clar, entries); - return reconcileStaleExecuteCompletion(projectRoot, planSynced, entries); - } - - return { - schema_version: "1.0.0", - run_id: pointer.run_id, - pi_session_id: sessionId, - project_root: projectRoot, - phase: pointer.phase, - plan_id: pointer.plan_id, - plan_packet_path: canonicalPlanPath(pointer.run_id, projectRoot), - plan_ready: pointer.plan_ready, - task_summary: null, - status: "active", - last_completed_step: null, - last_outcome: null, - next_recommended_command: null, - owner_pi_session_id: pointer.owner_pi_session_id, - updated_at: pointer.updated_at, - }; + if (!disk) return null; + const clar = await syncPlanLastOutcomeFromTaskClarification( + projectRoot, + disk, + ); + const planSynced = await syncPlanReadyFromDisk(projectRoot, clar, entries); + return reconcileStaleExecuteCompletion(projectRoot, planSynced, entries); } function needsClarificationFollowUp(ctx: HarnessRunContext | null): boolean { @@ -467,9 +467,15 @@ async function offerCrossSessionResume( }, ): Promise { const projectRoot = process.cwd(); - const entries = getEntries(ctx); - const info = await evaluateCrossSessionResume(projectRoot, entries); - if (!info || sessionHasResumePromptForRun(entries, info.runId)) return; + const info = await evaluateCrossSessionResume(projectRoot, getEntries(ctx)); + if (!info) return; + const entriesAfter = getEntries(ctx); + if ( + sessionHasResumePromptForRun(entriesAfter, info.runId) || + !(await evaluateCrossSessionResume(projectRoot, entriesAfter)) + ) { + return; + } const content = formatCrossSessionResumeMessage(info); pi.appendEntry("harness-session-resume-prompt", { @@ -901,7 +907,11 @@ function registerHarnessRunStatusCommand( const sessionId = ctx.sessionManager.getSessionId(); const projectRoot = 
process.cwd(); const entries = getEntries(ctx); - let ctxState = getLatestRunContext(entries) ?? active.get(); + if (hasConfirmedClearAfterLatestRunContext(entries)) active.set(null); + + let ctxState = + getLatestRunContext(entries) ?? + (hasConfirmedClearAfterLatestRunContext(entries) ? null : active.get()); if (!ctxState) ctxState = await hydrateFromDisk(sessionId, projectRoot, entries); if (!ctxState) { @@ -1412,33 +1422,111 @@ function registerHeadlessPlanProgressWatcher( }); } +const EXECUTOR_AGENT_ID = "harness/running/executor"; + +function subagentResultsFromDetails( + details: unknown, +): Array<{ agent?: string }> { + const d = details as { results?: Array<{ agent?: string }> }; + return d?.results ?? []; +} + +async function reconcileExecutorHandoffFromParent(input: { + pi: ExtensionAPI; + ctx: { + hasUI: boolean; + ui: { notify(message: string, type?: "info" | "warning" | "error"): void }; + sessionManager: { getEntries(): unknown[] }; + abort?: () => void; + }; + active: ActiveContextAccess; + runPostWork?: boolean; +}): Promise { + const entries = getEntries(input.ctx); + const runCtx = getLatestRunContext(entries) ?? input.active.get(); + if (!runCtx?.run_id) return; + const projectRoot = process.cwd(); + if (input.runPostWork && executePostWorkEnabled()) { + const post = await runExecutePostWork({ + projectRoot, + runId: runCtx.run_id, + moduleUrl: MODULE_URL, + }); + if (post.notes.length > 0) { + input.pi.appendEntry("harness-execute-postwork", { + run_id: runCtx.run_id, + ...post, + recorded_at: nowIso(), + }); + } + } + const refreshed = await refreshRunContextProgress( + projectRoot, + runCtx, + entries, + ); + Object.assign(runCtx, refreshed); + input.active.set(runCtx); + persistContext(input.pi, runCtx); + if (refreshed.last_completed_step !== "execute") return; + + const handoff = await readExecutorHandoffFromRun(runCtx.run_id, projectRoot); + const notify = `Execute finished (${refreshed.last_outcome ?? "done"}). 
Next: ${refreshed.next_recommended_command ?? "/harness-review"}`; + input.pi.appendEntry("harness-step-handoff", { + next_command: refreshed.next_recommended_command, + execution_status: refreshed.last_outcome, + phase: refreshed.phase, + source: "executor_reconcile", + }); + if (!isHarnessNonInteractive()) { + input.pi.appendEntry("harness-executor-handoff-brief", { + run_id: runCtx.run_id, + brief: formatExecutorHandoffBrief(handoff), + recorded_at: nowIso(), + }); + } + if (input.ctx.hasUI) input.ctx.ui.notify(notify, "info"); + + const parsed = latestParsedHarnessCommand(entries); + if ( + isHarnessNonInteractive() && + parsed?.command === "harness-run" && + (await shouldEndHeadlessHarnessPrintSession({ + command: parsed.command, + runCtx, + projectRoot, + })) + ) { + endHeadlessHarnessPrintSession(input.ctx); + } +} + function registerExecutorHandoffReconcile( pi: ExtensionAPI, active: ActiveContextAccess, ): void { pi.on("tool_result", async (event, ctx) => { - if (event.isError || event.toolName !== "submit_executor_handoff") return; - const entries = getEntries(ctx); - const runCtx = getLatestRunContext(entries) ?? active.get(); - if (!runCtx?.run_id) return; - const projectRoot = process.cwd(); - const refreshed = await refreshRunContextProgress( - projectRoot, - runCtx, - entries, - ); - Object.assign(runCtx, refreshed); - active.set(runCtx); - persistContext(pi, runCtx); - if (refreshed.last_completed_step === "execute") { - const notify = `Execute finished (${refreshed.last_outcome ?? "done"}). Next: ${refreshed.next_recommended_command ?? 
"/harness-review"}`; - pi.appendEntry("harness-step-handoff", { - next_command: refreshed.next_recommended_command, - execution_status: refreshed.last_outcome, - phase: refreshed.phase, + if (event.isError) return; + if (event.toolName === "submit_executor_handoff") { + await reconcileExecutorHandoffFromParent({ + pi, + ctx, + active, + runPostWork: false, }); - if (ctx.hasUI) ctx.ui.notify(notify, "info"); + return; } + if (event.toolName !== "subagent") return; + const hasExecutor = subagentResultsFromDetails(event.details).some( + (r) => r.agent === EXECUTOR_AGENT_ID, + ); + if (!hasExecutor) return; + await reconcileExecutorHandoffFromParent({ + pi, + ctx, + active, + runPostWork: true, + }); }); } @@ -1716,7 +1804,11 @@ async function handlePreResolvedHarnessCommand(args: { !isHarnessBootstrapPrompt(userPrompt) && !hasHarnessAbortSignal(userPrompt) ) { - const policyBlock = getPolicyTransitionBlock(userPrompt, entries); + const policyBlock = getPolicyTransitionBlock( + userPrompt, + entries, + activeCtx, + ); if (policyBlock.blocked) { return { activeCtx, @@ -1823,6 +1915,8 @@ async function handleBeforeAgentStart(input: { const sessionId = input.ctx.sessionManager.getSessionId(); const projectRoot = process.cwd(); const entries = getEntries(input.ctx); + if (hasConfirmedClearAfterLatestRunContext(entries)) input.active.set(null); + const userPrompt = userVisiblePromptSlice(input.event.prompt); const turn = getLatestHarnessTurn(entries); const parsed = turn @@ -1961,6 +2055,30 @@ async function handleBeforeAgentStart(input: { entries, ); if (runBlockReason) return blockRunContextMessage(runBlockReason); + if ( + (command === "harness-run" || command === "harness-auto") && + activeCtx.plan_ready + ) { + const runDir = join( + projectRoot, + ".pi", + "harness", + "runs", + activeCtx.run_id, + ); + try { + const branchResult = await ensureHarnessGitBranch({ + projectRoot, + runId: activeCtx.run_id, + upPkg: getHarnessPackageRoot(MODULE_URL), + }); + await 
writeGitWorkflowArtifact({ runDir, result: branchResult }); + } catch (err) { + console.warn( + `[harness-run-context] git branch ensure failed: ${err instanceof Error ? err.message : err}`, + ); + } + } const reviewBlockReason = await blockingReviewCommandReason( command, activeCtx, @@ -1973,6 +2091,54 @@ async function handleBeforeAgentStart(input: { projectRoot, ); if (steerBlockReason) return blockRunContextMessage(steerBlockReason); + if (command === "harness-steer") { + const steerEffects = await resolveSteerEntryEffects( + activeCtx.run_id, + projectRoot, + args, + ); + activeCtx.steer_max_attempts = + activeCtx.steer_max_attempts ?? steerMaxAttemptsFromEnv(); + activeCtx = await updateSteerStateOnEntry( + activeCtx.run_id, + projectRoot, + steerEffects, + activeCtx, + ); + activeCtx.phase = "execute"; + if (steerEffects.markBurstUsed) { + activeCtx.inline_repair_attempted = true; + } + input.active.set(activeCtx); + persistContext(input.pi, activeCtx); + syncPolicyFromRunContext(input.pi, entries, activeCtx); + if (process.env.HARNESS_QA_SMOKE === "1" && steerEffects.skipExecutor) { + const runDir = join( + projectRoot, + ".pi", + "harness", + "runs", + activeCtx.run_id, + ); + try { + const { runHarnessSteerHygiene } = await import( + "../scripts/harness-steer-hygiene.mjs" + ); + await runHarnessSteerHygiene({ runDir, projectRoot }); + activeCtx.last_completed_step = "steer"; + activeCtx.last_outcome = "completed"; + activeCtx.next_recommended_command = "/harness-review"; + activeCtx.phase = "evaluate"; + input.active.set(activeCtx); + persistContext(input.pi, activeCtx); + syncPolicyFromRunContext(input.pi, entries, activeCtx); + } catch (err) { + console.warn( + `[harness-run-context] QA steer hygiene failed: ${err instanceof Error ? 
err.message : err}`, + ); + } + } + } const { planSummary, planPacketForSpawn } = await readPlanSpawnState(activeCtx); const { activePlanBlock, planMode, contextSpawnOpts } = @@ -2035,8 +2201,35 @@ async function handleBeforeAgentStart(input: { gateBlock = formatPlanHumanGateBlock(gateStatus); } const gateSuffix = gateBlock ? `\n\n${gateBlock}` : ""; + let commandBlock = ""; + if (command === "harness-review") { + const runDir = join( + projectRoot, + ".pi", + "harness", + "runs", + activeCtx.run_id, + ); + commandBlock = `\n\n## Review Phase 1 preflight (required before evaluators)\nRun deterministic shell in this session, then hard-gate:\n\`\`\`bash\nnode "$UP_PKG/.pi/scripts/harness-verify.mjs"\nnode "$UP_PKG/.pi/scripts/harness-review-preflight.mjs" --run-dir "${runDir}" --steer-attempt ${activeCtx.steer_attempt ?? 0}\n\`\`\`\nInclude \`steer_attempt\` on \`artifacts/benchmark-log.yaml\`. After steer repair, run \`harness-adversary-repro-pack.mjs\` before lite-review adversary skip.\nDo **not** embed executor repair in this session — use \`/harness-steer\` or \`/harness-steer --burst\`.`; + } + if (command === "harness-steer") { + const brief = await readRepairBriefFromRun(activeCtx.run_id, projectRoot); + const runDir = join( + projectRoot, + ".pi", + "harness", + "runs", + activeCtx.run_id, + ); + if (brief?.gap_kind === "hygiene") { + commandBlock = `\n\n## Hygiene steer\n\`gap_kind: hygiene\` — run hygiene script **before** spawning executor:\n\`\`\`bash\nnode "$UP_PKG/.pi/scripts/harness-steer-hygiene.mjs" --run-dir "${runDir}" --project-root "${projectRoot}"\n\`\`\`\nDo **not** spawn \`harness/running/executor\` for hygiene-only gaps. 
Then \`/harness-review\`.`; + } + if (isSteerBurstArgs(args)) { + commandBlock += `\n\n## Burst steer\nPreflight:\n\`\`\`bash\nnode "$UP_PKG/.pi/scripts/harness-inline-repair.mjs" --run-dir "${runDir}"\n\`\`\`\nRequires eval pass + adversary \`block_merge\` on disk and \`HARNESS_STEER_BURST=1\`.`; + } + } return { - systemPrompt: `${input.event.systemPrompt}\n\n${formatPlanContextBlock(activeCtx, contextSpawnOpts)}${activePlanBlock ? `\n\n${activePlanBlock}` : ""}${gateSuffix}`, + systemPrompt: `${input.event.systemPrompt}\n\n${formatPlanContextBlock(activeCtx, contextSpawnOpts)}${activePlanBlock ? `\n\n${activePlanBlock}` : ""}${gateSuffix}${commandBlock}`, }; } @@ -2259,6 +2452,12 @@ async function handleAgentEnd(input: { runCtx: activeCtx, command: parsed.command, }); + await maybeHeadlessGitQaFinalizeOnRun({ + projectRoot, + runCtx: activeCtx, + command: parsed.command, + upPkg: getHarnessPackageRoot(MODULE_URL), + }); persistContext(input.pi, activeCtx); if ( await shouldEndHeadlessHarnessPrintSession({ @@ -2290,13 +2489,24 @@ async function handleAgentEnd(input: { activeCtx.plan_ready, ); Object.assign(activeCtx, runPost); + if ( + parsed?.command === "harness-run" || + parsed?.command === "harness-auto" + ) { + await maybeHeadlessGitQaFinalizeOnRun({ + projectRoot, + runCtx: activeCtx, + command: parsed.command, + upPkg: getHarnessPackageRoot(MODULE_URL), + }); + } } if (parsed?.command === "harness-steer") { activeCtx.last_completed_step = "steer"; - activeCtx.steer_attempt = (activeCtx.steer_attempt ?? 0) + 1; activeCtx.steer_max_attempts = activeCtx.steer_max_attempts ?? 
steerMaxAttemptsFromEnv(); activeCtx.phase = "execute"; + await invalidateEvalVerdictAfterRepair(activeCtx.run_id, projectRoot); syncPolicyFromRunContext(input.pi, entries, activeCtx); } if ( @@ -2323,6 +2533,19 @@ async function handleAgentEnd(input: { activeCtx.run_id, projectRoot, ); + const adversaryReport = await readAdversaryReportFromRun( + activeCtx.run_id, + projectRoot, + ); + const evalVerdict = await readEvalVerdictFromRun( + activeCtx.run_id, + projectRoot, + ); + const burstAllowed = steerBurstAllowed( + evalVerdict, + adversaryReport, + activeCtx.inline_repair_attempted, + ); const reviewComplete = activeCtx.last_completed_step === "review" || activeCtx.last_completed_step === "adversary"; @@ -2339,6 +2562,7 @@ async function handleAgentEnd(input: { steerAttempt: activeCtx.steer_attempt ?? 0, steerMaxAttempts: activeCtx.steer_max_attempts ?? steerMaxAttemptsFromEnv(), reviewComplete, + burstAllowed, }); activeCtx.next_recommended_command = next; activeCtx.updated_at = new Date().toISOString(); diff --git a/.pi/extensions/policy-gate.ts b/.pi/extensions/policy-gate.ts index 30ea39a3..dadb4fbc 100644 --- a/.pi/extensions/policy-gate.ts +++ b/.pi/extensions/policy-gate.ts @@ -25,6 +25,7 @@ import { isPlanPhaseScopedWrite, normalizeHarnessPath, parseHarnessSlashInput, + policyStateFromDiskIfNeeded, readPlanPacketFromPath, saveProjectActiveRun, saveRunContextToDisk, @@ -213,7 +214,13 @@ async function handlePolicyBeforeAgentStart(args: { const nextPhase = inferHarnessPhase(entries, userPrompt); const planSignal = hasApprovedPlanSignal(userPrompt, entries); - const transitionBlock = getPolicyTransitionBlock(userPrompt, entries); + const diskPolicy = await policyStateFromDiskIfNeeded(entries, process.cwd()); + const transitionBlock = getPolicyTransitionBlock( + userPrompt, + entries, + getLatestRunContext(entries), + diskPolicy, + ); if (transitionBlock.blocked) { return { message: { diff --git a/.pi/extensions/review-integrity.ts 
b/.pi/extensions/review-integrity.ts index 60f23194..fc3ce310 100644 --- a/.pi/extensions/review-integrity.ts +++ b/.pi/extensions/review-integrity.ts @@ -24,6 +24,31 @@ const REVIEW_SUBAGENT_TYPES = new Set([ const EXECUTOR_SUBAGENT_TYPE = "harness/running/executor"; const PLANNING_SUBAGENT_PREFIX = "harness/planning/"; +/** ADR 0057: Phase 1 deterministic scripts allowed in review phase (same executor session). */ +const PHASE1_ALLOWLIST = [ + /harness-verify\.mjs/, + /harness-review-preflight\.mjs/, + /harness-adversary-repro-pack\.mjs/, + /harness-sentrux-report\.mjs/, + /harness-sentrux-diagnostics\.mjs/, + /harness-ls-lint-cli\.mjs/, +]; + +function bashCommandFromToolInput(input: unknown): string { + if (!input || typeof input !== "object") return ""; + const rec = input as Record; + if (typeof rec.command === "string") return rec.command; + if (typeof rec.cmd === "string") return rec.cmd; + return ""; +} + +function isPhase1AllowlistedShell(toolName: string, input: unknown): boolean { + if (toolName !== "bash" && toolName !== "shell") return false; + const cmd = bashCommandFromToolInput(input); + if (!cmd) return false; + return PHASE1_ALLOWLIST.some((p) => p.test(cmd)); +} + interface IsolationState { executorSessionId: string | null; violationActive: boolean; @@ -311,6 +336,10 @@ export default function reviewIntegrity(pi: ExtensionAPI) { if (!state.violationActive) return undefined; + if (isPhase1AllowlistedShell(event.toolName, event.input)) { + return undefined; + } + await appendIncident({ type: "review_integrity_violation", session_id: ctx.sessionManager.getSessionId(), diff --git a/.pi/harness/agents.manifest.json b/.pi/harness/agents.manifest.json index 338bc143..d2ac91f4 100644 --- a/.pi/harness/agents.manifest.json +++ b/.pi/harness/agents.manifest.json @@ -2,7 +2,7 @@ "schema_version": "1.0.0", "package": "ultimate-pi", "package_version": "0.25.0", - "generated_at": "2026-05-27T15:57:32.501Z", + "generated_at": "2026-06-07T10:38:00.046Z", 
"policy_sha256": "1a631333f1abed3b411961d3527bcae2d4fcd2f715b09a689b0b83b3ea0f54f3", "agents": { "pi-pi/agent-expert": { @@ -95,7 +95,7 @@ }, "harness/running/executor": { "path": ".pi/agents/harness/running/executor.md", - "sha256": "e8710179def62a9adaa63ba5b05c3f36dee95da6fd751ef34be773bbee65a5c2" + "sha256": "2372e4485af6b51ce47b5a6ff59a0ee7cecd28ea79e7ae0ace903dba9f362598" }, "harness/reviewing/adversary": { "path": ".pi/agents/harness/reviewing/adversary.md", diff --git a/.pi/harness/docs/adrs/0044-harness-steer-loop.md b/.pi/harness/docs/adrs/0044-harness-steer-loop.md index 0f25d4b0..abc01002 100644 --- a/.pi/harness/docs/adrs/0044-harness-steer-loop.md +++ b/.pi/harness/docs/adrs/0044-harness-steer-loop.md @@ -14,9 +14,9 @@ After `/harness-run`, failed benchmarks or blocked execution previously routed u 3. **Remediation routing** — `review-outcome.remediation_class`: `implementation_gap` → `/harness-steer`; `plan_gap` → `/harness-plan` revise with `repair_brief_path`; `pass` → policy status. **Review outcome wins** over executor `scope_drift` when they disagree; tie → `plan_gap`. 4. **Plan-gap revise reset** — When review returns `plan_gap` and the next `/harness-plan` runs in revise mode, archive stale plan-phase debate state and generated planning artifacts under `artifacts/revisions//` before the planner starts. Preserve review repair artifacts in place so the new planning round starts clean while retaining audit history. 5. **`/harness-steer`** — Thin orchestrator: read briefs, set policy **phase `execute`**, spawn `harness/executor` with `mode: repair`, then `/harness-review` again. -6. **Caps** — `HARNESS_STEER_MAX_ATTEMPTS` (default 3). **Tiered review:** full review on initial run + steer 1; steers 2+ use lite (benchmark + verdict) unless prior `block_merge` or user forces full. -6. **Sentrux** — Refresh baseline or compare new violations only after steer mutations (avoid false degraded on every attempt). -7. 
**Evaluate-phase writes** — Orchestrator may write review/steer YAML under run `artifacts/` in `evaluate`/`adversary` phase (allowlisted files). +6. **Caps** — `HARNESS_STEER_MAX_ATTEMPTS` (default 3). **Tiered review:** full review on initial run + steer 1; steers 2+ use lite (benchmark + verdict) unless prior `block_merge` on disk, `adversary_repro` fail, or user forces full (see ADR 0057). **Burst:** when eval pass + adversary `block_merge`, `HARNESS_STEER_BURST=1` grants one extra steer slot via `/harness-steer --burst`. **Hygiene:** `gap_kind: hygiene` at steer entry runs `harness-steer-hygiene.mjs` without incrementing `steer_attempt` (`hygiene_repairs` increments instead). +7. **Sentrux** — Refresh baseline or compare new violations only after steer mutations (avoid false degraded on every attempt). +8. **Evaluate-phase writes** — Orchestrator may write review/steer YAML under run `artifacts/` in `evaluate`/`adversary` phase (allowlisted files). ## Consequences @@ -34,4 +34,4 @@ After `/harness-run`, failed benchmarks or blocked execution previously routed u - `.pi/prompts/harness-steer.md` - `.pi/harness/specs/review-outcome.schema.json`, `repair-brief.schema.json` - `nextStepAfterOutcome` in `.pi/lib/harness-run-context.ts` -- ADR 0039 (amended), 0043 +- ADR 0039 (amended), 0043, 0057 diff --git a/.pi/harness/docs/adrs/0057-fast-steer-completion.md b/.pi/harness/docs/adrs/0057-fast-steer-completion.md new file mode 100644 index 00000000..b0099205 --- /dev/null +++ b/.pi/harness/docs/adrs/0057-fast-steer-completion.md @@ -0,0 +1,48 @@ +# ADR 0057: Fast steer completion (split verdict, hygiene, burst) + +- **Status:** Accepted +- **Date:** 2026-06-07 +- **Amends:** [0044](0044-harness-steer-loop.md), [0039](0039-harness-post-run-review-gate.md) + +## Context + +Post-run review could deadlock when eval passed but adversary blocked merge: routing treated eval pass as success while adversary demanded repair. 
Hygiene failures (lint/format) consumed full executor steer attempts. Phase 1 shell in `/harness-review` raced evaluator spawns without a freshness gate. Lite-review adversary skip trusted session `last_outcome` instead of disk `block_merge`. + +## Decision + +### P0 — Foundation + +1. **`synthesizeReviewOutcome`** — Canonical merge of eval + adversary (+ benchmark). Split fields: `eval_status`, `adversary_status`, `gap_kind`. Eval pass + adversary `block_merge` → `implementation_gap` (not pass). +2. **Disk-backed precheck** — `priorBlockMergeFromDisk` reads `artifacts/adversary-report.yaml` and `review-outcome.yaml`; lite skip only when repro pack passed (`benchmark-log.adversary_repro: pass`). +3. **Phase 1 preflight** — `harness-review-preflight.mjs` hard-gates evaluator spawns; `review-integrity` allowlists Phase 1 bash scripts in the review session. +4. **Hygiene at steer start (Option A)** — `gap_kind: hygiene` runs `harness-steer-hygiene.mjs` at `/harness-steer` entry; increments `hygiene_repairs` only (not `steer_attempt`). + +### P1 — Repair brief + +5. **`repair-brief` 1.1.0** — `repro_commands`, `repro_skipped`, `verification_commands`, `must_pass_before_handoff`, `gap_kind`. +6. **Executor repro gate** — Run `repro_commands` before handoff when `must_pass_before_handoff: true`. + +### P2 — Burst + defer inline repair + +7. **No fused executor inside `/harness-review`** — Use `/harness-steer --burst` with `harness-inline-repair.mjs` preflight. +8. **`HARNESS_STEER_BURST` default 0** — Burst allowed only when eval pass + adversary `block_merge` on disk; `effectiveSteerMaxAttempts = base + 1` when burst allowed. +9. **`harness-adversary-repro-pack.mjs`** — Freshness guard before lite adversary skip. + +## Consequences + +### Positive + +- Split-verdict runs route to steer/burst instead of false pass. +- Hygiene repairs are cheap and do not burn steer attempts. +- Review Phase 1 cannot spawn evaluators on stale benchmark logs. 
+ +### Negative + +- More scripts and schema fields to keep in sync (mitigated by `harness-verify`). + +## References + +- `.pi/lib/harness-remediation.ts`, `.pi/lib/harness-subagent-precheck.ts` +- `.pi/scripts/harness-review-preflight.mjs`, `harness-steer-hygiene.mjs`, `harness-inline-repair.mjs`, `harness-adversary-repro-pack.mjs` +- `.pi/prompts/harness-review.md`, `harness-steer.md` +- ADR 0044 (steer loop) diff --git a/.pi/harness/docs/adrs/README.md b/.pi/harness/docs/adrs/README.md index 8b114901..58577f11 100644 --- a/.pi/harness/docs/adrs/README.md +++ b/.pi/harness/docs/adrs/README.md @@ -43,6 +43,8 @@ Team-shared ADRs for the ultimate-pi harness live under `.pi/harness/docs/adrs/` | [0053](0053-plan-task-clarification-gate.md) | Plan-phase task clarification gate | Accepted | | [0054](0054-harness-native-ask-user.md) | Harness-native ask_user + Glimpse presenters | Accepted | | [0055](0055-auto-commit-coauthor-lifecycle.md) | Auto-commit co-author + message format lifecycle | Accepted | +| [0056](0056-agent-native-speed-wiring.md) | Agent-native speed wiring | Accepted | +| [0057](0057-fast-steer-completion.md) | Fast steer completion (split verdict, hygiene, burst) | Accepted | ## Practice map diff --git a/.pi/harness/docs/practice-map.md b/.pi/harness/docs/practice-map.md index 50e790bd..a32537a4 100644 --- a/.pi/harness/docs/practice-map.md +++ b/.pi/harness/docs/practice-map.md @@ -2,7 +2,7 @@ Source of truth linking harness phases to proven practices (graphify corpus), agents/scripts, spawn topology, and **agent translation** (ADR 0042). Orchestrators and agents should cite this doc when unsure why a lane exists. 
-See also: [ADRs](adrs/README.md), [ADR 0040](adrs/0040-practice-grounded-orchestration.md), [ADR 0041](adrs/0041-intelligent-planning-reconnaissance.md), [ADR 0042](adrs/0042-agent-native-orchestration.md), [ADR 0043](adrs/0043-path-first-harness-tools.md), [ADR 0044](adrs/0044-harness-steer-loop.md), [`raw/modules/structured-planning.md`](../../../raw/modules/structured-planning.md). +See also: [ADRs](adrs/README.md), [ADR 0040](adrs/0040-practice-grounded-orchestration.md), [ADR 0041](adrs/0041-intelligent-planning-reconnaissance.md), [ADR 0042](adrs/0042-agent-native-orchestration.md), [ADR 0043](adrs/0043-path-first-harness-tools.md), [ADR 0044](adrs/0044-harness-steer-loop.md), [ADR 0057](adrs/0057-fast-steer-completion.md), [`raw/modules/structured-planning.md`](../../../raw/modules/structured-planning.md). ## Agent translation (human practice → agent design) @@ -93,24 +93,26 @@ See also: [ADRs](adrs/README.md), [ADR 0040](adrs/0040-practice-grounded-orchest | Phase | Practice | Agent translation | Actor | |-------|----------|-------------------|-------| -| 1 | Automated QC + fitness | Deterministic first | Parent scripts | +| 1 | Automated QC + fitness | Deterministic first + `harness-review-preflight.mjs` gate | Parent scripts | | 1b | Structural repair plan | OSS diagnostics → actions | `sentrux-repair-advisor` | | 2 | Measure vs plan | Benchmark on disk | `evaluator` benchmark | | 3 | Policy audit | Verdict (no fail-fast skip) | `evaluator` verdict | | 4 | Red team | Tiered: full attempt 1, lite 2+ steer | `adversary` | | 5 | Outcome + repair brief | Machine routing | Parent + `review-outcome.yaml`, `repair-brief.yaml` | | 6 | Steer gate | One `ask_user` | harness-decisions | -| 7 | Steer / revise | `implementation_gap` → `/harness-steer`; `plan_gap` → plan revise | ADR 0044 | +| 7 | Steer / revise | `implementation_gap` → `/harness-steer` or `--burst`; `plan_gap` → plan revise | ADR 0044, 0057 | -`--quick` = deterministic + benchmark + verdict (no 
adversary). Steer attempts 2+ default to lite review unless `block_merge`. +`--quick` = deterministic + benchmark + verdict (no adversary). Steer attempts 2+ default to lite review unless `block_merge` on disk or repro pack failed. -## `/harness-steer` — Repair sub-cycle (ADR 0044) +## `/harness-steer` — Repair sub-cycle (ADR 0044, 0057) | Step | Practice | Actor | |------|----------|-------| -| 0 | Read review + repair briefs | Parent | +| 0 | Read review + repair briefs + steer-state | Parent | +| 0b | Hygiene fast-path (`gap_kind: hygiene`) | `harness-steer-hygiene.mjs` | +| 0c | Burst preflight (`--burst`) | `harness-inline-repair.mjs` | | 1 | Policy phase → `execute` | Parent | -| 2 | Repair scope | `harness/running/executor` `mode: repair` | +| 2 | Repair scope | `harness/running/executor` `mode: repair` (skip when hygiene-only) | | 3 | Re-verify | `/harness-review` | ## Anti-patterns diff --git a/.pi/harness/specs/adversary-report.schema.json b/.pi/harness/specs/adversary-report.schema.json index 261e9358..68148f9b 100644 --- a/.pi/harness/specs/adversary-report.schema.json +++ b/.pi/harness/specs/adversary-report.schema.json @@ -46,6 +46,19 @@ "minLength": 1 } }, + "repro_commands": { + "type": "array", + "items": { + "type": "object", + "additionalProperties": false, + "required": ["cmd"], + "properties": { + "cmd": { "type": "string", "minLength": 1 }, + "cwd": { "type": "string", "minLength": 1 }, + "safe_for_phase1": { "type": "boolean" } + } + } + }, "block_merge": { "type": "boolean" } diff --git a/.pi/harness/specs/harness-run-context.schema.json b/.pi/harness/specs/harness-run-context.schema.json index 534a334b..f3bb1396 100644 --- a/.pi/harness/specs/harness-run-context.schema.json +++ b/.pi/harness/specs/harness-run-context.schema.json @@ -86,6 +86,9 @@ "steer_max_attempts": { "type": "integer", "minimum": 1 + }, + "inline_repair_attempted": { + "type": "boolean" } } } diff --git a/.pi/harness/specs/repair-brief.schema.json 
b/.pi/harness/specs/repair-brief.schema.json index 0dbc3b59..9cbf4f54 100644 --- a/.pi/harness/specs/repair-brief.schema.json +++ b/.pi/harness/specs/repair-brief.schema.json @@ -13,7 +13,10 @@ "fix_directives" ], "properties": { - "schema_version": { "type": "string", "const": "1.0.0" }, + "schema_version": { + "type": "string", + "enum": ["1.0.0", "1.1.0"] + }, "run_id": { "type": "string", "minLength": 1 }, "steer_attempt": { "type": "integer", "minimum": 1 }, "remediation_class": { @@ -40,6 +43,25 @@ "constraints": { "type": "array", "items": { "type": "string", "minLength": 1 } + }, + "repro_commands": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "repro_skipped": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "verification_commands": { + "type": "array", + "items": { "type": "string", "minLength": 1 } + }, + "must_pass_before_handoff": { + "type": "boolean" + }, + "gap_kind": { + "type": "string", + "enum": ["hygiene", "functional", "mixed"] } } } diff --git a/.pi/harness/specs/review-outcome.schema.json b/.pi/harness/specs/review-outcome.schema.json index 5d95e308..41f95613 100644 --- a/.pi/harness/specs/review-outcome.schema.json +++ b/.pi/harness/specs/review-outcome.schema.json @@ -41,6 +41,18 @@ "source_artifacts": { "type": "object", "additionalProperties": { "type": "string" } + }, + "eval_status": { + "type": "string", + "minLength": 1 + }, + "adversary_status": { + "type": "string", + "enum": ["proceed", "block_merge", "conditional_pass"] + }, + "gap_kind": { + "type": "string", + "enum": ["hygiene", "functional", "mixed"] } } } diff --git a/.pi/harness/specs/steer-state.schema.json b/.pi/harness/specs/steer-state.schema.json index 7d21cbbc..8ff393e7 100644 --- a/.pi/harness/specs/steer-state.schema.json +++ b/.pi/harness/specs/steer-state.schema.json @@ -15,6 +15,13 @@ "last_review_tier": { "type": "string", "enum": ["full", "lite"] + }, + "hygiene_repairs": { + "type": "integer", + 
"minimum": 0 + }, + "burst_used": { + "type": "boolean" } } } diff --git a/.pi/lib/ask-user/index.ts b/.pi/lib/ask-user/index.ts index fec1d4f7..93a3bf09 100644 --- a/.pi/lib/ask-user/index.ts +++ b/.pi/lib/ask-user/index.ts @@ -67,7 +67,10 @@ export async function runAskUser( }; } - if (isHarnessNonInteractive()) { + if ( + isHarnessNonInteractive() && + process.env.HARNESS_ASK_USER_UI?.toLowerCase() !== "headless" + ) { const blocked = nonInteractiveAskUserResult(params.question ?? ""); return { error: blocked.text, diff --git a/.pi/lib/harness-execute-postwork.ts b/.pi/lib/harness-execute-postwork.ts new file mode 100644 index 00000000..e0b7423d --- /dev/null +++ b/.pi/lib/harness-execute-postwork.ts @@ -0,0 +1,110 @@ +/** + * Deterministic execute-phase post-work (Sentrux capture) — parent extension, not LLM. + */ + +import { join } from "node:path"; +import { safeSpawnAsync } from "./harness-lens/clients/safe-spawn.js"; +import { getHarnessPackageRoot } from "./harness-paths.js"; + +const DEFAULT_TIMEOUT_MS = 120_000; + +export interface ExecutePostWorkResult { + sentrux_report_ok: boolean; + sentrux_diagnostics_ok: boolean; + notes: string[]; +} + +function parseTimeoutMs(): number { + const raw = process.env.HARNESS_SENTRUX_TIMEOUT_MS?.trim(); + if (!raw) return DEFAULT_TIMEOUT_MS; + const parsed = Number.parseInt(raw, 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_TIMEOUT_MS; +} + +export function executePostWorkEnabled(): boolean { + return process.env.HARNESS_EXECUTE_POSTWORK !== "0"; +} + +/** Run Sentrux report + diagnostics after executor subprocess completes. */ +export async function runExecutePostWork(args: { + projectRoot: string; + runId: string; + packageRoot?: string; + moduleUrl?: string; +}): Promise { + const notes: string[] = []; + const packageRoot = + args.packageRoot ?? + (args.moduleUrl ? 
getHarnessPackageRoot(args.moduleUrl) : process.cwd()); + const runDir = join(args.projectRoot, ".pi", "harness", "runs", args.runId); + const timeout = parseTimeoutMs(); + const scripts = join(packageRoot, ".pi", "scripts"); + + const report = await safeSpawnAsync( + "node", + [ + join(scripts, "harness-sentrux-report.mjs"), + "--out", + runDir, + "--root", + args.projectRoot, + "--run-id", + args.runId, + "--signal", + ], + { cwd: args.projectRoot, timeout }, + ); + + const notInstalled = + report.status === 127 || + /not installed/i.test(report.stderr) || + /not installed/i.test(report.stdout); + if (notInstalled) { + notes.push("sentrux: not_installed"); + return { + sentrux_report_ok: false, + sentrux_diagnostics_ok: false, + notes, + }; + } + + const reportOk = report.status === 0; + if (!reportOk) { + notes.push(`sentrux-report: exit ${report.status ?? "null"}`); + } + + const reportPath = join(runDir, "artifacts", "sentrux-report.json"); + const diag = await safeSpawnAsync( + "node", + [ + join(scripts, "harness-sentrux-diagnostics.mjs"), + "--report", + reportPath, + "--out", + runDir, + "--churn", + ], + { cwd: args.projectRoot, timeout }, + ); + const diagOk = diag.status === 0; + if (!diagOk) { + notes.push(`sentrux-diagnostics: exit ${diag.status ?? "null"}`); + } + + return { + sentrux_report_ok: reportOk, + sentrux_diagnostics_ok: diagOk, + notes, + }; +} + +export function formatExecutorHandoffBrief( + handoff: { + execution_status?: string; + } | null, +): string { + if (!handoff?.execution_status) { + return "Executor subprocess finished; handoff artifact not yet on disk — check handoff/executor-summary.yaml."; + } + return `Executor handoff: execution_status=${handoff.execution_status}. Artifacts under run_dir/handoff and run_dir/artifacts. Parent: do not re-run executor; run post-work is extension-handled. 
Next: /harness-review unless blocked.`; +} diff --git a/.pi/lib/harness-git-branch.d.mts b/.pi/lib/harness-git-branch.d.mts new file mode 100644 index 00000000..59a72eb0 --- /dev/null +++ b/.pi/lib/harness-git-branch.d.mts @@ -0,0 +1,30 @@ +export type HarnessGitBranchResult = { + ok: boolean; + skipped: boolean; + reason: string; + current_branch: string | null; + target_branch: string | null; + action: string; + new_branch?: string | null; +}; + +export function isProtectedBranch( + branch: string, + protectedPatterns?: string[], +): boolean; + +export function harnessFeatureBranchName(runId: string): string; + +export function readCurrentBranch(projectRoot: string): string | null; + +export function ensureHarnessGitBranch(opts: { + projectRoot: string; + runId: string; + upPkg?: string; + dryRun?: boolean; +}): Promise; + +export function writeGitWorkflowArtifact(opts: { + runDir: string; + result: HarnessGitBranchResult; +}): Promise; diff --git a/.pi/lib/harness-git-branch.mjs b/.pi/lib/harness-git-branch.mjs new file mode 100644 index 00000000..1a1d4d69 --- /dev/null +++ b/.pi/lib/harness-git-branch.mjs @@ -0,0 +1,223 @@ +/** + * Intelligent git branch handling for harness runs (.pi/auto-commit.json branch.strategy). + */ + +import { writeFile, mkdir } from "node:fs/promises"; +import { join } from "node:path"; +import { spawnSync } from "node:child_process"; +import { stringify as stringifyYaml } from "yaml"; +import { resolveAutoCommitConfig } from "./harness-auto-commit-config.mjs"; + +function runGit(args, cwd) { + const result = spawnSync("git", args, { + cwd, + encoding: "utf8", + shell: false, + }); + return { + ok: result.status === 0, + status: result.status ?? 1, + stdout: (result.stdout ?? "").trim(), + stderr: (result.stderr ?? "").trim(), + }; +} + +/** @param {string} pattern e.g. 
release/* */ +function globToRegExp(pattern) { + const escaped = pattern + .replace(/[.+^${}()|[\]\\]/g, "\\$&") + .replace(/\*/g, ".*") + .replace(/\?/g, "."); + return new RegExp(`^${escaped}$`); +} + +/** + * @param {string} branch + * @param {string[]} protectedPatterns + */ +export function isProtectedBranch(branch, protectedPatterns = []) { + if (!branch || branch === "HEAD") return false; + return protectedPatterns.some((p) => globToRegExp(String(p)).test(branch)); +} + +/** @param {string} runId */ +export function harnessFeatureBranchName(runId) { + const slug = String(runId ?? "run") + .trim() + .toLowerCase() + .replace(/[^a-z0-9]+/g, "-") + .replace(/^-+|-+$/g, "") + .slice(0, 48); + return `harness/${slug || "run"}`; +} + +/** + * @param {string} projectRoot + */ +export function readCurrentBranch(projectRoot) { + const r = runGit(["rev-parse", "--abbrev-ref", "HEAD"], projectRoot); + if (!r.ok) return null; + if (r.stdout === "HEAD") return "detached"; + return r.stdout; +} + +/** + * @param {object} opts + * @param {string} opts.projectRoot + * @param {string} opts.runId + * @param {string} [opts.upPkg] + * @param {boolean} [opts.dryRun] + */ +export async function ensureHarnessGitBranch(opts) { + const projectRoot = opts.projectRoot; + const runId = opts.runId; + const dryRun = opts.dryRun === true; + + const gitDir = runGit(["rev-parse", "--git-dir"], projectRoot); + if (!gitDir.ok) { + return { + ok: false, + skipped: true, + reason: "not_a_git_repo", + current_branch: null, + target_branch: null, + action: "none", + }; + } + + const upPkg = opts.upPkg ?? projectRoot; + const config = await resolveAutoCommitConfig(projectRoot, upPkg); + const strategy = String(config.branch?.strategy ?? "none").toLowerCase(); + const protectedPatterns = Array.isArray(config.branch?.protected) + ? 
config.branch.protected.map(String) + : ["main", "master"]; + + if (strategy === "none" || !strategy) { + return { + ok: true, + skipped: true, + reason: "strategy_none", + current_branch: readCurrentBranch(projectRoot), + target_branch: null, + action: "none", + }; + } + + if (strategy !== "auto-feature-branch") { + return { + ok: false, + skipped: true, + reason: `unsupported_strategy:${strategy}`, + current_branch: readCurrentBranch(projectRoot), + target_branch: null, + action: "none", + }; + } + + const current = readCurrentBranch(projectRoot); + const target = harnessFeatureBranchName(runId); + + if (!current) { + return { + ok: false, + skipped: true, + reason: "cannot_read_branch", + current_branch: null, + target_branch: target, + action: "none", + }; + } + + if (current === target) { + return { + ok: true, + skipped: false, + reason: "already_on_target", + current_branch: current, + target_branch: target, + action: "none", + }; + } + + if (!isProtectedBranch(current, protectedPatterns)) { + return { + ok: true, + skipped: false, + reason: "not_on_protected_branch", + current_branch: current, + target_branch: target, + action: "none", + }; + } + + if (dryRun) { + return { + ok: true, + skipped: false, + reason: "dry_run", + current_branch: current, + target_branch: target, + action: "would_checkout_or_create", + }; + } + + const exists = runGit( + ["show-ref", "--verify", "--quiet", `refs/heads/${target}`], + projectRoot, + ); + let action = "checkout"; + if (!exists.ok) { + const created = runGit(["checkout", "-b", target], projectRoot); + if (!created.ok) { + return { + ok: false, + skipped: false, + reason: created.stderr || "checkout_create_failed", + current_branch: current, + target_branch: target, + action: "failed", + }; + } + action = "create"; + } else { + const checked = runGit(["checkout", target], projectRoot); + if (!checked.ok) { + return { + ok: false, + skipped: false, + reason: checked.stderr || "checkout_failed", + current_branch: 
current, + target_branch: target, + action: "failed", + }; + } + } + + return { + ok: true, + skipped: false, + reason: action, + current_branch: current, + target_branch: target, + action, + new_branch: readCurrentBranch(projectRoot), + }; +} + +/** + * Persist branch workflow result under run artifacts. + * @param {object} opts + * @param {string} opts.runDir + * @param {object} opts.result + */ +export async function writeGitWorkflowArtifact(opts) { + const path = join(opts.runDir, "artifacts", "git-workflow.yaml"); + await mkdir(join(opts.runDir, "artifacts"), { recursive: true }); + const doc = { + schema_version: "1.0.0", + recorded_at: new Date().toISOString(), + ...opts.result, + }; + await writeFile(path, stringifyYaml(doc), "utf-8"); + return path; +} diff --git a/.pi/lib/harness-git-qa.d.mts b/.pi/lib/harness-git-qa.d.mts new file mode 100644 index 00000000..c5706f70 --- /dev/null +++ b/.pi/lib/harness-git-qa.d.mts @@ -0,0 +1,7 @@ +export const SMOKE_FILE_REL: ".pi/harness/evals/smoke/E2E-LAST-RUN.txt"; + +export function smokeFileHasIsoLine(projectRoot: string): Promise; + +export function isHarnessGitQaCommitComplete( + projectRoot: string, +): Promise; diff --git a/.pi/lib/harness-git-qa.mjs b/.pi/lib/harness-git-qa.mjs new file mode 100644 index 00000000..07fad281 --- /dev/null +++ b/.pi/lib/harness-git-qa.mjs @@ -0,0 +1,58 @@ +/** + * Git QA smoke commit checks (plain ESM — safe from bash QA scripts). + */ + +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { spawnSync } from "node:child_process"; + +export const SMOKE_FILE_REL = ".pi/harness/evals/smoke/E2E-LAST-RUN.txt"; +const ISO_LINE_RE = /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9:.+-Z]+/m; + +function runGitCapture(projectRoot, args) { + const result = spawnSync("git", args, { + cwd: projectRoot, + encoding: "utf8", + shell: false, + }); + return { + ok: (result.status ?? 1) === 0, + stdout: (result.stdout ?? 
"").trim(), + }; +} + +export async function smokeFileHasIsoLine(projectRoot) { + try { + const text = await readFile(join(projectRoot, SMOKE_FILE_REL), "utf-8"); + return ISO_LINE_RE.test(text); + } catch { + return false; + } +} + +/** True when smoke marker is committed at HEAD as a single-path harness-git-commit. */ +export async function isHarnessGitQaCommitComplete(projectRoot) { + if (!(await smokeFileHasIsoLine(projectRoot))) return false; + const wt = runGitCapture(projectRoot, ["diff", "HEAD", "--", SMOKE_FILE_REL]); + if (!wt.ok || wt.stdout) return false; + const staged = runGitCapture(projectRoot, [ + "diff", + "--cached", + "HEAD", + "--", + SMOKE_FILE_REL, + ]); + if (!staged.ok || staged.stdout) return false; + const headFiles = runGitCapture(projectRoot, [ + "diff-tree", + "--no-commit-id", + "--name-only", + "-r", + "HEAD", + ]); + if (!headFiles.ok) return false; + const names = headFiles.stdout.split("\n").filter(Boolean); + if (names.length !== 1 || names[0] !== SMOKE_FILE_REL) return false; + const msg = runGitCapture(projectRoot, ["log", "-1", "--format=%B"]); + return msg.ok && msg.stdout.includes("Co-authored-by:"); +} diff --git a/.pi/lib/harness-lite-review-precheck.ts b/.pi/lib/harness-lite-review-precheck.ts new file mode 100644 index 00000000..3f48fe9f --- /dev/null +++ b/.pi/lib/harness-lite-review-precheck.ts @@ -0,0 +1,56 @@ +/** + * Disk-backed lite-review guards (split from subagent precheck for testability). + */ + +import { + readAdversaryReportFromRun, + readBenchmarkLogFromRun, + readReviewOutcomeFromRun, +} from "./harness-run-context.js"; + +export interface LiteReviewPrecheckOptions { + projectRoot?: string; + runId?: string | null; + lastOutcome?: string | null; +} + +export async function priorBlockMergeFromDisk( + opts?: LiteReviewPrecheckOptions, +): Promise { + const outcome = String(opts?.lastOutcome ?? 
"").toLowerCase(); + if (outcome.includes("block_merge") || outcome.includes("block")) { + return true; + } + const runId = opts?.runId; + const projectRoot = opts?.projectRoot; + if (!runId || !projectRoot) return false; + + const adversary = await readAdversaryReportFromRun(runId, projectRoot); + if (adversary?.block_merge === true) return true; + + const review = await readReviewOutcomeFromRun(runId, projectRoot); + if ( + review?.adversary_status === "block_merge" || + (review?.remediation_class === "implementation_gap" && + review?.eval_status?.toLowerCase() === "pass") + ) { + return true; + } + + const benchmark = await readBenchmarkLogFromRun(runId, projectRoot); + if (benchmark?.adversary_repro === "fail") return true; + + return false; +} + +/** Lite review may skip adversary only when repro pack passed and no prior block_merge. */ +export async function liteReviewMaySkipAdversary( + opts?: LiteReviewPrecheckOptions, +): Promise { + if (await priorBlockMergeFromDisk(opts)) return false; + const runId = opts?.runId; + const projectRoot = opts?.projectRoot; + if (!runId || !projectRoot) return false; + const benchmark = await readBenchmarkLogFromRun(runId, projectRoot); + return benchmark?.adversary_repro === "pass"; +} diff --git a/.pi/lib/harness-remediation.ts b/.pi/lib/harness-remediation.ts index 23998d8b..0bc135f4 100644 --- a/.pi/lib/harness-remediation.ts +++ b/.pi/lib/harness-remediation.ts @@ -9,11 +9,16 @@ export type RemediationClass = | "rollback" | "inconclusive"; +export type GapKind = "hygiene" | "functional" | "mixed"; + export interface ReviewOutcomeLike { schema_version?: string; status?: string; remediation_class?: RemediationClass | string; recommended_next?: string; + eval_status?: string; + adversary_status?: string; + gap_kind?: GapKind; } export interface EvalVerdictLike { @@ -22,6 +27,56 @@ export interface EvalVerdictLike { failed_checks?: string[]; } +export interface AdversaryVerdictLike { + block_merge?: boolean; + severity?: 
string; + recommendation?: string; + repro_steps?: string[]; + repro_commands?: ReproCommandLike[]; +} + +export interface ReproCommandLike { + cmd: string; + cwd?: string; + safe_for_phase1?: boolean; +} + +export interface BenchmarkLogLike { + harness_verify?: string; + ls_lint?: string; + sentrux_check?: string; + notes?: string; + adversary_repro?: string; +} + +const HYGIENE_CHECK_PATTERNS = [ + /lint_format/i, + /biome/i, + /format/i, + /staged.?file/i, + /git add/i, + /ls.?lint/i, + /filename/i, + /hygiene/i, + /prettier/i, +]; + +const FUNCTIONAL_CHECK_PATTERNS = [ + /acceptance/i, + /test/i, + /repro/i, + /toctou/i, + /race/i, + /block_merge/i, + /implementation/i, + /functional/i, + /widget/i, + /resume/i, +]; + +const SHELL_CMD_PREFIX = + /^(npx|node|npm|tsx|sg|python3?|biome|cargo|go|make)\b/i; + /** Infer remediation when parent skipped Phase 6 but eval-verdict exists on disk. */ export function remediationClassFromEvalVerdict( verdict: EvalVerdictLike | null, @@ -63,11 +118,13 @@ export function remediationClassFromEvalVerdict( export function recommendedNextForRemediation( remediation: RemediationClass, + opts?: { burst?: boolean }, ): string { switch (remediation) { case "pass": return "/harness-policy-status"; case "implementation_gap": + if (opts?.burst) return "/harness-steer --burst"; return "/harness-steer"; case "plan_gap": return "/harness-plan (mode: revise)"; @@ -77,3 +134,203 @@ export function recommendedNextForRemediation( return "/harness-review"; } } + +export function steerBurstFromEnv(): boolean { + const raw = process.env.HARNESS_STEER_BURST?.trim(); + return raw === "1" || raw?.toLowerCase() === "true"; +} + +/** Whether burst steer is allowed for eval-pass + adversary block_merge. 
*/ +export function steerBurstAllowed( + evalVerdict: EvalVerdictLike | null, + adversary: AdversaryVerdictLike | null, + inlineRepairAttempted?: boolean, +): boolean { + if (!steerBurstFromEnv()) return false; + if (inlineRepairAttempted) return false; + const evalPass = (evalVerdict?.status ?? "").toLowerCase() === "pass"; + return evalPass && adversary?.block_merge === true; +} + +export function effectiveSteerMaxAttempts( + baseMax: number, + burstAllowed: boolean, +): number { + return burstAllowed ? baseMax + 1 : baseMax; +} + +function matchesAny(text: string, patterns: RegExp[]): boolean { + const lower = text.toLowerCase(); + return patterns.some((p) => p.test(lower)); +} + +function collectFailureText( + evalVerdict: EvalVerdictLike | null, + adversary: AdversaryVerdictLike | null, + benchmark: BenchmarkLogLike | null, +): string { + const parts: string[] = []; + if (Array.isArray(evalVerdict?.failed_checks)) { + parts.push(...evalVerdict.failed_checks); + } + if (typeof evalVerdict?.recommended_action === "string") { + parts.push(evalVerdict.recommended_action); + } + if (Array.isArray(adversary?.repro_steps)) { + parts.push(...adversary.repro_steps); + } + if (typeof benchmark?.notes === "string") parts.push(benchmark.notes); + if (benchmark?.ls_lint === "fail") parts.push("ls_lint_fail"); + if (benchmark?.harness_verify === "fail") parts.push("harness_verify_fail"); + return parts.join(" "); +} + +/** Classify implementation_gap as hygiene-only, functional, or mixed. 
*/ +export function classifyImplementationGap( + evalVerdict: EvalVerdictLike | null, + adversary: AdversaryVerdictLike | null, + benchmark: BenchmarkLogLike | null, +): GapKind { + const text = collectFailureText(evalVerdict, adversary, benchmark); + const hygiene = matchesAny(text, HYGIENE_CHECK_PATTERNS); + const functional = + matchesAny(text, FUNCTIONAL_CHECK_PATTERNS) || + adversary?.block_merge === true; + if (hygiene && functional) return "mixed"; + if (hygiene) return "hygiene"; + return "functional"; +} + +export interface SynthesizeReviewOutcomeInput { + runId: string; + eval: EvalVerdictLike | null; + adversary: AdversaryVerdictLike | null; + benchmark?: BenchmarkLogLike | null; + steerAttempt?: number; + inlineRepairAttempted?: boolean; +} + +export interface SynthesizedReviewOutcome { + schema_version: string; + run_id: string; + status: "pass" | "fail" | "inconclusive"; + remediation_class: RemediationClass; + recommended_next: string; + source_artifacts: Record; + review_tier: string; + eval_status?: string; + adversary_status?: string; + gap_kind?: GapKind; + steer_attempt?: number; +} + +/** Merge eval + adversary (+ benchmark) into a canonical review-outcome. */ +export function synthesizeReviewOutcome( + input: SynthesizeReviewOutcomeInput, +): SynthesizedReviewOutcome | null { + const evalStatus = (input.eval?.status ?? 
"").toLowerCase(); + if (!evalStatus) return null; + + const adversaryPresent = input.adversary != null; + const blockMerge = input.adversary?.block_merge === true; + const evalRemediation = remediationClassFromEvalVerdict(input.eval); + + const sourceArtifacts: Record = {}; + if (input.eval) + sourceArtifacts["eval-verdict"] = "artifacts/eval-verdict.yaml"; + if (adversaryPresent) { + sourceArtifacts["adversary-report"] = "artifacts/adversary-report.yaml"; + } + + let remediation: RemediationClass; + let status: "pass" | "fail" | "inconclusive"; + let gapKind: GapKind | undefined; + + if (evalStatus === "pass" && blockMerge) { + remediation = "implementation_gap"; + status = "fail"; + gapKind = classifyImplementationGap( + input.eval, + input.adversary, + input.benchmark ?? null, + ); + } else if (evalRemediation) { + remediation = evalRemediation; + status = + evalStatus === "pass" + ? "pass" + : evalStatus === "fail" + ? "fail" + : "inconclusive"; + if (remediation === "implementation_gap") { + gapKind = classifyImplementationGap( + input.eval, + input.adversary, + input.benchmark ?? null, + ); + } + } else { + remediation = "inconclusive"; + status = "inconclusive"; + } + + const burst = steerBurstAllowed( + input.eval, + input.adversary, + input.inlineRepairAttempted, + ); + + const outcome: SynthesizedReviewOutcome = { + schema_version: "1.0.0", + run_id: input.runId, + status, + remediation_class: remediation, + recommended_next: recommendedNextForRemediation(remediation, { burst }), + source_artifacts: sourceArtifacts, + review_tier: "synthesized", + eval_status: input.eval?.status, + adversary_status: blockMerge + ? "block_merge" + : adversaryPresent + ? "proceed" + : undefined, + }; + if (gapKind) outcome.gap_kind = gapKind; + if (input.steerAttempt != null) outcome.steer_attempt = input.steerAttempt; + return outcome; +} + +/** Extract shell-safe repro commands from adversary repro_steps or structured repro_commands. 
*/ +export function parseReproCommandsFromAdversary( + adversary: AdversaryVerdictLike | null, +): { commands: string[]; skipped: string[] } { + const commands: string[] = []; + const skipped: string[] = []; + + if (Array.isArray(adversary?.repro_commands)) { + for (const entry of adversary.repro_commands) { + if (entry && typeof entry.cmd === "string" && entry.cmd.trim()) { + commands.push(entry.cmd.trim()); + } + } + } + + if (Array.isArray(adversary?.repro_steps)) { + for (const step of adversary.repro_steps) { + if (typeof step !== "string") continue; + const trimmed = step.trim(); + if (!trimmed) continue; + for (const line of trimmed.split("\n")) { + const cmd = line.trim(); + if (!cmd || cmd.startsWith("#")) continue; + if (SHELL_CMD_PREFIX.test(cmd)) { + if (!commands.includes(cmd)) commands.push(cmd); + } else if (cmd.length > 0) { + skipped.push(cmd.slice(0, 120)); + } + } + } + } + + return { commands, skipped }; +} diff --git a/.pi/lib/harness-repair-brief.ts b/.pi/lib/harness-repair-brief.ts index 161d0ceb..24081d40 100644 --- a/.pi/lib/harness-repair-brief.ts +++ b/.pi/lib/harness-repair-brief.ts @@ -4,13 +4,17 @@ import { join } from "node:path"; import { + classifyImplementationGap, + type GapKind, + parseReproCommandsFromAdversary, type RemediationClass, remediationClassFromEvalVerdict, + synthesizeReviewOutcome, } from "./harness-remediation.js"; import { harnessRunsRoot } from "./harness-subagent-submit-path.js"; import { readYamlFile, writeYamlFile } from "./harness-yaml.js"; -const REPAIR_BRIEF_SCHEMA = "1.0.0"; +const REPAIR_BRIEF_SCHEMA = "1.1.0"; function asRecord(v: unknown): Record | null { return v && typeof v === "object" && !Array.isArray(v) @@ -74,9 +78,34 @@ export async function synthesizeRepairBrief( "sentrux-repair-plan", ); + const benchmark = await readArtifactYaml( + runRoot, + "artifacts/benchmark-log.yaml", + "benchmark-log", + ); + const synthesized = synthesizeReviewOutcome({ + runId: input.runId, + eval: evalDoc as { + 
status?: string; + recommended_action?: string; + failed_checks?: string[]; + }, + adversary: adversary as { + block_merge?: boolean; + repro_steps?: string[]; + repro_commands?: { cmd: string }[]; + }, + benchmark: benchmark as { + harness_verify?: string; + ls_lint?: string; + notes?: string; + }, + steerAttempt: input.steerAttempt, + }); const remediation = (typeof review?.remediation_class === "string" && (review.remediation_class as RemediationClass)) || + synthesized?.remediation_class || remediationClassFromEvalVerdict( evalDoc as { status?: string; @@ -85,6 +114,25 @@ export async function synthesizeRepairBrief( }, ) || "inconclusive"; + const gapKind: GapKind | undefined = + (typeof review?.gap_kind === "string" + ? (review.gap_kind as GapKind) + : undefined) || + synthesized?.gap_kind || + (remediation === "implementation_gap" + ? classifyImplementationGap( + evalDoc as { failed_checks?: string[] }, + adversary as { block_merge?: boolean; repro_steps?: string[] }, + benchmark as { notes?: string; ls_lint?: string }, + ) + : undefined); + const { commands: reproCommands, skipped: reproSkipped } = + parseReproCommandsFromAdversary( + adversary as { + repro_steps?: string[]; + repro_commands?: { cmd: string }[]; + }, + ); const sourceArtifacts = buildSourceArtifacts(input, planRel, { evalDoc, @@ -140,6 +188,14 @@ export async function synthesizeRepairBrief( if (priorityLakeIds.length > 0) { brief.priority_lake_ids = [...new Set(priorityLakeIds)]; } + if (gapKind) brief.gap_kind = gapKind; + if (reproCommands.length > 0) { + brief.repro_commands = reproCommands; + brief.must_pass_before_handoff = true; + } + if (reproSkipped.length > 0) brief.repro_skipped = reproSkipped; + const verification: string[] = stringList(benchmark?.verification_commands); + if (verification.length > 0) brief.verification_commands = verification; return brief; } diff --git a/.pi/lib/harness-run-context.ts b/.pi/lib/harness-run-context.ts index 5891e910..5f20b435 100644 --- 
a/.pi/lib/harness-run-context.ts +++ b/.pi/lib/harness-run-context.ts @@ -22,10 +22,14 @@ import { PLAN_CANCEL_OPTION, } from "./ask-user/policy.js"; import { + type BenchmarkLogLike, + effectiveSteerMaxAttempts, type RemediationClass, type ReviewOutcomeLike, recommendedNextForRemediation, remediationClassFromEvalVerdict, + steerBurstAllowed, + synthesizeReviewOutcome, } from "./harness-remediation.js"; import { readYamlFile, writeYamlFile } from "./harness-yaml.js"; @@ -73,6 +77,8 @@ export interface HarnessRunContext { steer_approved?: boolean; steer_attempt?: number; steer_max_attempts?: number; + /** Set after burst/inline repair subprocess completes in review cycle. */ + inline_repair_attempted?: boolean; } export interface ProjectActiveRunPointer { @@ -316,6 +322,10 @@ export function isEvaluatePhaseOrchestratorArtifactRel(rel: string): boolean { export const DEFAULT_STEER_MAX_ATTEMPTS = 3; +export function isSteerBurstArgs(args: string): boolean { + return /\b--burst\b/.test(args); +} + export function steerMaxAttemptsFromEnv(): number { const raw = process.env.HARNESS_STEER_MAX_ATTEMPTS?.trim(); if (!raw) return DEFAULT_STEER_MAX_ATTEMPTS; @@ -986,30 +996,21 @@ export function getLatestRunContext( entries: unknown[], ): HarnessRunContext | null { for (let i = entries.length - 1; i >= 0; i--) { - const clearEntry = entries[i] as SessionEntryLike; - if ( - clearEntry.type === "custom" && - clearEntry.customType === "harness-clear-result" - ) { - const clearData = clearEntry.data as - | { - approved?: boolean; - active_cleared?: boolean; - cleared_all?: boolean; - } - | undefined; - if ( - clearData?.approved === true && - (clearData.active_cleared === true || clearData.cleared_all === true) - ) { - return null; - } - } const entry = entries[i] as SessionEntryLike; - if (entry.type !== "custom" || entry.customType !== "harness-run-context") + if (entry.type !== "custom") continue; + if (entry.customType === "harness-clear-result") { + if 
(isConfirmedHarnessClearData(entry.data)) return null; continue; + } + if (entry.customType !== "harness-run-context") continue; const ctx = entry.data as Partial | undefined; if (ctx?.run_id && ctx.project_root) { + if ( + isRunClearedByClearEntriesAfterIndex(entries, i, ctx.run_id) || + isRunTombstonedByPriorClear(entries, i, ctx.run_id) + ) { + continue; + } return normalizeRunContext(ctx); } } @@ -1209,6 +1210,143 @@ export interface CrossSessionResumeInfo { taskSummary: string | null; } +function isConfirmedHarnessClearData(data: unknown): boolean { + const clearData = data as + | { + approved?: boolean; + active_cleared?: boolean; + cleared_all?: boolean; + } + | undefined; + return ( + clearData?.approved === true && + (clearData.active_cleared === true || clearData.cleared_all === true) + ); +} + +function confirmedHarnessClearRunIds(data: unknown): string[] { + if (!isConfirmedHarnessClearData(data)) return []; + const clearData = data as { active_run_ids?: unknown } | undefined; + if (!Array.isArray(clearData?.active_run_ids)) return []; + return clearData.active_run_ids.filter( + (runId): runId is string => + typeof runId === "string" && runId.trim().length > 0, + ); +} + +export function runIdFromCrossSessionResumeCommand( + command: string | null | undefined, +): string | null { + if (!command) return null; + const parts = command.trim().split(/\s+/); + const commandIndex = parts.indexOf("/harness-use-run"); + if (commandIndex < 0) return null; + const runId = parts[commandIndex + 1]; + return runId && !runId.startsWith("-") ? 
runId : null; +} + +export function isRunIdTombstonedByConfirmedHarnessClear( + entries: unknown[], + runId: string, +): boolean { + return entries.some((raw) => { + const entry = raw as SessionEntryLike; + return ( + entry.type === "custom" && + entry.customType === "harness-clear-result" && + confirmedHarnessClearAppliesToRun(entry.data, runId) + ); + }); +} +function confirmedHarnessClearAppliesToRun( + data: unknown, + runId: string, +): boolean { + if (!isConfirmedHarnessClearData(data)) return false; + const runIds = confirmedHarnessClearRunIds(data); + if (runIds.length === 0) return false; + return runIds.includes(runId); +} + +function isRunClearedByClearEntriesAfterIndex( + entries: unknown[], + runContextIndex: number, + runId: string, +): boolean { + for (let i = runContextIndex + 1; i < entries.length; i++) { + const entry = entries[i] as SessionEntryLike; + if (entry.type !== "custom") continue; + if (entry.customType !== "harness-clear-result") continue; + if (confirmedHarnessClearAppliesToRun(entry.data, runId)) return true; + } + return false; +} + +function isRunTombstonedByPriorClear( + entries: unknown[], + runContextIndex: number, + runId: string, +): boolean { + for (let i = 0; i < runContextIndex; i++) { + const entry = entries[i] as SessionEntryLike; + if (entry.type !== "custom") continue; + if (entry.customType !== "harness-clear-result") continue; + if (confirmedHarnessClearAppliesToRun(entry.data, runId)) return true; + } + return false; +} + +/** True when a confirmed clear tombstoned this run id in this session. 
*/ +export function isRunClearedByConfirmedHarnessClear( + entries: unknown[], + runId: string, +): boolean { + if (isRunIdTombstonedByConfirmedHarnessClear(entries, runId)) { + return true; + } + + for (let i = entries.length - 1; i >= 0; i--) { + const entry = entries[i] as SessionEntryLike; + if (entry.type !== "custom" || entry.customType !== "harness-run-context") { + continue; + } + const ctx = entry.data as Partial | undefined; + if (ctx?.run_id !== runId) continue; + return ( + isRunClearedByClearEntriesAfterIndex(entries, i, runId) || + isRunTombstonedByPriorClear(entries, i, runId) + ); + } + return false; +} + +/** True once this session has recorded a confirmed clear of active harness runs. */ +export function hasConfirmedHarnessClear(entries: unknown[]): boolean { + return entries.some((raw) => { + const entry = raw as SessionEntryLike; + return ( + entry.type === "custom" && + entry.customType === "harness-clear-result" && + isConfirmedHarnessClearData(entry.data) + ); + }); +} + +/** True when a confirmed clear is newer than the latest harness-run-context entry. */ +export function hasConfirmedClearAfterLatestRunContext( + entries: unknown[], +): boolean { + for (let i = entries.length - 1; i >= 0; i--) { + const entry = entries[i] as SessionEntryLike; + if (entry.type !== "custom") continue; + if (entry.customType === "harness-run-context") return false; + if (entry.customType === "harness-clear-result") { + return isConfirmedHarnessClearData(entry.data); + } + } + return false; +} + /** True when this session already showed the cross-session resume prompt for runId. 
*/ export function sessionHasResumePromptForRun( entries: unknown[], @@ -1253,7 +1391,9 @@ export async function resolveCrossSessionResumeInfo( ): Promise { if (isStaleActiveRunPointer(pointer, projectRoot)) return null; const disk = await loadRunContextFromDisk(pointer.run_id, projectRoot); - if (!disk || disk.status === "completed") return null; + if (!disk || disk.status !== "active") return null; + if (disk.run_id !== pointer.run_id) return null; + if (resolve(disk.project_root) !== resolve(projectRoot)) return null; const resumeCommand = `/harness-use-run ${pointer.run_id} --claim`; const statuses = await resolveCompletionStatuses( [], @@ -1268,7 +1408,7 @@ export async function resolveCrossSessionResumeInfo( executionStatus: statuses.executionStatus, evalStatus: statuses.evalStatus, adversaryComplete: statuses.adversaryComplete, - aborted: disk.status === "aborted", + aborted: false, }); return { runId: pointer.run_id, @@ -1285,9 +1425,17 @@ export async function evaluateCrossSessionResume( projectRoot: string, entries: unknown[], ): Promise { - if (getLatestRunContext(entries)) return null; + if (hasConfirmedClearAfterLatestRunContext(entries)) return null; + const pointer = await loadProjectActiveRun(projectRoot); if (!pointer) return null; + if (isRunIdTombstonedByConfirmedHarnessClear(entries, pointer.run_id)) { + return null; + } + if (getLatestRunContext(entries)) return null; + if (isRunClearedByConfirmedHarnessClear(entries, pointer.run_id)) { + return null; + } return resolveCrossSessionResumeInfo(projectRoot, pointer); } @@ -1751,6 +1899,155 @@ export async function readAdversaryReportFromRun( } } +export async function readBenchmarkLogFromRun( + runId: string, + projectRoot: string, +): Promise { + try { + const path = join( + harnessRunsRoot(projectRoot), + runId, + "artifacts", + "benchmark-log.yaml", + ); + return (await readYamlFile(path, "benchmark-log")) as BenchmarkLogLike; + } catch { + return null; + } +} + +export async function 
readRepairBriefFromRun( + runId: string, + projectRoot: string, +): Promise<{ + gap_kind?: string; + steer_attempt?: number; + must_pass_before_handoff?: boolean; +} | null> { + try { + const path = join( + harnessRunsRoot(projectRoot), + runId, + "artifacts", + "repair-brief.yaml", + ); + return (await readYamlFile(path, "repair-brief")) as { + gap_kind?: string; + steer_attempt?: number; + must_pass_before_handoff?: boolean; + }; + } catch { + return null; + } +} + +export interface SteerEntryEffects { + incrementSteerAttempt: boolean; + incrementHygieneRepairs: boolean; + markBurstUsed: boolean; + skipExecutor: boolean; +} + +/** Steer entry at /harness-steer start — hygiene lane skips attempt increment. */ +export async function resolveSteerEntryEffects( + runId: string, + projectRoot: string, + args: string, +): Promise { + const brief = await readRepairBriefFromRun(runId, projectRoot); + const gapKind = brief?.gap_kind; + const hygieneOnly = gapKind === "hygiene"; + const burst = isSteerBurstArgs(args); + return { + incrementSteerAttempt: !hygieneOnly, + incrementHygieneRepairs: hygieneOnly, + markBurstUsed: burst, + skipExecutor: hygieneOnly, + }; +} + +/** Mark eval-verdict stale after executor repair so review re-runs verdict. 
*/ +export async function invalidateEvalVerdictAfterRepair( + runId: string, + projectRoot: string, +): Promise { + const path = join( + harnessRunsRoot(projectRoot), + runId, + "artifacts", + "eval-verdict.yaml", + ); + try { + const doc = (await readYamlFile(path, "eval-verdict")) as Record< + string, + unknown + >; + doc.status = "stale"; + doc.notes = "invalidated after steer repair; re-run verdict evaluator"; + await writeYamlFile(path, doc); + } catch { + /* no prior verdict */ + } +} + +export async function updateSteerStateOnEntry( + runId: string, + projectRoot: string, + effects: SteerEntryEffects, + ctx: HarnessRunContext, +): Promise { + const runRoot = join(harnessRunsRoot(projectRoot), runId); + const steerPath = join(runRoot, "artifacts", "steer-state.yaml"); + const existing = (await readSteerStateFromRun(runId, projectRoot)) ?? {}; + const attempt = existing.attempt ?? ctx.steer_attempt ?? 0; + const hygieneRepairs = existing.hygiene_repairs ?? 0; + const nextState = { + schema_version: "1.0.0", + run_id: runId, + attempt: effects.incrementSteerAttempt ? attempt + 1 : attempt, + max_attempts: ctx.steer_max_attempts ?? steerMaxAttemptsFromEnv(), + active: true, + hygiene_repairs: effects.incrementHygieneRepairs + ? hygieneRepairs + 1 + : hygieneRepairs, + burst_used: effects.markBurstUsed ? 
true : existing.burst_used, + }; + await mkdir(join(runRoot, "artifacts"), { recursive: true }); + await writeYamlFile(steerPath, nextState); + return { + ...ctx, + steer_attempt: nextState.attempt, + steer_max_attempts: nextState.max_attempts, + }; +} + +export async function readSteerStateFromRun( + runId: string, + projectRoot: string, +): Promise<{ + attempt?: number; + max_attempts?: number; + hygiene_repairs?: number; + burst_used?: boolean; +} | null> { + try { + const path = join( + harnessRunsRoot(projectRoot), + runId, + "artifacts", + "steer-state.yaml", + ); + return (await readYamlFile(path, "steer-state")) as { + attempt?: number; + max_attempts?: number; + hygiene_repairs?: number; + burst_used?: boolean; + }; + } catch { + return null; + } +} + export interface CompletionStatuses { planStatus: string | null; executionStatus: string | null; @@ -2058,10 +2355,59 @@ export function harnessSlashCommandLineForPolicy( return null; } +function sessionHasHarnessPolicyState(entries: unknown[]): boolean { + return entries.some((raw) => { + const entry = raw as SessionEntryLike; + return ( + entry.type === "custom" && entry.customType === "harness-policy-state" + ); + }); +} + +function policyStateForTransition( + entries: unknown[], + activeCtx?: HarnessRunContext | null, +): HarnessPolicyState { + const state = getLatestPolicyState(entries); + if (sessionHasHarnessPolicyState(entries)) return state; + const runCtx = activeCtx ?? getLatestRunContext(entries); + if (!runCtx?.run_id) return state; + const boot = policyBootstrapFromRunContext(runCtx); + return { + phase: boot.phase, + approvedPlan: boot.approvedPlan, + planId: boot.planId, + aborted: state.aborted, + }; +} + +/** Bootstrap policy phase from disk when session has no policy/run entries yet. 
*/ +export async function policyStateFromDiskIfNeeded( + entries: unknown[], + projectRoot: string, +): Promise { + if (sessionHasHarnessPolicyState(entries) || getLatestRunContext(entries)) { + return null; + } + const pointer = await loadProjectActiveRun(projectRoot); + if (!pointer || isStaleActiveRunPointer(pointer, projectRoot)) return null; + const disk = await loadRunContextFromDisk(pointer.run_id, projectRoot); + if (!disk) return null; + const boot = policyBootstrapFromRunContext(disk); + return { + phase: boot.phase, + approvedPlan: boot.approvedPlan, + planId: boot.planId, + aborted: false, + }; +} + /** Mirrors policy-gate phase checks so run-context does not inject on blocked turns. */ export function getPolicyTransitionBlock( userPrompt: string, entries: unknown[], + activeCtx?: HarnessRunContext | null, + diskPolicy?: HarnessPolicyState | null, ): { blocked: boolean; message?: string } { if ( isHarnessBootstrapPrompt(userPrompt) || @@ -2069,7 +2415,7 @@ export function getPolicyTransitionBlock( ) { return { blocked: false }; } - const state = getLatestPolicyState(entries); + const state = diskPolicy ?? policyStateForTransition(entries, activeCtx); const nextPhase = inferHarnessPhase(entries, userPrompt); if (!isValidHarnessPhaseTransition(state.phase, nextPhase)) { return { @@ -2081,7 +2427,7 @@ export function getPolicyTransitionBlock( }; } if (nextPhase === "execute" && !state.approvedPlan) { - const runCtx = getLatestRunContext(entries); + const runCtx = activeCtx ?? 
getLatestRunContext(entries); if ( !runCtx?.plan_ready && !hasApprovedPlanSignalFromUserPrompt(userPrompt) @@ -2156,6 +2502,7 @@ export async function resolveRemediationClassForRun( export async function ensureReviewOutcomeFromEval( runId: string, projectRoot: string, + opts?: { steerAttempt?: number; inlineRepairAttempted?: boolean }, ): Promise { const existing = await readReviewOutcomeFromRun(runId, projectRoot); if (existing?.remediation_class) return existing; @@ -2163,29 +2510,17 @@ export async function ensureReviewOutcomeFromEval( const evalV = await readEvalVerdictFromRun(runId, projectRoot); if (!evalV?.status) return null; - const remediation = remediationClassFromEvalVerdict(evalV) ?? "inconclusive"; - const evalStatus = (evalV.status ?? "").toLowerCase(); - const status = - evalStatus === "pass" - ? "pass" - : evalStatus === "fail" - ? "fail" - : "inconclusive"; - - const outcome: ReviewOutcomeLike & { - run_id: string; - recommended_next: string; - source_artifacts: Record; - review_tier: string; - } = { - schema_version: "1.0.0", - run_id: runId, - status, - remediation_class: remediation, - recommended_next: recommendedNextForRemediation(remediation), - source_artifacts: { "eval-verdict": "artifacts/eval-verdict.yaml" }, - review_tier: "synthesized", - }; + const adversary = await readAdversaryReportFromRun(runId, projectRoot); + const benchmark = await readBenchmarkLogFromRun(runId, projectRoot); + const synthesized = synthesizeReviewOutcome({ + runId, + eval: evalV, + adversary, + benchmark, + steerAttempt: opts?.steerAttempt, + inlineRepairAttempted: opts?.inlineRepairAttempted, + }); + if (!synthesized) return null; const outPath = join( harnessRunsRoot(projectRoot), @@ -2193,16 +2528,17 @@ export async function ensureReviewOutcomeFromEval( "artifacts", "review-outcome.yaml", ); - await writeYamlFile(outPath, outcome); + await writeYamlFile(outPath, synthesized); + const steerState = await readSteerStateFromRun(runId, projectRoot); const { 
ensureRepairBriefOnDisk } = await import("./harness-repair-brief.js"); await ensureRepairBriefOnDisk({ runId, projectRoot, - steerAttempt: 0, + steerAttempt: steerState?.attempt ?? opts?.steerAttempt ?? 0, }); - return outcome; + return synthesized; } /** Align next_recommended_command with on-disk review/eval routing after /harness-review. */ @@ -2239,16 +2575,27 @@ export async function reconcileReviewRouting( ); if (!remediation) return working; + const adversary = await readAdversaryReportFromRun( + working.run_id, + projectRoot, + ); + const burst = steerBurstAllowed( + evalV, + adversary, + working.inline_repair_attempted, + ); + const steerState = await readSteerStateFromRun(working.run_id, projectRoot); const next = nextStepAfterOutcome({ phase: working.phase, lastCompletedStep: working.last_completed_step, lastOutcome: working.last_outcome, evalStatus: working.last_outcome, remediationClass: remediation, - steerAttempt: working.steer_attempt ?? 0, + steerAttempt: steerState?.attempt ?? working.steer_attempt ?? 0, steerMaxAttempts: working.steer_max_attempts ?? steerMaxAttemptsFromEnv(), reviewComplete: true, aborted: working.status === "aborted", + burstAllowed: burst, }); return { @@ -2264,29 +2611,40 @@ function nextStepForEvaluateLikePhase(input: { evalStatus: string; steerAttempt: number; steerMax: number; + burstAllowed?: boolean; }): string { - if (input.remediation === "pass" || input.evalStatus === "pass") { + const effectiveMax = effectiveSteerMaxAttempts( + input.steerMax, + input.burstAllowed === true, + ); + if (input.remediation === "implementation_gap") { + if (input.steerAttempt < effectiveMax) { + return input.burstAllowed ? 
"/harness-steer --burst" : "/harness-steer"; + } + return "/harness-plan (mode: revise) or /harness-abort"; + } + if (input.remediation === "pass") { if (input.adversaryComplete) return "/harness-policy-status"; return "/harness-review"; } + if (input.evalStatus === "pass" && input.adversaryComplete) { + return "/harness-policy-status"; + } + if (input.evalStatus === "pass" && !input.adversaryComplete) { + return "/harness-review"; + } if (input.remediation === "rollback") return "/harness-incident"; if (input.remediation === "plan_gap") return "/harness-plan (mode: revise)"; - if ( - input.remediation === "implementation_gap" || - (input.remediation === "inconclusive" && input.evalStatus === "fail") - ) { - if (input.steerAttempt < input.steerMax) return "/harness-steer"; + if (input.remediation === "inconclusive" && input.evalStatus === "fail") { + if (input.steerAttempt < effectiveMax) return "/harness-steer"; return "/harness-plan (mode: revise) or /harness-abort"; } if (input.evalStatus === "fail") { if (input.remediation === "plan_gap") { return "/harness-plan (mode: revise)"; } - if ( - input.remediation === "implementation_gap" || - input.remediation === "inconclusive" - ) { - if (input.steerAttempt < input.steerMax) return "/harness-steer"; + if (input.remediation === "inconclusive") { + if (input.steerAttempt < effectiveMax) return "/harness-steer"; return "/harness-plan (mode: revise) or /harness-abort"; } return "/harness-plan (mode: revise) or /harness-incident"; @@ -2309,6 +2667,7 @@ export function nextStepAfterOutcome(input: { steerAttempt?: number; steerMaxAttempts?: number; reviewComplete?: boolean; + burstAllowed?: boolean; }): string { if (input.aborted) { return '/harness-plan ""'; @@ -2368,6 +2727,7 @@ export function nextStepAfterOutcome(input: { evalStatus: evalSt, steerAttempt, steerMax, + burstAllowed: input.burstAllowed, }); } @@ -2617,9 +2977,21 @@ export async function blockingSteerCommandReason( return "Run /harness-review first 
(artifacts/repair-brief.yaml missing)."; } + const steerState = await readSteerStateFromRun(activeCtx.run_id, projectRoot); + const attempt = steerState?.attempt ?? activeCtx.steer_attempt ?? 0; const max = activeCtx.steer_max_attempts ?? steerMaxAttemptsFromEnv(); - if ((activeCtx.steer_attempt ?? 0) >= max) { - return `Steer attempt cap reached (${max}). Use /harness-plan (mode: revise) or /harness-abort.`; + const adversary = await readAdversaryReportFromRun( + activeCtx.run_id, + projectRoot, + ); + const burst = steerBurstAllowed( + evalV, + adversary, + activeCtx.inline_repair_attempted, + ); + const effectiveMax = effectiveSteerMaxAttempts(max, burst); + if (attempt >= effectiveMax) { + return `Steer attempt cap reached (${effectiveMax}${burst ? ` incl. burst` : ""}). Use /harness-plan (mode: revise) or /harness-abort.`; } return null; } diff --git a/.pi/lib/harness-subagent-precheck.ts b/.pi/lib/harness-subagent-precheck.ts index 6ebfc832..8cb2a1e2 100644 --- a/.pi/lib/harness-subagent-precheck.ts +++ b/.pi/lib/harness-subagent-precheck.ts @@ -7,6 +7,7 @@ import { agentAllowsMutatingTools, } from "../../vendor/pi-subagents/src/agents.js"; import { getAgentKind } from "./agents-policy.mjs"; +import { liteReviewMaySkipAdversary } from "./harness-lite-review-precheck.js"; import { getHarnessPackageRoot } from "./harness-paths.js"; import { isHarnessReviewParallelEnabled } from "./harness-review-parallel.js"; import { type HarnessPhase, inferHarnessPhase } from "./harness-run-context.js"; @@ -50,10 +51,10 @@ function parseSteerAttemptFromTasks(params: { return 0; } -function priorBlockMergeInContext(opts?: PrecheckOptions): boolean { - const outcome = String(opts?.lastOutcome ?? 
"").toLowerCase(); - return outcome.includes("block_merge") || outcome.includes("block"); -} +export { + liteReviewMaySkipAdversary, + priorBlockMergeFromDisk, +} from "./harness-lite-review-precheck.js"; function collectAgents(params: { agent?: string; @@ -116,12 +117,12 @@ export async function precheckHarnessSubagentSpawn( if ( steerAttempt >= 2 && names.includes("harness/reviewing/adversary") && - !priorBlockMergeInContext(opts) + (await liteReviewMaySkipAdversary(opts)) ) { return { ok: false, message: - `Lite review (steer attempt ${steerAttempt}): skip adversary unless prior block_merge. ` + + `Lite review (steer attempt ${steerAttempt}): skip adversary unless prior block_merge or adversary_repro fail. ` + `Run benchmark + verdict evaluator only.`, }; } diff --git a/.pi/lib/harness-ui-state.ts b/.pi/lib/harness-ui-state.ts index 851fc9a7..442e7805 100644 --- a/.pi/lib/harness-ui-state.ts +++ b/.pi/lib/harness-ui-state.ts @@ -3,7 +3,12 @@ import { shouldEmitBlockingBudgetExhausted } from "./harness-budget-enforce.js"; import { extractCompletionStatuses, getLatestRunContext, + hasConfirmedClearAfterLatestRunContext, + hasConfirmedHarnessClear, + isRunClearedByConfirmedHarnessClear, + isRunIdTombstonedByConfirmedHarnessClear, nextStepAfterOutcome, + runIdFromCrossSessionResumeCommand, } from "./harness-run-context.js"; import { buildHarnessProgressStatusLine } from "./harness-subagent-progress.js"; @@ -151,6 +156,7 @@ const RELEVANT_CUSTOM_TYPES = new Set([ "harness-run-trace", "harness-trace-state", "harness-run-context", + "harness-clear-result", ]); function asNumber(value: unknown): number | null { @@ -338,6 +344,15 @@ function applyTraceState( : null; } +function resetActiveRunState(state: HarnessUiState): void { + state.phase = "plan"; + state.planApproved = false; + state.planId = null; + state.traceRunId = null; + state.nextRecommendedCommand = "/harness-plan"; + state.crossSessionResumeCommand = null; +} + function applyRunContextState( state: 
HarnessUiState, latest: Map, @@ -356,7 +371,18 @@ function applyRunContextState( } | undefined; if (!runCtx) { - state.nextRecommendedCommand = null; + resetActiveRunState(state); + return; + } + if (hasConfirmedClearAfterLatestRunContext(entries)) { + resetActiveRunState(state); + return; + } + if ( + typeof runCtx.run_id === "string" && + isRunClearedByConfirmedHarnessClear(entries, runCtx.run_id) + ) { + resetActiveRunState(state); return; } if (runCtx.plan_ready) { @@ -408,6 +434,7 @@ export function harnessUiEntriesFingerprint(entries: unknown[]): string { return JSON.stringify({ len: entries.length, policy: latest.get("harness-policy-state") ?? null, + clear: latest.get("harness-clear-result") ?? null, run: latest.get("harness-run-context") ?? null, }); } @@ -529,9 +556,26 @@ export function deriveHarnessStatusHint(state: HarnessUiState): { } } +function hasRunContextEntryAfterIndex( + entries: unknown[], + entryCountAtClear: number, +): boolean { + for (let i = Math.max(0, entryCountAtClear); i < entries.length; i++) { + const entry = entries[i] as CustomEntryLike; + if (entry.type === "custom" && entry.customType === "harness-run-context") { + return true; + } + } + return false; +} + export class HarnessUiStateStore { private lastFingerprint = ""; private crossSessionResumeCommand: string | null = null; + private suppressActiveRunUntilUpdate = false; + /** Session entry count when clearActiveRunState ran; new run-context after this lifts suppress. 
*/ + private suppressAfterEntryCount = -1; + private entriesLengthAtLastRefresh = 0; private cachedState: HarnessUiState = { ...DEFAULT_STATE, severity: { ...DEFAULT_STATE.severity }, @@ -541,6 +585,26 @@ export class HarnessUiStateStore { this.crossSessionResumeCommand = command; } + public acknowledgeRunContextUpdated(): void { + this.crossSessionResumeCommand = null; + this.suppressActiveRunUntilUpdate = false; + this.lastFingerprint = ""; + } + + public clearActiveRunState(sessionEntryCount?: number): void { + this.crossSessionResumeCommand = null; + this.suppressActiveRunUntilUpdate = true; + this.suppressAfterEntryCount = + sessionEntryCount ?? this.entriesLengthAtLastRefresh; + this.lastFingerprint = ""; + const nextState: HarnessUiState = { + ...this.cachedState, + severity: { ...this.cachedState.severity }, + }; + resetActiveRunState(nextState); + this.cachedState = nextState; + } + private applyCrossSessionOverlay(state: HarnessUiState): HarnessUiState { if (!this.crossSessionResumeCommand) { return { ...state, crossSessionResumeCommand: null }; @@ -551,17 +615,62 @@ export class HarnessUiStateStore { }; } + private maybeLiftSuppressAfterClear(entries: unknown[]): void { + if (!this.suppressActiveRunUntilUpdate) return; + if (this.suppressAfterEntryCount < 0) return; + if (entries.length <= this.suppressAfterEntryCount) return; + if ( + hasRunContextEntryAfterIndex(entries, this.suppressAfterEntryCount) || + hasConfirmedClearAfterLatestRunContext(entries) + ) { + this.suppressActiveRunUntilUpdate = false; + this.suppressAfterEntryCount = -1; + this.crossSessionResumeCommand = null; + } + } + /** Refresh from session entries; recompute when harness policy/run context changes. 
*/ public refresh(ctx: ExtensionContext): HarnessUiState { const entries = ctx.sessionManager.getEntries(); const fingerprint = harnessUiEntriesFingerprint(entries); + const resumeRunId = runIdFromCrossSessionResumeCommand( + this.crossSessionResumeCommand, + ); + const clearBlocksResume = + hasConfirmedClearAfterLatestRunContext(entries) || + (resumeRunId + ? isRunIdTombstonedByConfirmedHarnessClear(entries, resumeRunId) + : false); + if (clearBlocksResume) { + this.crossSessionResumeCommand = null; + } + if (fingerprint !== this.lastFingerprint) { this.cachedState = createStateFromEntries(entries); this.lastFingerprint = fingerprint; - if (getLatestRunContext(entries)) { - this.crossSessionResumeCommand = null; + this.maybeLiftSuppressAfterClear(entries); + if (!this.suppressActiveRunUntilUpdate) { + if ( + this.cachedState.traceRunId && + (hasConfirmedHarnessClear(entries) || getLatestRunContext(entries)) + ) { + this.crossSessionResumeCommand = null; + } else if (clearBlocksResume) { + this.crossSessionResumeCommand = null; + this.suppressActiveRunUntilUpdate = true; + this.suppressAfterEntryCount = entries.length; + } } } + if (this.suppressActiveRunUntilUpdate) { + const nextState: HarnessUiState = { + ...this.cachedState, + severity: { ...this.cachedState.severity }, + }; + resetActiveRunState(nextState); + this.cachedState = nextState; + } + this.entriesLengthAtLastRefresh = entries.length; this.cachedState = this.applyCrossSessionOverlay(this.cachedState); return this.cachedState; } diff --git a/.pi/lib/plan-headless-ux.ts b/.pi/lib/plan-headless-ux.ts index ec4c39ec..6336d423 100644 --- a/.pi/lib/plan-headless-ux.ts +++ b/.pi/lib/plan-headless-ux.ts @@ -2,6 +2,7 @@ * Headless / QA harness UX — avoid Phase 0 stalls and multi-hour plan debate loops. 
*/ +import { spawnSync } from "node:child_process"; import { constants } from "node:fs"; import { access, mkdir, readdir, readFile, writeFile } from "node:fs/promises"; import { dirname, join } from "node:path"; @@ -11,12 +12,18 @@ import { canAutoApprovePlan, isHarnessPlanAutoApproveEnabled, } from "./harness-auto-approve.js"; +import { + isHarnessGitQaCommitComplete, + SMOKE_FILE_REL, + smokeFileHasIsoLine, +} from "./harness-git-qa.mjs"; import { appendPlanApprovalIfNew, type HarnessRunContext, hasPlanUserApproval, indexOfLastPlanCommand, type PlanPacketLike, + readExecutorHandoffFromRun, readPlanPacketFromPath, saveRunContextToDisk, } from "./harness-run-context.js"; @@ -42,8 +49,17 @@ import { const QA_SMOKE_TASK_RE = /\b(qa smoke|e2e-last-run|evals\/smoke\/|iso-?8601.*timestamp|append one .* timestamp line)\b/i; +const QA_GIT_TASK_RE = + /\b(harness git|harness-git-branch|harness-git-commit|auto-feature-branch|git workflow|git branch.*commit)\b/i; + +export function isHarnessGitQaTask(taskSummary: string): boolean { + return QA_GIT_TASK_RE.test(taskSummary.trim()); +} + export function isHarnessQaSmokeTask(taskSummary: string): boolean { - return QA_SMOKE_TASK_RE.test(taskSummary.trim()); + return ( + QA_SMOKE_TASK_RE.test(taskSummary.trim()) || isHarnessGitQaTask(taskSummary) + ); } export function shouldSeedHeadlessTaskClarification( @@ -500,19 +516,10 @@ export async function headlessTaskClarificationReady( return readiness.ok; } -const SMOKE_FILE_REL = ".pi/harness/evals/smoke/E2E-LAST-RUN.txt"; -const ISO_LINE_RE = /^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9:.+-Z]+/m; - -export async function smokeFileHasIsoLine( - projectRoot: string, -): Promise<boolean> { - try { - const text = await readFile(join(projectRoot, SMOKE_FILE_REL), "utf-8"); - return ISO_LINE_RE.test(text); - } catch { - return false; - } -} +export { + isHarnessGitQaCommitComplete, + smokeFileHasIsoLine, +} from "./harness-git-qa.mjs"; export function shouldEndHeadlessPlanPrintSession(args: { command: 
string; @@ -541,13 +548,39 @@ export async function shouldEndHeadlessHarnessPrintSession(args: { ) { return true; } - if (process.env.HARNESS_QA_SMOKE !== "1") return false; - const hasSmoke = await smokeFileHasIsoLine(args.projectRoot); const lastStep = String(runCtx.last_completed_step ?? "").toLowerCase(); const lastOutcome = String(runCtx.last_outcome ?? "").toLowerCase(); - if (command === "harness-run" && hasSmoke && lastOutcome === "completed") { + if (command === "harness-run" && runCtx.run_id) { + const handoff = await readExecutorHandoffFromRun( + runCtx.run_id, + args.projectRoot, + ); + const execStatus = String(handoff?.execution_status ?? "").toLowerCase(); + if ( + execStatus === "completed" || + execStatus === "scope_drift" || + execStatus === "blocked" + ) { + return true; + } + if (lastStep === "execute" && lastOutcome) { + return true; + } + } + if ( + command === "harness-steer" && + (lastStep === "steer" || lastOutcome === "completed") + ) { return true; } + if (process.env.HARNESS_QA_SMOKE !== "1") return false; + const gitCommitDone = await isHarnessGitQaCommitComplete(args.projectRoot); + const hasSmoke = await smokeFileHasIsoLine(args.projectRoot); + const gitTask = isHarnessGitQaTask(runCtx.task_summary ?? 
""); + if (command === "harness-run") { + if (gitTask && gitCommitDone) return true; + if (!gitTask && hasSmoke && lastOutcome === "completed") return true; + } if ( (command === "harness-review" || command === "harness-eval" || @@ -556,9 +589,12 @@ export async function shouldEndHeadlessHarnessPrintSession(args: { ) { return true; } - if (command === "harness-auto" && hasSmoke) { - if (lastStep === "review" || lastStep === "adversary") return true; - if (runCtx.plan_ready === true && lastOutcome === "pass") return true; + if (command === "harness-auto") { + if (gitTask && gitCommitDone) return true; + if (!gitTask && hasSmoke) { + if (lastStep === "review" || lastStep === "adversary") return true; + if (runCtx.plan_ready === true && lastOutcome === "pass") return true; + } } return false; } @@ -567,6 +603,10 @@ export function endHeadlessHarnessPrintSession(ctx: { abort?: () => void; }): void { ctx.abort?.(); + // pi -p often keeps the Node process alive after abort(); exit so headless QA/CI does not hang. + if (isHarnessNonInteractive() && process.env.HARNESS_HEADLESS_EXIT !== "0") { + setTimeout(() => process.exit(0), 50); + } } /** QA smoke: after headless auto plan, append ISO directly and skip full executor/review loop. */ @@ -581,6 +621,7 @@ export async function maybeHeadlessQaAutoExecuteSmoke(args: { } if (!args.runCtx.plan_ready) return false; if (!isHarnessQaSmokeTask(args.runCtx.task_summary ?? "")) return false; + if (isHarnessGitQaTask(args.runCtx.task_summary ?? "")) return false; if (await smokeFileHasIsoLine(args.projectRoot)) return true; const smokePath = join(args.projectRoot, SMOKE_FILE_REL); await mkdir(dirname(smokePath), { recursive: true }); @@ -596,3 +637,85 @@ export async function maybeHeadlessQaAutoExecuteSmoke(args: { Object.assign(args.runCtx, updated); return true; } + +/** QA git task: append smoke marker and commit via harness-git-commit on feature branch. 
*/ +async function ensureGitWorkflowArtifactForRun(args: { + projectRoot: string; + runId: string; + upPkg?: string; +}): Promise<void> { + const upPkg = args.upPkg ?? args.projectRoot; + const { ensureHarnessGitBranch, writeGitWorkflowArtifact } = await import( + "./harness-git-branch.mjs" + ); + const runDir = join(args.projectRoot, ".pi", "harness", "runs", args.runId); + const branchResult = await ensureHarnessGitBranch({ + projectRoot: args.projectRoot, + runId: args.runId, + upPkg, + }); + await writeGitWorkflowArtifact({ runDir, result: branchResult }); +} + +export async function maybeHeadlessGitQaFinalizeOnRun(args: { + projectRoot: string; + runCtx: HarnessRunContext; + command: string; + upPkg?: string; +}): Promise<boolean> { + if (args.command !== "harness-run" && args.command !== "harness-auto") { + return false; + } + if (process.env.HARNESS_QA_SMOKE !== "1" || !isHarnessNonInteractive()) { + return false; + } + if (!isHarnessGitQaTask(args.runCtx.task_summary ?? "")) return false; + await ensureGitWorkflowArtifactForRun({ + projectRoot: args.projectRoot, + runId: args.runCtx.run_id, + upPkg: args.upPkg, + }); + if (await isHarnessGitQaCommitComplete(args.projectRoot)) return true; + + const smokePath = join(args.projectRoot, SMOKE_FILE_REL); + await mkdir(dirname(smokePath), { recursive: true }); + await writeFile(smokePath, `${new Date().toISOString()}\n`, "utf-8"); + + const upPkg = args.upPkg ?? 
args.projectRoot; + const commitScript = join(upPkg, ".pi", "scripts", "harness-git-commit.mjs"); + const commit = spawnSync( + process.execPath, + [ + commitScript, + "--root", + args.projectRoot, + "--only-path", + SMOKE_FILE_REL, + "--type", + "chore", + "--scope", + "harness", + "--subject", + `qa smoke marker for ${args.runCtx.run_id}`, + ], + { encoding: "utf8", cwd: args.projectRoot }, + ); + if (commit.status !== 0) { + return false; + } + if (!(await isHarnessGitQaCommitComplete(args.projectRoot))) { + return false; + } + + const updated: HarnessRunContext = { + ...args.runCtx, + phase: "evaluate", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + updated_at: new Date().toISOString(), + }; + await saveRunContextToDisk(updated); + Object.assign(args.runCtx, updated); + return true; +} diff --git a/.pi/prompts/harness-auto.md b/.pi/prompts/harness-auto.md index d5baaf29..eec9094e 100644 --- a/.pi/prompts/harness-auto.md +++ b/.pi/prompts/harness-auto.md @@ -23,7 +23,7 @@ Follow **harness-plan** performance rules (`subagent` with `agentScope: "both"`) 1. **Plan** — follow `/harness-plan`; drive steps via `harness_plan_next_action`. When `HARNESS_PLAN_AUTO_APPROVE=1` and deterministic gates pass (non-interactive), `approve_plan` auto-approves. Otherwise one human approval. 2. **Execute** — `harness/running/executor` with `executor_strategy` from packet (default `single_pass` for low/med). 3. **Review** — always **`/harness-review`** after execute (no benchmark fail-fast). -4. **Steer loop** — while `review-outcome.remediation_class === implementation_gap` and `steer_attempt < HARNESS_STEER_MAX_ATTEMPTS`: `/harness-steer` → `/harness-review` (tiered adversary on attempts 2+). +4. 
**Steer loop** — while `review-outcome.remediation_class === implementation_gap` and `steer_attempt < effective max`: `/harness-steer` (or `/harness-steer --burst` when eval pass + adversary `block_merge` and `HARNESS_STEER_BURST=1`) → `/harness-review` (tiered adversary on attempts 2+; hygiene at steer start via `harness-steer-hygiene.mjs` when `gap_kind: hygiene`). 5. **Parent** — apply locked strict gates; commit/PR only when `remediation_class: pass`. For commits, invoke **harness-git-commit** skill (never raw `git commit`). Do **not** call separate `/harness-eval` or `/harness-critic` (deprecated aliases of `/harness-review`). diff --git a/.pi/prompts/harness-review.md b/.pi/prompts/harness-review.md index b122f4cf..155e4e63 100644 --- a/.pi/prompts/harness-review.md +++ b/.pi/prompts/harness-review.md @@ -88,9 +88,27 @@ sentrux_check: pass|fail|skipped|not_installed sentrux_gate: pass|degraded|skipped|not_installed ls_lint: pass|fail|skipped|not_installed ls_lint_violations: 0 +steer_attempt: <N> +adversary_repro: pass|fail|skipped notes: "…" ``` +**Hard gate** before Phase 2 evaluators: + +```bash +node "$UP_PKG/.pi/scripts/harness-review-preflight.mjs" --run-dir "<run_dir>" --steer-attempt <N> +``` + +Abort evaluator spawns if preflight fails (stale/missing `benchmark-log.yaml`). + +After steer repair, when adversary may be skipped (lite tier), run repro pack first: + +```bash +node "$UP_PKG/.pi/scripts/harness-adversary-repro-pack.mjs" --run-dir "<run_dir>" +``` + +Lite skip is allowed only when `benchmark-log.adversary_repro: pass` and no prior `block_merge` on disk. + `harness_artifact_ready({ paths: ["artifacts/benchmark-log.yaml", "artifacts/sentrux-report.json", "artifacts/sentrux-diagnostics.json", "artifacts/sentrux-signal.yaml", "artifacts/ls-lint-signal.yaml"] })` when written. 
## Phase 1b — Sentrux repair advisor (subagent) @@ -211,10 +229,12 @@ Write **`artifacts/review-outcome.yaml`** and **`artifacts/repair-brief.yaml`** | `remediation_class` | `recommended_next` | |---------------------|-------------------| | `pass` | `/harness-policy-status` | -| `implementation_gap` | `/harness-steer` | +| `implementation_gap` | `/harness-steer` or `/harness-steer --burst` when eval pass + adversary `block_merge` and `HARNESS_STEER_BURST=1` | | `plan_gap` | `/harness-plan` (mode: revise) | | `rollback` | `/harness-incident` | +Use `synthesizeReviewOutcome` fields: `eval_status`, `adversary_status`, `gap_kind` (`hygiene` \| `functional` \| `mixed`). **Do not** fuse executor repair in this session — defer to `/harness-steer` or `/harness-steer --burst`. + One `ask_user` steer gate when not pass (unless `steer_approved` on run-context). ## Completion diff --git a/.pi/prompts/harness-run.md b/.pi/prompts/harness-run.md index 7c08bc42..0d6e84ba 100644 --- a/.pi/prompts/harness-run.md +++ b/.pi/prompts/harness-run.md @@ -45,6 +45,19 @@ node "$UP_PKG/.pi/scripts/harness-ls-lint-cli.mjs" --json Note `violation_count` in run notes (do not block execute on pre-existing violations unless chair policy says otherwise). +## Pre-work — Git feature branch (parent) + +When `.pi/auto-commit.json` has `branch.strategy: auto-feature-branch`, ensure a non-protected working branch before the executor mutates files: + +```bash +node "$UP_PKG/.pi/scripts/harness-git-branch.mjs" \ + --run-id "<run_id>" \ + --run-dir "<run_dir>" \ + --project-root "<project_root>" +``` + +On protected branches (`main`, `master`, `release/*` by default), this creates or checks out `harness/<run_id>`. Result is recorded in `artifacts/git-workflow.yaml`. Commits after review must use **harness-git-commit** (never raw `git commit`). + ## Orchestration — Single jelled implementer **Practice:** Peopleware — one accountable team owns delivery; generator–evaluator separation (executor does not self-certify). 
diff --git a/.pi/prompts/harness-steer.md b/.pi/prompts/harness-steer.md index 67b3a16b..7127a8ba 100644 --- a/.pi/prompts/harness-steer.md +++ b/.pi/prompts/harness-steer.md @@ -1,6 +1,6 @@ --- description: Post-review repair pass — executor reads repair-brief.yaml, then re-verify via /harness-review. -argument-hint: "[--attempt N]" +argument-hint: "[--burst] [--attempt N]" --- # harness-steer @@ -11,21 +11,38 @@ Thin orchestrator for the **steer loop**. Run only after `/harness-review` produ - Active run with `plan_ready` and `plan_packet_path` - `review-outcome.remediation_class` is `implementation_gap` (review outcome wins over executor `scope_drift` for routing) -- `steer_attempt < HARNESS_STEER_MAX_ATTEMPTS` (default 3) +- `steer_attempt < effective max` from `artifacts/steer-state.yaml` (default `HARNESS_STEER_MAX_ATTEMPTS=3`; +1 when burst allowed) ## Steps -1. Read `artifacts/review-outcome.yaml`, `artifacts/repair-brief.yaml`, `plan_packet_path` (paths only — do not paste bodies into tool args). When present, `repair-brief.yaml` already merges `artifacts/sentrux-repair-plan.yaml` (`[sentrux:…]` directives). -2. Update `artifacts/steer-state.yaml` (`attempt`, `max_attempts`, `active: true`). -3. Set policy phase to **execute** before spawning executor (required for mutating tools). -4. One `ask_user` steer gate unless `run-context.steer_approved` is already true. -5. Spawn **`harness/running/executor`** with `HarnessSpawnContext.mode: repair` and `repair_brief_path: artifacts/repair-brief.yaml`. Repair uses the same hash-anchored `read`/`edit`, batching, and pre-handoff verification rules as `/harness-run`. -6. Optional: `node "$UP_PKG/.pi/scripts/harness-sentrux-cli.mjs" gate --save` after repair to refresh the structural baseline. -7. Optional: `node "$UP_PKG/.pi/scripts/harness-ls-lint-cli.mjs"` after repair to confirm filename conventions. -7. `next_command`: **`/harness-review`** (always re-verify; use tiered adversary on attempts 2+). +1. 
Read `artifacts/review-outcome.yaml`, `artifacts/repair-brief.yaml`, `artifacts/steer-state.yaml`, `plan_packet_path` (paths only — do not paste bodies into tool args). When present, `repair-brief.yaml` already merges `artifacts/sentrux-repair-plan.yaml` (`[sentrux:…]` directives). +2. Extension updates `steer-state.yaml` on entry (`attempt`, `hygiene_repairs`, `burst_used`). **Hygiene-only** (`gap_kind: hygiene`) increments `hygiene_repairs` only — not `attempt`. +3. **Hygiene fast-path** (`gap_kind: hygiene` or `mixed` with hygiene directives first): + +```bash +node "$UP_PKG/.pi/scripts/harness-steer-hygiene.mjs" --run-dir "<run_dir>" --project-root "<project_root>" +``` + +Do **not** spawn executor for hygiene-only gaps. Then `next_command`: `/harness-review`. + +4. **Burst** (`--burst` + `HARNESS_STEER_BURST=1`): preflight before executor: + +```bash +node "$UP_PKG/.pi/scripts/harness-inline-repair.mjs" --run-dir "<run_dir>" +``` + +Requires eval `pass` + adversary `block_merge` on disk. Sets `inline_repair_attempted` on run-context. + +5. Set policy phase to **execute** before spawning executor (required for mutating tools). +6. One `ask_user` steer gate unless `run-context.steer_approved` is already true. +7. Spawn **`harness/running/executor`** with `HarnessSpawnContext.mode: repair` and `repair_brief_path: artifacts/repair-brief.yaml`. Run `repair-brief.repro_commands` (or `verification_commands`) before handoff when `must_pass_before_handoff: true`. +8. Optional: `node "$UP_PKG/.pi/scripts/harness-sentrux-cli.mjs" gate --save` after repair. +9. Optional: `node "$UP_PKG/.pi/scripts/harness-ls-lint-cli.mjs"` after repair. +10. `next_command`: **`/harness-review`** (always re-verify; lite review on attempts 2+ unless prior `block_merge`). 
## Forbidden - Re-call `approve_plan` unless `plan-packet.yaml` structure changed - Widen scope beyond approved packet - Skip review after repair +- Broad `git add` during hygiene (script stages only changed files that pass its denylist) diff --git a/.pi/scripts/README.md b/.pi/scripts/README.md index d392683f..879d2a52 100644 --- a/.pi/scripts/README.md +++ b/.pi/scripts/README.md @@ -37,6 +37,8 @@ From **Typescript extensions**, use `resolveHarnessScript()` / `getHarnessPackag | Resolve package root (`UP_PKG`) | `node "$UP_PKG/.pi/scripts/harness-resolve-up-pkg.mjs"` | | Auto-commit config bootstrap | `node "$UP_PKG/.pi/scripts/harness-auto-commit-bootstrap.mjs"` | | Git commit + co-author trailer | `node "$UP_PKG/.pi/scripts/harness-git-commit.mjs" --subject "…"` [`--dry-run`] | +| Git feature branch before execute | `node "$UP_PKG/.pi/scripts/harness-git-branch.mjs" --run-id "<run_id>" --run-dir "<run_dir>"` | +| Git QA smoke commit assert | `node "$UP_PKG/.pi/scripts/harness-git-qa-assert.mjs" [--project-root DIR]` | | Project `.env` (append-only) | `node "$UP_PKG/.pi/scripts/harness-sync-env.mjs"` (`--create-missing` after user confirms) | | Harness lens extension | `.pi/extensions/harness-lens.ts` → `.pi/lib/harness-lens/index.ts` (loaded by `.pi/extensions`; PostHog owns lens telemetry) | diff --git a/.pi/scripts/harness-adversary-repro-pack.mjs b/.pi/scripts/harness-adversary-repro-pack.mjs new file mode 100644 index 00000000..c6f1c0c5 --- /dev/null +++ b/.pi/scripts/harness-adversary-repro-pack.mjs @@ -0,0 +1,147 @@ +#!/usr/bin/env node +/** + * Run safe adversary repro commands during review Phase 1. 
+ * + * Usage: + * node harness-adversary-repro-pack.mjs --run-dir [--project-root ] + */ + +import { readFile, writeFile, stat } from "node:fs/promises"; +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { spawnSync } from "node:child_process"; +import { parse as parseYaml, stringify as stringifyYaml } from "yaml"; + +const SHELL_PREFIX = /^(npx|node|npm|tsx|sg|python3?)\b/i; + +function parseArgs(argv) { + const out = { runDir: null, projectRoot: null }; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--run-dir" && argv[i + 1]) out.runDir = argv[++i]; + else if (argv[i] === "--project-root" && argv[i + 1]) + out.projectRoot = argv[++i]; + } + return out; +} + +async function readYaml(path) { + try { + return parseYaml(await readFile(path, "utf8")); + } catch { + return null; + } +} + +function extractCommands(adversary) { + const cmds = []; + if (Array.isArray(adversary?.repro_commands)) { + for (const entry of adversary.repro_commands) { + if (entry?.cmd && typeof entry.cmd === "string") { + if (entry.safe_for_phase1 !== false) cmds.push(entry.cmd.trim()); + } + } + } + if (Array.isArray(adversary?.repro_steps)) { + for (const step of adversary.repro_steps) { + if (typeof step !== "string") continue; + for (const line of step.split("\n")) { + const cmd = line.trim(); + if (cmd && SHELL_PREFIX.test(cmd) && !cmds.includes(cmd)) { + cmds.push(cmd); + } + } + } + } + return cmds; +} + +async function isReportFresh(runDir, adversaryPath) { + const adversaryMtime = (await stat(adversaryPath)).mtimeMs; + const handoffPath = join(runDir, "handoff", "executor-summary.yaml"); + try { + const handoffMtime = (await stat(handoffPath)).mtimeMs; + return adversaryMtime >= handoffMtime - 1000; + } catch { + return true; + } +} + +export async function runHarnessAdversaryReproPack(opts) { + const runDir = opts.runDir; + const projectRoot = opts.projectRoot ?? 
join(runDir, "..", "..", ".."); + if (!runDir) return { ok: false, reason: "missing --run-dir", skipped: true }; + + const adversaryPath = join(runDir, "artifacts", "adversary-report.yaml"); + let adversary; + try { + adversary = await readYaml(adversaryPath); + if (!(await isReportFresh(runDir, adversaryPath))) { + return { + ok: false, + reason: "adversary-report older than last executor handoff", + skipped: true, + adversary_repro: "stale", + }; + } + } catch { + return { + ok: true, + skipped: true, + adversary_repro: "skipped", + reason: "no adversary-report.yaml", + }; + } + + const commands = extractCommands(adversary); + const results = []; + for (const cmd of commands) { + const r = spawnSync(cmd, { shell: true, cwd: projectRoot, encoding: "utf8" }); + results.push({ + cmd, + exit_code: r.status ?? 1, + }); + } + + const failed = results.some((r) => r.exit_code !== 0); + const adversary_repro = + commands.length === 0 ? "skipped" : failed ? "fail" : "pass"; + + const benchmarkPath = join(runDir, "artifacts", "benchmark-log.yaml"); + const benchmark = (await readYaml(benchmarkPath)) ?? { + schema_version: "1.0.0", + }; + benchmark.adversary_repro = adversary_repro; + benchmark.adversary_repro_results = results; + await writeFile(benchmarkPath, stringifyYaml(benchmark)); + + return { + ok: adversary_repro !== "fail", + adversary_repro, + results, + skipped: commands.length === 0, + }; +} + +async function main() { + const args = parseArgs(process.argv); + const projectRoot = + args.projectRoot ?? (args.runDir ? join(args.runDir, "..", "..", "..") : null); + const result = await runHarnessAdversaryReproPack({ + runDir: args.runDir, + projectRoot, + }); + if (!result.ok && !result.skipped) { + console.error(`harness-adversary-repro-pack: FAIL (${result.adversary_repro})`); + process.exit(1); + } + console.log( + `harness-adversary-repro-pack: ${result.adversary_repro ?? result.reason ?? 
"done"}`, + ); +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/.pi/scripts/harness-git-branch.mjs b/.pi/scripts/harness-git-branch.mjs new file mode 100644 index 00000000..ebec9348 --- /dev/null +++ b/.pi/scripts/harness-git-branch.mjs @@ -0,0 +1,69 @@ +#!/usr/bin/env node +/** + * Ensure harness feature branch when on protected default branch. + * + * Usage: + * node "$UP_PKG/.pi/scripts/harness-git-branch.mjs" \ + * --run-id [--run-dir ] [--project-root ] [--dry-run] + */ + +import { dirname, join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { + ensureHarnessGitBranch, + writeGitWorkflowArtifact, +} from "../lib/harness-git-branch.mjs"; + +const SCRIPT_DIR = dirname(fileURLToPath(import.meta.url)); +const UP_PKG = join(SCRIPT_DIR, "..", ".."); + +function parseArgs(argv) { + const out = { + runId: null, + runDir: null, + projectRoot: null, + dryRun: false, + }; + for (let i = 2; i < argv.length; i++) { + const a = argv[i]; + if (a === "--dry-run") out.dryRun = true; + else if (a === "--run-id" && argv[i + 1]) out.runId = argv[++i]; + else if (a === "--run-dir" && argv[i + 1]) out.runDir = argv[++i]; + else if (a === "--project-root" && argv[i + 1]) + out.projectRoot = argv[++i]; + else if (a === "--help" || a === "-h") { + console.log(`Usage: harness-git-branch.mjs --run-id [--run-dir ] [--project-root ] [--dry-run]`); + process.exit(0); + } + } + return out; +} + +async function main() { + const args = parseArgs(process.argv); + const projectRoot = args.projectRoot ?? 
process.cwd(); + const runId = args.runId; + if (!runId) { + console.error("harness-git-branch: --run-id is required"); + process.exit(1); + } + + const result = await ensureHarnessGitBranch({ + projectRoot, + runId, + upPkg: UP_PKG, + dryRun: args.dryRun, + }); + + if (args.runDir && !args.dryRun) { + await writeGitWorkflowArtifact({ runDir: args.runDir, result }); + } + + console.log(JSON.stringify(result, null, 2)); + if (!result.ok) process.exit(1); +} + +main().catch((err) => { + console.error(`harness-git-branch: ${err.message}`); + process.exit(1); +}); diff --git a/.pi/scripts/harness-git-commit.mjs b/.pi/scripts/harness-git-commit.mjs index 1d446fdf..7ca76870 100644 --- a/.pi/scripts/harness-git-commit.mjs +++ b/.pi/scripts/harness-git-commit.mjs @@ -41,6 +41,10 @@ function parseArgs(argv) { else if (a === "--body" && argv[i + 1]) opts.body = argv[++i]; else if (a === "--message" && argv[i + 1]) opts.message = argv[++i]; else if (a === "--root" && argv[i + 1]) opts.root = argv[++i]; + else if (a === "--only-path" && argv[i + 1]) { + opts.onlyPaths ??= []; + opts.onlyPaths.push(argv[++i]); + } else if (a.startsWith("-")) { console.error(`harness-git-commit: unknown flag ${a}`); process.exit(1); @@ -158,6 +162,12 @@ async function main() { if (flags.has("amend")) gitArgs.push("--amend"); if (flags.has("no-verify")) gitArgs.push("--no-verify"); if (flags.has("signoff")) gitArgs.push("--signoff"); + if (Array.isArray(opts.onlyPaths) && opts.onlyPaths.length > 0) { + for (const relPath of opts.onlyPaths) { + await runGit(["add", "--", relPath], projectRoot); + } + gitArgs.push("--only", "--", ...opts.onlyPaths); + } const out = await runGit(gitArgs, projectRoot); if (out.trim()) process.stdout.write(out); diff --git a/.pi/scripts/harness-git-qa-assert.mjs b/.pi/scripts/harness-git-qa-assert.mjs new file mode 100644 index 00000000..18cbec56 --- /dev/null +++ b/.pi/scripts/harness-git-qa-assert.mjs @@ -0,0 +1,32 @@ +#!/usr/bin/env node +/** + * Assert scoped 
harness git QA smoke commit at HEAD. + * + * Usage: node harness-git-qa-assert.mjs [--project-root DIR] + */ + +import { isHarnessGitQaCommitComplete } from "../lib/harness-git-qa.mjs"; + +function parseArgs(argv) { + let projectRoot = process.cwd(); + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--project-root" && argv[i + 1]) projectRoot = argv[++i]; + } + return { projectRoot }; +} + +async function main() { + const { projectRoot } = parseArgs(process.argv); + if (!(await isHarnessGitQaCommitComplete(projectRoot))) { + console.error( + "harness-git-qa-assert: FAIL — scoped harness-git-commit missing at HEAD", + ); + process.exit(1); + } + console.log("harness-git-qa-assert: pass"); +} + +main().catch((err) => { + console.error(`harness-git-qa-assert: ${err.message}`); + process.exit(1); +}); diff --git a/.pi/scripts/harness-inline-repair.mjs b/.pi/scripts/harness-inline-repair.mjs new file mode 100644 index 00000000..c3f65494 --- /dev/null +++ b/.pi/scripts/harness-inline-repair.mjs @@ -0,0 +1,78 @@ +#!/usr/bin/env node +/** + * Burst/inline repair preflight — validates eval-pass + adversary block_merge before + * /harness-steer --burst (deferred Phase 4b; no executor embedded in /harness-review). 
+ * + * Usage: + * node harness-inline-repair.mjs --run-dir + */ + +import { readFile } from "node:fs/promises"; +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { parse as parseYaml } from "yaml"; + +function parseArgs(argv) { + const out = { runDir: null }; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--run-dir" && argv[i + 1]) out.runDir = argv[++i]; + } + return out; +} + +async function readYaml(path) { + try { + return parseYaml(await readFile(path, "utf8")); + } catch { + return null; + } +} + +export async function runHarnessInlineRepairPreflight(opts) { + const runDir = opts.runDir; + if (!runDir) return { ok: false, reason: "missing --run-dir" }; + + const evalDoc = await readYaml(join(runDir, "artifacts", "eval-verdict.yaml")); + const adversary = await readYaml( + join(runDir, "artifacts", "adversary-report.yaml"), + ); + const runCtx = await readYaml(join(runDir, "run-context.yaml")); + + const evalPass = (evalDoc?.status ?? 
"").toLowerCase() === "pass"; + const blockMerge = adversary?.block_merge === true; + if (!evalPass || !blockMerge) { + return { + ok: false, + reason: "inline/burst repair requires eval pass + adversary block_merge", + }; + } + if (runCtx?.inline_repair_attempted === true) { + return { ok: false, reason: "inline_repair_attempted already set" }; + } + const burstEnv = process.env.HARNESS_STEER_BURST?.trim(); + const burstOn = burstEnv === "1" || burstEnv?.toLowerCase() === "true"; + if (!burstOn) { + return { + ok: false, + reason: "set HARNESS_STEER_BURST=1 to allow burst steer", + }; + } + return { ok: true, recommended: "/harness-steer --burst" }; +} + +async function main() { + const args = parseArgs(process.argv); + const result = await runHarnessInlineRepairPreflight(args); + if (!result.ok) { + console.error(`harness-inline-repair: ${result.reason}`); + process.exit(1); + } + console.log(`harness-inline-repair: ok → ${result.recommended}`); +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/.pi/scripts/harness-review-preflight.mjs b/.pi/scripts/harness-review-preflight.mjs new file mode 100644 index 00000000..9ee7fa50 --- /dev/null +++ b/.pi/scripts/harness-review-preflight.mjs @@ -0,0 +1,112 @@ +#!/usr/bin/env node +/** + * Hard gate before spawning review evaluators — ensures benchmark-log is fresh. 
+ * + * Usage: + * node harness-review-preflight.mjs --run-dir [--steer-attempt N] + */ + +import { readFile, stat } from "node:fs/promises"; +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { parse as parseYaml } from "yaml"; + +function parseArgs(argv) { + const out = { runDir: null, steerAttempt: null }; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--run-dir" && argv[i + 1]) { + out.runDir = argv[++i]; + } else if (argv[i] === "--steer-attempt" && argv[i + 1]) { + out.steerAttempt = Number.parseInt(argv[++i], 10); + } + } + return out; +} + +async function readYaml(path) { + try { + const raw = await readFile(path, "utf8"); + return parseYaml(raw); + } catch { + return null; + } +} + +export async function runHarnessReviewPreflight(opts) { + const runDir = opts.runDir; + if (!runDir) { + return { ok: false, reason: "missing --run-dir" }; + } + const benchmarkPath = join(runDir, "artifacts", "benchmark-log.yaml"); + const handoffPath = join(runDir, "handoff", "executor-summary.yaml"); + const runCtxPath = join(runDir, "run-context.yaml"); + + let benchmarkMtime = 0; + let handoffMtime = 0; + try { + benchmarkMtime = (await stat(benchmarkPath)).mtimeMs; + } catch { + return { + ok: false, + reason: + "benchmark-log.yaml missing — run Phase 1 (harness-verify + tests) before evaluators", + }; + } + try { + handoffMtime = (await stat(handoffPath)).mtimeMs; + } catch { + /* execute may not have handoff in readonly resume */ + } + + const benchmark = await readYaml(benchmarkPath); + if (!benchmark || typeof benchmark !== "object") { + return { ok: false, reason: "benchmark-log.yaml unreadable or empty" }; + } + if (!benchmark.harness_verify) { + return { + ok: false, + reason: "benchmark-log.yaml missing harness_verify field", + }; + } + + const runCtx = await readYaml(runCtxPath); + const expectedAttempt = + opts.steerAttempt ?? + (typeof runCtx?.steer_attempt === "number" ? 
runCtx.steer_attempt : 0); + + if (typeof benchmark.steer_attempt === "number") { + if (benchmark.steer_attempt < expectedAttempt) { + return { + ok: false, + reason: `benchmark-log stale: steer_attempt ${benchmark.steer_attempt} < expected ${expectedAttempt}`, + }; + } + } + + if (handoffMtime > 0 && benchmarkMtime < handoffMtime - 1000) { + return { + ok: false, + reason: + "benchmark-log older than executor handoff — re-run Phase 1 deterministic checks", + }; + } + + return { ok: true, benchmark }; +} + +async function main() { + const args = parseArgs(process.argv); + const result = await runHarnessReviewPreflight(args); + if (!result.ok) { + console.error(`harness-review-preflight: FAIL — ${result.reason}`); + process.exit(1); + } + console.log("harness-review-preflight: pass"); +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/.pi/scripts/harness-steer-hygiene.d.mts b/.pi/scripts/harness-steer-hygiene.d.mts new file mode 100644 index 00000000..906080c8 --- /dev/null +++ b/.pi/scripts/harness-steer-hygiene.d.mts @@ -0,0 +1,7 @@ +export function runHarnessSteerHygiene(opts: { + runDir: string; + projectRoot?: string | null; +}): Promise<{ + ok: boolean; + log?: { outcome?: string }; +}>; diff --git a/.pi/scripts/harness-steer-hygiene.mjs b/.pi/scripts/harness-steer-hygiene.mjs new file mode 100644 index 00000000..2c0ec279 --- /dev/null +++ b/.pi/scripts/harness-steer-hygiene.mjs @@ -0,0 +1,135 @@ +#!/usr/bin/env node +/** + * Deterministic hygiene repairs (lint/format/stage) without executor steer. 
+ * + * Usage: + * node harness-steer-hygiene.mjs --run-dir [--project-root ] + */ + +import { readFile, writeFile, mkdir } from "node:fs/promises"; +import { join } from "node:path"; +import { fileURLToPath } from "node:url"; +import { spawnSync } from "node:child_process"; +import { parse as parseYaml, stringify as stringifyYaml } from "yaml"; + +const DENYLIST = [ + /^\.env/i, + /credentials/i, + /^graphify-out\//, + /^\.pi\/harness\/runs\//, + /^node_modules\//, +]; + +function parseArgs(argv) { + const out = { runDir: null, projectRoot: null }; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--run-dir" && argv[i + 1]) out.runDir = argv[++i]; + else if (argv[i] === "--project-root" && argv[i + 1]) + out.projectRoot = argv[++i]; + } + return out; +} + +function isDenied(relPath) { + return DENYLIST.some((p) => p.test(relPath)); +} + +function runCmd(cmd, cwd) { + const result = spawnSync(cmd, { + shell: true, + cwd, + encoding: "utf8", + }); + return { + cmd, + exit_code: result.status ?? 1, + stdout: (result.stdout ?? "").slice(0, 2000), + stderr: (result.stderr ?? "").slice(0, 2000), + }; +} + +async function readYaml(path) { + try { + return parseYaml(await readFile(path, "utf8")); + } catch { + return null; + } +} + +function collectChangedFiles(projectRoot, runDir) { + const fromGit = spawnSync("git diff --name-only HEAD", { + shell: true, + cwd: projectRoot, + encoding: "utf8", + }); + const files = new Set(); + if (fromGit.status === 0) { + for (const line of (fromGit.stdout ?? "").split("\n")) { + const t = line.trim(); + if (t && !isDenied(t)) files.add(t); + } + } + const handoff = join(runDir, "handoff", "executor-summary.yaml"); + return { files: [...files], handoffPath: handoff }; +} + +export async function runHarnessSteerHygiene(opts) { + const runDir = opts.runDir; + const projectRoot = opts.projectRoot ?? 
join(runDir, "..", "..", ".."); + if (!runDir) return { ok: false, reason: "missing --run-dir" }; + + const { files } = collectChangedFiles(projectRoot, runDir); + const log = { + schema_version: "1.0.0", + commands: [], + changed_files: files, + outcome: "skipped", + }; + + if (files.length === 0) { + log.outcome = "no_changed_files"; + } else { + const biomeTargets = files.filter((f) => /\.(ts|tsx|js|mjs|json)$/.test(f)); + if (biomeTargets.length > 0) { + const quoted = biomeTargets.map((f) => `"${f}"`).join(" "); + log.commands.push( + runCmd(`npx -y @biomejs/biome check --write ${quoted}`, projectRoot), + ); + } + const stageable = files.filter((f) => !isDenied(f)); + if (stageable.length > 0) { + const quoted = stageable.map((f) => `"${f}"`).join(" "); + log.commands.push(runCmd(`git add ${quoted}`, projectRoot)); + } + const failed = log.commands.some((c) => c.exit_code !== 0); + log.outcome = failed ? "fail" : "pass"; + } + + const outPath = join(runDir, "artifacts", "hygiene-repair-log.yaml"); + await mkdir(join(runDir, "artifacts"), { recursive: true }); + await writeFile(outPath, stringifyYaml(log)); + + return { ok: log.outcome === "pass" || log.outcome === "no_changed_files", log }; +} + +async function main() { + const args = parseArgs(process.argv); + const projectRoot = + args.projectRoot ?? (args.runDir ? join(args.runDir, "..", "..", "..") : null); + const result = await runHarnessSteerHygiene({ + runDir: args.runDir, + projectRoot, + }); + if (!result.ok) { + console.error(`harness-steer-hygiene: FAIL — ${result.log?.outcome ?? "error"}`); + process.exit(1); + } + console.log(`harness-steer-hygiene: ${result.log?.outcome ?? 
"pass"}`); +} + +if (process.argv[1] === fileURLToPath(import.meta.url)) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/.pi/scripts/harness-steer-qa-seed.mjs b/.pi/scripts/harness-steer-qa-seed.mjs new file mode 100644 index 00000000..b2968872 --- /dev/null +++ b/.pi/scripts/harness-steer-qa-seed.mjs @@ -0,0 +1,141 @@ +#!/usr/bin/env node +/** + * Seed deterministic steer-loop fixtures for headless QA (hygiene gap → /harness-steer). + * + * Usage: + * node harness-steer-qa-seed.mjs --run-dir [--project-root ] + */ + +import { readFile, writeFile, mkdir } from "node:fs/promises"; +import { join } from "node:path"; +import { stringify as stringifyYaml } from "yaml"; + +function parseArgs(argv) { + const out = { runDir: null, projectRoot: null }; + for (let i = 2; i < argv.length; i++) { + if (argv[i] === "--run-dir" && argv[i + 1]) out.runDir = argv[++i]; + else if (argv[i] === "--project-root" && argv[i + 1]) + out.projectRoot = argv[++i]; + } + return out; +} + +async function readRunId(runDir) { + try { + const raw = await readFile(join(runDir, "run-context.yaml"), "utf8"); + const m = /run_id:\s*['"]?([^\s'"]+)/.exec(raw); + return m?.[1] ?? null; + } catch { + return null; + } +} + +export async function seedSteerQaFixtures(opts) { + const runDir = opts.runDir; + if (!runDir) return { ok: false, reason: "missing --run-dir" }; + const runId = opts.runId ?? 
(await readRunId(runDir)); + if (!runId) return { ok: false, reason: "run_id unknown" }; + + const artifacts = join(runDir, "artifacts"); + await mkdir(artifacts, { recursive: true }); + + const reviewOutcome = { + schema_version: "1.0.0", + run_id: runId, + status: "fail", + remediation_class: "implementation_gap", + recommended_next: "/harness-steer", + gap_kind: "hygiene", + eval_status: "fail", + adversary_status: "proceed", + steer_attempt: 0, + review_tier: "full", + source_artifacts: { + eval_verdict: "artifacts/eval-verdict.yaml", + benchmark_log: "artifacts/benchmark-log.yaml", + }, + seed_source: "harness-steer-qa-seed", + }; + + const repairBrief = { + schema_version: "1.0.0", + run_id: runId, + steer_attempt: 1, + remediation_class: "implementation_gap", + gap_kind: "hygiene", + source_artifacts: { + review_outcome: "artifacts/review-outcome.yaml", + eval_verdict: "artifacts/eval-verdict.yaml", + }, + fix_directives: [ + "Run harness-steer-hygiene to stage allowed changed files only.", + ], + verification_commands: [ + 'node "$UP_PKG/.pi/scripts/harness-ls-lint-cli.mjs"', + ], + must_pass_before_handoff: false, + seed_source: "harness-steer-qa-seed", + }; + + const evalVerdict = { + schema_version: "1.0.0", + run_id: runId, + status: "fail", + recommended_action: "steer", + failed_checks: ["ls_lint_format"], + seed_source: "harness-steer-qa-seed", + }; + + const steerState = { + schema_version: "1.0.0", + run_id: runId, + attempt: 0, + max_attempts: 3, + active: false, + hygiene_repairs: 0, + seed_source: "harness-steer-qa-seed", + }; + + await writeFile( + join(artifacts, "review-outcome.yaml"), + stringifyYaml(reviewOutcome), + "utf8", + ); + await writeFile( + join(artifacts, "repair-brief.yaml"), + stringifyYaml(repairBrief), + "utf8", + ); + await writeFile( + join(artifacts, "eval-verdict.yaml"), + stringifyYaml(evalVerdict), + "utf8", + ); + await writeFile( + join(artifacts, "steer-state.yaml"), + stringifyYaml(steerState), + "utf8", + ); + + 
return { ok: true, run_id: runId, artifacts_dir: artifacts }; +} + +async function main() { + const args = parseArgs(process.argv); + const runDir = args.runDir; + if (!runDir) { + console.error("harness-steer-qa-seed: --run-dir is required"); + process.exit(1); + } + const out = await seedSteerQaFixtures({ runDir: args.runDir }); + console.log(JSON.stringify(out, null, 2)); + if (!out.ok) process.exit(1); +} + +const isMain = process.argv[1]?.endsWith("harness-steer-qa-seed.mjs"); +if (isMain) { + main().catch((err) => { + console.error(err); + process.exit(1); + }); +} diff --git a/.pi/scripts/harness-verify.mjs b/.pi/scripts/harness-verify.mjs index ad5ec6a2..04284407 100644 --- a/.pi/scripts/harness-verify.mjs +++ b/.pi/scripts/harness-verify.mjs @@ -756,6 +756,23 @@ async function checkAutoCommitGitCommit() { if (!out.includes("Co-authored-by:")) { fail("harness-git-commit message missing Co-authored-by trailer"); } + const branchScript = join(ROOT, ".pi", "scripts", "harness-git-branch.mjs"); + const branchLib = join(ROOT, ".pi", "lib", "harness-git-branch.mjs"); + if (!(await fileExists(branchScript)) || !(await fileExists(branchLib))) { + fail("missing harness-git-branch script or lib"); + } + const dryBranch = await runNodeScript(branchScript, [ + "--run-id", + "harness-verify-smoke", + "--dry-run", + ]); + if (dryBranch.code !== 0) { + fail(dryBranch.out.trim() || "harness-git-branch --dry-run failed"); + } + const qaAssert = join(ROOT, ".pi", "scripts", "harness-git-qa-assert.mjs"); + if (!(await fileExists(qaAssert))) { + fail("missing harness-git-qa-assert.mjs"); + } ok("auto-commit git commit (skill, CLI, config, SYSTEM.md)"); } diff --git a/test/harness-git-branch.test.mjs b/test/harness-git-branch.test.mjs new file mode 100644 index 00000000..0a8f6543 --- /dev/null +++ b/test/harness-git-branch.test.mjs @@ -0,0 +1,95 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { mkdtemp, mkdir, writeFile } from 
"node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { spawnSync } from "node:child_process"; +import { + ensureHarnessGitBranch, + harnessFeatureBranchName, + isProtectedBranch, +} from "../.pi/lib/harness-git-branch.mjs"; + +test("isProtectedBranch matches globs", () => { + assert.equal(isProtectedBranch("main", ["main", "master"]), true); + assert.equal(isProtectedBranch("release/1.0", ["release/*"]), true); + assert.equal(isProtectedBranch("feat/foo", ["main"]), false); +}); + +test("harnessFeatureBranchName slugifies run id", () => { + assert.equal( + harnessFeatureBranchName("harness-qa-live-123"), + "harness/harness-qa-live-123", + ); +}); + +test("ensureHarnessGitBranch creates feature branch from main", async () => { + const tmp = await mkdtemp(join(tmpdir(), "harness-git-branch-")); + spawnSync("git", ["init"], { cwd: tmp }); + spawnSync("git", ["config", "user.email", "qa@test.local"], { cwd: tmp }); + spawnSync("git", ["config", "user.name", "QA"], { cwd: tmp }); + spawnSync("git", ["checkout", "-b", "main"], { cwd: tmp }); + await writeFile(join(tmp, "README.md"), "init\n", "utf8"); + spawnSync("git", ["add", "README.md"], { cwd: tmp }); + spawnSync("git", ["commit", "-m", "init"], { cwd: tmp }); + + await mkdir(join(tmp, ".pi"), { recursive: true }); + await writeFile( + join(tmp, ".pi", "auto-commit.json"), + JSON.stringify({ + coAuthor: { login: "bot", email: "bot@test.local" }, + branch: { + strategy: "auto-feature-branch", + protected: ["main"], + }, + message: { + template: "{type}({scope}): {subject}", + coAuthorTrailer: "Co-authored-by: {login} <{email}>", + }, + }), + "utf8", + ); + + const result = await ensureHarnessGitBranch({ + projectRoot: tmp, + runId: "run-abc-123", + upPkg: tmp, + }); + assert.equal(result.ok, true); + assert.equal(result.action, "create"); + assert.equal(result.target_branch, "harness/run-abc-123"); + assert.equal(result.new_branch, "harness/run-abc-123"); +}); + 
+test("ensureHarnessGitBranch skips when not on protected branch", async () => { + const tmp = await mkdtemp(join(tmpdir(), "harness-git-branch-")); + spawnSync("git", ["init"], { cwd: tmp }); + spawnSync("git", ["config", "user.email", "qa@test.local"], { cwd: tmp }); + spawnSync("git", ["config", "user.name", "QA"], { cwd: tmp }); + spawnSync("git", ["checkout", "-b", "feat/existing"], { cwd: tmp }); + await writeFile(join(tmp, "seed.txt"), "x\n", "utf8"); + spawnSync("git", ["add", "seed.txt"], { cwd: tmp }); + spawnSync("git", ["commit", "-m", "seed"], { cwd: tmp }); + await mkdir(join(tmp, ".pi"), { recursive: true }); + await writeFile( + join(tmp, ".pi", "auto-commit.json"), + JSON.stringify({ + coAuthor: { login: "bot", email: "bot@test.local" }, + branch: { strategy: "auto-feature-branch", protected: ["main"] }, + message: { + template: "{type}: {subject}", + coAuthorTrailer: "Co-authored-by: {login} <{email}>", + }, + }), + "utf8", + ); + + const result = await ensureHarnessGitBranch({ + projectRoot: tmp, + runId: "run-x", + upPkg: tmp, + }); + assert.equal(result.ok, true); + assert.equal(result.reason, "not_on_protected_branch"); + assert.equal(result.action, "none"); +}); diff --git a/test/harness-live-widget-status.test.ts b/test/harness-live-widget-status.test.ts index dbf739a3..9f5481bd 100644 --- a/test/harness-live-widget-status.test.ts +++ b/test/harness-live-widget-status.test.ts @@ -1,5 +1,10 @@ import assert from "node:assert/strict"; +import { mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; import { describe, test } from "node:test"; +import harnessLiveWidget from "../.pi/extensions/harness-live-widget.ts"; +import { writeHarnessProjectEnabled } from "../.pi/lib/harness-project-config.ts"; import { formatCrossSessionResumeMessage, nextStepAfterOutcome, @@ -9,6 +14,7 @@ import { deriveHarnessStatusHint, formatHarnessPhaseLabel, type HarnessUiState, + HarnessUiStateStore, 
nextHarnessPhase, } from "../.pi/lib/harness-ui-state.ts"; @@ -259,6 +265,72 @@ describe("createStateFromEntries run-context merge", () => { "/harness-plan or /harness-incident", ); }); + + test("no active run recommends fresh harness-plan command", () => { + const state = createStateFromEntries([]); + assert.equal(state.phase, "plan"); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); + + test("confirmed clear after run context invalidates stale active state", () => { + const state = createStateFromEntries([ + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "evaluate", + plan_ready: true, + plan_id: "plan-stale", + run_id: "run-stale", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + status: "active", + }, + }, + { + type: "custom", + customType: "harness-clear-result", + data: { approved: true, active_cleared: true, cleared_all: true }, + }, + ]); + assert.equal(state.traceRunId, null); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); + + test("confirmed clear tombstones active_run_ids even if stale run context is appended later", () => { + const state = createStateFromEntries([ + { + type: "custom", + customType: "harness-clear-result", + data: { + approved: true, + active_cleared: true, + cleared_all: true, + active_run_ids: ["run-stale"], + }, + }, + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "execute", + plan_ready: true, + plan_id: "plan-stale", + run_id: "run-stale", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + status: "active", + }, + }, + ]); + assert.equal(state.traceRunId, null); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + 
assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); }); describe("cross-session resume UX", () => { @@ -287,3 +359,333 @@ describe("cross-session resume UX", () => { assert.equal(hint.severity, "warning"); }); }); + +type Handler = (...args: unknown[]) => unknown; + +function createPi() { + const lifecycle = new Map(); + const eventHandlers = new Map(); + return { + on(name: string, handler: Handler) { + const handlers = lifecycle.get(name) ?? []; + handlers.push(handler); + lifecycle.set(name, handlers); + }, + async fire(name: string, ...args: unknown[]) { + for (const handler of lifecycle.get(name) ?? []) await handler(...args); + }, + events: { + on(name: string, handler: Handler) { + const handlers = eventHandlers.get(name) ?? []; + handlers.push(handler); + eventHandlers.set(name, handlers); + }, + emit(name: string, payload: unknown) { + for (const handler of eventHandlers.get(name) ?? []) handler(payload); + }, + }, + }; +} + +function createWidgetCtx(entries: unknown[] = []) { + const widgets: Array<{ key: string; content: unknown }> = []; + const statuses: Array<{ key: string; text: string | undefined }> = []; + return { + hasUI: true, + ui: { + setWidget(key: string, content: unknown) { + widgets.push({ key, content }); + }, + setStatus(key: string, text: string | undefined) { + statuses.push({ key, text }); + }, + }, + sessionManager: { getEntries: () => entries }, + widgets, + statuses, + }; +} + +async function flushMicrotasks(): Promise { + await Promise.resolve(); + await Promise.resolve(); +} + +describe("cross-session resume invalidation", () => { + test("mounted live widget ignores delayed cross-session-resume after confirmed clear", async () => { + const previous = process.cwd(); + const root = mkdtempSync(join(tmpdir(), "up-live-widget-delayed-resume-")); + try { + process.chdir(root); + writeHarnessProjectEnabled(root, true); + const entries: unknown[] = [ + { + type: "custom", + customType: "harness-clear-result", 
+ data: { + approved: true, + active_cleared: true, + cleared_all: true, + active_run_ids: ["run-stale"], + }, + }, + ]; + const pi = createPi(); + const ctx = createWidgetCtx(entries); + harnessLiveWidget(pi as never); + await pi.fire("session_start", {}, ctx); + const factory = ctx.widgets.at(-1)?.content as ( + tui: { requestRender(): void }, + theme: { + fg(color: string, text: string): string; + bold(text: string): string; + }, + ) => { render(width: number): string[] }; + const widget = factory( + { requestRender() {} }, + { + fg: (_color: string, text: string) => text, + bold: (text: string) => text, + }, + ); + + pi.events.emit("harness-cross-session-resume", { + resume_command: "/harness-use-run run-stale --claim", + }); + await flushMicrotasks(); + const rendered = widget.render(120).join("\n"); + assert.doesNotMatch(rendered, /harness-use-run/); + assert.match(rendered, /Next: \/harness-plan/); + } finally { + process.chdir(previous); + rmSync(root, { recursive: true, force: true }); + } + }); + + test("HarnessUiStateStore clears resume overlay after confirmed clear entry", () => { + const entries: unknown[] = []; + const store = new HarnessUiStateStore(); + const ctx = createWidgetCtx(entries); + store.setCrossSessionResumeCommand("/harness-use-run run-abc --claim"); + assert.equal( + deriveHarnessStatusHint(store.refresh(ctx as never)).text, + "Resume: /harness-use-run run-abc --claim", + ); + entries.push({ + type: "custom", + customType: "harness-clear-result", + data: { approved: true, active_cleared: true, cleared_all: true }, + }); + const state = store.refresh(ctx as never); + assert.equal(state.crossSessionResumeCommand, null); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); + + test("HarnessUiStateStore rejects delayed stale resume overlay after confirmed clear", () => { + const entries: unknown[] = [ + { + type: "custom", + customType: 
"harness-clear-result", + data: { + approved: true, + active_cleared: true, + active_run_ids: ["run-stale"], + }, + }, + ]; + const store = new HarnessUiStateStore(); + const ctx = createWidgetCtx(entries); + assert.equal( + deriveHarnessStatusHint(store.refresh(ctx as never)).text, + "Next: /harness-plan", + ); + + store.setCrossSessionResumeCommand("/harness-use-run run-stale --claim"); + const state = store.refresh(ctx as never); + + assert.equal(state.crossSessionResumeCommand, null); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); + + test("HarnessUiStateStore clear event suppresses lagging run-context with project_root", () => { + const entries: unknown[] = [ + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "execute", + plan_ready: true, + plan_id: "plan-stale", + run_id: "run-stale", + project_root: "/tmp/ultimate-pi", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + status: "active", + }, + }, + ]; + const store = new HarnessUiStateStore(); + const ctx = createWidgetCtx(entries); + store.refresh(ctx as never); + store.setCrossSessionResumeCommand("/harness-use-run run-stale --claim"); + store.clearActiveRunState(entries.length); + const state = store.refresh(ctx as never); + assert.equal(state.traceRunId, null); + assert.equal(state.crossSessionResumeCommand, null); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); + + test("HarnessUiStateStore clear event suppresses stale active entries until run update", () => { + const entries: unknown[] = [ + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "execute", + plan_ready: true, + plan_id: "plan-stale", + run_id: "run-stale", + last_completed_step: "execute", + last_outcome: "completed", + 
next_recommended_command: "/harness-review", + status: "active", + }, + }, + ]; + const store = new HarnessUiStateStore(); + const ctx = createWidgetCtx(entries); + assert.equal( + deriveHarnessStatusHint(store.refresh(ctx as never)).text, + "Next: /harness-review", + ); + + store.setCrossSessionResumeCommand("/harness-use-run run-stale --claim"); + store.clearActiveRunState(entries.length); + const state = store.refresh(ctx as never); + + assert.equal(state.traceRunId, null); + assert.equal(state.crossSessionResumeCommand, null); + assert.equal(state.nextRecommendedCommand, "/harness-plan"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-plan"); + }); + + test("current run update after confirmed clear supersedes clear overlay", () => { + const entries: unknown[] = [ + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "evaluate", + plan_ready: true, + plan_id: "plan-stale", + run_id: "run-stale", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + status: "active", + }, + }, + ]; + const store = new HarnessUiStateStore(); + const ctx = createWidgetCtx(entries); + store.refresh(ctx as never); + store.clearActiveRunState(entries.length); + assert.equal( + deriveHarnessStatusHint(store.refresh(ctx as never)).text, + "Next: /harness-plan", + ); + + entries.push( + { + type: "custom", + customType: "harness-clear-result", + data: { approved: true, active_cleared: true, cleared_all: true }, + }, + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "plan", + plan_ready: true, + plan_id: "plan-current", + run_id: "run-current", + last_completed_step: "plan", + last_outcome: "ready", + next_recommended_command: null, + status: "active", + }, + }, + ); + + const state = store.refresh(ctx as never); + assert.equal(state.traceRunId, "run-current"); + assert.equal(state.crossSessionResumeCommand, null); + assert.equal(state.nextRecommendedCommand, 
"/harness-run"); + assert.equal(deriveHarnessStatusHint(state).text, "Next: /harness-run"); + }); + + test("mounted live widget clears resume overlay on harness-runs-cleared event", async () => { + const previous = process.cwd(); + const root = mkdtempSync(join(tmpdir(), "up-live-widget-clear-")); + try { + process.chdir(root); + writeHarnessProjectEnabled(root, true); + const pi = createPi(); + const ctx = createWidgetCtx([ + { + type: "custom", + customType: "harness-run-context", + data: { + phase: "execute", + plan_ready: true, + plan_id: "plan-stale", + run_id: "run-stale", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + status: "active", + }, + }, + ]); + harnessLiveWidget(pi as never); + await pi.fire("session_start", {}, ctx); + const factory = ctx.widgets.at(-1)?.content as ( + tui: { requestRender(): void }, + theme: { + fg(color: string, text: string): string; + bold(text: string): string; + }, + ) => { render(width: number): string[] }; + const widget = factory( + { requestRender() {} }, + { + fg: (_color: string, text: string) => text, + bold: (text: string) => text, + }, + ); + + pi.events.emit("harness-cross-session-resume", { + resume_command: "/harness-use-run run-abc --claim", + }); + await flushMicrotasks(); + assert.match( + widget.render(120).join("\n"), + /Resume: \/harness-use-run run-abc/, + ); + + pi.events.emit("harness-runs-cleared", { deleted: 1, projectRoot: root }); + await flushMicrotasks(); + const afterClear = widget.render(120).join("\n"); + assert.doesNotMatch(afterClear, /harness-use-run/); + assert.doesNotMatch(afterClear, /\/harness-review/); + assert.match(afterClear, /Next: \/harness-plan/); + } finally { + process.chdir(previous); + rmSync(root, { recursive: true, force: true }); + } + }); +}); diff --git a/test/harness-remediation.test.mjs b/test/harness-remediation.test.mjs new file mode 100644 index 00000000..b3600d9f --- /dev/null +++ 
b/test/harness-remediation.test.mjs @@ -0,0 +1,144 @@ +import assert from "node:assert/strict"; +import { describe, test } from "node:test"; +import { + classifyImplementationGap, + effectiveSteerMaxAttempts, + parseReproCommandsFromAdversary, + recommendedNextForRemediation, + remediationClassFromEvalVerdict, + steerBurstAllowed, + steerBurstFromEnv, + synthesizeReviewOutcome, +} from "../.pi/lib/harness-remediation.ts"; + +describe("synthesizeReviewOutcome split verdict", () => { + test("eval pass + adversary block_merge → implementation_gap + burst when env on", () => { + const prev = process.env.HARNESS_STEER_BURST; + process.env.HARNESS_STEER_BURST = "1"; + const out = synthesizeReviewOutcome({ + runId: "run-1", + eval: { status: "pass" }, + adversary: { block_merge: true, repro_steps: ["npx vitest run foo"] }, + }); + assert.equal(out?.remediation_class, "implementation_gap"); + assert.equal(out?.status, "fail"); + assert.equal(out?.eval_status, "pass"); + assert.equal(out?.adversary_status, "block_merge"); + assert.equal(out?.recommended_next, "/harness-steer --burst"); + if (prev === undefined) delete process.env.HARNESS_STEER_BURST; + else process.env.HARNESS_STEER_BURST = prev; + }); + + test("eval pass + adversary proceed → pass", () => { + const out = synthesizeReviewOutcome({ + runId: "run-1", + eval: { status: "pass" }, + adversary: { block_merge: false }, + }); + assert.equal(out?.remediation_class, "pass"); + assert.equal(out?.recommended_next, "/harness-policy-status"); + }); + + test("hygiene gap_kind from ls_lint failure", () => { + const out = synthesizeReviewOutcome({ + runId: "run-1", + eval: { + status: "fail", + recommended_action: "steer", + failed_checks: ["ls_lint_format"], + }, + adversary: null, + benchmark: { ls_lint: "fail" }, + }); + assert.equal(out?.remediation_class, "implementation_gap"); + assert.equal(out?.gap_kind, "hygiene"); + assert.equal(out?.recommended_next, "/harness-steer"); + }); +}); + +describe("steerBurstAllowed", 
() => { + test("default env off", () => { + const prev = process.env.HARNESS_STEER_BURST; + delete process.env.HARNESS_STEER_BURST; + assert.equal(steerBurstFromEnv(), false); + assert.equal( + steerBurstAllowed({ status: "pass" }, { block_merge: true }), + false, + ); + if (prev !== undefined) process.env.HARNESS_STEER_BURST = prev; + }); + + test("blocked when inline repair already attempted", () => { + const prev = process.env.HARNESS_STEER_BURST; + process.env.HARNESS_STEER_BURST = "1"; + assert.equal( + steerBurstAllowed( + { status: "pass" }, + { block_merge: true }, + true, + ), + false, + ); + if (prev === undefined) delete process.env.HARNESS_STEER_BURST; + else process.env.HARNESS_STEER_BURST = prev; + }); +}); + +describe("effectiveSteerMaxAttempts", () => { + test("adds one when burst allowed", () => { + assert.equal(effectiveSteerMaxAttempts(3, true), 4); + assert.equal(effectiveSteerMaxAttempts(3, false), 3); + }); +}); + +describe("parseReproCommandsFromAdversary", () => { + test("extracts shell commands and skips prose", () => { + const { commands, skipped } = parseReproCommandsFromAdversary({ + repro_steps: [ + "Open the widget and click resume", + "npx vitest run test/widget.test.mjs", + ], + repro_commands: [{ cmd: "node scripts/repro.mjs" }], + }); + assert.equal(commands.includes("node scripts/repro.mjs"), true); + assert.equal( + commands.includes("npx vitest run test/widget.test.mjs"), + true, + ); + assert.equal(skipped.length, 1); + }); +}); + +describe("remediationClassFromEvalVerdict", () => { + test("maps replan to plan_gap", () => { + assert.equal( + remediationClassFromEvalVerdict({ + status: "fail", + recommended_action: "replan", + }), + "plan_gap", + ); + }); +}); + +describe("recommendedNextForRemediation", () => { + test("burst path", () => { + assert.equal( + recommendedNextForRemediation("implementation_gap", { burst: true }), + "/harness-steer --burst", + ); + }); +}); + +describe("classifyImplementationGap", () => { + 
test("mixed when hygiene and block_merge", () => { + assert.equal( + classifyImplementationGap( + { failed_checks: ["format_check"] }, + { block_merge: true }, + null, + ), + "mixed", + ); + }); +}); diff --git a/test/harness-repair-brief.test.mjs b/test/harness-repair-brief.test.mjs new file mode 100644 index 00000000..e44bfa25 --- /dev/null +++ b/test/harness-repair-brief.test.mjs @@ -0,0 +1,50 @@ +import assert from "node:assert/strict"; +import { mkdir } from "node:fs/promises"; +import { join } from "node:path"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { describe, test } from "node:test"; +import { synthesizeRepairBrief } from "../.pi/lib/harness-repair-brief.ts"; +import { writeYamlFile } from "../.pi/lib/harness-yaml.ts"; + +describe("synthesizeRepairBrief", () => { + test("includes repro_commands and gap_kind from adversary", async () => { + const root = await mkdtemp(join(tmpdir(), "up-repair-brief-")); + const runId = "run-brief-1"; + const runDir = join(root, ".pi", "harness", "runs", runId); + const artifacts = join(runDir, "artifacts"); + await mkdir(artifacts, { recursive: true }); + await writeYamlFile(join(artifacts, "eval-verdict.yaml"), { + schema_version: "1.0.0", + run_id: runId, + status: "pass", + }); + await writeYamlFile(join(artifacts, "adversary-report.yaml"), { + schema_version: "1.0.0", + run_id: runId, + block_merge: true, + repro_commands: [{ cmd: "npx vitest run test/widget.test.mjs" }], + repro_steps: ["Click resume in the widget"], + }); + await writeYamlFile(join(artifacts, "benchmark-log.yaml"), { + schema_version: "1.0.0", + harness_verify: "pass", + }); + + const brief = await synthesizeRepairBrief({ + runId, + projectRoot: root, + steerAttempt: 1, + }); + + assert.equal(brief.schema_version, "1.1.0"); + assert.equal(brief.remediation_class, "implementation_gap"); + assert.equal(brief.must_pass_before_handoff, true); + assert.deepEqual(brief.repro_commands, [ + "npx vitest run 
test/widget.test.mjs", + ]); + assert.equal(Array.isArray(brief.repro_skipped), true); + assert.equal(brief.repro_skipped.length, 1); + await rm(root, { recursive: true, force: true }); + }); +}); diff --git a/test/harness-review-preflight.test.mjs b/test/harness-review-preflight.test.mjs new file mode 100644 index 00000000..2140a1b1 --- /dev/null +++ b/test/harness-review-preflight.test.mjs @@ -0,0 +1,36 @@ +import assert from "node:assert/strict"; +import { mkdir, writeFile } from "node:fs/promises"; +import { join } from "node:path"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { describe, test } from "node:test"; +import { runHarnessReviewPreflight } from "../.pi/scripts/harness-review-preflight.mjs"; + +describe("harness-review-preflight", () => { + test("fails when benchmark-log missing", async () => { + const root = await mkdtemp(join(tmpdir(), "up-preflight-")); + const runDir = join(root, "run"); + await mkdir(runDir, { recursive: true }); + const result = await runHarnessReviewPreflight({ runDir }); + assert.equal(result.ok, false); + assert.match(result.reason ?? 
"", /benchmark-log/i); + await rm(root, { recursive: true, force: true }); + }); + + test("passes with fresh benchmark-log", async () => { + const root = await mkdtemp(join(tmpdir(), "up-preflight-ok-")); + const runDir = join(root, "run"); + const artifacts = join(runDir, "artifacts"); + await mkdir(artifacts, { recursive: true }); + await writeFile( + join(artifacts, "benchmark-log.yaml"), + "schema_version: '1.0.0'\nharness_verify: pass\nsteer_attempt: 0\n", + ); + const result = await runHarnessReviewPreflight({ + runDir, + steerAttempt: 0, + }); + assert.equal(result.ok, true); + await rm(root, { recursive: true, force: true }); + }); +}); diff --git a/test/harness-run-context-postrun.test.mjs b/test/harness-run-context-postrun.test.mjs index f6c455b3..bc1ee25a 100644 --- a/test/harness-run-context-postrun.test.mjs +++ b/test/harness-run-context-postrun.test.mjs @@ -6,12 +6,17 @@ import { tmpdir } from "node:os"; import { describe, test } from "node:test"; import { buildHarnessClearManifest, + evaluateCrossSessionResume, getLatestRunContext, + hasConfirmedHarnessClear, claimRunOwnership, nextStepAfterOutcome, parseHarnessUseRunArgs, + getPolicyTransitionBlock, policyBootstrapFromRunContext, + policyStateFromDiskIfNeeded, resolveCompletionStatuses, + resolveCrossSessionResumeInfo, blockingHarnessAutoCommandReason, blockingReviewCommandReason, blockingRunCommandReason, @@ -22,11 +27,16 @@ import { shouldReuseHarnessRunIdForAuto, isHarnessAutoSession, ensureReviewOutcomeFromEval, + readSteerStateFromRun, remediationClassFromEvalVerdict, reconcileReviewRouting, + resolveSteerEntryEffects, + updateSteerStateOnEntry, reconcileStaleExecuteCompletion, refreshRunContextProgress, releaseForeignQaRunOwnership, + saveProjectActiveRun, + saveRunContextToDisk, resolveArgsForCommand, resolveHarnessRunPostAgentState, syncPlanLastOutcomeFromTaskClarification, @@ -115,6 +125,484 @@ describe("buildHarnessClearManifest", () => { null, ); }); + test("getLatestRunContext keeps a 
new run started after cleared_all with no active_run_ids", () => { + const newRun = { + schema_version: "1.0.0", + run_id: "run-new", + pi_session_id: "sess", + project_root: "/tmp/project", + phase: "plan", + plan_id: "plan-new", + plan_packet_path: null, + plan_ready: true, + task_summary: "new", + status: "active", + last_completed_step: "plan", + last_outcome: "ready", + next_recommended_command: "/harness-run", + owner_pi_session_id: "sess", + updated_at: "2026-01-01T00:00:00.000Z", + }; + const latest = getLatestRunContext([ + { + type: "custom", + customType: "harness-clear-result", + data: { + approved: true, + active_cleared: true, + cleared_all: true, + active_run_ids: [], + }, + }, + { type: "custom", customType: "harness-run-context", data: newRun }, + ]); + assert.equal(latest?.run_id, "run-new"); + assert.equal(latest?.next_recommended_command, "/harness-run"); + }); + test("getLatestRunContext does not revive a cleared active run from a later stale entry", () => { + const runContext = { + schema_version: "1.0.0", + run_id: "run-stale", + pi_session_id: "sess", + project_root: "/tmp/project", + phase: "execute", + plan_id: "plan-stale", + plan_packet_path: null, + plan_ready: true, + task_summary: "clear", + status: "active", + last_completed_step: "execute", + last_outcome: "completed", + next_recommended_command: "/harness-review", + owner_pi_session_id: "sess", + updated_at: "2026-01-01T00:00:00.000Z", + }; + assert.equal( + getLatestRunContext([ + { + type: "custom", + customType: "harness-clear-result", + data: { + approved: true, + active_cleared: true, + cleared_all: true, + active_run_ids: ["run-stale"], + }, + }, + { type: "custom", customType: "harness-run-context", data: runContext }, + ]), + null, + ); + }); + test("getLatestRunContext does not clear active run on cancelled harness-clear", () => { + const runContext = { + schema_version: "1.0.0", + run_id: "run-not-cleared", + pi_session_id: "sess", + project_root: "/tmp/project", + phase: 
"plan", + plan_id: null, + plan_packet_path: null, + plan_ready: false, + task_summary: "clear", + status: "active", + last_completed_step: null, + last_outcome: null, + next_recommended_command: null, + owner_pi_session_id: "sess", + updated_at: "2026-01-01T00:00:00.000Z", + }; + const latest = getLatestRunContext([ + { type: "custom", customType: "harness-run-context", data: runContext }, + { + type: "custom", + customType: "harness-clear-result", + data: { approved: false, cleared_all: false, active_cleared: false }, + }, + ]); + assert.equal(latest?.run_id, "run-not-cleared"); + assert.equal( + hasConfirmedHarnessClear([ + { + type: "custom", + customType: "harness-clear-result", + data: { approved: false, cleared_all: false, active_cleared: false }, + }, + ]), + false, + ); + }); + + +}); +describe("cross-session resume eligibility", () => { + function runContext(root, overrides = {}) { + return { + schema_version: "1.0.0", + run_id: "run-resume", + pi_session_id: "sess-old", + project_root: root, + phase: "plan", + plan_id: "plan-resume", + plan_packet_path: join(root, ".pi", "harness", "runs", "run-resume", "plan-packet.yaml"), + plan_ready: true, + task_summary: "resume task", + status: "active", + last_completed_step: "plan", + last_outcome: "ready", + next_recommended_command: null, + owner_pi_session_id: "sess-old", + updated_at: new Date().toISOString(), + ...overrides, + }; + } + + test("evaluateCrossSessionResume ignores tombstoned disk pointer after confirmed clear", async () => { + const root = await mkdtemp(join(tmpdir(), "up-resume-tombstone-")); + try { + const ctx = runContext(root, { run_id: "run-stale", plan_id: "plan-stale" }); + await saveRunContextToDisk(ctx); + await saveProjectActiveRun(ctx); + const entries = [ + { + type: "custom", + customType: "harness-clear-result", + data: { + approved: true, + active_cleared: true, + cleared_all: true, + active_run_ids: ["run-stale"], + }, + }, + { type: "custom", customType: "harness-run-context", 
data: ctx }, + ]; + assert.equal(getLatestRunContext(entries), null); + assert.equal(await evaluateCrossSessionResume(root, entries), null); + } finally { + await rm(root, { recursive: true, force: true }); + } + }); + + test("evaluateCrossSessionResume ignores active pointer tombstoned by confirmed clear", async () => { + const root = await mkdtemp(join(tmpdir(), "up-resume-clear-pointer-")); + try { + const ctx = runContext(root, { run_id: "run-cleared" }); + await saveRunContextToDisk(ctx); + await saveProjectActiveRun(ctx); + const entries = [ + { + type: "custom", + customType: "harness-clear-result", + data: { + approved: true, + active_cleared: true, + active_run_ids: ["run-cleared"], + }, + }, + ]; + assert.equal(await evaluateCrossSessionResume(root, entries), null); + } finally { + await rm(root, { recursive: true, force: true }); + } + }); + + test("valid prior-session active run remains resumable with claim command", async () => { + const root = await mkdtemp(join(tmpdir(), "up-resume-valid-")); + try { + const ctx = runContext(root); + await saveRunContextToDisk(ctx); + await saveProjectActiveRun(ctx); + const info = await evaluateCrossSessionResume(root, []); + assert.equal(info?.resumeCommand, "/harness-use-run run-resume --claim"); + assert.equal(info?.nextAfterResume, "/harness-run"); + } finally { + await rm(root, { recursive: true, force: true }); + } + }); + + test("confirmed clear suppresses otherwise valid active-run resume", async () => { + const root = await mkdtemp(join(tmpdir(), "up-resume-cleared-")); + try { + const ctx = runContext(root); + await saveRunContextToDisk(ctx); + await saveProjectActiveRun(ctx); + const info = await evaluateCrossSessionResume(root, [ + { + type: "custom", + customType: "harness-clear-result", + data: { approved: true, active_cleared: true, cleared_all: true }, + }, + ]); + assert.equal(info, null); + } finally { + await rm(root, { recursive: true, force: true }); + } + }); + + test("invalid active-run 
pointers fail closed without resume command", async () => { + const root = await mkdtemp(join(tmpdir(), "up-resume-invalid-")); + try { + for (const status of ["completed", "aborted"]) { + const ctx = runContext(root, { status }); + await saveRunContextToDisk(ctx); + await saveProjectActiveRun(ctx); + assert.equal(await evaluateCrossSessionResume(root, []), null); + } + + const missingCtx = runContext(root, { run_id: "run-missing" }); + await saveProjectActiveRun(missingCtx); + assert.equal(await evaluateCrossSessionResume(root, []), null); + + const staleCtx = runContext(root, { + run_id: "run-stale", + updated_at: "2000-01-01T00:00:00.000Z", + }); + await saveRunContextToDisk(staleCtx); + await saveProjectActiveRun(staleCtx); + assert.equal(await evaluateCrossSessionResume(root, []), null); + + + const foreignCtx = runContext(root, { + project_root: join(root, "other-project"), + status: "active", + }); + await mkdir(join(root, ".pi", "harness", "runs", foreignCtx.run_id), { + recursive: true, + }); + await writeYamlFile( + join(root, ".pi", "harness", "runs", foreignCtx.run_id, "run-context.yaml"), + foreignCtx, + ); + assert.equal( + await resolveCrossSessionResumeInfo(root, { + schema_version: "1.0.0", + run_id: "run-resume", + project_root: root, + owner_pi_session_id: "sess-old", + phase: "plan", + plan_id: "plan-resume", + plan_ready: true, + updated_at: "2026-01-01T00:00:00.000Z", + }), + null, + ); + } finally { + await rm(root, { recursive: true, force: true }); + } + }); +}); + +let extensionHarness; + +async function getExtensionHarness() { + if (extensionHarness) return extensionHarness; + const commands = new Map(); + const lifecycle = new Map(); + const eventHandlers = new Map(); + const sentMessages = []; + const appendedEntries = []; + const addHandler = (map, name, handler) => { + const handlers = map.get(name) ?? 
[]; + handlers.push(handler); + map.set(name, handlers); + }; + const pi = { + registerCommand(name, definition) { + commands.set(name, definition.handler); + }, + registerTool() {}, + appendEntry(customType, data) { + appendedEntries.push({ type: "custom", customType, data }); + }, + sendMessage(message) { + sentMessages.push(message); + }, + events: { + on(name, handler) { + addHandler(eventHandlers, name, handler); + }, + emit(name, payload) { + for (const handler of eventHandlers.get(name) ?? []) handler(payload); + }, + }, + on(name, handler) { + addHandler(lifecycle, name, handler); + }, + }; + const mod = await import("../.pi/extensions/harness-run-context.ts"); + const install = mod.default?.default ?? mod.default; + install(pi); + extensionHarness = { + commands, + lifecycle, + sentMessages, + appendedEntries, + emit: pi.events.emit, + reset() { + sentMessages.length = 0; + appendedEntries.length = 0; + pi.events.emit("harness-runs-cleared", {}); + }, + }; + return extensionHarness; +} + +function extensionCtx({ entries = [], notifications = [], sessionId = "new-session" } = {}) { + return { + hasUI: true, + ui: { + notify(message, type) { + notifications.push({ message, type }); + }, + }, + sessionManager: { + getEntries: () => entries, + getSessionId: () => sessionId, + }, + }; +} + +async function writeActiveRunPointer(root, overrides = {}) { + await mkdir(join(root, ".pi", "harness"), { recursive: true }); + await writeFile( + join(root, ".pi", "harness", "active-run.json"), + `${JSON.stringify( + { + schema_version: "1.0.0", + run_id: "missing-run", + project_root: root, + owner_pi_session_id: "old-session", + phase: "execute", + plan_id: "plan-missing", + plan_ready: true, + updated_at: new Date().toISOString(), + ...overrides, + }, + null, + 2, + )}\n`, + "utf-8", + ); +} + +describe("harness-run-context extension invalid active pointer handling", () => { + test("/harness-plan is not blocked by a missing active-run pointer", async () => { + const 
harness = await getExtensionHarness(); + harness.reset(); + const root = await mkdtemp(join(tmpdir(), "up-ext-missing-plan-")); + const prev = process.cwd(); + try { + process.chdir(root); + await writeActiveRunPointer(root); + const beforeAgentStart = harness.lifecycle.get("before_agent_start")?.[0]; + assert.ok(beforeAgentStart); + const result = await beforeAgentStart( + { prompt: '/harness-plan "new task"', systemPrompt: "base" }, + extensionCtx(), + ); + assert.notEqual(result?.message?.customType, "harness-run-context-block"); + assert.ok(!result?.message?.content?.includes("Active harness run in progress")); + } finally { + process.chdir(prev); + await rm(root, { recursive: true, force: true }); + } + }); + + test("/harness-run-status treats confirmed clear plus leftover pointer as no active run", async () => { + const harness = await getExtensionHarness(); + harness.reset(); + const root = await mkdtemp(join(tmpdir(), "up-ext-clear-status-")); + const prev = process.cwd(); + const notifications = []; + try { + process.chdir(root); + await writeActiveRunPointer(root, { + run_id: "cleared-run", + plan_id: "plan-cleared", + }); + const entries = [ + { + type: "custom", + customType: "harness-clear-result", + data: { approved: true, active_cleared: true, cleared_all: true }, + }, + ]; + await harness.commands.get("harness-run-status")( + "", + extensionCtx({ entries, notifications }), + ); + const text = notifications.map((item) => item.message).join("\n"); + assert.match(text, /No active harness run\. 
Start with \/harness-plan/); + assert.ok(!text.includes("phase: execute")); + assert.ok(!text.includes("plan-cleared")); + } finally { + process.chdir(prev); + await rm(root, { recursive: true, force: true }); + } + }); + + test("valid prior-session run still offers explicit claim recovery", async () => { + const harness = await getExtensionHarness(); + harness.reset(); + const root = await mkdtemp(join(tmpdir(), "up-ext-valid-resume-")); + const prev = process.cwd(); + try { + process.chdir(root); + const ctx = { + schema_version: "1.0.0", + run_id: "run-resume", + pi_session_id: "old-session", + project_root: root, + phase: "plan", + plan_id: "plan-resume", + plan_packet_path: join(root, ".pi", "harness", "runs", "run-resume", "plan-packet.yaml"), + plan_ready: true, + task_summary: "resume task", + status: "active", + last_completed_step: "plan", + last_outcome: "ready", + next_recommended_command: null, + owner_pi_session_id: "old-session", + updated_at: new Date().toISOString(), + }; + await saveRunContextToDisk(ctx); + await saveProjectActiveRun(ctx); + const sessionStart = harness.lifecycle.get("session_start")?.[0]; + assert.ok(sessionStart); + await sessionStart({}, extensionCtx({ sessionId: "new-session" })); + const content = harness.sentMessages.map((message) => message.content).join("\n"); + assert.match(content, /\/harness-use-run run-resume --claim/); + } finally { + process.chdir(prev); + await rm(root, { recursive: true, force: true }); + } + }); +}); + + + +describe("getPolicyTransitionBlock cross-session bootstrap", () => { + test("allows harness-review when disk run context is hydrated without session policy", () => { + const activeCtx = { + schema_version: "1.0.0", + run_id: "run-review", + pi_session_id: "sess", + project_root: "/tmp/project", + phase: "evaluate", + plan_id: "plan-review", + plan_packet_path: "/tmp/project/.pi/harness/runs/run-review/plan-packet.yaml", + plan_ready: true, + task_summary: "widget", + status: "active", + 
last_completed_step: "steer", + last_outcome: "completed", + next_recommended_command: "/harness-review", + owner_pi_session_id: "sess", + updated_at: "2026-01-01T00:00:00.000Z", + }; + const block = getPolicyTransitionBlock("/harness-review --quick", [], activeCtx); + assert.equal(block.blocked, false); + }); }); describe("claimRunOwnership", () => { @@ -170,6 +658,139 @@ describe("nextStepAfterOutcome post-run", () => { ); }); + test("hygiene steer entry does not increment steer_attempt", async () => { + const root = await mkdtemp(join(tmpdir(), "up-hygiene-steer-")); + const runId = "run-hygiene"; + const artifacts = join(root, ".pi", "harness", "runs", runId, "artifacts"); + await mkdir(artifacts, { recursive: true }); + await writeYamlFile(join(artifacts, "repair-brief.yaml"), { + schema_version: "1.1.0", + run_id: runId, + gap_kind: "hygiene", + remediation_class: "implementation_gap", + }); + await writeYamlFile(join(artifacts, "steer-state.yaml"), { + schema_version: "1.0.0", + run_id: runId, + attempt: 1, + hygiene_repairs: 0, + }); + const effects = await resolveSteerEntryEffects(runId, root, ""); + assert.equal(effects.skipExecutor, true); + assert.equal(effects.incrementSteerAttempt, false); + assert.equal(effects.incrementHygieneRepairs, true); + const ctx = { + schema_version: "1.0.0", + run_id: runId, + pi_session_id: "sess", + project_root: root, + phase: "execute", + plan_id: "p", + plan_packet_path: null, + plan_ready: true, + task_summary: "t", + status: "active", + last_completed_step: "review", + last_outcome: "fail", + next_recommended_command: "/harness-steer", + owner_pi_session_id: "sess", + steer_attempt: 1, + updated_at: "2026-01-01T00:00:00.000Z", + }; + await updateSteerStateOnEntry(runId, root, effects, ctx); + const state = await readSteerStateFromRun(runId, root); + assert.equal(state?.attempt, 1); + assert.equal(state?.hygiene_repairs, 1); + await rm(root, { recursive: true, force: true }); + }); + + test("burst cap allows one extra 
steer when burst env on", async () => { + const prev = process.env.HARNESS_STEER_BURST; + process.env.HARNESS_STEER_BURST = "1"; + const root = await mkdtemp(join(tmpdir(), "up-burst-cap-")); + const runId = "run-burst"; + const runDir = join(root, ".pi", "harness", "runs", runId); + const artifacts = join(runDir, "artifacts"); + await mkdir(join(runDir, "handoff"), { recursive: true }); + await mkdir(artifacts, { recursive: true }); + await writeYamlFile(join(runDir, "handoff", "executor-summary.yaml"), { + execution_status: "completed", + }); + await writeYamlFile(join(artifacts, "eval-verdict.yaml"), { + schema_version: "1.0.0", + status: "pass", + }); + await writeYamlFile(join(artifacts, "adversary-report.yaml"), { + schema_version: "1.0.0", + block_merge: true, + }); + await writeYamlFile(join(artifacts, "repair-brief.yaml"), { + schema_version: "1.1.0", + remediation_class: "implementation_gap", + }); + await writeYamlFile(join(artifacts, "review-outcome.yaml"), { + schema_version: "1.0.0", + remediation_class: "implementation_gap", + }); + await writeYamlFile(join(artifacts, "steer-state.yaml"), { + schema_version: "1.0.0", + attempt: 3, + max_attempts: 3, + }); + const ctx = { + schema_version: "1.0.0", + run_id: runId, + pi_session_id: "sess", + project_root: root, + phase: "evaluate", + plan_id: "p", + plan_packet_path: null, + plan_ready: true, + task_summary: "t", + status: "active", + last_completed_step: "review", + last_outcome: "pass", + next_recommended_command: "/harness-steer --burst", + owner_pi_session_id: "sess", + steer_attempt: 3, + steer_max_attempts: 3, + updated_at: "2026-01-01T00:00:00.000Z", + }; + const atCap = await blockingSteerCommandReason("harness-steer", ctx, root); + assert.equal(atCap, null); + await writeYamlFile(join(artifacts, "steer-state.yaml"), { + schema_version: "1.0.0", + attempt: 4, + max_attempts: 3, + }); + const ctxExhausted = { ...ctx, steer_attempt: 4 }; + const blocked = await blockingSteerCommandReason( + 
"harness-steer", + ctxExhausted, + root, + ); + assert.match(blocked ?? "", /cap reached/i); + if (prev === undefined) delete process.env.HARNESS_STEER_BURST; + else process.env.HARNESS_STEER_BURST = prev; + await rm(root, { recursive: true, force: true }); + }); + + test("split verdict burst routes to harness-steer --burst", () => { + assert.equal( + nextStepAfterOutcome({ + phase: "evaluate", + evalStatus: "pass", + lastCompletedStep: "review", + remediationClass: "implementation_gap", + steerAttempt: 0, + steerMaxAttempts: 3, + reviewComplete: true, + burstAllowed: true, + }), + "/harness-steer --burst", + ); + }); + test("adversary complete suggests policy status", () => { assert.equal( nextStepAfterOutcome({ @@ -636,6 +1257,33 @@ describe("review routing from eval-verdict", () => { assert.match(steerBlock ?? "", /implementation_gap/i); await rm(root, { recursive: true, force: true }); }); + + test("synthesizes split verdict eval pass + adversary block_merge", async () => { + const root = await mkdtemp(join(tmpdir(), "up-split-verdict-")); + const runId = "run-split"; + const runDir = join(root, ".pi", "harness", "runs", runId, "artifacts"); + await mkdir(runDir, { recursive: true }); + await writeYamlFile(join(runDir, "eval-verdict.yaml"), { + schema_version: "1.0.0", + run_id: runId, + status: "pass", + }); + await writeYamlFile(join(runDir, "adversary-report.yaml"), { + schema_version: "1.0.0", + run_id: runId, + block_merge: true, + severity: "high", + }); + await ensureReviewOutcomeFromEval(runId, root); + const { readReviewOutcomeFromRun } = await import( + "../.pi/lib/harness-run-context.ts" + ); + const outcome = await readReviewOutcomeFromRun(runId, root); + assert.equal(outcome?.remediation_class, "implementation_gap"); + assert.equal(outcome?.eval_status, "pass"); + assert.equal(outcome?.adversary_status, "block_merge"); + await rm(root, { recursive: true, force: true }); + }); }); describe("harness command gates", () => { diff --git 
a/test/harness-subagent-precheck-disk.test.mjs b/test/harness-subagent-precheck-disk.test.mjs new file mode 100644 index 00000000..e63d2bd0 --- /dev/null +++ b/test/harness-subagent-precheck-disk.test.mjs @@ -0,0 +1,78 @@ +import assert from "node:assert/strict"; +import { mkdir } from "node:fs/promises"; +import { join } from "node:path"; +import { mkdtemp, rm } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { describe, test } from "node:test"; +import { + liteReviewMaySkipAdversary, + priorBlockMergeFromDisk, +} from "../.pi/lib/harness-lite-review-precheck.ts"; +import { writeYamlFile } from "../.pi/lib/harness-yaml.ts"; + +describe("precheck disk block_merge", () => { + test("priorBlockMergeFromDisk reads adversary block_merge not last_outcome alone", async () => { + const root = await mkdtemp(join(tmpdir(), "up-precheck-disk-")); + const runId = "run-disk-block"; + const artifacts = join(root, ".pi", "harness", "runs", runId, "artifacts"); + await mkdir(artifacts, { recursive: true }); + await writeYamlFile(join(artifacts, "adversary-report.yaml"), { + schema_version: "1.0.0", + block_merge: true, + }); + + const blocked = await priorBlockMergeFromDisk({ + projectRoot: root, + runId, + lastOutcome: "pass", + }); + assert.equal(blocked, true); + await rm(root, { recursive: true, force: true }); + }); + + test("liteReviewMaySkipAdversary false when block_merge on disk", async () => { + const root = await mkdtemp(join(tmpdir(), "up-precheck-lite-block-")); + const runId = "run-lite-block"; + const artifacts = join(root, ".pi", "harness", "runs", runId, "artifacts"); + await mkdir(artifacts, { recursive: true }); + await writeYamlFile(join(artifacts, "adversary-report.yaml"), { + schema_version: "1.0.0", + block_merge: true, + }); + await writeYamlFile(join(artifacts, "benchmark-log.yaml"), { + schema_version: "1.0.0", + adversary_repro: "pass", + }); + + const maySkip = await liteReviewMaySkipAdversary({ + projectRoot: root, + runId, + 
lastOutcome: "pass", + }); + assert.equal(maySkip, false); + await rm(root, { recursive: true, force: true }); + }); + + test("liteReviewMaySkipAdversary true when repro pass and no block_merge", async () => { + const root = await mkdtemp(join(tmpdir(), "up-precheck-lite-ok-")); + const runId = "run-lite-ok"; + const artifacts = join(root, ".pi", "harness", "runs", runId, "artifacts"); + await mkdir(artifacts, { recursive: true }); + await writeYamlFile(join(artifacts, "adversary-report.yaml"), { + schema_version: "1.0.0", + block_merge: false, + }); + await writeYamlFile(join(artifacts, "benchmark-log.yaml"), { + schema_version: "1.0.0", + adversary_repro: "pass", + }); + + const maySkip = await liteReviewMaySkipAdversary({ + projectRoot: root, + runId, + lastOutcome: "pass", + }); + assert.equal(maySkip, true); + await rm(root, { recursive: true, force: true }); + }); +}); diff --git a/test/plan-headless-git-qa.test.mjs b/test/plan-headless-git-qa.test.mjs new file mode 100644 index 00000000..4e74a106 --- /dev/null +++ b/test/plan-headless-git-qa.test.mjs @@ -0,0 +1,138 @@ +import { test } from "node:test"; +import assert from "node:assert/strict"; +import { mkdtemp, mkdir, writeFile } from "node:fs/promises"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { spawnSync } from "node:child_process"; +import { access } from "node:fs/promises"; +import { constants } from "node:fs"; +import { + isHarnessGitQaCommitComplete, + maybeHeadlessGitQaFinalizeOnRun, +} from "../.pi/lib/plan-headless-ux.ts"; + +const REPO_ROOT = join(import.meta.dirname, ".."); +const SMOKE_REL = ".pi/harness/evals/smoke/E2E-LAST-RUN.txt"; + +async function initGitRepo(root) { + spawnSync("git", ["init"], { cwd: root }); + spawnSync("git", ["config", "user.email", "qa@test.local"], { cwd: root }); + spawnSync("git", ["config", "user.name", "QA"], { cwd: root }); + spawnSync("git", ["checkout", "-b", "main"], { cwd: root }); + await writeFile(join(root, 
"README.md"), "init\n", "utf8"); + spawnSync("git", ["add", "README.md"], { cwd: root }); + spawnSync("git", ["commit", "-m", "init"], { cwd: root }); + await mkdir(join(root, ".pi"), { recursive: true }); + await writeFile( + join(root, ".pi", "auto-commit.json"), + JSON.stringify({ + coAuthor: { login: "bot", email: "bot@test.local" }, + message: { + template: "{type}({scope}): {subject}", + templateNoScope: "{type}: {subject}", + coAuthorTrailer: "Co-authored-by: {login} <{email}>", + }, + branch: { strategy: "none" }, + }), + "utf8", + ); +} + +test("maybeHeadlessGitQaFinalizeOnRun commits only smoke file with unrelated staged files", async () => { + const prevSmoke = process.env.HARNESS_QA_SMOKE; + const prevNi = process.env.HARNESS_NON_INTERACTIVE; + process.env.HARNESS_QA_SMOKE = "1"; + process.env.HARNESS_NON_INTERACTIVE = "1"; + const root = await mkdtemp(join(tmpdir(), "harness-git-qa-")); + try { + await initGitRepo(root); + await writeFile(join(root, "noise.txt"), "staged\n", "utf8"); + spawnSync("git", ["add", "noise.txt"], { cwd: root }); + + const runCtx = { + run_id: "qa-run-1", + project_root: root, + task_summary: + "Harness git workflow: append ISO line and harness-git-commit only smoke file", + plan_ready: true, + }; + const runDir = join(root, ".pi", "harness", "runs", runCtx.run_id); + await mkdir(join(runDir, "artifacts"), { recursive: true }); + await writeFile(join(runDir, "run-context.yaml"), `run_id: ${runCtx.run_id}\n`, "utf8"); + const done = await maybeHeadlessGitQaFinalizeOnRun({ + projectRoot: root, + runCtx, + command: "harness-run", + upPkg: REPO_ROOT, + }); + assert.equal(done, true); + assert.equal(await isHarnessGitQaCommitComplete(root), true); + const headFiles = spawnSync( + "git", + ["diff-tree", "--no-commit-id", "--name-only", "-r", "HEAD"], + { cwd: root, encoding: "utf8" }, + ); + assert.deepEqual( + headFiles.stdout.trim().split("\n").filter(Boolean), + [SMOKE_REL], + ); + const stagedNoise = spawnSync( + "git", + 
["diff", "--cached", "--name-only"], + { cwd: root, encoding: "utf8" }, + ); + assert.match(stagedNoise.stdout, /noise\.txt/); + } finally { + if (prevSmoke === undefined) delete process.env.HARNESS_QA_SMOKE; + else process.env.HARNESS_QA_SMOKE = prevSmoke; + if (prevNi === undefined) delete process.env.HARNESS_NON_INTERACTIVE; + else process.env.HARNESS_NON_INTERACTIVE = prevNi; + } +}); + +test("maybeHeadlessGitQaFinalizeOnRun writes git-workflow when commit already at HEAD", async () => { + const prevSmoke = process.env.HARNESS_QA_SMOKE; + const prevNi = process.env.HARNESS_NON_INTERACTIVE; + process.env.HARNESS_QA_SMOKE = "1"; + process.env.HARNESS_NON_INTERACTIVE = "1"; + const root = await mkdtemp(join(tmpdir(), "harness-git-qa-")); + try { + await initGitRepo(root); + const runCtx = { + run_id: "qa-run-2", + project_root: root, + task_summary: + "Harness git workflow: append ISO line and harness-git-commit only smoke file", + plan_ready: true, + }; + const runDir = join(root, ".pi", "harness", "runs", runCtx.run_id); + await mkdir(join(runDir, "artifacts"), { recursive: true }); + await writeFile(join(runDir, "run-context.yaml"), `run_id: ${runCtx.run_id}\n`, "utf8"); + assert.equal( + await maybeHeadlessGitQaFinalizeOnRun({ + projectRoot: root, + runCtx, + command: "harness-auto", + upPkg: REPO_ROOT, + }), + true, + ); + const artifact = join(runDir, "artifacts", "git-workflow.yaml"); + await access(artifact, constants.R_OK); + assert.equal( + await maybeHeadlessGitQaFinalizeOnRun({ + projectRoot: root, + runCtx, + command: "harness-auto", + upPkg: REPO_ROOT, + }), + true, + ); + assert.equal(await isHarnessGitQaCommitComplete(root), true); + } finally { + if (prevSmoke === undefined) delete process.env.HARNESS_QA_SMOKE; + else process.env.HARNESS_QA_SMOKE = prevSmoke; + if (prevNi === undefined) delete process.env.HARNESS_NON_INTERACTIVE; + else process.env.HARNESS_NON_INTERACTIVE = prevNi; + } +}); diff --git a/test/plan-headless-ux.test.mjs 
b/test/plan-headless-ux.test.mjs index b8ef7e0e..9091feb5 100644 --- a/test/plan-headless-ux.test.mjs +++ b/test/plan-headless-ux.test.mjs @@ -1,6 +1,7 @@ import { test } from "node:test"; import assert from "node:assert/strict"; import { mkdir, writeFile } from "node:fs/promises"; +import { writeYamlFile } from "../.pi/lib/harness-yaml.ts"; import { join } from "node:path"; import { tmpdir } from "node:os"; import { randomUUID } from "node:crypto"; @@ -225,3 +226,62 @@ test("maybeHeadlessQaAutoExecuteSmoke writes smoke ISO after auto plan", async ( } } }); + +test("shouldEndHeadlessHarnessPrintSession ends after steer hygiene", async () => { + const { shouldEndHeadlessHarnessPrintSession } = await import( + "../.pi/lib/plan-headless-ux.ts" + ); + const prev = process.env.HARNESS_QA_SMOKE; + const prevNi = process.env.HARNESS_NON_INTERACTIVE; + process.env.HARNESS_QA_SMOKE = "1"; + process.env.HARNESS_NON_INTERACTIVE = "1"; + const end = await shouldEndHeadlessHarnessPrintSession({ + command: "harness-steer", + runCtx: { + run_id: "r1", + last_completed_step: "steer", + last_outcome: "completed", + }, + projectRoot: process.cwd(), + }); + assert.equal(end, true); + if (prev === undefined) delete process.env.HARNESS_QA_SMOKE; + else process.env.HARNESS_QA_SMOKE = prev; + if (prevNi === undefined) delete process.env.HARNESS_NON_INTERACTIVE; + else process.env.HARNESS_NON_INTERACTIVE = prevNi; +}); + +test("shouldEndHeadlessHarnessPrintSession ends harness-run when executor handoff exists", async () => { + const { shouldEndHeadlessHarnessPrintSession } = await import( + "../.pi/lib/plan-headless-ux.ts" + ); + const saved = { HARNESS_NON_INTERACTIVE: process.env.HARNESS_NON_INTERACTIVE }; + process.env.HARNESS_NON_INTERACTIVE = "1"; + try { + const projectRoot = join(tmpdir(), `headless-run-end-${randomUUID()}`); + const runId = "run-exec-handoff"; + const runDir = join(projectRoot, ".pi", "harness", "runs", runId); + await mkdir(join(runDir, "handoff"), { recursive: 
true }); + await writeYamlFile(join(runDir, "handoff", "executor-summary.yaml"), { + schema_version: "1.0.0", + execution_status: "completed", + }); + const shouldEnd = await shouldEndHeadlessHarnessPrintSession({ + command: "harness-run", + projectRoot, + runCtx: { + run_id: runId, + plan_ready: true, + last_completed_step: "execute", + last_outcome: "completed", + }, + }); + assert.equal(shouldEnd, true); + } finally { + if (saved.HARNESS_NON_INTERACTIVE === undefined) { + delete process.env.HARNESS_NON_INTERACTIVE; + } else { + process.env.HARNESS_NON_INTERACTIVE = saved.HARNESS_NON_INTERACTIVE; + } + } +}); diff --git a/vendor/pi-subagents/src/subagents.ts b/vendor/pi-subagents/src/subagents.ts index 764f81ff..8b3f1d10 100644 --- a/vendor/pi-subagents/src/subagents.ts +++ b/vendor/pi-subagents/src/subagents.ts @@ -791,6 +791,37 @@ function truncateSubagentDetails( }; } +const HARNESS_HANDOFF_CONTENT_CAP = 1400; + +function applyTruncateDetailsPolicy; details?: SubagentDetails }>( + toolResult: T, + options: HarnessSubagentsOptions, +): T { + if (!options.truncateDetails || !toolResult.details) return toolResult; + const details = truncateSubagentDetails(toolResult.details); + const harnessResults = details.results.filter((r) => r.agent.startsWith("harness/")); + if (harnessResults.length === 1) { + const r = harnessResults[0]!; + const output = getResultFinalOutput(r); + const status = r.timedOut + ? "timed out" + : r.exitCode === 0 + ? 
"completed" + : "failed"; + let body = output.trim(); + if (body.length > HARNESS_HANDOFF_CONTENT_CAP) { + body = `${body.slice(0, HARNESS_HANDOFF_CONTENT_CAP)}\n…(truncated — read handoff artifacts under HARNESS_RUN_DIR)`; + } + const text = [ + `[subagent ${r.agent}] ${status}.`, + body || "(no final output)", + "Submit tools wrote canonical artifacts; do not re-parse subprocess transcript.", + ].join("\n"); + return { ...toolResult, content: [{ type: "text", text }], details }; + } + return { ...toolResult, details }; +} + type SubagentToolParams = { agent?: string; task?: string; @@ -1508,7 +1539,7 @@ export function createSubagentsExtension( ...(toolResult.details?.aggregator ? [toolResult.details.aggregator] : []), ]; spawnTimedOut = allResults.some((r) => r.timedOut === true); - return toolResult; + return applyTruncateDetailsPolicy(toolResult, options); } finally { options.onSpawnEnd?.(harnessAgents.length); const mode = params.chain?.length