diff --git a/.github/workflows/gitleaks.yml b/.github/workflows/gitleaks.yml new file mode 100644 index 00000000..15c70781 --- /dev/null +++ b/.github/workflows/gitleaks.yml @@ -0,0 +1,22 @@ +name: gitleaks + +on: + push: + branches: [main] + pull_request: + +permissions: + contents: read + +jobs: + scan: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + # gitleaks-action v1 scans for committed secrets and needs no license + # key (v2 requires GITLEAKS_LICENSE for organization repos). + - name: Scan for secrets + uses: zricethezav/gitleaks-action@v1.6.0 diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml new file mode 100644 index 00000000..e3ceaf1c --- /dev/null +++ b/.github/workflows/go.yml @@ -0,0 +1,44 @@ +name: Go + +on: + push: + branches: [main] + pull_request: + paths: + - "backend/**" + - ".github/workflows/go.yml" + +permissions: + contents: read + +jobs: + build-test: + runs-on: ubuntu-latest + defaults: + run: + working-directory: backend + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-go@v5 + with: + go-version: "1.22" + cache: false + + - name: Check formatting + run: | + unformatted=$(gofmt -l .) + if [ -n "$unformatted" ]; then + echo "These files need gofmt:" + echo "$unformatted" + exit 1 + fi + + - name: Build + run: go build ./... + + - name: Vet + run: go vet ./... + + - name: Test + run: go test -race ./... diff --git a/README.md b/README.md index 0f28a2e3..353d1200 100644 --- a/README.md +++ b/README.md @@ -2,3 +2,6 @@ Rewrite of the agent-orchestrator: a long-running Go backend daemon (`backend/`) paired with an Electron + TypeScript frontend (`frontend/`). + +See [`docs/`](docs/README.md) for architecture and status — start with the +Lifecycle Manager + Session Manager lane in [`docs/architecture.md`](docs/architecture.md). 
// Anti-flap tuning. detecting escalates to stuck only after this many
// consecutive unchanged-evidence ticks OR once this much wallclock has elapsed
// since first entering detecting.
const (
	// DetectingMaxAttempts is the attempt count at which CreateDetectingDecision
	// escalates: the counter only climbs while the evidence hash is unchanged.
	DetectingMaxAttempts = 3
	// DetectingMaxDuration is the wall-clock budget, measured from the detecting
	// episode's StartedAt, after which CreateDetectingDecision escalates
	// regardless of the attempt counter.
	DetectingMaxDuration = 5 * time.Minute
)
+func ResolveProbeDecision(in ProbeInput) LifecycleDecision { + if in.KillRequested { + return LifecycleDecision{ + Status: domain.StatusKilled, + Evidence: "manual kill requested", + SessionState: domain.SessionTerminated, + SessionReason: domain.ReasonManuallyKilled, + } + } + + if in.RuntimeFailed || in.ProcessFailed || in.Runtime == domain.RuntimeProbeFailed { + ev := fmt.Sprintf("probe_failed runtime=%s runtimeFailed=%t process=%s processFailed=%t", + in.Runtime, in.RuntimeFailed, in.Process, in.ProcessFailed) + return detecting(in, domain.ReasonProbeFailure, ev) + } + + switch in.Runtime { + case domain.RuntimeAlive: + if in.Process == ProcessDead { + // Runtime up but the agent process is gone: probes disagree. + ev := fmt.Sprintf("disagree runtime=alive process=%s recentActivity=%t", in.Process, in.RecentActivity) + return detecting(in, domain.ReasonAgentProcessExited, ev) + } + return LifecycleDecision{ + Status: domain.StatusWorking, + Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), + SessionState: domain.SessionWorking, + SessionReason: domain.ReasonTaskInProgress, + } + + case domain.RuntimeExited, domain.RuntimeMissing: + // Runtime is gone. Death is only concluded when the process is *also* + // confirmed dead AND nothing has been heard from the agent recently; + // any other shape is ambiguous and quarantines. + if in.Process == ProcessAlive || in.RecentActivity { + ev := fmt.Sprintf("disagree runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) + return detecting(in, domain.ReasonRuntimeLost, ev) + } + if in.Process == ProcessDead { + return LifecycleDecision{ + Status: domain.StatusKilled, + Evidence: fmt.Sprintf("dead runtime=%s process=dead recentActivity=false", in.Runtime), + SessionState: domain.SessionTerminated, + SessionReason: domain.ReasonRuntimeLost, + } + } + // Process indeterminate: cannot confirm death, so quarantine. 
+ ev := fmt.Sprintf("runtime_lost runtime=%s process=%s recentActivity=false", in.Runtime, in.Process) + return detecting(in, domain.ReasonRuntimeLost, ev) + + default: + // unknown (not yet probed): ambiguous, never conclude death. + ev := fmt.Sprintf("runtime_unknown runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) + return detecting(in, domain.ReasonRuntimeLost, ev) + } +} + +// ResolveOpenPRDecision walks the PR pipeline ladder. CI failure dominates +// everything, then requested changes, then the approval/merge states, then a +// pending review, then a stalled (idle-beyond-threshold) PR, else plain open. +func ResolveOpenPRDecision(in OpenPRInput) LifecycleDecision { + // evidence is a stable, timestamp-free summary " # " + // for logs/traceability; it folds in the PR identity inputs (Number/URL). + evidence := func(cond string) string { + s := cond + if in.Number > 0 { + s += fmt.Sprintf(" #%d", in.Number) + } + if in.URL != "" { + s += " " + in.URL + } + return s + } + base := func(status domain.SessionStatus, cond string, prReason domain.PRReason, ss domain.SessionState, sr domain.SessionReason) LifecycleDecision { + return LifecycleDecision{ + Status: status, + Evidence: evidence(cond), + SessionState: ss, + SessionReason: sr, + PRState: domain.PROpen, + PRReason: prReason, + } + } + + switch { + case in.CIFailing: + return base(domain.StatusCIFailed, "ci_failing", domain.PRReasonCIFailing, domain.SessionWorking, domain.ReasonFixingCI) + case in.ChangesRequested: + return base(domain.StatusChangesRequested, "changes_requested", domain.PRReasonChangesRequested, domain.SessionWorking, domain.ReasonResolvingReviewComments) + case in.Mergeable: + // Mergeability is the authoritative merge gate, so it already folds in + // "approved if review is required". 
Checking it before Approved means a + // PR on a no-required-review repo (mergeable, not formally approved) is + // still surfaced as ready-to-merge instead of falling through to PR_OPEN. + return base(domain.StatusMergeable, "merge_ready", domain.PRReasonMergeReady, domain.SessionIdle, domain.ReasonAwaitingExternalReview) + case in.Approved: + return base(domain.StatusApproved, "approved", domain.PRReasonApproved, domain.SessionIdle, domain.ReasonAwaitingExternalReview) + case in.ReviewPending: + return base(domain.StatusReviewPending, "review_pending", domain.PRReasonReviewPending, domain.SessionIdle, domain.ReasonAwaitingExternalReview) + case in.IdleBeyond: + // A PR open but quiet past the stuck threshold needs a human nudge. + return base(domain.StatusStuck, "idle_beyond", domain.PRReasonInProgress, domain.SessionStuck, domain.ReasonAwaitingUserInput) + default: + return base(domain.StatusPROpen, "pr_open", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated) + } +} + +// ResolveTerminalPRStateDecision handles merged/closed PRs. A merge parks the +// session idle awaiting a human's post-merge decision; a close drops to idle. +// none/open are not terminal — callers should route those to the open-PR or +// probe deciders — but the function stays total for safety. 
+func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision { + switch pr { + case domain.PRMerged: + return LifecycleDecision{ + Status: domain.StatusMerged, + Evidence: "pr merged", + SessionState: domain.SessionIdle, + SessionReason: domain.ReasonMergedWaitingDecision, + PRState: domain.PRMerged, + PRReason: domain.PRReasonMerged, + } + case domain.PRClosed: + return LifecycleDecision{ + Status: domain.StatusIdle, + Evidence: "pr closed unmerged", + SessionState: domain.SessionIdle, + SessionReason: domain.ReasonAwaitingUserInput, + PRState: domain.PRClosed, + PRReason: domain.PRReasonClosedUnmerged, + } + default: + return LifecycleDecision{ + Status: domain.StatusWorking, + Evidence: fmt.Sprintf("non-terminal pr state=%s", pr), + SessionState: domain.SessionWorking, + SessionReason: domain.ReasonTaskInProgress, + PRState: pr, + } + } +} + +// CreateDetectingDecision advances or escalates the anti-flap quarantine. +// +// The attempt counter climbs only while the (timestamp-stripped) evidence hash +// is unchanged and resets the moment the evidence moves; StartedAt is preserved +// across the whole detecting episode so the duration cap is a real wall-clock +// safety net even when the evidence keeps flapping. Escalation to stuck fires +// at DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration +// elapsed since first entering detecting. 
// HashEvidence returns the hex-encoded SHA-256 of a normalised evidence
// string. Normalisation deletes every match of timestampPatterns (in order)
// and then collapses each whitespace run to a single space, trimming the ends.
// Evidence that differs only in its timestamps or spacing therefore hashes
// equal, so clock movement alone never resets the detecting counter.
func HashEvidence(evidence string) string {
	normalized := evidence
	for i := range timestampPatterns {
		normalized = timestampPatterns[i].ReplaceAllString(normalized, "")
	}
	normalized = strings.Join(strings.Fields(normalized), " ")

	digest := sha256.Sum256([]byte(normalized))
	return hex.EncodeToString(digest[:])
}

// timestampPatterns lists the regexes HashEvidence applies, in order, to strip
// the time-varying parts of an evidence string before hashing.
//
// Order matters: the full datetime form goes first so that its embedded
// HH:MM:SS is removed as part of the whole stamp instead of being half-eaten
// by the bare time-of-day pattern that follows.
//
//  1. Full ISO-8601 / RFC3339 datetime: date, a "T" or space separator,
//     HH:MM:SS, optional fractional seconds, optional "Z" or a +/-HH:MM offset
//     (the colon in the offset is optional).
//     e.g. "2026-05-26T12:00:00Z", "2026-05-26 12:00:00.218+05:30"
//  2. Bare time-of-day with optional fraction, e.g. "12:00:00", "12:00:00.218".
//  3. Bare unix epoch: any standalone run of 10 to 13 digits (seconds or
//     millis), e.g. "1716724800". This also removes a numeric ID of the same
//     width, so keep such IDs out of evidence strings to preserve hash
//     fidelity.
var timestampPatterns = []*regexp.Regexp{
	regexp.MustCompile(`\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?`),
	regexp.MustCompile(`\d{2}:\d{2}:\d{2}(?:\.\d+)?`),
	regexp.MustCompile(`\b\d{10,13}\b`),
}
+func detecting(in ProbeInput, reason domain.SessionReason, evidence string) LifecycleDecision { + return CreateDetectingDecision(DetectingInput{ + Evidence: evidence, + ProposedState: domain.SessionDetecting, + ProposedReason: reason, + Prior: in.Prior, + Now: in.Now, + }) +} diff --git a/backend/internal/domain/decide/decide_test.go b/backend/internal/domain/decide/decide_test.go new file mode 100644 index 00000000..d6e027f1 --- /dev/null +++ b/backend/internal/domain/decide/decide_test.go @@ -0,0 +1,530 @@ +package decide + +import ( + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) + +func TestResolveProbeDecision(t *testing.T) { + tests := []struct { + name string + in ProbeInput + wantStatus domain.SessionStatus + wantState domain.SessionState + wantReason domain.SessionReason + wantDetect bool // expect non-nil Detecting memory + wantTermNil bool // expect terminal (Detecting must be nil) + }{ + { + name: "kill requested short-circuits to terminal killed", + in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonManuallyKilled, + wantTermNil: true, + }, + { + name: "kill requested wins even over a dead+dead probe", + in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonManuallyKilled, + wantTermNil: true, + }, + { + name: "runtime probe failed routes to detecting, never death", + in: ProbeInput{Runtime: domain.RuntimeMissing, RuntimeFailed: true, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantDetect: true, + }, + { + name: "process probe failed routes to detecting", 
+ in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, ProcessFailed: true, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantDetect: true, + }, + { + name: "runtime state probe_failed routes to detecting", + in: ProbeInput{Runtime: domain.RuntimeProbeFailed, Process: ProcessIndeterminate, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantDetect: true, + }, + { + name: "runtime alive + process alive is working", + in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, + wantStatus: domain.StatusWorking, + wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + { + name: "runtime alive + process indeterminate leans alive", + in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessIndeterminate, Now: t0}, + wantStatus: domain.StatusWorking, + wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + { + name: "runtime alive + process dead disagree -> detecting (agent_process_exited)", + in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonAgentProcessExited, + wantDetect: true, + }, + { + name: "runtime dead + process alive disagree -> detecting (runtime_lost)", + in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessAlive, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + wantDetect: true, + }, + { + name: "runtime dead + recent activity disagree -> detecting (runtime_lost)", + in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, RecentActivity: true, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + 
wantDetect: true, + }, + { + name: "runtime dead + process indeterminate cannot confirm -> detecting", + in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + wantDetect: true, + }, + { + name: "runtime exited + process dead + no activity -> killed terminal", + in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonRuntimeLost, + wantTermNil: true, + }, + { + name: "runtime missing + process dead + no activity -> killed terminal", + in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonRuntimeLost, + wantTermNil: true, + }, + { + name: "runtime unknown is ambiguous -> detecting (runtime_lost)", + in: ProbeInput{Runtime: domain.RuntimeUnknown, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + wantDetect: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ResolveProbeDecision(tt.in) + if got.Status != tt.wantStatus { + t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + } + if got.SessionState != tt.wantState { + t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + } + if got.SessionReason != tt.wantReason { + t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) + } + if tt.wantDetect && got.Detecting == nil { + t.Errorf("expected non-nil Detecting memory, got nil") + } + if tt.wantTermNil && got.Detecting != nil { + t.Errorf("terminal decision must carry nil Detecting, got %+v", got.Detecting) + } + }) + } +} + +func TestResolveOpenPRDecision(t *testing.T) { + tests := []struct { + 
name string + in OpenPRInput + wantStatus domain.SessionStatus + wantPR domain.PRReason + wantState domain.SessionState + }{ + { + name: "ci failing dominates everything", + in: OpenPRInput{CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true}, + wantStatus: domain.StatusCIFailed, + wantPR: domain.PRReasonCIFailing, + wantState: domain.SessionWorking, + }, + { + name: "changes requested before approval states", + in: OpenPRInput{ChangesRequested: true, Approved: true, Mergeable: true}, + wantStatus: domain.StatusChangesRequested, + wantPR: domain.PRReasonChangesRequested, + wantState: domain.SessionWorking, + }, + { + name: "approved + mergeable -> mergeable", + in: OpenPRInput{Approved: true, Mergeable: true}, + wantStatus: domain.StatusMergeable, + wantPR: domain.PRReasonMergeReady, + wantState: domain.SessionIdle, + }, + { + name: "mergeable without formal approval (no required review) -> mergeable", + in: OpenPRInput{Mergeable: true}, + wantStatus: domain.StatusMergeable, + wantPR: domain.PRReasonMergeReady, + wantState: domain.SessionIdle, + }, + { + name: "approved but not mergeable -> approved", + in: OpenPRInput{Approved: true}, + wantStatus: domain.StatusApproved, + wantPR: domain.PRReasonApproved, + wantState: domain.SessionIdle, + }, + { + name: "review pending", + in: OpenPRInput{ReviewPending: true}, + wantStatus: domain.StatusReviewPending, + wantPR: domain.PRReasonReviewPending, + wantState: domain.SessionIdle, + }, + { + name: "idle beyond threshold -> stuck", + in: OpenPRInput{IdleBeyond: true}, + wantStatus: domain.StatusStuck, + wantPR: domain.PRReasonInProgress, + wantState: domain.SessionStuck, + }, + { + name: "review pending wins over idle-beyond", + in: OpenPRInput{ReviewPending: true, IdleBeyond: true}, + wantStatus: domain.StatusReviewPending, + wantPR: domain.PRReasonReviewPending, + wantState: domain.SessionIdle, + }, + { + name: "nothing set -> plain open", + in: OpenPRInput{}, + wantStatus: domain.StatusPROpen, + 
wantPR: domain.PRReasonInProgress, + wantState: domain.SessionWorking, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ResolveOpenPRDecision(tt.in) + if got.Status != tt.wantStatus { + t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + } + if got.PRReason != tt.wantPR { + t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) + } + if got.PRState != domain.PROpen { + t.Errorf("PRState = %q, want %q", got.PRState, domain.PROpen) + } + if got.SessionState != tt.wantState { + t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + } + }) + } +} + +func TestResolveOpenPRDecisionEvidence(t *testing.T) { + tests := []struct { + name string + in OpenPRInput + want string + }{ + { + name: "condition with PR number and URL", + in: OpenPRInput{CIFailing: true, Number: 123, URL: "https://example.com/pr/123"}, + want: "ci_failing #123 https://example.com/pr/123", + }, + { + name: "condition with number only", + in: OpenPRInput{Approved: true, Mergeable: true, Number: 7}, + want: "merge_ready #7", + }, + { + name: "no identity falls back to the bare condition", + in: OpenPRInput{}, + want: "pr_open", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ResolveOpenPRDecision(tt.in).Evidence; got != tt.want { + t.Errorf("Evidence = %q, want %q", got, tt.want) + } + }) + } +} + +func TestDecidersDeriveConsistently(t *testing.T) { + // Every decision a decider produces must be self-consistent: the display + // Status it reports must equal what DeriveLegacyStatus produces from the + // canonical (session, pr) sub-states it emits. This locks the deciders and + // the display-derivation against drifting apart. + // + // The ResolveTerminalPRStateDecision none/open default is intentionally + // excluded — it is a documented no-op for misuse, not a real verdict. 
+ var decisions []LifecycleDecision + + for _, in := range []OpenPRInput{ + {CIFailing: true}, + {ChangesRequested: true}, + {Approved: true, Mergeable: true}, + {Mergeable: true}, + {Approved: true}, + {ReviewPending: true}, + {IdleBeyond: true}, + {}, + } { + decisions = append(decisions, ResolveOpenPRDecision(in)) + } + + decisions = append(decisions, + ResolveTerminalPRStateDecision(domain.PRMerged), + ResolveTerminalPRStateDecision(domain.PRClosed), + ) + + for _, in := range []ProbeInput{ + {KillRequested: true, Now: t0}, + {Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, + {Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, + {Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, + } { + decisions = append(decisions, ResolveProbeDecision(in)) + } + + for _, d := range decisions { + l := domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: d.SessionState, Reason: d.SessionReason}, + PR: domain.PRSubstate{State: d.PRState, Reason: d.PRReason}, + } + if got := domain.DeriveLegacyStatus(l); got != d.Status { + t.Errorf("decision %+v: Status=%q but DeriveLegacyStatus=%q", d, d.Status, got) + } + } +} + +func TestResolveTerminalPRStateDecision(t *testing.T) { + tests := []struct { + name string + pr domain.PRState + wantStatus domain.SessionStatus + wantState domain.SessionState + wantReason domain.SessionReason + wantPR domain.PRReason + }{ + { + name: "merged parks idle awaiting decision", + pr: domain.PRMerged, + wantStatus: domain.StatusMerged, + wantState: domain.SessionIdle, + wantReason: domain.ReasonMergedWaitingDecision, + wantPR: domain.PRReasonMerged, + }, + { + name: "closed drops to idle", + pr: domain.PRClosed, + wantStatus: domain.StatusIdle, + wantState: domain.SessionIdle, + wantReason: domain.ReasonAwaitingUserInput, + wantPR: domain.PRReasonClosedUnmerged, + }, + { + name: "non-terminal none is a working no-op", + pr: domain.PRNone, + wantStatus: domain.StatusWorking, + 
wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + { + name: "non-terminal open is a working no-op", + pr: domain.PROpen, + wantStatus: domain.StatusWorking, + wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ResolveTerminalPRStateDecision(tt.pr) + if got.Status != tt.wantStatus { + t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + } + if got.SessionState != tt.wantState { + t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + } + if got.SessionReason != tt.wantReason { + t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) + } + if tt.wantPR != "" && got.PRReason != tt.wantPR { + t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) + } + }) + } +} + +func TestCreateDetectingDecision(t *testing.T) { + const ev = "runtime_lost runtime=missing process=indeterminate" + hash := HashEvidence(ev) + + t.Run("first entry records attempt 1 and stays detecting", func(t *testing.T) { + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Now: t0}) + if got.Status != domain.StatusDetecting || got.SessionState != domain.SessionDetecting { + t.Fatalf("want detecting, got Status=%q State=%q", got.Status, got.SessionState) + } + if got.Detecting == nil || got.Detecting.Attempts != 1 { + t.Fatalf("want attempts=1, got %+v", got.Detecting) + } + if !got.Detecting.StartedAt.Equal(t0) { + t.Errorf("StartedAt = %v, want %v", got.Detecting.StartedAt, t0) + } + if got.Detecting.EvidenceHash != hash { + t.Errorf("EvidenceHash = %q, want %q", got.Detecting.EvidenceHash, hash) + } + if got.SessionReason != domain.ReasonRuntimeLost { + t.Errorf("SessionReason = %q, want %q", got.SessionReason, domain.ReasonRuntimeLost) + } + }) + + t.Run("unchanged evidence climbs the counter", func(t *testing.T) { + prior := 
&domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) + if got.Detecting == nil || got.Detecting.Attempts != 2 { + t.Fatalf("want attempts=2, got %+v", got.Detecting) + } + if !got.Detecting.StartedAt.Equal(t0) { + t.Errorf("StartedAt must be preserved, got %v", got.Detecting.StartedAt) + } + }) + + t.Run("escalates to stuck on the third unchanged tick", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) + if got.Status != domain.StatusStuck || got.SessionState != domain.SessionStuck { + t.Fatalf("want stuck, got Status=%q State=%q", got.Status, got.SessionState) + } + if got.Detecting != nil { + t.Errorf("stuck decision must drop detecting memory, got %+v", got.Detecting) + } + if got.SessionReason != domain.ReasonRuntimeLost { + t.Errorf("escalation should carry the why, got %q", got.SessionReason) + } + }) + + t.Run("changing evidence resets the counter but preserves StartedAt", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: "different evidence", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) + if got.Status != domain.StatusDetecting { + t.Fatalf("changed evidence should stay detecting, got %q", got.Status) + } + if got.Detecting == nil || got.Detecting.Attempts != 1 { + t.Fatalf("counter should reset to 1, got %+v", got.Detecting) + } + if !got.Detecting.StartedAt.Equal(t0) { + t.Errorf("StartedAt must survive an evidence change, got %v", got.Detecting.StartedAt) + } + }) + + t.Run("duration cap escalates even below 
the attempt count", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration)}) + if got.Status != domain.StatusStuck { + t.Fatalf("want stuck from duration cap, got %q", got.Status) + } + }) + + t.Run("duration cap fires even when evidence keeps flapping", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: "ever-changing", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Minute)}) + if got.Status != domain.StatusStuck { + t.Fatalf("duration cap must override a reset counter, got %q", got.Status) + } + }) +} + +func TestProbeDetectingEscalationFlow(t *testing.T) { + // An unchanging ambiguous probe should escalate to stuck after exactly + // DetectingMaxAttempts ticks. 
+ in := ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0} + d := ResolveProbeDecision(in) + for i := 1; i < DetectingMaxAttempts; i++ { + if d.Status != domain.StatusDetecting { + t.Fatalf("tick %d: expected detecting, got %q", i, d.Status) + } + in.Prior = d.Detecting + in.Now = t0.Add(time.Duration(i) * time.Second) + d = ResolveProbeDecision(in) + } + if d.Status != domain.StatusStuck { + t.Fatalf("expected escalation to stuck after %d ticks, got %q", DetectingMaxAttempts, d.Status) + } +} + +func TestHashEvidence(t *testing.T) { + t.Run("identical strings hash identically", func(t *testing.T) { + if HashEvidence("same input") != HashEvidence("same input") { + t.Error("identical evidence must hash equal") + } + }) + + t.Run("different evidence hashes differently", func(t *testing.T) { + if HashEvidence("runtime_lost") == HashEvidence("agent_process_exited") { + t.Error("distinct evidence must hash differently") + } + }) + + t.Run("only the timestamp differs -> equal hash", func(t *testing.T) { + a := "probe failed at 2026-05-26T12:00:00Z runtime=missing" + b := "probe failed at 2026-05-26T12:05:43.218Z runtime=missing" + if HashEvidence(a) != HashEvidence(b) { + t.Errorf("restamped evidence should hash equal:\n a=%q\n b=%q", a, b) + } + }) + + t.Run("bare time-of-day stripped", func(t *testing.T) { + if HashEvidence("idle since 12:00:00") != HashEvidence("idle since 13:30:59") { + t.Error("time-of-day differences should be stripped") + } + }) + + t.Run("unix epoch stripped", func(t *testing.T) { + if HashEvidence("last seen 1716724800") != HashEvidence("last seen 1716728400") { + t.Error("epoch differences should be stripped") + } + }) + + t.Run("a real content change still changes the hash", func(t *testing.T) { + a := "probe at 2026-05-26T12:00:00Z runtime=missing" + b := "probe at 2026-05-26T12:00:00Z runtime=alive" + if HashEvidence(a) == HashEvidence(b) { + t.Error("non-timestamp content change must change the hash") + } + 
// LifecycleDecision is the output of every decider: the derived display status
// plus the canonical sub-state values to persist, the human-readable evidence,
// and the (possibly updated) detecting memory.
//
// Zero-value sub-state fields mean "this decider does not address that
// sub-state — leave it unchanged", NOT "set it to the empty value". SessionState
// is always populated, but the probe/detecting/kill paths legitimately leave
// PRState/PRReason empty: a liveness verdict knows nothing about the PR. When
// the LCM turns a decision into a LifecyclePatch it must therefore map an empty
// PRState to a nil patch.PR (left untouched) rather than writing it through —
// writing PRNone on a routine probe tick would clobber a live PR. Detecting is
// nil-by-default for the same reason; see LifecyclePatch's three-way
// Detecting/ClearDetecting semantics.
type LifecycleDecision struct {
	// Status is the display status the decider reports for this decision.
	Status domain.SessionStatus
	// Evidence is the human-readable summary of why the decision was reached.
	Evidence string
	// Detecting is the anti-flap memory to carry to the next tick. The deciders
	// in this package set it only on a detecting verdict; it is nil otherwise.
	Detecting *domain.DetectingState
	// SessionState and SessionReason are the session sub-state to persist.
	SessionState  domain.SessionState
	SessionReason domain.SessionReason
	// PRState and PRReason are the PR sub-state to persist; empty means "not
	// addressed by this decider" (see the type comment).
	PRState  domain.PRState
	PRReason domain.PRReason
}
+type ProbeInput struct { + Runtime domain.RuntimeState + RuntimeFailed bool + Process ProcessLiveness + ProcessFailed bool + RecentActivity bool + KillRequested bool + Prior *domain.DetectingState + Now time.Time +} + +// ProcessLiveness mirrors isProcessRunning's three-valued answer. +type ProcessLiveness string + +const ( + ProcessAlive ProcessLiveness = "alive" + ProcessDead ProcessLiveness = "dead" + ProcessIndeterminate ProcessLiveness = "indeterminate" +) + +// OpenPRInput drives the PR pipeline ladder for an open PR. +type OpenPRInput struct { + CIFailing bool + ChangesRequested bool + Approved bool + Mergeable bool + ReviewPending bool + IdleBeyond bool // idle past the stuck threshold + Number int + URL string +} + +// DetectingInput feeds the quarantine counter. Evidence is hashed with +// timestamps stripped, so "same ambiguous signal" keeps the counter climbing +// while any real change resets it. +type DetectingInput struct { + Evidence string + ProposedState domain.SessionState + ProposedReason domain.SessionReason + Prior *domain.DetectingState + Now time.Time +} diff --git a/backend/internal/domain/lifecycle.go b/backend/internal/domain/lifecycle.go new file mode 100644 index 00000000..567a4769 --- /dev/null +++ b/backend/internal/domain/lifecycle.go @@ -0,0 +1,191 @@ +// Package domain holds the shared contract types for the LCM + Session Manager +// lane: the canonical session state model, the derived display status, and the +// session read-model. It has no behaviour beyond pure derivation (status.go) +// and imports nothing outside the standard library, so every other package can +// depend on it without creating cycles. +package domain + +import "time" + +// LifecycleVersion is the schema version stamped onto every persisted record. +// Greenfield: we start at 1 and carry no migration/synthesis code. +const LifecycleVersion = 1 + +// CanonicalSessionLifecycle is the ONLY thing persisted for a session's state. 
+// The display status is derived from it on read (see DeriveLegacyStatus) and is +// never stored — this prevents canonical truth and display from drifting. +// +// Three orthogonal (state, reason) sub-states describe the session, its PR, and +// its runtime. Activity and Detecting are decider *inputs* that must survive +// between observations (they are read back by the pure decide core), so they +// live in the persisted record too. +type CanonicalSessionLifecycle struct { + // Version is the schema version of this record's shape (LifecycleVersion). + Version int `json:"version"` + // Revision is a monotonic counter the store bumps on every write. It is used + // for optimistic-concurrency checks (LifecyclePatch.ExpectedRevision) and is + // distinct from the schema Version above. + Revision int `json:"revision"` + Session SessionSubstate `json:"session"` + PR PRSubstate `json:"pr"` + Runtime RuntimeSubstate `json:"runtime"` + + // Activity is the last-known agent activity. It arrives on a different + // cadence (ApplyActivitySignal) than runtime probes (the reaper), so the + // probe decider reads it from here to answer "was there recent activity?". + Activity ActivitySubstate `json:"activity"` + + // Detecting is the anti-flap quarantine memory. It is non-nil only while + // the session is in the detecting state; it carries the attempt counter, + // the first-entry time, and a hash of the (timestamp-stripped) evidence so + // the decider can tell "same ambiguous signal N times" from "signal moved". 
+ Detecting *DetectingState `json:"detecting,omitempty"` +} + +// ---- session sub-state ---- + +type SessionState string + +const ( + SessionNotStarted SessionState = "not_started" + SessionWorking SessionState = "working" + SessionIdle SessionState = "idle" + SessionNeedsInput SessionState = "needs_input" + SessionStuck SessionState = "stuck" + SessionDetecting SessionState = "detecting" + SessionDone SessionState = "done" + SessionTerminated SessionState = "terminated" +) + +type SessionReason string + +const ( + ReasonSpawnRequested SessionReason = "spawn_requested" + ReasonAgentAcknowledged SessionReason = "agent_acknowledged" + ReasonTaskInProgress SessionReason = "task_in_progress" + ReasonPRCreated SessionReason = "pr_created" + ReasonFixingCI SessionReason = "fixing_ci" + ReasonResolvingReviewComments SessionReason = "resolving_review_comments" + ReasonAwaitingUserInput SessionReason = "awaiting_user_input" + ReasonAwaitingExternalReview SessionReason = "awaiting_external_review" + ReasonResearchComplete SessionReason = "research_complete" + ReasonMergedWaitingDecision SessionReason = "merged_waiting_decision" + ReasonManuallyKilled SessionReason = "manually_killed" + ReasonPRMerged SessionReason = "pr_merged" + ReasonAutoCleanup SessionReason = "auto_cleanup" + ReasonRuntimeLost SessionReason = "runtime_lost" + ReasonAgentProcessExited SessionReason = "agent_process_exited" + ReasonProbeFailure SessionReason = "probe_failure" + ReasonErrorInProcess SessionReason = "error_in_process" +) + +type SessionSubstate struct { + State SessionState `json:"state"` + Reason SessionReason `json:"reason"` +} + +// ---- PR sub-state ---- + +type PRState string + +const ( + PRNone PRState = "none" + PROpen PRState = "open" + PRMerged PRState = "merged" + PRClosed PRState = "closed" +) + +type PRReason string + +const ( + PRReasonNotCreated PRReason = "not_created" + PRReasonInProgress PRReason = "in_progress" + PRReasonCIFailing PRReason = "ci_failing" + 
PRReasonReviewPending PRReason = "review_pending" + PRReasonChangesRequested PRReason = "changes_requested" + PRReasonApproved PRReason = "approved" + PRReasonMergeReady PRReason = "merge_ready" + PRReasonMerged PRReason = "merged" + PRReasonClosedUnmerged PRReason = "closed_unmerged" + PRReasonClearedOnRestore PRReason = "cleared_on_restore" +) + +type PRSubstate struct { + State PRState `json:"state"` + Reason PRReason `json:"reason"` + Number int `json:"number,omitempty"` + URL string `json:"url,omitempty"` +} + +// ---- runtime sub-state ---- + +type RuntimeState string + +const ( + RuntimeUnknown RuntimeState = "unknown" + RuntimeAlive RuntimeState = "alive" + RuntimeExited RuntimeState = "exited" + RuntimeMissing RuntimeState = "missing" + RuntimeProbeFailed RuntimeState = "probe_failed" +) + +type RuntimeReason string + +const ( + RuntimeReasonSpawnIncomplete RuntimeReason = "spawn_incomplete" + RuntimeReasonProcessRunning RuntimeReason = "process_running" + RuntimeReasonProcessMissing RuntimeReason = "process_missing" + RuntimeReasonTmuxMissing RuntimeReason = "tmux_missing" + RuntimeReasonManualKillRequested RuntimeReason = "manual_kill_requested" + RuntimeReasonPRMergedCleanup RuntimeReason = "pr_merged_cleanup" + RuntimeReasonAutoCleanup RuntimeReason = "auto_cleanup" + RuntimeReasonProbeError RuntimeReason = "probe_error" +) + +type RuntimeSubstate struct { + State RuntimeState `json:"state"` + Reason RuntimeReason `json:"reason"` +} + +// ---- activity sub-state (decider input) ---- + +type ActivityState string + +const ( + ActivityActive ActivityState = "active" + ActivityReady ActivityState = "ready" + ActivityIdle ActivityState = "idle" + ActivityWaitingInput ActivityState = "waiting_input" // sticky: does not decay by wallclock + ActivityBlocked ActivityState = "blocked" // sticky: does not decay by wallclock + ActivityExited ActivityState = "exited" +) + +// IsSticky reports whether an activity state must NOT be aged/demoted by the +// passage of 
time (a paused agent is still paused until a new signal says so). +func (a ActivityState) IsSticky() bool { + return a == ActivityWaitingInput || a == ActivityBlocked +} + +type ActivitySource string + +const ( + SourceNative ActivitySource = "native" + SourceTerminal ActivitySource = "terminal" + SourceHook ActivitySource = "hook" + SourceRuntime ActivitySource = "runtime" + SourceNone ActivitySource = "none" +) + +type ActivitySubstate struct { + State ActivityState `json:"state"` + LastActivityAt time.Time `json:"lastActivityAt"` + Source ActivitySource `json:"source"` +} + +// ---- detecting quarantine memory (decider input) ---- + +type DetectingState struct { + Attempts int `json:"attempts"` + StartedAt time.Time `json:"startedAt"` + EvidenceHash string `json:"evidenceHash"` +} diff --git a/backend/internal/domain/session.go b/backend/internal/domain/session.go new file mode 100644 index 00000000..578cca40 --- /dev/null +++ b/backend/internal/domain/session.go @@ -0,0 +1,42 @@ +package domain + +import "time" + +// SessionID, ProjectID, IssueID are distinct string types so they can't be +// swapped at a call site by accident. +type ( + SessionID string + ProjectID string + IssueID string +) + +type SessionKind string + +const ( + KindWorker SessionKind = "worker" + KindOrchestrator SessionKind = "orchestrator" +) + +// SessionRecord is the PERSISTENCE shape: identity, canonical lifecycle, and +// metadata — everything the store holds, and nothing derived. The store reads +// and writes records; it never produces the derived display status. 
+type SessionRecord struct { + ID SessionID `json:"id"` + ProjectID ProjectID `json:"projectId"` + IssueID IssueID `json:"issueId,omitempty"` + Kind SessionKind `json:"kind"` + Lifecycle CanonicalSessionLifecycle `json:"lifecycle"` + Metadata map[string]string `json:"metadata,omitempty"` + CreatedAt time.Time `json:"createdAt"` + UpdatedAt time.Time `json:"updatedAt"` +} + +// Session is the read-model returned across the API boundary (to controllers, +// then the frontend): a SessionRecord plus the DERIVED display Status. The +// Session Manager is the single producer of Status — it builds a Session from a +// stored SessionRecord by calling DeriveLegacyStatus, so the store and API +// never recompute (or accidentally persist) it. +type Session struct { + SessionRecord + Status SessionStatus `json:"status"` +} diff --git a/backend/internal/domain/status.go b/backend/internal/domain/status.go new file mode 100644 index 00000000..b12b2b9f --- /dev/null +++ b/backend/internal/domain/status.go @@ -0,0 +1,100 @@ +package domain + +// SessionStatus is the single-word DISPLAY status the dashboard renders. It is +// derived from the canonical lifecycle on read and never persisted. 
type SessionStatus string

// The display statuses. The trailing note on each constant names the canonical
// condition under which DeriveLegacyStatus yields it.
const (
	StatusSpawning         SessionStatus = "spawning"          // session state not_started
	StatusWorking          SessionStatus = "working"           // final fall-through: no hard state, no merged/open PR, session not idle
	StatusDetecting        SessionStatus = "detecting"         // session state detecting
	StatusPROpen           SessionStatus = "pr_open"           // open PR whose reason has no more specific mapping
	StatusCIFailed         SessionStatus = "ci_failed"         // open PR, reason ci_failing
	StatusReviewPending    SessionStatus = "review_pending"    // open PR, reason review_pending
	StatusChangesRequested SessionStatus = "changes_requested" // open PR, reason changes_requested
	StatusApproved         SessionStatus = "approved"          // open PR, reason approved
	StatusMergeable        SessionStatus = "mergeable"         // open PR, reason merge_ready
	StatusMerged           SessionStatus = "merged"            // merged PR on a session in a soft state
	StatusCleanup          SessionStatus = "cleanup"           // terminated: auto_cleanup or pr_merged
	StatusNeedsInput       SessionStatus = "needs_input"       // session state needs_input
	StatusStuck            SessionStatus = "stuck"             // session state stuck
	StatusErrored          SessionStatus = "errored"           // terminated: error_in_process or probe_failure
	StatusKilled           SessionStatus = "killed"            // terminated: manually_killed, runtime_lost or agent_process_exited
	StatusIdle             SessionStatus = "idle"              // session idle with neither a merged nor an open PR
	StatusDone             SessionStatus = "done"              // session state done
	StatusTerminated       SessionStatus = "terminated"        // terminated with any other reason
)

// DeriveLegacyStatus is the ONLY producer of the display status. It must stay a
// pure, total function of the canonical record.
//
// Order matters:
//  1. Terminal / hard session states (done, terminated, needs_input, stuck,
//     detecting, not_started) map directly — these OUTRANK PR facts.
//  2. Otherwise a merged PR wins.
//  3. Otherwise an open PR maps by its reason.
//  4. Otherwise fall through to the SOFT session state (idle/working).
//
// So "PR facts dominate session facts" applies only to the soft states: an idle
// or working session with an open, CI-failing PR displays as ci_failed — but a
// session that is stuck or needs_input shows that regardless of PR state, since
// it needs a human either way.
+func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus { + switch l.Session.State { + case SessionDone: + return StatusDone + case SessionTerminated: + return terminatedStatus(l.Session.Reason) + case SessionNeedsInput: + return StatusNeedsInput + case SessionStuck: + return StatusStuck + case SessionDetecting: + return StatusDetecting + case SessionNotStarted: + return StatusSpawning + } + + if l.PR.State == PRMerged { + return StatusMerged + } + + if l.PR.State == PROpen { + return openPRStatus(l.PR.Reason) + } + + if l.Session.State == SessionIdle { + return StatusIdle + } + return StatusWorking +} + +func terminatedStatus(r SessionReason) SessionStatus { + switch r { + case ReasonManuallyKilled, ReasonRuntimeLost, ReasonAgentProcessExited: + return StatusKilled + case ReasonAutoCleanup, ReasonPRMerged: + return StatusCleanup + case ReasonErrorInProcess, ReasonProbeFailure: + return StatusErrored + default: + return StatusTerminated + } +} + +func openPRStatus(r PRReason) SessionStatus { + switch r { + case PRReasonCIFailing: + return StatusCIFailed + case PRReasonChangesRequested: + return StatusChangesRequested + case PRReasonApproved: + return StatusApproved + case PRReasonMergeReady: + return StatusMergeable + case PRReasonReviewPending: + return StatusReviewPending + default: + return StatusPROpen + } +} diff --git a/backend/internal/domain/status_test.go b/backend/internal/domain/status_test.go new file mode 100644 index 00000000..12b0ade0 --- /dev/null +++ b/backend/internal/domain/status_test.go @@ -0,0 +1,87 @@ +package domain + +import "testing" + +func TestDeriveLegacyStatus(t *testing.T) { + tests := []struct { + name string + in CanonicalSessionLifecycle + want SessionStatus + }{ + { + name: "not_started maps to spawning", + in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNotStarted, Reason: ReasonSpawnRequested}}, + want: StatusSpawning, + }, + { + name: "terminated+manually_killed maps to killed", + in: 
CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonManuallyKilled}}, + want: StatusKilled, + }, + { + name: "terminated+auto_cleanup maps to cleanup", + in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonAutoCleanup}}, + want: StatusCleanup, + }, + { + name: "terminated+error maps to errored", + in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonErrorInProcess}}, + want: StatusErrored, + }, + { + name: "hard state needs_input maps directly", + in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNeedsInput}}, + want: StatusNeedsInput, + }, + { + name: "merged PR dominates an idle session", + in: CanonicalSessionLifecycle{ + Session: SessionSubstate{State: SessionIdle}, + PR: PRSubstate{State: PRMerged}, + }, + want: StatusMerged, + }, + { + name: "open PR with failing CI dominates idle session", + in: CanonicalSessionLifecycle{ + Session: SessionSubstate{State: SessionIdle}, + PR: PRSubstate{State: PROpen, Reason: PRReasonCIFailing}, + }, + want: StatusCIFailed, + }, + { + name: "open PR approved", + in: CanonicalSessionLifecycle{ + Session: SessionSubstate{State: SessionWorking}, + PR: PRSubstate{State: PROpen, Reason: PRReasonApproved}, + }, + want: StatusApproved, + }, + { + name: "open PR merge_ready maps to mergeable", + in: CanonicalSessionLifecycle{ + Session: SessionSubstate{State: SessionWorking}, + PR: PRSubstate{State: PROpen, Reason: PRReasonMergeReady}, + }, + want: StatusMergeable, + }, + { + name: "no PR falls through to idle", + in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionIdle}}, + want: StatusIdle, + }, + { + name: "no PR falls through to working", + in: CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}}, + want: StatusWorking, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := DeriveLegacyStatus(tt.in); got != tt.want { 
+ t.Errorf("DeriveLegacyStatus() = %q, want %q", got, tt.want) + } + }) + } +} diff --git a/backend/internal/lifecycle/decide_bridge.go b/backend/internal/lifecycle/decide_bridge.go new file mode 100644 index 00000000..942fdad4 --- /dev/null +++ b/backend/internal/lifecycle/decide_bridge.go @@ -0,0 +1,227 @@ +package lifecycle + +import ( + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/domain/decide" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// defaultRecentActivityWindow is how fresh the last activity signal must be for +// the probe decider to treat the agent as "recently active" (which keeps an +// ambiguous dead-runtime probe in detecting instead of concluding death). +const defaultRecentActivityWindow = 60 * time.Second + +// ---- fact translation: ports DTOs -> pure decide inputs ---- + +// runtimeFactsToProbeInput maps a raw RuntimeFacts (plus the prior detecting +// memory and last-known activity read back from canonical) into the probe +// decider's input. KillRequested is always false here: the inferred-death path +// never carries an explicit kill — that arrives via OnKillRequested. 
+func runtimeFactsToProbeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput { + rt, rtFailed := runtimeProbeToState(f.RuntimeState) + proc, procFailed := processProbeToLiveness(f.ProcessState) + now := nowOr(f.ObservedAt) + return decide.ProbeInput{ + Runtime: rt, + RuntimeFailed: rtFailed, + Process: proc, + ProcessFailed: procFailed, + RecentActivity: hasRecentActivity(cur.Activity, now, window), + Prior: cur.Detecting, + Now: now, + } +} + +func runtimeProbeToState(p ports.RuntimeProbe) (domain.RuntimeState, bool) { + switch p { + case ports.RuntimeProbeAlive: + return domain.RuntimeAlive, false + case ports.RuntimeProbeDead: + return domain.RuntimeExited, false + case ports.RuntimeProbeFailed: + return domain.RuntimeProbeFailed, true + default: // indeterminate / unset: ambiguous, never a death conclusion + return domain.RuntimeUnknown, false + } +} + +func processProbeToLiveness(p ports.ProcessProbe) (decide.ProcessLiveness, bool) { + switch p { + case ports.ProcessProbeAlive: + return decide.ProcessAlive, false + case ports.ProcessProbeDead: + return decide.ProcessDead, false + case ports.ProcessProbeFailed: + return decide.ProcessIndeterminate, true + default: // indeterminate / unset + return decide.ProcessIndeterminate, false + } +} + +// runtimeSubstateFromFacts derives the runtime sub-state to persist. Liveness +// always owns this axis, so it is written on every runtime observation +// regardless of what the session axis does. 
+func runtimeSubstateFromFacts(f ports.RuntimeFacts) domain.RuntimeSubstate { + switch f.RuntimeState { + case ports.RuntimeProbeAlive: + return domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning} + case ports.RuntimeProbeDead: + return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonTmuxMissing} + case ports.RuntimeProbeFailed: + return domain.RuntimeSubstate{State: domain.RuntimeProbeFailed, Reason: domain.RuntimeReasonProbeError} + case ports.RuntimeProbeIndeterminate: + // Probe ran but couldn't tell — distinct from a probe error, so no + // probe_error reason; the ambiguity is carried by RuntimeUnknown alone. + return domain.RuntimeSubstate{State: domain.RuntimeUnknown} + default: // unset + return domain.RuntimeSubstate{State: domain.RuntimeUnknown} + } +} + +// hasRecentActivity answers the probe decider's "was the agent heard from +// recently?" question. Sticky states (waiting_input/blocked) count as recent +// because they mean a live-but-paused agent; an explicit exited signal never +// counts; otherwise we age the last-activity timestamp against the window. +func hasRecentActivity(a domain.ActivitySubstate, now time.Time, window time.Duration) bool { + if a.State == domain.ActivityExited { + return false + } + if a.State.IsSticky() { + return true + } + if a.LastActivityAt.IsZero() { + return false + } + return now.Sub(a.LastActivityAt) <= window +} + +// openPRInput maps SCM facts onto the open-PR ladder. IdleBeyond is always false +// in split A — the idle-duration signal is owned by the escalation engine +// (split B); the synchronous LCM has no clock of its own here. 
+func openPRInput(f ports.SCMFacts) decide.OpenPRInput { + return decide.OpenPRInput{ + CIFailing: f.CISummary == ports.CIFailing, + ChangesRequested: f.ReviewDecision == ports.ReviewChangesRequested, + Approved: f.ReviewDecision == ports.ReviewApproved, + Mergeable: f.Mergeability.Mergeable, + ReviewPending: f.ReviewDecision == ports.ReviewPending, + Number: f.PRNumber, + URL: f.PRURL, + } +} + +// ---- activity -> session axis mapping (activity owns working/idle/waiting) ---- + +// activityToSession maps an activity classification onto the session sub-state. +// exited returns ok=false: an exit signal must NOT write a terminal session +// state — only the probe pipeline (via detecting) may conclude inferred death. +func activityToSession(a domain.ActivityState) (domain.SessionState, domain.SessionReason, bool) { + switch a { + case domain.ActivityActive: + return domain.SessionWorking, domain.ReasonTaskInProgress, true + case domain.ActivityReady: + // ready = the agent finished a unit and is waiting for more work. + return domain.SessionIdle, domain.ReasonResearchComplete, true + case domain.ActivityIdle: + // plain inactivity carries no completion claim, so no specific reason + // (research_complete here would read misleadingly in diagnostics). + return domain.SessionIdle, "", true + case domain.ActivityWaitingInput: + return domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, true + case domain.ActivityBlocked: + return domain.SessionStuck, domain.ReasonAwaitingUserInput, true + default: // exited / unset + return "", "", false + } +} + +// ---- composition predicates: who may write the session axis ---- + +// isTerminal reports a final session state that must not be resurrected by an +// observation (only an explicit Restore reopens a terminal session). 
+func isTerminal(s domain.SessionState) bool { + return s == domain.SessionDone || s == domain.SessionTerminated +} + +// isLivenessOwned reports whether the current session sub-state was set by the +// liveness/death axis (the probe pipeline) and may therefore be recovered by a +// later healthy probe. detecting is always liveness-owned; a stuck/terminated +// state is liveness-owned only when its reason came from a death inference. +func isLivenessOwned(s domain.SessionSubstate) bool { + if s.State == domain.SessionDetecting { + return true + } + switch s.Reason { + case domain.ReasonRuntimeLost, domain.ReasonAgentProcessExited, domain.ReasonProbeFailure: + return true + } + return false +} + +// shouldWriteSessionRuntime is the #1 composition rule for ApplyRuntimeObservation. +// A death-axis verdict (detecting/stuck/terminal) always writes — it overrides +// activity because a (maybe) dead agent can't be working/waiting. A healthy +// "working" verdict only writes when it is recovering a liveness-owned state +// (e.g. detecting -> working); it must NOT clobber an activity-owned +// needs_input/blocked/idle the activity axis is responsible for. +func shouldWriteSessionRuntime(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool { + if isTerminal(cur.Session.State) { + // A terminal session is only reopened by an explicit Restore — never by + // an observation. Even a death-axis verdict (e.g. detecting) must not + // resurrect it; the runtime axis is still patched separately. + return false + } + if d.SessionState == domain.SessionWorking { + return isLivenessOwned(cur.Session) + } + return true +} + +// shouldWriteSessionActivity is the mirror rule for ApplyActivitySignal: the +// activity axis owns working/idle/waiting. 
A valid activity signal is direct +// proof of life, so it is allowed to RESOLVE a detecting session (pull it out of +// the liveness quarantine) — but it must not resurrect a terminal session, and +// it leaves a liveness-escalated stuck state to the probe pipeline (stuck is a +// deliberate human-facing escalation, not a transient quarantine). +func shouldWriteSessionActivity(cur domain.CanonicalSessionLifecycle) bool { + if isTerminal(cur.Session.State) { + return false + } + if cur.Session.State == domain.SessionDetecting { + return true + } + return !isLivenessOwned(cur.Session) +} + +// ---- explicit-kill mapping (SM's terminal-write authority) ---- + +func killSession(k ports.LifecycleKillReason) domain.SessionSubstate { + switch k { + case ports.KillManual: + return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonManuallyKilled} + case ports.KillCleanup: + return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonAutoCleanup} + default: // error + return domain.SessionSubstate{State: domain.SessionTerminated, Reason: domain.ReasonErrorInProcess} + } +} + +func killRuntime(k ports.LifecycleKillReason) domain.RuntimeSubstate { + switch k { + case ports.KillManual: + return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonManualKillRequested} + case ports.KillCleanup: + return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonAutoCleanup} + default: // error + return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: domain.RuntimeReasonProbeError} + } +} + +func nowOr(t time.Time) time.Time { + if t.IsZero() { + return time.Now() + } + return t +} diff --git a/backend/internal/lifecycle/fakes_test.go b/backend/internal/lifecycle/fakes_test.go new file mode 100644 index 00000000..cc47ad84 --- /dev/null +++ b/backend/internal/lifecycle/fakes_test.go @@ -0,0 +1,185 @@ +package lifecycle + +import ( + "context" + "fmt" + "sync" + "time" + + 
"github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// fakeStore is an in-memory LifecycleStore that faithfully applies merge-patch +// semantics (sparse field writes, the three-way Detecting/ClearDetecting rule, +// ExpectedRevision optimistic-concurrency check, monotonic Revision bump) so +// tests assert against the real persisted canonical. +type fakeStore struct { + mu sync.Mutex + records map[domain.SessionID]*domain.SessionRecord + metadata map[domain.SessionID]map[string]string +} + +var _ ports.LifecycleStore = (*fakeStore)(nil) + +func newFakeStore() *fakeStore { + return &fakeStore{ + records: map[domain.SessionID]*domain.SessionRecord{}, + metadata: map[domain.SessionID]map[string]string{}, + } +} + +// seed installs a starting lifecycle for a session id (bypassing the patch path). +func (s *fakeStore) seed(id domain.SessionID, l domain.CanonicalSessionLifecycle) { + s.mu.Lock() + defer s.mu.Unlock() + if l.Version == 0 { + l.Version = domain.LifecycleVersion + } + s.records[id] = &domain.SessionRecord{ID: id, Lifecycle: l} +} + +func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + rec, ok := s.records[id] + if !ok { + return domain.CanonicalSessionLifecycle{}, false, nil + } + return rec.Lifecycle, true, nil +} + +func (s *fakeStore) PatchLifecycle(_ context.Context, id domain.SessionID, p ports.LifecyclePatch) error { + s.mu.Lock() + defer s.mu.Unlock() + + rec, ok := s.records[id] + if !ok { + rec = &domain.SessionRecord{ID: id, Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion}} + s.records[id] = rec + } + l := &rec.Lifecycle + + if p.ExpectedRevision != nil && *p.ExpectedRevision != l.Revision { + return fmt.Errorf("revision mismatch for %s: have %d, expected %d", id, l.Revision, *p.ExpectedRevision) + } + + if p.Session != nil { + 
l.Session = *p.Session + } + if p.PR != nil { + l.PR = *p.PR + } + if p.Runtime != nil { + l.Runtime = *p.Runtime + } + if p.Activity != nil { + l.Activity = *p.Activity + } + switch { + case p.ClearDetecting: + l.Detecting = nil + case p.Detecting != nil: + d := *p.Detecting + l.Detecting = &d + } + + l.Version = domain.LifecycleVersion + l.Revision++ + rec.UpdatedAt = time.Now() + return nil +} + +func (s *fakeStore) Seed(_ context.Context, rec domain.SessionRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + if _, ok := s.records[rec.ID]; ok { + return fmt.Errorf("seed: session %s already exists", rec.ID) + } + if rec.Lifecycle.Version == 0 { + rec.Lifecycle.Version = domain.LifecycleVersion + } + r := rec + s.records[rec.ID] = &r + return nil +} + +func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + rec, ok := s.records[id] + if !ok { + return domain.SessionRecord{}, false, nil + } + return *rec, true, nil +} + +func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + var out []domain.SessionRecord + for _, rec := range s.records { + if rec.ProjectID == project { + out = append(out, *rec) + } + } + return out, nil +} + +func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) { + s.mu.Lock() + defer s.mu.Unlock() + out := map[string]string{} + for k, v := range s.metadata[id] { + out[k] = v + } + return out, nil +} + +func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.metadata[id] == nil { + s.metadata[id] = map[string]string{} + } + for k, v := range kv { + s.metadata[id][k] = v + } + return nil +} + +// recordingNotifier captures emitted events for assertions. 
+type recordingNotifier struct { + mu sync.Mutex + events []ports.OrchestratorEvent +} + +var _ ports.Notifier = (*recordingNotifier)(nil) + +func (n *recordingNotifier) Notify(_ context.Context, e ports.OrchestratorEvent) error { + n.mu.Lock() + defer n.mu.Unlock() + n.events = append(n.events, e) + return nil +} + +// recordingMessenger captures messages injected into agents. +type recordingMessenger struct { + mu sync.Mutex + sent []struct { + ID domain.SessionID + Message string + } +} + +var _ ports.AgentMessenger = (*recordingMessenger)(nil) + +func (a *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error { + a.mu.Lock() + defer a.mu.Unlock() + a.sent = append(a.sent, struct { + ID domain.SessionID + Message string + }{id, message}) + return nil +} diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go new file mode 100644 index 00000000..2581fea0 --- /dev/null +++ b/backend/internal/lifecycle/manager.go @@ -0,0 +1,423 @@ +// Package lifecycle implements ports.LifecycleManager: the synchronous +// observe->decide->persist reducer. Every Apply*/On* entrypoint runs the same +// pipeline under a per-session lock — load canonical, run the matching pure +// decider, diff the result into a sparse merge-patch, persist. The LCM never +// polls and never writes the display status (that is derived on read). +// +// After a transition is persisted, the Apply* paths fire the mapped reaction +// (the ACT layer: reaction table + escalation engine) via the react() chokepoint +// in reactions.go. The Session Manager lands in a later split. +package lifecycle + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/domain/decide" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// Metadata keys OnSpawnCompleted records for the spawned session's handles. 
+const ( + MetaBranch = "branch" + MetaWorkspacePath = "workspacePath" + MetaRuntimeHandleID = "runtimeHandleId" + MetaRuntimeName = "runtimeName" + MetaAgentSessionID = "agentSessionId" +) + +// Manager is the LCM. The Apply* pipeline persists a transition and then fires +// the mapped reaction via Notifier/AgentMessenger (see reactions.go). +type Manager struct { + store ports.LifecycleStore + notifier ports.Notifier + messenger ports.AgentMessenger + + recentActivityWindow time.Duration + locks keyedMutex + + // trackers hold per-(session,reaction) escalation budgets (ACT policy, not + // canonical state). trackerMu guards them: react() touches them from the + // caller's goroutine, TickEscalations from the reaper's. clock is the time + // source for escalation stamping (overridable in tests). + trackers map[trackerKey]*reactionTracker + trackerMu sync.Mutex + clock func() time.Time +} + +var _ ports.LifecycleManager = (*Manager)(nil) + +func New(store ports.LifecycleStore, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager { + return &Manager{ + store: store, + notifier: notifier, + messenger: messenger, + recentActivityWindow: defaultRecentActivityWindow, + trackers: map[trackerKey]*reactionTracker{}, + clock: time.Now, + } +} + +// ---- per-session serialisation ---- + +// keyedMutex hands out one lock per session id so the load->decide->persist +// read-modify-write is serial within a session but parallel across sessions. +// +// Entries are reference-counted and evicted when the last holder releases, so +// the map stays bounded to sessions with in-flight operations rather than +// growing unbounded over the lifetime of a long-running daemon. 
+type keyedMutex struct { + mu sync.Mutex + locks map[domain.SessionID]*lockEntry +} + +type lockEntry struct { + mu sync.Mutex + refs int +} + +func (k *keyedMutex) lock(id domain.SessionID) func() { + k.mu.Lock() + if k.locks == nil { + k.locks = make(map[domain.SessionID]*lockEntry) + } + e, ok := k.locks[id] + if !ok { + e = &lockEntry{} + k.locks[id] = e + } + e.refs++ + k.mu.Unlock() + + e.mu.Lock() + return func() { + e.mu.Unlock() + k.mu.Lock() + e.refs-- + if e.refs == 0 { + delete(k.locks, id) + } + k.mu.Unlock() + } +} + +func (m *Manager) withLock(id domain.SessionID, fn func() error) error { + unlock := m.locks.lock(id) + defer unlock() + return fn() +} + +// transition is what a persisted write produced: the canonical before and after +// the patch. The ACT layer (react) derives the reaction from these. It is nil +// when the pipeline made no write. +type transition struct { + beforeLC domain.CanonicalSessionLifecycle + afterLC domain.CanonicalSessionLifecycle +} + +// mutate runs the shared pipeline: load -> build patch -> persist (only if the +// patch changed something). decideFn returns the diffed patch and whether it +// touches anything; a false "changed" is a clean no-op (no write, no revision +// bump), which is how failed-probe / unknown-fact inputs are dropped. +// +// On a write it returns the transition (before/after canonical) so the caller — +// which still holds the originating facts — can fire the mapped reaction. 
+func (m *Manager) mutate( + ctx context.Context, + id domain.SessionID, + decideFn func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error), +) (*transition, error) { + var tr *transition + err := m.withLock(id, func() error { + cur, exists, err := m.store.Load(ctx, id) + if err != nil { + return err + } + patch, changed, err := decideFn(cur, exists) + if err != nil { + return err + } + if !changed { + return nil + } + if err := m.store.PatchLifecycle(ctx, id, patch); err != nil { + return err + } + after, _, err := m.store.Load(ctx, id) + if err != nil { + return err + } + tr = &transition{beforeLC: cur, afterLC: after} + return nil + }) + return tr, err +} + +// ---- OBSERVE entrypoints ---- + +// ApplyRuntimeObservation feeds the probe decider. Liveness always writes the +// runtime axis; the session axis follows the #1 composition rule; and a +// non-detecting verdict clears any stale detecting memory (#3) so the next +// probe doesn't read a phantom prior. +func (m *Manager) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { + tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) { + if !exists { + return ports.LifecyclePatch{}, false, nil // nothing seeded; ignore stray probe + } + + d := decide.ResolveProbeDecision(runtimeFactsToProbeInput(f, cur, m.recentActivityWindow)) + + var patch ports.LifecyclePatch + changed := false + + if rt := runtimeSubstateFromFacts(f); cur.Runtime != rt { + patch.Runtime = &rt + changed = true + } + // A terminal session is reopened only by an explicit Restore: an + // observation may refresh the runtime axis above but must touch neither + // the session axis nor the detecting memory. 
+ if !isTerminal(cur.Session.State) { + if shouldWriteSessionRuntime(d, cur) { + changed = setSessionIfChanged(&patch, cur, d.SessionState, d.SessionReason) || changed + } + changed = setDetecting(&patch, cur, d.Detecting) || changed + } + + return patch, changed, nil + }) + if err != nil { + return err + } + return m.react(ctx, id, tr, reactionContext{}) +} + +// ApplySCMObservation maps PR facts onto the PR axis. A failed fetch is dropped +// (failed probe != "no PR"). An open PR writes only the PR sub-state — the +// session axis stays owned by activity, and DeriveLegacyStatus surfaces the PR +// reason for display. A terminal PR (merged/closed) also parks the session. +func (m *Manager) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error { + tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) { + if !exists || !f.Fetched { + return ports.LifecyclePatch{}, false, nil + } + + switch f.PRState { + case domain.PROpen: + d := decide.ResolveOpenPRDecision(openPRInput(f)) + var patch ports.LifecyclePatch + changed := setPRIfChanged(&patch, cur, d, f) + return patch, changed, nil + + case domain.PRMerged, domain.PRClosed: + d := decide.ResolveTerminalPRStateDecision(f.PRState) + var patch ports.LifecyclePatch + changed := setPRIfChanged(&patch, cur, d, f) + // A merge/close is a milestone that ends the work, so it parks the + // session axis (idle / merged_waiting_decision) even over an + // activity-owned needs_input/blocked — unlike the open-PR path, + // which leaves the session axis to activity. A terminal session is + // still never reopened. 
+ if !isTerminal(cur.Session.State) { + changed = setSessionIfChanged(&patch, cur, d.SessionState, d.SessionReason) || changed + } + return patch, changed, nil + + default: // none / unset: no PR-driven transition in split A + return ports.LifecyclePatch{}, false, nil + } + }) + if err != nil { + return err + } + return m.react(ctx, id, tr, reactionContext{ciFailureLogTail: f.CIFailureLogTail}) +} + +// ApplyActivitySignal updates the activity axis. Only a valid-confidence signal +// is authoritative (stale/unavailable/probe_failure != idleness). It refreshes +// the persisted activity sub-state (the probe decider's RecentActivity input) +// and maps the classification onto the session axis. A valid signal is proof of +// life, so it may resolve a detecting session — clearing the quarantine memory +// so a later probe doesn't resume counting from a stale prior. +func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { + tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) { + if !exists || s.State != ports.SignalValid { + return ports.LifecyclePatch{}, false, nil + } + + var patch ports.LifecyclePatch + changed := false + + act := domain.ActivitySubstate{State: s.Activity, LastActivityAt: nowOr(s.Timestamp), Source: s.Source} + if !sameActivity(cur.Activity, act) { + patch.Activity = &act + changed = true + } + if st, rs, ok := activityToSession(s.Activity); ok && shouldWriteSessionActivity(cur) { + changed = setSessionIfChanged(&patch, cur, st, rs) || changed + // Proof of life that pulls the session out of detecting must also + // drop the quarantine memory (detecting memory only exists while + // detecting, so this is a no-op otherwise). 
+ if cur.Detecting != nil { + patch.ClearDetecting = true + changed = true + } + } + + return patch, changed, nil + }) + if err != nil { + return err + } + return m.react(ctx, id, tr, reactionContext{}) +} + +// ---- mutation outcomes reported by the Session Manager ---- + +// OnSpawnCompleted records that a spawn finished: the runtime is up and the +// handles are known. Per the agreed rule it flips the runtime axis to alive and +// stores the handles in metadata, but leaves the session at not_started +// (display: spawning) — the agent "acknowledges" via the first activity signal. +func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { + return m.withLock(id, func() error { + cur, exists, err := m.store.Load(ctx, id) + if err != nil { + return err + } + if !exists { + // The SM seeds the initial lifecycle before spawning; a completion + // for an unseeded session is a contract violation, not a stray + // observation, so surface it rather than fabricating a record. + return fmt.Errorf("lifecycle: OnSpawnCompleted for unseeded session %q", id) + } + rt := domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning} + if cur.Runtime != rt { + if err := m.store.PatchLifecycle(ctx, id, ports.LifecyclePatch{Runtime: &rt}); err != nil { + return err + } + } + if meta := spawnMetadata(o); len(meta) > 0 { + if err := m.store.PatchMetadata(ctx, id, meta); err != nil { + return err + } + } + return nil + }) +} + +// OnKillRequested is the SM's explicit terminal-write authority (the one +// terminal path that does not go through the inferred-death decider). It writes +// the terminal session/runtime sub-states for the kill kind and clears any +// in-flight detecting memory. 
+func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error { + // An explicit user kill is a human action, not an inferred event, so it + // fires no reaction — the transition is discarded. + _, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) { + if !exists { + // Killing an unknown/already-gone session is a benign race; no-op + // rather than fabricating a terminal record for a session we never + // knew about. + return ports.LifecyclePatch{}, false, nil + } + + var patch ports.LifecyclePatch + changed := false + + if sess := killSession(r.Kind); cur.Session != sess { + patch.Session = &sess + changed = true + } + if rt := killRuntime(r.Kind); cur.Runtime != rt { + patch.Runtime = &rt + changed = true + } + if cur.Detecting != nil { + patch.ClearDetecting = true + changed = true + } + return patch, changed, nil + }) + if err != nil { + return err + } + // A kill is terminal but bypasses react()'s incident-over cleanup (it fires + // no reaction). Drop any escalation trackers here so a later duration-based + // TickEscalations can't emit reaction.escalated for a dead session. + m.clearSessionTrackers(id) + return nil +} + +// ---- patch helpers (diff -> sparse merge-patch) ---- + +// setSessionIfChanged sets patch.Session only when the decided sub-state +// differs from current; an empty decided state means "decider does not address +// the session axis" and is left untouched. +func setSessionIfChanged(patch *ports.LifecyclePatch, cur domain.CanonicalSessionLifecycle, st domain.SessionState, rs domain.SessionReason) bool { + if st == "" { + return false + } + want := domain.SessionSubstate{State: st, Reason: rs} + if cur.Session == want { + return false + } + patch.Session = &want + return true +} + +// setPRIfChanged folds the decided PR sub-state plus the fact-borne PR identity +// (number/url) into the patch when it differs from current. 
+func setPRIfChanged(patch *ports.LifecyclePatch, cur domain.CanonicalSessionLifecycle, d decide.LifecycleDecision, f ports.SCMFacts) bool { + want := domain.PRSubstate{State: d.PRState, Reason: d.PRReason, Number: f.PRNumber, URL: f.PRURL} + if cur.PR == want { + return false + } + patch.PR = &want + return true +} + +// setDetecting implements the three-way detecting semantics: set/replace when +// the decision carries memory, clear (#3) when it doesn't but canonical still +// holds stale memory, else leave untouched. +func setDetecting(patch *ports.LifecyclePatch, cur domain.CanonicalSessionLifecycle, d *domain.DetectingState) bool { + if d != nil { + if cur.Detecting != nil && *cur.Detecting == *d { + return false + } + patch.Detecting = d + return true + } + if cur.Detecting != nil { + patch.ClearDetecting = true + return true + } + return false +} + +// sameActivity compares activity sub-states with time-aware equality (== on +// time.Time is monotonic-clock sensitive and would spuriously report changes). 
+func sameActivity(a, b domain.ActivitySubstate) bool { + return a.State == b.State && a.Source == b.Source && a.LastActivityAt.Equal(b.LastActivityAt) +} + +func spawnMetadata(o ports.SpawnOutcome) map[string]string { + meta := map[string]string{} + if o.Branch != "" { + meta[MetaBranch] = o.Branch + } + if o.WorkspacePath != "" { + meta[MetaWorkspacePath] = o.WorkspacePath + } + if o.RuntimeHandle.ID != "" { + meta[MetaRuntimeHandleID] = o.RuntimeHandle.ID + } + if o.RuntimeHandle.RuntimeName != "" { + meta[MetaRuntimeName] = o.RuntimeHandle.RuntimeName + } + if o.AgentSessionID != "" { + meta[MetaAgentSessionID] = o.AgentSessionID + } + return meta +} diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go new file mode 100644 index 00000000..d0a97125 --- /dev/null +++ b/backend/internal/lifecycle/manager_test.go @@ -0,0 +1,477 @@ +package lifecycle + +import ( + "context" + "sync" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) + +const sid domain.SessionID = "s1" + +func newManager() (*Manager, *fakeStore) { + store := newFakeStore() + return New(store, &recordingNotifier{}, &recordingMessenger{}), store +} + +func mustLoad(t *testing.T, store *fakeStore) domain.CanonicalSessionLifecycle { + t.Helper() + l, ok, err := store.Load(context.Background(), sid) + if err != nil || !ok { + t.Fatalf("load: ok=%v err=%v", ok, err) + } + return l +} + +// ---- ApplyRuntimeObservation + #1 composition + #3 detecting clear ---- + +func TestApplyRuntimeObservation(t *testing.T) { + aliveProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0} + failedProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0} + deadProbe := 
ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0} + + tests := []struct { + name string + seed domain.CanonicalSessionLifecycle + facts ports.RuntimeFacts + wantSession domain.SessionState + wantReason domain.SessionReason + wantRuntime domain.RuntimeState + wantDisplay domain.SessionStatus + wantDetecting bool // expect non-nil detecting memory persisted + }{ + { + name: "healthy probe must not clobber an activity-owned needs_input (#1)", + seed: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive), + facts: aliveProbe, + wantSession: domain.SessionNeedsInput, + wantReason: domain.ReasonAwaitingUserInput, + wantRuntime: domain.RuntimeAlive, + wantDisplay: domain.StatusNeedsInput, + wantDetecting: false, + }, + { + name: "healthy probe recovers a liveness-owned detecting -> working and clears memory (#1 + #3)", + seed: detectingLC(), + facts: aliveProbe, + wantSession: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + wantRuntime: domain.RuntimeAlive, + wantDisplay: domain.StatusWorking, + wantDetecting: false, + }, + { + name: "failed probe routes to detecting and records memory", + seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), + facts: failedProbe, + wantSession: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantRuntime: domain.RuntimeProbeFailed, + wantDisplay: domain.StatusDetecting, + wantDetecting: true, + }, + { + name: "dead+dead with no recent activity concludes killed and clears detecting (#3)", + seed: detectingLC(), + facts: deadProbe, + wantSession: domain.SessionTerminated, + wantReason: domain.ReasonRuntimeLost, + wantRuntime: domain.RuntimeExited, + wantDisplay: domain.StatusKilled, + wantDetecting: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mgr, store := newManager() + store.seed(sid, tt.seed) + + if err := 
mgr.ApplyRuntimeObservation(context.Background(), sid, tt.facts); err != nil { + t.Fatalf("apply: %v", err) + } + + l := mustLoad(t, store) + if l.Session.State != tt.wantSession || l.Session.Reason != tt.wantReason { + t.Errorf("session = %v/%v, want %v/%v", l.Session.State, l.Session.Reason, tt.wantSession, tt.wantReason) + } + if l.Runtime.State != tt.wantRuntime { + t.Errorf("runtime = %v, want %v", l.Runtime.State, tt.wantRuntime) + } + if got := domain.DeriveLegacyStatus(l); got != tt.wantDisplay { + t.Errorf("display = %v, want %v", got, tt.wantDisplay) + } + if (l.Detecting != nil) != tt.wantDetecting { + t.Errorf("detecting present = %v, want %v (%+v)", l.Detecting != nil, tt.wantDetecting, l.Detecting) + } + }) + } +} + +func TestApplyRuntimeObservation_NoRecordIsNoOp(t *testing.T) { + mgr, store := newManager() + if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil { + t.Fatalf("apply: %v", err) + } + if _, ok, _ := store.Load(context.Background(), sid); ok { + t.Error("a probe for an unseeded session must not fabricate a record") + } +} + +func TestApplyRuntimeObservation_DoesNotResurrectTerminal(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.RuntimeExited)) + + // A failed probe would normally route to detecting, but a terminal session + // must not be reopened by an observation (only an explicit Restore does). 
+ if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil { + t.Fatalf("apply: %v", err) + } + + l := mustLoad(t, store) + if l.Session.State != domain.SessionTerminated || l.Session.Reason != domain.ReasonManuallyKilled { + t.Errorf("session = %v/%v, want terminated/manually_killed (no resurrection)", l.Session.State, l.Session.Reason) + } + if l.Detecting != nil { + t.Errorf("terminal session must not gain detecting memory, got %+v", l.Detecting) + } +} + +// ---- ApplyActivitySignal ---- + +func TestApplyActivitySignal(t *testing.T) { + tests := []struct { + name string + seed domain.CanonicalSessionLifecycle + signal ports.ActivitySignal + wantSession domain.SessionState + wantReason domain.SessionReason + checkReason bool + wantActivity domain.ActivityState + wantChanged bool + }{ + { + name: "valid waiting_input maps to needs_input", + seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), + signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityWaitingInput, Timestamp: t0, Source: domain.SourceHook}, + wantSession: domain.SessionNeedsInput, + wantActivity: domain.ActivityWaitingInput, + wantChanged: true, + }, + { + name: "valid active recovers needs_input -> working", + seed: lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive), + signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook}, + wantSession: domain.SessionWorking, + wantActivity: domain.ActivityActive, + wantChanged: true, + }, + { + name: "valid idle maps to idle with a neutral reason", + seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), + signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook}, + 
wantSession: domain.SessionIdle, + wantReason: "", + checkReason: true, + wantActivity: domain.ActivityIdle, + wantChanged: true, + }, + { + name: "low-confidence signal is dropped (no idleness inferred)", + seed: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive), + signal: ports.ActivitySignal{State: ports.SignalProbeFailure, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook}, + wantSession: domain.SessionWorking, + wantChanged: false, + }, + { + name: "valid activity resolves a detecting session (proof of life)", + seed: detectingLC(), + signal: ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook}, + wantSession: domain.SessionWorking, + wantActivity: domain.ActivityActive, + wantChanged: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + mgr, store := newManager() + store.seed(sid, tt.seed) + + if err := mgr.ApplyActivitySignal(context.Background(), sid, tt.signal); err != nil { + t.Fatalf("apply: %v", err) + } + + l := mustLoad(t, store) + if l.Session.State != tt.wantSession { + t.Errorf("session = %v, want %v", l.Session.State, tt.wantSession) + } + if tt.checkReason && l.Session.Reason != tt.wantReason { + t.Errorf("session reason = %q, want %q", l.Session.Reason, tt.wantReason) + } + if tt.wantChanged && l.Revision != 1 { + t.Errorf("revision = %d, want 1 (expected a write)", l.Revision) + } + if !tt.wantChanged && l.Revision != 0 { + t.Errorf("revision = %d, want 0 (expected a no-op)", l.Revision) + } + if tt.wantChanged && tt.wantActivity != "" && l.Activity.State != tt.wantActivity { + t.Errorf("activity = %v, want %v", l.Activity.State, tt.wantActivity) + } + if tt.name == "valid activity resolves a detecting session (proof of life)" && l.Detecting != nil { + t.Errorf("resolving detecting must clear the quarantine memory, got %+v", l.Detecting) + } + }) + } +} + +// ---- ApplySCMObservation ---- + +func 
TestApplySCMObservation(t *testing.T) { + t.Run("failed fetch is a no-op (failed probe != no PR)", func(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) + if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: false, PRState: domain.PROpen}); err != nil { + t.Fatalf("apply: %v", err) + } + if l := mustLoad(t, store); l.Revision != 0 || l.PR.State != "" { + t.Errorf("expected no-op, got revision=%d pr=%v", l.Revision, l.PR.State) + } + }) + + t.Run("open PR writes only the PR axis; session stays activity-owned", func(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) + f := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 12, PRURL: "https://x/12"} + if err := mgr.ApplySCMObservation(context.Background(), sid, f); err != nil { + t.Fatalf("apply: %v", err) + } + l := mustLoad(t, store) + if l.PR.State != domain.PROpen || l.PR.Reason != domain.PRReasonCIFailing || l.PR.Number != 12 { + t.Errorf("pr = %+v, want open/ci_failing/#12", l.PR) + } + if l.Session.State != domain.SessionWorking { + t.Errorf("session = %v, want working (untouched)", l.Session.State) + } + if got := domain.DeriveLegacyStatus(l); got != domain.StatusCIFailed { + t.Errorf("display = %v, want ci_failed", got) + } + }) + + t.Run("merged PR parks the session and displays merged", func(t *testing.T) { + mgr, store := newManager() + seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) + seed.PR = domain.PRSubstate{State: domain.PROpen, Reason: domain.PRReasonInProgress, Number: 12} + store.seed(sid, seed) + f := ports.SCMFacts{Fetched: true, PRState: domain.PRMerged, PRNumber: 12} + if err := mgr.ApplySCMObservation(context.Background(), sid, f); err != nil { + t.Fatalf("apply: %v", err) + } + l := mustLoad(t, store) + if 
l.PR.State != domain.PRMerged || l.Session.Reason != domain.ReasonMergedWaitingDecision { + t.Errorf("got pr=%v session=%v, want merged + merged_waiting_decision", l.PR.State, l.Session.Reason) + } + if got := domain.DeriveLegacyStatus(l); got != domain.StatusMerged { + t.Errorf("display = %v, want merged", got) + } + }) + + t.Run("open-PR review branches map to the PR axis", func(t *testing.T) { + cases := []struct { + name string + facts ports.SCMFacts + wantReason domain.PRReason + wantStatus domain.SessionStatus + }{ + {"changes requested", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested}, domain.PRReasonChangesRequested, domain.StatusChangesRequested}, + {"approved + mergeable", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, Mergeability: ports.Mergeability{Mergeable: true}}, domain.PRReasonMergeReady, domain.StatusMergeable}, + {"review pending", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewPending}, domain.PRReasonReviewPending, domain.StatusReviewPending}, + } + for _, c := range cases { + t.Run(c.name, func(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) + if err := mgr.ApplySCMObservation(context.Background(), sid, c.facts); err != nil { + t.Fatalf("apply: %v", err) + } + l := mustLoad(t, store) + if l.PR.State != domain.PROpen || l.PR.Reason != c.wantReason { + t.Errorf("pr = %v/%v, want open/%v", l.PR.State, l.PR.Reason, c.wantReason) + } + if got := domain.DeriveLegacyStatus(l); got != c.wantStatus { + t.Errorf("display = %v, want %v", got, c.wantStatus) + } + }) + } + }) + + t.Run("no PR is a no-op in split A", func(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) + if err := mgr.ApplySCMObservation(context.Background(), sid, 
ports.SCMFacts{Fetched: true, PRState: domain.PRNone}); err != nil { + t.Fatalf("apply: %v", err) + } + if l := mustLoad(t, store); l.Revision != 0 { + t.Errorf("expected no-op, got revision=%d", l.Revision) + } + }) +} + +// ---- mutation outcomes ---- + +func TestOnSpawnCompleted(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown)) + + out := ports.SpawnOutcome{ + Branch: "feat/x", + WorkspacePath: "/w/x", + RuntimeHandle: ports.RuntimeHandle{ID: "tmux:1", RuntimeName: "tmux"}, + AgentSessionID: "agent-1", + } + if err := mgr.OnSpawnCompleted(context.Background(), sid, out); err != nil { + t.Fatalf("apply: %v", err) + } + + l := mustLoad(t, store) + if l.Runtime.State != domain.RuntimeAlive { + t.Errorf("runtime = %v, want alive", l.Runtime.State) + } + if l.Session.State != domain.SessionNotStarted { + t.Errorf("session = %v, want not_started (spawn does not assert acknowledgement)", l.Session.State) + } + if got := domain.DeriveLegacyStatus(l); got != domain.StatusSpawning { + t.Errorf("display = %v, want spawning", got) + } + meta, _ := store.GetMetadata(context.Background(), sid) + if meta[MetaBranch] != "feat/x" || meta[MetaAgentSessionID] != "agent-1" || meta[MetaRuntimeName] != "tmux" { + t.Errorf("metadata not recorded: %+v", meta) + } +} + +func TestOnKillRequested(t *testing.T) { + tests := []struct { + name string + kind ports.LifecycleKillReason + wantReason domain.SessionReason + wantRuntime domain.RuntimeReason + wantDisplay domain.SessionStatus + }{ + {"manual", ports.KillManual, domain.ReasonManuallyKilled, domain.RuntimeReasonManualKillRequested, domain.StatusKilled}, + {"cleanup", ports.KillCleanup, domain.ReasonAutoCleanup, domain.RuntimeReasonAutoCleanup, domain.StatusCleanup}, + {"error", ports.KillError, domain.ReasonErrorInProcess, domain.RuntimeReasonProbeError, domain.StatusErrored}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t 
*testing.T) { + mgr, store := newManager() + store.seed(sid, detectingLC()) + + if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: tt.kind, Detail: "x"}); err != nil { + t.Fatalf("apply: %v", err) + } + + l := mustLoad(t, store) + if l.Session.State != domain.SessionTerminated || l.Session.Reason != tt.wantReason { + t.Errorf("session = %v/%v, want terminated/%v", l.Session.State, l.Session.Reason, tt.wantReason) + } + if l.Runtime.Reason != tt.wantRuntime { + t.Errorf("runtime reason = %v, want %v", l.Runtime.Reason, tt.wantRuntime) + } + if l.Detecting != nil { + t.Errorf("kill must clear detecting memory, got %+v", l.Detecting) + } + if got := domain.DeriveLegacyStatus(l); got != tt.wantDisplay { + t.Errorf("display = %v, want %v", got, tt.wantDisplay) + } + }) + } +} + +func TestOnSpawnCompleted_UnseededErrors(t *testing.T) { + mgr, store := newManager() + err := mgr.OnSpawnCompleted(context.Background(), sid, ports.SpawnOutcome{Branch: "x"}) + if err == nil { + t.Error("OnSpawnCompleted for an unseeded session must error, not fabricate a record") + } + if _, ok, _ := store.Load(context.Background(), sid); ok { + t.Error("no record should have been created") + } +} + +func TestOnKillRequested_UnseededIsNoOp(t *testing.T) { + mgr, store := newManager() + if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil { + t.Fatalf("kill of unknown session should be a benign no-op, got %v", err) + } + if _, ok, _ := store.Load(context.Background(), sid); ok { + t.Error("killing an unknown session must not fabricate a terminal record") + } +} + +// ---- fake store contract ---- + +func TestFakeStoreExpectedRevision(t *testing.T) { + store := newFakeStore() + store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) // revision 0 + rt := domain.RuntimeSubstate{State: domain.RuntimeExited} + + bad := 99 + if err := store.PatchLifecycle(context.Background(), 
sid, ports.LifecyclePatch{Runtime: &rt, ExpectedRevision: &bad}); err == nil { + t.Error("stale ExpectedRevision must be rejected") + } + good := 0 + if err := store.PatchLifecycle(context.Background(), sid, ports.LifecyclePatch{Runtime: &rt, ExpectedRevision: &good}); err != nil { + t.Errorf("matching ExpectedRevision must succeed, got %v", err) + } +} + +// ---- per-session serialisation under the race detector ---- + +func TestPerSessionSerialization(t *testing.T) { + mgr, store := newManager() + store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) + + const n = 50 + var wg sync.WaitGroup + wg.Add(n) + for i := 0; i < n; i++ { + go func(i int) { + defer wg.Done() + _ = mgr.ApplyActivitySignal(context.Background(), sid, ports.ActivitySignal{ + State: ports.SignalValid, + Activity: domain.ActivityActive, + Timestamp: t0.Add(time.Duration(i) * time.Second), + Source: domain.SourceHook, + }) + }(i) + } + wg.Wait() + + // Each goroutine writes a distinct LastActivityAt, so every call is a real + // change; with correct serialisation all n land without a lost update. 
+ if l := mustLoad(t, store); l.Revision != n { + t.Errorf("revision = %d, want %d (lost update under concurrency)", l.Revision, n) + } +} + +// ---- helpers ---- + +func lc(state domain.SessionState, reason domain.SessionReason, rt domain.RuntimeState) domain.CanonicalSessionLifecycle { + return domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: state, Reason: reason}, + Runtime: domain.RuntimeSubstate{State: rt}, + } +} + +func detectingLC() domain.CanonicalSessionLifecycle { + l := lc(domain.SessionDetecting, domain.ReasonRuntimeLost, domain.RuntimeMissing) + l.Detecting = &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: "abc"} + return l +} diff --git a/backend/internal/lifecycle/reactions.go b/backend/internal/lifecycle/reactions.go new file mode 100644 index 00000000..72841510 --- /dev/null +++ b/backend/internal/lifecycle/reactions.go @@ -0,0 +1,417 @@ +package lifecycle + +// reactions.go is the ACT layer: the reaction table, the per-(session,reaction) +// escalation engine, and the duration-driven TickEscalations the synchronous +// LCM can't wake itself for. Reactions fire from react() after a transition is +// persisted by the Apply* pipeline (see manager.go). +// +// Dispatch is synchronous: react() runs Send/Notify inline. It is the single +// dispatch chokepoint, so moving it onto a worker goroutine later (once a daemon +// owns that goroutine's lifecycle) is a change confined to this one function. + +import ( + "context" + "fmt" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// reactionKey names a row in the reaction table and a tracker bucket. 
type reactionKey string

// One key per row of the reaction table. The string values are the
// reaction names; each event type in the default table is "reaction." plus
// the matching value.
const (
	reactionCIFailed         reactionKey = "ci-failed"
	reactionChangesRequested reactionKey = "changes-requested"
	reactionBugbotComments   reactionKey = "bugbot-comments"
	reactionMergeConflicts   reactionKey = "merge-conflicts"
	reactionAgentIdle        reactionKey = "agent-idle"
	reactionApprovedAndGreen reactionKey = "approved-and-green"
	reactionAgentStuck       reactionKey = "agent-stuck"
	reactionNeedsInput       reactionKey = "agent-needs-input"
	reactionAgentExited      reactionKey = "agent-exited"
	reactionPRClosed         reactionKey = "pr-closed"
	reactionAllComplete      reactionKey = "all-complete"
)

// actionKind names what a reaction does when it fires.
type actionKind string

const (
	actionSendToAgent actionKind = "send-to-agent"
	actionNotify      actionKind = "notify"
	actionAutoMerge   actionKind = "auto-merge"
)

// reactionConfig is one row of the reaction table (distillation §4.1/§4.2).
//
//   - retries: numeric escalation cap; escalate once attempts exceed it.
//   - escalateAfter: duration escalation; escalate once this elapses since the
//     first attempt (fired by TickEscalations, since the LCM never polls).
//   - persistent: the tracker survives the status leaving the triggering
//     state; it only resets when the incident is truly over (PR no longer open
//     or the session terminal). Only ci-failed is persistent, so a flapping
//     CI (fail→pending→fail) keeps draining one shared retry budget.
type reactionConfig struct {
	action        actionKind          // what the reaction does when it fires
	message       string              // human-readable text of the reaction
	priority      ports.EventPriority // priority attached to the reaction
	eventType     string              // event name, e.g. "reaction.ci-failed"
	retries       int                 // see above; rows that omit it leave it at zero
	escalateAfter time.Duration       // see above; rows that omit it leave it at zero
	persistent    bool                // see above
}

// defaultReactions is the product's default behaviour (distillation §4.2).
// auto-merge is intentionally absent: approved-and-green is a notify, so the
// human decides to merge. The auto-merge action kind exists for opt-in configs,
// but no default row uses it.
var defaultReactions = map[reactionKey]reactionConfig{
	// ci-failed is the only persistent row: its retry budget is shared across
	// fail->pending->fail oscillations.
	reactionCIFailed: {
		action: actionSendToAgent, persistent: true, retries: 2,
		message:   "CI is failing on your PR. Review the failing output below and push a fix.",
		eventType: "reaction.ci-failed", priority: ports.PriorityAction,
	},
	reactionChangesRequested: {
		action: actionSendToAgent, escalateAfter: 30 * time.Minute,
		message:   "A reviewer requested changes on your PR. Address the comments and push.",
		eventType: "reaction.changes-requested", priority: ports.PriorityAction,
	},
	reactionBugbotComments: {
		action: actionSendToAgent, escalateAfter: 30 * time.Minute,
		message:   "An automated reviewer left comments on your PR. Address them and push.",
		eventType: "reaction.bugbot-comments", priority: ports.PriorityAction,
	},
	reactionMergeConflicts: {
		action: actionSendToAgent, escalateAfter: 15 * time.Minute,
		message:   "Your PR has merge conflicts. Rebase onto the base branch and resolve them.",
		eventType: "reaction.merge-conflicts", priority: ports.PriorityAction,
	},
	reactionAgentIdle: {
		action: actionSendToAgent, retries: 2, escalateAfter: 15 * time.Minute,
		message:   "You appear idle. Continue the task or explain what is blocking you.",
		eventType: "reaction.agent-idle", priority: ports.PriorityWarning,
	},
	reactionApprovedAndGreen: {
		// notify-only: a green, approved PR is the human-decision path — the human
		// decides to merge (no auto-merge by default).
		action: actionNotify, priority: ports.PriorityAction,
		message:   "PR is approved and green — ready to merge.",
		eventType: "reaction.approved-and-green",
	},
	reactionAgentStuck: {
		// §4.2 lists a threshold: 10m here; it is intentionally not gated — entry
		// into stuck is already debounced upstream by the detecting->stuck
		// quarantine (decide.DetectingMaxAttempts / decide.DetectingMaxDuration),
		// so a second timer would be redundant.
		action: actionNotify, priority: ports.PriorityUrgent,
		message:   "Agent is stuck and needs attention.",
		eventType: "reaction.agent-stuck",
	},
	reactionNeedsInput: {
		action: actionNotify, priority: ports.PriorityUrgent,
		message:   "Agent needs input to continue.",
		eventType: "reaction.agent-needs-input",
	},
	reactionAgentExited: {
		action: actionNotify, priority: ports.PriorityUrgent,
		message:   "Agent process exited unexpectedly.",
		eventType: "reaction.agent-exited",
	},
	reactionPRClosed: {
		action: actionNotify, priority: ports.PriorityAction,
		message:   "PR was closed without merging — decide: resume, learn, or terminate.",
		eventType: "reaction.pr-closed",
	},
	reactionAllComplete: {
		action: actionNotify, priority: ports.PriorityInfo,
		message:   "PR merged — work complete.",
		eventType: "reaction.all-complete",
	},
}

// reactionEventFor maps a canonical record to the reaction it should drive,
// mirroring DeriveLegacyStatus but for the ACT layer. ok is false when the
// current state has no reaction.
//
// A closed PR derives to the idle display status, so it is detected from the PR
// axis directly before falling through to the status mapping. bugbot-comments
// and merge-conflicts have no producer in the split-A decide core yet, so they
// are dormant: configured but unreachable until DECIDE surfaces them.
+func reactionEventFor(l domain.CanonicalSessionLifecycle) (reactionKey, bool) { + if l.PR.State == domain.PRClosed { + return reactionPRClosed, true + } + switch domain.DeriveLegacyStatus(l) { + case domain.StatusCIFailed: + return reactionCIFailed, true + case domain.StatusChangesRequested: + return reactionChangesRequested, true + case domain.StatusApproved, domain.StatusMergeable: + return reactionApprovedAndGreen, true + case domain.StatusIdle: + return reactionAgentIdle, true + case domain.StatusStuck: + return reactionAgentStuck, true + case domain.StatusNeedsInput: + return reactionNeedsInput, true + case domain.StatusKilled: + // Inferred death only — an explicit user kill goes through + // OnKillRequested, which does not react. + return reactionAgentExited, true + case domain.StatusMerged: + return reactionAllComplete, true + } + return "", false +} + +// reactionContext carries fact-derived material the message templates need. The +// SCM path populates it (CI failure log tail); other paths pass the zero value. +type reactionContext struct { + ciFailureLogTail *string +} + +// trackerKey buckets an escalation tracker by session and reaction. +type trackerKey struct { + id domain.SessionID + key reactionKey +} + +// reactionTracker is the per-(session,reaction) escalation budget. It lives in +// memory on the Manager: a daemon restart resets budgets, which only ever costs +// a few extra agent retries before re-escalating — never a missed human +// notification. Keeping it out of the canonical store preserves the +// truth-vs-policy split (the store holds session truth; this is ACT policy). +type reactionTracker struct { + attempts int + escalated bool + firstAttemptAt time.Time +} + +// react fires the ACT layer after a persisted transition: clear the tracker for +// the reaction we left, then dispatch the reaction for the one we entered. It +// fires only on a genuine reaction change, so re-persisting the same state does +// not re-dispatch. 
Synchronous by design (see file header). +// +// Integration-time caveat: react runs AFTER withLock releases (deliberately, so +// a busy-waiting send-to-agent never holds the per-session mutex). Under a live +// daemon with concurrent observers (SCM poller + reaper + activity ingest) the +// afterLC snapshot can be stale by dispatch time — e.g. a ci-failed send firing +// after the session already moved to approved. Tests are single-threaded so it +// is not observable yet; when the daemon lands, give react a per-session +// ordering (a small react queue) or re-check the triggering state before +// dispatching. +func (m *Manager) react(ctx context.Context, id domain.SessionID, tr *transition, rc reactionContext) error { + if tr == nil { + return nil + } + beforeKey, hadBefore := reactionEventFor(tr.beforeLC) + afterKey, hasAfter := reactionEventFor(tr.afterLC) + + changed := beforeKey != afterKey + + switch { + case incidentOver(tr.afterLC) || recovered(tr.afterLC): + // The PR-pipeline incident has ended — the PR resolved (merged/closed), + // the session went terminal, or it reached an approved/green state. Every + // tracker for this session is now stale, including a persistent ci-failed + // one. This is keyed on the state REACHED, not the one left: the recovery + // transition is typically review_pending->approved (beforeKey empty), so + // clearing only beforeKey would leak the ci-failed tracker and leave its + // escalated=true to silence a future regression. Clear them all. + m.clearSessionTrackers(id) + case hadBefore && (!hasAfter || changed): + // Within an unresolved open PR: a normal tracker resets when its state is + // left. A persistent one (ci-failed) is NOT cleared here — it must survive + // the ambiguous review_pending limbo (the fail->pending->fail flap, §4.2); + // it only resets via the recovery/incident-over branch above. 
+ if !defaultReactions[beforeKey].persistent { + m.clearTracker(id, beforeKey) + } + } + + if hasAfter && (!hadBefore || changed) { + return m.executeReaction(ctx, id, afterKey, rc) + } + return nil +} + +// incidentOver reports that a PR-pipeline incident has truly ended (PR no longer +// open, or the session terminal), so all trackers for the session may reset. +func incidentOver(l domain.CanonicalSessionLifecycle) bool { + return l.PR.State != domain.PROpen || isTerminal(l.Session.State) +} + +// recovered reports a genuinely-green open PR: an approved/mergeable state, which +// unambiguously means CI is no longer failing (the open-PR ladder ranks ci_failing +// above approved, so an approved display cannot coexist with failing CI). Unlike +// the ambiguous review_pending state — which may just be CI re-running — reaching +// this ends a ci-failed incident and re-arms its budget. +func recovered(l domain.CanonicalSessionLifecycle) bool { + if l.PR.State != domain.PROpen { + return false + } + switch l.PR.Reason { + case domain.PRReasonApproved, domain.PRReasonMergeReady: + return true + default: + return false + } +} + +func (m *Manager) executeReaction(ctx context.Context, id domain.SessionID, key reactionKey, rc reactionContext) error { + cfg := defaultReactions[key] + switch cfg.action { + case actionNotify: + // notify reactions are human-attention terminals: fire once on the + // triggering transition, no retry/escalation budget. + return m.notifier.Notify(ctx, ports.OrchestratorEvent{ + Type: cfg.eventType, + Priority: cfg.priority, + SessionID: id, + Message: cfg.message, + }) + case actionAutoMerge: + // Off by default: no default row maps here, and wiring a merge port is a + // later PR. An opt-in config could route a reaction here. 
+ return nil + case actionSendToAgent: + return m.sendToAgent(ctx, id, key, cfg, rc) + } + return nil +} + +// sendToAgent runs the escalation engine for an auto send-to-agent reaction: +// count the attempt, escalate when the numeric cap or duration is exceeded +// (silencing further auto-dispatch), else inject the message via the messenger. +func (m *Manager) sendToAgent(ctx context.Context, id domain.SessionID, key reactionKey, cfg reactionConfig, rc reactionContext) error { + m.trackerMu.Lock() + tk := m.trackerFor(id, key) + if tk.escalated { + m.trackerMu.Unlock() + return nil // silenced until the condition clears the tracker + } + now := m.clock() + freshFirst := tk.firstAttemptAt.IsZero() + if freshFirst { + tk.firstAttemptAt = now + } + tk.attempts++ + if shouldEscalate(tk, cfg, now) { + tk.escalated = true + m.trackerMu.Unlock() + return m.escalate(ctx, id, key) + } + m.trackerMu.Unlock() + + if err := m.messenger.Send(ctx, id, composeMessage(cfg, rc)); err != nil { + // A delivery failure must not consume escalation budget: roll this + // attempt back so the next relevant transition retries from the same + // point rather than marching toward escalation on undelivered messages + // (distillation §4.3). + m.trackerMu.Lock() + tk.attempts-- + if freshFirst { + tk.firstAttemptAt = time.Time{} + } + m.trackerMu.Unlock() + return err + } + return nil +} + +// shouldEscalate uses inclusive boundaries: escalate once the numeric cap is +// exceeded or once exactly escalateAfter has elapsed (don't wait for the next +// tick to cross a strict threshold). +func shouldEscalate(tk *reactionTracker, cfg reactionConfig, now time.Time) bool { + if cfg.retries > 0 && tk.attempts > cfg.retries { + return true + } + if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { + return true + } + return false +} + +// escalate emits reaction.escalated and notifies the human. 
The caller has +// already set tracker.escalated under the lock, which silences further +// auto-dispatch for this reaction until the tracker clears. +func (m *Manager) escalate(ctx context.Context, id domain.SessionID, key reactionKey) error { + return m.notifier.Notify(ctx, ports.OrchestratorEvent{ + Type: "reaction.escalated", + Priority: ports.PriorityUrgent, + SessionID: id, + Message: fmt.Sprintf("auto-handling of %q is exhausted and needs a human.", key), + Data: map[string]any{"reaction": string(key)}, + }) +} + +func composeMessage(cfg reactionConfig, rc reactionContext) string { + if rc.ciFailureLogTail != nil && *rc.ciFailureLogTail != "" { + return cfg.message + "\n\nFailing output:\n" + *rc.ciFailureLogTail + } + return cfg.message +} + +// trackerFor returns the tracker for (id,key), creating it on first use. The +// caller must hold trackerMu. +func (m *Manager) trackerFor(id domain.SessionID, key reactionKey) *reactionTracker { + k := trackerKey{id: id, key: key} + tk := m.trackers[k] + if tk == nil { + tk = &reactionTracker{} + m.trackers[k] = tk + } + return tk +} + +func (m *Manager) clearTracker(id domain.SessionID, key reactionKey) { + m.trackerMu.Lock() + delete(m.trackers, trackerKey{id: id, key: key}) + m.trackerMu.Unlock() +} + +// clearSessionTrackers drops every tracker for a session — used when its +// incident is over, so no budget (and no stale escalated=true) survives into a +// later unrelated incident. +func (m *Manager) clearSessionTrackers(id domain.SessionID) { + m.trackerMu.Lock() + for k := range m.trackers { + if k.id == id { + delete(m.trackers, k) + } + } + m.trackerMu.Unlock() +} + +// TickEscalations fires the duration-based escalations the synchronous LCM +// cannot wake itself for. The reaper calls it on a timer; it escalates any +// not-yet-escalated tracker whose escalateAfter has elapsed. Notifications are +// sent outside the lock so agent/notifier latency never blocks tracker access. 
+func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error { + type due struct { + id domain.SessionID + key reactionKey + } + var fire []due + + m.trackerMu.Lock() + for k, tk := range m.trackers { + if tk.escalated { + continue + } + cfg := defaultReactions[k.key] + if cfg.escalateAfter > 0 && !tk.firstAttemptAt.IsZero() && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter { + tk.escalated = true + fire = append(fire, due{id: k.id, key: k.key}) + } + } + m.trackerMu.Unlock() + + for _, d := range fire { + if err := m.escalate(ctx, d.id, d.key); err != nil { + return err + } + } + return nil +} diff --git a/backend/internal/lifecycle/reactions_test.go b/backend/internal/lifecycle/reactions_test.go new file mode 100644 index 00000000..e90e8881 --- /dev/null +++ b/backend/internal/lifecycle/reactions_test.go @@ -0,0 +1,416 @@ +package lifecycle + +import ( + "context" + "fmt" + "strings" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// failingMessenger always fails delivery, counting attempts — used to assert a +// send failure does not consume escalation budget. +type failingMessenger struct{ attempts int } + +func (f *failingMessenger) Send(_ context.Context, _ domain.SessionID, _ string) error { + f.attempts++ + return fmt.Errorf("messenger unavailable") +} + +// newReactive wires a Manager with handles on the recording fakes so reaction +// tests can assert what was sent/notified. clock is pinned to t0 for +// deterministic escalation stamping. 
func newReactive() (*Manager, *fakeStore, *recordingNotifier, *recordingMessenger) {
	store := newFakeStore()
	notf := &recordingNotifier{}
	msgr := &recordingMessenger{}
	m := New(store, notf, msgr)
	m.clock = func() time.Time { return t0 }
	return m, store, notf, msgr
}

// lcOpenPR returns a working, runtime-alive lifecycle with open PR #7 carrying
// the given PR reason.
func lcOpenPR(reason domain.PRReason) domain.CanonicalSessionLifecycle {
	l := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)
	l.PR = domain.PRSubstate{State: domain.PROpen, Reason: reason, Number: 7}
	return l
}

// notifyCount returns how many recorded notifier events have the given type.
func notifyCount(n *recordingNotifier, eventType string) int {
	n.mu.Lock()
	defer n.mu.Unlock()
	c := 0
	for _, e := range n.events {
		if e.Type == eventType {
			c++
		}
	}
	return c
}

// ctx returns a background context for the tests in this file.
func ctx() context.Context { return context.Background() }

// ---- right reaction per transition ----

func TestReaction_CIFailedSendsToAgentWithLogTail(t *testing.T) {
	m, store, notf, msgr := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	tail := "build failed\nundefined: foo"
	err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing,
		PRNumber: 7, CIFailureLogTail: &tail,
	})
	if err != nil {
		t.Fatalf("apply: %v", err)
	}

	if len(msgr.sent) != 1 {
		t.Fatalf("want 1 send, got %d", len(msgr.sent))
	}
	if got := msgr.sent[0].Message; !strings.Contains(got, "CI is failing") || !strings.Contains(got, tail) {
		t.Errorf("message missing base text or log tail: %q", got)
	}
	if notifyCount(notf, "reaction.escalated") != 0 {
		t.Error("a first failure must not escalate")
	}
}

func TestReaction_ApprovedAndGreenNotifiesNeverAutoMerges(t *testing.T) {
	m, store, notf, msgr := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved,
		Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7,
	})
	if err != nil {
		t.Fatalf("apply: %v", err)
	}

	// approved-and-green is notify (human decides to merge); the agent is never
	// messaged and no auto-merge fires.
	if len(msgr.sent) != 0 {
		t.Errorf("approved-and-green must not message the agent, got %d sends", len(msgr.sent))
	}
	if notifyCount(notf, "reaction.approved-and-green") != 1 {
		t.Errorf("want one approved-and-green notify, got events %+v", notf.events)
	}
}

func TestReaction_NotifyEventsForHardStates(t *testing.T) {
	tests := []struct {
		name      string
		apply     func(m *Manager)
		eventType string
	}{
		{
			name:      "waiting_input -> agent-needs-input",
			apply:     func(m *Manager) { applyActivity(m, domain.ActivityWaitingInput) },
			eventType: "reaction.agent-needs-input",
		},
		{
			name:      "blocked -> agent-stuck",
			apply:     func(m *Manager) { applyActivity(m, domain.ActivityBlocked) },
			eventType: "reaction.agent-stuck",
		},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			m, store, notf, msgr := newReactive()
			store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
			tc.apply(m)
			if notifyCount(notf, tc.eventType) != 1 {
				t.Errorf("want one %s, got events %+v", tc.eventType, notf.events)
			}
			if len(msgr.sent) != 0 {
				t.Errorf("notify reaction must not message the agent, got %d", len(msgr.sent))
			}
		})
	}
}

func TestReaction_InferredDeathNotifiesAgentExited(t *testing.T) {
	m, store, notf, _ := newReactive()
	store.seed(sid, detectingLC())

	err := m.ApplyRuntimeObservation(ctx(), sid, ports.RuntimeFacts{
		RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0,
	})
	if err != nil {
		t.Fatalf("apply: %v", err)
	}
	if l := mustLoad(t, store); domain.DeriveLegacyStatus(l) != domain.StatusKilled {
		t.Fatalf("precondition: want killed, got %s", domain.DeriveLegacyStatus(l))
	}
	if notifyCount(notf, "reaction.agent-exited") != 1 {
		t.Errorf("want one agent-exited, got events %+v", notf.events)
	}
}

func TestReaction_PRClosedAndMerged(t *testing.T) {
	tests := []struct {
		name      string
		prState   domain.PRState
		eventType string
	}{
		{"closed -> pr-closed", domain.PRClosed, "reaction.pr-closed"},
		{"merged -> all-complete", domain.PRMerged, "reaction.all-complete"},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			m, store, notf, _ := newReactive()
			store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
			err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
				Fetched: true, PRState: tc.prState, PRNumber: 7,
			})
			if err != nil {
				t.Fatalf("apply: %v", err)
			}
			if notifyCount(notf, tc.eventType) != 1 {
				t.Errorf("want one %s, got events %+v", tc.eventType, notf.events)
			}
		})
	}
}

func TestReaction_OnKillRequestedDoesNotReact(t *testing.T) {
	m, store, notf, msgr := newReactive()
	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))

	if err := m.OnKillRequested(ctx(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil {
		t.Fatalf("kill: %v", err)
	}
	// An explicit human kill is not an inferred event: no agent-exited, no send.
	if len(notf.events) != 0 || len(msgr.sent) != 0 {
		t.Errorf("explicit kill must fire no reaction: notifies=%+v sends=%+v", notf.events, msgr.sent)
	}
}

// ---- escalation engine ----

func TestReaction_CIFailedNumericEscalation(t *testing.T) {
	m, store, notf, msgr := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	// ci-failed has retries 2 and is persistent, so the budget is shared across
	// fail->pending->fail oscillations and escalates on the third failure.
	failN := 4
	for i := 0; i < failN; i++ {
		failCI(t, m)
		pendingCI(t, m) // oscillate out (persistent tracker must NOT reset)
	}

	if len(msgr.sent) != 2 {
		t.Errorf("want 2 auto-sends before escalation, got %d", len(msgr.sent))
	}
	if c := notifyCount(notf, "reaction.escalated"); c != 1 {
		t.Errorf("want exactly one escalation, got %d", c)
	}
}

func TestReaction_DurationEscalationFiresOnTick(t *testing.T) {
	m, store, notf, msgr := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	// changes-requested: send once now, then escalate by duration (30m) — which
	// only the reaper's TickEscalations can fire (the LCM never polls).
	err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7,
	})
	if err != nil {
		t.Fatalf("apply: %v", err)
	}
	if len(msgr.sent) != 1 {
		t.Fatalf("want one send on transition, got %d", len(msgr.sent))
	}

	if err := m.TickEscalations(ctx(), t0.Add(10*time.Minute)); err != nil {
		t.Fatalf("tick: %v", err)
	}
	if notifyCount(notf, "reaction.escalated") != 0 {
		t.Error("must not escalate before escalateAfter elapses")
	}

	// Inclusive boundary: escalate at exactly escalateAfter (30m), not only past it.
	if err := m.TickEscalations(ctx(), t0.Add(30*time.Minute)); err != nil {
		t.Fatalf("tick: %v", err)
	}
	if notifyCount(notf, "reaction.escalated") != 1 {
		t.Errorf("want one duration escalation at exactly 30m, got events %+v", notf.events)
	}
}

func TestReaction_KillClearsEscalationTrackers(t *testing.T) {
	m, store, notf, _ := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	// changes-requested creates a duration-based tracker.
	if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7,
	}); err != nil {
		t.Fatalf("apply: %v", err)
	}
	if sessionTrackerCount(m, sid) == 0 {
		t.Fatalf("precondition: expected a tracker")
	}

	if err := m.OnKillRequested(ctx(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil {
		t.Fatalf("kill: %v", err)
	}
	if n := sessionTrackerCount(m, sid); n != 0 {
		t.Errorf("kill must clear trackers, %d left", n)
	}
	// A later duration tick must not escalate a dead session.
	if err := m.TickEscalations(ctx(), t0.Add(time.Hour)); err != nil {
		t.Fatalf("tick: %v", err)
	}
	if c := notifyCount(notf, "reaction.escalated"); c != 0 {
		t.Errorf("killed session must not escalate, got %d", c)
	}
}

func TestReaction_SendFailureDoesNotBurnBudget(t *testing.T) {
	store := newFakeStore()
	notf := &recordingNotifier{}
	fm := &failingMessenger{}
	m := New(store, notf, fm)
	m.clock = func() time.Time { return t0 }
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	tail := "fail"
	failing := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail}
	pending := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7}

	// ci-failed has retries 2; with every delivery failing, the budget is rolled
	// back each time, so even 5 failures never escalate.
	for i := 0; i < 5; i++ {
		_ = m.ApplySCMObservation(ctx(), sid, failing) // returns the delivery error
		_ = m.ApplySCMObservation(ctx(), sid, pending)
	}
	if fm.attempts < 5 {
		t.Errorf("expected at least 5 send attempts, got %d", fm.attempts)
	}
	if c := notifyCount(notf, "reaction.escalated"); c != 0 {
		t.Errorf("undelivered messages must not escalate, got %d", c)
	}
}

func TestReaction_NonPersistentTrackerClearsOnLeave(t *testing.T) {
	m, store, _, msgr := newReactive()
	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))

	// agent-idle has retries 2 but is NOT persistent: leaving idle clears the
	// tracker, so three idle incidents each send fresh and none escalate.
	for i := 0; i < 3; i++ {
		applyActivity(m, domain.ActivityIdle)
		applyActivity(m, domain.ActivityActive)
	}
	if len(msgr.sent) != 3 {
		t.Errorf("want 3 idle sends (budget reset each incident), got %d", len(msgr.sent))
	}
}

func TestReaction_CIFailedRearmsOnGenuineRecovery(t *testing.T) {
	m, store, notf, msgr := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	// Drain the ci-failed budget to escalation (silenced thereafter).
	for i := 0; i < 4; i++ {
		failCI(t, m)
		pendingCI(t, m)
	}
	if notifyCount(notf, "reaction.escalated") != 1 {
		t.Fatalf("precondition: want one escalation, got %d", notifyCount(notf, "reaction.escalated"))
	}
	sentBefore := len(msgr.sent)

	// A genuine recovery (approved + green) ends the incident and re-arms the
	// budget; a later regression must re-nudge the agent, not stay silenced.
	if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved,
		Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7,
	}); err != nil {
		t.Fatalf("recover: %v", err)
	}
	failCI(t, m)

	if len(msgr.sent) != sentBefore+1 {
		t.Errorf("regression after recovery must re-nudge the agent: sends %d -> %d", sentBefore, len(msgr.sent))
	}
}

func TestReaction_IncidentOverClearsAllSessionTrackers(t *testing.T) {
	m, store, _, _ := newReactive()
	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))

	failCI(t, m) // creates a persistent ci-failed tracker
	if sessionTrackerCount(m, sid) == 0 {
		t.Fatalf("precondition: expected a ci-failed tracker")
	}

	// Merging ends the incident; no tracker (and no stale escalated=true) may
	// survive for the session.
	if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PRMerged, PRNumber: 7,
	}); err != nil {
		t.Fatalf("merge: %v", err)
	}
	if n := sessionTrackerCount(m, sid); n != 0 {
		t.Errorf("incident over must clear all trackers, %d left", n)
	}
}

// sessionTrackerCount returns how many trackers the Manager currently holds
// for the given session.
func sessionTrackerCount(m *Manager, id domain.SessionID) int {
	m.trackerMu.Lock()
	defer m.trackerMu.Unlock()
	c := 0
	for k := range m.trackers {
		if k.id == id {
			c++
		}
	}
	return c
}

// ---- TickEscalations never writes canonical state ----

func TestTickEscalations_DoesNotPersist(t *testing.T) {
	m, store, _, _ := newReactive()
	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
	if err := m.TickEscalations(ctx(), t0); err != nil {
		t.Fatalf("tick: %v", err)
	}
	if l := mustLoad(t, store); l.Revision != 0 {
		t.Errorf("TickEscalations must not write canonical state, got revision=%d", l.Revision)
	}
}

// ---- helpers ----

// applyActivity feeds a valid hook-sourced activity signal stamped t0 for sid.
// The returned error is deliberately ignored: callers assert on the recorded
// sends and notifications instead.
func applyActivity(m *Manager, a domain.ActivityState) {
	_ = m.ApplyActivitySignal(ctx(), sid, ports.ActivitySignal{
		State: ports.SignalValid, Activity: a, Timestamp: t0, Source: domain.SourceHook,
	})
}

// failCI applies an open-PR observation with failing CI and a log tail,
// failing the test if the apply returns an error.
func failCI(t *testing.T, m *Manager) {
	t.Helper()
	tail := "fail"
	if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail,
	}); err != nil {
		t.Fatalf("failCI: %v", err)
	}
}

// pendingCI applies an open-PR observation with pending CI and pending review,
// failing the test if the apply returns an error.
func pendingCI(t *testing.T, m *Manager) {
	t.Helper()
	if err := m.ApplySCMObservation(ctx(), sid, ports.SCMFacts{
		Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7,
	}); err != nil {
		t.Fatalf("pendingCI: %v", err)
	}
}
diff --git a/backend/internal/ports/facts.go b/backend/internal/ports/facts.go
new file mode 100644
index 00000000..55f4f6ca
--- /dev/null
+++ b/backend/internal/ports/facts.go
@@ -0,0 +1,145 @@
// Package ports declares the boundary contracts for the LCM + Session Manager
// lane: the inbound interfaces we implement, the outbound interfaces others
// implement for us, and the fact DTOs that cross those boundaries.
//
// These are the types the SCM poller, persistence adapter, and API layer build
// against, so they are committed and stabilised before the LCM/SM logic.
package ports

import (
	"time"

	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
)

// SCMFacts is produced by the SCM poller and handed to ApplySCMObservation.
//
// Fetched is the failed-probe guard: when false, the GitHub query timed out or
// errored and the rest of the struct is meaningless — the LCM must NOT read it
// as "no PR / PR closed" (the SCM analogue of "failed probe != dead").
//
// CIFailureLogTail is a pointer because it is only populated when CI is failing;
// it carries ~120 lines and we don't want it on the hot poll path otherwise.
type SCMFacts struct {
	Fetched          bool // false: the fetch failed and every other field must be ignored
	ObservedAt       time.Time
	PRState          domain.PRState
	PRNumber         int
	PRURL            string
	CISummary        CISummary
	ReviewDecision   ReviewDecision
	Mergeability     Mergeability
	PendingComments  []ReviewComment
	CIFailureLogTail *string // nil unless CI is failing
}

// CISummary is the CI status value carried in SCMFacts.CISummary.
type CISummary string

const (
	CIPending CISummary = "pending"
	CIPassing CISummary = "passing"
	CIFailing CISummary = "failing"
	CINone    CISummary = "none"
)

// ReviewDecision is the review status value carried in SCMFacts.ReviewDecision.
type ReviewDecision string

const (
	ReviewApproved         ReviewDecision = "approved"
	ReviewChangesRequested ReviewDecision = "changes_requested"
	ReviewPending          ReviewDecision = "pending"
	ReviewNone             ReviewDecision = "none"
)

// Mergeability is the structured "can this merge?" answer. CIPassing/Approved
// here overlap CISummary/ReviewDecision by design (different granularity);
// Mergeability is authoritative for the merge gate, the others for display.
type Mergeability struct {
	Mergeable   bool
	CIPassing   bool
	Approved    bool
	NoConflicts bool
	Blockers    []string
}

// ReviewComment carries IsBot so the decider can route bot review comments
// (bugbot-comments reaction) differently from human ones (changes-requested).
type ReviewComment struct {
	Author string
	Body   string
	IsBot  bool
	URL    string
}

// RuntimeFacts is produced by the reaper and handed to ApplyRuntimeObservation.
type RuntimeFacts struct {
	ObservedAt   time.Time
	RuntimeState RuntimeProbe
	ProcessState ProcessProbe
}

// RuntimeProbe / ProcessProbe keep "failed" (the probe call itself errored or
// timed out) distinct from "indeterminate" (the probe ran but couldn't tell) —
// they route differently in the decider.
type RuntimeProbe string

const (
	RuntimeProbeAlive         RuntimeProbe = "alive"
	RuntimeProbeDead          RuntimeProbe = "dead"
	RuntimeProbeIndeterminate RuntimeProbe = "indeterminate"
	RuntimeProbeFailed        RuntimeProbe = "failed"
)

// ProcessProbe is the process-level counterpart of RuntimeProbe, carried in
// RuntimeFacts.ProcessState, with the same four outcomes.
type ProcessProbe string

const (
	ProcessProbeAlive         ProcessProbe = "alive"
	ProcessProbeDead          ProcessProbe = "dead"
	ProcessProbeIndeterminate ProcessProbe = "indeterminate"
	ProcessProbeFailed        ProcessProbe = "failed"
)

// ActivitySignal is pushed by agent hooks / the FS watcher. State is the
// confidence wrapper (so unavailable/probe_failure != idleness); Activity is
// the actual classification.
type ActivitySignal struct {
	State     SignalConfidence
	Activity  domain.ActivityState
	Timestamp time.Time
	Source    domain.ActivitySource
}

// SignalConfidence is the confidence wrapper carried in ActivitySignal.State.
type SignalConfidence string

const (
	SignalValid        SignalConfidence = "valid"
	SignalStale        SignalConfidence = "stale"
	SignalNull         SignalConfidence = "null"
	SignalUnavailable  SignalConfidence = "unavailable"
	SignalProbeFailure SignalConfidence = "probe_failure"
)

// SpawnOutcome is what the Session Manager reports to the LCM after a spawn.
// RuntimeHandle is the same structured handle the Runtime port returns, so no
// ad-hoc string encoding is needed for later Destroy/SendMessage calls.
type SpawnOutcome struct {
	Branch         string
	WorkspacePath  string
	RuntimeHandle  RuntimeHandle
	AgentSessionID string
}

// KillReason is what the Session Manager reports to the LCM when a kill is
// requested. Kind drives whether the terminal state is killed/cleanup/errored.
+type KillReason struct { + Kind LifecycleKillReason + Detail string +} + +type LifecycleKillReason string + +const ( + KillManual LifecycleKillReason = "manual" + KillCleanup LifecycleKillReason = "cleanup" + KillError LifecycleKillReason = "error" +) diff --git a/backend/internal/ports/inbound.go b/backend/internal/ports/inbound.go new file mode 100644 index 00000000..30ab7559 --- /dev/null +++ b/backend/internal/ports/inbound.go @@ -0,0 +1,70 @@ +package ports + +import ( + "context" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +// LifecycleManager is the inbound contract we implement. Every Apply* method +// runs the same synchronous pipeline: load canonical -> pure decide -> diff -> +// persist (merge-patch) -> if the status transitioned, fire reactions. The LCM +// never polls; observers (SCM poller, reaper, activity ingest) call in. +// +// Concurrency: the LCM serialises per session, so concurrent Apply* calls for +// the same session do not race the load/decide/persist read-modify-write. +type LifecycleManager interface { + // Raw-fact entrypoints (each runs decide internally). + ApplySCMObservation(ctx context.Context, id domain.SessionID, f SCMFacts) error + ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f RuntimeFacts) error + ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ActivitySignal) error + + // Mutation outcomes reported by the Session Manager. + OnSpawnCompleted(ctx context.Context, id domain.SessionID, o SpawnOutcome) error + OnKillRequested(ctx context.Context, id domain.SessionID, r KillReason) error + + // Reaper heartbeat that drives duration-based escalation (a non-polling + // LCM can't wake itself to fire a "30m elapsed" escalation). + TickEscalations(ctx context.Context, now time.Time) error +} + +// SessionManager is the inbound contract called by the API layer and CLI. 
It +// owns explicit mutations (spawn/kill/restore/cleanup) and never derives or +// writes observed state directly — it routes outcomes to the LCM. +type SessionManager interface { + Spawn(ctx context.Context, cfg SpawnConfig) (domain.Session, error) + Kill(ctx context.Context, id domain.SessionID, opts KillOptions) (KillResult, error) + List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) + Get(ctx context.Context, id domain.SessionID) (domain.Session, error) + Send(ctx context.Context, id domain.SessionID, message string) error + Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) + Cleanup(ctx context.Context, project domain.ProjectID) (CleanupResult, error) +} + +type SpawnConfig struct { + ProjectID domain.ProjectID + IssueID domain.IssueID + Kind domain.SessionKind + Branch string + Prompt string + AgentRules string + // OpenTerminal is reserved for a later lane (open a terminal tab on spawn). + // Spawn does NOT honor it yet — setting it has no effect. + OpenTerminal bool +} + +type KillOptions struct { + Reason LifecycleKillReason + Detail string +} + +type KillResult struct { + ID domain.SessionID + WorkspaceFreed bool +} + +type CleanupResult struct { + Cleaned []domain.SessionID + Skipped []domain.SessionID // e.g. paths that still held uncommitted work +} diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go new file mode 100644 index 00000000..a9c03e22 --- /dev/null +++ b/backend/internal/ports/outbound.go @@ -0,0 +1,152 @@ +package ports + +import ( + "context" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +// LifecycleStore is the persistence adapter, the ONLY disk writer. It owns +// merge-patch, atomic write, file lock, and CDC eventing. The LCM and SM only +// ever touch state through this narrow interface. 
+// +// List returns persistence records (no derived status); the Session Manager +// turns those into domain.Session by attaching the derived display status. +// +// Seed and Get are the two record-with-identity methods the Session Manager +// needs that the LCM does not: Load returns lifecycle only (all the decider +// needs), so the SM read-model and explicit-create path would otherwise have no +// way to write or read a record's identity (ID/ProjectID/IssueID/Kind/CreatedAt) +// by id. (Co-owned with Tom's persistence layer — added here to close that gap.) +type LifecycleStore interface { + Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) + PatchLifecycle(ctx context.Context, id domain.SessionID, patch LifecyclePatch) error + List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) + GetMetadata(ctx context.Context, id domain.SessionID) (map[string]string, error) + PatchMetadata(ctx context.Context, id domain.SessionID, kv map[string]string) error + + // Seed creates a new record with its identity and initial lifecycle. It is + // the SM's explicit-create path (the LCM only ever patches existing records); + // OnSpawnCompleted requires a seeded record, so Spawn calls this first. It + // must reject a seed for an id that already exists rather than overwrite — + // re-seeding an existing session (e.g. Restore) goes through PatchLifecycle. + Seed(ctx context.Context, rec domain.SessionRecord) error + + // Get returns a single full record (with identity) by id. Load is + // lifecycle-only, so the SM uses this to build the read-model and to + // reconstruct teardown handles for Kill/Restore on one id. + Get(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) +} + +// LifecyclePatch is a sparse merge-patch: a nil field is left untouched, a +// non-nil field is written. 
+// +// Detecting needs three-way semantics (leave / set / clear-to-nil): +// - ClearDetecting == true → store clears the detecting memory and IGNORES +// the Detecting field (clear wins; setting both is a caller bug). +// - ClearDetecting == false, Detecting != nil → set/replace the memory. +// - ClearDetecting == false, Detecting == nil → leave it untouched. +// +// ExpectedRevision supports optimistic concurrency: when non-nil the store must +// reject the patch if the stored Revision (the monotonic write counter, NOT the +// schema Version) differs. This is the alternative to the LCM owning all +// per-session serialisation itself. +type LifecyclePatch struct { + Session *domain.SessionSubstate + PR *domain.PRSubstate + Runtime *domain.RuntimeSubstate + Activity *domain.ActivitySubstate + Detecting *domain.DetectingState + ClearDetecting bool + ExpectedRevision *int +} + +// Notifier delivers events to the human (desktop/Slack later). Push, never pull. +type Notifier interface { + Notify(ctx context.Context, event OrchestratorEvent) error +} + +type EventPriority string + +const ( + PriorityUrgent EventPriority = "urgent" + PriorityAction EventPriority = "action" + PriorityWarning EventPriority = "warning" + PriorityInfo EventPriority = "info" +) + +type OrchestratorEvent struct { + Type string + Priority EventPriority + SessionID domain.SessionID + ProjectID domain.ProjectID + Message string + Data map[string]any +} + +// AgentMessenger injects a message into a running agent. The implementation +// busy-detects (waits for the agent to be idle/ready) and verifies delivery, +// which is why activity-detection accuracy matters. +type AgentMessenger interface { + Send(ctx context.Context, id domain.SessionID, message string) error +} + +// The runtime/agent/workspace plugin ports are co-owned with the coding-agents +// lane; the method sets below are the minimum the Session Manager spawn/kill +// pipelines call. 
They will be fleshed out alongside the tmux/claude-code impls. + +type Runtime interface { + Create(ctx context.Context, cfg RuntimeConfig) (RuntimeHandle, error) + Destroy(ctx context.Context, handle RuntimeHandle) error + SendMessage(ctx context.Context, handle RuntimeHandle, message string) error + GetOutput(ctx context.Context, handle RuntimeHandle, lines int) (string, error) + IsAlive(ctx context.Context, handle RuntimeHandle) (bool, error) +} + +type RuntimeConfig struct { + SessionID domain.SessionID + WorkspacePath string + LaunchCommand string + Env map[string]string +} + +type RuntimeHandle struct { + ID string + RuntimeName string +} + +type Agent interface { + GetLaunchCommand(cfg AgentConfig) string + GetEnvironment(cfg AgentConfig) map[string]string + // ProbeProcess returns the agent process liveness classification + // (alive/dead/indeterminate/failed) — not a boolean and not an activity + // state. Activity classification arrives separately via ActivitySignal. + ProbeProcess(ctx context.Context, handle RuntimeHandle) (ProcessProbe, error) + GetRestoreCommand(agentSessionID string) string +} + +type AgentConfig struct { + SessionID domain.SessionID + WorkspacePath string + Prompt string +} + +type Workspace interface { + Create(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error) + Destroy(ctx context.Context, info WorkspaceInfo) error + List(ctx context.Context, project domain.ProjectID) ([]WorkspaceInfo, error) + Restore(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error) +} + +type WorkspaceConfig struct { + ProjectID domain.ProjectID + SessionID domain.SessionID + Branch string +} + +type WorkspaceInfo struct { + Path string + Branch string + SessionID domain.SessionID + ProjectID domain.ProjectID +} diff --git a/backend/internal/session/fakes_test.go b/backend/internal/session/fakes_test.go new file mode 100644 index 00000000..648172de --- /dev/null +++ b/backend/internal/session/fakes_test.go @@ -0,0 +1,407 @@ +package 
session + +import ( + "context" + "fmt" + "sync" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// callLog records the cross-fake call order so tests can assert pipeline +// sequencing (e.g. OnKillRequested before Runtime.Destroy before Workspace.Destroy). +type callLog struct { + mu sync.Mutex + calls []string +} + +func (c *callLog) add(s string) { + c.mu.Lock() + defer c.mu.Unlock() + c.calls = append(c.calls, s) +} + +func (c *callLog) snapshot() []string { + c.mu.Lock() + defer c.mu.Unlock() + out := make([]string, len(c.calls)) + copy(out, c.calls) + return out +} + +// indexOf returns the position of the first call equal to name, or -1. +func (c *callLog) indexOf(name string) int { + for i, s := range c.snapshot() { + if s == name { + return i + } + } + return -1 +} + +// ---- fakeStore: in-memory LifecycleStore with faithful merge-patch + Seed/Get ---- + +type fakeStore struct { + mu sync.Mutex + records map[domain.SessionID]*domain.SessionRecord + metadata map[domain.SessionID]map[string]string +} + +var _ ports.LifecycleStore = (*fakeStore)(nil) + +func newFakeStore() *fakeStore { + return &fakeStore{ + records: map[domain.SessionID]*domain.SessionRecord{}, + metadata: map[domain.SessionID]map[string]string{}, + } +} + +func (s *fakeStore) Seed(_ context.Context, rec domain.SessionRecord) error { + s.mu.Lock() + defer s.mu.Unlock() + if _, ok := s.records[rec.ID]; ok { + return fmt.Errorf("seed: session %s already exists", rec.ID) + } + if rec.Lifecycle.Version == 0 { + rec.Lifecycle.Version = domain.LifecycleVersion + } + r := rec + s.records[rec.ID] = &r + return nil +} + +func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + rec, ok := s.records[id] + if !ok { + return domain.SessionRecord{}, 
false, nil + } + return s.withMetadata(*rec), true, nil +} + +func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) { + s.mu.Lock() + defer s.mu.Unlock() + rec, ok := s.records[id] + if !ok { + return domain.CanonicalSessionLifecycle{}, false, nil + } + return rec.Lifecycle, true, nil +} + +func (s *fakeStore) PatchLifecycle(_ context.Context, id domain.SessionID, p ports.LifecyclePatch) error { + s.mu.Lock() + defer s.mu.Unlock() + + rec, ok := s.records[id] + if !ok { + rec = &domain.SessionRecord{ID: id, Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion}} + s.records[id] = rec + } + l := &rec.Lifecycle + + if p.ExpectedRevision != nil && *p.ExpectedRevision != l.Revision { + return fmt.Errorf("revision mismatch for %s: have %d, expected %d", id, l.Revision, *p.ExpectedRevision) + } + + if p.Session != nil { + l.Session = *p.Session + } + if p.PR != nil { + l.PR = *p.PR + } + if p.Runtime != nil { + l.Runtime = *p.Runtime + } + if p.Activity != nil { + l.Activity = *p.Activity + } + switch { + case p.ClearDetecting: + l.Detecting = nil + case p.Detecting != nil: + d := *p.Detecting + l.Detecting = &d + } + + l.Version = domain.LifecycleVersion + l.Revision++ + rec.UpdatedAt = time.Now() + return nil +} + +func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) { + s.mu.Lock() + defer s.mu.Unlock() + var out []domain.SessionRecord + for _, rec := range s.records { + if rec.ProjectID == project { + out = append(out, s.withMetadata(*rec)) + } + } + return out, nil +} + +func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) { + s.mu.Lock() + defer s.mu.Unlock() + return cloneMap(s.metadata[id]), nil +} + +func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error { + s.mu.Lock() + defer s.mu.Unlock() + if s.metadata[id] == nil { + s.metadata[id] = 
map[string]string{} + } + for k, v := range kv { + s.metadata[id][k] = v + } + return nil +} + +// withMetadata attaches the separately-stored metadata to a record copy (a real +// store would return them together). Caller holds s.mu. +func (s *fakeStore) withMetadata(rec domain.SessionRecord) domain.SessionRecord { + if md := s.metadata[rec.ID]; len(md) > 0 { + rec.Metadata = cloneMap(md) + } + return rec +} + +// ---- fakeRuntime ---- + +type fakeRuntime struct { + log *callLog + createErr error + alive bool + + created []ports.RuntimeConfig + destroyed []ports.RuntimeHandle + sent []string +} + +var _ ports.Runtime = (*fakeRuntime)(nil) + +func (r *fakeRuntime) Create(_ context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) { + r.log.add("Runtime.Create") + if r.createErr != nil { + return ports.RuntimeHandle{}, r.createErr + } + r.created = append(r.created, cfg) + return ports.RuntimeHandle{ID: "rt-" + string(cfg.SessionID), RuntimeName: "tmux"}, nil +} + +func (r *fakeRuntime) Destroy(_ context.Context, h ports.RuntimeHandle) error { + r.log.add("Runtime.Destroy") + r.destroyed = append(r.destroyed, h) + return nil +} + +func (r *fakeRuntime) SendMessage(_ context.Context, _ ports.RuntimeHandle, message string) error { + r.sent = append(r.sent, message) + return nil +} + +func (r *fakeRuntime) GetOutput(_ context.Context, _ ports.RuntimeHandle, _ int) (string, error) { + return "", nil +} + +func (r *fakeRuntime) IsAlive(_ context.Context, _ ports.RuntimeHandle) (bool, error) { + return r.alive, nil +} + +// ---- fakeAgent ---- + +type fakeAgent struct { + env map[string]string +} + +var _ ports.Agent = (*fakeAgent)(nil) + +func (a *fakeAgent) GetLaunchCommand(_ ports.AgentConfig) string { return "claude" } + +func (a *fakeAgent) GetEnvironment(_ ports.AgentConfig) map[string]string { return cloneMap(a.env) } + +func (a *fakeAgent) ProbeProcess(_ context.Context, _ ports.RuntimeHandle) (ports.ProcessProbe, error) { + return 
ports.ProcessProbeAlive, nil +} + +func (a *fakeAgent) GetRestoreCommand(agentSessionID string) string { + return "claude --resume " + agentSessionID +} + +// ---- fakeWorkspace (with worktree-remove refusal mode) ---- + +type fakeWorkspace struct { + log *callLog + createErr error + refuse map[string]bool // path -> still registered after prune (uncommitted work) + created []ports.WorkspaceConfig + destroyed []ports.WorkspaceInfo + restoredID []domain.SessionID +} + +var _ ports.Workspace = (*fakeWorkspace)(nil) + +func (w *fakeWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + w.log.add("Workspace.Create") + if w.createErr != nil { + return ports.WorkspaceInfo{}, w.createErr + } + w.created = append(w.created, cfg) + return workspaceFor(cfg), nil +} + +func (w *fakeWorkspace) Destroy(_ context.Context, info ports.WorkspaceInfo) error { + w.log.add("Workspace.Destroy") + if w.refuse[info.Path] { + // Worktree-remove safety: after `git worktree prune` the path is still + // registered, so it may hold the agent's uncommitted work — refuse. 
+ return fmt.Errorf("workspace: refusing to rm -rf %s: still registered after prune", info.Path) + } + w.destroyed = append(w.destroyed, info) + return nil +} + +func (w *fakeWorkspace) List(_ context.Context, _ domain.ProjectID) ([]ports.WorkspaceInfo, error) { + return nil, nil +} + +func (w *fakeWorkspace) Restore(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) { + w.log.add("Workspace.Restore") + w.restoredID = append(w.restoredID, cfg.SessionID) + return workspaceFor(cfg), nil +} + +func workspaceFor(cfg ports.WorkspaceConfig) ports.WorkspaceInfo { + return ports.WorkspaceInfo{ + Path: "/tmp/ws/" + string(cfg.SessionID), + Branch: cfg.Branch, + SessionID: cfg.SessionID, + ProjectID: cfg.ProjectID, + } +} + +// ---- recordingMessenger ---- + +type recordingMessenger struct { + sent []struct { + ID domain.SessionID + Message string + } +} + +var _ ports.AgentMessenger = (*recordingMessenger)(nil) + +func (m *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error { + m.sent = append(m.sent, struct { + ID domain.SessionID + Message string + }{id, message}) + return nil +} + +// ---- noopNotifier ---- + +type noopNotifier struct{} + +var _ ports.Notifier = (*noopNotifier)(nil) + +func (noopNotifier) Notify(_ context.Context, _ ports.OrchestratorEvent) error { return nil } + +// ---- recordingLCM: wraps the REAL lifecycle.Manager and logs SM-facing calls ---- + +type recordingLCM struct { + log *callLog + inner ports.LifecycleManager + + // onSpawnErr, when set, makes OnSpawnCompleted fail (without touching the + // inner manager) so tests can exercise the SM's post-spawn failure paths. 
+ onSpawnErr error +} + +var _ ports.LifecycleManager = (*recordingLCM)(nil) + +func (l *recordingLCM) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error { + l.log.add("OnSpawnCompleted") + if l.onSpawnErr != nil { + return l.onSpawnErr + } + return l.inner.OnSpawnCompleted(ctx, id, o) +} + +func (l *recordingLCM) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error { + l.log.add("OnKillRequested") + return l.inner.OnKillRequested(ctx, id, r) +} + +func (l *recordingLCM) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error { + return l.inner.ApplySCMObservation(ctx, id, f) +} + +func (l *recordingLCM) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error { + return l.inner.ApplyRuntimeObservation(ctx, id, f) +} + +func (l *recordingLCM) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error { + return l.inner.ApplyActivitySignal(ctx, id, s) +} + +func (l *recordingLCM) TickEscalations(ctx context.Context, now time.Time) error { + return l.inner.TickEscalations(ctx, now) +} + +// ---- harness: wires the SM against the fakes + the real LCM ---- + +type harness struct { + sm *Manager + store *fakeStore + runtime *fakeRuntime + agent *fakeAgent + workspace *fakeWorkspace + messenger *recordingMessenger + lcm *recordingLCM + log *callLog +} + +var fixedTime = time.Date(2026, 5, 27, 12, 0, 0, 0, time.UTC) + +func newHarness(id domain.SessionID) *harness { + log := &callLog{} + store := newFakeStore() + rt := &fakeRuntime{log: log, alive: true} + ag := &fakeAgent{env: map[string]string{"BASE": "1"}} + ws := &fakeWorkspace{log: log, refuse: map[string]bool{}} + msg := &recordingMessenger{} + + lcm := &recordingLCM{log: log, inner: lifecycle.New(store, noopNotifier{}, msg)} + + sm := New(Deps{ + Runtime: rt, + Agent: ag, + Workspace: ws, + Store: store, + Messenger: msg, + Lifecycle: lcm, + Clock: 
func() time.Time { return fixedTime }, + NewID: func(ports.SpawnConfig) domain.SessionID { return id }, + }) + + return &harness{sm: sm, store: store, runtime: rt, agent: ag, workspace: ws, messenger: msg, lcm: lcm, log: log} +} + +func cloneMap(in map[string]string) map[string]string { + if in == nil { + return nil + } + out := make(map[string]string, len(in)) + for k, v := range in { + out[k] = v + } + return out +} diff --git a/backend/internal/session/manager.go b/backend/internal/session/manager.go new file mode 100644 index 00000000..e2723d26 --- /dev/null +++ b/backend/internal/session/manager.go @@ -0,0 +1,464 @@ +// Package session implements ports.SessionManager: the explicit-mutation half +// of the lane. The SM is impure plumbing — it drives the Runtime/Agent/Workspace +// plugins to create and tear down sessions, seeds the initial lifecycle record, +// and routes mutation outcomes to the LCM (OnSpawnCompleted / OnKillRequested). +// +// It NEVER derives or observes lifecycle state: observed transitions are the +// LCM's job. The SM's only canonical writes are the explicit ones — seeding a +// new record on Spawn and re-seeding (reopening) on Restore — and it is the +// single producer of the derived display status, attached on read in List/Get +// and never persisted. +package session + +import ( + "context" + "crypto/rand" + "encoding/hex" + "errors" + "fmt" + "strconv" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +// ErrNotFound is returned by Get/Restore when no record exists for the id. +var ErrNotFound = errors.New("session: not found") + +// ErrNotRestorable is returned by Restore when the session is not torn down. +// Restoring a live session would spin up a second runtime/workspace for the same +// id, duplicating the agent and risking data loss. 
+var ErrNotRestorable = errors.New("session: not restorable (not terminal)") + +// ErrIncompleteTeardownMetadata is returned when a record's teardown handles are +// missing (empty workspace path or runtime handle), so calling a real adapter's +// Destroy could act on empty args — an unsafe delete. The teardown is skipped. +var ErrIncompleteTeardownMetadata = errors.New("session: incomplete teardown metadata") + +// Env vars a spawned process reads to learn who it is (distillation §5.4). +const ( + EnvSessionID = "AO_SESSION_ID" + EnvProjectID = "AO_PROJECT_ID" + EnvIssueID = "AO_ISSUE_ID" +) + +// Manager implements ports.SessionManager against the outbound ports. Every +// dependency is an interface so the SM runs entirely against fakes in tests. +type Manager struct { + runtime ports.Runtime + agent ports.Agent + workspace ports.Workspace + store ports.LifecycleStore + messenger ports.AgentMessenger + lcm ports.LifecycleManager + + clock func() time.Time + newID func(ports.SpawnConfig) domain.SessionID +} + +var _ ports.SessionManager = (*Manager)(nil) + +// Deps groups the SM's collaborators. Clock and NewID are optional (defaulted) +// so production wiring only supplies the ports. +type Deps struct { + Runtime ports.Runtime + Agent ports.Agent + Workspace ports.Workspace + Store ports.LifecycleStore + Messenger ports.AgentMessenger + Lifecycle ports.LifecycleManager + + Clock func() time.Time + NewID func(ports.SpawnConfig) domain.SessionID +} + +func New(d Deps) *Manager { + m := &Manager{ + runtime: d.Runtime, + agent: d.Agent, + workspace: d.Workspace, + store: d.Store, + messenger: d.Messenger, + lcm: d.Lifecycle, + clock: d.Clock, + newID: d.NewID, + } + if m.clock == nil { + m.clock = time.Now + } + if m.newID == nil { + m.newID = defaultNewID + } + return m +} + +// ---- Spawn ---- + +// Spawn runs the create pipeline in spec order: workspace -> runtime -> seed -> +// report to the LCM. 
The record is seeded LATE (after the runtime is up), so a +// failure before the seed leaves no record for Cleanup to reclaim — hence each +// step eagerly rolls back the steps that already succeeded. +func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Session, error) { + id := m.newID(cfg) + + ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{ + ProjectID: cfg.ProjectID, + SessionID: id, + Branch: cfg.Branch, + }) + if err != nil { + return domain.Session{}, fmt.Errorf("spawn %s: workspace create: %w", id, err) + } + + agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: buildPrompt(cfg)} + handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{ + SessionID: id, + WorkspacePath: ws.Path, + LaunchCommand: m.agent.GetLaunchCommand(agentCfg), + Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, cfg.ProjectID, cfg.IssueID), + }) + if err != nil { + m.rollbackWorkspace(ctx, ws) // nothing seeded yet + return domain.Session{}, fmt.Errorf("spawn %s: runtime create: %w", id, err) + } + + if err := m.store.Seed(ctx, seedRecord(id, cfg, m.clock())); err != nil { + m.rollbackRuntime(ctx, handle) + m.rollbackWorkspace(ctx, ws) + return domain.Session{}, fmt.Errorf("spawn %s: seed: %w", id, err) + } + + outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle} + if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { + // The record is seeded but the runtime/workspace are about to be torn + // down. The store has no delete, so route the orphan to a terminal + // errored state (best effort) rather than strand a phantom "spawning". 
+ _ = m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: ports.KillError, Detail: "spawn completion failed"}) + m.rollbackRuntime(ctx, handle) + m.rollbackWorkspace(ctx, ws) + return domain.Session{}, fmt.Errorf("spawn %s: on spawn completed: %w", id, err) + } + + return m.Get(ctx, id) +} + +// rollback* are best-effort: the caller already has the originating failure, and +// there is no logger at this layer, so a secondary teardown error is dropped +// rather than masking the real cause. +func (m *Manager) rollbackWorkspace(ctx context.Context, ws ports.WorkspaceInfo) { + _ = m.workspace.Destroy(ctx, ws) +} + +func (m *Manager) rollbackRuntime(ctx context.Context, h ports.RuntimeHandle) { + _ = m.runtime.Destroy(ctx, h) +} + +// ---- Kill ---- + +// Kill records terminal intent with the LCM FIRST, then tears down the runtime +// and workspace. There is no separate Agent stop: the agent runs inside the +// runtime, so Runtime.Destroy stops it. The workspace teardown honors the +// worktree-remove safety — a refusal (path still registered after prune, so it +// may hold uncommitted work) surfaces as an error with WorkspaceFreed=false and +// is never forced. +func (m *Manager) Kill(ctx context.Context, id domain.SessionID, opts ports.KillOptions) (ports.KillResult, error) { + rec, ok, err := m.store.Get(ctx, id) + if err != nil { + return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w", id, err) + } + if !ok { + // Already gone: benign race, mirrors LCM.OnKillRequested's no-op. + return ports.KillResult{ID: id}, nil + } + meta, err := m.store.GetMetadata(ctx, id) + if err != nil { + return ports.KillResult{ID: id}, fmt.Errorf("kill %s: metadata: %w", id, err) + } + + // Validate the teardown handles BEFORE recording intent or touching an + // adapter: a corrupted/partially-seeded record with empty handles must never + // reach Destroy (empty path / handle could be an unsafe delete). 
+ rtHandle := runtimeHandle(meta) + wsInfo := workspaceInfo(rec, meta) + if !validRuntimeHandle(rtHandle) { + return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: runtime handle", id, ErrIncompleteTeardownMetadata) + } + if !validWorkspaceInfo(wsInfo) { + return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: workspace path", id, ErrIncompleteTeardownMetadata) + } + + if err := m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: opts.Reason, Detail: opts.Detail}); err != nil { + return ports.KillResult{ID: id}, fmt.Errorf("kill %s: on kill requested: %w", id, err) + } + if err := m.runtime.Destroy(ctx, rtHandle); err != nil { + return ports.KillResult{ID: id}, fmt.Errorf("kill %s: runtime destroy: %w", id, err) + } + if err := m.workspace.Destroy(ctx, wsInfo); err != nil { + return ports.KillResult{ID: id, WorkspaceFreed: false}, fmt.Errorf("kill %s: workspace destroy: %w", id, err) + } + return ports.KillResult{ID: id, WorkspaceFreed: true}, nil +} + +// ---- read-model ---- + +// List builds the read-model for a project: stored records with the display +// status derived on read. The SM is the single producer of that status. +func (m *Manager) List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) { + recs, err := m.store.List(ctx, project) + if err != nil { + return nil, fmt.Errorf("list %s: %w", project, err) + } + out := make([]domain.Session, 0, len(recs)) + for _, rec := range recs { + out = append(out, toSession(rec)) + } + return out, nil +} + +func (m *Manager) Get(ctx context.Context, id domain.SessionID) (domain.Session, error) { + rec, ok, err := m.store.Get(ctx, id) + if err != nil { + return domain.Session{}, fmt.Errorf("get %s: %w", id, err) + } + if !ok { + return domain.Session{}, fmt.Errorf("get %s: %w", id, ErrNotFound) + } + return toSession(rec), nil +} + +// ---- Send ---- + +// Send routes a message to the running agent through the AgentMessenger, which +// busy-detects and verifies delivery. 
+func (m *Manager) Send(ctx context.Context, id domain.SessionID, message string) error { + if err := m.messenger.Send(ctx, id, message); err != nil { + return fmt.Errorf("send %s: %w", id, err) + } + return nil +} + +// ---- Restore ---- + +// Restore relaunches a previously torn-down session in its workspace. The +// fallible I/O (workspace restore + runtime create) runs first so a failure +// touches no canonical state and never destroys the worktree (it may hold the +// agent's prior work). Only once the runtime is up do we reopen the lifecycle: +// resetting a terminal session is an explicit mutation (the SM's authority; the +// LCM's observe path would never resurrect a terminal session), and the PR axis +// is cleared. OnSpawnCompleted then flips the runtime to alive. +func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) { + rec, ok, err := m.store.Get(ctx, id) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, err) + } + if !ok { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound) + } + // Only a torn-down session may be restored. Reopening a live one would spawn a + // duplicate runtime/workspace for the same id and reset its lifecycle. + if !isTerminalSession(rec.Lifecycle.Session.State) { + return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable) + } + meta, err := m.store.GetMetadata(ctx, id) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: metadata: %w", id, err) + } + + // Resume is only possible with the agent's captured session id. Without it, + // GetRestoreCommand would produce an ambiguous "resume nothing" launch, and + // we have no stored prompt to fall back to a fresh launch — so fail early, + // before any I/O. 
+ agentSessionID := meta[lifecycle.MetaAgentSessionID] + if agentSessionID == "" { + return domain.Session{}, fmt.Errorf("restore %s: missing agent session id (cannot resume)", id) + } + + ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{ + ProjectID: rec.ProjectID, + SessionID: id, + Branch: meta[lifecycle.MetaBranch], + }) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: workspace restore: %w", id, err) + } + + agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path} + handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{ + SessionID: id, + WorkspacePath: ws.Path, + LaunchCommand: m.agent.GetRestoreCommand(agentSessionID), + Env: spawnEnv(m.agent.GetEnvironment(agentCfg), id, rec.ProjectID, rec.IssueID), + }) + if err != nil { + return domain.Session{}, fmt.Errorf("restore %s: runtime create: %w", id, err) + } + + // Past this point the runtime is live: a failure must tear it back down (but + // never the workspace, which holds the agent's prior work) so we don't strand + // a process while parking the session in a terminal lifecycle. + reopen := ports.LifecyclePatch{ + Session: &domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested}, + PR: &domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonClearedOnRestore}, + } + if err := m.store.PatchLifecycle(ctx, id, reopen); err != nil { + m.rollbackRuntime(ctx, handle) + return domain.Session{}, fmt.Errorf("restore %s: reopen: %w", id, err) + } + + outcome := ports.SpawnOutcome{ + Branch: ws.Branch, + WorkspacePath: ws.Path, + RuntimeHandle: handle, + AgentSessionID: agentSessionID, + } + if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil { + m.rollbackRuntime(ctx, handle) + return domain.Session{}, fmt.Errorf("restore %s: on spawn completed: %w", id, err) + } + return m.Get(ctx, id) +} + +// ---- Cleanup ---- + +// Cleanup reclaims the workspaces of terminal sessions in a project. 
A workspace +// whose teardown is refused by the worktree-remove safety (uncommitted work) is +// skipped, never forced. Runtime teardown is best-effort (a terminal session's +// runtime is usually already gone); the workspace result decides cleaned/skipped. +func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) (ports.CleanupResult, error) { + recs, err := m.store.List(ctx, project) + if err != nil { + return ports.CleanupResult{}, fmt.Errorf("cleanup %s: %w", project, err) + } + var res ports.CleanupResult + for _, rec := range recs { + if !isTerminalSession(rec.Lifecycle.Session.State) { + continue + } + meta, err := m.store.GetMetadata(ctx, rec.ID) + if err != nil { + return res, fmt.Errorf("cleanup %s: metadata %s: %w", project, rec.ID, err) + } + wsInfo := workspaceInfo(rec, meta) + if !validWorkspaceInfo(wsInfo) { + // No workspace path to reclaim — skip rather than hand empty args to a + // real adapter's Destroy (an unsafe delete). + res.Skipped = append(res.Skipped, rec.ID) + continue + } + if rtHandle := runtimeHandle(meta); validRuntimeHandle(rtHandle) { + _ = m.runtime.Destroy(ctx, rtHandle) // best effort; usually already gone + } + if err := m.workspace.Destroy(ctx, wsInfo); err != nil { + res.Skipped = append(res.Skipped, rec.ID) + continue + } + res.Cleaned = append(res.Cleaned, rec.ID) + } + return res, nil +} + +// ---- helpers ---- + +func toSession(rec domain.SessionRecord) domain.Session { + return domain.Session{SessionRecord: rec, Status: domain.DeriveLegacyStatus(rec.Lifecycle)} +} + +func isTerminalSession(s domain.SessionState) bool { + return s == domain.SessionDone || s == domain.SessionTerminated +} + +// buildPrompt assembles the spawn prompt from the explicit config only; the full +// 3-layer assembly (base protocol + config-derived + user rules) lands later. 
+func buildPrompt(cfg ports.SpawnConfig) string { + switch { + case cfg.AgentRules == "": + return cfg.Prompt + case cfg.Prompt == "": + return cfg.AgentRules + default: + return cfg.Prompt + "\n\n" + cfg.AgentRules + } +} + +// spawnEnv overlays the AO_* identity vars onto the agent's environment without +// mutating the map the agent returned. +func spawnEnv(base map[string]string, id domain.SessionID, project domain.ProjectID, issue domain.IssueID) map[string]string { + env := make(map[string]string, len(base)+3) + for k, v := range base { + env[k] = v + } + env[EnvSessionID] = string(id) + env[EnvProjectID] = string(project) + env[EnvIssueID] = string(issue) + return env +} + +func seedRecord(id domain.SessionID, cfg ports.SpawnConfig, now time.Time) domain.SessionRecord { + return domain.SessionRecord{ + ID: id, + ProjectID: cfg.ProjectID, + IssueID: cfg.IssueID, + Kind: cfg.Kind, + CreatedAt: now, + UpdatedAt: now, + Lifecycle: domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested}, + Runtime: domain.RuntimeSubstate{State: domain.RuntimeUnknown, Reason: domain.RuntimeReasonSpawnIncomplete}, + PR: domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated}, + }, + } +} + +// runtimeHandle / workspaceInfo reconstruct teardown handles from the metadata +// the LCM persisted in OnSpawnCompleted (the metadata-key contract is shared +// with the lifecycle package). 
+func runtimeHandle(meta map[string]string) ports.RuntimeHandle { + return ports.RuntimeHandle{ + ID: meta[lifecycle.MetaRuntimeHandleID], + RuntimeName: meta[lifecycle.MetaRuntimeName], + } +} + +func workspaceInfo(rec domain.SessionRecord, meta map[string]string) ports.WorkspaceInfo { + return ports.WorkspaceInfo{ + Path: meta[lifecycle.MetaWorkspacePath], + Branch: meta[lifecycle.MetaBranch], + SessionID: rec.ID, + ProjectID: rec.ProjectID, + } +} + +// validRuntimeHandle reports whether the handle identifies a runtime to destroy. +// An adapter needs the handle id to target the right process; an empty handle +// would be ambiguous, so we refuse to call Destroy with one. +func validRuntimeHandle(h ports.RuntimeHandle) bool { + return h.ID != "" +} + +// validWorkspaceInfo reports whether there is a concrete path to reclaim. An +// empty path handed to a worktree-remove could resolve to an unsafe target. +func validWorkspaceInfo(w ports.WorkspaceInfo) bool { + return w.Path != "" +} + +func defaultNewID(cfg ports.SpawnConfig) domain.SessionID { + base := string(cfg.IssueID) + if base == "" { + base = string(cfg.Kind) + } + if base == "" { + base = "session" + } + return domain.SessionID(base + "-" + randHex(4)) +} + +func randHex(n int) string { + b := make([]byte, n) + if _, err := rand.Read(b); err != nil { + return strconv.FormatInt(time.Now().UnixNano(), 16) + } + return hex.EncodeToString(b) +} diff --git a/backend/internal/session/manager_test.go b/backend/internal/session/manager_test.go new file mode 100644 index 00000000..702a735e --- /dev/null +++ b/backend/internal/session/manager_test.go @@ -0,0 +1,559 @@ +package session + +import ( + "context" + "errors" + "testing" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" + "github.com/aoagents/agent-orchestrator/backend/internal/lifecycle" + "github.com/aoagents/agent-orchestrator/backend/internal/ports" +) + +const ( + testProject = domain.ProjectID("proj") + testIssue = 
domain.IssueID("42") +) + +func spawnCfg() ports.SpawnConfig { + return ports.SpawnConfig{ + ProjectID: testProject, + IssueID: testIssue, + Kind: domain.KindWorker, + Branch: "feat/42", + Prompt: "do the thing", + AgentRules: "be careful", + } +} + +func TestSpawn_HappyPath(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + + sess, err := h.sm.Spawn(ctx, spawnCfg()) + if err != nil { + t.Fatalf("spawn: %v", err) + } + + // Display status is derived (single producer) — a freshly spawned, not_started + // session shows as spawning. + if sess.Status != domain.StatusSpawning { + t.Errorf("status = %q, want %q", sess.Status, domain.StatusSpawning) + } + + // Record seeded with identity + initial lifecycle, then OnSpawnCompleted flipped + // the runtime axis to alive. + rec, ok, err := h.store.Get(ctx, "sess-1") + if err != nil || !ok { + t.Fatalf("get seeded record: ok=%v err=%v", ok, err) + } + if rec.ProjectID != testProject || rec.IssueID != testIssue || rec.Kind != domain.KindWorker { + t.Errorf("identity = %+v, want proj/42/worker", rec) + } + if !rec.CreatedAt.Equal(fixedTime) { + t.Errorf("createdAt = %v, want %v", rec.CreatedAt, fixedTime) + } + if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested { + t.Errorf("session substate = %+v, want not_started/spawn_requested", got) + } + if got := rec.Lifecycle.Runtime; got.State != domain.RuntimeAlive || got.Reason != domain.RuntimeReasonProcessRunning { + t.Errorf("runtime substate = %+v, want alive/process_running", got) + } + + // Pipeline order: workspace -> runtime -> (seed) -> LCM. + wantOrder := []string{"Workspace.Create", "Runtime.Create", "OnSpawnCompleted"} + if got := h.log.snapshot(); !equalStrings(got, wantOrder) { + t.Errorf("call order = %v, want %v", got, wantOrder) + } + + // Identity env wired onto the runtime config, layered over the agent's env. 
+ if len(h.runtime.created) != 1 { + t.Fatalf("runtime.created = %d, want 1", len(h.runtime.created)) + } + env := h.runtime.created[0].Env + for k, want := range map[string]string{ + EnvSessionID: "sess-1", + EnvProjectID: "proj", + EnvIssueID: "42", + "BASE": "1", + } { + if env[k] != want { + t.Errorf("env[%q] = %q, want %q", k, env[k], want) + } + } + + // Handles persisted to metadata for later teardown/restore. + meta, _ := h.store.GetMetadata(ctx, "sess-1") + for k, want := range map[string]string{ + lifecycle.MetaBranch: "feat/42", + lifecycle.MetaWorkspacePath: "/tmp/ws/sess-1", + lifecycle.MetaRuntimeHandleID: "rt-sess-1", + lifecycle.MetaRuntimeName: "tmux", + } { + if meta[k] != want { + t.Errorf("meta[%q] = %q, want %q", k, meta[k], want) + } + } +} + +func TestSpawn_RuntimeCreateFailure_RollsBack(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + h.runtime.createErr = errors.New("boom") + + _, err := h.sm.Spawn(ctx, spawnCfg()) + if err == nil { + t.Fatal("spawn: want error, got nil") + } + + // No record seeded for a spawn that never completed. + if _, ok, _ := h.store.Get(ctx, "sess-1"); ok { + t.Error("record was seeded despite runtime-create failure") + } + // The already-created workspace was rolled back (eager rollback), since a + // late-seeded record means Cleanup could never find this orphan. + if len(h.workspace.destroyed) != 1 || h.workspace.destroyed[0].Path != "/tmp/ws/sess-1" { + t.Errorf("workspace.destroyed = %+v, want the created worktree", h.workspace.destroyed) + } + // LCM never told a spawn completed. 
+ if h.log.indexOf("OnSpawnCompleted") != -1 { + t.Error("OnSpawnCompleted should not fire on a failed spawn") + } +} + +func TestSpawn_OnSpawnCompletedFailure_RoutesOrphanToErrored(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + h.lcm.onSpawnErr = errors.New("lcm boom") + + _, err := h.sm.Spawn(ctx, spawnCfg()) + if err == nil { + t.Fatal("spawn: want error, got nil") + } + + // Runtime + workspace are torn down on the failure path. + if len(h.runtime.destroyed) != 1 { + t.Errorf("runtime.destroyed = %d, want 1", len(h.runtime.destroyed)) + } + if len(h.workspace.destroyed) != 1 { + t.Errorf("workspace.destroyed = %d, want 1", len(h.workspace.destroyed)) + } + // The record was already seeded and the store has no delete, so the orphan is + // routed to a terminal errored state (via OnKillRequested(KillError)) rather + // than stranded forever as "spawning". + rec, ok, _ := h.store.Get(ctx, "sess-1") + if !ok { + t.Fatal("seeded record vanished; expected it parked as errored") + } + if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonErrorInProcess { + t.Errorf("session substate = %+v, want terminated/error_in_process", got) + } + if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusErrored { + t.Errorf("status = %q, want errored", status) + } +} + +func TestKill_OrderingAndTerminalState(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { + t.Fatalf("spawn: %v", err) + } + + res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}) + if err != nil { + t.Fatalf("kill: %v", err) + } + if !res.WorkspaceFreed { + t.Error("WorkspaceFreed = false, want true") + } + + // Intent recorded with the LCM BEFORE any teardown, runtime before workspace. 
+ iKill := h.log.indexOf("OnKillRequested") + iRT := h.log.indexOf("Runtime.Destroy") + iWS := h.log.indexOf("Workspace.Destroy") + if !(iKill >= 0 && iKill < iRT && iRT < iWS) { + t.Errorf("kill order indices: OnKillRequested=%d Runtime.Destroy=%d Workspace.Destroy=%d (want ascending)", iKill, iRT, iWS) + } + + // Terminal canonical written by the LCM; display derives to killed. + rec, _, _ := h.store.Get(ctx, "sess-1") + if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonManuallyKilled { + t.Errorf("session substate = %+v, want terminated/manually_killed", got) + } + if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusKilled { + t.Errorf("status = %q, want killed", status) + } +} + +func TestKill_WorktreeRemoveRefusalSurfaced(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { + t.Fatalf("spawn: %v", err) + } + // The worktree path is still registered after prune (uncommitted work). + h.workspace.refuse["/tmp/ws/sess-1"] = true + + res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}) + if err == nil { + t.Fatal("kill: want refusal error, got nil") + } + if res.WorkspaceFreed { + t.Error("WorkspaceFreed = true, want false on refusal") + } + // The refusal must be honored — the path is never force-deleted. + if len(h.workspace.destroyed) != 0 { + t.Errorf("workspace.destroyed = %+v, want none (refused)", h.workspace.destroyed) + } + // Runtime still torn down and intent still recorded — only the worktree is spared. 
+ if h.log.indexOf("Runtime.Destroy") == -1 || h.log.indexOf("OnKillRequested") == -1 { + t.Error("runtime teardown / kill intent should still happen on a workspace refusal") + } +} + +func TestKill_IncompleteMetadata_RefusesTeardown(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + // A record with no teardown metadata (empty runtime handle + workspace path), + // e.g. a partially-seeded or corrupted record. + if err := h.store.Seed(ctx, domain.SessionRecord{ + ID: "sess-1", ProjectID: testProject, + Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), + }); err != nil { + t.Fatalf("seed: %v", err) + } + + if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); !errors.Is(err, ErrIncompleteTeardownMetadata) { + t.Fatalf("kill: err = %v, want ErrIncompleteTeardownMetadata", err) + } + // Nothing destroyed with empty args, and no intent recorded. + if len(h.runtime.destroyed) != 0 || len(h.workspace.destroyed) != 0 { + t.Errorf("teardown ran despite incomplete metadata: rt=%v ws=%v", h.runtime.destroyed, h.workspace.destroyed) + } + if h.log.indexOf("OnKillRequested") != -1 { + t.Error("kill intent recorded despite incomplete metadata") + } +} + +func TestCleanup_IncompleteMetadata_Skipped(t *testing.T) { + h := newHarness("unused") + ctx := context.Background() + // Terminal session but no workspace path persisted — must be skipped, never + // handed to Destroy with an empty path. 
+ if err := h.store.Seed(ctx, domain.SessionRecord{ + ID: "orphan-1", ProjectID: testProject, + Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), + }); err != nil { + t.Fatalf("seed: %v", err) + } + + res, err := h.sm.Cleanup(ctx, testProject) + if err != nil { + t.Fatalf("cleanup: %v", err) + } + if !equalIDSet(res.Skipped, []domain.SessionID{"orphan-1"}) { + t.Errorf("skipped = %v, want [orphan-1]", res.Skipped) + } + if len(res.Cleaned) != 0 { + t.Errorf("cleaned = %v, want none", res.Cleaned) + } + if len(h.workspace.destroyed) != 0 { + t.Errorf("workspace.destroyed = %v, want none (empty path must not reach Destroy)", h.workspace.destroyed) + } +} + +func TestRestore_LiveSession_Rejected(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { + t.Fatalf("spawn: %v", err) + } + // The session is live (never torn down). Capture an agent id so the only thing + // blocking restore is the non-terminal lifecycle, not missing metadata. + if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { + t.Fatalf("patch metadata: %v", err) + } + createdBefore := len(h.runtime.created) + restoresBefore := len(h.workspace.restoredID) + + if _, err := h.sm.Restore(ctx, "sess-1"); !errors.Is(err, ErrNotRestorable) { + t.Fatalf("restore: err = %v, want ErrNotRestorable", err) + } + // No second runtime/workspace spun up for the still-live session. 
+ if len(h.runtime.created) != createdBefore { + t.Error("runtime created for a live-session restore") + } + if len(h.workspace.restoredID) != restoresBefore { + t.Error("workspace restored for a live-session restore") + } +} + +func TestListAndGet_DeriveStatus(t *testing.T) { + cases := []struct { + name string + lc domain.CanonicalSessionLifecycle + want domain.SessionStatus + }{ + {"not_started", lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.PRNone, ""), domain.StatusSpawning}, + {"working", lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), domain.StatusWorking}, + {"idle", lc(domain.SessionIdle, domain.ReasonResearchComplete, domain.PRNone, ""), domain.StatusIdle}, + {"needs_input", lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.PRNone, ""), domain.StatusNeedsInput}, + {"pr_ci_failed", lc(domain.SessionWorking, domain.ReasonFixingCI, domain.PROpen, domain.PRReasonCIFailing), domain.StatusCIFailed}, + {"pr_merged", lc(domain.SessionIdle, domain.ReasonMergedWaitingDecision, domain.PRMerged, domain.PRReasonMerged), domain.StatusMerged}, + {"killed", lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), domain.StatusKilled}, + } + + h := newHarness("unused") + ctx := context.Background() + for _, c := range cases { + if err := h.store.Seed(ctx, domain.SessionRecord{ID: domain.SessionID(c.name), ProjectID: testProject, Lifecycle: c.lc}); err != nil { + t.Fatalf("seed %s: %v", c.name, err) + } + } + + // Get derives per-record. + for _, c := range cases { + got, err := h.sm.Get(ctx, domain.SessionID(c.name)) + if err != nil { + t.Fatalf("get %s: %v", c.name, err) + } + if got.Status != c.want { + t.Errorf("get %s: status = %q, want %q", c.name, got.Status, c.want) + } + } + + // List derives for every record in the project. 
+ got, err := h.sm.List(ctx, testProject) + if err != nil { + t.Fatalf("list: %v", err) + } + if len(got) != len(cases) { + t.Fatalf("list len = %d, want %d", len(got), len(cases)) + } + byID := map[domain.SessionID]domain.SessionStatus{} + for _, s := range got { + byID[s.ID] = s.Status + } + for _, c := range cases { + if byID[domain.SessionID(c.name)] != c.want { + t.Errorf("list %s: status = %q, want %q", c.name, byID[domain.SessionID(c.name)], c.want) + } + } +} + +func TestGet_NotFound(t *testing.T) { + h := newHarness("sess-1") + if _, err := h.sm.Get(context.Background(), "missing"); !errors.Is(err, ErrNotFound) { + t.Errorf("get missing: err = %v, want ErrNotFound", err) + } +} + +func TestSend_RoutesToMessenger(t *testing.T) { + h := newHarness("sess-1") + if err := h.sm.Send(context.Background(), "sess-1", "hello"); err != nil { + t.Fatalf("send: %v", err) + } + if len(h.messenger.sent) != 1 || h.messenger.sent[0].ID != "sess-1" || h.messenger.sent[0].Message != "hello" { + t.Errorf("messenger.sent = %+v, want one {sess-1, hello}", h.messenger.sent) + } +} + +func TestRestore_RelaunchesWithResumeCommand(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { + t.Fatalf("spawn: %v", err) + } + if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { + t.Fatalf("kill: %v", err) + } + // The agent's resume id is captured in metadata (here set explicitly). + if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { + t.Fatalf("patch metadata: %v", err) + } + + sess, err := h.sm.Restore(ctx, "sess-1") + if err != nil { + t.Fatalf("restore: %v", err) + } + + // Reopened: terminal session reset to a fresh spawn, PR cleared, runtime alive. 
+ if sess.Status != domain.StatusSpawning { + t.Errorf("status = %q, want spawning", sess.Status) + } + rec, _, _ := h.store.Get(ctx, "sess-1") + if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested { + t.Errorf("session substate = %+v, want not_started/spawn_requested", got) + } + if got := rec.Lifecycle.PR; got.State != domain.PRNone || got.Reason != domain.PRReasonClearedOnRestore { + t.Errorf("pr substate = %+v, want none/cleared_on_restore", got) + } + if rec.Lifecycle.Runtime.State != domain.RuntimeAlive { + t.Errorf("runtime state = %q, want alive", rec.Lifecycle.Runtime.State) + } + + // Relaunched via the agent's resume command (created[0] is the original spawn). + if len(h.runtime.created) != 2 { + t.Fatalf("runtime.created = %d, want 2 (spawn + restore)", len(h.runtime.created)) + } + if got := h.runtime.created[1].LaunchCommand; got != "claude --resume agent-xyz" { + t.Errorf("restore launch command = %q, want resume", got) + } + if h.log.indexOf("Workspace.Restore") == -1 { + t.Error("Workspace.Restore was not called") + } +} + +func TestRestore_MissingAgentSessionID_Errors(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { + t.Fatalf("spawn: %v", err) + } + if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { + t.Fatalf("kill: %v", err) + } + // No agent session id was ever captured (spawn leaves it empty) — resume is + // impossible, so Restore must fail early without touching workspace/runtime. 
+ beforeRestores := len(h.workspace.restoredID) + beforeCreated := len(h.runtime.created) + + if _, err := h.sm.Restore(ctx, "sess-1"); err == nil { + t.Fatal("restore: want error for missing agent session id, got nil") + } + if len(h.workspace.restoredID) != beforeRestores { + t.Error("workspace was touched despite a doomed restore") + } + if len(h.runtime.created) != beforeCreated { + t.Error("runtime was created despite a doomed restore") + } + // The session stays terminal — a failed restore does not reopen it. + rec, _, _ := h.store.Get(ctx, "sess-1") + if rec.Lifecycle.Session.State != domain.SessionTerminated { + t.Errorf("session state = %q, want terminated (unchanged)", rec.Lifecycle.Session.State) + } +} + +func TestRestore_OnSpawnCompletedFailure_RollsBackRuntime(t *testing.T) { + h := newHarness("sess-1") + ctx := context.Background() + if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil { + t.Fatalf("spawn: %v", err) + } + if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil { + t.Fatalf("kill: %v", err) + } + if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil { + t.Fatalf("patch metadata: %v", err) + } + + // Fail the post-create LCM call; capture teardown counts just before restore. + h.lcm.onSpawnErr = errors.New("lcm boom") + destroyedBefore := len(h.runtime.destroyed) + wsDestroyedBefore := len(h.workspace.destroyed) + + if _, err := h.sm.Restore(ctx, "sess-1"); err == nil { + t.Fatal("restore: want error, got nil") + } + + // The runtime created during restore is torn back down so no process is + // stranded; the workspace is left intact (it holds the agent's prior work). 
+ if len(h.runtime.destroyed) != destroyedBefore+1 { + t.Errorf("runtime.destroyed grew by %d, want 1 (restore rollback)", len(h.runtime.destroyed)-destroyedBefore) + } + if len(h.workspace.destroyed) != wsDestroyedBefore { + t.Errorf("workspace was destroyed on restore rollback; it must be preserved") + } +} + +func TestCleanup_SkipsUncommittedWork(t *testing.T) { + h := newHarness("unused") + ctx := context.Background() + + // Two terminal sessions (reclaimable) + one working session (must be ignored). + seedTerminal(t, h, "done-1", "/tmp/ws/done-1") + seedTerminal(t, h, "dirty-1", "/tmp/ws/dirty-1") + if err := h.store.Seed(ctx, domain.SessionRecord{ + ID: "live-1", ProjectID: testProject, + Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), + }); err != nil { + t.Fatalf("seed live: %v", err) + } + // dirty-1's worktree still holds uncommitted work — Destroy refuses it. + h.workspace.refuse["/tmp/ws/dirty-1"] = true + + res, err := h.sm.Cleanup(ctx, testProject) + if err != nil { + t.Fatalf("cleanup: %v", err) + } + + if !equalIDSet(res.Cleaned, []domain.SessionID{"done-1"}) { + t.Errorf("cleaned = %v, want [done-1]", res.Cleaned) + } + if !equalIDSet(res.Skipped, []domain.SessionID{"dirty-1"}) { + t.Errorf("skipped = %v, want [dirty-1]", res.Skipped) + } + // The live session was never a candidate. 
+ if contains(res.Cleaned, "live-1") || contains(res.Skipped, "live-1") { + t.Error("non-terminal session must not be cleaned or skipped") + } +} + +// ---- test helpers ---- + +func lc(s domain.SessionState, r domain.SessionReason, prs domain.PRState, prr domain.PRReason) domain.CanonicalSessionLifecycle { + return domain.CanonicalSessionLifecycle{ + Version: domain.LifecycleVersion, + Session: domain.SessionSubstate{State: s, Reason: r}, + PR: domain.PRSubstate{State: prs, Reason: prr}, + Runtime: domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}, + } +} + +func seedTerminal(t *testing.T, h *harness, id domain.SessionID, wsPath string) { + t.Helper() + ctx := context.Background() + if err := h.store.Seed(ctx, domain.SessionRecord{ + ID: id, ProjectID: testProject, + Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), + }); err != nil { + t.Fatalf("seed %s: %v", id, err) + } + if err := h.store.PatchMetadata(ctx, id, map[string]string{lifecycle.MetaWorkspacePath: wsPath}); err != nil { + t.Fatalf("patch metadata %s: %v", id, err) + } +} + +func equalStrings(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +func contains(ids []domain.SessionID, id domain.SessionID) bool { + for _, x := range ids { + if x == id { + return true + } + } + return false +} + +func equalIDSet(got, want []domain.SessionID) bool { + if len(got) != len(want) { + return false + } + for _, w := range want { + if !contains(got, w) { + return false + } + } + return true +} diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..f42f222f --- /dev/null +++ b/docs/README.md @@ -0,0 +1,34 @@ +# agent-orchestrator (rewrite) — docs + +The agent-orchestrator is being rebuilt as a long-running **Go backend daemon** +(`backend/`) plus an **Electron + TypeScript frontend** (`frontend/`). 
The +backend supervises a fleet of coding-agent sessions and keeps one true status +per session. + +This folder documents the **Lifecycle Manager (LCM) + Session Manager (SM) +lane** — the deterministic core of the backend that is now implemented (behind +fakes) on the `feat/lcm-sm-contracts` integration branch. + +## Start here + +| Doc | What it covers | +|-----|----------------| +| [architecture.md](architecture.md) | How the lane works: the OBSERVE→DECIDE→ACT loop, the canonical state model, the package layout, every component, and the load-bearing invariants. Read this first. | +| [status.md](status.md) | What's done (PR by PR), what's left, the integration to-dos, the open cross-lane contract questions, and how to build/test. | + +## The one-paragraph mental model + +The backend is a **stateless supervisor over external ground truth**: git/GitHub +own PR/CI/review truth, the agent's own files own its activity, and the backend +owns no agent state. Its whole job is, per session: **OBSERVE** raw facts → +**DECIDE** one canonical status via pure, deterministic functions → **ACT** +(persist + fire reactions). The LCM is that reducer; the SM is the +explicit-mutation plumbing (spawn/kill/restore/cleanup) that feeds it. + +## Where this lane fits + +Other lanes (built by other people, in parallel) provide the real adapters this +lane depends on through narrow interfaces: the **persistence layer + CDC**, the +**SCM poller**, the **runtime/agent/workspace plugins**, the **backend API + +OpenAPI**, and the **frontend store**. See [status.md](status.md#integration) +for the hand-off points. diff --git a/docs/architecture.md b/docs/architecture.md new file mode 100644 index 00000000..9673142c --- /dev/null +++ b/docs/architecture.md @@ -0,0 +1,187 @@ +# LCM + Session Manager — architecture + +This is the deterministic core of the backend daemon. It supervises agent +sessions and keeps exactly one true status per session. + +## 1. 
Mental model: OBSERVE → DECIDE → ACT + +The backend owns no agent state. git/GitHub own PR/CI/review truth; the agent's +own files own its activity. The job, per session, is one loop: + +``` +OBSERVE → DECIDE → ACT +(impure, external) (pure, total) (impure) +raw facts one canonical status persist + react +``` + +In the rewrite the **OBSERVE** step lives *outside* the LCM (separate owners), +and the LCM is a **synchronous reducer** invoked with facts: + +``` +SCM poller ─ ApplySCMObservation ──┐ +reaper ─ ApplyRuntimeObservation┤ +activity hooks ─ ApplyActivitySignal ───┼─▶ LCM: load canonical +Session Mgr ─ OnSpawnCompleted ──────┘ → pure DECIDE + ─ OnKillRequested → diff → persist (merge-patch) +reaper tick ─ TickEscalations → if transition: react (ACT) +``` + +The LCM **never polls**. The reaper (a timer, owned elsewhere) drives liveness +sampling and duration-based escalation by calling in. + +## 2. Canonical state model — the crown jewel + +The **only** thing persisted per session is `CanonicalSessionLifecycle` +(`backend/internal/domain/lifecycle.go`). The single-word display status is +**derived on read and never stored** — this is the most important invariant; it +prevents canonical truth and display from drifting. + +``` +CanonicalSessionLifecycle + Version schema version of the record shape + Revision monotonic write counter (optimistic-concurrency token) + Session (state, reason) not_started/working/idle/needs_input/stuck/detecting/done/terminated + PR (state, reason) none/open/merged/closed + Runtime (state, reason) unknown/alive/exited/missing/probe_failed + Activity last-known agent activity (+ timestamp, source) ← decider input + Detecting anti-flap quarantine memory (nil unless quarantined) ← decider input +``` + +`DeriveLegacyStatus` (`domain/status.go`) is the **sole producer** of the +display `SessionStatus`.
Precedence: terminal/hard session states map directly +(they outrank PR facts) → a merged PR wins → an open PR maps by reason → else the +soft session state. So an idle worker with a CI-failing open PR displays +`ci_failed`, but a `needs_input` session shows `needs_input` regardless of the PR. + +`Session` (`domain/session.go`) is the read-model: a `SessionRecord` +(persistence shape, identity + lifecycle + metadata) plus the derived `Status`. +The **Session Manager is the single producer of `Status`** — it attaches it on +read; the store and API never recompute or persist it. + +## 3. Package layout (`backend/internal/`) + +``` +domain/ the vocabulary (imports only the std lib → no cycles) + lifecycle.go CanonicalSessionLifecycle + all sub-states/enums + status.go SessionStatus + DeriveLegacyStatus (sole display producer) + session.go SessionRecord (persisted) + Session (read-model) + id types + decide/ the PURE core — total, deterministic, zero I/O + types.go LifecycleDecision + Probe/OpenPR/Detecting inputs + tuning consts + decide.go the deciders + the anti-flap quarantine + HashEvidence +ports/ the boundaries (interfaces + DTOs) + inbound.go LifecycleManager, SessionManager (we implement) + outbound.go LifecycleStore, Notifier, AgentMessenger, Runtime/Agent/Workspace + facts.go SCMFacts, RuntimeFacts, ActivitySignal, SpawnOutcome, KillReason +lifecycle/ the LCM implementation (DECIDE + ACT) + manager.go the Apply* pipeline, per-session lock, patch diffing + decide_bridge.go fact→decide-input translation + the composition rules + reactions.go the reaction table + escalation engine + TickEscalations +session/ the SM implementation (explicit mutations) + manager.go Spawn/Kill/Restore/Cleanup/List/Get/Send + rollback +``` + +`domain` + `ports` are the committed, stabilized **integration boundary**. +Everything else implements behind it. + +## 4. 
The pure DECIDE core (`domain/decide`) + +Total, deterministic, side-effect-free functions — the highest-value test +surface (table-tested to 100%). Key ones: + +- `ResolveProbeDecision` — runtime/process liveness. An explicit kill + short-circuits to terminal; a **failed probe is never read as death** (routes + to `detecting`), as does any probe disagreement; only runtime-dead + + process-dead + no-recent-activity reaches `killed`. +- `ResolveOpenPRDecision` — the PR ladder: `ci_failing` → `changes_requested` → + `mergeable` → `approved` → `review_pending` → idle-beyond → else `pr_open`. +- `ResolveTerminalPRStateDecision` — merged → `merged` (park idle awaiting a + human decision); closed → `idle`. +- `CreateDetectingDecision` — the **anti-flap quarantine**. Counts attempts and + hashes the *timestamp-stripped* evidence; escalates to `stuck` only after 3 + consecutive unchanged-evidence ticks **or** 5 minutes since first entering + detecting (`StartedAt` is preserved across the whole episode). Changing + evidence resets the counter. + +## 5. The LCM (`lifecycle`) + +Implements `ports.LifecycleManager`. Every `Apply*`/`On*` entrypoint runs the +same pipeline (`manager.go`): + +``` +withLock(session): ← per-session serialization + load canonical → decideFn (build sparse patch) → if changed: persist → load after +return transition (before, after) +``` +then, **after the lock releases**, `react()` fires the mapped reaction. + +- **Per-session serialization** — `keyedMutex` hands out one lock per session id + (parallel across sessions, serial within one). Entries are reference-counted + and evicted when the last holder releases, so the map stays bounded. +- **Composition rules** (`decide_bridge.go`) — two observers must not fight over + the session axis. Liveness (runtime probes) owns the runtime + death/detecting + axis; activity owns working/idle/waiting. `isLivenessOwned` decides when a + healthy probe may *recover* a state (e.g. `detecting → working`) vs. 
when it + must not clobber an activity-owned `needs_input`/`blocked`. A high-confidence + activity signal may resolve a `detecting` session; an open PR writes only the + PR axis and lets `DeriveLegacyStatus` surface it. +- **Detecting-memory lifecycle** — a decision with `Detecting == nil` clears the + persisted quarantine memory (`LifecyclePatch.ClearDetecting`) so a stale prior + can't leak into a later episode. +- **ACT — reactions + escalation** (`reactions.go`) — on a genuine status + transition, `react()` maps it to a reaction (`send-to-agent` / `notify`; + `auto-merge` exists but is off by default) and dispatches it. A + per-`(session,reaction)` escalation tracker counts attempts; it escalates + (notifies a human and silences further auto-dispatch) when a numeric cap or a + duration is exceeded. The `ci-failed` budget is persistent across CI + oscillation within an open PR and re-arms on genuine recovery. `TickEscalations` + (called by the reaper) fires the duration-based escalations the synchronous + LCM can't wake itself for; it notifies outside the lock. + +## 6. The Session Manager (`session`) + +Implements `ports.SessionManager` — the explicit-mutation plumbing. It never +derives/observes lifecycle state; it routes outcomes to the LCM. + +- **Spawn** — `Workspace.Create` → build prompt → `Runtime.Create` (env + `AO_SESSION_ID`/`AO_PROJECT_ID`/`AO_ISSUE_ID`) → **seed** the initial record + (`not_started`/`spawn_requested`) via the store → `LCM.OnSpawnCompleted`. + Eager rollback unwinds prior steps on failure; an `OnSpawnCompleted` failure + routes the seeded orphan to terminal-errored (the store has no delete; a later + `Cleanup` reclaims it). +- **Kill** — `LCM.OnKillRequested` → `Runtime.Destroy` → `Workspace.Destroy`, + honoring the **worktree-remove safety**: after `git worktree prune`, a + still-registered path is never `rm -rf`'d (it may hold the agent's uncommitted + work) — the refusal is surfaced, not forced. 
+- **Restore** — reopen via `PatchLifecycle` (not re-seed): session → + `not_started`, PR → `cleared_on_restore`; relaunch with the agent's resume + command; runtime is rolled back on a post-create failure. +- **List/Get** — read records and attach the derived `Status`. **Send** — via + `AgentMessenger`. **Cleanup** — tear down terminal/stale sessions, skipping + paths with uncommitted work. + +## 7. Load-bearing invariants + +1. **Persist canonical; derive display.** Never store the display status. +2. **One authority for death.** Only the DECIDE pipeline (via `detecting`) writes + inferred terminal states; the SM's explicit-kill path goes through + `OnKillRequested`. Everything else that notices a dead runtime persists + `detecting`, never `terminated`. +3. **Failed probe ≠ dead.** Timed-out/errored probes route to `detecting`. +4. **Evidence-hash debounce** prevents flapping signals from terminating live + work; the 5-minute cap is a whole-episode wall-clock safety net. +5. **PR facts dominate** the soft session states once a PR exists. +6. **Merge-patch persistence** — writes touch only changed keys; the store is the + single disk writer (atomic write + lock + CDC). +7. **Sticky activity states** (`waiting_input`/`blocked`) do not decay by clock. +8. **Worktree-remove safety** on teardown. + +## 8. Concurrency & testing + +- Within a session, the per-session lock serializes the load→decide→persist + read-modify-write. `react()` runs *outside* the lock (so a busy-waiting + send-to-agent never holds the session mutex) — see `status.md` for the + integration-time follow-up this implies. +- Tests use **in-memory fakes** for every outbound port, so the LCM and SM are + fully testable with no real adapters. The SM tests drive the **real** + `lifecycle.Manager` for spawn/kill round-trips, so the SM↔LCM contract is + genuinely exercised. The `decide` package is table-tested in isolation. 
diff --git a/docs/status.md b/docs/status.md new file mode 100644 index 00000000..9bb79cdb --- /dev/null +++ b/docs/status.md @@ -0,0 +1,98 @@ +# LCM + Session Manager — status & roadmap + +Where the lane stands, what's left, and where to plug in. + +## Branch model + +`feat/lcm-sm-contracts` is the **lane integration branch**: each sub-PR below +branched off it and merged **into** it. The whole lane lands on `main` as one +unit once it's ready. Sub-PRs were reviewed against the integration branch; +the eventual lane→main merge is a single cumulative review. + +## Done — implementation complete (behind fakes) + +| Area | What landed | PR | +|------|-------------|----| +| Skeleton | `backend/` (Go) + `frontend/` (Electron/TS) | #1 (on `main`) | +| Contracts + CI | `domain/` + `ports/`; Go + gitleaks workflows | #2 | +| Pure DECIDE core | the deciders + anti-flap quarantine + exhaustive truth-table tests | #4 | +| LCM — pipeline | `Apply*` pipeline, per-session serialization, store integration, composition rules, detecting-memory lifecycle | #5 | +| LCM — reactions | reaction table + escalation engine + real `TickEscalations` | #6 | +| Session Manager | spawn / kill / restore / cleanup / list, eager rollback, worktree-remove safety | #7 | + +`gofmt` / `go build` / `go vet` / `go test -race` all green across `domain`, +`domain/decide`, `lifecycle`, and `session`. The `decide` core is at 100% +statement coverage; the impl packages cover the load-bearing logic including the +error/rollback paths. + +### Build & test + +``` +cd backend +gofmt -l . # must print nothing +go build ./... +go vet ./... +go test -race ./... +go test -cover ./... +``` + +## Not done — the integration phase + +Everything above runs against **in-memory fakes**. Making it a live system means +swapping fakes for real adapters (built by other lanes) behind the existing +ports, and resolving the carried-forward items below. 
+ +### Carried-forward items (must be addressed as real adapters land) + +- **`react()` out-of-lock dispatch.** Reactions fire after the per-session lock + releases (deliberate, so a busy-waiting send-to-agent doesn't hold the mutex). + Under a live daemon with concurrent observers this can dispatch on a stale + snapshot / out of order. Give `react()` a per-session ordering (a small react + queue) or re-check the triggering state before dispatching. Documented in + `lifecycle/reactions.go`. +- **`ExpectedRevision` optimistic-concurrency is unused.** The in-process + per-session mutex covers a single daemon. Multi-writer or CDC-driven setups + must use the `LifecyclePatch.ExpectedRevision` CAS the contract already exposes. +- **Store `Seed` + `Get` need a real implementation.** The Session Manager added + two record-with-identity methods to `LifecycleStore`; the real persistence + layer must implement them (create-with-identity that rejects an existing id; + full-record read by id). Documented in `ports/outbound.go`. + +### Real adapters needed (other lanes) + +| Port | Real adapter | Owning lane | +|------|--------------|-------------| +| `LifecycleStore` | persistence layer (flat-file/KV + atomic write + lock + CDC) | persistence | +| `SCMFacts` producer | SCM poller (batch PR/CI/review enrichment) | SCM | +| `Runtime` / `Agent` / `Workspace` | tmux runtime, claude-code/codex agent, git-worktree workspace | coding-agents | +| `Notifier` | desktop/Slack notifier | notifications | +| `AgentMessenger` | tmux inject with busy-detect + delivery verify | coding-agents | +| `SessionManager` consumer | backend API (routes/controllers) + OpenAPI | API | + +### Open cross-lane contract questions + +- **SCM facts** — does `SCMFacts` match what the poller can cheaply produce + (batch enrichment, CI log tail as a pointer)? +- **Persistence** — is `LifecycleStore` + `LifecyclePatch` the right boundary? + Per-session lock vs. the `ExpectedRevision` CAS? 
+- **API** — is the `SessionManager` interface + the `Session` read-model + OpenAPI-friendly? + +### Land the lane → `main` + +A final cumulative review of `feat/lcm-sm-contracts` vs. `main`, then merge the +complete lane as one unit. + +## Where to plug in (for someone picking this up) + +- **Implementing a real adapter?** Write it to satisfy the matching interface in + `ports/`, then construct the `lifecycle.Manager` / `session.Manager` with it in + place of the fake. Nothing in `domain`/`lifecycle`/`session` should need to + change. +- **Changing decision behavior?** It lives in `domain/decide` (pure) — add a + truth-table case first; nothing there does I/O. +- **Adding a reaction?** Extend the table in `lifecycle/reactions.go` and map the + triggering status in `reactionEventFor`. +- **Don't** persist the display status, conclude death outside the DECIDE + pipeline, or `rm -rf` a still-registered worktree — see the invariants in + [architecture.md](architecture.md#7-load-bearing-invariants).