diff --git a/backend/internal/domain/decide/decide.go b/backend/internal/domain/decide/decide.go index e92ed694..e7f2c445 100644 --- a/backend/internal/domain/decide/decide.go +++ b/backend/internal/domain/decide/decide.go @@ -2,13 +2,14 @@ // collapses observed facts (plus the prior detecting/activity memory) into one // LifecycleDecision. Every function here must remain side-effect free so the // whole status truth-table can be tested in isolation. -// -// NOTE: function bodies are stubbed in this contracts PR. The real logic + the -// exhaustive truth-table tests land in the follow-up "decide core" PR. The -// signatures and the input/output shapes are what we are stabilising now. package decide import ( + "crypto/sha256" + "encoding/hex" + "fmt" + "regexp" + "strings" "time" "github.com/aoagents/agent-orchestrator/backend/internal/domain" @@ -22,87 +23,241 @@ const ( DetectingMaxDuration = 5 * time.Minute ) -// LifecycleDecision is the output of every decider: the derived display status -// plus the canonical sub-state values to persist, the human-readable evidence, -// and the (possibly updated) detecting memory. -type LifecycleDecision struct { - Status domain.SessionStatus - Evidence string - Detecting *domain.DetectingState - SessionState domain.SessionState - SessionReason domain.SessionReason - PRState domain.PRState - PRReason domain.PRReason -} +// ResolveProbeDecision reconciles runtime/process liveness into a decision. +// +// The ordering encodes the load-bearing invariants: +// - an explicit kill short-circuits straight to terminal (the only inferred +// terminal this decider may reach without quarantine); +// - a *failed* probe (timeout/error) is never read as death — it routes to +// detecting, as does any disagreement between the two probes; +// - only runtime-dead + process-dead + no-recent-activity reaches killed. 
+func ResolveProbeDecision(in ProbeInput) LifecycleDecision { + if in.KillRequested { + return LifecycleDecision{ + Status: domain.StatusKilled, + Evidence: "manual kill requested", + SessionState: domain.SessionTerminated, + SessionReason: domain.ReasonManuallyKilled, + } + } -// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout -// or error) is distinct from a "dead" verdict and must route to detecting, -// never to a death conclusion. KillRequested short-circuits to terminal. -type ProbeInput struct { - Runtime domain.RuntimeState - RuntimeFailed bool - Process ProcessLiveness - ProcessFailed bool - RecentActivity bool - KillRequested bool - Prior *domain.DetectingState - Now time.Time -} + if in.RuntimeFailed || in.ProcessFailed || in.Runtime == domain.RuntimeProbeFailed { + ev := fmt.Sprintf("probe_failed runtime=%s runtimeFailed=%t process=%s processFailed=%t", + in.Runtime, in.RuntimeFailed, in.Process, in.ProcessFailed) + return detecting(in, domain.ReasonProbeFailure, ev) + } -// ProcessLiveness mirrors isProcessRunning's three-valued answer. -type ProcessLiveness string + switch in.Runtime { + case domain.RuntimeAlive: + if in.Process == ProcessDead { + // Runtime up but the agent process is gone: probes disagree. + ev := fmt.Sprintf("disagree runtime=alive process=%s recentActivity=%t", in.Process, in.RecentActivity) + return detecting(in, domain.ReasonAgentProcessExited, ev) + } + return LifecycleDecision{ + Status: domain.StatusWorking, + Evidence: fmt.Sprintf("alive runtime=alive process=%s", in.Process), + SessionState: domain.SessionWorking, + SessionReason: domain.ReasonTaskInProgress, + } -const ( - ProcessAlive ProcessLiveness = "alive" - ProcessDead ProcessLiveness = "dead" - ProcessIndeterminate ProcessLiveness = "indeterminate" -) - -// OpenPRInput drives the PR pipeline ladder for an open PR. 
-type OpenPRInput struct { - CIFailing bool - ChangesRequested bool - Approved bool - Mergeable bool - ReviewPending bool - IdleBeyond bool // idle past the stuck threshold - Number int - URL string -} + case domain.RuntimeExited, domain.RuntimeMissing: + // Runtime is gone. Death is only concluded when the process is *also* + // confirmed dead AND nothing has been heard from the agent recently; + // any other shape is ambiguous and quarantines. + if in.Process == ProcessAlive || in.RecentActivity { + ev := fmt.Sprintf("disagree runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) + return detecting(in, domain.ReasonRuntimeLost, ev) + } + if in.Process == ProcessDead { + return LifecycleDecision{ + Status: domain.StatusKilled, + Evidence: fmt.Sprintf("dead runtime=%s process=dead recentActivity=false", in.Runtime), + SessionState: domain.SessionTerminated, + SessionReason: domain.ReasonRuntimeLost, + } + } + // Process indeterminate: cannot confirm death, so quarantine. + ev := fmt.Sprintf("runtime_lost runtime=%s process=%s recentActivity=false", in.Runtime, in.Process) + return detecting(in, domain.ReasonRuntimeLost, ev) -// DetectingInput feeds the quarantine counter. Evidence is hashed with -// timestamps stripped, so "same ambiguous signal" keeps the counter climbing -// while any real change resets it. -type DetectingInput struct { - Evidence string - ProposedState domain.SessionState - ProposedReason domain.SessionReason - Prior *domain.DetectingState - Now time.Time + default: + // unknown (not yet probed): ambiguous, never conclude death. + ev := fmt.Sprintf("runtime_unknown runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity) + return detecting(in, domain.ReasonRuntimeLost, ev) + } } -// ResolveProbeDecision reconciles runtime/process liveness into a decision. 
-func ResolveProbeDecision(in ProbeInput) LifecycleDecision { - panic("decide.ResolveProbeDecision: not implemented (decide-core PR)") -} - -// ResolveOpenPRDecision walks the PR pipeline ladder. +// ResolveOpenPRDecision walks the PR pipeline ladder. CI failure dominates +// everything, then requested changes, then the approval/merge states, then a +// pending review, then a stalled (idle-beyond-threshold) PR, else plain open. func ResolveOpenPRDecision(in OpenPRInput) LifecycleDecision { - panic("decide.ResolveOpenPRDecision: not implemented (decide-core PR)") + // evidence is a stable, timestamp-free summary "<condition> #<number> <url>" + // for logs/traceability; it folds in the PR identity inputs (Number/URL) when set. + evidence := func(cond string) string { + s := cond + if in.Number > 0 { + s += fmt.Sprintf(" #%d", in.Number) + } + if in.URL != "" { + s += " " + in.URL + } + return s + } + base := func(status domain.SessionStatus, cond string, prReason domain.PRReason, ss domain.SessionState, sr domain.SessionReason) LifecycleDecision { + return LifecycleDecision{ + Status: status, + Evidence: evidence(cond), + SessionState: ss, + SessionReason: sr, + PRState: domain.PROpen, + PRReason: prReason, + } + } + + switch { + case in.CIFailing: + return base(domain.StatusCIFailed, "ci_failing", domain.PRReasonCIFailing, domain.SessionWorking, domain.ReasonFixingCI) + case in.ChangesRequested: + return base(domain.StatusChangesRequested, "changes_requested", domain.PRReasonChangesRequested, domain.SessionWorking, domain.ReasonResolvingReviewComments) + case in.Mergeable: + // Mergeability is the authoritative merge gate, so it already folds in + // "approved if review is required". Checking it before Approved means a + // PR on a no-required-review repo (mergeable, not formally approved) is + // still surfaced as ready-to-merge instead of falling through to PR_OPEN. 
+ return base(domain.StatusMergeable, "merge_ready", domain.PRReasonMergeReady, domain.SessionIdle, domain.ReasonAwaitingExternalReview) + case in.Approved: + return base(domain.StatusApproved, "approved", domain.PRReasonApproved, domain.SessionIdle, domain.ReasonAwaitingExternalReview) + case in.ReviewPending: + return base(domain.StatusReviewPending, "review_pending", domain.PRReasonReviewPending, domain.SessionIdle, domain.ReasonAwaitingExternalReview) + case in.IdleBeyond: + // A PR open but quiet past the stuck threshold needs a human nudge. + return base(domain.StatusStuck, "idle_beyond", domain.PRReasonInProgress, domain.SessionStuck, domain.ReasonAwaitingUserInput) + default: + return base(domain.StatusPROpen, "pr_open", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated) + } } -// ResolveTerminalPRStateDecision handles merged/closed PRs. +// ResolveTerminalPRStateDecision handles merged/closed PRs. A merge parks the +// session idle awaiting a human's post-merge decision; a close drops to idle. +// none/open are not terminal — callers should route those to the open-PR or +// probe deciders — but the function stays total for safety. 
func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision { - panic("decide.ResolveTerminalPRStateDecision: not implemented (decide-core PR)") + switch pr { + case domain.PRMerged: + return LifecycleDecision{ + Status: domain.StatusMerged, + Evidence: "pr merged", + SessionState: domain.SessionIdle, + SessionReason: domain.ReasonMergedWaitingDecision, + PRState: domain.PRMerged, + PRReason: domain.PRReasonMerged, + } + case domain.PRClosed: + return LifecycleDecision{ + Status: domain.StatusIdle, + Evidence: "pr closed unmerged", + SessionState: domain.SessionIdle, + SessionReason: domain.ReasonAwaitingUserInput, + PRState: domain.PRClosed, + PRReason: domain.PRReasonClosedUnmerged, + } + default: + return LifecycleDecision{ + Status: domain.StatusWorking, + Evidence: fmt.Sprintf("non-terminal pr state=%s", pr), + SessionState: domain.SessionWorking, + SessionReason: domain.ReasonTaskInProgress, + PRState: pr, + } + } } // CreateDetectingDecision advances or escalates the anti-flap quarantine. +// +// The attempt counter climbs only while the (timestamp-stripped) evidence hash +// is unchanged and resets the moment the evidence moves; StartedAt is preserved +// across the whole detecting episode so the duration cap is a real wall-clock +// safety net even when the evidence keeps flapping. Escalation to stuck fires +// at DetectingMaxAttempts consecutive unchanged ticks OR DetectingMaxDuration +// elapsed since first entering detecting. 
func CreateDetectingDecision(in DetectingInput) LifecycleDecision { - panic("decide.CreateDetectingDecision: not implemented (decide-core PR)") + hash := HashEvidence(in.Evidence) + + attempts := 1 + startedAt := in.Now + if in.Prior != nil { + startedAt = in.Prior.StartedAt + if in.Prior.EvidenceHash == hash { + attempts = in.Prior.Attempts + 1 + } + } + + escalate := attempts >= DetectingMaxAttempts || !in.Now.Before(startedAt.Add(DetectingMaxDuration)) + if escalate { + return LifecycleDecision{ + Status: domain.StatusStuck, + Evidence: in.Evidence, + SessionState: domain.SessionStuck, + SessionReason: in.ProposedReason, + } + } + + return LifecycleDecision{ + Status: domain.StatusDetecting, + Evidence: in.Evidence, + Detecting: &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash}, + SessionState: domain.SessionDetecting, + SessionReason: in.ProposedReason, + } } -// HashEvidence normalises an evidence string (stripping timestamps) and hashes -// it, so unchanged-but-restamped signals compare equal. +// HashEvidence normalises an evidence string (stripping timestamps and +// collapsing whitespace) and hashes it, so unchanged-but-restamped signals +// compare equal and the detecting counter is not reset by clock movement alone. func HashEvidence(evidence string) string { - panic("decide.HashEvidence: not implemented (decide-core PR)") + s := evidence + for _, re := range timestampPatterns { + s = re.ReplaceAllString(s, "") + } + s = strings.Join(strings.Fields(s), " ") + sum := sha256.Sum256([]byte(s)) + return hex.EncodeToString(sum[:]) +} + +// timestampPatterns is the list of regexes HashEvidence applies (in order) to +// delete the time-varying parts of an evidence string before hashing, so the +// same ambiguous signal restamped with a new clock value hashes equal and the +// detecting counter keeps climbing instead of resetting every tick. 
+// +// Order matters: the full datetime form is removed first so its embedded +// HH:MM:SS isn't half-eaten by the bare time-of-day pattern that follows. +// +// 1. full ISO-8601 / RFC3339 datetime — date, a T or space separator, +// HH:MM:SS, optional fractional seconds, optional Z or ±HH:MM offset. +// e.g. "2026-05-26T12:00:00Z", "2026-05-26 12:00:00.218+05:30" +// 2. a bare time-of-day, e.g. "12:00:00" or "12:00:00.218" +// 3. a bare unix epoch — any 10-13 digit run (seconds or millis), e.g. +// "1716724800". This is broad enough to also clobber a same-width numeric +// ID if one ever appears in evidence; evidence is decider-authored, so keep +// IDs out of evidence strings to preserve hash fidelity. +var timestampPatterns = []*regexp.Regexp{ + regexp.MustCompile(`\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?`), + regexp.MustCompile(`\d{2}:\d{2}:\d{2}(?:\.\d+)?`), + regexp.MustCompile(`\b\d{10,13}\b`), +} + +// detecting adapts a probe verdict into the shared anti-flap path. It packages +// the proposed reason + evidence (plus the prior counter from the same probe +// input) into a DetectingInput and defers to CreateDetectingDecision, so every +// probe-driven ambiguity is counted and escalated by the identical quarantine +// logic instead of each probe branch re-implementing the counter. 
+func detecting(in ProbeInput, reason domain.SessionReason, evidence string) LifecycleDecision { + return CreateDetectingDecision(DetectingInput{ + Evidence: evidence, + ProposedState: domain.SessionDetecting, + ProposedReason: reason, + Prior: in.Prior, + Now: in.Now, + }) } diff --git a/backend/internal/domain/decide/decide_test.go b/backend/internal/domain/decide/decide_test.go new file mode 100644 index 00000000..d6e027f1 --- /dev/null +++ b/backend/internal/domain/decide/decide_test.go @@ -0,0 +1,530 @@ +package decide + +import ( + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) + +func TestResolveProbeDecision(t *testing.T) { + tests := []struct { + name string + in ProbeInput + wantStatus domain.SessionStatus + wantState domain.SessionState + wantReason domain.SessionReason + wantDetect bool // expect non-nil Detecting memory + wantTermNil bool // expect terminal (Detecting must be nil) + }{ + { + name: "kill requested short-circuits to terminal killed", + in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonManuallyKilled, + wantTermNil: true, + }, + { + name: "kill requested wins even over a dead+dead probe", + in: ProbeInput{KillRequested: true, Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonManuallyKilled, + wantTermNil: true, + }, + { + name: "runtime probe failed routes to detecting, never death", + in: ProbeInput{Runtime: domain.RuntimeMissing, RuntimeFailed: true, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantDetect: true, + }, + { + name: "process probe failed routes to detecting", 
+ in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, ProcessFailed: true, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantDetect: true, + }, + { + name: "runtime state probe_failed routes to detecting", + in: ProbeInput{Runtime: domain.RuntimeProbeFailed, Process: ProcessIndeterminate, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonProbeFailure, + wantDetect: true, + }, + { + name: "runtime alive + process alive is working", + in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, + wantStatus: domain.StatusWorking, + wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + { + name: "runtime alive + process indeterminate leans alive", + in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessIndeterminate, Now: t0}, + wantStatus: domain.StatusWorking, + wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + { + name: "runtime alive + process dead disagree -> detecting (agent_process_exited)", + in: ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonAgentProcessExited, + wantDetect: true, + }, + { + name: "runtime dead + process alive disagree -> detecting (runtime_lost)", + in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessAlive, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + wantDetect: true, + }, + { + name: "runtime dead + recent activity disagree -> detecting (runtime_lost)", + in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, RecentActivity: true, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + 
wantDetect: true, + }, + { + name: "runtime dead + process indeterminate cannot confirm -> detecting", + in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + wantDetect: true, + }, + { + name: "runtime exited + process dead + no activity -> killed terminal", + in: ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonRuntimeLost, + wantTermNil: true, + }, + { + name: "runtime missing + process dead + no activity -> killed terminal", + in: ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusKilled, + wantState: domain.SessionTerminated, + wantReason: domain.ReasonRuntimeLost, + wantTermNil: true, + }, + { + name: "runtime unknown is ambiguous -> detecting (runtime_lost)", + in: ProbeInput{Runtime: domain.RuntimeUnknown, Process: ProcessDead, Now: t0}, + wantStatus: domain.StatusDetecting, + wantState: domain.SessionDetecting, + wantReason: domain.ReasonRuntimeLost, + wantDetect: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ResolveProbeDecision(tt.in) + if got.Status != tt.wantStatus { + t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + } + if got.SessionState != tt.wantState { + t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + } + if got.SessionReason != tt.wantReason { + t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) + } + if tt.wantDetect && got.Detecting == nil { + t.Errorf("expected non-nil Detecting memory, got nil") + } + if tt.wantTermNil && got.Detecting != nil { + t.Errorf("terminal decision must carry nil Detecting, got %+v", got.Detecting) + } + }) + } +} + +func TestResolveOpenPRDecision(t *testing.T) { + tests := []struct { + 
name string + in OpenPRInput + wantStatus domain.SessionStatus + wantPR domain.PRReason + wantState domain.SessionState + }{ + { + name: "ci failing dominates everything", + in: OpenPRInput{CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true}, + wantStatus: domain.StatusCIFailed, + wantPR: domain.PRReasonCIFailing, + wantState: domain.SessionWorking, + }, + { + name: "changes requested before approval states", + in: OpenPRInput{ChangesRequested: true, Approved: true, Mergeable: true}, + wantStatus: domain.StatusChangesRequested, + wantPR: domain.PRReasonChangesRequested, + wantState: domain.SessionWorking, + }, + { + name: "approved + mergeable -> mergeable", + in: OpenPRInput{Approved: true, Mergeable: true}, + wantStatus: domain.StatusMergeable, + wantPR: domain.PRReasonMergeReady, + wantState: domain.SessionIdle, + }, + { + name: "mergeable without formal approval (no required review) -> mergeable", + in: OpenPRInput{Mergeable: true}, + wantStatus: domain.StatusMergeable, + wantPR: domain.PRReasonMergeReady, + wantState: domain.SessionIdle, + }, + { + name: "approved but not mergeable -> approved", + in: OpenPRInput{Approved: true}, + wantStatus: domain.StatusApproved, + wantPR: domain.PRReasonApproved, + wantState: domain.SessionIdle, + }, + { + name: "review pending", + in: OpenPRInput{ReviewPending: true}, + wantStatus: domain.StatusReviewPending, + wantPR: domain.PRReasonReviewPending, + wantState: domain.SessionIdle, + }, + { + name: "idle beyond threshold -> stuck", + in: OpenPRInput{IdleBeyond: true}, + wantStatus: domain.StatusStuck, + wantPR: domain.PRReasonInProgress, + wantState: domain.SessionStuck, + }, + { + name: "review pending wins over idle-beyond", + in: OpenPRInput{ReviewPending: true, IdleBeyond: true}, + wantStatus: domain.StatusReviewPending, + wantPR: domain.PRReasonReviewPending, + wantState: domain.SessionIdle, + }, + { + name: "nothing set -> plain open", + in: OpenPRInput{}, + wantStatus: domain.StatusPROpen, + 
wantPR: domain.PRReasonInProgress, + wantState: domain.SessionWorking, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ResolveOpenPRDecision(tt.in) + if got.Status != tt.wantStatus { + t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + } + if got.PRReason != tt.wantPR { + t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) + } + if got.PRState != domain.PROpen { + t.Errorf("PRState = %q, want %q", got.PRState, domain.PROpen) + } + if got.SessionState != tt.wantState { + t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + } + }) + } +} + +func TestResolveOpenPRDecisionEvidence(t *testing.T) { + tests := []struct { + name string + in OpenPRInput + want string + }{ + { + name: "condition with PR number and URL", + in: OpenPRInput{CIFailing: true, Number: 123, URL: "https://example.com/pr/123"}, + want: "ci_failing #123 https://example.com/pr/123", + }, + { + name: "condition with number only", + in: OpenPRInput{Approved: true, Mergeable: true, Number: 7}, + want: "merge_ready #7", + }, + { + name: "no identity falls back to the bare condition", + in: OpenPRInput{}, + want: "pr_open", + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := ResolveOpenPRDecision(tt.in).Evidence; got != tt.want { + t.Errorf("Evidence = %q, want %q", got, tt.want) + } + }) + } +} + +func TestDecidersDeriveConsistently(t *testing.T) { + // Every decision a decider produces must be self-consistent: the display + // Status it reports must equal what DeriveLegacyStatus produces from the + // canonical (session, pr) sub-states it emits. This locks the deciders and + // the display-derivation against drifting apart. + // + // The ResolveTerminalPRStateDecision none/open default is intentionally + // excluded — it is a documented no-op for misuse, not a real verdict. 
+ var decisions []LifecycleDecision + + for _, in := range []OpenPRInput{ + {CIFailing: true}, + {ChangesRequested: true}, + {Approved: true, Mergeable: true}, + {Mergeable: true}, + {Approved: true}, + {ReviewPending: true}, + {IdleBeyond: true}, + {}, + } { + decisions = append(decisions, ResolveOpenPRDecision(in)) + } + + decisions = append(decisions, + ResolveTerminalPRStateDecision(domain.PRMerged), + ResolveTerminalPRStateDecision(domain.PRClosed), + ) + + for _, in := range []ProbeInput{ + {KillRequested: true, Now: t0}, + {Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0}, + {Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}, + {Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0}, + } { + decisions = append(decisions, ResolveProbeDecision(in)) + } + + for _, d := range decisions { + l := domain.CanonicalSessionLifecycle{ + Session: domain.SessionSubstate{State: d.SessionState, Reason: d.SessionReason}, + PR: domain.PRSubstate{State: d.PRState, Reason: d.PRReason}, + } + if got := domain.DeriveLegacyStatus(l); got != d.Status { + t.Errorf("decision %+v: Status=%q but DeriveLegacyStatus=%q", d, d.Status, got) + } + } +} + +func TestResolveTerminalPRStateDecision(t *testing.T) { + tests := []struct { + name string + pr domain.PRState + wantStatus domain.SessionStatus + wantState domain.SessionState + wantReason domain.SessionReason + wantPR domain.PRReason + }{ + { + name: "merged parks idle awaiting decision", + pr: domain.PRMerged, + wantStatus: domain.StatusMerged, + wantState: domain.SessionIdle, + wantReason: domain.ReasonMergedWaitingDecision, + wantPR: domain.PRReasonMerged, + }, + { + name: "closed drops to idle", + pr: domain.PRClosed, + wantStatus: domain.StatusIdle, + wantState: domain.SessionIdle, + wantReason: domain.ReasonAwaitingUserInput, + wantPR: domain.PRReasonClosedUnmerged, + }, + { + name: "non-terminal none is a working no-op", + pr: domain.PRNone, + wantStatus: domain.StatusWorking, + 
wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + { + name: "non-terminal open is a working no-op", + pr: domain.PROpen, + wantStatus: domain.StatusWorking, + wantState: domain.SessionWorking, + wantReason: domain.ReasonTaskInProgress, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := ResolveTerminalPRStateDecision(tt.pr) + if got.Status != tt.wantStatus { + t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus) + } + if got.SessionState != tt.wantState { + t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState) + } + if got.SessionReason != tt.wantReason { + t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason) + } + if tt.wantPR != "" && got.PRReason != tt.wantPR { + t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR) + } + }) + } +} + +func TestCreateDetectingDecision(t *testing.T) { + const ev = "runtime_lost runtime=missing process=indeterminate" + hash := HashEvidence(ev) + + t.Run("first entry records attempt 1 and stays detecting", func(t *testing.T) { + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Now: t0}) + if got.Status != domain.StatusDetecting || got.SessionState != domain.SessionDetecting { + t.Fatalf("want detecting, got Status=%q State=%q", got.Status, got.SessionState) + } + if got.Detecting == nil || got.Detecting.Attempts != 1 { + t.Fatalf("want attempts=1, got %+v", got.Detecting) + } + if !got.Detecting.StartedAt.Equal(t0) { + t.Errorf("StartedAt = %v, want %v", got.Detecting.StartedAt, t0) + } + if got.Detecting.EvidenceHash != hash { + t.Errorf("EvidenceHash = %q, want %q", got.Detecting.EvidenceHash, hash) + } + if got.SessionReason != domain.ReasonRuntimeLost { + t.Errorf("SessionReason = %q, want %q", got.SessionReason, domain.ReasonRuntimeLost) + } + }) + + t.Run("unchanged evidence climbs the counter", func(t *testing.T) { + prior := 
&domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) + if got.Detecting == nil || got.Detecting.Attempts != 2 { + t.Fatalf("want attempts=2, got %+v", got.Detecting) + } + if !got.Detecting.StartedAt.Equal(t0) { + t.Errorf("StartedAt must be preserved, got %v", got.Detecting.StartedAt) + } + }) + + t.Run("escalates to stuck on the third unchanged tick", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) + if got.Status != domain.StatusStuck || got.SessionState != domain.SessionStuck { + t.Fatalf("want stuck, got Status=%q State=%q", got.Status, got.SessionState) + } + if got.Detecting != nil { + t.Errorf("stuck decision must drop detecting memory, got %+v", got.Detecting) + } + if got.SessionReason != domain.ReasonRuntimeLost { + t.Errorf("escalation should carry the why, got %q", got.SessionReason) + } + }) + + t.Run("changing evidence resets the counter but preserves StartedAt", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: "different evidence", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)}) + if got.Status != domain.StatusDetecting { + t.Fatalf("changed evidence should stay detecting, got %q", got.Status) + } + if got.Detecting == nil || got.Detecting.Attempts != 1 { + t.Fatalf("counter should reset to 1, got %+v", got.Detecting) + } + if !got.Detecting.StartedAt.Equal(t0) { + t.Errorf("StartedAt must survive an evidence change, got %v", got.Detecting.StartedAt) + } + }) + + t.Run("duration cap escalates even below 
the attempt count", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration)}) + if got.Status != domain.StatusStuck { + t.Fatalf("want stuck from duration cap, got %q", got.Status) + } + }) + + t.Run("duration cap fires even when evidence keeps flapping", func(t *testing.T) { + prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash} + got := CreateDetectingDecision(DetectingInput{Evidence: "ever-changing", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Minute)}) + if got.Status != domain.StatusStuck { + t.Fatalf("duration cap must override a reset counter, got %q", got.Status) + } + }) +} + +func TestProbeDetectingEscalationFlow(t *testing.T) { + // An unchanging ambiguous probe should escalate to stuck after exactly + // DetectingMaxAttempts ticks. 
+ in := ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0} + d := ResolveProbeDecision(in) + for i := 1; i < DetectingMaxAttempts; i++ { + if d.Status != domain.StatusDetecting { + t.Fatalf("tick %d: expected detecting, got %q", i, d.Status) + } + in.Prior = d.Detecting + in.Now = t0.Add(time.Duration(i) * time.Second) + d = ResolveProbeDecision(in) + } + if d.Status != domain.StatusStuck { + t.Fatalf("expected escalation to stuck after %d ticks, got %q", DetectingMaxAttempts, d.Status) + } +} + +func TestHashEvidence(t *testing.T) { + t.Run("identical strings hash identically", func(t *testing.T) { + if HashEvidence("same input") != HashEvidence("same input") { + t.Error("identical evidence must hash equal") + } + }) + + t.Run("different evidence hashes differently", func(t *testing.T) { + if HashEvidence("runtime_lost") == HashEvidence("agent_process_exited") { + t.Error("distinct evidence must hash differently") + } + }) + + t.Run("only the timestamp differs -> equal hash", func(t *testing.T) { + a := "probe failed at 2026-05-26T12:00:00Z runtime=missing" + b := "probe failed at 2026-05-26T12:05:43.218Z runtime=missing" + if HashEvidence(a) != HashEvidence(b) { + t.Errorf("restamped evidence should hash equal:\n a=%q\n b=%q", a, b) + } + }) + + t.Run("bare time-of-day stripped", func(t *testing.T) { + if HashEvidence("idle since 12:00:00") != HashEvidence("idle since 13:30:59") { + t.Error("time-of-day differences should be stripped") + } + }) + + t.Run("unix epoch stripped", func(t *testing.T) { + if HashEvidence("last seen 1716724800") != HashEvidence("last seen 1716728400") { + t.Error("epoch differences should be stripped") + } + }) + + t.Run("a real content change still changes the hash", func(t *testing.T) { + a := "probe at 2026-05-26T12:00:00Z runtime=missing" + b := "probe at 2026-05-26T12:00:00Z runtime=alive" + if HashEvidence(a) == HashEvidence(b) { + t.Error("non-timestamp content change must change the hash") + } + 
}) + + t.Run("whitespace differences are normalised", func(t *testing.T) { + if HashEvidence("runtime=missing process=dead") != HashEvidence("runtime=missing process=dead") { + t.Error("collapsed whitespace should hash equal") + } + }) +} diff --git a/backend/internal/domain/decide/types.go b/backend/internal/domain/decide/types.go new file mode 100644 index 00000000..7ac4adf1 --- /dev/null +++ b/backend/internal/domain/decide/types.go @@ -0,0 +1,76 @@ +package decide + +import ( + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/domain" +) + +// LifecycleDecision is the output of every decider: the derived display status +// plus the canonical sub-state values to persist, the human-readable evidence, +// and the (possibly updated) detecting memory. +// +// Zero-value sub-state fields mean "this decider does not address that +// sub-state — leave it unchanged", NOT "set it to the empty value". SessionState +// is always populated, but the probe/detecting/kill paths legitimately leave +// PRState/PRReason empty: a liveness verdict knows nothing about the PR. When +// the LCM turns a decision into a LifecyclePatch it must therefore map an empty +// PRState to a nil patch.PR (left untouched) rather than writing it through — +// writing PRNone on a routine probe tick would clobber a live PR. Detecting is +// nil-by-default for the same reason; see LifecyclePatch's three-way +// Detecting/ClearDetecting semantics. +type LifecycleDecision struct { + Status domain.SessionStatus + Evidence string + Detecting *domain.DetectingState + SessionState domain.SessionState + SessionReason domain.SessionReason + PRState domain.PRState + PRReason domain.PRReason +} + +// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout +// or error) is distinct from a "dead" verdict and must route to detecting, +// never to a death conclusion. KillRequested short-circuits to terminal. 
+type ProbeInput struct { + Runtime domain.RuntimeState + RuntimeFailed bool + Process ProcessLiveness + ProcessFailed bool + RecentActivity bool + KillRequested bool + Prior *domain.DetectingState + Now time.Time +} + +// ProcessLiveness mirrors isProcessRunning's three-valued answer. +type ProcessLiveness string + +const ( + ProcessAlive ProcessLiveness = "alive" + ProcessDead ProcessLiveness = "dead" + ProcessIndeterminate ProcessLiveness = "indeterminate" +) + +// OpenPRInput drives the PR pipeline ladder for an open PR. +type OpenPRInput struct { + CIFailing bool + ChangesRequested bool + Approved bool + Mergeable bool + ReviewPending bool + IdleBeyond bool // idle past the stuck threshold + Number int + URL string +} + +// DetectingInput feeds the quarantine counter. Evidence is hashed with +// timestamps stripped, so "same ambiguous signal" keeps the counter climbing +// while any real change resets it. +type DetectingInput struct { + Evidence string + ProposedState domain.SessionState + ProposedReason domain.SessionReason + Prior *domain.DetectingState + Now time.Time +}