diff --git a/.github/workflows/gitleaks.yml b/.github/workflows/gitleaks.yml
new file mode 100644
index 00000000..15c70781
--- /dev/null
+++ b/.github/workflows/gitleaks.yml
@@ -0,0 +1,22 @@
+name: gitleaks
+
+on:
+  push:
+    branches: [main]
+  pull_request:
+
+permissions:
+  contents: read # GITHUB_TOKEN is limited to read-only repository contents
+
+jobs:
+  scan:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # fetch full history instead of the default shallow clone
+
+      # Pinned to gitleaks-action v1: it scans for committed secrets and needs no
+      # license key (v2 requires GITLEAKS_LICENSE for organization repos).
+      - name: Scan for secrets
+        uses: zricethezav/gitleaks-action@v1.6.0
diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml
new file mode 100644
index 00000000..e3ceaf1c
--- /dev/null
+++ b/.github/workflows/go.yml
@@ -0,0 +1,44 @@
+name: Go
+
+on:
+  push:
+    branches: [main] # no paths filter here: every push to main runs the job
+  pull_request:
+    paths:
+      - "backend/**"
+      - ".github/workflows/go.yml"
+
+permissions:
+  contents: read
+
+jobs:
+  build-test:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        working-directory: backend # every `run` step below executes inside backend/
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-go@v5
+        with:
+          go-version: "1.22"
+          cache: false # NOTE(review): presumably off because go.sum is not at the repo root; confirm
+
+      - name: Check formatting
+        run: |
+          unformatted=$(gofmt -l .)
+          if [ -n "$unformatted" ]; then
+            echo "These files need gofmt:"
+            echo "$unformatted"
+            exit 1
+          fi
+
+      - name: Build
+        run: go build ./...
+
+      - name: Vet
+        run: go vet ./...
+
+      - name: Test
+        run: go test -race ./... # -race turns on the Go data race detector
diff --git a/README.md b/README.md
index 0f28a2e3..353d1200 100644
--- a/README.md
+++ b/README.md
@@ -2,3 +2,6 @@
 
 Rewrite of the agent-orchestrator: a long-running Go backend daemon (`backend/`)
 paired with an Electron + TypeScript frontend (`frontend/`).
+
+See [`docs/`](docs/README.md) for architecture and status — start with the
+Lifecycle Manager + Session Manager lane in [`docs/architecture.md`](docs/architecture.md).
diff --git a/backend/internal/domain/decide/decide.go b/backend/internal/domain/decide/decide.go
new file mode 100644
index 00000000..e7f2c445
--- /dev/null
+++ b/backend/internal/domain/decide/decide.go
@@ -0,0 +1,263 @@
+// Package decide is the pure DECIDE core: total, deterministic, zero I/O. It
+// collapses observed facts (plus the prior detecting/activity memory) into one
+// LifecycleDecision. Every function here must remain side-effect free so the
+// whole status truth-table can be tested in isolation.
+package decide
+
+import (
+	"crypto/sha256"
+	"encoding/hex"
+	"fmt"
+	"regexp"
+	"strings"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+)
+
+// Anti-flap tuning for CreateDetectingDecision: a detecting episode escalates
+// to stuck once the attempt counter reaches DetectingMaxAttempts, or once
+// DetectingMaxDuration has elapsed since the episode's StartedAt (inclusive).
+const (
+	DetectingMaxAttempts = 3
+	DetectingMaxDuration = 5 * time.Minute
+)
+
+// ResolveProbeDecision reconciles runtime/process liveness into a decision.
+//
+// Checks run in this order, and the order is load-bearing:
+//   - KillRequested short-circuits to killed before any probe value is read;
+//   - a *failed* probe (timeout/error) is never read as death — it takes the
+//     detecting path, as does any disagreement between the two probes;
+//   - absent a kill, only runtime exited/missing + process dead + no recent
+//     activity returns killed directly, bypassing the detecting quarantine.
+func ResolveProbeDecision(in ProbeInput) LifecycleDecision {
+	if in.KillRequested {
+		return LifecycleDecision{
+			Status:        domain.StatusKilled,
+			Evidence:      "manual kill requested",
+			SessionState:  domain.SessionTerminated,
+			SessionReason: domain.ReasonManuallyKilled,
+		}
+	}
+
+	if in.RuntimeFailed || in.ProcessFailed || in.Runtime == domain.RuntimeProbeFailed {
+		ev := fmt.Sprintf("probe_failed runtime=%s runtimeFailed=%t process=%s processFailed=%t",
+			in.Runtime, in.RuntimeFailed, in.Process, in.ProcessFailed)
+		return detecting(in, domain.ReasonProbeFailure, ev)
+	}
+
+	switch in.Runtime {
+	case domain.RuntimeAlive:
+		if in.Process == ProcessDead {
+			// Runtime up but the agent process is gone: probes disagree.
+			ev := fmt.Sprintf("disagree runtime=alive process=%s recentActivity=%t", in.Process, in.RecentActivity)
+			return detecting(in, domain.ReasonAgentProcessExited, ev)
+		}
+		return LifecycleDecision{
+			Status:        domain.StatusWorking,
+			Evidence:      fmt.Sprintf("alive runtime=alive process=%s", in.Process),
+			SessionState:  domain.SessionWorking,
+			SessionReason: domain.ReasonTaskInProgress,
+		}
+
+	case domain.RuntimeExited, domain.RuntimeMissing:
+		// Runtime is gone. Death is only concluded when the process is *also*
+		// confirmed dead AND nothing has been heard from the agent recently;
+		// any other shape is ambiguous and quarantines.
+		if in.Process == ProcessAlive || in.RecentActivity {
+			ev := fmt.Sprintf("disagree runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity)
+			return detecting(in, domain.ReasonRuntimeLost, ev)
+		}
+		if in.Process == ProcessDead {
+			return LifecycleDecision{
+				Status:        domain.StatusKilled,
+				Evidence:      fmt.Sprintf("dead runtime=%s process=dead recentActivity=false", in.Runtime),
+				SessionState:  domain.SessionTerminated,
+				SessionReason: domain.ReasonRuntimeLost,
+			}
+		}
+		// Process is neither alive nor dead here: cannot confirm death, so quarantine.
+		ev := fmt.Sprintf("runtime_lost runtime=%s process=%s recentActivity=false", in.Runtime, in.Process)
+		return detecting(in, domain.ReasonRuntimeLost, ev)
+
+	default:
+		// Any other runtime value (e.g. unknown): ambiguous, never conclude death.
+		ev := fmt.Sprintf("runtime_unknown runtime=%s process=%s recentActivity=%t", in.Runtime, in.Process, in.RecentActivity)
+		return detecting(in, domain.ReasonRuntimeLost, ev)
+	}
+}
+
+// ResolveOpenPRDecision walks the open-PR ladder and returns the first match:
+// CI failing, changes requested, mergeable, approved, review pending, idle
+// beyond threshold (stuck), else plain open. PRState is always PROpen.
+func ResolveOpenPRDecision(in OpenPRInput) LifecycleDecision {
+	// evidence builds the stable, timestamp-free summary "<condition> #<num> <url>"
+	// for logs/traceability; Number is omitted when <= 0 and URL when empty.
+	evidence := func(cond string) string {
+		s := cond
+		if in.Number > 0 {
+			s += fmt.Sprintf(" #%d", in.Number)
+		}
+		if in.URL != "" {
+			s += " " + in.URL
+		}
+		return s
+	}
+	base := func(status domain.SessionStatus, cond string, prReason domain.PRReason, ss domain.SessionState, sr domain.SessionReason) LifecycleDecision {
+		return LifecycleDecision{
+			Status:        status,
+			Evidence:      evidence(cond),
+			SessionState:  ss,
+			SessionReason: sr,
+			PRState:       domain.PROpen,
+			PRReason:      prReason,
+		}
+	}
+
+	switch {
+	case in.CIFailing:
+		return base(domain.StatusCIFailed, "ci_failing", domain.PRReasonCIFailing, domain.SessionWorking, domain.ReasonFixingCI)
+	case in.ChangesRequested:
+		return base(domain.StatusChangesRequested, "changes_requested", domain.PRReasonChangesRequested, domain.SessionWorking, domain.ReasonResolvingReviewComments)
+	case in.Mergeable:
+		// Mergeability is the authoritative merge gate, so it already folds in
+		// "approved if review is required". Checking it before Approved means a
+		// PR on a no-required-review repo (mergeable, not formally approved) is
+		// still surfaced as ready-to-merge instead of falling through to PR_OPEN.
+		return base(domain.StatusMergeable, "merge_ready", domain.PRReasonMergeReady, domain.SessionIdle, domain.ReasonAwaitingExternalReview)
+	case in.Approved:
+		return base(domain.StatusApproved, "approved", domain.PRReasonApproved, domain.SessionIdle, domain.ReasonAwaitingExternalReview)
+	case in.ReviewPending:
+		return base(domain.StatusReviewPending, "review_pending", domain.PRReasonReviewPending, domain.SessionIdle, domain.ReasonAwaitingExternalReview)
+	case in.IdleBeyond:
+		// A PR open but quiet past the stuck threshold needs a human nudge.
+		return base(domain.StatusStuck, "idle_beyond", domain.PRReasonInProgress, domain.SessionStuck, domain.ReasonAwaitingUserInput)
+	default:
+		return base(domain.StatusPROpen, "pr_open", domain.PRReasonInProgress, domain.SessionWorking, domain.ReasonPRCreated)
+	}
+}
+
+// ResolveTerminalPRStateDecision handles merged/closed PRs. A merge parks the
+// session idle with ReasonMergedWaitingDecision; a close-without-merge parks it
+// idle awaiting user input. Any other state is not terminal (route it to the
+// open-PR or probe deciders); it yields a working decision echoing pr back.
+func ResolveTerminalPRStateDecision(pr domain.PRState) LifecycleDecision {
+	switch pr {
+	case domain.PRMerged:
+		return LifecycleDecision{
+			Status:        domain.StatusMerged,
+			Evidence:      "pr merged",
+			SessionState:  domain.SessionIdle,
+			SessionReason: domain.ReasonMergedWaitingDecision,
+			PRState:       domain.PRMerged,
+			PRReason:      domain.PRReasonMerged,
+		}
+	case domain.PRClosed:
+		return LifecycleDecision{
+			Status:        domain.StatusIdle,
+			Evidence:      "pr closed unmerged",
+			SessionState:  domain.SessionIdle,
+			SessionReason: domain.ReasonAwaitingUserInput,
+			PRState:       domain.PRClosed,
+			PRReason:      domain.PRReasonClosedUnmerged,
+		}
+	default:
+		return LifecycleDecision{
+			Status:        domain.StatusWorking,
+			Evidence:      fmt.Sprintf("non-terminal pr state=%s", pr),
+			SessionState:  domain.SessionWorking,
+			SessionReason: domain.ReasonTaskInProgress,
+			PRState:       pr,
+		}
+	}
+}
+
+// CreateDetectingDecision advances or escalates the anti-flap quarantine.
+//
+// A nil Prior starts a new episode (attempts=1, StartedAt=Now). Otherwise
+// StartedAt is carried over for the whole episode, and the attempt counter
+// climbs only while the (timestamp-stripped) evidence hash is unchanged,
+// resetting to 1 when it moves. Escalation to stuck fires at
+// DetectingMaxAttempts or once DetectingMaxDuration has elapsed since
+// StartedAt; the stuck decision carries no Detecting memory.
+func CreateDetectingDecision(in DetectingInput) LifecycleDecision {
+	hash := HashEvidence(in.Evidence)
+
+	attempts := 1
+	startedAt := in.Now
+	if in.Prior != nil {
+		startedAt = in.Prior.StartedAt // the episode start survives evidence changes
+		if in.Prior.EvidenceHash == hash {
+			attempts = in.Prior.Attempts + 1
+		}
+	}
+
+	escalate := attempts >= DetectingMaxAttempts || !in.Now.Before(startedAt.Add(DetectingMaxDuration))
+	if escalate {
+		return LifecycleDecision{
+			Status:        domain.StatusStuck,
+			Evidence:      in.Evidence,
+			SessionState:  domain.SessionStuck,
+			SessionReason: in.ProposedReason,
+		}
+	}
+
+	return LifecycleDecision{
+		Status:        domain.StatusDetecting,
+		Evidence:      in.Evidence,
+		Detecting:     &domain.DetectingState{Attempts: attempts, StartedAt: startedAt, EvidenceHash: hash},
+		SessionState:  domain.SessionDetecting,
+		SessionReason: in.ProposedReason,
+	}
+}
+
+// HashEvidence returns the hex SHA-256 of evidence after deleting every
+// timestampPatterns match and collapsing whitespace runs to single spaces, so
+// restamped-but-otherwise-identical signals hash equal.
+func HashEvidence(evidence string) string {
+	s := evidence
+	for _, re := range timestampPatterns {
+		s = re.ReplaceAllString(s, "")
+	}
+	s = strings.Join(strings.Fields(s), " ") // Fields also drops leading/trailing whitespace
+	sum := sha256.Sum256([]byte(s))
+	return hex.EncodeToString(sum[:])
+}
+
+// timestampPatterns is the list of regexes HashEvidence applies (in order) to
+// delete the time-varying parts of an evidence string before hashing, so the
+// same ambiguous signal restamped with a new clock value hashes equal and the
+// detecting counter keeps climbing instead of resetting every tick.
+//
+// Order matters: the full datetime form is removed first so its embedded
+// HH:MM:SS isn't half-eaten by the bare time-of-day pattern that follows.
+//
+//  1. full ISO-8601 / RFC3339 datetime — date, a T or space separator,
+//     HH:MM:SS, optional fractional seconds, optional Z or ±HH:MM / ±HHMM
+//     offset, e.g. "2026-05-26T12:00:00Z", "2026-05-26 12:00:00.218+05:30"
+//  2. a bare time-of-day, e.g. "12:00:00" or "12:00:00.218"
+//  3. a bare unix epoch — a 10-13 digit run between word boundaries (seconds
+//     or millis), e.g. "1716724800". This also clobbers a same-width numeric
+//     ID if one ever appears in evidence; evidence is decider-authored, so keep
+//     IDs out of evidence strings to preserve hash fidelity.
+var timestampPatterns = []*regexp.Regexp{
+	regexp.MustCompile(`\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}:\d{2}(?:\.\d+)?(?:Z|[+-]\d{2}:?\d{2})?`),
+	regexp.MustCompile(`\d{2}:\d{2}:\d{2}(?:\.\d+)?`),
+	regexp.MustCompile(`\b\d{10,13}\b`),
+}
+
+// detecting adapts a probe verdict into the shared anti-flap path. It packages
+// the proposed reason + evidence (plus the prior counter and clock from the
+// same probe input) into a DetectingInput and defers to CreateDetectingDecision,
+// so every probe-driven ambiguity shares one counter/escalation rule. The result
+// may therefore already be stuck; ProposedState is set but not read there.
+func detecting(in ProbeInput, reason domain.SessionReason, evidence string) LifecycleDecision {
+	return CreateDetectingDecision(DetectingInput{
+		Evidence:       evidence,
+		ProposedState:  domain.SessionDetecting,
+		ProposedReason: reason,
+		Prior:          in.Prior,
+		Now:            in.Now,
+	})
+}
diff --git a/backend/internal/domain/decide/decide_test.go b/backend/internal/domain/decide/decide_test.go
new file mode 100644
index 00000000..d6e027f1
--- /dev/null
+++ b/backend/internal/domain/decide/decide_test.go
@@ -0,0 +1,530 @@
+package decide
+
+import (
+	"testing"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+)
+
+var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) // fixed reference instant used as Now/StartedAt across the tests
+
+func TestResolveProbeDecision(t *testing.T) {
+	tests := []struct {
+		name        string
+		in          ProbeInput
+		wantStatus  domain.SessionStatus
+		wantState   domain.SessionState
+		wantReason  domain.SessionReason
+		wantDetect  bool // require non-nil Detecting memory (quarantine in progress)
+		wantTermNil bool // require nil Detecting (terminal verdict); unchecked if both are false
+	}{
+		{
+			name:        "kill requested short-circuits to terminal killed",
+			in:          ProbeInput{KillRequested: true, Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0},
+			wantStatus:  domain.StatusKilled,
+			wantState:   domain.SessionTerminated,
+			wantReason:  domain.ReasonManuallyKilled,
+			wantTermNil: true,
+		},
+		{
+			name:        "kill requested wins even over a dead+dead probe",
+			in:          ProbeInput{KillRequested: true, Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0},
+			wantStatus:  domain.StatusKilled,
+			wantState:   domain.SessionTerminated,
+			wantReason:  domain.ReasonManuallyKilled,
+			wantTermNil: true,
+		},
+		{
+			name:       "runtime probe failed routes to detecting, never death",
+			in:         ProbeInput{Runtime: domain.RuntimeMissing, RuntimeFailed: true, Process: ProcessDead, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonProbeFailure,
+			wantDetect: true,
+		},
+		{
+			name:       "process probe failed routes to detecting",
+			in:         ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, ProcessFailed: true, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonProbeFailure,
+			wantDetect: true,
+		},
+		{
+			name:       "runtime state probe_failed routes to detecting",
+			in:         ProbeInput{Runtime: domain.RuntimeProbeFailed, Process: ProcessIndeterminate, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonProbeFailure,
+			wantDetect: true,
+		},
+		{
+			name:       "runtime alive + process alive is working",
+			in:         ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0},
+			wantStatus: domain.StatusWorking,
+			wantState:  domain.SessionWorking,
+			wantReason: domain.ReasonTaskInProgress,
+		},
+		{
+			name:       "runtime alive + process indeterminate leans alive",
+			in:         ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessIndeterminate, Now: t0},
+			wantStatus: domain.StatusWorking,
+			wantState:  domain.SessionWorking,
+			wantReason: domain.ReasonTaskInProgress,
+		},
+		{
+			name:       "runtime alive + process dead disagree -> detecting (agent_process_exited)",
+			in:         ProbeInput{Runtime: domain.RuntimeAlive, Process: ProcessDead, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonAgentProcessExited,
+			wantDetect: true,
+		},
+		{
+			name:       "runtime dead + process alive disagree -> detecting (runtime_lost)",
+			in:         ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessAlive, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonRuntimeLost,
+			wantDetect: true,
+		},
+		{
+			name:       "runtime dead + recent activity disagree -> detecting (runtime_lost)",
+			in:         ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, RecentActivity: true, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonRuntimeLost,
+			wantDetect: true,
+		},
+		{
+			name:       "runtime dead + process indeterminate cannot confirm -> detecting",
+			in:         ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonRuntimeLost,
+			wantDetect: true,
+		},
+		{
+			name:        "runtime exited + process dead + no activity -> killed terminal",
+			in:          ProbeInput{Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0},
+			wantStatus:  domain.StatusKilled,
+			wantState:   domain.SessionTerminated,
+			wantReason:  domain.ReasonRuntimeLost,
+			wantTermNil: true,
+		},
+		{
+			name:        "runtime missing + process dead + no activity -> killed terminal",
+			in:          ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessDead, Now: t0},
+			wantStatus:  domain.StatusKilled,
+			wantState:   domain.SessionTerminated,
+			wantReason:  domain.ReasonRuntimeLost,
+			wantTermNil: true,
+		},
+		{
+			name:       "runtime unknown is ambiguous -> detecting (runtime_lost)",
+			in:         ProbeInput{Runtime: domain.RuntimeUnknown, Process: ProcessDead, Now: t0},
+			wantStatus: domain.StatusDetecting,
+			wantState:  domain.SessionDetecting,
+			wantReason: domain.ReasonRuntimeLost,
+			wantDetect: true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := ResolveProbeDecision(tt.in)
+			if got.Status != tt.wantStatus {
+				t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus)
+			}
+			if got.SessionState != tt.wantState {
+				t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState)
+			}
+			if got.SessionReason != tt.wantReason {
+				t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason)
+			}
+			if tt.wantDetect && got.Detecting == nil {
+				t.Errorf("expected non-nil Detecting memory, got nil")
+			}
+			if tt.wantTermNil && got.Detecting != nil {
+				t.Errorf("terminal decision must carry nil Detecting, got %+v", got.Detecting)
+			}
+		})
+	}
+}
+
+func TestResolveOpenPRDecision(t *testing.T) {
+	tests := []struct {
+		name       string
+		in         OpenPRInput
+		wantStatus domain.SessionStatus
+		wantPR     domain.PRReason
+		wantState  domain.SessionState
+	}{
+		{
+			name:       "ci failing dominates everything",
+			in:         OpenPRInput{CIFailing: true, ChangesRequested: true, Approved: true, Mergeable: true},
+			wantStatus: domain.StatusCIFailed,
+			wantPR:     domain.PRReasonCIFailing,
+			wantState:  domain.SessionWorking,
+		},
+		{
+			name:       "changes requested before approval states",
+			in:         OpenPRInput{ChangesRequested: true, Approved: true, Mergeable: true},
+			wantStatus: domain.StatusChangesRequested,
+			wantPR:     domain.PRReasonChangesRequested,
+			wantState:  domain.SessionWorking,
+		},
+		{
+			name:       "approved + mergeable -> mergeable",
+			in:         OpenPRInput{Approved: true, Mergeable: true},
+			wantStatus: domain.StatusMergeable,
+			wantPR:     domain.PRReasonMergeReady,
+			wantState:  domain.SessionIdle,
+		},
+		{
+			name:       "mergeable without formal approval (no required review) -> mergeable",
+			in:         OpenPRInput{Mergeable: true},
+			wantStatus: domain.StatusMergeable,
+			wantPR:     domain.PRReasonMergeReady,
+			wantState:  domain.SessionIdle,
+		},
+		{
+			name:       "approved but not mergeable -> approved",
+			in:         OpenPRInput{Approved: true},
+			wantStatus: domain.StatusApproved,
+			wantPR:     domain.PRReasonApproved,
+			wantState:  domain.SessionIdle,
+		},
+		{
+			name:       "review pending",
+			in:         OpenPRInput{ReviewPending: true},
+			wantStatus: domain.StatusReviewPending,
+			wantPR:     domain.PRReasonReviewPending,
+			wantState:  domain.SessionIdle,
+		},
+		{
+			name:       "idle beyond threshold -> stuck",
+			in:         OpenPRInput{IdleBeyond: true},
+			wantStatus: domain.StatusStuck,
+			wantPR:     domain.PRReasonInProgress,
+			wantState:  domain.SessionStuck,
+		},
+		{
+			name:       "review pending wins over idle-beyond",
+			in:         OpenPRInput{ReviewPending: true, IdleBeyond: true},
+			wantStatus: domain.StatusReviewPending,
+			wantPR:     domain.PRReasonReviewPending,
+			wantState:  domain.SessionIdle,
+		},
+		{
+			name:       "nothing set -> plain open",
+			in:         OpenPRInput{},
+			wantStatus: domain.StatusPROpen,
+			wantPR:     domain.PRReasonInProgress,
+			wantState:  domain.SessionWorking,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := ResolveOpenPRDecision(tt.in)
+			if got.Status != tt.wantStatus {
+				t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus)
+			}
+			if got.PRReason != tt.wantPR {
+				t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR)
+			}
+			if got.PRState != domain.PROpen { // every rung of the ladder must report the PR as open
+				t.Errorf("PRState = %q, want %q", got.PRState, domain.PROpen)
+			}
+			if got.SessionState != tt.wantState {
+				t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState)
+			}
+		})
+	}
+}
+
+func TestResolveOpenPRDecisionEvidence(t *testing.T) {
+	tests := []struct {
+		name string
+		in   OpenPRInput
+		want string
+	}{
+		{
+			name: "condition with PR number and URL",
+			in:   OpenPRInput{CIFailing: true, Number: 123, URL: "https://example.com/pr/123"},
+			want: "ci_failing #123 https://example.com/pr/123",
+		},
+		{
+			name: "condition with number only",
+			in:   OpenPRInput{Approved: true, Mergeable: true, Number: 7},
+			want: "merge_ready #7",
+		},
+		{
+			name: "no identity falls back to the bare condition",
+			in:   OpenPRInput{},
+			want: "pr_open",
+		},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := ResolveOpenPRDecision(tt.in).Evidence; got != tt.want { // exact string match, separators included
+				t.Errorf("Evidence = %q, want %q", got, tt.want)
+			}
+		})
+	}
+}
+
+func TestDecidersDeriveConsistently(t *testing.T) {
+	// Every decision a decider produces must be self-consistent: the display
+	// Status it reports must equal what domain.DeriveLegacyStatus derives from
+	// the (session, pr) sub-states it emits. This locks the deciders and the
+	// display-derivation against drifting apart.
+	//
+	// The ResolveTerminalPRStateDecision default branch (non-merged/closed
+	// input) is deliberately left out: it is a misuse fallback, not a verdict.
+	var decisions []LifecycleDecision
+
+	for _, in := range []OpenPRInput{
+		{CIFailing: true},
+		{ChangesRequested: true},
+		{Approved: true, Mergeable: true},
+		{Mergeable: true},
+		{Approved: true},
+		{ReviewPending: true},
+		{IdleBeyond: true},
+		{},
+	} {
+		decisions = append(decisions, ResolveOpenPRDecision(in))
+	}
+
+	decisions = append(decisions,
+		ResolveTerminalPRStateDecision(domain.PRMerged),
+		ResolveTerminalPRStateDecision(domain.PRClosed),
+	)
+
+	for _, in := range []ProbeInput{
+		{KillRequested: true, Now: t0},
+		{Runtime: domain.RuntimeAlive, Process: ProcessAlive, Now: t0},
+		{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0},
+		{Runtime: domain.RuntimeExited, Process: ProcessDead, Now: t0},
+	} {
+		decisions = append(decisions, ResolveProbeDecision(in))
+	}
+
+	for _, d := range decisions {
+		l := domain.CanonicalSessionLifecycle{
+			Session: domain.SessionSubstate{State: d.SessionState, Reason: d.SessionReason},
+			PR:      domain.PRSubstate{State: d.PRState, Reason: d.PRReason},
+		}
+		if got := domain.DeriveLegacyStatus(l); got != d.Status {
+			t.Errorf("decision %+v: Status=%q but DeriveLegacyStatus=%q", d, d.Status, got)
+		}
+	}
+}
+
+func TestResolveTerminalPRStateDecision(t *testing.T) {
+	tests := []struct {
+		name       string
+		pr         domain.PRState
+		wantStatus domain.SessionStatus
+		wantState  domain.SessionState
+		wantReason domain.SessionReason
+		wantPR     domain.PRReason
+	}{
+		{
+			name:       "merged parks idle awaiting decision",
+			pr:         domain.PRMerged,
+			wantStatus: domain.StatusMerged,
+			wantState:  domain.SessionIdle,
+			wantReason: domain.ReasonMergedWaitingDecision,
+			wantPR:     domain.PRReasonMerged,
+		},
+		{
+			name:       "closed drops to idle",
+			pr:         domain.PRClosed,
+			wantStatus: domain.StatusIdle,
+			wantState:  domain.SessionIdle,
+			wantReason: domain.ReasonAwaitingUserInput,
+			wantPR:     domain.PRReasonClosedUnmerged,
+		},
+		{
+			name:       "non-terminal none is a working no-op",
+			pr:         domain.PRNone,
+			wantStatus: domain.StatusWorking,
+			wantState:  domain.SessionWorking,
+			wantReason: domain.ReasonTaskInProgress,
+		},
+		{
+			name:       "non-terminal open is a working no-op",
+			pr:         domain.PROpen,
+			wantStatus: domain.StatusWorking,
+			wantState:  domain.SessionWorking,
+			wantReason: domain.ReasonTaskInProgress,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			got := ResolveTerminalPRStateDecision(tt.pr)
+			if got.Status != tt.wantStatus {
+				t.Errorf("Status = %q, want %q", got.Status, tt.wantStatus)
+			}
+			if got.SessionState != tt.wantState {
+				t.Errorf("SessionState = %q, want %q", got.SessionState, tt.wantState)
+			}
+			if got.SessionReason != tt.wantReason {
+				t.Errorf("SessionReason = %q, want %q", got.SessionReason, tt.wantReason)
+			}
+			if tt.wantPR != "" && got.PRReason != tt.wantPR { // an empty wantPR skips the PRReason check
+				t.Errorf("PRReason = %q, want %q", got.PRReason, tt.wantPR)
+			}
+		})
+	}
+}
+
+func TestCreateDetectingDecision(t *testing.T) {
+	const ev = "runtime_lost runtime=missing process=indeterminate" // shared evidence; subtests that need a change pass a different string
+	hash := HashEvidence(ev)
+
+	t.Run("first entry records attempt 1 and stays detecting", func(t *testing.T) {
+		got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Now: t0})
+		if got.Status != domain.StatusDetecting || got.SessionState != domain.SessionDetecting {
+			t.Fatalf("want detecting, got Status=%q State=%q", got.Status, got.SessionState)
+		}
+		if got.Detecting == nil || got.Detecting.Attempts != 1 {
+			t.Fatalf("want attempts=1, got %+v", got.Detecting)
+		}
+		if !got.Detecting.StartedAt.Equal(t0) {
+			t.Errorf("StartedAt = %v, want %v", got.Detecting.StartedAt, t0)
+		}
+		if got.Detecting.EvidenceHash != hash {
+			t.Errorf("EvidenceHash = %q, want %q", got.Detecting.EvidenceHash, hash)
+		}
+		if got.SessionReason != domain.ReasonRuntimeLost {
+			t.Errorf("SessionReason = %q, want %q", got.SessionReason, domain.ReasonRuntimeLost)
+		}
+	})
+
+	t.Run("unchanged evidence climbs the counter", func(t *testing.T) {
+		prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash}
+		got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)})
+		if got.Detecting == nil || got.Detecting.Attempts != 2 {
+			t.Fatalf("want attempts=2, got %+v", got.Detecting)
+		}
+		if !got.Detecting.StartedAt.Equal(t0) {
+			t.Errorf("StartedAt must be preserved, got %v", got.Detecting.StartedAt)
+		}
+	})
+
+	t.Run("escalates to stuck on the third unchanged tick", func(t *testing.T) {
+		prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash}
+		got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)})
+		if got.Status != domain.StatusStuck || got.SessionState != domain.SessionStuck {
+			t.Fatalf("want stuck, got Status=%q State=%q", got.Status, got.SessionState)
+		}
+		if got.Detecting != nil {
+			t.Errorf("stuck decision must drop detecting memory, got %+v", got.Detecting)
+		}
+		if got.SessionReason != domain.ReasonRuntimeLost {
+			t.Errorf("escalation should carry the why, got %q", got.SessionReason)
+		}
+	})
+
+	t.Run("changing evidence resets the counter but preserves StartedAt", func(t *testing.T) {
+		prior := &domain.DetectingState{Attempts: DetectingMaxAttempts - 1, StartedAt: t0, EvidenceHash: hash}
+		got := CreateDetectingDecision(DetectingInput{Evidence: "different evidence", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(time.Minute)})
+		if got.Status != domain.StatusDetecting {
+			t.Fatalf("changed evidence should stay detecting, got %q", got.Status)
+		}
+		if got.Detecting == nil || got.Detecting.Attempts != 1 {
+			t.Fatalf("counter should reset to 1, got %+v", got.Detecting)
+		}
+		if !got.Detecting.StartedAt.Equal(t0) {
+			t.Errorf("StartedAt must survive an evidence change, got %v", got.Detecting.StartedAt)
+		}
+	})
+
+	t.Run("duration cap escalates even below the attempt count", func(t *testing.T) {
+		prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash}
+		got := CreateDetectingDecision(DetectingInput{Evidence: ev, ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration)})
+		if got.Status != domain.StatusStuck { // Now is exactly StartedAt+DetectingMaxDuration: the cap is inclusive
+			t.Fatalf("want stuck from duration cap, got %q", got.Status)
+		}
+	})
+
+	t.Run("duration cap fires even when evidence keeps flapping", func(t *testing.T) {
+		prior := &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: hash}
+		got := CreateDetectingDecision(DetectingInput{Evidence: "ever-changing", ProposedReason: domain.ReasonRuntimeLost, Prior: prior, Now: t0.Add(DetectingMaxDuration + time.Minute)})
+		if got.Status != domain.StatusStuck {
+			t.Fatalf("duration cap must override a reset counter, got %q", got.Status)
+		}
+	})
+}
+
+func TestProbeDetectingEscalationFlow(t *testing.T) {
+	// Feeding each tick's Detecting memory back in as Prior, an unchanging
+	// ambiguous probe must escalate to stuck on tick DetectingMaxAttempts.
+	in := ProbeInput{Runtime: domain.RuntimeMissing, Process: ProcessIndeterminate, Now: t0}
+	d := ResolveProbeDecision(in)
+	for i := 1; i < DetectingMaxAttempts; i++ {
+		if d.Status != domain.StatusDetecting {
+			t.Fatalf("tick %d: expected detecting, got %q", i, d.Status)
+		}
+		in.Prior = d.Detecting
+		in.Now = t0.Add(time.Duration(i) * time.Second)
+		d = ResolveProbeDecision(in)
+	}
+	if d.Status != domain.StatusStuck {
+		t.Fatalf("expected escalation to stuck after %d ticks, got %q", DetectingMaxAttempts, d.Status)
+	}
+}
+
+func TestHashEvidence(t *testing.T) {
+	t.Run("identical strings hash identically", func(t *testing.T) {
+		if HashEvidence("same input") != HashEvidence("same input") {
+			t.Error("identical evidence must hash equal")
+		}
+	})
+
+	t.Run("different evidence hashes differently", func(t *testing.T) {
+		if HashEvidence("runtime_lost") == HashEvidence("agent_process_exited") {
+			t.Error("distinct evidence must hash differently")
+		}
+	})
+
+	t.Run("only the timestamp differs -> equal hash", func(t *testing.T) {
+		a := "probe failed at 2026-05-26T12:00:00Z runtime=missing"
+		b := "probe failed at 2026-05-26T12:05:43.218Z runtime=missing"
+		if HashEvidence(a) != HashEvidence(b) {
+			t.Errorf("restamped evidence should hash equal:\n a=%q\n b=%q", a, b)
+		}
+	})
+
+	t.Run("bare time-of-day stripped", func(t *testing.T) {
+		if HashEvidence("idle since 12:00:00") != HashEvidence("idle since 13:30:59") {
+			t.Error("time-of-day differences should be stripped")
+		}
+	})
+
+	t.Run("unix epoch stripped", func(t *testing.T) {
+		if HashEvidence("last seen 1716724800") != HashEvidence("last seen 1716728400") { // both are 10-digit runs
+			t.Error("epoch differences should be stripped")
+		}
+	})
+
+	t.Run("a real content change still changes the hash", func(t *testing.T) {
+		a := "probe at 2026-05-26T12:00:00Z runtime=missing"
+		b := "probe at 2026-05-26T12:00:00Z runtime=alive"
+		if HashEvidence(a) == HashEvidence(b) {
+			t.Error("non-timestamp content change must change the hash")
+		}
+	})
+
+	t.Run("whitespace differences are normalised", func(t *testing.T) {
+		if HashEvidence("runtime=missing   process=dead") != HashEvidence("runtime=missing process=dead") { // three spaces vs one
+			t.Error("collapsed whitespace should hash equal")
+		}
+	})
+}
diff --git a/backend/internal/domain/decide/types.go b/backend/internal/domain/decide/types.go
new file mode 100644
index 00000000..7ac4adf1
--- /dev/null
+++ b/backend/internal/domain/decide/types.go
@@ -0,0 +1,76 @@
+package decide
+
+import (
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+)
+
+// LifecycleDecision is the output of every decider: the derived display status
+// plus the canonical sub-state values to persist, the human-readable evidence,
+// and the (possibly updated) detecting memory.
+//
+// Zero-value sub-state fields mean "this decider does not address that
+// sub-state — leave it unchanged", NOT "set it to the empty value". SessionState
+// is always populated, but the probe/detecting/kill paths legitimately leave
+// PRState/PRReason empty: a liveness verdict knows nothing about the PR. When
+// the LCM turns a decision into a LifecyclePatch it must therefore map an empty
+// PRState to a nil patch.PR (left untouched) rather than writing it through —
+// writing PRNone on a routine probe tick would clobber a live PR. Detecting is
+// nil-by-default for the same reason; see LifecyclePatch's three-way
+// Detecting/ClearDetecting semantics.
+type LifecycleDecision struct {
+	Status        domain.SessionStatus
+	Evidence      string
+	Detecting     *domain.DetectingState
+	SessionState  domain.SessionState
+	SessionReason domain.SessionReason
+	PRState       domain.PRState
+	PRReason      domain.PRReason
+}
+
+// ProbeInput reconciles runtime + process liveness. A *failed* probe (timeout
+// or error) is distinct from a "dead" verdict and must route to detecting,
+// never to a death conclusion. KillRequested short-circuits to terminal.
+type ProbeInput struct {
+	Runtime        domain.RuntimeState
+	RuntimeFailed  bool
+	Process        ProcessLiveness
+	ProcessFailed  bool
+	RecentActivity bool
+	KillRequested  bool
+	Prior          *domain.DetectingState
+	Now            time.Time
+}
+
+// ProcessLiveness mirrors isProcessRunning's three-valued answer.
+type ProcessLiveness string
+
+const (
+	ProcessAlive         ProcessLiveness = "alive"
+	ProcessDead          ProcessLiveness = "dead"
+	ProcessIndeterminate ProcessLiveness = "indeterminate"
+)
+
+// OpenPRInput drives the PR pipeline ladder for an open PR.
+type OpenPRInput struct {
+	CIFailing        bool
+	ChangesRequested bool
+	Approved         bool
+	Mergeable        bool
+	ReviewPending    bool
+	IdleBeyond       bool // idle past the stuck threshold
+	Number           int
+	URL              string
+}
+
+// DetectingInput feeds the quarantine counter. Evidence is hashed with
+// timestamps stripped, so "same ambiguous signal" keeps the counter climbing
+// while any real change resets it.
+type DetectingInput struct {
+	Evidence       string
+	ProposedState  domain.SessionState
+	ProposedReason domain.SessionReason
+	Prior          *domain.DetectingState
+	Now            time.Time
+}
diff --git a/backend/internal/domain/lifecycle.go b/backend/internal/domain/lifecycle.go
new file mode 100644
index 00000000..567a4769
--- /dev/null
+++ b/backend/internal/domain/lifecycle.go
@@ -0,0 +1,191 @@
+// Package domain holds the shared contract types for the LCM + Session Manager
+// lane: the canonical session state model, the derived display status, and the
+// session read-model. It has no behaviour beyond pure derivation (status.go)
+// and imports nothing outside the standard library, so every other package can
+// depend on it without creating cycles.
+package domain
+
+import "time"
+
+// LifecycleVersion is the schema version stamped onto every persisted record.
+// Greenfield: we start at 1 and carry no migration/synthesis code.
+const LifecycleVersion = 1
+
+// CanonicalSessionLifecycle is the ONLY thing persisted for a session's state.
+// The display status is derived from it on read (see DeriveLegacyStatus) and is
+// never stored — this prevents canonical truth and display from drifting.
+//
+// Three orthogonal (state, reason) sub-states describe the session, its PR, and
+// its runtime. Activity and Detecting are decider *inputs* that must survive
+// between observations (they are read back by the pure decide core), so they
+// live in the persisted record too.
+type CanonicalSessionLifecycle struct {
+	// Version is the schema version of this record's shape (LifecycleVersion).
+	Version int `json:"version"`
+	// Revision is a monotonic counter the store bumps on every write. It is used
+	// for optimistic-concurrency checks (LifecyclePatch.ExpectedRevision) and is
+	// distinct from the schema Version above.
+	Revision int             `json:"revision"`
+	Session  SessionSubstate `json:"session"`
+	PR       PRSubstate      `json:"pr"`
+	Runtime  RuntimeSubstate `json:"runtime"`
+
+	// Activity is the last-known agent activity. It arrives on a different
+	// cadence (ApplyActivitySignal) than runtime probes (the reaper), so the
+	// probe decider reads it from here to answer "was there recent activity?".
+	Activity ActivitySubstate `json:"activity"`
+
+	// Detecting is the anti-flap quarantine memory. It is non-nil only while
+	// the session is in the detecting state; it carries the attempt counter,
+	// the first-entry time, and a hash of the (timestamp-stripped) evidence so
+	// the decider can tell "same ambiguous signal N times" from "signal moved".
+	Detecting *DetectingState `json:"detecting,omitempty"`
+}
+
+// ---- session sub-state ----
+
+type SessionState string
+
+const (
+	SessionNotStarted SessionState = "not_started"
+	SessionWorking    SessionState = "working"
+	SessionIdle       SessionState = "idle"
+	SessionNeedsInput SessionState = "needs_input"
+	SessionStuck      SessionState = "stuck"
+	SessionDetecting  SessionState = "detecting"
+	SessionDone       SessionState = "done"
+	SessionTerminated SessionState = "terminated"
+)
+
+type SessionReason string
+
+const (
+	ReasonSpawnRequested          SessionReason = "spawn_requested"
+	ReasonAgentAcknowledged       SessionReason = "agent_acknowledged"
+	ReasonTaskInProgress          SessionReason = "task_in_progress"
+	ReasonPRCreated               SessionReason = "pr_created"
+	ReasonFixingCI                SessionReason = "fixing_ci"
+	ReasonResolvingReviewComments SessionReason = "resolving_review_comments"
+	ReasonAwaitingUserInput       SessionReason = "awaiting_user_input"
+	ReasonAwaitingExternalReview  SessionReason = "awaiting_external_review"
+	ReasonResearchComplete        SessionReason = "research_complete"
+	ReasonMergedWaitingDecision   SessionReason = "merged_waiting_decision"
+	ReasonManuallyKilled          SessionReason = "manually_killed"
+	ReasonPRMerged                SessionReason = "pr_merged"
+	ReasonAutoCleanup             SessionReason = "auto_cleanup"
+	ReasonRuntimeLost             SessionReason = "runtime_lost"
+	ReasonAgentProcessExited      SessionReason = "agent_process_exited"
+	ReasonProbeFailure            SessionReason = "probe_failure"
+	ReasonErrorInProcess          SessionReason = "error_in_process"
+)
+
+type SessionSubstate struct {
+	State  SessionState  `json:"state"`
+	Reason SessionReason `json:"reason"`
+}
+
+// ---- PR sub-state ----
+
+type PRState string
+
+const (
+	PRNone   PRState = "none"
+	PROpen   PRState = "open"
+	PRMerged PRState = "merged"
+	PRClosed PRState = "closed"
+)
+
+type PRReason string
+
+const (
+	PRReasonNotCreated       PRReason = "not_created"
+	PRReasonInProgress       PRReason = "in_progress"
+	PRReasonCIFailing        PRReason = "ci_failing"
+	PRReasonReviewPending    PRReason = "review_pending"
+	PRReasonChangesRequested PRReason = "changes_requested"
+	PRReasonApproved         PRReason = "approved"
+	PRReasonMergeReady       PRReason = "merge_ready"
+	PRReasonMerged           PRReason = "merged"
+	PRReasonClosedUnmerged   PRReason = "closed_unmerged"
+	PRReasonClearedOnRestore PRReason = "cleared_on_restore"
+)
+
+type PRSubstate struct {
+	State  PRState  `json:"state"`
+	Reason PRReason `json:"reason"`
+	Number int      `json:"number,omitempty"`
+	URL    string   `json:"url,omitempty"`
+}
+
+// ---- runtime sub-state ----
+
+type RuntimeState string
+
+const (
+	RuntimeUnknown     RuntimeState = "unknown"
+	RuntimeAlive       RuntimeState = "alive"
+	RuntimeExited      RuntimeState = "exited"
+	RuntimeMissing     RuntimeState = "missing"
+	RuntimeProbeFailed RuntimeState = "probe_failed"
+)
+
+type RuntimeReason string
+
+const (
+	RuntimeReasonSpawnIncomplete     RuntimeReason = "spawn_incomplete"
+	RuntimeReasonProcessRunning      RuntimeReason = "process_running"
+	RuntimeReasonProcessMissing      RuntimeReason = "process_missing"
+	RuntimeReasonTmuxMissing         RuntimeReason = "tmux_missing"
+	RuntimeReasonManualKillRequested RuntimeReason = "manual_kill_requested"
+	RuntimeReasonPRMergedCleanup     RuntimeReason = "pr_merged_cleanup"
+	RuntimeReasonAutoCleanup         RuntimeReason = "auto_cleanup"
+	RuntimeReasonProbeError          RuntimeReason = "probe_error"
+)
+
+type RuntimeSubstate struct {
+	State  RuntimeState  `json:"state"`
+	Reason RuntimeReason `json:"reason"`
+}
+
+// ---- activity sub-state (decider input) ----
+
+type ActivityState string
+
+const (
+	ActivityActive       ActivityState = "active"
+	ActivityReady        ActivityState = "ready"
+	ActivityIdle         ActivityState = "idle"
+	ActivityWaitingInput ActivityState = "waiting_input" // sticky: does not decay by wallclock
+	ActivityBlocked      ActivityState = "blocked"       // sticky: does not decay by wallclock
+	ActivityExited       ActivityState = "exited"
+)
+
+// IsSticky reports whether a is one of the two sticky states (waiting_input or
+// blocked), i.e. a state that must NOT be aged/demoted merely because time passed.
+func (a ActivityState) IsSticky() bool {
+	return a == ActivityWaitingInput || a == ActivityBlocked
+}
+
+type ActivitySource string
+
+const (
+	SourceNative   ActivitySource = "native"
+	SourceTerminal ActivitySource = "terminal"
+	SourceHook     ActivitySource = "hook"
+	SourceRuntime  ActivitySource = "runtime"
+	SourceNone     ActivitySource = "none"
+)
+
+type ActivitySubstate struct {
+	State          ActivityState  `json:"state"`
+	LastActivityAt time.Time      `json:"lastActivityAt"`
+	Source         ActivitySource `json:"source"`
+}
+
+// ---- detecting quarantine memory (decider input) ----
+
+type DetectingState struct {
+	Attempts     int       `json:"attempts"`
+	StartedAt    time.Time `json:"startedAt"`
+	EvidenceHash string    `json:"evidenceHash"`
+}
diff --git a/backend/internal/domain/session.go b/backend/internal/domain/session.go
new file mode 100644
index 00000000..578cca40
--- /dev/null
+++ b/backend/internal/domain/session.go
@@ -0,0 +1,42 @@
+package domain
+
+import "time"
+
+// SessionID, ProjectID, IssueID are distinct string types so they can't be
+// swapped at a call site by accident.
+type (
+	SessionID string
+	ProjectID string
+	IssueID   string
+)
+
+type SessionKind string
+
+const (
+	KindWorker       SessionKind = "worker"
+	KindOrchestrator SessionKind = "orchestrator"
+)
+
+// SessionRecord is the PERSISTENCE shape: identity, canonical lifecycle, and
+// metadata — everything the store holds, and nothing derived. The store reads
+// and writes records; it never produces the derived display status.
+type SessionRecord struct {
+	ID        SessionID                 `json:"id"`
+	ProjectID ProjectID                 `json:"projectId"`
+	IssueID   IssueID                   `json:"issueId,omitempty"`
+	Kind      SessionKind               `json:"kind"`
+	Lifecycle CanonicalSessionLifecycle `json:"lifecycle"`
+	Metadata  map[string]string         `json:"metadata,omitempty"`
+	CreatedAt time.Time                 `json:"createdAt"`
+	UpdatedAt time.Time                 `json:"updatedAt"`
+}
+
+// Session is the read-model returned across the API boundary (to controllers,
+// then the frontend): a SessionRecord plus the DERIVED display Status. The
+// Session Manager is the single producer of Status — it builds a Session from a
+// stored SessionRecord by calling DeriveLegacyStatus, so the store and API
+// never recompute (or accidentally persist) it.
+type Session struct {
+	SessionRecord
+	Status SessionStatus `json:"status"`
+}
diff --git a/backend/internal/domain/status.go b/backend/internal/domain/status.go
new file mode 100644
index 00000000..b12b2b9f
--- /dev/null
+++ b/backend/internal/domain/status.go
@@ -0,0 +1,100 @@
+package domain
+
+// SessionStatus is the single-word DISPLAY status the dashboard renders. It is
+// derived from the canonical lifecycle on read and never persisted.
+type SessionStatus string
+
+const (
+	StatusSpawning         SessionStatus = "spawning"
+	StatusWorking          SessionStatus = "working"
+	StatusDetecting        SessionStatus = "detecting"
+	StatusPROpen           SessionStatus = "pr_open"
+	StatusCIFailed         SessionStatus = "ci_failed"
+	StatusReviewPending    SessionStatus = "review_pending"
+	StatusChangesRequested SessionStatus = "changes_requested"
+	StatusApproved         SessionStatus = "approved"
+	StatusMergeable        SessionStatus = "mergeable"
+	StatusMerged           SessionStatus = "merged"
+	StatusCleanup          SessionStatus = "cleanup"
+	StatusNeedsInput       SessionStatus = "needs_input"
+	StatusStuck            SessionStatus = "stuck"
+	StatusErrored          SessionStatus = "errored"
+	StatusKilled           SessionStatus = "killed"
+	StatusIdle             SessionStatus = "idle"
+	StatusDone             SessionStatus = "done"
+	StatusTerminated       SessionStatus = "terminated"
+)
+
+// DeriveLegacyStatus computes the display status from the canonical record. It is
+// the ONLY producer of that status and must remain a pure, total function.
+//
+// Precedence, highest first:
+//  1. Hard session states (not_started, detecting, stuck, needs_input,
+//     terminated, done) map directly and OUTRANK every PR fact.
+//  2. A merged PR.
+//  3. An open PR, mapped by its reason.
+//  4. The SOFT session state: idle, otherwise working.
+//
+// "PR facts dominate session facts" therefore holds only for the soft states: an
+// idle or working session whose open PR is failing CI displays as ci_failed, while
+// a stuck or needs_input session displays as such whatever the PR state, because
+// it needs a human either way.
+func DeriveLegacyStatus(l CanonicalSessionLifecycle) SessionStatus {
+	// Hard session states are settled before any PR fact is consulted.
+	switch l.Session.State {
+	case SessionNotStarted:
+		return StatusSpawning
+	case SessionDetecting:
+		return StatusDetecting
+	case SessionStuck:
+		return StatusStuck
+	case SessionNeedsInput:
+		return StatusNeedsInput
+	case SessionTerminated:
+		return terminatedStatus(l.Session.Reason)
+	case SessionDone:
+		return StatusDone
+	}
+
+	switch l.PR.State {
+	case PRMerged:
+		return StatusMerged
+	case PROpen:
+		return openPRStatus(l.PR.Reason)
+	}
+
+	if l.Session.State == SessionIdle {
+		return StatusIdle
+	}
+	return StatusWorking
+}
+
+func terminatedStatus(r SessionReason) SessionStatus {
+	status := StatusTerminated // reasons not listed below stay plain "terminated"
+	switch r {
+	case ReasonManuallyKilled, ReasonRuntimeLost, ReasonAgentProcessExited:
+		status = StatusKilled
+	case ReasonAutoCleanup, ReasonPRMerged:
+		status = StatusCleanup
+	case ReasonErrorInProcess, ReasonProbeFailure:
+		status = StatusErrored
+	}
+	return status
+}
+
+func openPRStatus(r PRReason) SessionStatus {
+	status := StatusPROpen // any reason without its own display status
+	switch r {
+	case PRReasonCIFailing:
+		status = StatusCIFailed
+	case PRReasonChangesRequested:
+		status = StatusChangesRequested
+	case PRReasonApproved:
+		status = StatusApproved
+	case PRReasonMergeReady:
+		status = StatusMergeable
+	case PRReasonReviewPending:
+		status = StatusReviewPending
+	}
+	return status
+}
diff --git a/backend/internal/domain/status_test.go b/backend/internal/domain/status_test.go
new file mode 100644
index 00000000..12b0ade0
--- /dev/null
+++ b/backend/internal/domain/status_test.go
@@ -0,0 +1,87 @@
+package domain
+
+import "testing"
+
+// TestDeriveLegacyStatus pins the display-status derivation, including the rule
+// that hard session states outrank PR facts and the open-PR fallback to pr_open.
+func TestDeriveLegacyStatus(t *testing.T) {
+	tests := []struct {
+		name string
+		in   CanonicalSessionLifecycle
+		want SessionStatus
+	}{
+		{
+			name: "not_started maps to spawning",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNotStarted, Reason: ReasonSpawnRequested}},
+			want: StatusSpawning,
+		},
+		{
+			name: "terminated+manually_killed maps to killed",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonManuallyKilled}},
+			want: StatusKilled,
+		},
+		{
+			name: "terminated+auto_cleanup maps to cleanup",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonAutoCleanup}},
+			want: StatusCleanup,
+		},
+		{
+			name: "terminated+error maps to errored",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionTerminated, Reason: ReasonErrorInProcess}},
+			want: StatusErrored,
+		},
+		{
+			name: "hard state needs_input maps directly",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionNeedsInput}},
+			want: StatusNeedsInput,
+		},
+		{
+			name: "hard state stuck outranks an open PR with failing CI",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionStuck}, PR: PRSubstate{State: PROpen, Reason: PRReasonCIFailing}},
+			want: StatusStuck,
+		},
+		{
+			name: "merged PR dominates an idle session",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionIdle}, PR: PRSubstate{State: PRMerged}},
+			want: StatusMerged,
+		},
+		{
+			name: "open PR with failing CI dominates idle session",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionIdle}, PR: PRSubstate{State: PROpen, Reason: PRReasonCIFailing}},
+			want: StatusCIFailed,
+		},
+		{
+			name: "open PR approved",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}, PR: PRSubstate{State: PROpen, Reason: PRReasonApproved}},
+			want: StatusApproved,
+		},
+		{
+			name: "open PR merge_ready maps to mergeable",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}, PR: PRSubstate{State: PROpen, Reason: PRReasonMergeReady}},
+			want: StatusMergeable,
+		},
+		{
+			name: "open PR with an unmapped reason maps to pr_open",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}, PR: PRSubstate{State: PROpen, Reason: PRReasonInProgress}},
+			want: StatusPROpen,
+		},
+		{
+			name: "no PR falls through to idle",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionIdle}},
+			want: StatusIdle,
+		},
+		{
+			name: "no PR falls through to working",
+			in:   CanonicalSessionLifecycle{Session: SessionSubstate{State: SessionWorking}},
+			want: StatusWorking,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := DeriveLegacyStatus(tt.in); got != tt.want {
+				t.Errorf("DeriveLegacyStatus() = %q, want %q", got, tt.want)
+			}
+		})
+	}
+}
diff --git a/backend/internal/lifecycle/decide_bridge.go b/backend/internal/lifecycle/decide_bridge.go
new file mode 100644
index 00000000..942fdad4
--- /dev/null
+++ b/backend/internal/lifecycle/decide_bridge.go
@@ -0,0 +1,227 @@
+package lifecycle
+
+import (
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain/decide"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// defaultRecentActivityWindow is how fresh the last activity signal must be for
+// the probe decider to treat the agent as "recently active" (which keeps an
+// ambiguous dead-runtime probe in detecting instead of concluding death).
+const defaultRecentActivityWindow = 60 * time.Second
+
+// ---- fact translation: ports DTOs -> pure decide inputs ----
+
+// runtimeFactsToProbeInput builds the probe decider's input from a raw
+// RuntimeFacts together with two values read back from the canonical record: the
+// prior detecting memory and the last-known activity. When the facts carry no
+// observation time, the current time is used instead. KillRequested is always
+// false here: the inferred-death path never carries an explicit kill — that
+// arrives via OnKillRequested.
+func runtimeFactsToProbeInput(f ports.RuntimeFacts, cur domain.CanonicalSessionLifecycle, window time.Duration) decide.ProbeInput {
+	observedAt := nowOr(f.ObservedAt)
+	in := decide.ProbeInput{
+		RecentActivity: hasRecentActivity(cur.Activity, observedAt, window),
+		Prior:          cur.Detecting,
+		Now:            observedAt,
+	}
+	// Each translator yields the mapped value plus its probe-failed flag.
+	in.Runtime, in.RuntimeFailed = runtimeProbeToState(f.RuntimeState)
+	in.Process, in.ProcessFailed = processProbeToLiveness(f.ProcessState)
+	return in
+}
+
+func runtimeProbeToState(p ports.RuntimeProbe) (domain.RuntimeState, bool) {
+	state, failed := domain.RuntimeUnknown, false // indeterminate / unset: ambiguous, never a death conclusion
+	switch p {
+	case ports.RuntimeProbeAlive:
+		state = domain.RuntimeAlive
+	case ports.RuntimeProbeDead:
+		state = domain.RuntimeExited
+	case ports.RuntimeProbeFailed:
+		state, failed = domain.RuntimeProbeFailed, true
+	}
+	return state, failed
+}
+
+// processProbeToLiveness maps a process probe result to the decider's liveness
+// value plus a probe-failed flag. Everything except alive and dead (failed,
+// indeterminate, unset) is indeterminate; only a failed probe sets the flag.
+func processProbeToLiveness(p ports.ProcessProbe) (decide.ProcessLiveness, bool) {
+	switch p {
+	case ports.ProcessProbeAlive:
+		return decide.ProcessAlive, false
+	case ports.ProcessProbeDead:
+		return decide.ProcessDead, false
+	}
+	return decide.ProcessIndeterminate, p == ports.ProcessProbeFailed
+}
+
+// runtimeSubstateFromFacts derives the runtime sub-state to persist from a raw
+// observation. Liveness always owns this axis, so the result is written on every
+// runtime observation regardless of what the session axis does.
+func runtimeSubstateFromFacts(f ports.RuntimeFacts) domain.RuntimeSubstate {
+	var sub domain.RuntimeSubstate
+	switch f.RuntimeState {
+	case ports.RuntimeProbeAlive:
+		sub.State, sub.Reason = domain.RuntimeAlive, domain.RuntimeReasonProcessRunning
+	case ports.RuntimeProbeDead:
+		sub.State, sub.Reason = domain.RuntimeExited, domain.RuntimeReasonTmuxMissing
+	case ports.RuntimeProbeFailed:
+		sub.State, sub.Reason = domain.RuntimeProbeFailed, domain.RuntimeReasonProbeError
+	default:
+		// Indeterminate (probe ran but couldn't tell) or unset: not a probe error, so
+		// no probe_error reason — RuntimeUnknown alone carries the ambiguity.
+		sub.State = domain.RuntimeUnknown
+	}
+	return sub
+}
+
+// hasRecentActivity answers the probe decider's question "has the agent been
+// heard from recently?". An explicit exited signal never counts. The sticky
+// states (waiting_input/blocked) always count, since they describe a live but
+// paused agent. Anything else counts only if its timestamp is within the window.
+func hasRecentActivity(a domain.ActivitySubstate, now time.Time, window time.Duration) bool {
+	switch {
+	case a.State == domain.ActivityExited:
+		return false
+	case a.State.IsSticky():
+		return true
+	case a.LastActivityAt.IsZero():
+		return false
+	default:
+		return now.Sub(a.LastActivityAt) <= window
+	}
+}
+
+// openPRInput maps SCM facts onto the open-PR ladder. IdleBeyond is left false in
+// split A: the idle-duration signal belongs to the escalation engine (split B).
+func openPRInput(f ports.SCMFacts) decide.OpenPRInput {
+	review := f.ReviewDecision
+	return decide.OpenPRInput{
+		CIFailing:        f.CISummary == ports.CIFailing,
+		ChangesRequested: review == ports.ReviewChangesRequested,
+		Approved:         review == ports.ReviewApproved,
+		Mergeable:        f.Mergeability.Mergeable,
+		ReviewPending:    review == ports.ReviewPending,
+		Number:           f.PRNumber,
+		URL:              f.PRURL,
+	}
+}
+
+// ---- activity -> session axis mapping (activity owns working/idle/waiting) ----
+
+// activityToSession maps an activity classification onto the session sub-state.
+// For exited (or an unset value) it returns ok=false: an exit signal must NOT write
+// a terminal session state — only the probe pipeline (via detecting) may infer death.
+func activityToSession(a domain.ActivityState) (domain.SessionState, domain.SessionReason, bool) {
+	// Baseline is idle with no reason; the cases below override what differs.
+	state, reason := domain.SessionIdle, domain.SessionReason("")
+	switch a {
+	case domain.ActivityActive:
+		state, reason = domain.SessionWorking, domain.ReasonTaskInProgress
+	case domain.ActivityReady:
+		reason = domain.ReasonResearchComplete // finished a unit, waiting for more work
+	case domain.ActivityIdle:
+		// plain inactivity makes no completion claim, so the reason stays empty
+	case domain.ActivityWaitingInput:
+		state, reason = domain.SessionNeedsInput, domain.ReasonAwaitingUserInput
+	case domain.ActivityBlocked:
+		state, reason = domain.SessionStuck, domain.ReasonAwaitingUserInput
+	default: // exited / unset
+		return "", "", false
+	}
+	return state, reason, true
+}
+
+// ---- composition predicates: who may write the session axis ----
+
+// isTerminal reports whether s is done or terminated: a final session state that an
+// observation must never resurrect (only an explicit Restore reopens such a session).
+func isTerminal(s domain.SessionState) bool {
+	return s == domain.SessionDone || s == domain.SessionTerminated
+}
+
+// isLivenessOwned reports whether the current session sub-state was written by
+// the liveness/death axis (the probe pipeline), which means a later healthy probe
+// may recover it. Detecting always qualifies; any other state qualifies only when
+// its reason records a death inference (runtime lost, process exited, probe failure).
+func isLivenessOwned(s domain.SessionSubstate) bool {
+	// Detecting is liveness-owned whatever its reason.
+	if s.State == domain.SessionDetecting {
+		return true
+	}
+	// Otherwise ownership is decided by the reason alone.
+	return s.Reason == domain.ReasonRuntimeLost ||
+		s.Reason == domain.ReasonAgentProcessExited ||
+		s.Reason == domain.ReasonProbeFailure
+}
+
+// shouldWriteSessionRuntime is the #1 composition rule for ApplyRuntimeObservation.
+// A death-axis verdict (detecting/stuck/terminal) writes to any non-terminal
+// session — it overrides activity because a (maybe) dead agent can't be working or
+// waiting. A healthy "working" verdict writes only when it recovers a
+// liveness-owned state (e.g. detecting -> working); it must NOT clobber the
+// needs_input/stuck/idle states that the activity axis is responsible for.
+func shouldWriteSessionRuntime(d decide.LifecycleDecision, cur domain.CanonicalSessionLifecycle) bool {
+	switch {
+	case isTerminal(cur.Session.State):
+		// Only an explicit Restore reopens a terminal session; no observation, not
+		// even a death-axis verdict, may. The runtime axis is patched separately.
+		return false
+	case d.SessionState == domain.SessionWorking:
+		return isLivenessOwned(cur.Session)
+	default:
+		return true
+	}
+}
+
+// shouldWriteSessionActivity is the mirror rule for ApplyActivitySignal: the
+// activity axis owns working/idle/waiting. A valid activity signal is direct proof
+// of life, so it may RESOLVE a detecting session (pull it out of the liveness
+// quarantine). It must not resurrect a terminal session, and it leaves a
+// liveness-escalated stuck state to the probe pipeline, because stuck is a
+// deliberate human-facing escalation rather than a transient quarantine.
+func shouldWriteSessionActivity(cur domain.CanonicalSessionLifecycle) bool {
+	session := cur.Session
+	if isTerminal(session.State) {
+		return false
+	}
+	// Detecting is itself liveness-owned, so it needs the explicit exemption to
+	// be resolvable by an activity signal.
+	return session.State == domain.SessionDetecting || !isLivenessOwned(session)
+}
+
+// ---- explicit-kill mapping (SM's terminal-write authority) ----
+
+func killSession(k ports.LifecycleKillReason) domain.SessionSubstate {
+	reason := domain.ReasonErrorInProcess // every other kill reason is treated as an error
+	switch k {
+	case ports.KillManual:
+		reason = domain.ReasonManuallyKilled
+	case ports.KillCleanup:
+		reason = domain.ReasonAutoCleanup
+	}
+	return domain.SessionSubstate{State: domain.SessionTerminated, Reason: reason}
+}
+
+func killRuntime(k ports.LifecycleKillReason) domain.RuntimeSubstate {
+	reason := domain.RuntimeReasonProbeError // every other kill reason is treated as an error
+	switch k {
+	case ports.KillManual:
+		reason = domain.RuntimeReasonManualKillRequested
+	case ports.KillCleanup:
+		reason = domain.RuntimeReasonAutoCleanup
+	}
+	return domain.RuntimeSubstate{State: domain.RuntimeExited, Reason: reason}
+}
+
+func nowOr(t time.Time) time.Time {
+	if !t.IsZero() {
+		return t
+	}
+	return time.Now()
+}
diff --git a/backend/internal/lifecycle/fakes_test.go b/backend/internal/lifecycle/fakes_test.go
new file mode 100644
index 00000000..cc47ad84
--- /dev/null
+++ b/backend/internal/lifecycle/fakes_test.go
@@ -0,0 +1,185 @@
+package lifecycle
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// fakeStore is an in-memory LifecycleStore that faithfully applies merge-patch
+// semantics (sparse field writes, the three-way Detecting/ClearDetecting rule,
+// ExpectedRevision optimistic-concurrency check, monotonic Revision bump) so
+// tests assert against the real persisted canonical.
+type fakeStore struct {
+	mu       sync.Mutex
+	records  map[domain.SessionID]*domain.SessionRecord
+	metadata map[domain.SessionID]map[string]string
+}
+
+var _ ports.LifecycleStore = (*fakeStore)(nil)
+
+func newFakeStore() *fakeStore {
+	store := new(fakeStore)
+	store.records = make(map[domain.SessionID]*domain.SessionRecord)
+	store.metadata = make(map[domain.SessionID]map[string]string)
+	return store
+}
+
+// seed overwrites the record for id with lifecycle l, bypassing the patch path.
+func (s *fakeStore) seed(id domain.SessionID, l domain.CanonicalSessionLifecycle) {
+	if l.Version == 0 {
+		l.Version = domain.LifecycleVersion
+	}
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	s.records[id] = &domain.SessionRecord{ID: id, Lifecycle: l}
+}
+
+// Load returns the stored lifecycle for id; the bool is false when id is unknown.
+func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if rec, found := s.records[id]; found {
+		return rec.Lifecycle, true, nil
+	}
+	return domain.CanonicalSessionLifecycle{}, false, nil
+}
+
+// PatchLifecycle merges the non-nil fields of p into the session's lifecycle, creating
+// the record if absent. A revision mismatch is rejected before anything is stored.
+func (s *fakeStore) PatchLifecycle(_ context.Context, id domain.SessionID, p ports.LifecyclePatch) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	rec, ok := s.records[id]
+	if !ok {
+		rec = &domain.SessionRecord{ID: id, Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion}}
+	}
+	l := &rec.Lifecycle
+	if p.ExpectedRevision != nil && *p.ExpectedRevision != l.Revision {
+		return fmt.Errorf("revision mismatch for %s: have %d, expected %d", id, l.Revision, *p.ExpectedRevision)
+	}
+	// Register a new record only now, so a rejected patch leaves no phantom session.
+	s.records[id] = rec
+
+	if p.Session != nil {
+		l.Session = *p.Session
+	}
+	if p.PR != nil {
+		l.PR = *p.PR
+	}
+	if p.Runtime != nil {
+		l.Runtime = *p.Runtime
+	}
+	if p.Activity != nil {
+		l.Activity = *p.Activity
+	}
+	switch {
+	case p.ClearDetecting:
+		l.Detecting = nil
+	case p.Detecting != nil:
+		d := *p.Detecting
+		l.Detecting = &d
+	}
+	l.Version = domain.LifecycleVersion
+	l.Revision++
+	rec.UpdatedAt = time.Now()
+	return nil
+}
+
+// Seed stores rec as a new session and fails if a record with that ID already exists.
+func (s *fakeStore) Seed(_ context.Context, rec domain.SessionRecord) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if _, exists := s.records[rec.ID]; exists {
+		return fmt.Errorf("seed: session %s already exists", rec.ID)
+	}
+	if rec.Lifecycle.Version == 0 {
+		rec.Lifecycle.Version = domain.LifecycleVersion
+	}
+	s.records[rec.ID] = &rec
+	return nil
+}
+
+// Get returns a shallow copy of the record for id; the bool is false when id is unknown.
+func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if rec, found := s.records[id]; found {
+		return *rec, true, nil
+	}
+	return domain.SessionRecord{}, false, nil
+}
+
+func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	var out []domain.SessionRecord // stays nil when nothing matches; order follows map iteration
+	for _, rec := range s.records {
+		if rec.ProjectID == project {
+			out = append(out, *rec)
+		}
+	}
+	return out, nil
+}
+
+func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	snapshot := make(map[string]string, len(s.metadata[id]))
+	for key, val := range s.metadata[id] {
+		snapshot[key] = val
+	}
+	return snapshot, nil
+}
+
+func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.metadata[id] == nil {
+		s.metadata[id] = make(map[string]string, len(kv))
+	}
+	for key, val := range kv {
+		s.metadata[id][key] = val
+	}
+	return nil
+}
+
+// recordingNotifier captures emitted events for assertions.
+type recordingNotifier struct {
+	mu     sync.Mutex
+	events []ports.OrchestratorEvent
+}
+
+var _ ports.Notifier = (*recordingNotifier)(nil)
+
+func (n *recordingNotifier) Notify(_ context.Context, e ports.OrchestratorEvent) error {
+	n.mu.Lock()
+	n.events = append(n.events, e)
+	n.mu.Unlock()
+	return nil
+}
+
+// recordingMessenger captures messages injected into agents.
+type recordingMessenger struct {
+	mu   sync.Mutex
+	sent []struct {
+		ID      domain.SessionID
+		Message string
+	}
+}
+
+var _ ports.AgentMessenger = (*recordingMessenger)(nil)
+
+func (a *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error {
+	a.mu.Lock()
+	defer a.mu.Unlock()
+	a.sent = append(a.sent, struct { // anonymous struct literal: same field layout as the sent slice's element type
+		ID      domain.SessionID
+		Message string
+	}{id, message})
+	return nil
+}
diff --git a/backend/internal/lifecycle/manager.go b/backend/internal/lifecycle/manager.go
new file mode 100644
index 00000000..2581fea0
--- /dev/null
+++ b/backend/internal/lifecycle/manager.go
@@ -0,0 +1,423 @@
+// Package lifecycle implements ports.LifecycleManager: the synchronous
+// observe->decide->persist reducer. Every Apply*/On* entrypoint runs the same
+// pipeline under a per-session lock — load canonical, run the matching pure
+// decider, diff the result into a sparse merge-patch, persist. The LCM never
+// polls and never writes the display status (that is derived on read).
+//
+// After a transition is persisted, the Apply* paths fire the mapped reaction
+// (the ACT layer: reaction table + escalation engine) via the react() chokepoint
+// in reactions.go. The Session Manager lands in a later split.
+package lifecycle
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain/decide"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// Metadata keys OnSpawnCompleted records for the spawned session's handles.
+const (
+	MetaBranch          = "branch"          // SpawnOutcome.Branch
+	MetaWorkspacePath   = "workspacePath"   // SpawnOutcome.WorkspacePath
+	MetaRuntimeHandleID = "runtimeHandleId" // SpawnOutcome.RuntimeHandle.ID
+	MetaRuntimeName     = "runtimeName"     // SpawnOutcome.RuntimeHandle.RuntimeName
+	MetaAgentSessionID  = "agentSessionId"  // SpawnOutcome.AgentSessionID
+)
+
+// Manager is the LCM (lifecycle manager). Its Apply* pipeline persists a transition
+// and then fires the mapped reaction via Notifier/AgentMessenger (see reactions.go).
+type Manager struct {
+	store     ports.LifecycleStore // canonical lifecycle + metadata persistence (Load/PatchLifecycle/PatchMetadata)
+	notifier  ports.Notifier
+	messenger ports.AgentMessenger
+
+	recentActivityWindow time.Duration // handed to the probe decider's input builder
+	locks                keyedMutex    // per-session serialisation used by withLock
+
+	// trackers hold per-(session,reaction) escalation budgets (ACT policy, not
+	// canonical state). trackerMu guards them: react() touches them from the
+	// caller's goroutine, TickEscalations from the reaper's. clock is the time
+	// source for escalation stamping (overridable in tests).
+	trackers  map[trackerKey]*reactionTracker
+	trackerMu sync.Mutex
+	clock     func() time.Time
+}
+
+var _ ports.LifecycleManager = (*Manager)(nil)
+
+func New(store ports.LifecycleStore, notifier ports.Notifier, messenger ports.AgentMessenger) *Manager {
+	return &Manager{
+		store:                store,
+		notifier:             notifier,
+		messenger:            messenger,
+		recentActivityWindow: defaultRecentActivityWindow,
+		trackers:             map[trackerKey]*reactionTracker{},
+		clock:                time.Now, // locks is left at its zero value; keyedMutex.lock allocates its map lazily
+	}
+}
+
+// ---- per-session serialisation ----
+
+// keyedMutex hands out one lock per session id so the load->decide->persist
+// read-modify-write is serial within a session but parallel across sessions.
+//
+// Entries are reference-counted and evicted when the last holder releases, so
+// the map stays bounded to sessions with in-flight operations rather than
+// growing unbounded over the lifetime of a long-running daemon.
+type keyedMutex struct {
+	mu    sync.Mutex // guards locks and every lockEntry.refs
+	locks map[domain.SessionID]*lockEntry
+}
+
+type lockEntry struct {
+	mu   sync.Mutex
+	refs int // holders plus waiters; only read or written while keyedMutex.mu is held
+}
+
+func (k *keyedMutex) lock(id domain.SessionID) func() {
+	k.mu.Lock()
+	if k.locks == nil {
+		k.locks = make(map[domain.SessionID]*lockEntry) // lazy init: the zero-value keyedMutex is usable
+	}
+	e, ok := k.locks[id]
+	if !ok {
+		e = &lockEntry{}
+		k.locks[id] = e
+	}
+	e.refs++ // counted before blocking on e.mu, so a waiter keeps the entry from being evicted
+	k.mu.Unlock()
+
+	e.mu.Lock() // may block; k.mu is no longer held here
+	return func() {
+		e.mu.Unlock()
+		k.mu.Lock()
+		e.refs--
+		if e.refs == 0 {
+			delete(k.locks, id) // nobody holds or waits on this entry any more: evict it
+		}
+		k.mu.Unlock()
+	}
+}
+
+func (m *Manager) withLock(id domain.SessionID, fn func() error) error {
+	release := m.locks.lock(id) // blocks until this session's lock is free
+	defer release()
+	return fn()
+}
+
+// transition describes one persisted write: the canonical lifecycle before and
+// after the patch. The ACT layer (react) derives the reaction from the pair.
+// mutate returns a nil *transition when the pipeline made no write.
+type transition struct {
+	beforeLC domain.CanonicalSessionLifecycle // as loaded, before the patch
+	afterLC  domain.CanonicalSessionLifecycle // re-loaded from the store after the patch
+}
+
+// mutate is the shared pipeline, run under the session lock: load -> decide ->
+// persist. decideFn gets the loaded canonical (and whether it exists) and
+// returns a sparse patch plus a "changed" flag; false is a clean no-op (no
+// write, no revision bump) — how failed-probe / unknown-fact inputs are dropped.
+//
+// After a write the canonical is re-loaded and returned, paired with the
+// pre-patch value, as a transition; the result is nil when nothing was written.
+func (m *Manager) mutate(
+	ctx context.Context,
+	id domain.SessionID,
+	decideFn func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error),
+) (*transition, error) {
+	var result *transition
+	lockErr := m.withLock(id, func() error {
+		before, found, err := m.store.Load(ctx, id)
+		if err != nil {
+			return err
+		}
+		delta, dirty, err := decideFn(before, found)
+		if err != nil {
+			return err
+		}
+		if !dirty {
+			return nil
+		}
+		if err := m.store.PatchLifecycle(ctx, id, delta); err != nil {
+			return err
+		}
+		reloaded, _, err := m.store.Load(ctx, id)
+		if err != nil {
+			return err
+		}
+		result = &transition{beforeLC: before, afterLC: reloaded}
+		return nil
+	})
+	return result, lockErr
+}
+
+// ---- OBSERVE entrypoints ----
+
+// ApplyRuntimeObservation runs the probe decider over runtime facts. The runtime
+// axis is always refreshed from the facts; the session axis follows the #1
+// composition rule; and a verdict without detecting memory clears any stale
+// memory (#3) so the next probe doesn't read a phantom prior.
+func (m *Manager) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error {
+	tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) {
+		if !exists {
+			return ports.LifecyclePatch{}, false, nil // stray probe for an unseeded session: ignore
+		}
+
+		verdict := decide.ResolveProbeDecision(runtimeFactsToProbeInput(f, cur, m.recentActivityWindow))
+
+		var delta ports.LifecyclePatch
+		dirty := false
+
+		if observed := runtimeSubstateFromFacts(f); observed != cur.Runtime {
+			delta.Runtime = &observed
+			dirty = true
+		}
+		// Only an explicit Restore reopens a terminal session, so for one an
+		// observation may refresh the runtime axis above but leaves both the
+		// session axis and the detecting memory alone.
+		if !isTerminal(cur.Session.State) {
+			if shouldWriteSessionRuntime(verdict, cur) {
+				dirty = setSessionIfChanged(&delta, cur, verdict.SessionState, verdict.SessionReason) || dirty
+			}
+			dirty = setDetecting(&delta, cur, verdict.Detecting) || dirty
+		}
+
+		return delta, dirty, nil
+	})
+	if err != nil {
+		return err
+	}
+	return m.react(ctx, id, tr, reactionContext{})
+}
+
+// ApplySCMObservation folds PR facts into the PR axis. Facts from a failed fetch
+// are ignored (a failed probe is not "no PR"). An open PR touches only the PR
+// sub-state: activity keeps owning the session axis and DeriveLegacyStatus
+// surfaces the PR reason for display. A merged/closed PR also parks the session.
+func (m *Manager) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error {
+	tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) {
+		if !(exists && f.Fetched) {
+			return ports.LifecyclePatch{}, false, nil
+		}
+
+		switch f.PRState {
+		case domain.PROpen:
+			decision := decide.ResolveOpenPRDecision(openPRInput(f))
+			var delta ports.LifecyclePatch
+			dirty := setPRIfChanged(&delta, cur, decision, f)
+			return delta, dirty, nil
+
+		case domain.PRMerged, domain.PRClosed:
+			decision := decide.ResolveTerminalPRStateDecision(f.PRState)
+			var delta ports.LifecyclePatch
+			dirty := setPRIfChanged(&delta, cur, decision, f)
+			// Unlike the open-PR path (which leaves the session axis to
+			// activity), a merge/close is a milestone that ends the work: it
+			// parks the session axis (idle / merged_waiting_decision) even
+			// over an activity-owned needs_input/blocked. It still never
+			// reopens a terminal session.
+			if !isTerminal(cur.Session.State) {
+				dirty = setSessionIfChanged(&delta, cur, decision.SessionState, decision.SessionReason) || dirty
+			}
+			return delta, dirty, nil
+
+		default: // none / unset: no PR-driven transition in split A
+			return ports.LifecyclePatch{}, false, nil
+		}
+	})
+	if err != nil {
+		return err
+	}
+	return m.react(ctx, id, tr, reactionContext{ciFailureLogTail: f.CIFailureLogTail})
+}
+
+// ApplyActivitySignal applies an activity report. Anything other than a
+// SignalValid signal is dropped (stale/unavailable/probe_failure != idleness).
+// A valid one refreshes the persisted activity sub-state (the probe decider's
+// RecentActivity input) and maps its classification onto the session axis. It
+// is also proof of life: when it moves a detecting session on, the quarantine
+// memory is cleared so a later probe doesn't resume counting from a stale prior.
+func (m *Manager) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error {
+	tr, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) {
+		if !(exists && s.State == ports.SignalValid) {
+			return ports.LifecyclePatch{}, false, nil
+		}
+
+		var delta ports.LifecyclePatch
+		dirty := false
+
+		reported := domain.ActivitySubstate{State: s.Activity, LastActivityAt: nowOr(s.Timestamp), Source: s.Source}
+		if !sameActivity(cur.Activity, reported) {
+			delta.Activity = &reported
+			dirty = true
+		}
+		if nextState, nextReason, ok := activityToSession(s.Activity); ok && shouldWriteSessionActivity(cur) {
+			dirty = setSessionIfChanged(&delta, cur, nextState, nextReason) || dirty
+			// Pulling the session out of detecting on proof of life must drop
+			// the quarantine memory too (that memory only exists while
+			// detecting, so otherwise this branch does nothing).
+			if cur.Detecting != nil {
+				delta.ClearDetecting = true
+				dirty = true
+			}
+		}
+
+		return delta, dirty, nil
+	})
+	if err != nil {
+		return err
+	}
+	return m.react(ctx, id, tr, reactionContext{})
+}
+
+// ---- mutation outcomes reported by the Session Manager ----
+
+// OnSpawnCompleted records a finished spawn for an already-seeded session. It
+// sets the runtime axis to RuntimeAlive/RuntimeReasonProcessRunning when that
+// differs and stores the outcome's non-empty handles as metadata. The session
+// stays at not_started (display: spawning) until the first activity signal.
+func (m *Manager) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error {
+	return m.withLock(id, func() error {
+		current, seeded, err := m.store.Load(ctx, id)
+		if err != nil {
+			return err
+		}
+		if !seeded {
+			// The SM seeds the initial lifecycle before it spawns, so a
+			// completion without a record is a contract violation (not a stray
+			// observation): report it instead of fabricating a record.
+			return fmt.Errorf("lifecycle: OnSpawnCompleted for unseeded session %q", id)
+		}
+		alive := domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}
+		if current.Runtime != alive {
+			if err := m.store.PatchLifecycle(ctx, id, ports.LifecyclePatch{Runtime: &alive}); err != nil {
+				return err
+			}
+		}
+		if handles := spawnMetadata(o); len(handles) > 0 {
+			if err := m.store.PatchMetadata(ctx, id, handles); err != nil {
+				return err
+			}
+		}
+		return nil
+	})
+}
+
+// OnKillRequested is the SM's explicit terminal write — the one terminal path
+// that bypasses the inferred-death decider. For a known session it writes the
+// terminal session/runtime sub-states for the kill kind and clears in-flight
+// detecting memory; it then drops the session's escalation trackers.
+func (m *Manager) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error {
+	// A user kill is a human action rather than an inferred event, so no
+	// reaction is fired and the returned transition is dropped.
+	_, err := m.mutate(ctx, id, func(cur domain.CanonicalSessionLifecycle, exists bool) (ports.LifecyclePatch, bool, error) {
+		if !exists {
+			// A kill for an unknown/already-gone session is a benign race:
+			// no-op instead of fabricating a terminal record for a session
+			// that was never known here.
+			return ports.LifecyclePatch{}, false, nil
+		}
+
+		var delta ports.LifecyclePatch
+		dirty := false
+
+		if terminal := killSession(r.Kind); terminal != cur.Session {
+			delta.Session = &terminal
+			dirty = true
+		}
+		if stopped := killRuntime(r.Kind); stopped != cur.Runtime {
+			delta.Runtime = &stopped
+			dirty = true
+		}
+		if cur.Detecting != nil {
+			delta.ClearDetecting = true
+			dirty = true
+		}
+		return delta, dirty, nil
+	})
+	if err != nil {
+		return err
+	}
+	// Because a kill fires no reaction it also skips react()'s incident-over
+	// cleanup. Clear the escalation trackers here instead, so a later
+	// TickEscalations can't emit reaction.escalated for a dead session.
+	m.clearSessionTrackers(id)
+	return nil
+}
+
+// ---- patch helpers (diff -> sparse merge-patch) ----
+
+// setSessionIfChanged writes the decided session sub-state into patch.Session
+// and reports true, but only when it differs from cur.Session. An empty decided
+// state means "decider does not address the session axis": nothing is written.
+func setSessionIfChanged(patch *ports.LifecyclePatch, cur domain.CanonicalSessionLifecycle, st domain.SessionState, rs domain.SessionReason) bool {
+	next := domain.SessionSubstate{State: st, Reason: rs}
+	// Two reasons to leave the patch alone: the decider gave no session state,
+	// or the decided sub-state is already what canonical holds. The patch
+	// stores a pointer to the local copy built above.
+	if st == "" || cur.Session == next {
+		return false
+	}
+	patch.Session = &next
+	return true
+}
+
+// setPRIfChanged combines the decided PR state/reason with the PR identity
+// carried by the facts (number/url) and patches it in when it differs from cur.PR.
+func setPRIfChanged(patch *ports.LifecyclePatch, cur domain.CanonicalSessionLifecycle, d decide.LifecycleDecision, f ports.SCMFacts) bool {
+	next := domain.PRSubstate{State: d.PRState, Reason: d.PRReason, Number: f.PRNumber, URL: f.PRURL}
+	differs := cur.PR != next
+	if differs {
+		patch.PR = &next
+	}
+	return differs
+}
+
+// setDetecting applies the three-way detecting semantics and reports whether
+// the patch was touched: set/replace when the decision carries memory that
+// differs from canonical, clear (#3) when it carries none but canonical still
+// holds stale memory, and otherwise leave the patch untouched.
+func setDetecting(patch *ports.LifecyclePatch, cur domain.CanonicalSessionLifecycle, d *domain.DetectingState) bool {
+	switch prior := cur.Detecting; {
+	case d == nil && prior == nil:
+		return false
+	case d == nil:
+		patch.ClearDetecting = true
+	case prior != nil && *prior == *d: // NOTE(review): == also compares StartedAt with ==, not Equal — confirm intended
+		return false
+	default:
+		patch.Detecting = d
+	}
+	return true
+}
+
+// sameActivity reports whether two activity sub-states match, comparing the
+// timestamp with Equal (== on time.Time is monotonic-clock sensitive and would spuriously report changes).
+func sameActivity(a, b domain.ActivitySubstate) bool {
+	return a.LastActivityAt.Equal(b.LastActivityAt) && a.Source == b.Source && a.State == b.State
+}
+
+// spawnMetadata builds the metadata patch for a spawn outcome: one entry per
+// Meta* key whose corresponding handle is non-empty. Empty handles are left
+// out, so they never overwrite a value that is already stored. The result is
+// never nil; it is empty when the outcome carries no handles at all.
+func spawnMetadata(o ports.SpawnOutcome) map[string]string {
+	meta := map[string]string{}
+	for _, pair := range [...][2]string{
+		{MetaBranch, o.Branch},
+		{MetaWorkspacePath, o.WorkspacePath},
+		{MetaRuntimeHandleID, o.RuntimeHandle.ID},
+		{MetaRuntimeName, o.RuntimeHandle.RuntimeName},
+		{MetaAgentSessionID, o.AgentSessionID},
+	} {
+		if pair[1] != "" {
+			meta[pair[0]] = pair[1]
+		}
+	}
+	return meta
+}
diff --git a/backend/internal/lifecycle/manager_test.go b/backend/internal/lifecycle/manager_test.go
new file mode 100644
index 00000000..d0a97125
--- /dev/null
+++ b/backend/internal/lifecycle/manager_test.go
@@ -0,0 +1,477 @@
+package lifecycle
+
+import (
+	"context"
+	"sync"
+	"testing"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+var t0 = time.Date(2026, 5, 26, 12, 0, 0, 0, time.UTC) // fixed reference instant used for timestamps in these tests
+
+const sid domain.SessionID = "s1" // the single session id the tests seed and load
+
+func newManager() (*Manager, *fakeStore) {
+	backing := newFakeStore() // returned too, so tests can seed and inspect it directly
+	return New(backing, &recordingNotifier{}, &recordingMessenger{}), backing
+}
+
+func mustLoad(t *testing.T, store *fakeStore) domain.CanonicalSessionLifecycle {
+	t.Helper()
+	loaded, found, err := store.Load(context.Background(), sid)
+	if !found || err != nil { // a missing record is as fatal as a load error
+		t.Fatalf("load: ok=%v err=%v", found, err)
+	}
+	return loaded
+}
+
+// ---- ApplyRuntimeObservation + #1 composition + #3 detecting clear ----
+
+func TestApplyRuntimeObservation(t *testing.T) {
+	aliveProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}
+	failedProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}
+	deadProbe := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0}
+
+	tests := []struct {
+		name          string
+		seed          domain.CanonicalSessionLifecycle
+		facts         ports.RuntimeFacts
+		wantSession   domain.SessionState
+		wantReason    domain.SessionReason
+		wantRuntime   domain.RuntimeState
+		wantDisplay   domain.SessionStatus
+		wantDetecting bool // expect non-nil detecting memory persisted
+	}{
+		{
+			name:          "healthy probe must not clobber an activity-owned needs_input (#1)",
+			seed:          lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive),
+			facts:         aliveProbe,
+			wantSession:   domain.SessionNeedsInput,
+			wantReason:    domain.ReasonAwaitingUserInput,
+			wantRuntime:   domain.RuntimeAlive,
+			wantDisplay:   domain.StatusNeedsInput,
+			wantDetecting: false,
+		},
+		{
+			name:          "healthy probe recovers a liveness-owned detecting -> working and clears memory (#1 + #3)",
+			seed:          detectingLC(),
+			facts:         aliveProbe,
+			wantSession:   domain.SessionWorking,
+			wantReason:    domain.ReasonTaskInProgress,
+			wantRuntime:   domain.RuntimeAlive,
+			wantDisplay:   domain.StatusWorking,
+			wantDetecting: false,
+		},
+		{
+			name:          "failed probe routes to detecting and records memory",
+			seed:          lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive),
+			facts:         failedProbe,
+			wantSession:   domain.SessionDetecting,
+			wantReason:    domain.ReasonProbeFailure,
+			wantRuntime:   domain.RuntimeProbeFailed,
+			wantDisplay:   domain.StatusDetecting,
+			wantDetecting: true,
+		},
+		{
+			name:          "dead+dead with no recent activity concludes killed and clears detecting (#3)",
+			seed:          detectingLC(),
+			facts:         deadProbe,
+			wantSession:   domain.SessionTerminated,
+			wantReason:    domain.ReasonRuntimeLost,
+			wantRuntime:   domain.RuntimeExited,
+			wantDisplay:   domain.StatusKilled,
+			wantDetecting: false,
+		},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			mgr, store := newManager()
+			store.seed(sid, tc.seed)
+
+			if err := mgr.ApplyRuntimeObservation(context.Background(), sid, tc.facts); err != nil {
+				t.Fatalf("apply: %v", err)
+			}
+
+			final := mustLoad(t, store)
+			if final.Session.State != tc.wantSession || final.Session.Reason != tc.wantReason {
+				t.Errorf("session = %v/%v, want %v/%v", final.Session.State, final.Session.Reason, tc.wantSession, tc.wantReason)
+			}
+			if final.Runtime.State != tc.wantRuntime {
+				t.Errorf("runtime = %v, want %v", final.Runtime.State, tc.wantRuntime)
+			}
+			if got := domain.DeriveLegacyStatus(final); got != tc.wantDisplay {
+				t.Errorf("display = %v, want %v", got, tc.wantDisplay)
+			}
+			if hasMemory := final.Detecting != nil; hasMemory != tc.wantDetecting {
+				t.Errorf("detecting present = %v, want %v (%+v)", hasMemory, tc.wantDetecting, final.Detecting)
+			}
+		})
+	}
+}
+
+func TestApplyRuntimeObservation_NoRecordIsNoOp(t *testing.T) {
+	mgr, store := newManager() // nothing is seeded on purpose
+	if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeAlive, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	if _, found, _ := store.Load(context.Background(), sid); found {
+		t.Error("a probe for an unseeded session must not fabricate a record")
+	}
+}
+
+func TestApplyRuntimeObservation_DoesNotResurrectTerminal(t *testing.T) {
+	mgr, store := newManager()
+	store.seed(sid, lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.RuntimeExited))
+
+	// On a live session a failed probe routes to detecting; on a terminal one
+	// it must change nothing on the session axis (only an explicit Restore reopens).
+	if err := mgr.ApplyRuntimeObservation(context.Background(), sid, ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeFailed, ProcessState: ports.ProcessProbeAlive, ObservedAt: t0}); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+
+	final := mustLoad(t, store)
+	if final.Session.State != domain.SessionTerminated || final.Session.Reason != domain.ReasonManuallyKilled {
+		t.Errorf("session = %v/%v, want terminated/manually_killed (no resurrection)", final.Session.State, final.Session.Reason)
+	}
+	if final.Detecting != nil {
+		t.Errorf("terminal session must not gain detecting memory, got %+v", final.Detecting)
+	}
+}
+
+// ---- ApplyActivitySignal ----
+
+func TestApplyActivitySignal(t *testing.T) {
+	tests := []struct {
+		name         string
+		seed         domain.CanonicalSessionLifecycle
+		signal       ports.ActivitySignal
+		wantSession  domain.SessionState
+		wantReason   domain.SessionReason
+		checkReason  bool // assert wantReason only when set (an empty reason is a valid expectation)
+		wantActivity domain.ActivityState
+		wantChanged  bool // true: expect exactly one write (revision 1); false: expect a no-op (revision 0)
+	}{
+		{
+			name:         "valid waiting_input maps to needs_input",
+			seed:         lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive),
+			signal:       ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityWaitingInput, Timestamp: t0, Source: domain.SourceHook},
+			wantSession:  domain.SessionNeedsInput,
+			wantActivity: domain.ActivityWaitingInput,
+			wantChanged:  true,
+		},
+		{
+			name:         "valid active recovers needs_input -> working",
+			seed:         lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.RuntimeAlive),
+			signal:       ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook},
+			wantSession:  domain.SessionWorking,
+			wantActivity: domain.ActivityActive,
+			wantChanged:  true,
+		},
+		{
+			name:         "valid idle maps to idle with a neutral reason",
+			seed:         lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive),
+			signal:       ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook},
+			wantSession:  domain.SessionIdle,
+			wantReason:   "",
+			checkReason:  true,
+			wantActivity: domain.ActivityIdle,
+			wantChanged:  true,
+		},
+		{
+			name:        "low-confidence signal is dropped (no idleness inferred)",
+			seed:        lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive),
+			signal:      ports.ActivitySignal{State: ports.SignalProbeFailure, Activity: domain.ActivityIdle, Timestamp: t0, Source: domain.SourceHook},
+			wantSession: domain.SessionWorking,
+			wantChanged: false,
+		},
+		{
+			name:         "valid activity resolves a detecting session (proof of life)",
+			seed:         detectingLC(),
+			signal:       ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0, Source: domain.SourceHook},
+			wantSession:  domain.SessionWorking,
+			wantActivity: domain.ActivityActive,
+			wantChanged:  true,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			mgr, store := newManager()
+			store.seed(sid, tt.seed)
+
+			if err := mgr.ApplyActivitySignal(context.Background(), sid, tt.signal); err != nil {
+				t.Fatalf("apply: %v", err)
+			}
+
+			l := mustLoad(t, store)
+			if l.Session.State != tt.wantSession {
+				t.Errorf("session = %v, want %v", l.Session.State, tt.wantSession)
+			}
+			if tt.checkReason && l.Session.Reason != tt.wantReason {
+				t.Errorf("session reason = %q, want %q", l.Session.Reason, tt.wantReason)
+			}
+			if tt.wantChanged && l.Revision != 1 {
+				t.Errorf("revision = %d, want 1 (expected a write)", l.Revision)
+			}
+			if !tt.wantChanged && l.Revision != 0 {
+				t.Errorf("revision = %d, want 0 (expected a no-op)", l.Revision)
+			}
+			if tt.wantChanged && tt.wantActivity != "" && l.Activity.State != tt.wantActivity {
+				t.Errorf("activity = %v, want %v", l.Activity.State, tt.wantActivity)
+			}
+			if tt.seed.Detecting != nil && tt.wantChanged && l.Detecting != nil { // keyed on the seed data, not the case name
+				t.Errorf("resolving detecting must clear the quarantine memory, got %+v", l.Detecting)
+			}
+		})
+	}
+}
+
+// ---- ApplySCMObservation ----
+
+func TestApplySCMObservation(t *testing.T) {
+	t.Run("failed fetch is a no-op (failed probe != no PR)", func(t *testing.T) {
+		mgr, store := newManager()
+		store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+		if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: false, PRState: domain.PROpen}); err != nil {
+			t.Fatalf("apply: %v", err)
+		}
+		if final := mustLoad(t, store); final.Revision != 0 || final.PR.State != "" {
+			t.Errorf("expected no-op, got revision=%d pr=%v", final.Revision, final.PR.State)
+		}
+	})
+
+	t.Run("open PR writes only the PR axis; session stays activity-owned", func(t *testing.T) {
+		mgr, store := newManager()
+		store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+		facts := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 12, PRURL: "https://x/12"}
+		if err := mgr.ApplySCMObservation(context.Background(), sid, facts); err != nil {
+			t.Fatalf("apply: %v", err)
+		}
+		final := mustLoad(t, store)
+		if final.PR.State != domain.PROpen || final.PR.Reason != domain.PRReasonCIFailing || final.PR.Number != 12 {
+			t.Errorf("pr = %+v, want open/ci_failing/#12", final.PR)
+		}
+		if final.Session.State != domain.SessionWorking {
+			t.Errorf("session = %v, want working (untouched)", final.Session.State)
+		}
+		if got := domain.DeriveLegacyStatus(final); got != domain.StatusCIFailed {
+			t.Errorf("display = %v, want ci_failed", got)
+		}
+	})
+
+	t.Run("merged PR parks the session and displays merged", func(t *testing.T) {
+		mgr, store := newManager()
+		seed := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)
+		seed.PR = domain.PRSubstate{State: domain.PROpen, Reason: domain.PRReasonInProgress, Number: 12}
+		store.seed(sid, seed)
+		facts := ports.SCMFacts{Fetched: true, PRState: domain.PRMerged, PRNumber: 12}
+		if err := mgr.ApplySCMObservation(context.Background(), sid, facts); err != nil {
+			t.Fatalf("apply: %v", err)
+		}
+		final := mustLoad(t, store)
+		if final.PR.State != domain.PRMerged || final.Session.Reason != domain.ReasonMergedWaitingDecision {
+			t.Errorf("got pr=%v session=%v, want merged + merged_waiting_decision", final.PR.State, final.Session.Reason)
+		}
+		if got := domain.DeriveLegacyStatus(final); got != domain.StatusMerged {
+			t.Errorf("display = %v, want merged", got)
+		}
+	})
+
+	t.Run("open-PR review branches map to the PR axis", func(t *testing.T) {
+		cases := []struct {
+			name       string
+			facts      ports.SCMFacts
+			wantReason domain.PRReason
+			wantStatus domain.SessionStatus
+		}{
+			{"changes requested", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested}, domain.PRReasonChangesRequested, domain.StatusChangesRequested},
+			{"approved + mergeable", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved, Mergeability: ports.Mergeability{Mergeable: true}}, domain.PRReasonMergeReady, domain.StatusMergeable},
+			{"review pending", ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewPending}, domain.PRReasonReviewPending, domain.StatusReviewPending},
+		}
+		for _, tc := range cases {
+			t.Run(tc.name, func(t *testing.T) {
+				mgr, store := newManager()
+				store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+				if err := mgr.ApplySCMObservation(context.Background(), sid, tc.facts); err != nil {
+					t.Fatalf("apply: %v", err)
+				}
+				final := mustLoad(t, store)
+				if final.PR.State != domain.PROpen || final.PR.Reason != tc.wantReason {
+					t.Errorf("pr = %v/%v, want open/%v", final.PR.State, final.PR.Reason, tc.wantReason)
+				}
+				if got := domain.DeriveLegacyStatus(final); got != tc.wantStatus {
+					t.Errorf("display = %v, want %v", got, tc.wantStatus)
+				}
+			})
+		}
+	})
+
+	t.Run("no PR is a no-op in split A", func(t *testing.T) {
+		mgr, store := newManager()
+		store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+		if err := mgr.ApplySCMObservation(context.Background(), sid, ports.SCMFacts{Fetched: true, PRState: domain.PRNone}); err != nil {
+			t.Fatalf("apply: %v", err)
+		}
+		if final := mustLoad(t, store); final.Revision != 0 {
+			t.Errorf("expected no-op, got revision=%d", final.Revision)
+		}
+	})
+}
+
+// ---- mutation outcomes ----
+
+func TestOnSpawnCompleted(t *testing.T) {
+	mgr, store := newManager()
+	store.seed(sid, lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.RuntimeUnknown))
+
+	out := ports.SpawnOutcome{
+		Branch:         "feat/x",
+		WorkspacePath:  "/w/x",
+		RuntimeHandle:  ports.RuntimeHandle{ID: "tmux:1", RuntimeName: "tmux"},
+		AgentSessionID: "agent-1",
+	}
+	if err := mgr.OnSpawnCompleted(context.Background(), sid, out); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+
+	l := mustLoad(t, store)
+	if l.Runtime.State != domain.RuntimeAlive {
+		t.Errorf("runtime = %v, want alive", l.Runtime.State)
+	}
+	if l.Session.State != domain.SessionNotStarted {
+		t.Errorf("session = %v, want not_started (spawn does not assert acknowledgement)", l.Session.State)
+	}
+	if got := domain.DeriveLegacyStatus(l); got != domain.StatusSpawning {
+		t.Errorf("display = %v, want spawning", got)
+	}
+	meta, err := store.GetMetadata(context.Background(), sid) // all five handles of the outcome must be recorded
+	if err != nil || meta[MetaBranch] != "feat/x" || meta[MetaWorkspacePath] != "/w/x" || meta[MetaRuntimeHandleID] != "tmux:1" || meta[MetaRuntimeName] != "tmux" || meta[MetaAgentSessionID] != "agent-1" {
+		t.Errorf("metadata not recorded: err=%v meta=%+v", err, meta)
+	}
+}
+
+func TestOnKillRequested(t *testing.T) {
+	tests := []struct {
+		name        string
+		kind        ports.LifecycleKillReason
+		wantReason  domain.SessionReason
+		wantRuntime domain.RuntimeReason
+		wantDisplay domain.SessionStatus
+	}{
+		{"manual", ports.KillManual, domain.ReasonManuallyKilled, domain.RuntimeReasonManualKillRequested, domain.StatusKilled},
+		{"cleanup", ports.KillCleanup, domain.ReasonAutoCleanup, domain.RuntimeReasonAutoCleanup, domain.StatusCleanup},
+		{"error", ports.KillError, domain.ReasonErrorInProcess, domain.RuntimeReasonProbeError, domain.StatusErrored},
+	}
+
+	for _, tc := range tests {
+		t.Run(tc.name, func(t *testing.T) {
+			mgr, store := newManager()
+			store.seed(sid, detectingLC())
+
+			if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: tc.kind, Detail: "x"}); err != nil {
+				t.Fatalf("apply: %v", err)
+			}
+
+			final := mustLoad(t, store)
+			if final.Session.State != domain.SessionTerminated || final.Session.Reason != tc.wantReason {
+				t.Errorf("session = %v/%v, want terminated/%v", final.Session.State, final.Session.Reason, tc.wantReason)
+			}
+			if final.Runtime.Reason != tc.wantRuntime {
+				t.Errorf("runtime reason = %v, want %v", final.Runtime.Reason, tc.wantRuntime)
+			}
+			if final.Detecting != nil {
+				t.Errorf("kill must clear detecting memory, got %+v", final.Detecting)
+			}
+			if got := domain.DeriveLegacyStatus(final); got != tc.wantDisplay {
+				t.Errorf("display = %v, want %v", got, tc.wantDisplay)
+			}
+		})
+	}
+}
+
+func TestOnSpawnCompleted_UnseededErrors(t *testing.T) {
+	mgr, store := newManager()
+	// No seed: the completion arrives for a session the store has never seen.
+	if err := mgr.OnSpawnCompleted(context.Background(), sid, ports.SpawnOutcome{Branch: "x"}); err == nil {
+		t.Error("OnSpawnCompleted for an unseeded session must error, not fabricate a record")
+	}
+	if _, found, _ := store.Load(context.Background(), sid); found {
+		t.Error("no record should have been created")
+	}
+}
+
+func TestOnKillRequested_UnseededIsNoOp(t *testing.T) {
+	mgr, store := newManager() // nothing is seeded on purpose
+	if err := mgr.OnKillRequested(context.Background(), sid, ports.KillReason{Kind: ports.KillManual}); err != nil {
+		t.Fatalf("kill of unknown session should be a benign no-op, got %v", err)
+	}
+	if _, found, _ := store.Load(context.Background(), sid); found {
+		t.Error("killing an unknown session must not fabricate a terminal record")
+	}
+}
+
+// ---- fake store contract ----
+
+func TestFakeStoreExpectedRevision(t *testing.T) {
+	store := newFakeStore()
+	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive)) // revision 0
+	exited := domain.RuntimeSubstate{State: domain.RuntimeExited}
+
+	stale := 99
+	if err := store.PatchLifecycle(context.Background(), sid, ports.LifecyclePatch{Runtime: &exited, ExpectedRevision: &stale}); err == nil {
+		t.Error("stale ExpectedRevision must be rejected")
+	}
+	current := 0
+	if err := store.PatchLifecycle(context.Background(), sid, ports.LifecyclePatch{Runtime: &exited, ExpectedRevision: &current}); err != nil {
+		t.Errorf("matching ExpectedRevision must succeed, got %v", err)
+	}
+}
+
+// ---- per-session serialisation under the race detector ----
+
+func TestPerSessionSerialization(t *testing.T) {
+	mgr, store := newManager()
+	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+
+	const n = 50
+	var wg sync.WaitGroup
+	wg.Add(n)
+	for i := 0; i < n; i++ {
+		go func(i int) {
+			defer wg.Done()
+			sig := ports.ActivitySignal{State: ports.SignalValid, Activity: domain.ActivityActive, Timestamp: t0.Add(time.Duration(i) * time.Second), Source: domain.SourceHook}
+			// Report a failed call rather than discarding it, so it is not misread as a
+			// lost update below (t.Errorf, unlike t.Fatalf, is safe off the test goroutine).
+			if err := mgr.ApplyActivitySignal(context.Background(), sid, sig); err != nil {
+				t.Errorf("apply %d: %v", i, err)
+			}
+		}(i)
+	}
+	wg.Wait()
+
+	// Each goroutine writes a distinct LastActivityAt, so every call is a real
+	// change; with correct serialisation all n land without a lost update.
+	if l := mustLoad(t, store); l.Revision != n {
+		t.Errorf("revision = %d, want %d (lost update under concurrency)", l.Revision, n)
+	}
+}
+
+// ---- helpers ----
+
+func lc(state domain.SessionState, reason domain.SessionReason, rt domain.RuntimeState) domain.CanonicalSessionLifecycle {
+	return domain.CanonicalSessionLifecycle{
+		Version: domain.LifecycleVersion,
+		Session: domain.SessionSubstate{State: state, Reason: reason},
+		Runtime: domain.RuntimeSubstate{State: rt},
+	}
+}
+
+func detectingLC() domain.CanonicalSessionLifecycle {
+	seed := lc(domain.SessionDetecting, domain.ReasonRuntimeLost, domain.RuntimeMissing)
+	seed.Detecting = &domain.DetectingState{Attempts: 1, StartedAt: t0, EvidenceHash: "abc"} // one prior attempt on record
+	return seed
+}
diff --git a/backend/internal/lifecycle/reactions.go b/backend/internal/lifecycle/reactions.go
new file mode 100644
index 00000000..72841510
--- /dev/null
+++ b/backend/internal/lifecycle/reactions.go
@@ -0,0 +1,417 @@
+package lifecycle
+
+// reactions.go is the ACT layer: the reaction table, the per-(session,reaction)
+// escalation engine, and the duration-driven TickEscalations the synchronous
+// LCM can't wake itself for. Reactions fire from react() after a transition is
+// persisted by the Apply* pipeline (see manager.go).
+//
+// Dispatch is synchronous: react() runs Send/Notify inline. It is the single
+// dispatch chokepoint, so moving it onto a worker goroutine later (once a daemon
+// owns that goroutine's lifecycle) is a change confined to this one function.
+
+import (
+	"context"
+	"fmt"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// reactionKey names a row in defaultReactions and, paired with a session id, a tracker bucket.
+type reactionKey string
+
+const (
+	reactionCIFailed         reactionKey = "ci-failed"
+	reactionChangesRequested reactionKey = "changes-requested"
+	reactionBugbotComments   reactionKey = "bugbot-comments" // dormant: reactionEventFor never yields it yet
+	reactionMergeConflicts   reactionKey = "merge-conflicts" // dormant: reactionEventFor never yields it yet
+	reactionAgentIdle        reactionKey = "agent-idle"
+	reactionApprovedAndGreen reactionKey = "approved-and-green"
+	reactionAgentStuck       reactionKey = "agent-stuck"
+	reactionNeedsInput       reactionKey = "agent-needs-input"
+	reactionAgentExited      reactionKey = "agent-exited"
+	reactionPRClosed         reactionKey = "pr-closed"
+	reactionAllComplete      reactionKey = "all-complete"
+)
+
+type actionKind string
+
+const (
+	actionSendToAgent actionKind = "send-to-agent" // message the agent, subject to the escalation budget
+	actionNotify      actionKind = "notify"        // emit one event through the notifier
+	actionAutoMerge   actionKind = "auto-merge"    // opt-in only; executeReaction treats it as a no-op
+)
+
+// reactionConfig is one row of the reaction table (distillation §4.1/§4.2).
+//
+//   - retries: numeric escalation cap; escalate once attempts exceed it
+//     (0 disables the cap).
+//   - escalateAfter: duration cap; escalate once this has elapsed since the
+//     first attempt (0 disables it; TickEscalations also enforces it).
+//   - persistent: the tracker survives leaving the triggering state and resets
+//     only once the incident is over or the PR recovers (see react). Only
+//     ci-failed sets it, so a flapping CI keeps draining one retry budget.
+type reactionConfig struct {
+	action        actionKind
+	message       string
+	priority      ports.EventPriority
+	eventType     string
+	retries       int
+	escalateAfter time.Duration
+	persistent    bool
+}
+
+// defaultReactions is the product's default behaviour (distillation §4.2),
+// keyed by reaction. Send-to-agent rows carry an escalation budget (retries
+// and/or escalateAfter); notify rows fire without one. No row uses auto-merge:
+// approved-and-green only notifies, so the human decides to merge.
+var defaultReactions = map[reactionKey]reactionConfig{
+	reactionCIFailed: {
+		action: actionSendToAgent, persistent: true, retries: 2,
+		message:   "CI is failing on your PR. Review the failing output below and push a fix.",
+		eventType: "reaction.ci-failed", priority: ports.PriorityAction,
+	},
+	reactionChangesRequested: {
+		action: actionSendToAgent, escalateAfter: 30 * time.Minute,
+		message:   "A reviewer requested changes on your PR. Address the comments and push.",
+		eventType: "reaction.changes-requested", priority: ports.PriorityAction,
+	},
+	reactionBugbotComments: { // dormant: reactionEventFor never yields this key yet
+		action: actionSendToAgent, escalateAfter: 30 * time.Minute,
+		message:   "An automated reviewer left comments on your PR. Address them and push.",
+		eventType: "reaction.bugbot-comments", priority: ports.PriorityAction,
+	},
+	reactionMergeConflicts: { // dormant: reactionEventFor never yields this key yet
+		action: actionSendToAgent, escalateAfter: 15 * time.Minute,
+		message:   "Your PR has merge conflicts. Rebase onto the base branch and resolve them.",
+		eventType: "reaction.merge-conflicts", priority: ports.PriorityAction,
+	},
+	reactionAgentIdle: {
+		action: actionSendToAgent, retries: 2, escalateAfter: 15 * time.Minute,
+		message:   "You appear idle. Continue the task or explain what is blocking you.",
+		eventType: "reaction.agent-idle", priority: ports.PriorityWarning,
+	},
+	reactionApprovedAndGreen: {
+		// notify-only: a green, approved PR is the human-decision path — the human
+		// decides to merge (no auto-merge by default).
+		action: actionNotify, priority: ports.PriorityAction,
+		message:   "PR is approved and green — ready to merge.",
+		eventType: "reaction.approved-and-green",
+	},
+	reactionAgentStuck: {
+		// §4.2 lists a threshold: 10m here; it is intentionally not gated — entry
+		// into stuck is already debounced upstream by the detecting->stuck
+		// quarantine (DETECTING_MAX_ATTEMPTS/DURATION), so a second timer would be
+		// redundant.
+		action: actionNotify, priority: ports.PriorityUrgent,
+		message:   "Agent is stuck and needs attention.",
+		eventType: "reaction.agent-stuck",
+	},
+	reactionNeedsInput: {
+		action: actionNotify, priority: ports.PriorityUrgent,
+		message:   "Agent needs input to continue.",
+		eventType: "reaction.agent-needs-input",
+	},
+	reactionAgentExited: {
+		action: actionNotify, priority: ports.PriorityUrgent,
+		message:   "Agent process exited unexpectedly.",
+		eventType: "reaction.agent-exited",
+	},
+	reactionPRClosed: {
+		action: actionNotify, priority: ports.PriorityAction,
+		message:   "PR was closed without merging — decide: resume, learn, or terminate.",
+		eventType: "reaction.pr-closed",
+	},
+	reactionAllComplete: {
+		action: actionNotify, priority: ports.PriorityInfo,
+		message:   "PR merged — work complete.",
+		eventType: "reaction.all-complete",
+	},
+}
+
+// reactionEventFor maps a canonical record to the reaction it should drive,
+// mirroring DeriveLegacyStatus but for the ACT layer; ok is false when the
+// current state has no reaction.
+//
+// A closed PR derives to the idle display status, so the PR axis is checked
+// first. bugbot-comments and merge-conflicts are dormant: configured, but the
+// split-A decide core has no producer for them yet.
+func reactionEventFor(l domain.CanonicalSessionLifecycle) (reactionKey, bool) {
+	if l.PR.State == domain.PRClosed {
+		return reactionPRClosed, true
+	}
+	var key reactionKey // stays empty when the status drives no reaction
+	switch domain.DeriveLegacyStatus(l) {
+	case domain.StatusCIFailed:
+		key = reactionCIFailed
+	case domain.StatusChangesRequested:
+		key = reactionChangesRequested
+	case domain.StatusApproved, domain.StatusMergeable:
+		key = reactionApprovedAndGreen
+	case domain.StatusIdle:
+		key = reactionAgentIdle
+	case domain.StatusStuck:
+		key = reactionAgentStuck
+	case domain.StatusNeedsInput:
+		key = reactionNeedsInput
+	case domain.StatusKilled:
+		// Inferred death only — an explicit user kill goes through
+		// OnKillRequested, which does not react.
+		key = reactionAgentExited
+	case domain.StatusMerged:
+		key = reactionAllComplete
+	}
+	return key, key != ""
+}
+
+// reactionContext carries fact-derived material the message templates need. The
+// SCM path populates it (CI failure log tail); other paths pass the zero value.
+type reactionContext struct {
+	ciFailureLogTail *string // nil or empty: composeMessage appends nothing
+}
+
+// trackerKey is the Manager.trackers map key: one escalation tracker per (session, reaction).
+type trackerKey struct {
+	id  domain.SessionID // session the tracker belongs to
+	key reactionKey      // reaction being budgeted
+}
+
+// reactionTracker is the per-(session,reaction) escalation budget. It lives in
+// memory on the Manager: a daemon restart resets budgets, which only ever costs
+// a few extra agent retries before re-escalating — never a missed human
+// notification. Keeping it out of the canonical store preserves the
+// truth-vs-policy split (the store holds session truth; this is ACT policy).
+type reactionTracker struct {
+	attempts       int       // send attempts counted so far; rolled back when a Send fails
+	escalated      bool      // set when escalation fires; silences further auto-dispatch
+	firstAttemptAt time.Time // clock reading at the first counted attempt; zero until then
+}
+
+// react fires the ACT layer after a persisted transition: reset the trackers
+// the transition made stale, then dispatch the reaction for the state entered.
+// It dispatches only on a genuine reaction change, so re-persisting the same
+// state does not re-dispatch. Synchronous by design (see file header).
+//
+// Integration-time caveat: react runs AFTER withLock releases (deliberately, so
+// a busy-waiting send-to-agent never holds the per-session mutex). Under a live
+// daemon with concurrent observers (SCM poller + reaper + activity ingest) the
+// afterLC snapshot can be stale by dispatch time — e.g. a ci-failed send firing
+// after the session already moved to approved. Tests are single-threaded so it
+// is not observable yet; when the daemon lands, give react a per-session
+// ordering (a small react queue) or re-check the triggering state before
+// dispatching.
+func (m *Manager) react(ctx context.Context, id domain.SessionID, tr *transition, rc reactionContext) error {
+	if tr == nil {
+		return nil
+	}
+	left, hadLeft := reactionEventFor(tr.beforeLC)
+	entered, hasEntered := reactionEventFor(tr.afterLC)
+	// reactionEventFor yields the empty key for "no reaction", so one inequality
+	// covers a reaction appearing, disappearing, or being swapped for another.
+	moved := left != entered
+
+	if incidentOver(tr.afterLC) || recovered(tr.afterLC) {
+		// The PR-pipeline incident has ended — the PR resolved (merged/closed),
+		// the session went terminal, or it reached an approved/green state. Every
+		// tracker for this session is now stale, including a persistent ci-failed
+		// one. This keys on the state REACHED, not the one left: recovery is
+		// typically review_pending->approved (no reaction left behind), so
+		// clearing only the left key would leak the ci-failed tracker and let its
+		// escalated=true silence a future regression.
+		m.clearSessionTrackers(id)
+	} else if hadLeft && moved && !defaultReactions[left].persistent {
+		// Within an unresolved open PR a normal tracker resets when its state is
+		// left. A persistent one (ci-failed) is NOT cleared here — it must survive
+		// the ambiguous review_pending limbo (the fail->pending->fail flap, §4.2);
+		// it only resets via the branch above.
+		m.clearTracker(id, left)
+	}
+
+	// Dispatch only when a reaction was entered that was not already the active
+	// one before this transition.
+	if !hasEntered || !moved {
+		return nil
+	}
+	return m.executeReaction(ctx, id, entered, rc)
+}
+
+// incidentOver reports that a PR-pipeline incident has ended: the PR is not open, or the session is terminal.
+// NOTE(review): a session with no PR at all presumably also has PR.State != PROpen, so react clears its trackers on every transition — confirm intended.
+func incidentOver(l domain.CanonicalSessionLifecycle) bool {
+	return l.PR.State != domain.PROpen || isTerminal(l.Session.State)
+}
+
+// recovered reports a genuinely-green open PR: one whose reason is approved or
+// merge-ready. The open-PR ladder ranks ci_failing above approved, so such a
+// state cannot coexist with failing CI; unlike the ambiguous review_pending
+// (which may just be CI re-running), reaching it ends a ci-failed incident and
+// re-arms its budget.
+func recovered(l domain.CanonicalSessionLifecycle) bool {
+	pr := l.PR
+	if pr.State != domain.PROpen {
+		return false
+	}
+	// Only these two reasons count as recovery; any other reason on an open PR
+	// reports false.
+	approved := pr.Reason == domain.PRReasonApproved
+	mergeReady := pr.Reason == domain.PRReasonMergeReady
+	return approved || mergeReady
+}
+
+// executeReaction runs the action configured for key. Send-to-agent goes
+// through the escalation engine; notify emits one event with no retry or
+// escalation budget; auto-merge and unconfigured keys are no-ops.
+func (m *Manager) executeReaction(ctx context.Context, id domain.SessionID, key reactionKey, rc reactionContext) error {
+	cfg := defaultReactions[key]
+	if cfg.action == actionSendToAgent {
+		return m.sendToAgent(ctx, id, key, cfg, rc)
+	}
+	if cfg.action != actionNotify {
+		// actionAutoMerge lands here: off by default, and wiring a merge port is
+		// a later PR. A key with no row (zero action) is ignored the same way.
+		return nil
+	}
+	event := ports.OrchestratorEvent{
+		Type:      cfg.eventType,
+		Priority:  cfg.priority,
+		SessionID: id,
+		Message:   cfg.message,
+	}
+	return m.notifier.Notify(ctx, event)
+}
+
+// sendToAgent runs the escalation engine for an auto send-to-agent reaction:
+// count the attempt, then escalate when the numeric cap or duration is hit
+// (silencing further auto-dispatch), else inject the message via the messenger.
+func (m *Manager) sendToAgent(ctx context.Context, id domain.SessionID, key reactionKey, cfg reactionConfig, rc reactionContext) error {
+	m.trackerMu.Lock()
+	tk := m.trackerFor(id, key)
+	if tk.escalated {
+		m.trackerMu.Unlock()
+		return nil // silenced until the condition clears the tracker
+	}
+	now := m.clock()
+	freshFirst := tk.firstAttemptAt.IsZero()
+	if freshFirst {
+		tk.firstAttemptAt = now
+	}
+	tk.attempts++
+	escalating := shouldEscalate(tk, cfg, now)
+	tk.escalated = escalating
+	m.trackerMu.Unlock()
+	var err error // dispatch outside the lock so notifier/agent latency never blocks tracker access
+	if escalating {
+		err = m.escalate(ctx, id, key)
+	} else {
+		err = m.messenger.Send(ctx, id, composeMessage(cfg, rc))
+	}
+	if err == nil {
+		return nil
+	}
+	// Undelivered: roll the attempt back so budget is not burned (distillation §4.3) and a failed escalation is retried, not silenced.
+	m.trackerMu.Lock()
+	tk.attempts--
+	tk.escalated = tk.escalated && !escalating // only undo a flag this call set
+	if freshFirst {
+		tk.firstAttemptAt = time.Time{}
+	}
+	m.trackerMu.Unlock()
+	return err
+}
+
+// shouldEscalate reports whether the tracker has exhausted its budget under
+// cfg. Each limit is optional (zero disables it). The numeric cap trips once
+// attempts exceed retries; the duration cap is inclusive, tripping once exactly
+// escalateAfter has elapsed since the first attempt rather than waiting for the
+// next tick to cross a strict threshold. A tracker with no first attempt
+// recorded never trips the duration cap.
+func shouldEscalate(tk *reactionTracker, cfg reactionConfig, now time.Time) bool {
+	overCap := cfg.retries > 0 && tk.attempts > cfg.retries
+	started := !tk.firstAttemptAt.IsZero()
+	timedOut := cfg.escalateAfter > 0 && started && now.Sub(tk.firstAttemptAt) >= cfg.escalateAfter
+	return overCap || timedOut
+}
+
+// escalate emits reaction.escalated to notify the human. It leaves the tracker
+// alone: the caller has already set tracker.escalated under the lock.
+func (m *Manager) escalate(ctx context.Context, id domain.SessionID, key reactionKey) error {
+	event := ports.OrchestratorEvent{
+		Type:      "reaction.escalated",
+		Priority:  ports.PriorityUrgent,
+		SessionID: id,
+		Message:   fmt.Sprintf("auto-handling of %q is exhausted and needs a human.", key),
+		Data:      map[string]any{"reaction": string(key)},
+	}
+	return m.notifier.Notify(ctx, event)
+}
+
+func composeMessage(cfg reactionConfig, rc reactionContext) string {
+	if tail := rc.ciFailureLogTail; tail == nil || *tail == "" {
+		return cfg.message // no failing output to append
+	}
+	return cfg.message + "\n\nFailing output:\n" + *rc.ciFailureLogTail
+}
+
+// trackerFor returns the tracker for (id,key), creating and registering it on
+// first use. The caller must hold trackerMu.
+func (m *Manager) trackerFor(id domain.SessionID, key reactionKey) *reactionTracker {
+	bucket := trackerKey{id: id, key: key}
+	if existing := m.trackers[bucket]; existing != nil {
+		return existing
+	}
+	fresh := &reactionTracker{}
+	m.trackers[bucket] = fresh
+	return fresh
+}
+
+// clearTracker drops the (id,key) tracker, if one exists.
+func (m *Manager) clearTracker(id domain.SessionID, key reactionKey) {
+	m.trackerMu.Lock()
+	defer m.trackerMu.Unlock()
+	delete(m.trackers, trackerKey{id: id, key: key})
+}
+
+// clearSessionTrackers drops every tracker of a session once its incident is
+// over, so no budget (and no stale escalated=true) leaks into a later one.
+func (m *Manager) clearSessionTrackers(id domain.SessionID) {
+	m.trackerMu.Lock()
+	defer m.trackerMu.Unlock()
+	for bucket := range m.trackers {
+		if bucket.id == id {
+			delete(m.trackers, bucket)
+		}
+	}
+}
+
+// TickEscalations fires the duration-based escalations the synchronous LCM
+// cannot wake itself for; the reaper calls it on a timer. Notifications go out
+// outside the lock. Every due escalation is attempted even if an earlier one
+// fails; a failed one is re-armed for the next tick. Returns the first error.
+func (m *Manager) TickEscalations(ctx context.Context, now time.Time) error {
+	due := make(map[trackerKey]*reactionTracker)
+
+	m.trackerMu.Lock()
+	for k, tk := range m.trackers {
+		after := defaultReactions[k.key].escalateAfter
+		if tk.escalated || after <= 0 || tk.firstAttemptAt.IsZero() || now.Sub(tk.firstAttemptAt) < after {
+			continue
+		}
+		tk.escalated = true
+		due[k] = tk
+	}
+	m.trackerMu.Unlock()
+
+	var firstErr error
+	for k, tk := range due {
+		if err := m.escalate(ctx, k.id, k.key); err != nil {
+			m.trackerMu.Lock()
+			tk.escalated = false // the human was not told: retry on the next tick
+			m.trackerMu.Unlock()
+			if firstErr == nil {
+				firstErr = err
+			}
+		}
+	}
+	return firstErr
+}
diff --git a/backend/internal/lifecycle/reactions_test.go b/backend/internal/lifecycle/reactions_test.go
new file mode 100644
index 00000000..e90e8881
--- /dev/null
+++ b/backend/internal/lifecycle/reactions_test.go
@@ -0,0 +1,416 @@
+package lifecycle
+
+import (
+	"context"
+	"fmt"
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// failingMessenger is a messenger whose Send always fails, counting each call in
+// attempts — used to assert a send failure does not consume escalation budget.
+type failingMessenger struct{ attempts int }
+
+func (f *failingMessenger) Send(_ context.Context, _ domain.SessionID, _ string) error {
+	f.attempts++ // counted before failing, so attempts equals the number of Send calls
+	return fmt.Errorf("messenger unavailable")
+}
+
+// newReactive returns a Manager wired to recording fakes, plus the fakes
+// themselves so reaction tests can assert on what was sent/notified. The clock
+// is pinned to t0 so escalation timestamps are deterministic.
+func newReactive() (*Manager, *fakeStore, *recordingNotifier, *recordingMessenger) {
+	st := newFakeStore()
+	notifier, messenger := &recordingNotifier{}, &recordingMessenger{}
+	pinned := func() time.Time { return t0 }
+	mgr := New(st, notifier, messenger)
+	mgr.clock = pinned
+	return mgr, st, notifier, messenger
+}
+
+func lcOpenPR(reason domain.PRReason) domain.CanonicalSessionLifecycle {
+	fixture := lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive) // working session, runtime alive
+	fixture.PR = domain.PRSubstate{Number: 7, State: domain.PROpen, Reason: reason}
+	return fixture
+}
+
+func notifyCount(n *recordingNotifier, eventType string) int {
+	n.mu.Lock()
+	defer n.mu.Unlock()
+	matches := 0 // recorded events whose Type equals eventType
+	for _, ev := range n.events {
+		if ev.Type == eventType {
+			matches++
+		}
+	}
+	return matches
+}
+
+func ctx() context.Context { return context.Background() } // test shorthand: a background context, never cancelled
+
+// ---- right reaction per transition ----
+
+func TestReaction_CIFailedSendsToAgentWithLogTail(t *testing.T) {
+	m, store, notf, msgr := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	tail := "build failed\nundefined: foo"
+	failing := ports.SCMFacts{
+		Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing,
+		PRNumber: 7, CIFailureLogTail: &tail,
+	}
+	if err := m.ApplySCMObservation(ctx(), sid, failing); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+
+	if len(msgr.sent) != 1 {
+		t.Fatalf("want 1 send, got %d", len(msgr.sent))
+	}
+	if got := msgr.sent[0].Message; !(strings.Contains(got, "CI is failing") && strings.Contains(got, tail)) {
+		t.Errorf("message missing base text or log tail: %q", got)
+	}
+	if notifyCount(notf, "reaction.escalated") != 0 {
+		t.Error("a first failure must not escalate")
+	}
+}
+
+func TestReaction_ApprovedAndGreenNotifiesNeverAutoMerges(t *testing.T) {
+	m, store, notf, msgr := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	approved := ports.SCMFacts{
+		Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved,
+		Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7,
+	}
+	if err := m.ApplySCMObservation(ctx(), sid, approved); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+
+	// approved-and-green is a notify: the human decides to merge, so the agent
+	// is never messaged and exactly one notification goes out.
+	if sends := len(msgr.sent); sends != 0 {
+		t.Errorf("approved-and-green must not message the agent, got %d sends", sends)
+	}
+	if notifyCount(notf, "reaction.approved-and-green") != 1 {
+		t.Errorf("want one approved-and-green notify, got events %+v", notf.events)
+	}
+}
+
+func TestReaction_NotifyEventsForHardStates(t *testing.T) {
+	cases := []struct {
+		name      string
+		activity  domain.ActivityState
+		eventType string
+	}{
+		{
+			name:      "waiting_input -> agent-needs-input",
+			activity:  domain.ActivityWaitingInput,
+			eventType: "reaction.agent-needs-input",
+		},
+		{
+			name:      "blocked -> agent-stuck",
+			activity:  domain.ActivityBlocked,
+			eventType: "reaction.agent-stuck",
+		},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			m, store, notf, msgr := newReactive()
+			store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+			applyActivity(m, c.activity)
+			if notifyCount(notf, c.eventType) != 1 {
+				t.Errorf("want one %s, got events %+v", c.eventType, notf.events)
+			}
+			if sends := len(msgr.sent); sends != 0 {
+				t.Errorf("notify reaction must not message the agent, got %d", sends)
+			}
+		})
+	}
+}
+
+func TestReaction_InferredDeathNotifiesAgentExited(t *testing.T) {
+	m, store, notf, _ := newReactive()
+	store.seed(sid, detectingLC())
+
+	dead := ports.RuntimeFacts{RuntimeState: ports.RuntimeProbeDead, ProcessState: ports.ProcessProbeDead, ObservedAt: t0}
+	if err := m.ApplyRuntimeObservation(ctx(), sid, dead); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	// With both probes dead the session is expected to derive to killed.
+	status := domain.DeriveLegacyStatus(mustLoad(t, store))
+	if status != domain.StatusKilled {
+		t.Fatalf("precondition: want killed, got %s", status)
+	}
+	if notifyCount(notf, "reaction.agent-exited") != 1 {
+		t.Errorf("want one agent-exited, got events %+v", notf.events)
+	}
+}
+
+func TestReaction_PRClosedAndMerged(t *testing.T) {
+	cases := []struct {
+		name      string
+		prState   domain.PRState
+		eventType string
+	}{
+		{name: "closed -> pr-closed", prState: domain.PRClosed, eventType: "reaction.pr-closed"},
+		{name: "merged -> all-complete", prState: domain.PRMerged, eventType: "reaction.all-complete"},
+	}
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			m, store, notf, _ := newReactive()
+			store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+			// An open PR resolving either way must produce exactly one notify of
+			// the matching type.
+			resolved := ports.SCMFacts{Fetched: true, PRState: c.prState, PRNumber: 7}
+			if err := m.ApplySCMObservation(ctx(), sid, resolved); err != nil {
+				t.Fatalf("apply: %v", err)
+			}
+			if got := notifyCount(notf, c.eventType); got != 1 {
+				t.Errorf("want one %s, got events %+v", c.eventType, notf.events)
+			}
+		})
+	}
+}
+
+func TestReaction_OnKillRequestedDoesNotReact(t *testing.T) {
+	m, store, notf, msgr := newReactive()
+	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+
+	manual := ports.KillReason{Kind: ports.KillManual}
+	if err := m.OnKillRequested(ctx(), sid, manual); err != nil {
+		t.Fatalf("kill: %v", err)
+	}
+	if quiet := len(notf.events) == 0 && len(msgr.sent) == 0; !quiet { // explicit kill: no agent-exited, no send
+		t.Errorf("explicit kill must fire no reaction: notifies=%+v sends=%+v", notf.events, msgr.sent)
+	}
+}
+
+// ---- escalation engine ----
+
+func TestReaction_CIFailedNumericEscalation(t *testing.T) {
+	m, store, notf, msgr := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	// ci-failed is persistent with retries 2: the budget is shared across
+	// fail->pending->fail flaps, so failures 1-2 send, failure 3 escalates, and
+	// failure 4 is silenced.
+	for flap := 1; flap <= 4; flap++ {
+		failCI(t, m)
+		pendingCI(t, m) // oscillate out (persistent tracker must NOT reset)
+	}
+
+	if sends := len(msgr.sent); sends != 2 {
+		t.Errorf("want 2 auto-sends before escalation, got %d", sends)
+	}
+	if escalations := notifyCount(notf, "reaction.escalated"); escalations != 1 {
+		t.Errorf("want exactly one escalation, got %d", escalations)
+	}
+}
+
+func TestReaction_DurationEscalationFiresOnTick(t *testing.T) {
+	m, store, notf, msgr := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+	tick := func(elapsed time.Duration) {
+		t.Helper()
+		if err := m.TickEscalations(ctx(), t0.Add(elapsed)); err != nil {
+			t.Fatalf("tick: %v", err)
+		}
+	}
+
+	// changes-requested sends once on the transition; after that only the
+	// reaper's TickEscalations can escalate it by duration (the LCM never polls).
+	requested := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7}
+	if err := m.ApplySCMObservation(ctx(), sid, requested); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	if len(msgr.sent) != 1 {
+		t.Fatalf("want one send on transition, got %d", len(msgr.sent))
+	}
+
+	tick(10 * time.Minute)
+	if notifyCount(notf, "reaction.escalated") != 0 {
+		t.Error("must not escalate before escalateAfter elapses")
+	}
+
+	// Inclusive boundary: escalate at exactly escalateAfter (30m), not only past it.
+	tick(30 * time.Minute)
+	if notifyCount(notf, "reaction.escalated") != 1 {
+		t.Errorf("want one duration escalation at exactly 30m, got events %+v", notf.events)
+	}
+}
+
+func TestReaction_KillClearsEscalationTrackers(t *testing.T) {
+	m, store, notf, _ := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	// changes-requested creates a duration-based tracker.
+	requested := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewChangesRequested, PRNumber: 7}
+	if err := m.ApplySCMObservation(ctx(), sid, requested); err != nil {
+		t.Fatalf("apply: %v", err)
+	}
+	if sessionTrackerCount(m, sid) == 0 {
+		t.Fatalf("precondition: expected a tracker")
+	}
+
+	manual := ports.KillReason{Kind: ports.KillManual}
+	if err := m.OnKillRequested(ctx(), sid, manual); err != nil {
+		t.Fatalf("kill: %v", err)
+	}
+	if left := sessionTrackerCount(m, sid); left != 0 {
+		t.Errorf("kill must clear trackers, %d left", left)
+	}
+	// A later duration tick must not escalate a dead session.
+	if err := m.TickEscalations(ctx(), t0.Add(time.Hour)); err != nil {
+		t.Fatalf("tick: %v", err)
+	}
+	if escalations := notifyCount(notf, "reaction.escalated"); escalations != 0 {
+		t.Errorf("killed session must not escalate, got %d", escalations)
+	}
+}
+
+func TestReaction_SendFailureDoesNotBurnBudget(t *testing.T) {
+	store := newFakeStore()
+	notf, fm := &recordingNotifier{}, &failingMessenger{}
+	m := New(store, notf, fm)
+	m.clock = func() time.Time { return t0 }
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	tail := "fail"
+	failing := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail}
+	pending := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7}
+
+	// ci-failed has retries 2, but every delivery fails and is rolled back, so
+	// even more failures than the cap must never escalate.
+	const rounds = 5
+	for round := 0; round < rounds; round++ {
+		_ = m.ApplySCMObservation(ctx(), sid, failing) // returns the delivery error
+		_ = m.ApplySCMObservation(ctx(), sid, pending) // leave ci-failed so the next failure re-dispatches
+	}
+	if fm.attempts < rounds {
+		t.Errorf("expected at least 5 send attempts, got %d", fm.attempts)
+	}
+	if escalations := notifyCount(notf, "reaction.escalated"); escalations != 0 {
+		t.Errorf("undelivered messages must not escalate, got %d", escalations)
+	}
+}
+
+func TestReaction_NonPersistentTrackerClearsOnLeave(t *testing.T) {
+	m, store, _, msgr := newReactive()
+	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+
+	// agent-idle has retries 2 but is NOT persistent: leaving idle drops its
+	// tracker, so each of three idle incidents sends afresh and none escalates.
+	for incident := 1; incident <= 3; incident++ {
+		applyActivity(m, domain.ActivityIdle)
+		applyActivity(m, domain.ActivityActive)
+	}
+	if sends := len(msgr.sent); sends != 3 {
+		t.Errorf("want 3 idle sends (budget reset each incident), got %d", sends)
+	}
+}
+
+func TestReaction_CIFailedRearmsOnGenuineRecovery(t *testing.T) {
+	m, store, notf, msgr := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	// Drain the ci-failed budget to escalation (silenced thereafter).
+	for flap := 0; flap < 4; flap++ {
+		failCI(t, m)
+		pendingCI(t, m)
+	}
+	if got := notifyCount(notf, "reaction.escalated"); got != 1 {
+		t.Fatalf("precondition: want one escalation, got %d", got)
+	}
+	sentBefore := len(msgr.sent)
+
+	// A genuine recovery (approved + green) must re-arm the budget, so a later regression re-nudges the agent.
+	green := ports.SCMFacts{
+		Fetched: true, PRState: domain.PROpen, ReviewDecision: ports.ReviewApproved,
+		Mergeability: ports.Mergeability{Mergeable: true}, PRNumber: 7,
+	}
+	if err := m.ApplySCMObservation(ctx(), sid, green); err != nil {
+		t.Fatalf("recover: %v", err)
+	}
+	failCI(t, m)
+
+	if sentAfter := len(msgr.sent); sentAfter != sentBefore+1 {
+		t.Errorf("regression after recovery must re-nudge the agent: sends %d -> %d", sentBefore, sentAfter)
+	}
+}
+
+func TestReaction_IncidentOverClearsAllSessionTrackers(t *testing.T) {
+	m, store, _, _ := newReactive()
+	store.seed(sid, lcOpenPR(domain.PRReasonReviewPending))
+
+	failCI(t, m) // creates a persistent ci-failed tracker
+	if sessionTrackerCount(m, sid) == 0 {
+		t.Fatalf("precondition: expected a ci-failed tracker")
+	}
+
+	// Merging ends the incident, so every tracker for the session — the
+	// persistent ci-failed one included, along with any stale escalated=true —
+	// must be gone.
+	merged := ports.SCMFacts{Fetched: true, PRState: domain.PRMerged, PRNumber: 7}
+	if err := m.ApplySCMObservation(ctx(), sid, merged); err != nil {
+		t.Fatalf("merge: %v", err)
+	}
+	if left := sessionTrackerCount(m, sid); left != 0 {
+		t.Errorf("incident over must clear all trackers, %d left", left)
+	}
+}
+
+func sessionTrackerCount(m *Manager, id domain.SessionID) int {
+	m.trackerMu.Lock()
+	defer m.trackerMu.Unlock()
+	owned := 0 // trackers whose key belongs to session id
+	for bucket := range m.trackers {
+		if bucket.id == id {
+			owned++
+		}
+	}
+	return owned
+}
+
+// ---- TickEscalations never writes canonical state ----
+
+func TestTickEscalations_DoesNotPersist(t *testing.T) {
+	m, store, _, _ := newReactive()
+	store.seed(sid, lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.RuntimeAlive))
+	if err := m.TickEscalations(ctx(), t0); err != nil {
+		t.Fatalf("tick: %v", err)
+	}
+	if rev := mustLoad(t, store).Revision; rev != 0 { // a canonical write would have bumped the seeded revision
+		t.Errorf("TickEscalations must not write canonical state, got revision=%d", rev)
+	}
+}
+
+// ---- helpers ----
+
+func applyActivity(m *Manager, a domain.ActivityState) {
+	sig := ports.ActivitySignal{State: ports.SignalValid, Activity: a, Timestamp: t0, Source: domain.SourceHook}
+	// Error deliberately discarded: this helper takes no *testing.T; callers assert on recorded sends/notifies.
+	_ = m.ApplyActivitySignal(ctx(), sid, sig)
+}
+
+// failCI applies an SCM observation of failing CI (with a log tail) on open PR #7.
+func failCI(t *testing.T, m *Manager) {
+	t.Helper()
+	tail := "fail"
+	facts := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIFailing, PRNumber: 7, CIFailureLogTail: &tail}
+	if err := m.ApplySCMObservation(ctx(), sid, facts); err != nil {
+		t.Fatalf("failCI: %v", err)
+	}
+}
+
+// pendingCI applies an SCM observation of pending CI and pending review on open PR #7.
+func pendingCI(t *testing.T, m *Manager) {
+	t.Helper()
+	facts := ports.SCMFacts{Fetched: true, PRState: domain.PROpen, CISummary: ports.CIPending, ReviewDecision: ports.ReviewPending, PRNumber: 7}
+	if err := m.ApplySCMObservation(ctx(), sid, facts); err != nil {
+		t.Fatalf("pendingCI: %v", err)
+	}
+}
diff --git a/backend/internal/ports/facts.go b/backend/internal/ports/facts.go
new file mode 100644
index 00000000..55f4f6ca
--- /dev/null
+++ b/backend/internal/ports/facts.go
@@ -0,0 +1,145 @@
+// Package ports declares the boundary contracts for the LCM + Session Manager
+// lane: the inbound interfaces we implement, the outbound interfaces others
+// implement for us, and the fact DTOs that cross those boundaries.
+//
+// These are the types the SCM poller, persistence adapter, and API layer build
+// against, so they are committed and stabilised before the LCM/SM logic.
+package ports
+
+import (
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+)
+
+// SCMFacts is the SCM poller's snapshot of a session's PR, handed to ApplySCMObservation.
+//
+// Fetched is the failed-probe guard: when false, the GitHub query timed out or
+// errored and every other field is meaningless — the LCM must NOT read it as
+// "no PR / PR closed" (the SCM analogue of "failed probe != dead").
+//
+// CIFailureLogTail is a pointer because it is only populated when CI is failing;
+// it carries ~120 lines and we don't want it on the hot poll path otherwise.
+type SCMFacts struct {
+	Fetched          bool // false => the probe failed; ignore every other field
+	ObservedAt       time.Time
+	PRState          domain.PRState
+	PRNumber         int
+	PRURL            string
+	CISummary        CISummary
+	ReviewDecision   ReviewDecision
+	Mergeability     Mergeability
+	PendingComments  []ReviewComment
+	CIFailureLogTail *string
+}
+
+type CISummary string // coarse CI status carried in SCMFacts.CISummary (display, not the merge gate)
+
+const (
+	CIPending CISummary = "pending"
+	CIPassing CISummary = "passing"
+	CIFailing CISummary = "failing"
+	CINone    CISummary = "none"
+)
+
+type ReviewDecision string // coarse review verdict carried in SCMFacts.ReviewDecision (display, not the merge gate)
+
+const (
+	ReviewApproved         ReviewDecision = "approved"
+	ReviewChangesRequested ReviewDecision = "changes_requested"
+	ReviewPending          ReviewDecision = "pending"
+	ReviewNone             ReviewDecision = "none"
+)
+
+// Mergeability is the structured "can this merge?" answer. Its CIPassing and
+// Approved flags overlap SCMFacts.CISummary/ReviewDecision by design (different
+// granularity): Mergeability is authoritative for the merge gate, the others for display.
+type Mergeability struct {
+	Mergeable   bool
+	CIPassing   bool
+	Approved    bool
+	NoConflicts bool
+	Blockers    []string // NOTE(review): presumably the reasons Mergeable is false — confirm
+}
+
+// ReviewComment is one entry of SCMFacts.PendingComments. It carries IsBot so the decider can
+// route bot review comments (bugbot-comments reaction) differently from human ones (changes-requested).
+type ReviewComment struct {
+	Author string
+	Body   string
+	IsBot  bool
+	URL    string
+}
+
+// RuntimeFacts is the reaper's liveness observation, handed to ApplyRuntimeObservation.
+type RuntimeFacts struct {
+	ObservedAt   time.Time
+	RuntimeState RuntimeProbe
+	ProcessState ProcessProbe // same four-way classification that Agent.ProbeProcess returns
+}
+
+// RuntimeProbe and ProcessProbe classify a liveness probe. Both keep "failed"
+// (the probe call itself errored or timed out) distinct from "indeterminate"
+// (the probe ran but couldn't tell) — they route differently in the decider.
+type RuntimeProbe string
+
+const (
+	RuntimeProbeAlive         RuntimeProbe = "alive"
+	RuntimeProbeDead          RuntimeProbe = "dead"
+	RuntimeProbeIndeterminate RuntimeProbe = "indeterminate"
+	RuntimeProbeFailed        RuntimeProbe = "failed"
+)
+
+type ProcessProbe string // agent-process liveness; the type Agent.ProbeProcess returns
+
+const (
+	ProcessProbeAlive         ProcessProbe = "alive"
+	ProcessProbeDead          ProcessProbe = "dead"
+	ProcessProbeIndeterminate ProcessProbe = "indeterminate"
+	ProcessProbeFailed        ProcessProbe = "failed"
+)
+
+// ActivitySignal is pushed by agent hooks / the FS watcher and handed to
+// ApplyActivitySignal. State is the confidence wrapper (so unavailable or
+// probe_failure is never mistaken for idleness); Activity is the classification.
+type ActivitySignal struct {
+	State     SignalConfidence
+	Activity  domain.ActivityState
+	Timestamp time.Time
+	Source    domain.ActivitySource
+}
+
+type SignalConfidence string // confidence wrapper for an ActivitySignal (its State field)
+
+const (
+	SignalValid        SignalConfidence = "valid"
+	SignalStale        SignalConfidence = "stale"
+	SignalNull         SignalConfidence = "null"
+	SignalUnavailable  SignalConfidence = "unavailable"
+	SignalProbeFailure SignalConfidence = "probe_failure"
+)
+
+// SpawnOutcome is what the Session Manager reports to the LCM (OnSpawnCompleted)
+// after a spawn. RuntimeHandle is the same structured handle Runtime.Create returns,
+// so no ad-hoc string encoding is needed for later Destroy/SendMessage calls.
+type SpawnOutcome struct {
+	Branch         string
+	WorkspacePath  string
+	RuntimeHandle  RuntimeHandle
+	AgentSessionID string
+}
+
+// KillReason is what the Session Manager reports to the LCM (OnKillRequested) when
+// a kill is requested. Kind drives whether the terminal state is killed/cleanup/errored.
+type KillReason struct {
+	Kind   LifecycleKillReason
+	Detail string
+}
+
+type LifecycleKillReason string // kill category; the type of KillReason.Kind and KillOptions.Reason
+
+const (
+	KillManual  LifecycleKillReason = "manual"
+	KillCleanup LifecycleKillReason = "cleanup"
+	KillError   LifecycleKillReason = "error"
+)
diff --git a/backend/internal/ports/inbound.go b/backend/internal/ports/inbound.go
new file mode 100644
index 00000000..30ab7559
--- /dev/null
+++ b/backend/internal/ports/inbound.go
@@ -0,0 +1,70 @@
+package ports
+
+import (
+	"context"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+)
+
+// LifecycleManager (the LCM) is the inbound contract we implement. Every Apply*
+// method runs the same synchronous pipeline: load canonical -> pure decide ->
+// diff -> persist (merge-patch) -> if the status transitioned, fire reactions.
+// The LCM never polls; observers (SCM poller, reaper, activity ingest) call in.
+//
+// Concurrency: the LCM serialises per session, so concurrent Apply* calls for
+// the same session do not race the load/decide/persist read-modify-write.
+type LifecycleManager interface {
+	// Raw-fact entrypoints (each runs decide internally).
+	ApplySCMObservation(ctx context.Context, id domain.SessionID, f SCMFacts) error
+	ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f RuntimeFacts) error
+	ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ActivitySignal) error
+
+	// Mutation outcomes reported by the Session Manager.
+	OnSpawnCompleted(ctx context.Context, id domain.SessionID, o SpawnOutcome) error
+	OnKillRequested(ctx context.Context, id domain.SessionID, r KillReason) error
+
+	// Reaper heartbeat that drives duration-based escalation (a non-polling LCM
+	// can't wake itself to fire a "30m elapsed" escalation); now is the caller's clock.
+	TickEscalations(ctx context.Context, now time.Time) error
+}
+
+// SessionManager (the SM) is the inbound contract called by the API layer and
+// CLI. It owns explicit mutations (spawn/kill/restore/cleanup) and never derives
+// or writes observed state itself — it routes outcomes to the LifecycleManager.
+type SessionManager interface {
+	Spawn(ctx context.Context, cfg SpawnConfig) (domain.Session, error)
+	Kill(ctx context.Context, id domain.SessionID, opts KillOptions) (KillResult, error)
+	List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error)
+	Get(ctx context.Context, id domain.SessionID) (domain.Session, error)
+	Send(ctx context.Context, id domain.SessionID, message string) error
+	Restore(ctx context.Context, id domain.SessionID) (domain.Session, error)
+	Cleanup(ctx context.Context, project domain.ProjectID) (CleanupResult, error)
+}
+
+type SpawnConfig struct { // argument of SessionManager.Spawn
+	ProjectID  domain.ProjectID
+	IssueID    domain.IssueID
+	Kind       domain.SessionKind
+	Branch     string
+	Prompt     string
+	AgentRules string
+	// OpenTerminal is reserved for a later lane (opening a terminal tab on spawn).
+	// Spawn does NOT honor it yet, so setting it currently has no effect.
+	OpenTerminal bool
+}
+
+type KillOptions struct { // argument of SessionManager.Kill
+	Reason LifecycleKillReason
+	Detail string
+}
+
+type KillResult struct { // returned by SessionManager.Kill
+	ID             domain.SessionID
+	WorkspaceFreed bool
+}
+
+type CleanupResult struct { // returned by SessionManager.Cleanup
+	Cleaned []domain.SessionID
+	Skipped []domain.SessionID // e.g. sessions whose paths still held uncommitted work
+}
diff --git a/backend/internal/ports/outbound.go b/backend/internal/ports/outbound.go
new file mode 100644
index 00000000..a9c03e22
--- /dev/null
+++ b/backend/internal/ports/outbound.go
@@ -0,0 +1,152 @@
+package ports
+
+import (
+	"context"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+)
+
+// LifecycleStore is the persistence adapter, the ONLY disk writer. It owns
+// merge-patch, atomic write, file lock, and CDC eventing. The LCM and SM only
+// ever touch state through this narrow interface.
+//
+// List returns persistence records (no derived status); the Session Manager
+// turns those into domain.Session by attaching the derived display status.
+//
+// Seed and Get are the two record-with-identity methods. The LCM does not need
+// them (Load returns lifecycle only, which is all the decider needs), but
+// without them the SM's read-model and explicit-create path would have no way
+// to write or read a record's identity (ID/ProjectID/IssueID/Kind/CreatedAt)
+// by id. (Co-owned with Tom's persistence layer — added here to close that gap.)
+type LifecycleStore interface {
+	Load(ctx context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error)
+	PatchLifecycle(ctx context.Context, id domain.SessionID, patch LifecyclePatch) error
+	List(ctx context.Context, project domain.ProjectID) ([]domain.SessionRecord, error)
+	GetMetadata(ctx context.Context, id domain.SessionID) (map[string]string, error)
+	PatchMetadata(ctx context.Context, id domain.SessionID, kv map[string]string) error
+
+	// Seed creates a new record with its identity and initial lifecycle. It is
+	// the SM's explicit-create path (the LCM only ever patches existing records);
+	// OnSpawnCompleted requires a seeded record, so Spawn calls Seed first. Seed
+	// must reject an id that already exists rather than overwrite it: re-seeding
+	// an existing session (e.g. Restore) goes through PatchLifecycle instead.
+	Seed(ctx context.Context, rec domain.SessionRecord) error
+
+	// Get returns a single full record (with identity) by id. Load is
+	// lifecycle-only, so the SM uses Get to build the read-model and to
+	// reconstruct teardown handles for Kill/Restore on one id.
+	Get(ctx context.Context, id domain.SessionID) (domain.SessionRecord, bool, error)
+}
+
+// LifecyclePatch is a sparse merge-patch handed to LifecycleStore.PatchLifecycle:
+// a nil field is left untouched, a non-nil field is written.
+//
+// Detecting needs three-way semantics (leave / set / clear-to-nil):
+//   - ClearDetecting == true  → store clears the detecting memory and IGNORES
+//     the Detecting field (clear wins; setting both is a caller bug).
+//   - ClearDetecting == false, Detecting != nil → set/replace the memory.
+//   - ClearDetecting == false, Detecting == nil  → leave it untouched.
+//
+// ExpectedRevision supports optimistic concurrency: when non-nil the store must
+// reject the patch if the stored Revision (the monotonic write counter, NOT the
+// schema Version) differs from it. This is the alternative to the LCM owning
+// all per-session serialisation itself.
+type LifecyclePatch struct {
+	Session          *domain.SessionSubstate
+	PR               *domain.PRSubstate
+	Runtime          *domain.RuntimeSubstate
+	Activity         *domain.ActivitySubstate
+	Detecting        *domain.DetectingState
+	ClearDetecting   bool
+	ExpectedRevision *int
+}
+
+// Notifier pushes OrchestratorEvents to the human (desktop/Slack later). Push, never pull.
+type Notifier interface {
+	Notify(ctx context.Context, event OrchestratorEvent) error
+}
+
+type EventPriority string // priority label carried in OrchestratorEvent.Priority
+
+const (
+	PriorityUrgent  EventPriority = "urgent"
+	PriorityAction  EventPriority = "action"
+	PriorityWarning EventPriority = "warning"
+	PriorityInfo    EventPriority = "info"
+)
+
+type OrchestratorEvent struct { // payload handed to Notifier.Notify
+	Type      string
+	Priority  EventPriority
+	SessionID domain.SessionID
+	ProjectID domain.ProjectID
+	Message   string
+	Data      map[string]any
+}
+
+// AgentMessenger injects a message into a running agent. Implementations are
+// expected to busy-detect (wait for the agent to be idle/ready) and to verify
+// delivery, which is why activity-detection accuracy matters.
+type AgentMessenger interface {
+	Send(ctx context.Context, id domain.SessionID, message string) error
+}
+
+// The runtime/agent/workspace plugin ports are co-owned with the coding-agents
+// lane; the method sets below are the minimum the Session Manager spawn/kill
+// pipelines call. They will be fleshed out alongside the tmux/claude-code impls.
+
+type Runtime interface { // runtime plugin port (see the co-ownership note above)
+	Create(ctx context.Context, cfg RuntimeConfig) (RuntimeHandle, error)
+	Destroy(ctx context.Context, handle RuntimeHandle) error
+	SendMessage(ctx context.Context, handle RuntimeHandle, message string) error
+	GetOutput(ctx context.Context, handle RuntimeHandle, lines int) (string, error)
+	IsAlive(ctx context.Context, handle RuntimeHandle) (bool, error) // boolean; contrast the four-way Agent.ProbeProcess
+}
+
+type RuntimeConfig struct { // argument of Runtime.Create
+	SessionID     domain.SessionID
+	WorkspacePath string
+	LaunchCommand string
+	Env           map[string]string
+}
+
+type RuntimeHandle struct { // returned by Runtime.Create; passed back to Destroy/SendMessage/GetOutput/IsAlive
+	ID          string
+	RuntimeName string
+}
+
+type Agent interface { // agent plugin port (see the co-ownership note above Runtime)
+	GetLaunchCommand(cfg AgentConfig) string
+	GetEnvironment(cfg AgentConfig) map[string]string
+	// ProbeProcess returns the agent process liveness classification
+	// (alive/dead/indeterminate/failed). It is neither a boolean nor an activity
+	// state: activity classification arrives separately via ActivitySignal.
+	ProbeProcess(ctx context.Context, handle RuntimeHandle) (ProcessProbe, error)
+	GetRestoreCommand(agentSessionID string) string
+}
+
+type AgentConfig struct { // argument of Agent.GetLaunchCommand and Agent.GetEnvironment
+	SessionID     domain.SessionID
+	WorkspacePath string
+	Prompt        string
+}
+
+type Workspace interface { // workspace plugin port (see the co-ownership note above Runtime)
+	Create(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error)
+	Destroy(ctx context.Context, info WorkspaceInfo) error
+	List(ctx context.Context, project domain.ProjectID) ([]WorkspaceInfo, error)
+	Restore(ctx context.Context, cfg WorkspaceConfig) (WorkspaceInfo, error)
+}
+
+type WorkspaceConfig struct { // argument of Workspace.Create and Workspace.Restore
+	ProjectID domain.ProjectID
+	SessionID domain.SessionID
+	Branch    string
+}
+
+type WorkspaceInfo struct { // returned by Workspace.Create/Restore/List; argument of Workspace.Destroy
+	Path      string
+	Branch    string
+	SessionID domain.SessionID
+	ProjectID domain.ProjectID
+}
diff --git a/backend/internal/session/fakes_test.go b/backend/internal/session/fakes_test.go
new file mode 100644
index 00000000..648172de
--- /dev/null
+++ b/backend/internal/session/fakes_test.go
@@ -0,0 +1,407 @@
+package session
+
+import (
+	"context"
+	"fmt"
+	"sync"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/lifecycle"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// callLog is the mutex-guarded list of call names shared by the fakes, so tests can assert
+// pipeline ordering (e.g. OnKillRequested before Runtime.Destroy before Workspace.Destroy).
+type callLog struct {
+	mu    sync.Mutex
+	calls []string
+}
+
+// add appends one call name to the log.
+func (c *callLog) add(s string) {
+	c.mu.Lock()
+	c.calls = append(c.calls, s)
+	c.mu.Unlock()
+}
+
+// snapshot returns a copy of the log, so callers never alias the guarded slice.
+func (c *callLog) snapshot() []string {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return append(make([]string, 0, len(c.calls)), c.calls...)
+}
+
+// indexOf returns the position of the first call equal to name, or -1.
+func (c *callLog) indexOf(name string) int {
+	for pos, call := range c.snapshot() {
+		if call == name {
+			return pos
+		}
+	}
+	return -1
+}
+
+// ---- fakeStore: in-memory LifecycleStore with faithful merge-patch + Seed/Get ----
+
+type fakeStore struct {
+	mu       sync.Mutex // guards records and metadata
+	records  map[domain.SessionID]*domain.SessionRecord
+	metadata map[domain.SessionID]map[string]string
+}
+
+var _ ports.LifecycleStore = (*fakeStore)(nil) // compile-time check that *fakeStore satisfies the port
+
+func newFakeStore() *fakeStore {
+	s := new(fakeStore)
+	s.records = make(map[domain.SessionID]*domain.SessionRecord)
+	s.metadata = make(map[domain.SessionID]map[string]string)
+	return s
+}
+
+func (s *fakeStore) Seed(_ context.Context, rec domain.SessionRecord) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if _, exists := s.records[rec.ID]; exists {
+		return fmt.Errorf("seed: session %s already exists", rec.ID)
+	}
+	if rec.Lifecycle.Version == 0 {
+		rec.Lifecycle.Version = domain.LifecycleVersion
+	}
+	// rec arrived by value, so its address already names a record of our own.
+	s.records[rec.ID] = &rec
+	return nil
+}
+
+func (s *fakeStore) Get(_ context.Context, id domain.SessionID) (domain.SessionRecord, bool, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if stored, found := s.records[id]; found {
+		return s.withMetadata(*stored), true, nil
+	}
+	// Unknown id: reported through the bool, not as an error.
+	return domain.SessionRecord{}, false, nil
+}
+
+func (s *fakeStore) Load(_ context.Context, id domain.SessionID) (domain.CanonicalSessionLifecycle, bool, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if stored, found := s.records[id]; found {
+		return stored.Lifecycle, true, nil
+	}
+	// Unknown id: zero lifecycle and found=false, with a nil error.
+	return domain.CanonicalSessionLifecycle{}, false, nil
+}
+
+// PatchLifecycle merge-patches id's lifecycle: each non-nil substate replaces the
+// stored one and ClearDetecting wins over Detecting. An unknown id gets a fresh record.
+func (s *fakeStore) PatchLifecycle(_ context.Context, id domain.SessionID, p ports.LifecyclePatch) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	rec, ok := s.records[id]
+	if !ok {
+		rec = &domain.SessionRecord{ID: id, Lifecycle: domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion}}
+	}
+	l := &rec.Lifecycle
+	// Check the revision BEFORE storing a new record, so a rejected patch leaves no phantom entry.
+	if p.ExpectedRevision != nil && *p.ExpectedRevision != l.Revision {
+		return fmt.Errorf("revision mismatch for %s: have %d, expected %d", id, l.Revision, *p.ExpectedRevision)
+	}
+	s.records[id] = rec
+
+	if p.Session != nil {
+		l.Session = *p.Session
+	}
+	if p.PR != nil {
+		l.PR = *p.PR
+	}
+	if p.Runtime != nil {
+		l.Runtime = *p.Runtime
+	}
+	if p.Activity != nil {
+		l.Activity = *p.Activity
+	}
+	switch {
+	case p.ClearDetecting:
+		l.Detecting = nil
+	case p.Detecting != nil:
+		d := *p.Detecting
+		l.Detecting = &d
+	}
+	l.Version = domain.LifecycleVersion
+	l.Revision++
+	rec.UpdatedAt = time.Now()
+	return nil
+}
+
+func (s *fakeStore) List(_ context.Context, project domain.ProjectID) ([]domain.SessionRecord, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	var matches []domain.SessionRecord // stays nil when nothing matches
+	for _, stored := range s.records {
+		if stored.ProjectID != project {
+			continue
+		}
+		matches = append(matches, s.withMetadata(*stored))
+	}
+	return matches, nil
+}
+
+func (s *fakeStore) GetMetadata(_ context.Context, id domain.SessionID) (map[string]string, error) {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	return cloneMap(s.metadata[id]), nil // a copy, so callers cannot mutate stored metadata; nil for an unknown id
+}
+
+func (s *fakeStore) PatchMetadata(_ context.Context, id domain.SessionID, kv map[string]string) error {
+	s.mu.Lock()
+	defer s.mu.Unlock()
+	if s.metadata[id] == nil { // first write for this id: create its bucket
+		s.metadata[id] = make(map[string]string)
+	}
+	for key, value := range kv {
+		s.metadata[id][key] = value
+	}
+	return nil
+}
+
+// withMetadata returns rec with a clone of the separately-stored metadata attached
+// (a real store would return them together). The caller must already hold s.mu.
+func (s *fakeStore) withMetadata(rec domain.SessionRecord) domain.SessionRecord {
+	if stored := s.metadata[rec.ID]; len(stored) != 0 {
+		rec.Metadata = cloneMap(stored)
+	}
+	return rec
+}
+
+// ---- fakeRuntime ----
+
+type fakeRuntime struct {
+	log       *callLog
+	createErr error // when non-nil, Create returns it (the call is still logged)
+	alive     bool
+
+	created   []ports.RuntimeConfig
+	destroyed []ports.RuntimeHandle
+	sent      []string // messages passed to SendMessage, in call order
+}
+
+var _ ports.Runtime = (*fakeRuntime)(nil)
+
+func (r *fakeRuntime) Create(_ context.Context, cfg ports.RuntimeConfig) (ports.RuntimeHandle, error) {
+	r.log.add("Runtime.Create") // logged before the injected failure is considered
+	if err := r.createErr; err != nil {
+		return ports.RuntimeHandle{}, err
+	}
+	r.created = append(r.created, cfg)
+	return ports.RuntimeHandle{ID: "rt-" + string(cfg.SessionID), RuntimeName: "tmux"}, nil
+}
+
+func (r *fakeRuntime) Destroy(_ context.Context, h ports.RuntimeHandle) error {
+	r.destroyed = append(r.destroyed, h) // remembered for later assertions
+	r.log.add("Runtime.Destroy")
+	return nil
+}
+
+func (r *fakeRuntime) SendMessage(_ context.Context, _ ports.RuntimeHandle, message string) error {
+	r.sent = append(r.sent, message) // recorded only here; unlike Create/Destroy, nothing goes to the call log
+	return nil
+}
+
+func (r *fakeRuntime) GetOutput(_ context.Context, _ ports.RuntimeHandle, _ int) (string, error) {
+	return "", nil // stub: always empty output and no error
+}
+
+func (r *fakeRuntime) IsAlive(_ context.Context, _ ports.RuntimeHandle) (bool, error) {
+	return r.alive, nil // canned answer; the handle is ignored
+}
+
+// ---- fakeAgent ----
+
+type fakeAgent struct {
+	env map[string]string // base environment; GetEnvironment returns a copy of it
+}
+
+var _ ports.Agent = (*fakeAgent)(nil)
+
+func (a *fakeAgent) GetLaunchCommand(_ ports.AgentConfig) string { return "claude" }
+
+func (a *fakeAgent) GetEnvironment(_ ports.AgentConfig) map[string]string { return cloneMap(a.env) }
+
+func (a *fakeAgent) ProbeProcess(_ context.Context, _ ports.RuntimeHandle) (ports.ProcessProbe, error) {
+	return ports.ProcessProbeAlive, nil // stub: the fake agent process is always reported alive
+}
+
+func (a *fakeAgent) GetRestoreCommand(agentSessionID string) string {
+	return "claude --resume " + agentSessionID // fixed command shape, so tests can predict it
+}
+
+// ---- fakeWorkspace (with worktree-remove refusal mode) ----
+
+type fakeWorkspace struct {
+	log        *callLog
+	createErr  error
+	refuse     map[string]bool // path -> Destroy refuses it (still registered after prune: uncommitted work)
+	created    []ports.WorkspaceConfig
+	destroyed  []ports.WorkspaceInfo
+	restoredID []domain.SessionID
+}
+
+var _ ports.Workspace = (*fakeWorkspace)(nil)
+
+func (w *fakeWorkspace) Create(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) {
+	w.log.add("Workspace.Create") // logged before the injected failure is considered
+	if err := w.createErr; err != nil {
+		return ports.WorkspaceInfo{}, err
+	}
+	w.created = append(w.created, cfg)
+	return workspaceFor(cfg), nil
+}
+
+func (w *fakeWorkspace) Destroy(_ context.Context, info ports.WorkspaceInfo) error {
+	w.log.add("Workspace.Destroy")
+	if !w.refuse[info.Path] {
+		w.destroyed = append(w.destroyed, info)
+		return nil
+	}
+	// Worktree-remove safety: a path still registered after `git worktree prune`
+	// may hold the agent's uncommitted work, so the fake refuses to remove it.
+	return fmt.Errorf("workspace: refusing to rm -rf %s: still registered after prune", info.Path)
+}
+
+func (w *fakeWorkspace) List(_ context.Context, _ domain.ProjectID) ([]ports.WorkspaceInfo, error) {
+	return nil, nil // stub: always reports no workspaces
+}
+
+func (w *fakeWorkspace) Restore(_ context.Context, cfg ports.WorkspaceConfig) (ports.WorkspaceInfo, error) {
+	w.restoredID = append(w.restoredID, cfg.SessionID) // remembered for later assertions
+	w.log.add("Workspace.Restore")
+	return workspaceFor(cfg), nil
+}
+
+func workspaceFor(cfg ports.WorkspaceConfig) ports.WorkspaceInfo {
+	return ports.WorkspaceInfo{
+		Path:      "/tmp/ws/" + string(cfg.SessionID),
+		Branch:    cfg.Branch,
+		SessionID: cfg.SessionID,
+		ProjectID: cfg.ProjectID,
+	}
+}
+
+// ---- recordingMessenger ----
+
+type recordingMessenger struct {
+	sent []struct {
+		ID      domain.SessionID
+		Message string
+	} // one entry per Send call, in call order
+}
+
+var _ ports.AgentMessenger = (*recordingMessenger)(nil)
+
+func (m *recordingMessenger) Send(_ context.Context, id domain.SessionID, message string) error {
+	m.sent = append(m.sent, struct {
+		ID      domain.SessionID
+		Message string
+	}{ID: id, Message: message}) // keyed, so the entry does not depend on field order
+	return nil
+}
+
+// ---- noopNotifier ----
+
+type noopNotifier struct{} // Notifier that drops every event
+
+var _ ports.Notifier = (*noopNotifier)(nil)
+
+func (noopNotifier) Notify(_ context.Context, _ ports.OrchestratorEvent) error { return nil }
+
+// ---- recordingLCM: wraps the REAL lifecycle.Manager and logs SM-facing calls ----
+
+type recordingLCM struct {
+	log   *callLog
+	inner ports.LifecycleManager // delegate for every call; newHarness wires the real lifecycle.Manager here
+
+	// onSpawnErr, when non-nil, is returned by OnSpawnCompleted without touching the
+	// inner manager, so tests can exercise the SM's post-spawn failure paths.
+	onSpawnErr error
+}
+
+var _ ports.LifecycleManager = (*recordingLCM)(nil)
+
+func (l *recordingLCM) OnSpawnCompleted(ctx context.Context, id domain.SessionID, o ports.SpawnOutcome) error {
+	l.log.add("OnSpawnCompleted")
+	if injected := l.onSpawnErr; injected != nil {
+		return injected // the inner manager is deliberately not reached
+	}
+	return l.inner.OnSpawnCompleted(ctx, id, o)
+}
+
+func (l *recordingLCM) OnKillRequested(ctx context.Context, id domain.SessionID, r ports.KillReason) error {
+	l.log.add("OnKillRequested") // recorded so tests can order it against Runtime/Workspace teardown
+	return l.inner.OnKillRequested(ctx, id, r)
+}
+
+func (l *recordingLCM) ApplySCMObservation(ctx context.Context, id domain.SessionID, f ports.SCMFacts) error {
+	return l.inner.ApplySCMObservation(ctx, id, f) // pure pass-through; not written to the call log
+}
+
+func (l *recordingLCM) ApplyRuntimeObservation(ctx context.Context, id domain.SessionID, f ports.RuntimeFacts) error {
+	return l.inner.ApplyRuntimeObservation(ctx, id, f) // pure pass-through; not written to the call log
+}
+
+func (l *recordingLCM) ApplyActivitySignal(ctx context.Context, id domain.SessionID, s ports.ActivitySignal) error {
+	return l.inner.ApplyActivitySignal(ctx, id, s) // pure pass-through; not written to the call log
+}
+
+func (l *recordingLCM) TickEscalations(ctx context.Context, now time.Time) error {
+	return l.inner.TickEscalations(ctx, now) // pure pass-through; not written to the call log
+}
+
+// ---- harness: wires the SM against the fakes + the real LCM ----
+
+type harness struct { // one wired Session Manager plus handles to every fake behind it
+	sm        *Manager
+	store     *fakeStore
+	runtime   *fakeRuntime
+	agent     *fakeAgent
+	workspace *fakeWorkspace
+	messenger *recordingMessenger
+	lcm       *recordingLCM
+	log       *callLog
+}
+
+var fixedTime = time.Date(2026, 5, 27, 12, 0, 0, 0, time.UTC) // the instant the harness Clock always returns
+
+func newHarness(id domain.SessionID) *harness {
+	calls := &callLog{}
+	h := &harness{
+		store:     newFakeStore(),
+		runtime:   &fakeRuntime{log: calls, alive: true},
+		agent:     &fakeAgent{env: map[string]string{"BASE": "1"}},
+		workspace: &fakeWorkspace{log: calls, refuse: map[string]bool{}},
+		messenger: &recordingMessenger{},
+		log:       calls,
+	}
+	h.lcm = &recordingLCM{log: calls, inner: lifecycle.New(h.store, noopNotifier{}, h.messenger)}
+	h.sm = New(Deps{
+		Runtime:   h.runtime,
+		Agent:     h.agent,
+		Workspace: h.workspace,
+		Store:     h.store,
+		Messenger: h.messenger,
+		Lifecycle: h.lcm,
+		Clock:     func() time.Time { return fixedTime },
+		NewID:     func(ports.SpawnConfig) domain.SessionID { return id },
+	})
+	return h
+}
+
+func cloneMap(in map[string]string) map[string]string {
+	if in == nil {
+		return nil
+	}
+	out := make(map[string]string, len(in))
+	for k, v := range in {
+		out[k] = v
+	}
+	return out
+}
diff --git a/backend/internal/session/manager.go b/backend/internal/session/manager.go
new file mode 100644
index 00000000..e2723d26
--- /dev/null
+++ b/backend/internal/session/manager.go
@@ -0,0 +1,464 @@
+// Package session implements ports.SessionManager: the explicit-mutation half
+// of the lane. The SM is impure plumbing — it drives the Runtime/Agent/Workspace
+// plugins to create and tear down sessions, seeds the initial lifecycle record,
+// and routes mutation outcomes to the LCM (OnSpawnCompleted / OnKillRequested).
+//
+// It NEVER derives or observes lifecycle state: observed transitions are the
+// LCM's job. The SM's only canonical writes are the explicit ones — seeding a
+// new record on Spawn and re-seeding (reopening) on Restore — and it is the
+// single producer of the derived display status, attached on read in List/Get
+// and never persisted.
+package session
+
+import (
+	"context"
+	"crypto/rand"
+	"encoding/hex"
+	"errors"
+	"fmt"
+	"strconv"
+	"time"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/lifecycle"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+// ErrNotFound is returned, wrapped, by Get/Restore when no record exists for the id.
+var ErrNotFound = errors.New("session: not found")
+
+// ErrNotRestorable is returned, wrapped, by Restore when the session is not in a
+// terminal state. Restoring a live session would spin up a second
+// runtime/workspace for the same id, duplicating the agent and risking data loss.
+var ErrNotRestorable = errors.New("session: not restorable (not terminal)")
+
+// ErrIncompleteTeardownMetadata is returned, wrapped, by Kill when a record's
+// teardown handles are missing (empty workspace path or runtime handle id): a real
+// adapter's Destroy could act on empty args — an unsafe delete — so it is skipped.
+var ErrIncompleteTeardownMetadata = errors.New("session: incomplete teardown metadata")
+
+// Env vars a spawned process reads to learn who it is (distillation §5.4).
+const (
+	EnvSessionID = "AO_SESSION_ID" // the session's id
+	EnvProjectID = "AO_PROJECT_ID" // the session's project id
+	EnvIssueID   = "AO_ISSUE_ID"   // the session's issue id (set even when empty)
+)
+
+// Manager implements ports.SessionManager against the outbound ports. Every
+// dependency is an interface so the SM runs entirely against fakes in tests.
+type Manager struct {
+	runtime   ports.Runtime
+	agent     ports.Agent
+	workspace ports.Workspace
+	store     ports.LifecycleStore
+	messenger ports.AgentMessenger
+	lcm       ports.LifecycleManager
+
+	clock func() time.Time                         // time source; New defaults it to time.Now
+	newID func(ports.SpawnConfig) domain.SessionID // id generator; New defaults it to defaultNewID
+}
+
+var _ ports.SessionManager = (*Manager)(nil)
+
+// Deps groups the SM's collaborators. Clock and NewID are optional (defaulted)
+// so production wiring only supplies the ports.
+type Deps struct {
+	Runtime   ports.Runtime
+	Agent     ports.Agent
+	Workspace ports.Workspace
+	Store     ports.LifecycleStore
+	Messenger ports.AgentMessenger
+	Lifecycle ports.LifecycleManager
+
+	Clock func() time.Time                         // optional; nil means time.Now
+	NewID func(ports.SpawnConfig) domain.SessionID // optional; nil means defaultNewID
+}
+
+// New builds a Manager from d; a nil Clock or NewID gets the production default.
+func New(d Deps) *Manager {
+	if d.Clock == nil {
+		d.Clock = time.Now
+	}
+	if d.NewID == nil {
+		d.NewID = defaultNewID
+	}
+	return &Manager{
+		runtime:   d.Runtime,
+		agent:     d.Agent,
+		workspace: d.Workspace,
+		store:     d.Store,
+		messenger: d.Messenger,
+		lcm:       d.Lifecycle,
+		clock:     d.Clock,
+		newID:     d.NewID,
+	}
+}
+
+// ---- Spawn ----
+
+// Spawn runs the create pipeline in spec order: workspace -> runtime -> seed ->
+// report to the LCM, then returns the stored session via Get. The record is seeded
+// LATE (after the runtime is up), so a failure before the seed leaves no record
+// for Cleanup to reclaim — hence each step eagerly rolls back what already succeeded.
+func (m *Manager) Spawn(ctx context.Context, cfg ports.SpawnConfig) (domain.Session, error) {
+	id := m.newID(cfg)
+
+	ws, err := m.workspace.Create(ctx, ports.WorkspaceConfig{
+		ProjectID: cfg.ProjectID,
+		SessionID: id,
+		Branch:    cfg.Branch,
+	})
+	if err != nil {
+		return domain.Session{}, fmt.Errorf("spawn %s: workspace create: %w", id, err)
+	}
+
+	agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path, Prompt: buildPrompt(cfg)}
+	handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{
+		SessionID:     id,
+		WorkspacePath: ws.Path,
+		LaunchCommand: m.agent.GetLaunchCommand(agentCfg),
+		Env:           spawnEnv(m.agent.GetEnvironment(agentCfg), id, cfg.ProjectID, cfg.IssueID),
+	})
+	if err != nil {
+		m.rollbackWorkspace(ctx, ws) // nothing seeded yet
+		return domain.Session{}, fmt.Errorf("spawn %s: runtime create: %w", id, err)
+	}
+
+	if err := m.store.Seed(ctx, seedRecord(id, cfg, m.clock())); err != nil {
+		m.rollbackRuntime(ctx, handle)
+		m.rollbackWorkspace(ctx, ws)
+		return domain.Session{}, fmt.Errorf("spawn %s: seed: %w", id, err)
+	}
+
+	outcome := ports.SpawnOutcome{Branch: ws.Branch, WorkspacePath: ws.Path, RuntimeHandle: handle}
+	if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil {
+		// The record is seeded but the runtime/workspace are about to be torn
+		// down. The store has no delete, so route the orphan to a terminal
+		// errored state (best effort) rather than strand a phantom "spawning".
+		_ = m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: ports.KillError, Detail: "spawn completion failed"})
+		m.rollbackRuntime(ctx, handle)
+		m.rollbackWorkspace(ctx, ws)
+		return domain.Session{}, fmt.Errorf("spawn %s: on spawn completed: %w", id, err)
+	}
+
+	return m.Get(ctx, id)
+}
+
+// rollback* are best-effort teardown after a failed step: errors are dropped so
+// they never mask the originating failure (there is no logger at this layer).
+// They detach from ctx's cancellation so a cancelled request still cleans up.
+func (m *Manager) rollbackWorkspace(ctx context.Context, ws ports.WorkspaceInfo) {
+	_ = m.workspace.Destroy(context.WithoutCancel(ctx), ws)
+}
+
+func (m *Manager) rollbackRuntime(ctx context.Context, h ports.RuntimeHandle) {
+	_ = m.runtime.Destroy(context.WithoutCancel(ctx), h)
+}
+
+// ---- Kill ----
+
+// Kill validates the stored teardown handles, records terminal intent with the
+// LCM, then destroys the runtime and finally the workspace; the first failing
+// step aborts the rest. An unknown id is a benign no-op (nil error). There is no
+// separate Agent stop: the agent runs inside the runtime, so Runtime.Destroy
+// stops it. A workspace-destroy error (e.g. the worktree-remove safety refusing)
+// is returned with WorkspaceFreed=false and is never forced.
+func (m *Manager) Kill(ctx context.Context, id domain.SessionID, opts ports.KillOptions) (ports.KillResult, error) {
+	rec, ok, err := m.store.Get(ctx, id)
+	if err != nil {
+		return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w", id, err)
+	}
+	if !ok {
+		// Already gone: benign race, mirrors LCM.OnKillRequested's no-op.
+		return ports.KillResult{ID: id}, nil
+	}
+	meta, err := m.store.GetMetadata(ctx, id)
+	if err != nil {
+		return ports.KillResult{ID: id}, fmt.Errorf("kill %s: metadata: %w", id, err)
+	}
+
+	// Validate the teardown handles BEFORE recording intent or touching an
+	// adapter: a corrupted/partially-seeded record with empty handles must never
+	// reach Destroy (empty path / handle could be an unsafe delete).
+	rtHandle := runtimeHandle(meta)
+	wsInfo := workspaceInfo(rec, meta)
+	if !validRuntimeHandle(rtHandle) {
+		return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: runtime handle", id, ErrIncompleteTeardownMetadata)
+	}
+	if !validWorkspaceInfo(wsInfo) {
+		return ports.KillResult{ID: id}, fmt.Errorf("kill %s: %w: workspace path", id, ErrIncompleteTeardownMetadata)
+	}
+
+	if err := m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: opts.Reason, Detail: opts.Detail}); err != nil {
+		return ports.KillResult{ID: id}, fmt.Errorf("kill %s: on kill requested: %w", id, err)
+	}
+	if err := m.runtime.Destroy(ctx, rtHandle); err != nil {
+		return ports.KillResult{ID: id}, fmt.Errorf("kill %s: runtime destroy: %w", id, err)
+	}
+	if err := m.workspace.Destroy(ctx, wsInfo); err != nil {
+		return ports.KillResult{ID: id, WorkspaceFreed: false}, fmt.Errorf("kill %s: workspace destroy: %w", id, err)
+	}
+	return ports.KillResult{ID: id, WorkspaceFreed: true}, nil
+}
+
+// ---- read-model ----
+
+// List returns every stored session of a project as a read-model entry, with the
+// display status derived on read (the SM is the single producer of that status).
+func (m *Manager) List(ctx context.Context, project domain.ProjectID) ([]domain.Session, error) {
+	records, err := m.store.List(ctx, project)
+	if err != nil {
+		return nil, fmt.Errorf("list %s: %w", project, err)
+	}
+	sessions := make([]domain.Session, len(records))
+	for i := range records {
+		sessions[i] = toSession(records[i])
+	}
+	return sessions, nil
+}
+
+func (m *Manager) Get(ctx context.Context, id domain.SessionID) (domain.Session, error) {
+	rec, found, err := m.store.Get(ctx, id)
+	if err == nil && !found {
+		err = ErrNotFound // an absent record is reported through the same wrapped path
+	}
+	if err != nil {
+		return domain.Session{}, fmt.Errorf("get %s: %w", id, err)
+	}
+	return toSession(rec), nil // display status is derived here, on read
+}
+
+// ---- Send ----
+
+// Send delivers message via the AgentMessenger, which busy-detects and verifies delivery.
+func (m *Manager) Send(ctx context.Context, id domain.SessionID, message string) error {
+	err := m.messenger.Send(ctx, id, message)
+	if err == nil {
+		return nil
+	}
+	return fmt.Errorf("send %s: %w", id, err)
+}
+
+// ---- Restore ----
+
+// Restore relaunches a previously torn-down session in its workspace. The
+// fallible I/O (workspace restore + runtime create) runs first so a failure
+// touches no canonical state and never destroys the worktree (it may hold the
+// agent's prior work). Only once the runtime is up is the lifecycle reopened — an
+// explicit mutation the LCM's observe path would never make — with the PR axis
+// cleared; OnSpawnCompleted then flips the runtime to alive. If that last step
+// fails, the runtime is torn down and the session is parked terminal (errored).
+func (m *Manager) Restore(ctx context.Context, id domain.SessionID) (domain.Session, error) {
+	rec, ok, err := m.store.Get(ctx, id)
+	if err != nil {
+		return domain.Session{}, fmt.Errorf("restore %s: %w", id, err)
+	}
+	if !ok {
+		return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotFound)
+	}
+	// Only a torn-down session may be restored: reopening a live one would duplicate its runtime/workspace.
+	if !isTerminalSession(rec.Lifecycle.Session.State) {
+		return domain.Session{}, fmt.Errorf("restore %s: %w", id, ErrNotRestorable)
+	}
+	meta, err := m.store.GetMetadata(ctx, id)
+	if err != nil {
+		return domain.Session{}, fmt.Errorf("restore %s: metadata: %w", id, err)
+	}
+
+	// Resume needs the agent's captured session id: without it GetRestoreCommand
+	// would produce an ambiguous "resume nothing" launch, and there is no stored
+	// prompt to fall back to a fresh launch — so fail early, before any I/O.
+	agentSessionID := meta[lifecycle.MetaAgentSessionID]
+	if agentSessionID == "" {
+		return domain.Session{}, fmt.Errorf("restore %s: missing agent session id (cannot resume)", id)
+	}
+
+	ws, err := m.workspace.Restore(ctx, ports.WorkspaceConfig{
+		ProjectID: rec.ProjectID,
+		SessionID: id,
+		Branch:    meta[lifecycle.MetaBranch],
+	})
+	if err != nil {
+		return domain.Session{}, fmt.Errorf("restore %s: workspace restore: %w", id, err)
+	}
+
+	agentCfg := ports.AgentConfig{SessionID: id, WorkspacePath: ws.Path}
+	handle, err := m.runtime.Create(ctx, ports.RuntimeConfig{
+		SessionID:     id,
+		WorkspacePath: ws.Path,
+		LaunchCommand: m.agent.GetRestoreCommand(agentSessionID),
+		Env:           spawnEnv(m.agent.GetEnvironment(agentCfg), id, rec.ProjectID, rec.IssueID),
+	})
+	if err != nil {
+		return domain.Session{}, fmt.Errorf("restore %s: runtime create: %w", id, err)
+	}
+
+	// Past this point the runtime is live: a failure must tear it back down, but
+	// never the workspace, which holds the agent's prior work.
+	reopen := ports.LifecyclePatch{
+		Session: &domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested},
+		PR:      &domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonClearedOnRestore},
+	}
+	if err := m.store.PatchLifecycle(ctx, id, reopen); err != nil {
+		m.rollbackRuntime(ctx, handle)
+		return domain.Session{}, fmt.Errorf("restore %s: reopen: %w", id, err)
+	}
+
+	outcome := ports.SpawnOutcome{
+		Branch:         ws.Branch,
+		WorkspacePath:  ws.Path,
+		RuntimeHandle:  handle,
+		AgentSessionID: agentSessionID,
+	}
+	if err := m.lcm.OnSpawnCompleted(ctx, id, outcome); err != nil {
+		// The lifecycle is already reopened: park it terminal again (best effort, as
+		// Spawn does) so the session is not stranded "spawning" and stays restorable.
+		_ = m.lcm.OnKillRequested(ctx, id, ports.KillReason{Kind: ports.KillError, Detail: "restore completion failed"})
+		m.rollbackRuntime(ctx, handle)
+		return domain.Session{}, fmt.Errorf("restore %s: on spawn completed: %w", id, err)
+	}
+	return m.Get(ctx, id)
+}
+
+// ---- Cleanup ----
+
+// Cleanup reclaims the workspaces of a project's terminal sessions and reports
+// each as cleaned or skipped. A workspace whose teardown fails (e.g. refused by
+// the worktree-remove safety for uncommitted work) is skipped, never forced.
+// Runtime teardown is best-effort; only the workspace result decides the outcome.
+func (m *Manager) Cleanup(ctx context.Context, project domain.ProjectID) (ports.CleanupResult, error) {
+	records, err := m.store.List(ctx, project)
+	if err != nil {
+		return ports.CleanupResult{}, fmt.Errorf("cleanup %s: %w", project, err)
+	}
+	var result ports.CleanupResult
+	for _, rec := range records {
+		if !isTerminalSession(rec.Lifecycle.Session.State) {
+			continue
+		}
+		meta, err := m.store.GetMetadata(ctx, rec.ID)
+		if err != nil {
+			return result, fmt.Errorf("cleanup %s: metadata %s: %w", project, rec.ID, err)
+		}
+		// An empty workspace path never reaches an adapter's Destroy (an unsafe
+		// delete), so such a record counts as not reclaimed.
+		target := workspaceInfo(rec, meta)
+		reclaimed := false
+		if validWorkspaceInfo(target) {
+			if rt := runtimeHandle(meta); validRuntimeHandle(rt) {
+				_ = m.runtime.Destroy(ctx, rt) // best effort; usually already gone
+			}
+			reclaimed = m.workspace.Destroy(ctx, target) == nil
+		}
+		if reclaimed {
+			result.Cleaned = append(result.Cleaned, rec.ID)
+		} else {
+			result.Skipped = append(result.Skipped, rec.ID)
+		}
+	}
+	return result, nil
+}
+
+// ---- helpers ----
+
+func toSession(rec domain.SessionRecord) domain.Session {
+	return domain.Session{Status: domain.DeriveLegacyStatus(rec.Lifecycle), SessionRecord: rec} // status is derived, never stored
+}
+
+func isTerminalSession(s domain.SessionState) bool {
+	return s == domain.SessionTerminated || s == domain.SessionDone // either terminal state qualifies
+}
+
+// buildPrompt joins the explicit prompt and the agent rules with a blank line,
+// or returns whichever one is set; the full 3-layer assembly lands later.
+func buildPrompt(cfg ports.SpawnConfig) string {
+	prompt, rules := cfg.Prompt, cfg.AgentRules
+	if rules == "" {
+		return prompt
+	}
+	if prompt == "" {
+		return rules
+	}
+	return prompt + "\n\n" + rules
+}
+
+// spawnEnv returns a fresh map: the agent's environment with the AO_* identity
+// vars written last (so they win), leaving the map the agent returned untouched.
+func spawnEnv(base map[string]string, id domain.SessionID, project domain.ProjectID, issue domain.IssueID) map[string]string {
+	merged := make(map[string]string, len(base)+3)
+	for name := range base {
+		merged[name] = base[name]
+	}
+	merged[EnvSessionID] = string(id)
+	merged[EnvProjectID] = string(project)
+	merged[EnvIssueID] = string(issue)
+	return merged
+}
+
+// seedRecord builds the record written at spawn time: identity copied from cfg,
+// both timestamps set to now, and the initial lifecycle (session not started,
+// runtime unknown, no PR).
+func seedRecord(id domain.SessionID, cfg ports.SpawnConfig, now time.Time) domain.SessionRecord {
+	rec := domain.SessionRecord{
+		ID: id, ProjectID: cfg.ProjectID, IssueID: cfg.IssueID, Kind: cfg.Kind,
+		CreatedAt: now, UpdatedAt: now,
+	}
+	rec.Lifecycle = domain.CanonicalSessionLifecycle{
+		Version: domain.LifecycleVersion,
+		Session: domain.SessionSubstate{State: domain.SessionNotStarted, Reason: domain.ReasonSpawnRequested},
+		Runtime: domain.RuntimeSubstate{State: domain.RuntimeUnknown, Reason: domain.RuntimeReasonSpawnIncomplete},
+		PR:      domain.PRSubstate{State: domain.PRNone, Reason: domain.PRReasonNotCreated},
+	}
+	return rec
+}
+
+// runtimeHandle and workspaceInfo rebuild teardown handles from the metadata the
+// LCM persisted in OnSpawnCompleted (the metadata-key contract is shared with the
+// lifecycle package). Keys missing from meta yield empty fields.
+func runtimeHandle(meta map[string]string) ports.RuntimeHandle {
+	var h ports.RuntimeHandle
+	h.ID = meta[lifecycle.MetaRuntimeHandleID]
+	h.RuntimeName = meta[lifecycle.MetaRuntimeName]
+	return h
+}
+
+// workspaceInfo rebuilds the workspace teardown handle: path and branch come
+// from metadata, the session/project ids from the record itself.
+func workspaceInfo(rec domain.SessionRecord, meta map[string]string) ports.WorkspaceInfo {
+	info := ports.WorkspaceInfo{SessionID: rec.ID, ProjectID: rec.ProjectID}
+	info.Path = meta[lifecycle.MetaWorkspacePath]
+	info.Branch = meta[lifecycle.MetaBranch]
+	return info
+}
+
+// validRuntimeHandle reports whether h carries a non-empty id. An adapter needs
+// the id to target the right process; an empty one would be ambiguous, so callers
+// refuse to call Destroy with it.
+func validRuntimeHandle(h ports.RuntimeHandle) bool {
+	return len(h.ID) > 0
+}
+
+// validWorkspaceInfo reports whether w names a concrete path to reclaim. An empty
+// path handed to a worktree-remove could resolve to an unsafe target.
+func validWorkspaceInfo(w ports.WorkspaceInfo) bool {
+	return len(w.Path) > 0
+}
+
+// defaultNewID returns "<base>-<randHex(4)>", where base is the issue id, else
+// the kind, else the literal "session".
+func defaultNewID(cfg ports.SpawnConfig) domain.SessionID {
+	for _, base := range []string{string(cfg.IssueID), string(cfg.Kind)} {
+		if base != "" {
+			return domain.SessionID(base + "-" + randHex(4))
+		}
+	}
+	return domain.SessionID("session-" + randHex(4))
+}
+
+func randHex(n int) string {
+	buf := make([]byte, n) // n random bytes encode to 2n hex characters
+	if _, err := rand.Read(buf); err == nil {
+		return hex.EncodeToString(buf)
+	}
+	return strconv.FormatInt(time.Now().UnixNano(), 16) // random source failed: fall back to the clock
+}
diff --git a/backend/internal/session/manager_test.go b/backend/internal/session/manager_test.go
new file mode 100644
index 00000000..702a735e
--- /dev/null
+++ b/backend/internal/session/manager_test.go
@@ -0,0 +1,559 @@
+package session
+
+import (
+	"context"
+	"errors"
+	"testing"
+
+	"github.com/aoagents/agent-orchestrator/backend/internal/domain"
+	"github.com/aoagents/agent-orchestrator/backend/internal/lifecycle"
+	"github.com/aoagents/agent-orchestrator/backend/internal/ports"
+)
+
+const (
+	testProject = domain.ProjectID("proj")
+	testIssue   = domain.IssueID("42")
+)
+
+// spawnCfg is the worker spawn request shared by the tests: project "proj",
+// issue "42", branch feat/42, with both a prompt and agent rules set.
+func spawnCfg() ports.SpawnConfig {
+	var cfg ports.SpawnConfig
+	cfg.ProjectID, cfg.IssueID = testProject, testIssue
+	cfg.Kind = domain.KindWorker
+	cfg.Branch = "feat/42"
+	cfg.Prompt, cfg.AgentRules = "do the thing", "be careful"
+	return cfg
+}
+
+func TestSpawn_HappyPath(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+
+	sess, err := h.sm.Spawn(ctx, spawnCfg())
+	if err != nil {
+		t.Fatalf("spawn: %v", err)
+	}
+
+	// Display status is derived (single producer) — a freshly spawned, not_started
+	// session shows as spawning.
+	if sess.Status != domain.StatusSpawning {
+		t.Errorf("status = %q, want %q", sess.Status, domain.StatusSpawning)
+	}
+
+	// Record seeded with identity + initial lifecycle, then OnSpawnCompleted flipped
+	// the runtime axis to alive.
+	rec, ok, err := h.store.Get(ctx, "sess-1")
+	if err != nil || !ok {
+		t.Fatalf("get seeded record: ok=%v err=%v", ok, err)
+	}
+	if rec.ProjectID != testProject || rec.IssueID != testIssue || rec.Kind != domain.KindWorker {
+		t.Errorf("identity = %+v, want proj/42/worker", rec)
+	}
+	if !rec.CreatedAt.Equal(fixedTime) {
+		t.Errorf("createdAt = %v, want %v", rec.CreatedAt, fixedTime)
+	}
+	if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested {
+		t.Errorf("session substate = %+v, want not_started/spawn_requested", got)
+	}
+	if got := rec.Lifecycle.Runtime; got.State != domain.RuntimeAlive || got.Reason != domain.RuntimeReasonProcessRunning {
+		t.Errorf("runtime substate = %+v, want alive/process_running", got)
+	}
+
+	// Pipeline order: workspace -> runtime -> (seed; absent from the expected call log) -> LCM.
+	wantOrder := []string{"Workspace.Create", "Runtime.Create", "OnSpawnCompleted"}
+	if got := h.log.snapshot(); !equalStrings(got, wantOrder) {
+		t.Errorf("call order = %v, want %v", got, wantOrder)
+	}
+
+	// Identity env wired onto the runtime config, layered over the agent's env.
+	if len(h.runtime.created) != 1 {
+		t.Fatalf("runtime.created = %d, want 1", len(h.runtime.created))
+	}
+	env := h.runtime.created[0].Env
+	for k, want := range map[string]string{
+		EnvSessionID: "sess-1",
+		EnvProjectID: "proj",
+		EnvIssueID:   "42",
+		"BASE":       "1",
+	} {
+		if env[k] != want {
+			t.Errorf("env[%q] = %q, want %q", k, env[k], want)
+		}
+	}
+
+	// Handles persisted to metadata for later teardown/restore.
+	meta, _ := h.store.GetMetadata(ctx, "sess-1") // a failed read shows up as mismatches below
+	for k, want := range map[string]string{
+		lifecycle.MetaBranch:          "feat/42",
+		lifecycle.MetaWorkspacePath:   "/tmp/ws/sess-1",
+		lifecycle.MetaRuntimeHandleID: "rt-sess-1",
+		lifecycle.MetaRuntimeName:     "tmux",
+	} {
+		if meta[k] != want {
+			t.Errorf("meta[%q] = %q, want %q", k, meta[k], want)
+		}
+	}
+}
+
+func TestSpawn_RuntimeCreateFailure_RollsBack(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	h.runtime.createErr = errors.New("boom")
+
+	_, err := h.sm.Spawn(ctx, spawnCfg())
+	if err == nil {
+		t.Fatal("spawn: want error, got nil")
+	}
+
+	// No record may be seeded for a spawn that never completed.
+	if _, ok, _ := h.store.Get(ctx, "sess-1"); ok {
+		t.Error("record was seeded despite runtime-create failure")
+	}
+	// The already-created workspace was rolled back (eager rollback), since a
+	// late-seeded record means Cleanup could never find this orphan.
+	if len(h.workspace.destroyed) != 1 || h.workspace.destroyed[0].Path != "/tmp/ws/sess-1" {
+		t.Errorf("workspace.destroyed = %+v, want the created worktree", h.workspace.destroyed)
+	}
+	// The LCM was never told that a spawn completed.
+	if h.log.indexOf("OnSpawnCompleted") != -1 {
+		t.Error("OnSpawnCompleted should not fire on a failed spawn")
+	}
+}
+
+func TestSpawn_OnSpawnCompletedFailure_RoutesOrphanToErrored(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	h.lcm.onSpawnErr = errors.New("lcm boom")
+
+	_, err := h.sm.Spawn(ctx, spawnCfg())
+	if err == nil {
+		t.Fatal("spawn: want error, got nil")
+	}
+
+	// Runtime + workspace are each torn down exactly once on the failure path.
+	if len(h.runtime.destroyed) != 1 {
+		t.Errorf("runtime.destroyed = %d, want 1", len(h.runtime.destroyed))
+	}
+	if len(h.workspace.destroyed) != 1 {
+		t.Errorf("workspace.destroyed = %d, want 1", len(h.workspace.destroyed))
+	}
+	// The record was already seeded and the store has no delete, so the orphan is
+	// routed to a terminal errored state (via OnKillRequested(KillError)) rather
+	// than stranded forever as "spawning".
+	rec, ok, _ := h.store.Get(ctx, "sess-1")
+	if !ok {
+		t.Fatal("seeded record vanished; expected it parked as errored")
+	}
+	if got := rec.Lifecycle.Session; got.State != domain.SessionTerminated || got.Reason != domain.ReasonErrorInProcess {
+		t.Errorf("session substate = %+v, want terminated/error_in_process", got)
+	}
+	if status := domain.DeriveLegacyStatus(rec.Lifecycle); status != domain.StatusErrored {
+		t.Errorf("status = %q, want errored", status)
+	}
+}
+
+func TestKill_OrderingAndTerminalState(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil {
+		t.Fatalf("spawn: %v", err)
+	}
+
+	killed, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual})
+	if err != nil {
+		t.Fatalf("kill: %v", err)
+	}
+	if !killed.WorkspaceFreed {
+		t.Error("WorkspaceFreed = false, want true")
+	}
+
+	// The LCM learns of the kill before any teardown; the runtime goes before the workspace.
+	intentAt := h.log.indexOf("OnKillRequested")
+	runtimeAt := h.log.indexOf("Runtime.Destroy")
+	workspaceAt := h.log.indexOf("Workspace.Destroy")
+	if intentAt < 0 || intentAt >= runtimeAt || runtimeAt >= workspaceAt {
+		t.Errorf("kill order indices: OnKillRequested=%d Runtime.Destroy=%d Workspace.Destroy=%d (want ascending)", intentAt, runtimeAt, workspaceAt)
+	}
+
+	// The LCM's terminal write must derive to the "killed" display status.
+	rec, _, _ := h.store.Get(ctx, "sess-1")
+	if sub := rec.Lifecycle.Session; sub.State != domain.SessionTerminated || sub.Reason != domain.ReasonManuallyKilled {
+		t.Errorf("session substate = %+v, want terminated/manually_killed", sub)
+	}
+	if shown := domain.DeriveLegacyStatus(rec.Lifecycle); shown != domain.StatusKilled {
+		t.Errorf("status = %q, want killed", shown)
+	}
+}
+
+func TestKill_WorktreeRemoveRefusalSurfaced(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil {
+		t.Fatalf("spawn: %v", err)
+	}
+	// Simulate the worktree-remove safety: path still registered after prune (uncommitted work).
+	h.workspace.refuse["/tmp/ws/sess-1"] = true
+
+	res, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual})
+	if err == nil {
+		t.Fatal("kill: want refusal error, got nil")
+	}
+	if res.WorkspaceFreed {
+		t.Error("WorkspaceFreed = true, want false on refusal")
+	}
+	// The refusal must be honored — the path is never force-deleted.
+	if len(h.workspace.destroyed) != 0 {
+		t.Errorf("workspace.destroyed = %+v, want none (refused)", h.workspace.destroyed)
+	}
+	// Runtime still torn down and intent still recorded — only the worktree is spared.
+	if h.log.indexOf("Runtime.Destroy") == -1 || h.log.indexOf("OnKillRequested") == -1 {
+		t.Error("runtime teardown / kill intent should still happen on a workspace refusal")
+	}
+}
+
+func TestKill_IncompleteMetadata_RefusesTeardown(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	// Seed a live record directly, bypassing Spawn, so it has no teardown metadata
+	// (empty runtime handle + workspace path) — e.g. a partially-seeded or corrupted record.
+	if err := h.store.Seed(ctx, domain.SessionRecord{
+		ID: "sess-1", ProjectID: testProject,
+		Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""),
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); !errors.Is(err, ErrIncompleteTeardownMetadata) {
+		t.Fatalf("kill: err = %v, want ErrIncompleteTeardownMetadata", err)
+	}
+	// Nothing destroyed with empty args, and no intent recorded.
+	if len(h.runtime.destroyed) != 0 || len(h.workspace.destroyed) != 0 {
+		t.Errorf("teardown ran despite incomplete metadata: rt=%v ws=%v", h.runtime.destroyed, h.workspace.destroyed)
+	}
+	if h.log.indexOf("OnKillRequested") != -1 {
+		t.Error("kill intent recorded despite incomplete metadata")
+	}
+}
+
+func TestCleanup_IncompleteMetadata_Skipped(t *testing.T) {
+	h := newHarness("unused")
+	ctx := context.Background()
+	// Terminal session seeded directly, so no workspace path was persisted — it must
+	// be reported as skipped, never handed to Destroy with an empty path.
+	if err := h.store.Seed(ctx, domain.SessionRecord{
+		ID: "orphan-1", ProjectID: testProject,
+		Lifecycle: lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""),
+	}); err != nil {
+		t.Fatalf("seed: %v", err)
+	}
+
+	res, err := h.sm.Cleanup(ctx, testProject)
+	if err != nil {
+		t.Fatalf("cleanup: %v", err)
+	}
+	if !equalIDSet(res.Skipped, []domain.SessionID{"orphan-1"}) {
+		t.Errorf("skipped = %v, want [orphan-1]", res.Skipped)
+	}
+	if len(res.Cleaned) != 0 {
+		t.Errorf("cleaned = %v, want none", res.Cleaned)
+	}
+	if len(h.workspace.destroyed) != 0 {
+		t.Errorf("workspace.destroyed = %v, want none (empty path must not reach Destroy)", h.workspace.destroyed)
+	}
+}
+
+func TestRestore_LiveSession_Rejected(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil {
+		t.Fatalf("spawn: %v", err)
+	}
+	// The session is still live (never killed). Store an agent session id so the only
+	// thing blocking Restore is the non-terminal lifecycle, not missing metadata.
+	if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil {
+		t.Fatalf("patch metadata: %v", err)
+	}
+	createdBefore := len(h.runtime.created)
+	restoresBefore := len(h.workspace.restoredID)
+
+	if _, err := h.sm.Restore(ctx, "sess-1"); !errors.Is(err, ErrNotRestorable) {
+		t.Fatalf("restore: err = %v, want ErrNotRestorable", err)
+	}
+	// No second runtime/workspace may be spun up for the still-live session.
+	if len(h.runtime.created) != createdBefore {
+		t.Error("runtime created for a live-session restore")
+	}
+	if len(h.workspace.restoredID) != restoresBefore {
+		t.Error("workspace restored for a live-session restore")
+	}
+}
+
+func TestListAndGet_DeriveStatus(t *testing.T) {
+	cases := []struct {
+		name string
+		lc   domain.CanonicalSessionLifecycle
+		want domain.SessionStatus
+	}{
+		{"not_started", lc(domain.SessionNotStarted, domain.ReasonSpawnRequested, domain.PRNone, ""), domain.StatusSpawning},
+		{"working", lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""), domain.StatusWorking},
+		{"idle", lc(domain.SessionIdle, domain.ReasonResearchComplete, domain.PRNone, ""), domain.StatusIdle},
+		{"needs_input", lc(domain.SessionNeedsInput, domain.ReasonAwaitingUserInput, domain.PRNone, ""), domain.StatusNeedsInput},
+		{"pr_ci_failed", lc(domain.SessionWorking, domain.ReasonFixingCI, domain.PROpen, domain.PRReasonCIFailing), domain.StatusCIFailed},
+		{"pr_merged", lc(domain.SessionIdle, domain.ReasonMergedWaitingDecision, domain.PRMerged, domain.PRReasonMerged), domain.StatusMerged},
+		{"killed", lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, ""), domain.StatusKilled},
+	}
+
+	h := newHarness("unused")
+	ctx := context.Background()
+	for _, c := range cases {
+		if err := h.store.Seed(ctx, domain.SessionRecord{ID: domain.SessionID(c.name), ProjectID: testProject, Lifecycle: c.lc}); err != nil {
+			t.Fatalf("seed %s: %v", c.name, err)
+		}
+	}
+
+	// Get derives the display status for each record (the case name doubles as its id).
+	for _, c := range cases {
+		got, err := h.sm.Get(ctx, domain.SessionID(c.name))
+		if err != nil {
+			t.Fatalf("get %s: %v", c.name, err)
+		}
+		if got.Status != c.want {
+			t.Errorf("get %s: status = %q, want %q", c.name, got.Status, c.want)
+		}
+	}
+
+	// List derives the same status for every record in the project.
+	got, err := h.sm.List(ctx, testProject)
+	if err != nil {
+		t.Fatalf("list: %v", err)
+	}
+	if len(got) != len(cases) {
+		t.Fatalf("list len = %d, want %d", len(got), len(cases))
+	}
+	byID := map[domain.SessionID]domain.SessionStatus{}
+	for _, s := range got {
+		byID[s.ID] = s.Status
+	}
+	for _, c := range cases {
+		if byID[domain.SessionID(c.name)] != c.want {
+			t.Errorf("list %s: status = %q, want %q", c.name, byID[domain.SessionID(c.name)], c.want)
+		}
+	}
+}
+
+func TestGet_NotFound(t *testing.T) {
+	_, err := newHarness("sess-1").sm.Get(context.Background(), "missing")
+	if !errors.Is(err, ErrNotFound) {
+		t.Errorf("get missing: err = %v, want ErrNotFound", err)
+	}
+}
+
+func TestSend_RoutesToMessenger(t *testing.T) {
+	h := newHarness("sess-1")
+	if err := h.sm.Send(context.Background(), "sess-1", "hello"); err != nil {
+		t.Fatalf("send: %v", err)
+	}
+	if sent := h.messenger.sent; len(sent) != 1 || sent[0].ID != "sess-1" || sent[0].Message != "hello" {
+		t.Errorf("messenger.sent = %+v, want one {sess-1, hello}", sent)
+	}
+}
+
+func TestRestore_RelaunchesWithResumeCommand(t *testing.T) {
+	h := newHarness("sess-1")
+	ctx := context.Background()
+	if _, err := h.sm.Spawn(ctx, spawnCfg()); err != nil {
+		t.Fatalf("spawn: %v", err)
+	}
+	if _, err := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); err != nil {
+		t.Fatalf("kill: %v", err)
+	}
+	// Restore reads the agent's resume id from metadata; the test sets it explicitly.
+	if err := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); err != nil {
+		t.Fatalf("patch metadata: %v", err)
+	}
+
+	sess, err := h.sm.Restore(ctx, "sess-1")
+	if err != nil {
+		t.Fatalf("restore: %v", err)
+	}
+
+	// Reopened: terminal session reset to a fresh spawn, PR cleared, runtime alive.
+	if sess.Status != domain.StatusSpawning {
+		t.Errorf("status = %q, want spawning", sess.Status)
+	}
+	rec, _, _ := h.store.Get(ctx, "sess-1")
+	if got := rec.Lifecycle.Session; got.State != domain.SessionNotStarted || got.Reason != domain.ReasonSpawnRequested {
+		t.Errorf("session substate = %+v, want not_started/spawn_requested", got)
+	}
+	if got := rec.Lifecycle.PR; got.State != domain.PRNone || got.Reason != domain.PRReasonClearedOnRestore {
+		t.Errorf("pr substate = %+v, want none/cleared_on_restore", got)
+	}
+	if rec.Lifecycle.Runtime.State != domain.RuntimeAlive {
+		t.Errorf("runtime state = %q, want alive", rec.Lifecycle.Runtime.State)
+	}
+
+	// Relaunched via the agent's resume command: created[0] is the spawn, created[1] the restore.
+	if len(h.runtime.created) != 2 {
+		t.Fatalf("runtime.created = %d, want 2 (spawn + restore)", len(h.runtime.created))
+	}
+	if got := h.runtime.created[1].LaunchCommand; got != "claude --resume agent-xyz" {
+		t.Errorf("restore launch command = %q, want resume", got)
+	}
+	if h.log.indexOf("Workspace.Restore") == -1 {
+		t.Error("Workspace.Restore was not called")
+	}
+}
+
+// TestRestore_MissingAgentSessionID_Errors checks that Restore refuses a session with no captured agent
+// session id, and does so before touching the workspace, the runtime, or the stored session state.
+func TestRestore_MissingAgentSessionID_Errors(t *testing.T) {
+	h, ctx := newHarness("sess-1"), context.Background()
+	if _, spawnErr := h.sm.Spawn(ctx, spawnCfg()); spawnErr != nil {
+		t.Fatalf("spawn: %v", spawnErr)
+	}
+	if _, killErr := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); killErr != nil {
+		t.Fatalf("kill: %v", killErr)
+	}
+	// No agent session id is patched in here (spawn leaves it empty), so a resume is impossible.
+	// Snapshot the call counts first so any workspace/runtime activity shows up below.
+	restoresBefore, createdBefore := len(h.workspace.restoredID), len(h.runtime.created)
+
+	if _, restoreErr := h.sm.Restore(ctx, "sess-1"); restoreErr == nil {
+		t.Fatal("restore: want error for missing agent session id, got nil")
+	}
+	if len(h.workspace.restoredID) != restoresBefore {
+		t.Error("workspace was touched despite a doomed restore")
+	}
+	if len(h.runtime.created) != createdBefore {
+		t.Error("runtime was created despite a doomed restore")
+	}
+	// A failed restore must leave the record terminal rather than reopening it.
+	rec, _, _ := h.store.Get(ctx, "sess-1")
+	if state := rec.Lifecycle.Session.State; state != domain.SessionTerminated {
+		t.Errorf("session state = %q, want terminated (unchanged)", state)
+	}
+}
+
+// TestRestore_OnSpawnCompletedFailure_RollsBackRuntime checks the rollback when the LCM call after runtime creation fails.
+func TestRestore_OnSpawnCompletedFailure_RollsBackRuntime(t *testing.T) {
+	h, ctx := newHarness("sess-1"), context.Background()
+	if _, spawnErr := h.sm.Spawn(ctx, spawnCfg()); spawnErr != nil {
+		t.Fatalf("spawn: %v", spawnErr)
+	}
+	if _, killErr := h.sm.Kill(ctx, "sess-1", ports.KillOptions{Reason: ports.KillManual}); killErr != nil {
+		t.Fatalf("kill: %v", killErr)
+	}
+	if patchErr := h.store.PatchMetadata(ctx, "sess-1", map[string]string{lifecycle.MetaAgentSessionID: "agent-xyz"}); patchErr != nil {
+		t.Fatalf("patch metadata: %v", patchErr)
+	}
+
+	// Make the post-create LCM call fail, then snapshot the teardown counts right before restoring.
+	h.lcm.onSpawnErr = errors.New("lcm boom")
+	rtDestroyedBefore := len(h.runtime.destroyed)
+	wsDestroyedBefore := len(h.workspace.destroyed)
+
+	if _, restoreErr := h.sm.Restore(ctx, "sess-1"); restoreErr == nil {
+		t.Fatal("restore: want error, got nil")
+	}
+
+	// Exactly one extra runtime teardown is expected (the runtime that restore just created),
+	// while the workspace must survive because it holds the agent's prior work.
+	if grew := len(h.runtime.destroyed) - rtDestroyedBefore; grew != 1 {
+		t.Errorf("runtime.destroyed grew by %d, want 1 (restore rollback)", grew)
+	}
+	if len(h.workspace.destroyed) != wsDestroyedBefore {
+		t.Error("workspace was destroyed on restore rollback; it must be preserved")
+	}
+}
+
+// TestCleanup_SkipsUncommittedWork checks that Cleanup reclaims terminal sessions but reports a refused teardown as skipped.
+func TestCleanup_SkipsUncommittedWork(t *testing.T) {
+	h, ctx := newHarness("unused"), context.Background()
+
+	// Fixture: two terminal sessions that are cleanup candidates, plus one still working.
+	seedTerminal(t, h, "done-1", "/tmp/ws/done-1")
+	seedTerminal(t, h, "dirty-1", "/tmp/ws/dirty-1")
+	live := domain.SessionRecord{
+		ID: "live-1", ProjectID: testProject,
+		Lifecycle: lc(domain.SessionWorking, domain.ReasonTaskInProgress, domain.PRNone, ""),
+	}
+	if seedErr := h.store.Seed(ctx, live); seedErr != nil {
+		t.Fatalf("seed live: %v", seedErr)
+	}
+	// Mark dirty-1's path as refused, standing in for a worktree that still holds uncommitted work.
+	h.workspace.refuse["/tmp/ws/dirty-1"] = true
+
+	res, cleanupErr := h.sm.Cleanup(ctx, testProject)
+	if cleanupErr != nil {
+		t.Fatalf("cleanup: %v", cleanupErr)
+	}
+	if want := []domain.SessionID{"done-1"}; !equalIDSet(res.Cleaned, want) {
+		t.Errorf("cleaned = %v, want [done-1]", res.Cleaned)
+	}
+	if want := []domain.SessionID{"dirty-1"}; !equalIDSet(res.Skipped, want) {
+		t.Errorf("skipped = %v, want [dirty-1]", res.Skipped)
+	}
+	// live-1 is not terminal, so it may appear in neither result list.
+	if contains(res.Cleaned, "live-1") || contains(res.Skipped, "live-1") {
+		t.Error("non-terminal session must not be cleaned or skipped")
+	}
+}
+
+// ---- test helpers ----
+
+// lc builds a current-version lifecycle from the given session and PR sub-states, with the runtime marked alive.
+func lc(s domain.SessionState, r domain.SessionReason, prs domain.PRState, prr domain.PRReason) domain.CanonicalSessionLifecycle {
+	out := domain.CanonicalSessionLifecycle{Version: domain.LifecycleVersion}
+	out.Session = domain.SessionSubstate{State: s, Reason: r}
+	out.PR = domain.PRSubstate{State: prs, Reason: prr}
+	out.Runtime = domain.RuntimeSubstate{State: domain.RuntimeAlive, Reason: domain.RuntimeReasonProcessRunning}
+	return out
+}
+
+// seedTerminal stores a terminated, manually-killed session record for id and records wsPath as its workspace path.
+func seedTerminal(t *testing.T, h *harness, id domain.SessionID, wsPath string) {
+	t.Helper()
+	ctx := context.Background()
+	killed := lc(domain.SessionTerminated, domain.ReasonManuallyKilled, domain.PRNone, "")
+	if seedErr := h.store.Seed(ctx, domain.SessionRecord{ID: id, ProjectID: testProject, Lifecycle: killed}); seedErr != nil {
+		t.Fatalf("seed %s: %v", id, seedErr)
+	}
+	// The workspace path is carried in metadata, so it is patched in after seeding.
+	if patchErr := h.store.PatchMetadata(ctx, id, map[string]string{lifecycle.MetaWorkspacePath: wsPath}); patchErr != nil {
+		t.Fatalf("patch metadata %s: %v", id, patchErr)
+	}
+}
+
+// equalStrings reports whether a and b contain the same strings in the same
+// order. Only length and elements are compared, so a nil slice equals an
+// empty one.
+func equalStrings(a, b []string) bool {
+	// A length mismatch settles it; otherwise walk until the first difference.
+	same := len(a) == len(b)
+	for i := 0; same && i < len(a); i++ {
+		same = a[i] == b[i]
+	}
+	return same
+}
+
+// contains reports whether id occurs anywhere in ids; an empty or nil slice contains nothing.
+func contains(ids []domain.SessionID, id domain.SessionID) bool {
+	found := false
+	for i := 0; !found && i < len(ids); i++ {
+		found = ids[i] == id
+	}
+	return found
+}
+
+// equalIDSet reports whether got and want have the same length and contain the
+// same ids, ignoring order. Membership is checked in both directions so that a
+// repeated id on one side cannot hide an unexpected id on the other.
+func equalIDSet(got, want []domain.SessionID) bool {
+	same := len(got) == len(want)
+	for i := 0; same && i < len(want); i++ {
+		// Lengths match here, so got[i] is in range whenever want[i] is.
+		same = contains(got, want[i]) && contains(want, got[i])
+	}
+	return same
+}
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 00000000..f42f222f
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,34 @@
+# agent-orchestrator (rewrite) — docs
+
+The agent-orchestrator is being rebuilt as a long-running **Go backend daemon**
+(`backend/`) plus an **Electron + TypeScript frontend** (`frontend/`). The
+backend supervises a fleet of coding-agent sessions and keeps one true status
+per session.
+
+This folder documents the **Lifecycle Manager (LCM) + Session Manager (SM)
+lane** — the deterministic core of the backend that is now implemented (behind
+fakes) on the `feat/lcm-sm-contracts` integration branch.
+
+## Start here
+
+| Doc | What it covers |
+|-----|----------------|
+| [architecture.md](architecture.md) | How the lane works: the OBSERVE→DECIDE→ACT loop, the canonical state model, the package layout, every component, and the load-bearing invariants. Read this first. |
+| [status.md](status.md) | What's done (PR by PR), what's left, the integration to-dos, the open cross-lane contract questions, and how to build/test. |
+
+## The one-paragraph mental model
+
+The backend is a **stateless supervisor over external ground truth**: git/GitHub
+own PR/CI/review truth, the agent's own files own its activity, and the backend
+owns no agent state. Its whole job is, per session: **OBSERVE** raw facts →
+**DECIDE** one canonical status via pure, deterministic functions → **ACT**
+(persist + fire reactions). The LCM is that reducer; the SM is the
+explicit-mutation plumbing (spawn/kill/restore/cleanup) that feeds it.
+
+## Where this lane fits
+
+Other lanes (built by other people, in parallel) provide the real adapters this
+lane depends on through narrow interfaces: the **persistence layer + CDC**, the
+**SCM poller**, the **runtime/agent/workspace plugins**, the **backend API +
+OpenAPI**, and the **frontend store**. See
+[status.md](status.md#real-adapters-needed-other-lanes) for the hand-off points.
diff --git a/docs/architecture.md b/docs/architecture.md
new file mode 100644
index 00000000..9673142c
--- /dev/null
+++ b/docs/architecture.md
@@ -0,0 +1,187 @@
+# LCM + Session Manager — architecture
+
+This is the deterministic core of the backend daemon. It supervises agent
+sessions and keeps exactly one true status per session.
+
+## 1. Mental model: OBSERVE → DECIDE → ACT
+
+The backend owns no agent state. git/GitHub own PR/CI/review truth; the agent's
+own files own its activity. The job, per session, is one loop:
+
+```
+OBSERVE            →   DECIDE              →   ACT
+(impure, external)     (pure, total)           (impure)
+raw facts              one canonical status    persist + react
+```
+
+In the rewrite the **OBSERVE** step lives *outside* the LCM (separate owners),
+and the LCM is a **synchronous reducer** invoked with facts:
+
+```
+SCM poller     ─ ApplySCMObservation ──┐
+reaper         ─ ApplyRuntimeObservation┤
+activity hooks ─ ApplyActivitySignal ───┼─▶ LCM:  load canonical
+Session Mgr    ─ OnSpawnCompleted ──────┘         → pure DECIDE
+               ─ OnKillRequested                  → diff → persist (merge-patch)
+reaper tick    ─ TickEscalations                  → if transition: react (ACT)
+```
+
+The LCM **never polls**. The reaper (a timer, owned elsewhere) drives liveness
+sampling and duration-based escalation by calling in.
+
+## 2. Canonical state model — the crown jewel
+
+The **only** thing persisted per session is `CanonicalSessionLifecycle`
+(`backend/internal/domain/lifecycle.go`). The single-word display status is
+**derived on read and never stored** — this is the most important invariant; it
+prevents canonical truth and display from drifting.
+
+```
+CanonicalSessionLifecycle
+  Version    schema version of the record shape
+  Revision   monotonic write counter (optimistic-concurrency token)
+  Session    (state, reason)   not_started/working/idle/needs_input/stuck/detecting/done/terminated
+  PR         (state, reason)   none/open/merged/closed
+  Runtime    (state, reason)   unknown/alive/exited/missing/probe_failed
+  Activity   last-known agent activity (+ timestamp, source)   ← decider input
+  Detecting  anti-flap quarantine memory (nil unless quarantined) ← decider input
+```
+
+`DeriveLegacyStatus` (`domain/status.go`) is the **sole producer** of the
+display `SessionStatus`. Precedence: terminal/hard session states map directly
+(they outrank PR facts) → a merged PR wins → an open PR maps by reason → else the
+soft session state. So an idle worker with a CI-failing open PR displays
+`ci_failed`, but a `needs_input` session shows `needs_input` regardless of the PR.
+
+`Session` (`domain/session.go`) is the read-model: a `SessionRecord`
+(persistence shape, identity + lifecycle + metadata) plus the derived `Status`.
+The **Session Manager is the single producer of `Status`** — it attaches it on
+read; the store and API never recompute or persist it.
+
+## 3. Package layout (`backend/internal/`)
+
+```
+domain/                 the vocabulary (imports only the std lib → no cycles)
+  lifecycle.go          CanonicalSessionLifecycle + all sub-states/enums
+  status.go             SessionStatus + DeriveLegacyStatus (sole display producer)
+  session.go            SessionRecord (persisted) + Session (read-model) + id types
+  decide/               the PURE core — total, deterministic, zero I/O
+    types.go            LifecycleDecision + Probe/OpenPR/Detecting inputs + tuning consts
+    decide.go           the deciders + the anti-flap quarantine + HashEvidence
+ports/                  the boundaries (interfaces + DTOs)
+  inbound.go            LifecycleManager, SessionManager (we implement)
+  outbound.go           LifecycleStore, Notifier, AgentMessenger, Runtime/Agent/Workspace
+  facts.go              SCMFacts, RuntimeFacts, ActivitySignal, SpawnOutcome, KillReason
+lifecycle/              the LCM implementation (DECIDE + ACT)
+  manager.go            the Apply* pipeline, per-session lock, patch diffing
+  decide_bridge.go      fact→decide-input translation + the composition rules
+  reactions.go          the reaction table + escalation engine + TickEscalations
+session/                the SM implementation (explicit mutations)
+  manager.go            Spawn/Kill/Restore/Cleanup/List/Get/Send + rollback
+```
+
+`domain` + `ports` are the committed, stabilized **integration boundary**.
+Everything else implements behind it.
+
+## 4. The pure DECIDE core (`domain/decide`)
+
+Total, deterministic, side-effect-free functions — the highest-value test
+surface (table-tested to 100% statement coverage). Key ones:
+
+- `ResolveProbeDecision` — runtime/process liveness. An explicit kill
+  short-circuits to terminal; a **failed probe is never read as death** — it
+  routes to `detecting`, as does any probe disagreement; only the combination
+  runtime-dead + process-dead + no-recent-activity reaches `killed`.
+- `ResolveOpenPRDecision` — the PR ladder: `ci_failing` → `changes_requested` →
+  `mergeable` → `approved` → `review_pending` → idle-beyond → else `pr_open`.
+- `ResolveTerminalPRStateDecision` — merged → `merged` (park idle awaiting a
+  human decision); closed → `idle`.
+- `CreateDetectingDecision` — the **anti-flap quarantine**. Counts attempts and
+  hashes the *timestamp-stripped* evidence; escalates to `stuck` only after 3
+  consecutive unchanged-evidence ticks **or** 5 minutes since first entering
+  detecting (`StartedAt` is preserved across the whole episode). Changing
+  evidence resets the counter.
+
+## 5. The LCM (`lifecycle`)
+
+Implements `ports.LifecycleManager`. Every `Apply*`/`On*` entrypoint runs the
+same pipeline (`manager.go`):
+
+```
+withLock(session):                       ← per-session serialization
+  load canonical → decideFn (build sparse patch) → if changed: persist → load after
+return transition (before, after)
+```
+then, **after the lock releases**, `react()` fires the mapped reaction.
+
+- **Per-session serialization** — `keyedMutex` hands out one lock per session id
+  (parallel across sessions, serial within one). Entries are reference-counted
+  and evicted when the last holder releases, so the map stays bounded.
+- **Composition rules** (`decide_bridge.go`) — two observers must not fight over
+  the session axis. Liveness (runtime probes) owns the runtime + death/detecting
+  axis; activity owns working/idle/waiting. `isLivenessOwned` decides when a
+  healthy probe may *recover* a state (e.g. `detecting → working`) vs. when it
+  must not clobber an activity-owned `needs_input`/`blocked`. A high-confidence
+  activity signal may resolve a `detecting` session; an open PR writes only the
+  PR axis and lets `DeriveLegacyStatus` surface it.
+- **Detecting-memory lifecycle** — a decision with `Detecting == nil` clears the
+  persisted quarantine memory (`LifecyclePatch.ClearDetecting`) so a stale prior
+  can't leak into a later episode.
+- **ACT — reactions + escalation** (`reactions.go`) — on a genuine status
+  transition, `react()` maps it to a reaction (`send-to-agent` / `notify`;
+  `auto-merge` exists but is off by default) and dispatches it. A
+  per-`(session,reaction)` escalation tracker counts attempts; it escalates
+  (notifies a human and silences further auto-dispatch) when a numeric cap or a
+  duration is exceeded. The `ci-failed` budget is persistent across CI
+  oscillation within an open PR and re-arms on genuine recovery. `TickEscalations`
+  (called by the reaper) fires the duration-based escalations the synchronous
+  LCM can't wake itself for; it notifies outside the lock.
+
+## 6. The Session Manager (`session`)
+
+Implements `ports.SessionManager` — the explicit-mutation plumbing. It never
+derives/observes lifecycle state; it routes outcomes to the LCM.
+
+- **Spawn** — `Workspace.Create` → build prompt → `Runtime.Create` (env
+  `AO_SESSION_ID`/`AO_PROJECT_ID`/`AO_ISSUE_ID`) → **seed** the initial record
+  (`not_started`/`spawn_requested`) via the store → `LCM.OnSpawnCompleted`.
+  Eager rollback unwinds prior steps on failure; an `OnSpawnCompleted` failure
+  routes the seeded orphan to terminal-errored (the store has no delete; a later
+  `Cleanup` reclaims it).
+- **Kill** — `LCM.OnKillRequested` → `Runtime.Destroy` → `Workspace.Destroy`,
+  honoring the **worktree-remove safety**: after `git worktree prune`, a
+  still-registered path is never `rm -rf`'d (it may hold the agent's
+  uncommitted work) — the refusal is surfaced, not forced.
+- **Restore** — reopen via `PatchLifecycle` (not re-seed): session →
+  `not_started`, PR → `cleared_on_restore`; relaunch with the agent's resume
+  command; runtime is rolled back on a post-create failure.
+- **List/Get** — read records and attach the derived `Status`. **Send** — via
+  `AgentMessenger`. **Cleanup** — tear down terminal/stale sessions, skipping
+  paths with uncommitted work.
+
+## 7. Load-bearing invariants
+
+1. **Persist canonical; derive display.** Never store the display status.
+2. **One authority for death.** Only the DECIDE pipeline (via `detecting`) writes
+   inferred terminal states; the SM's explicit-kill path goes through
+   `OnKillRequested`. Everything else that notices a dead runtime persists
+   `detecting`, never `terminated`.
+3. **Failed probe ≠ dead.** Timed-out/errored probes route to `detecting`.
+4. **Evidence-hash debounce** prevents flapping signals from terminating live
+   work; the 5-minute cap is a whole-episode wall-clock safety net.
+5. **PR facts dominate** the soft session states once a PR exists.
+6. **Merge-patch persistence** — writes touch only changed keys; the store is the
+   single disk writer (atomic write + lock + CDC).
+7. **Sticky activity states** (`waiting_input`/`blocked`) do not decay by clock.
+8. **Worktree-remove safety** on teardown.
+
+## 8. Concurrency & testing
+
+- Within a session, the per-session lock serializes the load→decide→persist
+  read-modify-write. `react()` runs *outside* the lock (so a busy-waiting
+  send-to-agent never holds the session mutex) — see `status.md` for the
+  integration-time follow-up this implies.
+- Tests use **in-memory fakes** for every outbound port, so the LCM and SM are
+  fully testable with no real adapters. The SM tests drive the **real**
+  `lifecycle.Manager` for spawn/kill round-trips, so the SM↔LCM contract is
+  genuinely exercised. The `decide` package is table-tested in isolation.
diff --git a/docs/status.md b/docs/status.md
new file mode 100644
index 00000000..9bb79cdb
--- /dev/null
+++ b/docs/status.md
@@ -0,0 +1,98 @@
+# LCM + Session Manager — status & roadmap
+
+Where the lane stands, what's left, and where to plug in.
+
+## Branch model
+
+`feat/lcm-sm-contracts` is the **lane integration branch**: each sub-PR below
+branched off it and merged **into** it. The whole lane lands on `main` as one
+unit once it's ready. Sub-PRs were reviewed against the integration branch;
+the eventual lane→main merge is a single cumulative review.
+
+## Done — implementation complete (behind fakes)
+
+| Area | What landed | PR |
+|------|-------------|----|
+| Skeleton | `backend/` (Go) + `frontend/` (Electron/TS) | #1 (on `main`) |
+| Contracts + CI | `domain/` + `ports/`; Go + gitleaks workflows | #2 |
+| Pure DECIDE core | the deciders + anti-flap quarantine + exhaustive truth-table tests | #4 |
+| LCM — pipeline | `Apply*` pipeline, per-session serialization, store integration, composition rules, detecting-memory lifecycle | #5 |
+| LCM — reactions | reaction table + escalation engine + real `TickEscalations` | #6 |
+| Session Manager | spawn / kill / restore / cleanup / list, eager rollback, worktree-remove safety | #7 |
+
+`gofmt` / `go build` / `go vet` / `go test -race` all green across `domain`,
+`domain/decide`, `lifecycle`, and `session`. The `decide` core is at 100%
+statement coverage; the impl packages cover the load-bearing logic including the
+error/rollback paths.
+
+### Build & test
+
+```
+cd backend
+gofmt -l .          # must print nothing
+go build ./...
+go vet ./...
+go test -race ./...
+go test -cover ./...
+```
+
+## Not done — the integration phase
+
+Everything above runs against **in-memory fakes**. Making it a live system means
+swapping fakes for real adapters (built by other lanes) behind the existing
+ports, and resolving the carried-forward items below.
+
+### Carried-forward items (must be addressed as real adapters land)
+
+- **`react()` out-of-lock dispatch.** Reactions fire after the per-session lock
+  releases (deliberate, so a busy-waiting send-to-agent doesn't hold the mutex).
+  Under a live daemon with concurrent observers this can dispatch on a stale
+  snapshot / out of order. Give `react()` a per-session ordering (a small react
+  queue) or re-check the triggering state before dispatching. Documented in
+  `lifecycle/reactions.go`.
+- **`ExpectedRevision` optimistic-concurrency is unused.** The in-process
+  per-session mutex covers a single daemon. Multi-writer or CDC-driven setups
+  must use the `LifecyclePatch.ExpectedRevision` CAS the contract already exposes.
+- **Store `Seed` + `Get` need a real implementation.** The Session Manager added
+  two record-with-identity methods to `LifecycleStore`; the real persistence
+  layer must implement them (create-with-identity that rejects an existing id;
+  full-record read by id). Documented in `ports/outbound.go`.
+
+### Real adapters needed (other lanes)
+
+| Port | Real adapter | Owning lane |
+|------|--------------|-------------|
+| `LifecycleStore` | persistence layer (flat-file/KV + atomic write + lock + CDC) | persistence |
+| `SCMFacts` producer | SCM poller (batch PR/CI/review enrichment) | SCM |
+| `Runtime` / `Agent` / `Workspace` | tmux runtime, claude-code/codex agent, git-worktree workspace | coding-agents |
+| `Notifier` | desktop/Slack notifier | notifications |
+| `AgentMessenger` | tmux inject with busy-detect + delivery verify | coding-agents |
+| `SessionManager` consumer | backend API (routes/controllers) + OpenAPI | API |
+
+### Open cross-lane contract questions
+
+- **SCM facts** — does `SCMFacts` match what the poller can cheaply produce
+  (batch enrichment, CI log tail as a pointer)?
+- **Persistence** — is `LifecycleStore` + `LifecyclePatch` the right boundary?
+  Per-session lock vs. the `ExpectedRevision` CAS?
+- **API** — is the `SessionManager` interface + the `Session` read-model
+  OpenAPI-friendly?
+
+### Land the lane → `main`
+
+Run a final cumulative review of `feat/lcm-sm-contracts` vs. `main`, then merge
+the complete lane as one unit.
+
+## Where to plug in (for someone picking this up)
+
+- **Implementing a real adapter?** Write it to satisfy the matching interface in
+  `ports/`, then construct the `lifecycle.Manager` / `session.Manager` with it in
+  place of the fake. Nothing in `domain`/`lifecycle`/`session` should need to
+  change.
+- **Changing decision behavior?** It lives in `domain/decide` (pure) — add a
+  truth-table case first; nothing there does I/O.
+- **Adding a reaction?** Extend the table in `lifecycle/reactions.go` and map the
+  triggering status in `reactionEventFor`.
+- **Don't** persist the display status, conclude death outside the DECIDE
+  pipeline, or `rm -rf` a still-registered worktree — see the invariants in
+  [architecture.md](architecture.md#7-load-bearing-invariants).