diff --git a/Makefile b/Makefile
index 31a6789..8f6a824 100644
--- a/Makefile
+++ b/Makefile
@@ -15,7 +15,8 @@ LINT_DOC_PATHS := README.md kernel commands cells
 
 .PHONY: help install doctor list lint-docs \
 	cell-init cell-new cell-list cell-use cell-rm \
-	cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr
+	cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr \
+	eval-task-new eval-record eval-review eval-compare eval-list
 
 help: ## Show available targets
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \
@@ -78,6 +79,32 @@ cell-branch: ## Cut a branch in the cell for an edit (vars: BRANCH, optional NAM
 cell-pr: ## Open a PR for current cell edits (vars: TITLE, BODY; optional NAME)
 	@bash $(RUNTIME_ROOT)/scripts/cell-pr.sh "$(NAME)" "$(TITLE)" "$(BODY)"
 
+# ----- Eval harness (capture flow runs, review them, compare across versions) -----
+# Thin wrappers over scripts/eval-*.sh. "##" help text is shown verbatim by 'make help' (no $-escaping).
+eval-task-new: ## Scaffold a new eval task (var: TASK)
+	@bash $(RUNTIME_ROOT)/scripts/eval-task-new.sh "$(TASK)"
+
+eval-record: ## Capture a flow run as an eval (vars: TASK; optional PROJECT, THREAD, BASE, COST, DURATION)
+	@bash $(RUNTIME_ROOT)/scripts/eval-record.sh \
+		--task "$(TASK)" \
+		$(if $(PROJECT),--project "$(PROJECT)") \
+		$(if $(THREAD),--thread "$(THREAD)") \
+		$(if $(BASE),--base "$(BASE)") \
+		$(if $(COST),--cost "$(COST)") \
+		$(if $(DURATION),--duration "$(DURATION)")
+
+eval-review: ## Open $EDITOR on a long-form qualitative review (vars: TASK, RUN; optional REVIEWER)
+	@bash $(RUNTIME_ROOT)/scripts/eval-review.sh \
+		--task "$(TASK)" --run "$(RUN)" \
+		$(if $(REVIEWER),--reviewer "$(REVIEWER)")
+
+eval-compare: ## Compare two recorded runs (vars: TASK, A, B)
+	@bash $(RUNTIME_ROOT)/scripts/eval-compare.sh \
+		--task "$(TASK)" --a "$(A)" --b "$(B)"
+
+eval-list: ## List eval tasks and recorded runs
+	@bash $(RUNTIME_ROOT)/scripts/eval-list.sh
+
 # ----- Doc lint (preserved from v2) -----
 
 lint-docs: ## Check markdown docs for style-guide regressions
diff --git a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000..e55cfc8
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,62 @@
+# evals
+
+Capture flow runs, review them with rich qualitative feedback, and compare runs across kernel/cell versions. Reviews are committed to this repo so the next iteration of `/flow:reflect` (and the humans iterating on flow) can read them.
+
+## Workflow
+
+1. **Define a task.** A task is the prompt + the expected work — what we're evaluating flow against.
+   ```
+   make eval-task-new TASK=add-standup-cmd
+   # edit evals/tasks/add-standup-cmd/{prompt.md,README.md}
+   ```
+
+2. **Run flow on the task.** In any project, open a Claude Code session and run `/flow:flow` with the task's prompt. Ship, pause, or abort — all are recordable.
+
+3. **Capture the run.**
+   ```
+   make eval-record TASK=add-standup-cmd PROJECT=/path/to/project
+   ```
+   Discovers the most recent thread under `<project>/agent/threads/`, copies it, generates the diff, and writes a manifest pinned to the current kernel and cell SHAs. Optional flags: `THREAD=<dir>`, `BASE=main`, `COST=N`, `DURATION=N`.
+
+4. **Review the run.** Long-form qualitative review in `$EDITOR`:
+   ```
+   make eval-review TASK=add-standup-cmd RUN=<run-id>
+   ```
+   Sections cover overall impression, per-stage notes, document quality (for both human and machine readers), code quality, and the highest-value section: **patterns flow should learn**. Numeric scores are optional.
+
+5. **Compare runs:**
+   ```
+   make eval-compare TASK=add-standup-cmd A=<run-a> B=<run-b>
+   ```
+   Prints version pins and metrics side by side (no deltas are computed) and lists reviews on both sides — `diff` the review files yourself to compare prose feedback.
+
+6. **List what we have:**
+   ```
+   make eval-list
+   ```
+
+7. **Commit.** `evals/tasks/`, `evals/runs/`, and `evals/templates/` are all part of this repo.
+
+## Layout
+
+```
+evals/
+  tasks/<task-id>/             # task definitions (committed)
+    task.json
+    prompt.md
+    README.md
+  runs/<task-id>/<run-id>/     # captured runs (committed)
+    manifest.json              # versions, project, metrics
+    prompt.md                  # snapshot of the prompt at record time
+    thread/                    # copied handoff docs from the run
+    diff.patch                 # git diff <base>...<branch>
+    reviews/<reviewer>.md      # one markdown file per reviewer
+  templates/
+    review.md                  # the review template
+```
+
+`run-id` format: `<utc-iso-timestamp>-<kernel-sha7>` — sortable, scannable, includes the kernel version axis.
+
+## Why qualitative
+
+Numeric scores are easy to game and lose nuance. The richest signal for improving flow is prose: *"the plan invented a file that doesn't exist," "the spec buried acceptance criteria in narrative," "implement re-asked questions explore had already answered."* That's what the review template invites, and that's what feeds the next iteration.
diff --git a/evals/runs/.gitkeep b/evals/runs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/evals/tasks/.gitkeep b/evals/tasks/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/evals/tasks/mini-todo/README.md b/evals/tasks/mini-todo/README.md
new file mode 100644
index 0000000..62f3ccf
--- /dev/null
+++ b/evals/tasks/mini-todo/README.md
@@ -0,0 +1,62 @@
+# mini-todo
+
+**What this task evaluates:** Greenfield code-pipeline behaviour on a small, fully specified task. The prompt is intentionally tight (commands, storage, std-lib-only) so divergences in flow's output reflect the cell's behaviour, not prompt ambiguity. We're watching for:
+
+- **Spec stage** — does it pin the JSON schema and the failure-mode behaviour (missing id, malformed file) the prompt left implicit, or does it just restate the prompt?
+- **Plan stage** — does it propose a sensible single-file layout, or invent unnecessary structure (`src/`, modules, classes for a 100-line tool)?
+- **Implement stage** — is the code tight (target: <150 LOC), does it handle the obvious edge cases (file doesn't exist yet, bad id), and does it stop there?
+- **Review stage** — does it actually find anything, or rubber-stamp?
+- **Scope discipline** — does flow stay in scope, or does it add config files, GitHub Actions, or type hints the prompt didn't ask for?
+
+## Prompt
+
+See `prompt.md`. Pass that to `/flow:flow` verbatim.
+
+## Project location
+
+Run flow against `~/Workspace/jyliang/mini-todo` (a fresh empty repo, scaffolded once and reused across runs — each run cuts its own branch). If that path doesn't exist, see "First-time setup" below.
+
+## How to run a fresh run
+
+1. Reset the project to a clean main:
+   ```
+   cd ~/Workspace/jyliang/mini-todo
+   git checkout main && git reset --hard <initial-commit-sha>
+   ```
+   (Initial-commit SHA: see the **Notes** section at the bottom of this file.)
+2. Open a Claude Code session in that directory and run `/flow:flow` with the prompt from `prompt.md`.
+3. Walk flow through its stages, answering boundary prompts as a real user would. Note rough cost + duration if you can.
+4. From the flow repo:
+   ```
+   make eval-record TASK=mini-todo PROJECT=~/Workspace/jyliang/mini-todo COST=<usd> DURATION=<sec>
+   ```
+5. Review:
+   ```
+   make eval-review TASK=mini-todo RUN=<run-id>
+   ```
+
+## What's a good outcome?
+
+- Final diff is one Python file (~100–150 LOC) plus a short README.
+- All four commands work; `--file` flag is honoured; missing-id is handled cleanly.
+- Spec doc captures: data shape, what happens on missing/malformed file, behaviour when id doesn't exist.
+- Plan doc proposes the single-file layout without inventing structure.
+- No type hints, no test framework, no CI — those weren't asked for.
+- Total flow run cost: under $2; duration: under 15 minutes.
+
+## First-time setup
+
+```
+mkdir -p ~/Workspace/jyliang/mini-todo
+cd ~/Workspace/jyliang/mini-todo
+git init -b main
+echo "# mini-todo" > README.md
+git add README.md && git commit -m "init"
+```
+
+Then record that initial commit's SHA in the **Notes** section below so future runs can reset to it.
+
+## Notes
+
+- **Initial commit SHA** (reset target): `9f05045` (`9f050453dc9081213a1adc1baa1dcae6db7d3dbb`)
+- _(running history across runs — what's improving, what's still a problem, goes here.)_
diff --git a/evals/tasks/mini-todo/prompt.md b/evals/tasks/mini-todo/prompt.md
new file mode 100644
index 0000000..7549b34
--- /dev/null
+++ b/evals/tasks/mini-todo/prompt.md
@@ -0,0 +1,12 @@
+# Prompt
+
+Pass this verbatim to `/flow:flow`:
+
+> Build a single-file Python TODO CLI named `todo`. It stores tasks in a JSON file (default: `~/.todo.json`, override via `--file`). Commands:
+>
+> - `todo add "<text>"` — append a new task, print its id and text
+> - `todo ls` — list tasks, one per line, with `[ ]` / `[x]` and the id
+> - `todo done <id>` — mark complete
+> - `todo rm <id>` — delete
+>
+> Use only the Python standard library. Single file is fine. Include a short README with install + usage.
diff --git a/evals/tasks/mini-todo/task.json b/evals/tasks/mini-todo/task.json
new file mode 100644
index 0000000..4229f34
--- /dev/null
+++ b/evals/tasks/mini-todo/task.json
@@ -0,0 +1,6 @@
+{
+  "id": "mini-todo",
+  "type": "code",
+  "title": "Build a mini TODO CLI in a fresh repo",
+  "description": "Have flow build a single-file Python TODO CLI with JSON-file persistence and the commands: add, ls, done, rm. Tests scope discipline (will flow over- or under-engineer it?), spec quality (does it pin the data shape and edge cases?), and code quality on greenfield work."
+}
diff --git a/evals/templates/review.md b/evals/templates/review.md
new file mode 100644
index 0000000..37d5cc1
--- /dev/null
+++ b/evals/templates/review.md
@@ -0,0 +1,76 @@
+# Review: {{run_id}}
+
+- task: `{{task}}`
+- reviewer: `{{reviewer}}`
+- reviewed_at: {{reviewed_at}}
+
+> Read the run artifacts alongside this review:
+>
+> - manifest: `evals/runs/{{task}}/{{run_id}}/manifest.json`
+> - thread docs: `evals/runs/{{task}}/{{run_id}}/thread/*.md`
+> - final diff: `evals/runs/{{task}}/{{run_id}}/diff.patch`
+
+---
+
+## Overall impression
+
+(gut take — write freely. one paragraph or ten, your choice)
+
+## What went well
+
+(what should flow keep doing? cite specifics — quote the doc, point at the diff)
+
+## What went poorly
+
+(what should flow change? same — be specific)
+
+## Per-stage notes
+
+### Spec / explore
+
+(what did the explore stage produce? was the spec usable? did it capture intent?)
+
+### Plan
+
+(was the plan grounded in the actual codebase? did it reference real files? right level of detail?)
+
+### Implementation / final diff
+
+(read `diff.patch`. is the code good? right size? scope-disciplined? any over-engineering?)
+
+### Review (the stage)
+
+(if a review stage ran — did it catch anything real, or rubber-stamp?)
+
+## Document quality
+
+### Human readability
+
+How easy were the docs for a human to read? Quote awkward passages. Was the structure helpful or noise?
+
+### Machine clarity
+
+Did each stage's output give the next stage what it needed? Look for places where the next stage had to re-derive context, re-ask questions, or guess. That's where the upstream document failed its second reader.
+
+## Code quality
+
+Was the final diff good? Right things changed, right size, right amount of test coverage / comments / abstraction? Specific quotes from the diff are more useful than general impressions.
+
+## Patterns flow should learn
+
+**This is the highest-value section.** What should the next iteration of the kernel or cell do differently based on what you saw? Concrete proposals — *"plan stage should grep for existing patterns before proposing new abstractions," "spec stage should require an explicit acceptance criteria block."* The reflect step will read these.
+
+## Comparison to other runs
+
+If you've reviewed prior runs of this task (or similar tasks), what changed here? Better, worse, same — and why?
+
+## Optional numeric scores (1-5)
+
+Fill in what's useful, leave the rest blank. These supplement the prose, they don't replace it.
+
+- doc_readability:
+- doc_machine_clarity:
+- code_quality:
+- cost_satisfaction:
+- speed_satisfaction:
+- overall:
diff --git a/evals/templates/task-readme.md b/evals/templates/task-readme.md
new file mode 100644
index 0000000..4ce5a64
--- /dev/null
+++ b/evals/templates/task-readme.md
@@ -0,0 +1,29 @@
+# {{task}}
+
+**What this task evaluates:** _(fill in: what aspects of flow are we testing here? doc quality on a fresh-repo task? code quality on a bugfix? something else?)_
+
+## Prompt
+
+See `prompt.md`. Pass that to `/flow:flow` verbatim when running this eval.
+
+## How to run a fresh run
+
+1. Open a Claude Code session in the target project (or set up a fixture for it).
+2. Run `/flow:flow` with the prompt from `prompt.md`. Let flow walk through the stages — answer boundary prompts the way a real user would.
+3. After flow ships (or pauses), capture from this repo:
+   ```
+   make eval-record TASK={{task}} PROJECT=<path-to-project>
+   ```
+   Add `COST=<usd> DURATION=<sec>` if you tracked them.
+4. Review:
+   ```
+   make eval-review TASK={{task}} RUN=<run-id>
+   ```
+
+## What's a good outcome?
+
+_(fill in: what does flow doing well on this task look like? a tight diff? a plan that referenced the right files? specifics that the reviewer should look for — these become the "what should flow learn" section anchors.)_
+
+## Notes
+
+_(running history of observations across runs of this task — what's been improving, what's still a problem, what we've tried.)_
diff --git a/scripts/eval-compare.sh b/scripts/eval-compare.sh
new file mode 100755
index 0000000..a815fd6
--- /dev/null
+++ b/scripts/eval-compare.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# Compare two recorded runs of the same task.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+task=""
+run_a=""
+run_b=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --task) task="$2"; shift 2 ;;
+        --a)    run_a="$2"; shift 2 ;;
+        --b)    run_b="$2"; shift 2 ;;
+        *) echo "unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+if [ -z "$task" ] || [ -z "$run_a" ] || [ -z "$run_b" ]; then
+    echo "Usage: make eval-compare TASK=<id> A=<run-a> B=<run-b>" >&2
+    exit 1
+fi
+
+dir_a="$RUNTIME_ROOT/evals/runs/$task/$run_a"
+dir_b="$RUNTIME_ROOT/evals/runs/$task/$run_b"
+[ -d "$dir_a" ] || { echo "missing: $dir_a" >&2; exit 1; }
+[ -d "$dir_b" ] || { echo "missing: $dir_b" >&2; exit 1; }
+
+field() { jq -r "$1 // \"—\"" "$2" 2>/dev/null || echo "—"; }  # value at jq path $1 in file $2; "—" if null/false/absent or jq fails
+
+ka=$(field '.versions.kernel.sha_short' "$dir_a/manifest.json")
+kb=$(field '.versions.kernel.sha_short' "$dir_b/manifest.json")
+kab=$(field '.versions.kernel.branch'   "$dir_a/manifest.json")
+kbb=$(field '.versions.kernel.branch'   "$dir_b/manifest.json")
+ca=$(field '.versions.cell.sha_short'   "$dir_a/manifest.json")
+cb=$(field '.versions.cell.sha_short'   "$dir_b/manifest.json")
+cost_a=$(field '.metrics.cost_usd'      "$dir_a/manifest.json")
+cost_b=$(field '.metrics.cost_usd'      "$dir_b/manifest.json")
+dur_a=$(field '.metrics.duration_sec'   "$dir_a/manifest.json")
+dur_b=$(field '.metrics.duration_sec'   "$dir_b/manifest.json")
+proj_a=$(field '.project.path'          "$dir_a/manifest.json")
+proj_b=$(field '.project.path'          "$dir_b/manifest.json")
+branch_a=$(field '.project.branch'      "$dir_a/manifest.json")
+branch_b=$(field '.project.branch'      "$dir_b/manifest.json")
+
+echo
+echo "Comparing task: $task"
+echo "============================================================"
+printf "                A: %s\n" "$run_a"
+printf "                B: %s\n" "$run_b"
+echo "------------------------------------------------------------"
+printf "%-16s %-22s %-22s\n" "field"          "A"                       "B"
+printf "%-16s %-22s %-22s\n" "kernel"         "$ka ($kab)"              "$kb ($kbb)"
+printf "%-16s %-22s %-22s\n" "cell"           "$ca"                     "$cb"
+printf "%-16s %-22s %-22s\n" "project"        "$(basename "$proj_a")"   "$(basename "$proj_b")"
+printf "%-16s %-22s %-22s\n" "branch"         "$branch_a"               "$branch_b"
+printf "%-16s %-22s %-22s\n" "cost (USD)"     "$cost_a"                 "$cost_b"
+printf "%-16s %-22s %-22s\n" "duration (s)"   "$dur_a"                  "$dur_b"
+echo
+
+list_reviews() {  # $1 = run dir; prints one "<reviewer> (<n> lines)" row per review, or "(none)"
+    local run_dir="$1" reviewer line_count
+    if ! compgen -G "$run_dir/reviews/*.md" >/dev/null; then
+        echo "  (none)"
+        return 0
+    fi
+    for f in "$run_dir/reviews"/*.md; do
+        reviewer=$(basename "$f" .md)
+        line_count=$(wc -l < "$f" | tr -d ' ')
+        printf "  %-20s (%s lines)\n" "$reviewer" "$line_count"
+    done
+}
+
+echo "Reviews on A:"
+list_reviews "$dir_a"
+echo "Reviews on B:"
+list_reviews "$dir_b"
+echo
+
+# If both sides have a review by the same reviewer, suggest a side-by-side diff.
+shared=()
+if compgen -G "$dir_a/reviews/*.md" >/dev/null && compgen -G "$dir_b/reviews/*.md" >/dev/null; then
+    for f in "$dir_a/reviews"/*.md; do
+        n=$(basename "$f")
+        [ -f "$dir_b/reviews/$n" ] && shared+=("$n")
+    done
+fi
+if [ ${#shared[@]} -gt 0 ]; then
+    echo "Shared reviewers — diff to compare prose feedback:"
+    for n in "${shared[@]}"; do
+        echo "  diff $dir_a/reviews/$n $dir_b/reviews/$n"
+    done
+    echo
+fi
+
+echo "Artifacts:"
+echo "  A: ${dir_a#$RUNTIME_ROOT/}"
+echo "  B: ${dir_b#$RUNTIME_ROOT/}"
+echo
diff --git a/scripts/eval-list.sh b/scripts/eval-list.sh
new file mode 100755
index 0000000..cae55a8
--- /dev/null
+++ b/scripts/eval-list.sh
@@ -0,0 +1,44 @@
+#!/usr/bin/env bash
+# List eval tasks and recorded runs.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+echo "Tasks:"
+found=0
+for d in "$RUNTIME_ROOT/evals/tasks"/*/; do
+    [ -d "$d" ] || continue
+    name=$(basename "$d")
+    [ "$name" = ".gitkeep" ] && continue
+    title=""
+    if [ -f "$d/task.json" ] && command -v jq >/dev/null 2>&1; then
+        title=$(jq -r '.title // ""' "$d/task.json" 2>/dev/null)
+    fi
+    printf "  %-32s %s\n" "$name" "$title"
+    found=$((found + 1))
+done
+if [ $found -eq 0 ]; then echo "  (none — scaffold one with: make eval-task-new TASK=<id>)"; fi
+
+echo
+echo "Runs:"
+found=0
+for task_dir in "$RUNTIME_ROOT/evals/runs"/*/; do
+    [ -d "$task_dir" ] || continue
+    task=$(basename "$task_dir")
+    [ "$task" = ".gitkeep" ] && continue
+    for run_dir in "$task_dir"*/; do
+        [ -d "$run_dir" ] || continue
+        run_id=$(basename "$run_dir")
+        review_count=$(find "$run_dir/reviews" -maxdepth 1 -name '*.md' 2>/dev/null | wc -l | tr -d ' ')
+        kernel="?"
+        if [ -f "$run_dir/manifest.json" ] && command -v jq >/dev/null 2>&1; then
+            kernel=$(jq -r '.versions.kernel.sha_short // "?"' "$run_dir/manifest.json" 2>/dev/null)
+        fi
+        plural="s"
+        [ "$review_count" = "1" ] && plural=""
+        printf "  %-22s %-40s kernel=%s  %s review%s\n" "$task" "$run_id" "$kernel" "$review_count" "$plural"
+        found=$((found + 1))
+    done
+done
+if [ $found -eq 0 ]; then echo "  (none — capture one with: make eval-record TASK=<id>)"; fi
diff --git a/scripts/eval-record.sh b/scripts/eval-record.sh
new file mode 100755
index 0000000..0c2657d
--- /dev/null
+++ b/scripts/eval-record.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+# Capture a flow run as an eval record.
+#
+# Discovers the most recent thread under <project>/agent/threads/, copies it,
+# generates a diff against the base branch, and writes a manifest pinned to
+# the current kernel and active-cell SHAs.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+FLOW_HOME="${FLOW_HOME:-$HOME/.flow}"
+
+task=""
+thread_dir=""
+project_dir=""
+base_branch="main"
+cost_usd="null"
+duration_sec="null"
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --task)     task="$2"; shift 2 ;;
+        --thread)   thread_dir="$2"; shift 2 ;;
+        --project)  project_dir="$2"; shift 2 ;;
+        --base)     base_branch="$2"; shift 2 ;;
+        --cost)     cost_usd="$2"; shift 2 ;;
+        --duration) duration_sec="$2"; shift 2 ;;
+        *) echo "unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+[ -z "$task" ] && {
+    echo "Usage: make eval-record TASK=<id> [PROJECT=<path>] [THREAD=<dir>] [BASE=main] [COST=<usd>] [DURATION=<sec>]" >&2
+    exit 1
+}
+
+if [ ! -d "$RUNTIME_ROOT/evals/tasks/$task" ]; then
+    echo "Task not found: $task" >&2
+    echo "Available tasks:" >&2
+    ls "$RUNTIME_ROOT/evals/tasks" 2>/dev/null | grep -v '^\.gitkeep$' | sed 's/^/  /' >&2 || echo "  (none — scaffold one with: make eval-task-new TASK=<id>)" >&2
+    exit 1
+fi
+
+# Discover thread_dir if not given.
+if [ -z "$thread_dir" ]; then
+    [ -z "$project_dir" ] && project_dir="$(pwd)"
+    threads_root="$project_dir/agent/threads"
+    if [ ! -d "$threads_root" ]; then
+        echo "No agent/threads/ in $project_dir." >&2
+        echo "Pass --thread <dir> explicitly, or run from a project that has flow threads." >&2
+        exit 1
+    fi
+    thread_dir=$(find "$threads_root" -mindepth 1 -maxdepth 1 -type d | sort -r | head -n1)
+    [ -z "$thread_dir" ] && { echo "No threads found in $threads_root" >&2; exit 1; }
+fi
+
+[ -d "$thread_dir" ] || { echo "Thread dir not found: $thread_dir" >&2; exit 1; }
+
+# Derive project_dir from thread_dir if still unset.
+if [ -z "$project_dir" ]; then
+    project_dir=$(git -C "$thread_dir" rev-parse --show-toplevel 2>/dev/null) || {
+        echo "Could not derive project dir from $thread_dir; pass --project explicitly" >&2
+        exit 1
+    }
+fi
+
+# Derive branch: prefer spec.md frontmatter, fall back to the project's HEAD.
+# sed rather than grep: under pipefail + errexit a spec.md with no "branch:" line made
+# grep's exit 1 kill the script silently, so the HEAD fallback was never reached.
+branch=""
+[ -f "$thread_dir/spec.md" ] && branch=$(sed -nE '/^branch:[[:space:]]/{s/^branch:[[:space:]]*//;p;q;}' "$thread_dir/spec.md" | tr -d '"')
+[ -z "$branch" ] && branch=$(git -C "$project_dir" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "")
+
+# Pin versions.
+kernel_sha=$(git -C "$RUNTIME_ROOT" rev-parse HEAD)
+kernel_sha7=${kernel_sha:0:7}
+kernel_branch=$(git -C "$RUNTIME_ROOT" rev-parse --abbrev-ref HEAD)
+
+cell_path=""
+cell_name="(none)"
+cell_sha=""
+cell_sha7=""
+cell_branch=""
+if [ -L "$FLOW_HOME/active-cell" ]; then
+    cell_path=$(readlink "$FLOW_HOME/active-cell")
+    cell_name=$(basename "$cell_path")
+    if [ -d "$cell_path/.git" ]; then
+        cell_sha=$(git -C "$cell_path" rev-parse HEAD 2>/dev/null || echo "")
+        cell_sha7=${cell_sha:0:7}
+        cell_branch=$(git -C "$cell_path" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "")
+    fi
+fi
+
+# Validate first: COST/DURATION reach jq as raw JSON (--argjson), so a missing jq or a
+# non-number would otherwise abort after the run dir exists, stranding it manifest-less.
+command -v jq >/dev/null 2>&1 || { echo "jq is required to write manifest.json" >&2; exit 1; }
+for metric in "$cost_usd" "$duration_sec"; do
+    [[ "$metric" =~ ^(null|[0-9]+([.][0-9]+)?)$ ]] || { echo "COST and DURATION must be non-negative numbers (got: $metric)" >&2; exit 1; }
+done
+run_id="$(date -u +"%Y-%m-%dT%H-%M-%SZ")-${kernel_sha7}"  # sortable UTC timestamp + kernel sha
+dest="$RUNTIME_ROOT/evals/runs/$task/$run_id"
+mkdir -p "$dest/reviews"
+cp -R "$thread_dir" "$dest/thread"  # thread_dir was already verified to be a directory above
+
+# Generate diff. base_branch...branch shows what's on the branch since it diverged from base.
+diff_path="$dest/diff.patch"
+if [ -n "$branch" ] && git -C "$project_dir" rev-parse --verify "$base_branch" >/dev/null 2>&1; then
+    git -C "$project_dir" diff "$base_branch...$branch" > "$diff_path" 2>/dev/null || {
+        echo "  warning: git diff $base_branch...$branch failed; trying $base_branch..HEAD" >&2
+        git -C "$project_dir" diff "$base_branch..HEAD" > "$diff_path" 2>/dev/null || : > "$diff_path"
+    }
+else
+    : > "$diff_path"
+    echo "  warning: could not generate diff — branch=$branch base=$base_branch" >&2
+fi
+
+# Snapshot the prompt.
+[ -f "$RUNTIME_ROOT/evals/tasks/$task/prompt.md" ] && cp "$RUNTIME_ROOT/evals/tasks/$task/prompt.md" "$dest/prompt.md"
+
+# Manifest.
+recorded_at=$(date -u +%FT%TZ)
+jq -n \
+  --arg run_id "$run_id" \
+  --arg task "$task" \
+  --arg recorded_at "$recorded_at" \
+  --arg kernel_sha "$kernel_sha" \
+  --arg kernel_sha7 "$kernel_sha7" \
+  --arg kernel_branch "$kernel_branch" \
+  --arg cell_name "$cell_name" \
+  --arg cell_sha "$cell_sha" \
+  --arg cell_sha7 "$cell_sha7" \
+  --arg cell_branch "$cell_branch" \
+  --arg project_path "$project_dir" \
+  --arg branch "$branch" \
+  --arg base "$base_branch" \
+  --arg thread_src "$thread_dir" \
+  --argjson cost "$cost_usd" \
+  --argjson duration "$duration_sec" \
+  '{
+     run_id: $run_id,
+     task: $task,
+     recorded_at: $recorded_at,
+     versions: {
+       kernel: { sha: $kernel_sha, sha_short: $kernel_sha7, branch: $kernel_branch },
+       cell:   { name: $cell_name, sha: $cell_sha, sha_short: $cell_sha7, branch: $cell_branch }
+     },
+     project: { path: $project_path, branch: $branch, base: $base },
+     source: { thread_dir: $thread_src },
+     metrics: { cost_usd: $cost, duration_sec: $duration },
+     notes: ""
+   }' > "$dest/manifest.json"
+
+cat <<EOF
+✓ Recorded run: $task/$run_id
+  → $dest
+
+  versions: kernel=$kernel_sha7 ($kernel_branch)  cell=$cell_name@$cell_sha7
+  project:  $project_dir
+  branch:   $branch  (vs $base_branch)
+  thread:   $thread_dir
+
+Next:
+  make eval-review TASK=$task RUN=$run_id
+EOF
diff --git a/scripts/eval-review.sh b/scripts/eval-review.sh
new file mode 100755
index 0000000..b89a4a0
--- /dev/null
+++ b/scripts/eval-review.sh
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+# Open $EDITOR on a long-form qualitative review for an eval run.
+# Saves to evals/runs/<task>/<run-id>/reviews/<reviewer>.md.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+task=""
+run_id=""
+reviewer=""
+
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --task)     task="$2"; shift 2 ;;
+        --run)      run_id="$2"; shift 2 ;;
+        --reviewer) reviewer="$2"; shift 2 ;;
+        *) echo "unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+if [ -z "$task" ] || [ -z "$run_id" ]; then
+    echo "Usage: make eval-review TASK=<id> RUN=<run-id> [REVIEWER=<name>]" >&2
+    exit 1
+fi
+
+run_dir="$RUNTIME_ROOT/evals/runs/$task/$run_id"
+[ -d "$run_dir" ] || { echo "run not found: $run_dir" >&2; exit 1; }
+
+# Default reviewer = git user, slugified.
+if [ -z "$reviewer" ]; then
+    raw=$(git config user.name 2>/dev/null || echo "human")
+    reviewer=$(echo "$raw" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd 'a-z0-9-')
+    [ -z "$reviewer" ] && reviewer="human"
+fi
+
+mkdir -p "$run_dir/reviews"
+review_file="$run_dir/reviews/$reviewer.md"
+
+# First-time render: pre-fill the template with run metadata.
+if [ ! -f "$review_file" ]; then
+    template="$RUNTIME_ROOT/evals/templates/review.md"
+    [ -f "$template" ] || { echo "missing template: $template" >&2; exit 1; }
+    reviewed_at=$(date -u +%FT%TZ)
+    sed -e "s|{{run_id}}|$run_id|g" \
+        -e "s|{{task}}|$task|g" \
+        -e "s|{{reviewer}}|$reviewer|g" \
+        -e "s|{{reviewed_at}}|$reviewed_at|g" \
+        "$template" > "$review_file"
+
+    # Append an artifact index so the reviewer has paths in-editor.
+    {
+        echo ""
+        echo "---"
+        echo ""
+        echo "## Artifact index (for reading alongside)"
+        echo ""
+        echo "Manifest:"
+        if [ -f "$run_dir/manifest.json" ] && command -v jq >/dev/null 2>&1; then
+            jq -r '"  kernel: \(.versions.kernel.sha_short) (\(.versions.kernel.branch))",
+                   "  cell:   \(.versions.cell.name) @ \(.versions.cell.sha_short)",
+                   "  project: \(.project.path)",
+                   "  branch:  \(.project.branch) vs \(.project.base)",
+                   "  cost:    \(.metrics.cost_usd // "—")",
+                   "  duration: \(.metrics.duration_sec // "—")"' "$run_dir/manifest.json"
+        fi
+        echo ""
+        echo "Thread docs:"
+        if [ -d "$run_dir/thread" ]; then
+            find "$run_dir/thread" -type f -name '*.md' | sort | while read -r f; do
+                printf "  %s\n" "${f#$RUNTIME_ROOT/}"
+            done
+        fi
+        echo ""
+        if [ -s "$run_dir/diff.patch" ]; then
+            lines=$(wc -l < "$run_dir/diff.patch" | tr -d ' ')
+            echo "Diff: evals/runs/$task/$run_id/diff.patch ($lines lines)"
+        else
+            echo "Diff: (empty)"
+        fi
+    } >> "$review_file"
+fi
+
+echo "Opening: $review_file"
+sh -c "${EDITOR:-vi} \"\$@\"" editor "$review_file"  # EDITOR may carry args ("code --wait"); let sh split it, as git does
+echo "✓ Review saved: ${review_file#$RUNTIME_ROOT/}"
diff --git a/scripts/eval-task-new.sh b/scripts/eval-task-new.sh
new file mode 100755
index 0000000..d2fbd11
--- /dev/null
+++ b/scripts/eval-task-new.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Scaffold a new eval task under evals/tasks/<id>/.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+task="${1:-}"
+[ -z "$task" ] && {
+    echo "Usage: make eval-task-new TASK=<id>" >&2
+    exit 1
+}
+
+dest="$RUNTIME_ROOT/evals/tasks/$task"
+[ -e "$dest" ] && { echo "Task already exists: $dest" >&2; exit 1; }
+
+mkdir -p "$dest"
+
+cat > "$dest/task.json" <<EOF
+{
+  "id": "$task",
+  "type": "code",
+  "title": "",
+  "description": ""
+}
+EOF
+
+cat > "$dest/prompt.md" <<EOF
+# Prompt
+
+Replace this with the exact prompt to pass to /flow:flow when running this eval.
+
+> example: Add a /standup command that summarises my git activity over the last week.
+EOF
+
+# Render the task README from template, substituting {{task}}.
+sed "s/{{task}}/$task/g" "$RUNTIME_ROOT/evals/templates/task-readme.md" > "$dest/README.md"
+
+cat <<EOF
+✓ Task scaffolded: evals/tasks/$task/
+
+Next steps:
+  1. Edit $dest/prompt.md           (the prompt to pass to /flow:flow)
+  2. Edit $dest/README.md           (what this task evaluates, what good looks like)
+  3. Edit $dest/task.json           (set title and description)
+EOF