jyliang · jyliang · May 7, 2026 · May 7, 2026
diff --git a/Makefile b/Makefile
@@ -15,7 +15,8 @@ LINT_DOC_PATHS := README.md kernel commands cells
 
 .PHONY: help install doctor list lint-docs \
 	cell-init cell-new cell-list cell-use cell-rm \
-	cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr
+	cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr \
+	eval-task-new eval-record eval-review eval-compare eval-list
 
 help: ## Show available targets
 	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \
@@ -78,6 +79,32 @@ cell-branch: ## Cut a branch in the cell for an edit (vars: BRANCH, optional NAM
 cell-pr: ## Open a PR for current cell edits (vars: TITLE, BODY; optional NAME)
 	@bash $(RUNTIME_ROOT)/scripts/cell-pr.sh "$(NAME)" "$(TITLE)" "$(BODY)"
 
+# ----- Eval harness (capture flow runs, review them, compare across versions) -----
+
+# Runs scripts/eval-task-new.sh with TASK as its only argument. The argument is
+# always passed (as an empty string when TASK is unset).
+eval-task-new: ## Scaffold a new eval task (var: TASK)
+	@bash $(RUNTIME_ROOT)/scripts/eval-task-new.sh "$(TASK)"
+
+# Runs scripts/eval-record.sh. --task is always passed (empty when TASK is unset).
+# Each optional flag is wrapped in make's `if` function, which expands to nothing
+# when the variable is empty, so the flag is only forwarded when the caller set it.
+eval-record: ## Capture a flow run as an eval (vars: TASK; optional PROJECT, THREAD, BASE, COST, DURATION)
+	@bash $(RUNTIME_ROOT)/scripts/eval-record.sh \
+		--task "$(TASK)" \
+		$(if $(PROJECT),--project "$(PROJECT)") \
+		$(if $(THREAD),--thread "$(THREAD)") \
+		$(if $(BASE),--base "$(BASE)") \
+		$(if $(COST),--cost "$(COST)") \
+		$(if $(DURATION),--duration "$(DURATION)")
+
+# Runs scripts/eval-review.sh for one recorded run. --task and --run are always
+# passed; --reviewer is forwarded only when REVIEWER is non-empty.
+# The help text uses a single dollar sign on purpose: it sits inside a make
+# comment, which make never expands, and the help target greps this line
+# straight from the file, so a doubled dollar would be shown to the user as-is.
+eval-review: ## Open $EDITOR on a long-form qualitative review (vars: TASK, RUN; optional REVIEWER)
+	@bash $(RUNTIME_ROOT)/scripts/eval-review.sh \
+		--task "$(TASK)" --run "$(RUN)" \
+		$(if $(REVIEWER),--reviewer "$(REVIEWER)")
+
+# Runs scripts/eval-compare.sh. All three flags are always passed, with empty
+# values when TASK, A or B are unset.
+eval-compare: ## Compare two recorded runs (vars: TASK, A, B)
+	@bash $(RUNTIME_ROOT)/scripts/eval-compare.sh \
+		--task "$(TASK)" --a "$(A)" --b "$(B)"
+
+# Runs scripts/eval-list.sh with no arguments; this target reads no variables.
+eval-list: ## List eval tasks and recorded runs
+	@bash $(RUNTIME_ROOT)/scripts/eval-list.sh
+
 # ----- Doc lint (preserved from v2) -----
 
 lint-docs: ## Check markdown docs for style-guide regressions

diff --git a/evals/README.md b/evals/README.md
@@ -0,0 +1,62 @@
+# evals
+
+Capture flow runs, review them with rich qualitative feedback, and compare runs across kernel/cell versions. Reviews are committed to this repo so the next iteration of `/flow:reflect` (and the humans iterating on flow) can read them.
+
+## Workflow
+
+1. **Define a task.** A task is the prompt + the expected work — what we're evaluating flow against.
+   ```
+   make eval-task-new TASK=add-standup-cmd
+   # edit evals/tasks/add-standup-cmd/{prompt.md,README.md}
+   ```
+
+2. **Run flow on the task.** In any project, open a Claude Code session and run `/flow:flow` with the task's prompt. Ship, pause, or abort — all are recordable.
+
+3. **Capture the run.**
+   ```
+   make eval-record TASK=add-standup-cmd PROJECT=/path/to/project
+   ```
+   Discovers the most recent thread under `<project>/agent/threads/`, copies it, generates the diff, and writes a manifest pinned to the current kernel and cell SHAs. Optional flags: `THREAD=<dir>`, `BASE=main`, `COST=N`, `DURATION=N`.
+
+4. **Review the run.** Long-form qualitative review in `$EDITOR`:
+   ```
+   make eval-review TASK=add-standup-cmd RUN=<run-id>
+   ```
+   Sections cover overall impression, per-stage notes, document quality (for both human and machine readers), code quality, and the highest-value section: **patterns flow should learn**. Numeric scores are optional.
+
+5. **Compare runs:**
+   ```
+   make eval-compare TASK=add-standup-cmd A=<run-a> B=<run-b>
+   ```
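+   Prints version pins and metrics side by side, and lists the reviews on each run. When the same reviewer has reviewed both runs it also prints a ready-made `diff` command; otherwise `diff` the review files yourself to compare prose feedback.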
+
+6. **List what we have:**
+   ```
+   make eval-list
+   ```
+
+7. **Commit.** `evals/tasks/`, `evals/runs/`, and `evals/templates/` are all part of this repo.
+
+## Layout
+
+```
+evals/
+  tasks/<task-id>/             # task definitions (committed)
+    task.json
+    prompt.md
+    README.md
+  runs/<task-id>/<run-id>/     # captured runs (committed)
+    manifest.json              # versions, project, metrics
+    prompt.md                  # snapshot of the prompt at record time
+    thread/                    # copied handoff docs from the run
+    diff.patch                 # git diff <base>...<branch>
+    reviews/<reviewer>.md      # one markdown file per reviewer
+  templates/
+    review.md                  # the review template
+```
+
+`run-id` format: `<utc-iso-timestamp>-<kernel-sha7>` — sortable, scannable, includes the kernel version axis.
+
+## Why qualitative
+
+Numeric scores are easy to game and lose nuance. The richest signal for improving flow is prose: *"the plan invented a file that doesn't exist," "the spec buried acceptance criteria in narrative," "implement re-asked questions explore had already answered."* That's what the review template invites, and that's what feeds the next iteration.
diff --git a/evals/runs/.gitkeep b/evals/runs/.gitkeep
diff --git a/evals/tasks/.gitkeep b/evals/tasks/.gitkeep
diff --git a/evals/tasks/mini-todo/README.md b/evals/tasks/mini-todo/README.md
@@ -0,0 +1,62 @@
+# mini-todo
+
+**What this task evaluates:** Greenfield code-pipeline behaviour on a small, fully specified task. The prompt is intentionally tight (commands, storage, std-lib-only) so divergences in flow's output reflect the cell's behaviour, not prompt ambiguity. We're watching for:
+
+- **Spec stage** — does it pin the JSON schema and the failure-mode behaviour (missing id, malformed file) the prompt left implicit, or does it just restate the prompt?
+- **Plan stage** — does it propose a sensible single-file layout, or invent unnecessary structure (`src/`, modules, classes for a 100-line tool)?
+- **Implement stage** — is the code tight (target: <150 LOC), does it handle the obvious edge cases (file doesn't exist yet, bad id), and does it stop there?
+- **Review stage** — does it actually find anything, or rubber-stamp?
+- **Scope discipline** — does flow stay in scope, or does it add config files, GitHub Actions, or type hints the prompt didn't ask for?
+
+## Prompt
+
+See `prompt.md`. Pass that to `/flow:flow` verbatim.
+
+## Project location
+
+Run flow against `~/Workspace/jyliang/mini-todo` (a fresh empty repo, scaffolded once and reused across runs — each run cuts its own branch). If that path doesn't exist, see "First-time setup" below.
+
+## How to run a fresh run
+
+1. Reset the project to a clean main:
+   ```
+   cd ~/Workspace/jyliang/mini-todo
+   git checkout main && git reset --hard <initial-commit-sha>
+   ```
+   (Initial-commit SHA: see the **Notes** section at the bottom of this file; it is recorded there once the repo has been initialized.)
+2. Open a Claude Code session in that directory and run `/flow:flow` with the prompt from `prompt.md`.
+3. Walk flow through its stages, answering boundary prompts as a real user would. Note rough cost + duration if you can.
+4. From the flow repo:
+   ```
+   make eval-record TASK=mini-todo PROJECT=~/Workspace/jyliang/mini-todo COST=<usd> DURATION=<sec>
+   ```
+5. Review:
+   ```
+   make eval-review TASK=mini-todo RUN=<run-id>
+   ```
+
+## What's a good outcome?
+
+- Final diff is one Python file (~100–150 LOC) plus a short README.
+- All four commands work; `--file` flag is honoured; missing-id is handled cleanly.
+- Spec doc captures: data shape, what happens on missing/malformed file, behaviour when id doesn't exist.
+- Plan doc proposes the single-file layout without inventing structure.
+- No type hints, no test framework, no CI — those weren't asked for.
+- Total flow run cost: under $2; duration: under 15 minutes.
+
+## First-time setup
+
+```
+mkdir -p ~/Workspace/jyliang/mini-todo
+cd ~/Workspace/jyliang/mini-todo
+git init -b main
+echo "# mini-todo" > README.md
+git add README.md && git commit -m "init"
+```
+
+Then record that initial commit's SHA in the **Notes** section below so future runs can reset to it. A freshly created repo gets a different commit SHA, so replace any SHA already listed there.
+
+## Notes
+
+- **Initial commit SHA** (reset target): `9f05045` (`9f050453dc9081213a1adc1baa1dcae6db7d3dbb`)
+- _(running history across runs — what's improving, what's still a problem, goes here.)_
diff --git a/evals/tasks/mini-todo/prompt.md b/evals/tasks/mini-todo/prompt.md
@@ -0,0 +1,12 @@
+# Prompt
+
+Pass this verbatim to `/flow:flow`:
+
+> Build a single-file Python TODO CLI named `todo`. It stores tasks in a JSON file (default: `~/.todo.json`, override via `--file`). Commands:
+>
+> - `todo add "<text>"` — append a new task, print its id and text
+> - `todo ls` — list tasks, one per line, with `[ ]` / `[x]` and the id
+> - `todo done <id>` — mark complete
+> - `todo rm <id>` — delete
+>
+> Use only the Python standard library. Single file is fine. Include a short README with install + usage.
diff --git a/evals/tasks/mini-todo/task.json b/evals/tasks/mini-todo/task.json
@@ -0,0 +1,6 @@
+{
+  "id": "mini-todo",
+  "type": "code",
+  "title": "Build a mini TODO CLI in a fresh repo",
+  "description": "Have flow build a single-file Python TODO CLI with JSON-file persistence and the commands: add, ls, done, rm. Tests scope discipline (will flow over- or under-engineer it?), spec quality (does it pin the data shape and edge cases?), and code quality on greenfield work."
+}
diff --git a/evals/templates/review.md b/evals/templates/review.md
@@ -0,0 +1,76 @@
+# Review: {{run_id}}
+
+- task: `{{task}}`
+- reviewer: `{{reviewer}}`
+- reviewed_at: {{reviewed_at}}
+
+> Read the run artifacts alongside this review:
+>
+> - manifest: `evals/runs/{{task}}/{{run_id}}/manifest.json`
+> - thread docs: `evals/runs/{{task}}/{{run_id}}/thread/*.md`
+> - final diff: `evals/runs/{{task}}/{{run_id}}/diff.patch`
+
+---
+
+## Overall impression
+
+(gut take — write freely. one paragraph or ten, your choice)
+
+## What went well
+
+(what should flow keep doing? cite specifics — quote the doc, point at the diff)
+
+## What went poorly
+
+(what should flow change? same — be specific)
+
+## Per-stage notes
+
+### Spec / explore
+
+(what did the explore stage produce? was the spec usable? did it capture intent?)
+
+### Plan
+
+(was the plan grounded in the actual codebase? did it reference real files? right level of detail?)
+
+### Implementation / final diff
+
+(read `diff.patch`. is the code good? right size? scope-disciplined? any over-engineering?)
+
+### Review (the stage)
+
+(if a review stage ran — did it catch anything real, or rubber-stamp?)
+
+## Document quality
+
+### Human readability
+
+How easy were the docs for a human to read? Quote awkward passages. Was the structure helpful or noise?
+
+### Machine clarity
+
+Did each stage's output give the next stage what it needed? Look for places where the next stage had to re-derive context, re-ask questions, or guess. That's where the upstream document failed its second reader.
+
+## Code quality
+
+Was the final diff good? Right things changed, right size, right amount of test coverage / comments / abstraction? Specific quotes from the diff are more useful than general impressions.
+
+## Patterns flow should learn
+
+**This is the highest-value section.** What should the next iteration of the kernel or cell do differently based on what you saw? Concrete proposals — *"plan stage should grep for existing patterns before proposing new abstractions," "spec stage should require an explicit acceptance criteria block."* The reflect step will read these.
+
+## Comparison to other runs
+
+If you've reviewed prior runs of this task (or similar tasks), what changed here? Better, worse, same — and why?
+
+## Optional numeric scores (1-5)
+
+Fill in what's useful, leave the rest blank. These supplement the prose, they don't replace it.
+
+- doc_readability:
+- doc_machine_clarity:
+- code_quality:
+- cost_satisfaction:
+- speed_satisfaction:
+- overall:
diff --git a/evals/templates/task-readme.md b/evals/templates/task-readme.md
@@ -0,0 +1,29 @@
+# {{task}}
+
+**What this task evaluates:** _(fill in: what aspects of flow are we testing here? doc quality on a fresh-repo task? code quality on a bugfix? something else?)_
+
+## Prompt
+
+See `prompt.md`. Pass that to `/flow:flow` verbatim when running this eval.
+
+## How to run a fresh run
+
+1. Open a Claude Code session in the target project (or set up a fixture for it).
+2. Run `/flow:flow` with the prompt from `prompt.md`. Let flow walk through the stages — answer boundary prompts the way a real user would.
+3. After flow ships (or pauses), capture from this repo:
+   ```
+   make eval-record TASK={{task}} PROJECT=<path-to-project>
+   ```
+   Add `COST=<usd> DURATION=<sec>` if you tracked them.
+4. Review:
+   ```
+   make eval-review TASK={{task}} RUN=<run-id>
+   ```
+
+## What's a good outcome?
+
+_(fill in: what does flow doing well on this task look like? a tight diff? a plan that referenced the right files? specifics that the reviewer should look for — these become the anchors for the review's "Patterns flow should learn" section.)_
+
+## Notes
+
+_(running history of observations across runs of this task — what's been improving, what's still a problem, what we've tried.)_
diff --git a/scripts/eval-compare.sh b/scripts/eval-compare.sh
@@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+# Compare two recorded runs of the same task.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
+
+task=""
+run_a=""
+run_b=""
+
+# Print the usage line on stderr and exit non-zero.
+usage() {
+    echo "Usage: make eval-compare TASK=<id> A=<run-a> B=<run-b>" >&2
+    exit 1
+}
+
+# Parse --task/--a/--b. Every option takes exactly one value.
+while [[ $# -gt 0 ]]; do
+    case "$1" in
+        --task|--a|--b)
+            # Without this check a trailing option with no value would abort
+            # under `set -u` with a cryptic "$2: unbound variable".
+            if [[ $# -lt 2 ]]; then
+                echo "missing value for $1" >&2
+                usage
+            fi
+            case "$1" in
+                --task) task="$2" ;;
+                --a)    run_a="$2" ;;
+                --b)    run_b="$2" ;;
+            esac
+            shift 2
+            ;;
+        *) echo "unknown arg: $1" >&2; exit 1 ;;
+    esac
+done
+
+# All three values are required; make passes empty strings for unset variables,
+# so emptiness (not absence) is what has to be rejected here.
+if [ -z "$task" ] || [ -z "$run_a" ] || [ -z "$run_b" ]; then
+    usage
+fi
+
+dir_a="$RUNTIME_ROOT/evals/runs/$task/$run_a"
+dir_b="$RUNTIME_ROOT/evals/runs/$task/$run_b"
+[ -d "$dir_a" ] || { echo "missing: $dir_a" >&2; exit 1; }
+[ -d "$dir_b" ] || { echo "missing: $dir_b" >&2; exit 1; }
+
+field() { jq -r "$1 // \"—\"" "$2" 2>/dev/null || echo "—"; }
+
+ka=$(field '.versions.kernel.sha_short' "$dir_a/manifest.json")
+kb=$(field '.versions.kernel.sha_short' "$dir_b/manifest.json")
+kab=$(field '.versions.kernel.branch'   "$dir_a/manifest.json")
+kbb=$(field '.versions.kernel.branch'   "$dir_b/manifest.json")
+ca=$(field '.versions.cell.sha_short'   "$dir_a/manifest.json")
+cb=$(field '.versions.cell.sha_short'   "$dir_b/manifest.json")
+cost_a=$(field '.metrics.cost_usd'      "$dir_a/manifest.json")
+cost_b=$(field '.metrics.cost_usd'      "$dir_b/manifest.json")
+dur_a=$(field '.metrics.duration_sec'   "$dir_a/manifest.json")
+dur_b=$(field '.metrics.duration_sec'   "$dir_b/manifest.json")
+proj_a=$(field '.project.path'          "$dir_a/manifest.json")
+proj_b=$(field '.project.path'          "$dir_b/manifest.json")
+branch_a=$(field '.project.branch'      "$dir_a/manifest.json")
+branch_b=$(field '.project.branch'      "$dir_b/manifest.json")
+
+# One table row: a 16-wide label column followed by two 22-wide value columns.
+row() { printf "%-16s %-22s %-22s\n" "$1" "$2" "$3"; }
+
+# Header: task name and the two run ids being compared.
+echo
+echo "Comparing task: $task"
+echo "============================================================"
+printf "                A: %s\n" "$run_a"
+printf "                B: %s\n" "$run_b"
+echo "------------------------------------------------------------"
+
+# Body: one row per compared field. Only the last path component of the
+# project path is shown.
+row "field"        "A"                     "B"
+row "kernel"       "$ka ($kab)"            "$kb ($kbb)"
+row "cell"         "$ca"                   "$cb"
+row "project"      "$(basename "$proj_a")" "$(basename "$proj_b")"
+row "branch"       "$branch_a"             "$branch_b"
+row "cost (USD)"   "$cost_a"               "$cost_b"
+row "duration (s)" "$dur_a"                "$dur_b"
+echo
+
+list_reviews() {
+    local d="$1"
+    if compgen -G "$d/reviews/*.md" >/dev/null; then
+        for f in "$d/reviews"/*.md; do
+            local name; name=$(basename "$f" .md)
+            local lines; lines=$(wc -l < "$f" | tr -d ' ')
+            printf "  %-20s (%s lines)\n" "$name" "$lines"
+        done
+    else
+        echo "  (none)"
+    fi
+}
+
+# Show which reviews exist for each run, A first, followed by a blank line.
+printf '%s\n' "Reviews on A:"
+list_reviews "$dir_a"
+printf '%s\n' "Reviews on B:"
+list_reviews "$dir_b"
+printf '\n'
+
+# If both sides have a review by the same reviewer, print a ready-to-paste diff
+# command for each review file that exists on both runs.
+shared=()
+if compgen -G "$dir_a/reviews/*.md" >/dev/null && compgen -G "$dir_b/reviews/*.md" >/dev/null; then
+    for f in "$dir_a/reviews"/*.md; do
+        n=$(basename "$f")
+        # A review counts as shared when B has a file with the same name.
+        if [ -f "$dir_b/reviews/$n" ]; then
+            shared+=("$n")
+        fi
+    done
+fi
+if [ ${#shared[@]} -gt 0 ]; then
+    echo "Shared reviewers — diff to compare prose feedback:"
+    for n in "${shared[@]}"; do
+        # %q shell-quotes each path so the printed command still works when
+        # pasted, even if the checkout path or reviewer name contains spaces.
+        # Paths without special characters are printed unchanged.
+        printf '  diff %q %q\n' "$dir_a/reviews/$n" "$dir_b/reviews/$n"
+    done
+    echo
+fi
+
+echo "Artifacts:"
+echo "  A: ${dir_a#$RUNTIME_ROOT/}"
+echo "  B: ${dir_b#$RUNTIME_ROOT/}"
+echo