From c1096d96e4f63d06b4942b67d6063367b50e3db0 Mon Sep 17 00:00:00 2001 From: Jason Liang Date: Wed, 6 May 2026 23:40:56 -0400 Subject: [PATCH 1/2] evals: harness for capturing flow runs and qualitative reviews Adds make eval-task-new / eval-record / eval-review / eval-compare / eval-list to capture a flow run as a versioned record (kernel SHA + cell SHA + thread + diff + manifest), open a long-form review template in $EDITOR, and compare two runs side by side. Reviews are markdown so they're diffable and committed alongside the run for the next iteration of /flow:reflect to read. Co-Authored-By: Claude Opus 4.7 (1M context) --- Makefile | 29 +++++- evals/README.md | 62 +++++++++++++ evals/runs/.gitkeep | 0 evals/tasks/.gitkeep | 0 evals/templates/review.md | 76 +++++++++++++++ evals/templates/task-readme.md | 29 ++++++ scripts/eval-compare.sh | 101 ++++++++++++++++++++ scripts/eval-list.sh | 44 +++++++++ scripts/eval-record.sh | 164 +++++++++++++++++++++++++++++++++ scripts/eval-review.sh | 86 +++++++++++++++++ scripts/eval-task-new.sh | 46 +++++++++ 11 files changed, 636 insertions(+), 1 deletion(-) create mode 100644 evals/README.md create mode 100644 evals/runs/.gitkeep create mode 100644 evals/tasks/.gitkeep create mode 100644 evals/templates/review.md create mode 100644 evals/templates/task-readme.md create mode 100755 scripts/eval-compare.sh create mode 100755 scripts/eval-list.sh create mode 100755 scripts/eval-record.sh create mode 100755 scripts/eval-review.sh create mode 100755 scripts/eval-task-new.sh diff --git a/Makefile b/Makefile index 31a6789..8f6a824 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,8 @@ LINT_DOC_PATHS := README.md kernel commands cells .PHONY: help install doctor list lint-docs \ cell-init cell-new cell-list cell-use cell-rm \ - cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr + cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr \ + eval-task-new eval-record eval-review eval-compare eval-list 
help: ## Show available targets @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \ @@ -78,6 +79,32 @@ cell-branch: ## Cut a branch in the cell for an edit (vars: BRANCH, optional NAM cell-pr: ## Open a PR for current cell edits (vars: TITLE, BODY; optional NAME) @bash $(RUNTIME_ROOT)/scripts/cell-pr.sh "$(NAME)" "$(TITLE)" "$(BODY)" +# ----- Eval harness (capture flow runs, review them, compare across versions) ----- + +eval-task-new: ## Scaffold a new eval task (var: TASK) + @bash $(RUNTIME_ROOT)/scripts/eval-task-new.sh "$(TASK)" + +eval-record: ## Capture a flow run as an eval (vars: TASK; optional PROJECT, THREAD, BASE, COST, DURATION) + @bash $(RUNTIME_ROOT)/scripts/eval-record.sh \ + --task "$(TASK)" \ + $(if $(PROJECT),--project "$(PROJECT)") \ + $(if $(THREAD),--thread "$(THREAD)") \ + $(if $(BASE),--base "$(BASE)") \ + $(if $(COST),--cost "$(COST)") \ + $(if $(DURATION),--duration "$(DURATION)") + +eval-review: ## Open $$EDITOR on a long-form qualitative review (vars: TASK, RUN; optional REVIEWER) + @bash $(RUNTIME_ROOT)/scripts/eval-review.sh \ + --task "$(TASK)" --run "$(RUN)" \ + $(if $(REVIEWER),--reviewer "$(REVIEWER)") + +eval-compare: ## Compare two recorded runs (vars: TASK, A, B) + @bash $(RUNTIME_ROOT)/scripts/eval-compare.sh \ + --task "$(TASK)" --a "$(A)" --b "$(B)" + +eval-list: ## List eval tasks and recorded runs + @bash $(RUNTIME_ROOT)/scripts/eval-list.sh + # ----- Doc lint (preserved from v2) ----- lint-docs: ## Check markdown docs for style-guide regressions diff --git a/evals/README.md b/evals/README.md new file mode 100644 index 0000000..e55cfc8 --- /dev/null +++ b/evals/README.md @@ -0,0 +1,62 @@ +# evals + +Capture flow runs, review them with rich qualitative feedback, and compare runs across kernel/cell versions. Reviews are committed to this repo so the next iteration of `/flow:reflect` (and the humans iterating on flow) can read them. + +## Workflow + +1. 
**Define a task.** A task is the prompt + the expected work — what we're evaluating flow against. + ``` + make eval-task-new TASK=add-standup-cmd + # edit evals/tasks/add-standup-cmd/{prompt.md,README.md} + ``` + +2. **Run flow on the task.** In any project, open a Claude Code session and run `/flow:flow` with the task's prompt. Ship, pause, or abort — all are recordable. + +3. **Capture the run.** + ``` + make eval-record TASK=add-standup-cmd PROJECT=/path/to/project + ``` + Discovers the most recent thread under `<project>/agent/threads/`, copies it, generates the diff, and writes a manifest pinned to the current kernel and cell SHAs. Optional flags: `THREAD=<path>`, `BASE=main`, `COST=N`, `DURATION=N`. + +4. **Review the run.** Long-form qualitative review in `$EDITOR`: + ``` + make eval-review TASK=add-standup-cmd RUN=<run-id> + ``` + Sections cover overall impression, per-stage notes, document quality (for both human and machine readers), code quality, and the highest-value section: **patterns flow should learn**. Numeric scores are optional. + +5. **Compare runs:** + ``` + make eval-compare TASK=add-standup-cmd A=<run-id> B=<run-id> + ``` + Prints version pins and metrics side by side, and lists reviews on both sides — `diff` the review files yourself to compare prose feedback. + +6. **List what we have:** + ``` + make eval-list + ``` + +7. **Commit.** `evals/tasks/`, `evals/runs/`, and `evals/templates/` are all part of this repo. + +## Layout + +``` +evals/ + tasks/<task>/ # task definitions (committed) + task.json + prompt.md + README.md + runs/<task>/<run-id>/ # captured runs (committed) + manifest.json # versions, project, metrics + prompt.md # snapshot of the prompt at record time + thread/ # copied handoff docs from the run + diff.patch # git diff <base>...<branch> + reviews/<reviewer>.md # one markdown file per reviewer + templates/ + review.md # the review template +``` + +`run-id` format: `<UTC-timestamp>-<kernel-sha7>` — sortable, scannable, includes the kernel version axis. + +## Why qualitative + +Numeric scores are easy to game and lose nuance.
The richest signal for improving flow is prose: *"the plan invented a file that doesn't exist," "the spec buried acceptance criteria in narrative," "implement re-asked questions explore had already answered."* That's what the review template invites, and that's what feeds the next iteration. diff --git a/evals/runs/.gitkeep b/evals/runs/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/evals/tasks/.gitkeep b/evals/tasks/.gitkeep new file mode 100644 index 0000000..e69de29 diff --git a/evals/templates/review.md b/evals/templates/review.md new file mode 100644 index 0000000..37d5cc1 --- /dev/null +++ b/evals/templates/review.md @@ -0,0 +1,76 @@ +# Review: {{run_id}} + +- task: `{{task}}` +- reviewer: `{{reviewer}}` +- reviewed_at: {{reviewed_at}} + +> Read the run artifacts alongside this review: +> +> - manifest: `evals/runs/{{task}}/{{run_id}}/manifest.json` +> - thread docs: `evals/runs/{{task}}/{{run_id}}/thread/*.md` +> - final diff: `evals/runs/{{task}}/{{run_id}}/diff.patch` + +--- + +## Overall impression + +(gut take — write freely. one paragraph or ten, your choice) + +## What went well + +(what should flow keep doing? cite specifics — quote the doc, point at the diff) + +## What went poorly + +(what should flow change? same — be specific) + +## Per-stage notes + +### Spec / explore + +(what did the explore stage produce? was the spec usable? did it capture intent?) + +### Plan + +(was the plan grounded in the actual codebase? did it reference real files? right level of detail?) + +### Implementation / final diff + +(read `diff.patch`. is the code good? right size? scope-disciplined? any over-engineering?) + +### Review (the stage) + +(if a review stage ran — did it catch anything real, or rubber-stamp?) + +## Document quality + +### Human readability + +How easy were the docs for a human to read? Quote awkward passages. Was the structure helpful or noise? + +### Machine clarity + +Did each stage's output give the next stage what it needed? 
Look for places where the next stage had to re-derive context, re-ask questions, or guess. That's where the upstream document failed its second reader. + +## Code quality + +Was the final diff good? Right things changed, right size, right amount of test coverage / comments / abstraction? Specific quotes from the diff are more useful than general impressions. + +## Patterns flow should learn + +**This is the highest-value section.** What should the next iteration of the kernel or cell do differently based on what you saw? Concrete proposals — *"plan stage should grep for existing patterns before proposing new abstractions," "spec stage should require an explicit acceptance criteria block."* The reflect step will read these. + +## Comparison to other runs + +If you've reviewed prior runs of this task (or similar tasks), what changed here? Better, worse, same — and why? + +## Optional numeric scores (1-5) + +Fill in what's useful, leave the rest blank. These supplement the prose, they don't replace it. + +- doc_readability: +- doc_machine_clarity: +- code_quality: +- cost_satisfaction: +- speed_satisfaction: +- overall: diff --git a/evals/templates/task-readme.md b/evals/templates/task-readme.md new file mode 100644 index 0000000..4ce5a64 --- /dev/null +++ b/evals/templates/task-readme.md @@ -0,0 +1,29 @@ +# {{task}} + +**What this task evaluates:** _(fill in: what aspects of flow are we testing here? doc quality on a fresh-repo task? code quality on a bugfix? something else?)_ + +## Prompt + +See `prompt.md`. Pass that to `/flow:flow` verbatim when running this eval. + +## How to run a fresh run + +1. Open a Claude Code session in the target project (or set up a fixture for it). +2. Run `/flow:flow` with the prompt from `prompt.md`. Let flow walk through the stages — answer boundary prompts the way a real user would. +3. 
After flow ships (or pauses), capture from this repo: + ``` + make eval-record TASK={{task}} PROJECT= + ``` + Add `COST= DURATION=` if you tracked them. +4. Review: + ``` + make eval-review TASK={{task}} RUN= + ``` + +## What's a good outcome? + +_(fill in: what does flow doing well on this task look like? a tight diff? a plan that referenced the right files? specifics that the reviewer should look for — these become the "what should flow learn" section anchors.)_ + +## Notes + +_(running history of observations across runs of this task — what's been improving, what's still a problem, what we've tried.)_ diff --git a/scripts/eval-compare.sh b/scripts/eval-compare.sh new file mode 100755 index 0000000..a815fd6 --- /dev/null +++ b/scripts/eval-compare.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# Compare two recorded runs of the same task. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +task="" +run_a="" +run_b="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) task="$2"; shift 2 ;; + --a) run_a="$2"; shift 2 ;; + --b) run_b="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +if [ -z "$task" ] || [ -z "$run_a" ] || [ -z "$run_b" ]; then + echo "Usage: make eval-compare TASK= A= B=" >&2 + exit 1 +fi + +dir_a="$RUNTIME_ROOT/evals/runs/$task/$run_a" +dir_b="$RUNTIME_ROOT/evals/runs/$task/$run_b" +[ -d "$dir_a" ] || { echo "missing: $dir_a" >&2; exit 1; } +[ -d "$dir_b" ] || { echo "missing: $dir_b" >&2; exit 1; } + +field() { jq -r "$1 // \"—\"" "$2" 2>/dev/null || echo "—"; } + +ka=$(field '.versions.kernel.sha_short' "$dir_a/manifest.json") +kb=$(field '.versions.kernel.sha_short' "$dir_b/manifest.json") +kab=$(field '.versions.kernel.branch' "$dir_a/manifest.json") +kbb=$(field '.versions.kernel.branch' "$dir_b/manifest.json") +ca=$(field '.versions.cell.sha_short' "$dir_a/manifest.json") +cb=$(field '.versions.cell.sha_short' "$dir_b/manifest.json") +cost_a=$(field '.metrics.cost_usd' 
"$dir_a/manifest.json") +cost_b=$(field '.metrics.cost_usd' "$dir_b/manifest.json") +dur_a=$(field '.metrics.duration_sec' "$dir_a/manifest.json") +dur_b=$(field '.metrics.duration_sec' "$dir_b/manifest.json") +proj_a=$(field '.project.path' "$dir_a/manifest.json") +proj_b=$(field '.project.path' "$dir_b/manifest.json") +branch_a=$(field '.project.branch' "$dir_a/manifest.json") +branch_b=$(field '.project.branch' "$dir_b/manifest.json") + +echo +echo "Comparing task: $task" +echo "============================================================" +printf " A: %s\n" "$run_a" +printf " B: %s\n" "$run_b" +echo "------------------------------------------------------------" +printf "%-16s %-22s %-22s\n" "field" "A" "B" +printf "%-16s %-22s %-22s\n" "kernel" "$ka ($kab)" "$kb ($kbb)" +printf "%-16s %-22s %-22s\n" "cell" "$ca" "$cb" +printf "%-16s %-22s %-22s\n" "project" "$(basename "$proj_a")" "$(basename "$proj_b")" +printf "%-16s %-22s %-22s\n" "branch" "$branch_a" "$branch_b" +printf "%-16s %-22s %-22s\n" "cost (USD)" "$cost_a" "$cost_b" +printf "%-16s %-22s %-22s\n" "duration (s)" "$dur_a" "$dur_b" +echo + +list_reviews() { + local d="$1" + if compgen -G "$d/reviews/*.md" >/dev/null; then + for f in "$d/reviews"/*.md; do + local name; name=$(basename "$f" .md) + local lines; lines=$(wc -l < "$f" | tr -d ' ') + printf " %-20s (%s lines)\n" "$name" "$lines" + done + else + echo " (none)" + fi +} + +echo "Reviews on A:" +list_reviews "$dir_a" +echo "Reviews on B:" +list_reviews "$dir_b" +echo + +# If both sides have a review by the same reviewer, suggest a side-by-side diff. 
+shared=() +if compgen -G "$dir_a/reviews/*.md" >/dev/null && compgen -G "$dir_b/reviews/*.md" >/dev/null; then + for f in "$dir_a/reviews"/*.md; do + n=$(basename "$f") + [ -f "$dir_b/reviews/$n" ] && shared+=("$n") + done +fi +if [ ${#shared[@]} -gt 0 ]; then + echo "Shared reviewers — diff to compare prose feedback:" + for n in "${shared[@]}"; do + echo " diff $dir_a/reviews/$n $dir_b/reviews/$n" + done + echo +fi + +echo "Artifacts:" +echo " A: ${dir_a#$RUNTIME_ROOT/}" +echo " B: ${dir_b#$RUNTIME_ROOT/}" +echo diff --git a/scripts/eval-list.sh b/scripts/eval-list.sh new file mode 100755 index 0000000..cae55a8 --- /dev/null +++ b/scripts/eval-list.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# List eval tasks and recorded runs. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +echo "Tasks:" +found=0 +for d in "$RUNTIME_ROOT/evals/tasks"/*/; do + [ -d "$d" ] || continue + name=$(basename "$d") + [ "$name" = ".gitkeep" ] && continue + title="" + if [ -f "$d/task.json" ] && command -v jq >/dev/null 2>&1; then + title=$(jq -r '.title // ""' "$d/task.json" 2>/dev/null) + fi + printf " %-32s %s\n" "$name" "$title" + found=$((found + 1)) +done +if [ $found -eq 0 ]; then echo " (none — scaffold one with: make eval-task-new TASK=)"; fi + +echo +echo "Runs:" +found=0 +for task_dir in "$RUNTIME_ROOT/evals/runs"/*/; do + [ -d "$task_dir" ] || continue + task=$(basename "$task_dir") + [ "$task" = ".gitkeep" ] && continue + for run_dir in "$task_dir"*/; do + [ -d "$run_dir" ] || continue + run_id=$(basename "$run_dir") + review_count=$(find "$run_dir/reviews" -maxdepth 1 -name '*.md' 2>/dev/null | wc -l | tr -d ' ') + kernel="?" 
+ if [ -f "$run_dir/manifest.json" ] && command -v jq >/dev/null 2>&1; then + kernel=$(jq -r '.versions.kernel.sha_short // "?"' "$run_dir/manifest.json" 2>/dev/null) + fi + plural="s" + [ "$review_count" = "1" ] && plural="" + printf " %-22s %-40s kernel=%s %s review%s\n" "$task" "$run_id" "$kernel" "$review_count" "$plural" + found=$((found + 1)) + done +done +if [ $found -eq 0 ]; then echo " (none — capture one with: make eval-record TASK=)"; fi diff --git a/scripts/eval-record.sh b/scripts/eval-record.sh new file mode 100755 index 0000000..0c2657d --- /dev/null +++ b/scripts/eval-record.sh @@ -0,0 +1,164 @@ +#!/usr/bin/env bash +# Capture a flow run as an eval record. +# +# Discovers the most recent thread under /agent/threads/, copies it, +# generates a diff against the base branch, and writes a manifest pinned to +# the current kernel and active-cell SHAs. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" +FLOW_HOME="${FLOW_HOME:-$HOME/.flow}" + +task="" +thread_dir="" +project_dir="" +base_branch="main" +cost_usd="null" +duration_sec="null" + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) task="$2"; shift 2 ;; + --thread) thread_dir="$2"; shift 2 ;; + --project) project_dir="$2"; shift 2 ;; + --base) base_branch="$2"; shift 2 ;; + --cost) cost_usd="$2"; shift 2 ;; + --duration) duration_sec="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +[ -z "$task" ] && { + echo "Usage: make eval-record TASK= [PROJECT=] [THREAD=] [BASE=main] [COST=] [DURATION=]" >&2 + exit 1 +} + +if [ ! -d "$RUNTIME_ROOT/evals/tasks/$task" ]; then + echo "Task not found: $task" >&2 + echo "Available tasks:" >&2 + ls "$RUNTIME_ROOT/evals/tasks" 2>/dev/null | grep -v '^\.gitkeep$' | sed 's/^/ /' >&2 || echo " (none — scaffold one with: make eval-task-new TASK=)" >&2 + exit 1 +fi + +# Discover thread_dir if not given. 
+if [ -z "$thread_dir" ]; then + [ -z "$project_dir" ] && project_dir="$(pwd)" + threads_root="$project_dir/agent/threads" + if [ ! -d "$threads_root" ]; then + echo "No agent/threads/ in $project_dir." >&2 + echo "Pass --thread explicitly, or run from a project that has flow threads." >&2 + exit 1 + fi + thread_dir=$(find "$threads_root" -mindepth 1 -maxdepth 1 -type d | sort -r | head -n1) + [ -z "$thread_dir" ] && { echo "No threads found in $threads_root" >&2; exit 1; } +fi + +[ -d "$thread_dir" ] || { echo "Thread dir not found: $thread_dir" >&2; exit 1; } + +# Derive project_dir from thread_dir if still unset. +if [ -z "$project_dir" ]; then + project_dir=$(git -C "$thread_dir" rev-parse --show-toplevel 2>/dev/null) || { + echo "Could not derive project dir from $thread_dir; pass --project explicitly" >&2 + exit 1 + } +fi + +# Derive branch: prefer spec.md frontmatter, fall back to the project's HEAD. +branch="" +if [ -f "$thread_dir/spec.md" ]; then + branch=$(grep -E '^branch:[[:space:]]' "$thread_dir/spec.md" 2>/dev/null | head -n1 | sed -E 's/^branch:[[:space:]]*//' | tr -d '"') +fi +[ -z "$branch" ] && branch=$(git -C "$project_dir" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "") + +# Pin versions. +kernel_sha=$(git -C "$RUNTIME_ROOT" rev-parse HEAD) +kernel_sha7=${kernel_sha:0:7} +kernel_branch=$(git -C "$RUNTIME_ROOT" rev-parse --abbrev-ref HEAD) + +cell_path="" +cell_name="(none)" +cell_sha="" +cell_sha7="" +cell_branch="" +if [ -L "$FLOW_HOME/active-cell" ]; then + cell_path=$(readlink "$FLOW_HOME/active-cell") + cell_name=$(basename "$cell_path") + if [ -d "$cell_path/.git" ]; then + cell_sha=$(git -C "$cell_path" rev-parse HEAD 2>/dev/null || echo "") + cell_sha7=${cell_sha:0:7} + cell_branch=$(git -C "$cell_path" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "") + fi +fi + +# Run-id and destination. 
+ts=$(date -u +"%Y-%m-%dT%H-%M-%SZ") +run_id="${ts}-${kernel_sha7}" +dest="$RUNTIME_ROOT/evals/runs/$task/$run_id" +mkdir -p "$dest/reviews" + +# Copy thread (skip if missing). +if [ -d "$thread_dir" ]; then + cp -R "$thread_dir" "$dest/thread" +fi + +# Generate diff. base_branch...branch shows what's on the branch since it diverged from base. +diff_path="$dest/diff.patch" +if [ -n "$branch" ] && git -C "$project_dir" rev-parse --verify "$base_branch" >/dev/null 2>&1; then + git -C "$project_dir" diff "$base_branch...$branch" > "$diff_path" 2>/dev/null || { + echo " warning: git diff $base_branch...$branch failed; trying $base_branch..HEAD" >&2 + git -C "$project_dir" diff "$base_branch..HEAD" > "$diff_path" 2>/dev/null || : > "$diff_path" + } +else + : > "$diff_path" + echo " warning: could not generate diff — branch=$branch base=$base_branch" >&2 +fi + +# Snapshot the prompt. +[ -f "$RUNTIME_ROOT/evals/tasks/$task/prompt.md" ] && cp "$RUNTIME_ROOT/evals/tasks/$task/prompt.md" "$dest/prompt.md" + +# Manifest. 
+recorded_at=$(date -u +%FT%TZ) +jq -n \ + --arg run_id "$run_id" \ + --arg task "$task" \ + --arg recorded_at "$recorded_at" \ + --arg kernel_sha "$kernel_sha" \ + --arg kernel_sha7 "$kernel_sha7" \ + --arg kernel_branch "$kernel_branch" \ + --arg cell_name "$cell_name" \ + --arg cell_sha "$cell_sha" \ + --arg cell_sha7 "$cell_sha7" \ + --arg cell_branch "$cell_branch" \ + --arg project_path "$project_dir" \ + --arg branch "$branch" \ + --arg base "$base_branch" \ + --arg thread_src "$thread_dir" \ + --argjson cost "$cost_usd" \ + --argjson duration "$duration_sec" \ + '{ + run_id: $run_id, + task: $task, + recorded_at: $recorded_at, + versions: { + kernel: { sha: $kernel_sha, sha_short: $kernel_sha7, branch: $kernel_branch }, + cell: { name: $cell_name, sha: $cell_sha, sha_short: $cell_sha7, branch: $cell_branch } + }, + project: { path: $project_path, branch: $branch, base: $base }, + source: { thread_dir: $thread_src }, + metrics: { cost_usd: $cost, duration_sec: $duration }, + notes: "" + }' > "$dest/manifest.json" + +cat <//reviews/.md. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +task="" +run_id="" +reviewer="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) task="$2"; shift 2 ;; + --run) run_id="$2"; shift 2 ;; + --reviewer) reviewer="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +if [ -z "$task" ] || [ -z "$run_id" ]; then + echo "Usage: make eval-review TASK= RUN= [REVIEWER=]" >&2 + exit 1 +fi + +run_dir="$RUNTIME_ROOT/evals/runs/$task/$run_id" +[ -d "$run_dir" ] || { echo "run not found: $run_dir" >&2; exit 1; } + +# Default reviewer = git user, slugified. 
+if [ -z "$reviewer" ]; then + raw=$(git config user.name 2>/dev/null || echo "human") + reviewer=$(echo "$raw" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd 'a-z0-9-') + [ -z "$reviewer" ] && reviewer="human" +fi + +mkdir -p "$run_dir/reviews" +review_file="$run_dir/reviews/$reviewer.md" + +# First-time render: pre-fill the template with run metadata. +if [ ! -f "$review_file" ]; then + template="$RUNTIME_ROOT/evals/templates/review.md" + [ -f "$template" ] || { echo "missing template: $template" >&2; exit 1; } + reviewed_at=$(date -u +%FT%TZ) + sed -e "s|{{run_id}}|$run_id|g" \ + -e "s|{{task}}|$task|g" \ + -e "s|{{reviewer}}|$reviewer|g" \ + -e "s|{{reviewed_at}}|$reviewed_at|g" \ + "$template" > "$review_file" + + # Append an artifact index so the reviewer has paths in-editor. + { + echo "" + echo "---" + echo "" + echo "## Artifact index (for reading alongside)" + echo "" + echo "Manifest:" + if [ -f "$run_dir/manifest.json" ] && command -v jq >/dev/null 2>&1; then + jq -r '" kernel: \(.versions.kernel.sha_short) (\(.versions.kernel.branch))", + " cell: \(.versions.cell.name) @ \(.versions.cell.sha_short)", + " project: \(.project.path)", + " branch: \(.project.branch) vs \(.project.base)", + " cost: \(.metrics.cost_usd // "—")", + " duration: \(.metrics.duration_sec // "—")"' "$run_dir/manifest.json" + fi + echo "" + echo "Thread docs:" + if [ -d "$run_dir/thread" ]; then + find "$run_dir/thread" -type f -name '*.md' | sort | while read -r f; do + printf " %s\n" "${f#$RUNTIME_ROOT/}" + done + fi + echo "" + if [ -s "$run_dir/diff.patch" ]; then + lines=$(wc -l < "$run_dir/diff.patch" | tr -d ' ') + echo "Diff: evals/runs/$task/$run_id/diff.patch ($lines lines)" + else + echo "Diff: (empty)" + fi + } >> "$review_file" +fi + +echo "Opening: $review_file" +"${EDITOR:-vi}" "$review_file" +echo "✓ Review saved: ${review_file#$RUNTIME_ROOT/}" diff --git a/scripts/eval-task-new.sh b/scripts/eval-task-new.sh new file mode 100755 index 0000000..d2fbd11 --- 
/dev/null +++ b/scripts/eval-task-new.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Scaffold a new eval task under evals/tasks//. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +task="${1:-}" +[ -z "$task" ] && { + echo "Usage: make eval-task-new TASK=" >&2 + exit 1 +} + +dest="$RUNTIME_ROOT/evals/tasks/$task" +[ -e "$dest" ] && { echo "Task already exists: $dest" >&2; exit 1; } + +mkdir -p "$dest" + +cat > "$dest/task.json" < "$dest/prompt.md" < example: Add a /standup command that summarises my git activity over the last week. +EOF + +# Render the task README from template, substituting {{task}}. +sed "s/{{task}}/$task/g" "$RUNTIME_ROOT/evals/templates/task-readme.md" > "$dest/README.md" + +cat < Date: Thu, 7 May 2026 10:32:26 -0400 Subject: [PATCH 2/2] evals: add mini-todo as first eval task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Greenfield code-pipeline target — single-file Python TODO CLI in a fresh repo at ~/Workspace/jyliang/mini-todo (init commit 9f05045). Prompt is intentionally tight so divergences across runs reflect the cell, not prompt ambiguity. Watching for spec stage pinning the implicit edge cases, plan stage avoiding unnecessary structure, and implement stage staying under 150 LOC. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- evals/tasks/mini-todo/README.md | 62 +++++++++++++++++++++++++++++++++ evals/tasks/mini-todo/prompt.md | 12 +++++++ evals/tasks/mini-todo/task.json | 6 ++++ 3 files changed, 80 insertions(+) create mode 100644 evals/tasks/mini-todo/README.md create mode 100644 evals/tasks/mini-todo/prompt.md create mode 100644 evals/tasks/mini-todo/task.json diff --git a/evals/tasks/mini-todo/README.md b/evals/tasks/mini-todo/README.md new file mode 100644 index 0000000..62f3ccf --- /dev/null +++ b/evals/tasks/mini-todo/README.md @@ -0,0 +1,62 @@ +# mini-todo + +**What this task evaluates:** Greenfield code-pipeline behaviour on a small, fully specified task. The prompt is intentionally tight (commands, storage, std-lib-only) so divergences in flow's output reflect the cell's behaviour, not prompt ambiguity. We're watching for: + +- **Spec stage** — does it pin the JSON schema and the failure-mode behaviour (missing id, malformed file) the prompt left implicit, or does it just restate the prompt? +- **Plan stage** — does it propose a sensible single-file layout, or invent unnecessary structure (`src/`, modules, classes for a 100-line tool)? +- **Implement stage** — is the code tight (target: <150 LOC), does it handle the obvious edge cases (file doesn't exist yet, bad id), and does it stop there? +- **Review stage** — does it actually find anything, or rubber-stamp? +- **Scope discipline** — does flow stay in scope, or does it add config files, GitHub Actions, type hints when prompt didn't ask? + +## Prompt + +See `prompt.md`. Pass that to `/flow:flow` verbatim. + +## Project location + +Run flow against `~/Workspace/jyliang/mini-todo` (a fresh empty repo, scaffolded once and reused across runs — each run cuts its own branch). If that path doesn't exist, see "First-time setup" below. + +## How to run a fresh run + +1. 
Reset the project to a clean main: + ``` + cd ~/Workspace/jyliang/mini-todo + git checkout main && git reset --hard <initial-commit-sha> + ``` + (Initial-commit SHA: see **Notes** at the bottom of this file; it is recorded there once the project has been initialized.) +2. Open a Claude Code session in that directory and run `/flow:flow` with the prompt from `prompt.md`. +3. Walk flow through its stages, answering boundary prompts as a real user would. Note rough cost + duration if you can. +4. From the flow repo: + ``` + make eval-record TASK=mini-todo PROJECT=$HOME/Workspace/jyliang/mini-todo COST=<usd> DURATION=<seconds> + ``` +5. Review: + ``` + make eval-review TASK=mini-todo RUN=<run-id> + ``` + +## What's a good outcome? + +- Final diff is one Python file (~100–150 LOC) plus a short README. +- All four commands work; `--file` flag is honoured; missing-id is handled cleanly. +- Spec doc captures: data shape, what happens on missing/malformed file, behaviour when id doesn't exist. +- Plan doc proposes the single-file layout without inventing structure. +- No type hints, no test framework, no CI — those weren't asked for. +- Total flow run cost: under $2; duration: under 15 minutes. + +## First-time setup + +``` +mkdir -p ~/Workspace/jyliang/mini-todo +cd ~/Workspace/jyliang/mini-todo +git init -b main +echo "# mini-todo" > README.md +git add README.md && git commit -m "init" +``` + +Then record that initial commit's SHA in the **Notes** section below so future runs can reset to it. + +## Notes + +- **Initial commit SHA** (reset target): `9f05045` (`9f050453dc9081213a1adc1baa1dcae6db7d3dbb`) +- _(running history across runs — what's improving, what's still a problem, goes here.)_ diff --git a/evals/tasks/mini-todo/prompt.md b/evals/tasks/mini-todo/prompt.md new file mode 100644 index 0000000..7549b34 --- /dev/null +++ b/evals/tasks/mini-todo/prompt.md @@ -0,0 +1,12 @@ +# Prompt + +Pass this verbatim to `/flow:flow`: + +> Build a single-file Python TODO CLI named `todo`.
It stores tasks in a JSON file (default: `~/.todo.json`, override via `--file`). Commands: +> +> - `todo add "<text>"` — append a new task, print its id and text +> - `todo ls` — list tasks, one per line, with `[ ]` / `[x]` and the id +> - `todo done <id>` — mark complete +> - `todo rm <id>` — delete +> +> Use only the Python standard library. Single file is fine. Include a short README with install + usage. diff --git a/evals/tasks/mini-todo/task.json b/evals/tasks/mini-todo/task.json new file mode 100644 index 0000000..4229f34 --- /dev/null +++ b/evals/tasks/mini-todo/task.json @@ -0,0 +1,6 @@ +{ + "id": "mini-todo", + "type": "code", + "title": "Build a mini TODO CLI in a fresh repo", + "description": "Have flow build a single-file Python TODO CLI with JSON-file persistence and the commands: add, ls, done, rm. Tests scope discipline (will flow over- or under-engineer it?), spec quality (does it pin the data shape and edge cases?), and code quality on greenfield work." +}