diff --git a/Makefile b/Makefile index 31a6789..8f6a824 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,8 @@ LINT_DOC_PATHS := README.md kernel commands cells .PHONY: help install doctor list lint-docs \ cell-init cell-new cell-list cell-use cell-rm \ - cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr + cell-status cell-link-remote cell-pull cell-push cell-branch cell-pr \ + eval-task-new eval-record eval-review eval-compare eval-list help: ## Show available targets @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | sort | \ @@ -78,6 +79,32 @@ cell-branch: ## Cut a branch in the cell for an edit (vars: BRANCH, optional NAM cell-pr: ## Open a PR for current cell edits (vars: TITLE, BODY; optional NAME) @bash $(RUNTIME_ROOT)/scripts/cell-pr.sh "$(NAME)" "$(TITLE)" "$(BODY)" +# ----- Eval harness (capture flow runs, review them, compare across versions) ----- + +eval-task-new: ## Scaffold a new eval task (var: TASK) + @bash $(RUNTIME_ROOT)/scripts/eval-task-new.sh "$(TASK)" + +eval-record: ## Capture a flow run as an eval (vars: TASK; optional PROJECT, THREAD, BASE, COST, DURATION) + @bash $(RUNTIME_ROOT)/scripts/eval-record.sh \ + --task "$(TASK)" \ + $(if $(PROJECT),--project "$(PROJECT)") \ + $(if $(THREAD),--thread "$(THREAD)") \ + $(if $(BASE),--base "$(BASE)") \ + $(if $(COST),--cost "$(COST)") \ + $(if $(DURATION),--duration "$(DURATION)") + +eval-review: ## Open $$EDITOR on a long-form qualitative review (vars: TASK, RUN; optional REVIEWER) + @bash $(RUNTIME_ROOT)/scripts/eval-review.sh \ + --task "$(TASK)" --run "$(RUN)" \ + $(if $(REVIEWER),--reviewer "$(REVIEWER)") + +eval-compare: ## Compare two recorded runs (vars: TASK, A, B) + @bash $(RUNTIME_ROOT)/scripts/eval-compare.sh \ + --task "$(TASK)" --a "$(A)" --b "$(B)" + +eval-list: ## List eval tasks and recorded runs + @bash $(RUNTIME_ROOT)/scripts/eval-list.sh + # ----- Doc lint (preserved from v2) ----- lint-docs: ## Check markdown docs for style-guide regressions diff --git 
a/evals/README.md b/evals/README.md
new file mode 100644
index 0000000..e55cfc8
--- /dev/null
+++ b/evals/README.md
@@ -0,0 +1,62 @@
+# evals
+
+Capture flow runs, review them with rich qualitative feedback, and compare runs across kernel/cell versions. Reviews are committed to this repo so the next iteration of `/flow:reflect` (and the humans iterating on flow) can read them.
+
+## Workflow
+
+1. **Define a task.** A task is the prompt + the expected work — what we're evaluating flow against.
+   ```
+   make eval-task-new TASK=add-standup-cmd
+   # edit evals/tasks/add-standup-cmd/{prompt.md,README.md}
+   ```
+
+2. **Run flow on the task.** In any project, open a Claude Code session and run `/flow:flow` with the task's prompt. Ship, pause, or abort — all are recordable.
+
+3. **Capture the run.**
+   ```
+   make eval-record TASK=add-standup-cmd PROJECT=/path/to/project
+   ```
+   Discovers the most recent thread under `<project>/agent/threads/` (the thread directory whose name sorts last), copies it, generates the diff, and writes a manifest pinned to the current kernel and cell SHAs. Optional variables: `THREAD=<path>`, `BASE=main`, `COST=N`, `DURATION=N`.
+
+4. **Review the run.** Long-form qualitative review in `$EDITOR`:
+   ```
+   make eval-review TASK=add-standup-cmd RUN=<run-id>
+   ```
+   Sections cover overall impression, per-stage notes, document quality (for both human and machine readers), code quality, and the highest-value section: **patterns flow should learn**. Numeric scores are optional.
+
+5. **Compare runs:**
+   ```
+   make eval-compare TASK=add-standup-cmd A=<run-id> B=<run-id>
+   ```
+   Prints version pins and metrics (cost, duration) side by side, and lists reviews on both sides — `diff` the review files yourself to compare prose feedback.
+
+6. **List what we have:**
+   ```
+   make eval-list
+   ```
+
+7. **Commit.** `evals/tasks/`, `evals/runs/`, and `evals/templates/` are all part of this repo.
+
+## Layout
+
+```
+evals/
+  tasks/<task>/            # task definitions (committed)
+    task.json
+    prompt.md
+    README.md
+  runs/<task>/<run-id>/    # captured runs (committed)
+    manifest.json          # versions, project, metrics
+    prompt.md              # snapshot of the prompt at record time
+    thread/                # copied handoff docs from the run
+    diff.patch             # git diff <base>...<branch>
+    reviews/<reviewer>.md  # one markdown file per reviewer
+  templates/
+    review.md              # the review template
+```
+
+`run-id` format: `<UTC-timestamp>-<kernel-sha7>` (for example `2025-01-31T12-00-00Z-31a6789`) — sortable, scannable, includes the kernel version axis.
+
+## Why qualitative
+
+Numeric scores are easy to game and lose nuance. The richest signal for improving flow is prose: *"the plan invented a file that doesn't exist," "the spec buried acceptance criteria in narrative," "implement re-asked questions explore had already answered."* That's what the review template invites, and that's what feeds the next iteration.
diff --git a/evals/runs/.gitkeep b/evals/runs/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/evals/tasks/.gitkeep b/evals/tasks/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/evals/tasks/mini-todo/README.md b/evals/tasks/mini-todo/README.md
new file mode 100644
index 0000000..62f3ccf
--- /dev/null
+++ b/evals/tasks/mini-todo/README.md
@@ -0,0 +1,62 @@
+# mini-todo
+
+**What this task evaluates:** Greenfield code-pipeline behaviour on a small, fully specified task. The prompt is intentionally tight (commands, storage, std-lib-only) so divergences in flow's output reflect the cell's behaviour, not prompt ambiguity. We're watching for:
+
+- **Spec stage** — does it pin the JSON schema and the failure-mode behaviour (missing id, malformed file) the prompt left implicit, or does it just restate the prompt?
+- **Plan stage** — does it propose a sensible single-file layout, or invent unnecessary structure (`src/`, modules, classes for a 100-line tool)?
+- **Implement stage** — is the code tight (target: <150 LOC), does it handle the obvious edge cases (file doesn't exist yet, bad id), and does it stop there?
+- **Review stage** — does it actually find anything, or rubber-stamp?
+- **Scope discipline** — does flow stay in scope, or does it add config files, GitHub Actions, or type hints when the prompt didn't ask for them?
+
+## Prompt
+
+See `prompt.md`. Pass that to `/flow:flow` verbatim.
+
+## Project location
+
+Run flow against `~/Workspace/jyliang/mini-todo` (a fresh empty repo, scaffolded once and reused across runs — each run cuts its own branch). If that path doesn't exist, see "First-time setup" below.
+
+## How to run a fresh run
+
+1. Reset the project to a clean main:
+   ```
+   cd ~/Workspace/jyliang/mini-todo
+   git checkout main && git reset --hard <initial-commit-sha>
+   ```
+   (The initial-commit SHA is recorded in the **Notes** section at the bottom of this file once the repo has been initialized.)
+2. Open a Claude Code session in that directory and run `/flow:flow` with the prompt from `prompt.md`.
+3. Walk flow through its stages, answering boundary prompts as a real user would. Note rough cost + duration if you can.
+4. From the flow repo:
+   ```
+   make eval-record TASK=mini-todo PROJECT=~/Workspace/jyliang/mini-todo COST=<usd> DURATION=<seconds>
+   ```
+5. Review:
+   ```
+   make eval-review TASK=mini-todo RUN=<run-id>
+   ```
+
+## What's a good outcome?
+
+- Final diff is one Python file (~100–150 LOC) plus a short README.
+- All four commands work; `--file` flag is honoured; missing-id is handled cleanly.
+- Spec doc captures: data shape, what happens on missing/malformed file, behaviour when id doesn't exist.
+- Plan doc proposes the single-file layout without inventing structure.
+- No type hints, no test framework, no CI — those weren't asked for.
+- Total flow run cost: under $2; duration: under 15 minutes.
+
+## First-time setup
+
+```
+mkdir -p ~/Workspace/jyliang/mini-todo
+cd ~/Workspace/jyliang/mini-todo
+git init -b main
+echo "# mini-todo" > README.md
+git add README.md && git commit -m "init"
+```
+
+Then record that initial commit's SHA in the **Notes** section below so future runs can reset to it.
+
+## Notes
+
+- **Initial commit SHA** (reset target): `9f05045` (`9f050453dc9081213a1adc1baa1dcae6db7d3dbb`)
+- _(running history across runs — what's improving, what's still a problem, goes here.)_
diff --git a/evals/tasks/mini-todo/prompt.md b/evals/tasks/mini-todo/prompt.md
new file mode 100644
index 0000000..7549b34
--- /dev/null
+++ b/evals/tasks/mini-todo/prompt.md
@@ -0,0 +1,12 @@
+# Prompt
+
+Pass this verbatim to `/flow:flow`:
+
+> Build a single-file Python TODO CLI named `todo`. It stores tasks in a JSON file (default: `~/.todo.json`, override via `--file`). Commands:
+>
+> - `todo add "<text>"` — append a new task, print its id and text
+> - `todo ls` — list tasks, one per line, with `[ ]` / `[x]` and the id
+> - `todo done <id>` — mark complete
+> - `todo rm <id>` — delete
+>
+> Use only the Python standard library. Single file is fine. Include a short README with install + usage.
diff --git a/evals/tasks/mini-todo/task.json b/evals/tasks/mini-todo/task.json
new file mode 100644
index 0000000..4229f34
--- /dev/null
+++ b/evals/tasks/mini-todo/task.json
@@ -0,0 +1,6 @@
+{
+  "id": "mini-todo",
+  "type": "code",
+  "title": "Build a mini TODO CLI in a fresh repo",
+  "description": "Have flow build a single-file Python TODO CLI with JSON-file persistence and the commands: add, ls, done, rm. Tests scope discipline (will flow over- or under-engineer it?), spec quality (does it pin the data shape and edge cases?), and code quality on greenfield work."
+} diff --git a/evals/templates/review.md b/evals/templates/review.md new file mode 100644 index 0000000..37d5cc1 --- /dev/null +++ b/evals/templates/review.md @@ -0,0 +1,76 @@ +# Review: {{run_id}} + +- task: `{{task}}` +- reviewer: `{{reviewer}}` +- reviewed_at: {{reviewed_at}} + +> Read the run artifacts alongside this review: +> +> - manifest: `evals/runs/{{task}}/{{run_id}}/manifest.json` +> - thread docs: `evals/runs/{{task}}/{{run_id}}/thread/*.md` +> - final diff: `evals/runs/{{task}}/{{run_id}}/diff.patch` + +--- + +## Overall impression + +(gut take — write freely. one paragraph or ten, your choice) + +## What went well + +(what should flow keep doing? cite specifics — quote the doc, point at the diff) + +## What went poorly + +(what should flow change? same — be specific) + +## Per-stage notes + +### Spec / explore + +(what did the explore stage produce? was the spec usable? did it capture intent?) + +### Plan + +(was the plan grounded in the actual codebase? did it reference real files? right level of detail?) + +### Implementation / final diff + +(read `diff.patch`. is the code good? right size? scope-disciplined? any over-engineering?) + +### Review (the stage) + +(if a review stage ran — did it catch anything real, or rubber-stamp?) + +## Document quality + +### Human readability + +How easy were the docs for a human to read? Quote awkward passages. Was the structure helpful or noise? + +### Machine clarity + +Did each stage's output give the next stage what it needed? Look for places where the next stage had to re-derive context, re-ask questions, or guess. That's where the upstream document failed its second reader. + +## Code quality + +Was the final diff good? Right things changed, right size, right amount of test coverage / comments / abstraction? Specific quotes from the diff are more useful than general impressions. 
+ +## Patterns flow should learn + +**This is the highest-value section.** What should the next iteration of the kernel or cell do differently based on what you saw? Concrete proposals — *"plan stage should grep for existing patterns before proposing new abstractions," "spec stage should require an explicit acceptance criteria block."* The reflect step will read these. + +## Comparison to other runs + +If you've reviewed prior runs of this task (or similar tasks), what changed here? Better, worse, same — and why? + +## Optional numeric scores (1-5) + +Fill in what's useful, leave the rest blank. These supplement the prose, they don't replace it. + +- doc_readability: +- doc_machine_clarity: +- code_quality: +- cost_satisfaction: +- speed_satisfaction: +- overall: diff --git a/evals/templates/task-readme.md b/evals/templates/task-readme.md new file mode 100644 index 0000000..4ce5a64 --- /dev/null +++ b/evals/templates/task-readme.md @@ -0,0 +1,29 @@ +# {{task}} + +**What this task evaluates:** _(fill in: what aspects of flow are we testing here? doc quality on a fresh-repo task? code quality on a bugfix? something else?)_ + +## Prompt + +See `prompt.md`. Pass that to `/flow:flow` verbatim when running this eval. + +## How to run a fresh run + +1. Open a Claude Code session in the target project (or set up a fixture for it). +2. Run `/flow:flow` with the prompt from `prompt.md`. Let flow walk through the stages — answer boundary prompts the way a real user would. +3. After flow ships (or pauses), capture from this repo: + ``` + make eval-record TASK={{task}} PROJECT= + ``` + Add `COST= DURATION=` if you tracked them. +4. Review: + ``` + make eval-review TASK={{task}} RUN= + ``` + +## What's a good outcome? + +_(fill in: what does flow doing well on this task look like? a tight diff? a plan that referenced the right files? 
specifics that the reviewer should look for — these become the "what should flow learn" section anchors.)_ + +## Notes + +_(running history of observations across runs of this task — what's been improving, what's still a problem, what we've tried.)_ diff --git a/scripts/eval-compare.sh b/scripts/eval-compare.sh new file mode 100755 index 0000000..a815fd6 --- /dev/null +++ b/scripts/eval-compare.sh @@ -0,0 +1,101 @@ +#!/usr/bin/env bash +# Compare two recorded runs of the same task. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +task="" +run_a="" +run_b="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) task="$2"; shift 2 ;; + --a) run_a="$2"; shift 2 ;; + --b) run_b="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +if [ -z "$task" ] || [ -z "$run_a" ] || [ -z "$run_b" ]; then + echo "Usage: make eval-compare TASK= A= B=" >&2 + exit 1 +fi + +dir_a="$RUNTIME_ROOT/evals/runs/$task/$run_a" +dir_b="$RUNTIME_ROOT/evals/runs/$task/$run_b" +[ -d "$dir_a" ] || { echo "missing: $dir_a" >&2; exit 1; } +[ -d "$dir_b" ] || { echo "missing: $dir_b" >&2; exit 1; } + +field() { jq -r "$1 // \"—\"" "$2" 2>/dev/null || echo "—"; } + +ka=$(field '.versions.kernel.sha_short' "$dir_a/manifest.json") +kb=$(field '.versions.kernel.sha_short' "$dir_b/manifest.json") +kab=$(field '.versions.kernel.branch' "$dir_a/manifest.json") +kbb=$(field '.versions.kernel.branch' "$dir_b/manifest.json") +ca=$(field '.versions.cell.sha_short' "$dir_a/manifest.json") +cb=$(field '.versions.cell.sha_short' "$dir_b/manifest.json") +cost_a=$(field '.metrics.cost_usd' "$dir_a/manifest.json") +cost_b=$(field '.metrics.cost_usd' "$dir_b/manifest.json") +dur_a=$(field '.metrics.duration_sec' "$dir_a/manifest.json") +dur_b=$(field '.metrics.duration_sec' "$dir_b/manifest.json") +proj_a=$(field '.project.path' "$dir_a/manifest.json") +proj_b=$(field '.project.path' "$dir_b/manifest.json") +branch_a=$(field '.project.branch' 
"$dir_a/manifest.json") +branch_b=$(field '.project.branch' "$dir_b/manifest.json") + +echo +echo "Comparing task: $task" +echo "============================================================" +printf " A: %s\n" "$run_a" +printf " B: %s\n" "$run_b" +echo "------------------------------------------------------------" +printf "%-16s %-22s %-22s\n" "field" "A" "B" +printf "%-16s %-22s %-22s\n" "kernel" "$ka ($kab)" "$kb ($kbb)" +printf "%-16s %-22s %-22s\n" "cell" "$ca" "$cb" +printf "%-16s %-22s %-22s\n" "project" "$(basename "$proj_a")" "$(basename "$proj_b")" +printf "%-16s %-22s %-22s\n" "branch" "$branch_a" "$branch_b" +printf "%-16s %-22s %-22s\n" "cost (USD)" "$cost_a" "$cost_b" +printf "%-16s %-22s %-22s\n" "duration (s)" "$dur_a" "$dur_b" +echo + +list_reviews() { + local d="$1" + if compgen -G "$d/reviews/*.md" >/dev/null; then + for f in "$d/reviews"/*.md; do + local name; name=$(basename "$f" .md) + local lines; lines=$(wc -l < "$f" | tr -d ' ') + printf " %-20s (%s lines)\n" "$name" "$lines" + done + else + echo " (none)" + fi +} + +echo "Reviews on A:" +list_reviews "$dir_a" +echo "Reviews on B:" +list_reviews "$dir_b" +echo + +# If both sides have a review by the same reviewer, suggest a side-by-side diff. +shared=() +if compgen -G "$dir_a/reviews/*.md" >/dev/null && compgen -G "$dir_b/reviews/*.md" >/dev/null; then + for f in "$dir_a/reviews"/*.md; do + n=$(basename "$f") + [ -f "$dir_b/reviews/$n" ] && shared+=("$n") + done +fi +if [ ${#shared[@]} -gt 0 ]; then + echo "Shared reviewers — diff to compare prose feedback:" + for n in "${shared[@]}"; do + echo " diff $dir_a/reviews/$n $dir_b/reviews/$n" + done + echo +fi + +echo "Artifacts:" +echo " A: ${dir_a#$RUNTIME_ROOT/}" +echo " B: ${dir_b#$RUNTIME_ROOT/}" +echo diff --git a/scripts/eval-list.sh b/scripts/eval-list.sh new file mode 100755 index 0000000..cae55a8 --- /dev/null +++ b/scripts/eval-list.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# List eval tasks and recorded runs. 
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"  # repo root = parent of scripts/
+
+echo "Tasks:"
+found=0
+for d in "$RUNTIME_ROOT/evals/tasks"/*/; do
+  [ -d "$d" ] || continue  # an unmatched glob stays literal; skip it
+  name=$(basename "$d")
+  [ "$name" = ".gitkeep" ] && continue
+  title=""
+  if [ -f "$d/task.json" ] && command -v jq >/dev/null 2>&1; then
+    title=$(jq -r '.title // ""' "$d/task.json" 2>/dev/null) || title=""  # malformed task.json must not abort the listing
+  fi
+  printf " %-32s %s\n" "$name" "$title"
+  found=$((found + 1))
+done
+if [ $found -eq 0 ]; then echo " (none — scaffold one with: make eval-task-new TASK=<name>)"; fi
+
+echo
+echo "Runs:"
+found=0
+for task_dir in "$RUNTIME_ROOT/evals/runs"/*/; do
+  [ -d "$task_dir" ] || continue
+  task=$(basename "$task_dir")
+  [ "$task" = ".gitkeep" ] && continue
+  for run_dir in "$task_dir"*/; do
+    [ -d "$run_dir" ] || continue
+    run_id=$(basename "$run_dir")
+    review_count=$( { find "$run_dir/reviews" -maxdepth 1 -name '*.md' 2>/dev/null || true; } | wc -l | tr -d ' ')  # reviews/ may be absent (git does not track empty dirs); without the guard, pipefail + set -e kill the script here
+    kernel="?"
+    if [ -f "$run_dir/manifest.json" ] && command -v jq >/dev/null 2>&1; then
+      kernel=$(jq -r '.versions.kernel.sha_short // "?"' "$run_dir/manifest.json" 2>/dev/null) || kernel="?"  # unreadable manifest: keep the "?" placeholder
+    fi
+    plural="s"
+    [ "$review_count" = "1" ] && plural=""
+    printf " %-22s %-40s kernel=%s %s review%s\n" "$task" "$run_id" "$kernel" "$review_count" "$plural"
+    found=$((found + 1))
+  done
+done
+if [ $found -eq 0 ]; then echo " (none — capture one with: make eval-record TASK=<task>)"; fi
diff --git a/scripts/eval-record.sh b/scripts/eval-record.sh
new file mode 100755
index 0000000..0c2657d
--- /dev/null
+++ b/scripts/eval-record.sh
@@ -0,0 +1,164 @@
+#!/usr/bin/env bash
+# Capture a flow run as an eval record.
+#
+# Discovers the most recent thread under <project>/agent/threads/, copies it,
+# generates a diff against the base branch, and writes a manifest pinned to
+# the current kernel and active-cell SHAs.
+
+set -euo pipefail
+
+RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.."
&& pwd)" +FLOW_HOME="${FLOW_HOME:-$HOME/.flow}" + +task="" +thread_dir="" +project_dir="" +base_branch="main" +cost_usd="null" +duration_sec="null" + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) task="$2"; shift 2 ;; + --thread) thread_dir="$2"; shift 2 ;; + --project) project_dir="$2"; shift 2 ;; + --base) base_branch="$2"; shift 2 ;; + --cost) cost_usd="$2"; shift 2 ;; + --duration) duration_sec="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +[ -z "$task" ] && { + echo "Usage: make eval-record TASK= [PROJECT=] [THREAD=] [BASE=main] [COST=] [DURATION=]" >&2 + exit 1 +} + +if [ ! -d "$RUNTIME_ROOT/evals/tasks/$task" ]; then + echo "Task not found: $task" >&2 + echo "Available tasks:" >&2 + ls "$RUNTIME_ROOT/evals/tasks" 2>/dev/null | grep -v '^\.gitkeep$' | sed 's/^/ /' >&2 || echo " (none — scaffold one with: make eval-task-new TASK=)" >&2 + exit 1 +fi + +# Discover thread_dir if not given. +if [ -z "$thread_dir" ]; then + [ -z "$project_dir" ] && project_dir="$(pwd)" + threads_root="$project_dir/agent/threads" + if [ ! -d "$threads_root" ]; then + echo "No agent/threads/ in $project_dir." >&2 + echo "Pass --thread explicitly, or run from a project that has flow threads." >&2 + exit 1 + fi + thread_dir=$(find "$threads_root" -mindepth 1 -maxdepth 1 -type d | sort -r | head -n1) + [ -z "$thread_dir" ] && { echo "No threads found in $threads_root" >&2; exit 1; } +fi + +[ -d "$thread_dir" ] || { echo "Thread dir not found: $thread_dir" >&2; exit 1; } + +# Derive project_dir from thread_dir if still unset. +if [ -z "$project_dir" ]; then + project_dir=$(git -C "$thread_dir" rev-parse --show-toplevel 2>/dev/null) || { + echo "Could not derive project dir from $thread_dir; pass --project explicitly" >&2 + exit 1 + } +fi + +# Derive branch: prefer spec.md frontmatter, fall back to the project's HEAD. 
+branch="" +if [ -f "$thread_dir/spec.md" ]; then + branch=$(grep -E '^branch:[[:space:]]' "$thread_dir/spec.md" 2>/dev/null | head -n1 | sed -E 's/^branch:[[:space:]]*//' | tr -d '"') +fi +[ -z "$branch" ] && branch=$(git -C "$project_dir" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "") + +# Pin versions. +kernel_sha=$(git -C "$RUNTIME_ROOT" rev-parse HEAD) +kernel_sha7=${kernel_sha:0:7} +kernel_branch=$(git -C "$RUNTIME_ROOT" rev-parse --abbrev-ref HEAD) + +cell_path="" +cell_name="(none)" +cell_sha="" +cell_sha7="" +cell_branch="" +if [ -L "$FLOW_HOME/active-cell" ]; then + cell_path=$(readlink "$FLOW_HOME/active-cell") + cell_name=$(basename "$cell_path") + if [ -d "$cell_path/.git" ]; then + cell_sha=$(git -C "$cell_path" rev-parse HEAD 2>/dev/null || echo "") + cell_sha7=${cell_sha:0:7} + cell_branch=$(git -C "$cell_path" rev-parse --abbrev-ref HEAD 2>/dev/null || echo "") + fi +fi + +# Run-id and destination. +ts=$(date -u +"%Y-%m-%dT%H-%M-%SZ") +run_id="${ts}-${kernel_sha7}" +dest="$RUNTIME_ROOT/evals/runs/$task/$run_id" +mkdir -p "$dest/reviews" + +# Copy thread (skip if missing). +if [ -d "$thread_dir" ]; then + cp -R "$thread_dir" "$dest/thread" +fi + +# Generate diff. base_branch...branch shows what's on the branch since it diverged from base. +diff_path="$dest/diff.patch" +if [ -n "$branch" ] && git -C "$project_dir" rev-parse --verify "$base_branch" >/dev/null 2>&1; then + git -C "$project_dir" diff "$base_branch...$branch" > "$diff_path" 2>/dev/null || { + echo " warning: git diff $base_branch...$branch failed; trying $base_branch..HEAD" >&2 + git -C "$project_dir" diff "$base_branch..HEAD" > "$diff_path" 2>/dev/null || : > "$diff_path" + } +else + : > "$diff_path" + echo " warning: could not generate diff — branch=$branch base=$base_branch" >&2 +fi + +# Snapshot the prompt. +[ -f "$RUNTIME_ROOT/evals/tasks/$task/prompt.md" ] && cp "$RUNTIME_ROOT/evals/tasks/$task/prompt.md" "$dest/prompt.md" + +# Manifest. 
+recorded_at=$(date -u +%FT%TZ) +jq -n \ + --arg run_id "$run_id" \ + --arg task "$task" \ + --arg recorded_at "$recorded_at" \ + --arg kernel_sha "$kernel_sha" \ + --arg kernel_sha7 "$kernel_sha7" \ + --arg kernel_branch "$kernel_branch" \ + --arg cell_name "$cell_name" \ + --arg cell_sha "$cell_sha" \ + --arg cell_sha7 "$cell_sha7" \ + --arg cell_branch "$cell_branch" \ + --arg project_path "$project_dir" \ + --arg branch "$branch" \ + --arg base "$base_branch" \ + --arg thread_src "$thread_dir" \ + --argjson cost "$cost_usd" \ + --argjson duration "$duration_sec" \ + '{ + run_id: $run_id, + task: $task, + recorded_at: $recorded_at, + versions: { + kernel: { sha: $kernel_sha, sha_short: $kernel_sha7, branch: $kernel_branch }, + cell: { name: $cell_name, sha: $cell_sha, sha_short: $cell_sha7, branch: $cell_branch } + }, + project: { path: $project_path, branch: $branch, base: $base }, + source: { thread_dir: $thread_src }, + metrics: { cost_usd: $cost, duration_sec: $duration }, + notes: "" + }' > "$dest/manifest.json" + +cat <//reviews/.md. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +task="" +run_id="" +reviewer="" + +while [[ $# -gt 0 ]]; do + case "$1" in + --task) task="$2"; shift 2 ;; + --run) run_id="$2"; shift 2 ;; + --reviewer) reviewer="$2"; shift 2 ;; + *) echo "unknown arg: $1" >&2; exit 1 ;; + esac +done + +if [ -z "$task" ] || [ -z "$run_id" ]; then + echo "Usage: make eval-review TASK= RUN= [REVIEWER=]" >&2 + exit 1 +fi + +run_dir="$RUNTIME_ROOT/evals/runs/$task/$run_id" +[ -d "$run_dir" ] || { echo "run not found: $run_dir" >&2; exit 1; } + +# Default reviewer = git user, slugified. 
+if [ -z "$reviewer" ]; then
+  raw=$(git config user.name 2>/dev/null || echo "human")
+  reviewer=$(echo "$raw" | tr '[:upper:]' '[:lower:]' | tr ' ' '-' | tr -cd 'a-z0-9-')  # lowercase, spaces to dashes, drop everything else
+  [ -z "$reviewer" ] && reviewer="human"
+fi
+
+mkdir -p "$run_dir/reviews"
+review_file="$run_dir/reviews/$reviewer.md"
+
+# First-time render: pre-fill the template with run metadata.
+# An existing review file is never re-rendered, so earlier edits are preserved.
+if [ ! -f "$review_file" ]; then
+  template="$RUNTIME_ROOT/evals/templates/review.md"
+  [ -f "$template" ] || { echo "missing template: $template" >&2; exit 1; }
+  reviewed_at=$(date -u +%FT%TZ)
+  sed -e "s|{{run_id}}|$run_id|g" \
+      -e "s|{{task}}|$task|g" \
+      -e "s|{{reviewer}}|$reviewer|g" \
+      -e "s|{{reviewed_at}}|$reviewed_at|g" \
+      "$template" > "$review_file"
+
+  # Append an artifact index so the reviewer has paths in-editor.
+  {
+    echo ""
+    echo "---"
+    echo ""
+    echo "## Artifact index (for reading alongside)"
+    echo ""
+    echo "Manifest:"
+    if [ -f "$run_dir/manifest.json" ] && command -v jq >/dev/null 2>&1; then
+      jq -r '" kernel: \(.versions.kernel.sha_short) (\(.versions.kernel.branch))",
+             " cell: \(.versions.cell.name) @ \(.versions.cell.sha_short)",
+             " project: \(.project.path)",
+             " branch: \(.project.branch) vs \(.project.base)",
+             " cost: \(.metrics.cost_usd // "—")",
+             " duration: \(.metrics.duration_sec // "—")"' "$run_dir/manifest.json" || echo " (manifest unreadable)"  # a jq failure here would otherwise abort under set -e and leave a truncated review that is never re-rendered
+    fi
+    echo ""
+    echo "Thread docs:"
+    if [ -d "$run_dir/thread" ]; then
+      find "$run_dir/thread" -type f -name '*.md' | sort | while read -r f; do
+        printf " %s\n" "${f#"$RUNTIME_ROOT"/}"  # quoted so the root path is stripped literally, not as a glob pattern
+      done
+    fi
+    echo ""
+    if [ -s "$run_dir/diff.patch" ]; then
+      lines=$(wc -l < "$run_dir/diff.patch" | tr -d ' ')
+      echo "Diff: evals/runs/$task/$run_id/diff.patch ($lines lines)"
+    else
+      echo "Diff: (empty)"
+    fi
+  } >> "$review_file"
+fi
+
+echo "Opening: $review_file"
+${EDITOR:-vi} "$review_file"  # deliberately unquoted: EDITOR may carry arguments (e.g. "code -w"), which quoting would turn into a single bogus command name
+echo "✓ Review saved: ${review_file#"$RUNTIME_ROOT"/}"
diff --git a/scripts/eval-task-new.sh b/scripts/eval-task-new.sh
new file mode 100755
index 0000000..d2fbd11
--- 
/dev/null +++ b/scripts/eval-task-new.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Scaffold a new eval task under evals/tasks//. + +set -euo pipefail + +RUNTIME_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" + +task="${1:-}" +[ -z "$task" ] && { + echo "Usage: make eval-task-new TASK=" >&2 + exit 1 +} + +dest="$RUNTIME_ROOT/evals/tasks/$task" +[ -e "$dest" ] && { echo "Task already exists: $dest" >&2; exit 1; } + +mkdir -p "$dest" + +cat > "$dest/task.json" < "$dest/prompt.md" < example: Add a /standup command that summarises my git activity over the last week. +EOF + +# Render the task README from template, substituting {{task}}. +sed "s/{{task}}/$task/g" "$RUNTIME_ROOT/evals/templates/task-readme.md" > "$dest/README.md" + +cat <