diff --git a/.env.example b/.env.example index c767cf6..34fd1ae 100644 --- a/.env.example +++ b/.env.example @@ -1,34 +1,18 @@ -# Tracecraft Configuration +# Tracecraft environment variables — only what the CLI actually reads. -# SeaweedFS Configuration -TRACECRAFT_SEAWEEDFS_S3_ENDPOINT=localhost:8333 -TRACECRAFT_SEAWEEDFS_ACCESS_KEY=admin -TRACECRAFT_SEAWEEDFS_SECRET_KEY=admin_secret_key -TRACECRAFT_SEAWEEDFS_USE_SSL=false +# S3 backend credentials (read by `tracecraft init`; match docker-compose.dev.yml) +AWS_ACCESS_KEY_ID=admin +AWS_SECRET_ACCESS_KEY=admin123456 -# Security Configuration -TRACECRAFT_SECURITY_ENCRYPTION_ENABLED=true -TRACECRAFT_SECURITY_JWT_SECRET=your-jwt-secret-here +# HuggingFace backend token (read by `tracecraft init --backend hf`) +# HF_TOKEN=hf_... -# Storage Configuration -TRACECRAFT_STORAGE_BUCKET_NAME=tracecraft-data -TRACECRAFT_STORAGE_RETENTION_DAYS=90 +# Override the agent identity per shell/process (lets several agents share one +# directory and .tracecraft.json) +# TRACECRAFT_AGENT=designer -# UI Configuration -TRACECRAFT_UI_HOST=0.0.0.0 -TRACECRAFT_UI_PORT=8000 -TRACECRAFT_UI_AUTH_REQUIRED=false - -# Monitoring Configuration -TRACECRAFT_MONITORING_ENABLED=true - -# Database Configuration -TRACECRAFT_DATABASE_HOST=localhost -TRACECRAFT_DATABASE_PORT=5432 -TRACECRAFT_DATABASE_DATABASE=tracecraft -TRACECRAFT_DATABASE_USER=tracecraft -TRACECRAFT_DATABASE_PASSWORD=tracecraft - -# Redis Configuration -TRACECRAFT_REDIS_HOST=localhost -TRACECRAFT_REDIS_PORT=6379 +# Session-mirror harness location overrides (only if your harness lives in a +# non-default path) +# OPENCLAW_STATE_DIR= +# OPENCLAW_HOME= +# HERMES_HOME= diff --git a/.github/workflows/test.yml b/.github/workflows/ci.yml similarity index 86% rename from .github/workflows/test.yml rename to .github/workflows/ci.yml index 8c4d219..975c5a3 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/ci.yml @@ -1,4 +1,4 @@ -name: tests +name: ci on: push: @@ -7,7 +7,7 @@ on: 
branches: [main] jobs: - format: + lint: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -18,12 +18,15 @@ jobs: python-version: "3.12" cache: pip - - name: Install ruff + - name: Install package + dev extras working-directory: sdk run: | python -m pip install --upgrade pip pip install -e ".[dev]" + - name: Lint (ruff check) + run: ruff check sdk/ + - name: Check formatting (ruff format) run: ruff format --check sdk/ @@ -32,7 +35,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12", "3.13"] + python-version: ["3.10", "3.12"] steps: - uses: actions/checkout@v4 diff --git a/CLAUDE.md b/CLAUDE.md index ea7b340..5668b72 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,8 +38,8 @@ pivot lives in `plans/server-archive/` for reference only — nothing in the SDK // agents/.json ← agent registration + heartbeat memory/.json ← shared key-value state - messages//_.json ← per-agent mailbox - messages/_broadcast/_.json ← broadcast + messages//__.json ← per-agent mailbox + messages/_broadcast/__.json ← broadcast steps//claim.json ← atomic claim (If-None-Match=*) steps//status.json ← pending / in_progress / complete steps//handoff.json ← note + from_agent for next agent @@ -56,13 +56,17 @@ pivot lives in `plans/server-archive/` for reference only — nothing in the SDK many isolated projects. - **No server, no daemon**: each CLI call is stateless; state lives on the bucket. - **No vendor lock-in**: AWS, R2, MinIO, B2, Wasabi, HuggingFace all work today. +- **Claim/status crash-window invariant**: `claim.json` (atomic) and `status.json` are + two separate writes; a crash between them leaves a claim with no status. Readers MUST + treat "claim.json exists, status.json missing" as `in_progress` by the claiming agent — + the claim is the authoritative write (`step-status` and `wait-for` implement this via + `_effective_status` in `cli/steps.py`). 
-## Known gaps (May 2026) +## Known gaps (June 2026) -- No TTL on claims (a crashed claim-holder keeps the lock forever) — Tier 1 work. +- No TTL on claims (a crashed claim-holder keeps the lock forever; `complete --force` + is the manual escape hatch) — Tier 1 work. - Heartbeat is written at `init` only, never refreshed — Tier 1 work. -- Messages keyed by `_.json` can collide same-second — Tier 1 work. -- No tests in `sdk/tests/` — Tier 1 work. ## Building diff --git a/README.md b/README.md index c4576f7..386274a 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![PyPI](https://img.shields.io/pypi/v/tracecraft-ai)](https://pypi.org/project/tracecraft-ai/) [![Python](https://img.shields.io/pypi/pyversions/tracecraft-ai)](https://pypi.org/project/tracecraft-ai/) [![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT) -[![Tests](https://github.com/Arrmlet/tracecraft/actions/workflows/test.yml/badge.svg)](https://github.com/Arrmlet/tracecraft/actions/workflows/test.yml) +[![CI](https://github.com/Arrmlet/tracecraft/actions/workflows/ci.yml/badge.svg)](https://github.com/Arrmlet/tracecraft/actions/workflows/ci.yml) **Tracecraft is a CLI coordination layer for multi-agent AI systems** — shared **memory**, a **mailbox**, atomic task **claims**, **handoffs**, and **artifacts**, plus mirrored **session transcripts**, all stored as plain JSON in any **S3** or **HuggingFace** bucket. No server. No database. No SDK lock-in. @@ -29,20 +29,25 @@ docker run -d -p 9000:9000 \ minio/minio server /data ``` -Register two agents against the same project: +(From a checkout, `docker compose -f docker-compose.dev.yml up -d` does the same and adds the MinIO console on `:9001`.) + +Register two agents against the same project. 
Credentials come from the standard AWS env vars, so they no longer ride along as flags on every `tracecraft` command (for real keys, load them from an untracked env file or a secrets manager — interactively typed `export` lines are still saved to shell history): ```bash +export AWS_ACCESS_KEY_ID=admin +export AWS_SECRET_ACCESS_KEY=admin123456 + # Terminal 1 tracecraft init --project demo --agent designer \ - --endpoint http://localhost:9000 --bucket tracecraft \ - --access-key admin --secret-key admin123456 + --endpoint http://localhost:9000 --bucket tracecraft # Terminal 2 — same flags, --agent developer tracecraft init --project demo --agent developer \ - --endpoint http://localhost:9000 --bucket tracecraft \ - --access-key admin --secret-key admin123456 + --endpoint http://localhost:9000 --bucket tracecraft ``` +`init` writes the config to `.tracecraft.json` with mode `600` and auto-adds it to `.gitignore` when you're in a git repo. + Now the core move — **two agents cannot grab the same work**, with no lock service and no server to run: ```console @@ -96,16 +101,39 @@ tracecraft send _broadcast "v1 cut at 3pm, wrap your tasks" --- -## Coordination + reasoning in one bucket +## Why not LangGraph / Redis / message queues? + +- **Frameworks (LangGraph, CrewAI, AutoGen)** orchestrate agents *inside one process*. Tracecraft coordinates *any* processes across machines — different harnesses, different clouds, different teams — through storage they already have. +- **Redis / Postgres / a queue** means operating a server: provisioning, auth, uptime, backups. A bucket is zero infra, and every state change is a browsable JSON file — you get an audit trail for free just by opening the bucket. +- **A2A / MCP** are live wire protocols between *running* agents. Tracecraft is durable state for agents that aren't running at the same time — one agent finishes Tuesday, the next picks up the handoff Wednesday. + +## Status & limitations + +Tracecraft is **pre-alpha**. Honest sharp edges, as of now: + +- **No TTL on claims** — a crashed claim-holder keeps the lock until someone runs `complete --force`. 
+- **Heartbeat isn't refreshed** — `agents` shows who registered, not who's alive right now. +- **HF claims are best-effort** — HuggingFace Buckets have no conditional write, so atomic claims need an S3-compatible backend. + +Open issues and roadmap → [github.com/Arrmlet/tracecraft/issues](https://github.com/Arrmlet/tracecraft/issues) + +--- + +## Session mirroring Most coordination tools store the *events* — who claimed what, who messaged whom. Tracecraft stores those **and** each agent's full reasoning, by mirroring coding-agent session transcripts into the same bucket. When a run goes sideways, one `tracecraft session show` gives you the handoffs **and** the chain of thought behind them — same place, same JSON, no second system to wire up. ```bash -tracecraft session mirror --harness claude-code # tail this session into the bucket -tracecraft session show --tail 50 # read coordination + reasoning together +tracecraft session mirror --harness claude-code # upload this session's new bytes +tracecraft session list # browse mirrored sessions +tracecraft session show --tail 50 # replay: meta + last N transcript lines +tracecraft session stop # clear local cursor, mark session ended ``` -Works with **Claude Code, Codex, OpenClaw, and Hermes**. Source transcripts are never modified; secret-shape redaction (AWS / Anthropic / OpenAI / HF / GitHub / Slack token patterns) is on by default and counted in metadata. +- **Four harnesses** — `claude-code`, `codex`, `openclaw`, `hermes`. Anything else can mirror by writing JSONL to the same layout. +- **Incremental cursor uploads** — `mirror` keeps a per-session byte offset and uploads only what's new as numbered parts, so re-running it from a cron or hook is safe and cheap; a run with nothing new is a no-op. The part sequence is derived from the bucket, so it even survives losing the local state file. 
+- **Redaction on by default** — AWS / Anthropic / OpenAI / HF / GitHub / Slack token shapes are scrubbed before upload, with per-pattern match counts recorded in the session's `meta.json` (pass `--no-redact` to opt out). Source transcripts are never modified. +- **Replay** — `session show --tail N` concatenates the uploaded parts and prints the last N transcript lines next to the session metadata. Harness matrix, storage formats, and redaction details → **[docs/session-mirror.md](docs/session-mirror.md)** @@ -145,6 +173,8 @@ Bring your own bucket — no vendor lock-in: | Backblaze B2 / Wasabi | S3-compatible endpoint | | | HuggingFace Buckets | `--backend hf --bucket user/name` | browsable on the Hub; `pip install tracecraft-ai[huggingface]` | +**HuggingFace privacy:** `init` creates the bucket **private by default** (pass `--public` to opt out) and prints the bucket's *actual* visibility, read back from the Hub — e.g. `Backend: HuggingFace Buckets Bucket: user/x (private)`. If the bucket already exists as public and you didn't ask for that, init warns loudly: coordination data and mirrored transcripts would be publicly visible. Visibility can't be flipped after creation (`huggingface_hub` has no `update_bucket`) — the only way to change it is delete + recreate. 
+ --- ## Use cases @@ -199,6 +229,19 @@ TRACECRAFT_AGENT=developer tracecraft inbox --- +## Python API + +The CLI is the stable interface; for code that wants direct bucket access, the store factory is the escape hatch: + +```python +from tracecraft.store import get_store + +store, cfg = get_store() # reads .tracecraft.json like the CLI does +store.put_json("memory/build/status.json", {"value": "passing", "set_by": cfg["agent_id"]}) +``` + +--- + ## More - [docs/session-mirror.md](docs/session-mirror.md) — session mirroring: harnesses, formats, redaction diff --git a/docker-compose.dev.yml b/docker-compose.dev.yml index bdadcbd..c43ca8d 100644 --- a/docker-compose.dev.yml +++ b/docker-compose.dev.yml @@ -1,45 +1,17 @@ -version: "3.8" - +# Local dev backend: a single MinIO bucket is all tracecraft needs. +# Console at http://localhost:9001 to watch agents coordinate live. services: - postgres: - image: postgres:16 + minio: + image: minio/minio + command: server /data --console-address ":9001" ports: - - "5432:5432" + - "9000:9000" + - "9001:9001" environment: - POSTGRES_DB: tracecraft - POSTGRES_USER: tracecraft - POSTGRES_PASSWORD: tracecraft + MINIO_ROOT_USER: admin + MINIO_ROOT_PASSWORD: admin123456 volumes: - - postgres_data:/var/lib/postgresql/data - - redis: - image: redis:7-alpine - ports: - - "6379:6379" - - seaweed-master: - image: chrislusf/seaweedfs - command: master -ip=seaweed-master -port=9333 - ports: - - "9333:9333" - - seaweed-volume: - image: chrislusf/seaweedfs - command: volume -mserver=seaweed-master:9333 -port=8080 -ip=seaweed-volume - ports: - - "8080:8080" - depends_on: - - seaweed-master - - seaweed-filer: - image: chrislusf/seaweedfs - command: filer -master=seaweed-master:9333 -port=8888 -s3 -s3.port=8333 - ports: - - "8888:8888" - - "8333:8333" - depends_on: - - seaweed-master - - seaweed-volume + - minio_data:/data volumes: - postgres_data: + minio_data: diff --git a/sdk/pyproject.toml b/sdk/pyproject.toml index c6446f4..db4310f 100644 
--- a/sdk/pyproject.toml +++ b/sdk/pyproject.toml @@ -25,8 +25,6 @@ classifiers = [ ] dependencies = [ "click>=8.1.0", - "httpx>=0.25.0", - "pydantic>=2.5.0", "boto3>=1.28.0", ] diff --git a/sdk/tests/test_coordination.py b/sdk/tests/test_coordination.py new file mode 100644 index 0000000..ce6ff8b --- /dev/null +++ b/sdk/tests/test_coordination.py @@ -0,0 +1,143 @@ +"""Tests for coordination correctness: claim races, complete ownership, +the claim/status crash window, and wait-for's blocked fast-fail. + +All run against moto's in-process S3 — no network. +""" + +from __future__ import annotations + +import json +import time + +import boto3 +import pytest +from click.testing import CliRunner +from moto import mock_aws + +from tracecraft.cli import cli + +BUCKET = "tc-coord-test" +PROJECT = "demo" + + +@pytest.fixture +def env(tmp_path, monkeypatch): + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "testing") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "testing") + monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") + work = tmp_path / "work" + work.mkdir() + monkeypatch.chdir(work) + cfg = { + "backend": "s3", + "endpoint": None, + "bucket": BUCKET, + "project": PROJECT, + "agent_id": "agent-a", + "access_key": "testing", + "secret_key": "testing", + } + (work / ".tracecraft.json").write_text(json.dumps(cfg)) + with mock_aws(): + boto3.client("s3").create_bucket(Bucket=BUCKET) + yield CliRunner() + + +def _as(agent): + return {"TRACECRAFT_AGENT": agent} + + +def _get(key): + c = boto3.client("s3") + return json.loads(c.get_object(Bucket=BUCKET, Key=f"{PROJECT}/{key}")["Body"].read()) + + +# ---------- atomic claim: two claimers, exactly one wins ---------- + + +def test_claim_race_exactly_one_winner(env): + r1 = env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + r2 = env.invoke(cli, ["claim", "build"], env=_as("agent-b")) + outcomes = [r.exit_code == 0 for r in (r1, r2)] + assert outcomes.count(True) == 1, f"exactly one claimer must win: {r1.output} / 
{r2.output}" + assert "already claimed by agent-a" in r2.output + assert _get("steps/build/claim.json")["agent"] == "agent-a" + + +# ---------- complete: ownership enforced, --force overrides ---------- + + +def test_complete_rejects_non_owner(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + r = env.invoke(cli, ["complete", "build"], env=_as("agent-b")) + assert r.exit_code != 0 + assert "claimed by 'agent-a'" in r.output + assert "--force" in r.output + # the step's status must be untouched + assert _get("steps/build/status.json")["status"] == "in_progress" + + +def test_complete_owner_succeeds(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + r = env.invoke(cli, ["complete", "build"], env=_as("agent-a")) + assert r.exit_code == 0, r.output + assert _get("steps/build/status.json")["status"] == "complete" + + +def test_complete_force_overrides_ownership(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + r = env.invoke(cli, ["complete", "build", "--force"], env=_as("agent-b")) + assert r.exit_code == 0, r.output + doc = _get("steps/build/status.json") + assert doc["status"] == "complete" + assert doc["agent"] == "agent-b" + + +def test_complete_unclaimed_step_is_allowed(env): + """No claim.json at all — nothing to own, complete goes through.""" + r = env.invoke(cli, ["complete", "adhoc"], env=_as("agent-a")) + assert r.exit_code == 0, r.output + + +# ---------- crash window: claim.json exists, status.json missing ---------- + + +def test_step_status_treats_claim_without_status_as_in_progress(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + # simulate a crash between the two writes: claim landed, status didn't + boto3.client("s3").delete_object(Bucket=BUCKET, Key=f"{PROJECT}/steps/build/status.json") + r = env.invoke(cli, ["step-status", "build"]) + assert r.exit_code == 0, r.output + assert "in_progress" in r.output + assert "agent-a" in r.output + + +def 
test_wait_for_treats_claim_without_status_as_waiting(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + boto3.client("s3").delete_object(Bucket=BUCKET, Key=f"{PROJECT}/steps/build/status.json") + r = env.invoke(cli, ["wait-for", "build", "--timeout", "1"]) + # not complete, not blocked → waits, then times out (no crash, no false success) + assert r.exit_code != 0 + assert "Timeout" in r.output + + +# ---------- wait-for: blocked fails fast, needs_review keeps waiting ---------- + + +def test_wait_for_fast_fails_on_blocked(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + env.invoke(cli, ["complete", "build", "--blocked"], env=_as("agent-a")) + start = time.monotonic() + r = env.invoke(cli, ["wait-for", "build", "--timeout", "300"]) + elapsed = time.monotonic() - start + assert r.exit_code != 0 + assert "blocked" in r.output + assert elapsed < 10, f"must fail fast, not spin toward the timeout (took {elapsed:.1f}s)" + + +def test_wait_for_mentions_needs_review_while_waiting(env): + env.invoke(cli, ["claim", "build"], env=_as("agent-a")) + env.invoke(cli, ["complete", "build", "--needs-review"], env=_as("agent-a")) + r = env.invoke(cli, ["wait-for", "build", "--timeout", "1"]) + assert r.exit_code != 0 # still waiting → times out + assert "needs review: build" in r.output diff --git a/sdk/tests/test_hf_onboarding.py b/sdk/tests/test_hf_onboarding.py new file mode 100644 index 0000000..48c9192 --- /dev/null +++ b/sdk/tests/test_hf_onboarding.py @@ -0,0 +1,250 @@ +"""Tests for the HF onboarding + correctness-honesty fixes. + +Covers three real, externally-reported issues: + - #7: `init --backend hf` against a non-existent bucket must auto-create it + (HF ensure_bucket() was a no-op; first write failed cryptically). + - #8: HF buckets are public-by-default; init must create them PRIVATE by default, + with an explicit --public opt-out. 
+ - correctness honesty: claims on HF are best-effort (no conditional-write), so both + `init --backend hf` and `claim` must SAY SO rather than imply atomicity. + +These mock the HuggingFace SDK (no network) — they verify the wiring (private flag +reaches create_bucket; the warnings are emitted), not HF's servers. +""" + +from __future__ import annotations + +import json +import sys +import types + +import click +import pytest +from click.testing import CliRunner + +from tracecraft.cli.init_cmd import init_cmd +from tracecraft.cli.steps import claim + + +class FakeBucketState: + """Records create_bucket calls and stores written JSON in-memory.""" + + def __init__(self): + self.create_calls = [] # list of (bucket, private, exist_ok) + self.objects = {} # path -> data + self.buckets = {} # bucket_id -> private (bool); pre-seed to simulate existing + + +@pytest.fixture +def hf_stub(monkeypatch): + """Stub huggingface_hub so init/claim run against an in-memory fake HF backend.""" + state = FakeBucketState() + + # --- fake huggingface_hub module surface used by tracecraft.hf --- + class FakeApi: + """Mimics HfApi: create_bucket(exist_ok=True) never changes an existing + bucket's visibility; bucket_info returns the actual state.""" + + def __init__(self, token=None): + self.token = token + + def create_bucket(self, bucket_id, *, private=None, exist_ok=False, **kw): + state.create_calls.append((bucket_id, private, exist_ok)) + if bucket_id in state.buckets: + if not exist_ok: + raise ValueError(f"Bucket {bucket_id} already exists") + else: + state.buckets[bucket_id] = bool(private) + return f"hf://buckets/{bucket_id}" + + def bucket_info(self, bucket_id, **kw): + if bucket_id not in state.buckets: + raise FileNotFoundError(bucket_id) + return types.SimpleNamespace(private=state.buckets[bucket_id]) + + class FakeFS: + def __init__(self, *a, **k): + pass + + def exists(self, path): + return path in state.objects + + def open(self, path, mode="r"): + store = state.objects + 
+ class _F: + def __enter__(self_): + if "r" in mode: + self_._buf = store.get(path, "") + return self_ + + def __exit__(self_, *exc): + return False + + def write(self_, s): + store[path] = store.get(path, "") + s + + def read(self_): + return self_._buf + + return _F() + + def find(self, path, detail=False): + return [p for p in state.objects if p.startswith(path)] + + fake_hf = types.ModuleType("huggingface_hub") + fake_hf.HfFileSystem = FakeFS + fake_hf.HfApi = FakeApi + monkeypatch.setitem(sys.modules, "huggingface_hub", fake_hf) + return state + + +def _init(runner, tmp_path, monkeypatch, *extra): + monkeypatch.chdir(tmp_path) + args = [ + "--backend", + "hf", + "--bucket", + "user/tc-test", + "--project", + "demo", + "--agent", + "tester", + "--hf-token", + "hf_faketoken", + *extra, + ] + return runner.invoke(init_cmd, args) + + +# ---------- #7: auto-create ---------- + + +def test_init_hf_creates_bucket(hf_stub, tmp_path, monkeypatch): + r = _init(CliRunner(), tmp_path, monkeypatch) + assert r.exit_code == 0, r.output + # ensure_bucket() actually called create_bucket (was a no-op before) + assert len(hf_stub.create_calls) == 1 + bucket, private, exist_ok = hf_stub.create_calls[0] + assert bucket == "user/tc-test" + assert exist_ok is True # idempotent: don't fail if it already exists + # the agent record was written (the first write that used to fail cryptically) + assert any("agents/tester.json" in p for p in hf_stub.objects) + + +# ---------- #8: private by default, --public opt-out ---------- + + +def test_init_hf_private_by_default(hf_stub, tmp_path, monkeypatch): + r = _init(CliRunner(), tmp_path, monkeypatch) + assert r.exit_code == 0, r.output + _, private, _ = hf_stub.create_calls[0] + assert private is True + assert "(private)" in r.output + + +def test_init_hf_public_when_asked(hf_stub, tmp_path, monkeypatch): + r = _init(CliRunner(), tmp_path, monkeypatch, "--public") + assert r.exit_code == 0, r.output + _, private, _ = hf_stub.create_calls[0] 
+ assert private is False + assert "(PUBLIC)" in r.output + + +# ---------- correctness honesty ---------- + + +def test_init_hf_warns_claims_are_best_effort(hf_stub, tmp_path, monkeypatch): + r = _init(CliRunner(), tmp_path, monkeypatch) + assert r.exit_code == 0, r.output + # the racy-claim caveat must be surfaced at init (output includes stderr via CliRunner) + assert "best-effort" in r.output.lower() + assert "S3-compatible" in r.output + + +def test_claim_on_hf_warns_best_effort(hf_stub, tmp_path, monkeypatch): + monkeypatch.chdir(tmp_path) + # write an hf config the CWD-first loader will pick up + cfg = { + "backend": "hf", + "bucket": "user/tc-test", + "project": "demo", + "agent_id": "tester", + "hf_token": "hf_faketoken", + } + (tmp_path / ".tracecraft.json").write_text(json.dumps(cfg)) + r = CliRunner().invoke(claim, ["build"]) + assert r.exit_code == 0, r.output + assert "Claimed step build" in r.output + assert "best-effort" in r.output.lower() + + +# ---------- #8: pre-existing PUBLIC bucket triggers a prominent warning ---------- + + +def test_init_hf_existing_public_bucket_warns(hf_stub, tmp_path, monkeypatch): + """Bucket pre-exists as public; user asked for private (default) — init must + say the data will be publicly visible and that delete+recreate is the only fix.""" + hf_stub.buckets["user/tc-test"] = False # exists, public + r = _init(CliRunner(), tmp_path, monkeypatch) + assert r.exit_code == 0, r.output + assert "(PUBLIC)" in r.output # real state, not the requested flag + assert "WARNING" in r.output + assert "publicly visible" in r.output + assert "delete" in r.output.lower() + + +def test_init_hf_existing_public_bucket_no_warning_with_public_flag(hf_stub, tmp_path, monkeypatch): + hf_stub.buckets["user/tc-test"] = False + r = _init(CliRunner(), tmp_path, monkeypatch, "--public") + assert r.exit_code == 0, r.output + assert "WARNING" not in r.output + + +# ---------- write errors name the bucket and point at init ---------- + + +def 
test_put_against_missing_bucket_is_actionable(hf_stub, monkeypatch): + from tracecraft.hf import HF + + store = HF(bucket="user/tc-test", project="demo", token="hf_faketoken") + + def boom(*a, **k): + raise OSError("unable to resolve path: invalid repository and revision") + + monkeypatch.setattr(store.fs, "open", boom) + with pytest.raises(click.ClickException) as ei: + store.put_json("memory/x.json", {"v": 1}) + msg = str(ei.value) + assert "user/tc-test" in msg + assert "tracecraft init" in msg + assert "repository and revision" not in msg # raw error replaced, not echoed + + +# ---------- exists(): not-found is False, unauthorized raises ---------- + + +def test_exists_not_found_is_false(hf_stub, monkeypatch): + from tracecraft.hf import HF + + store = HF(bucket="user/tc-test", project="demo", token="hf_faketoken") + monkeypatch.setattr(store.fs, "exists", lambda p: (_ for _ in ()).throw(FileNotFoundError(p))) + assert store.exists("memory/x.json") is False + + +def test_exists_surfaces_auth_errors(hf_stub, monkeypatch): + from tracecraft.hf import HF + + store = HF(bucket="user/tc-test", project="demo", token="hf_badtoken") + + def boom(path): + e = Exception("401 Client Error: Unauthorized for url") + e.response = types.SimpleNamespace(status_code=401) + raise e + + monkeypatch.setattr(store.fs, "exists", boom) + with pytest.raises(click.ClickException) as ei: + store.exists("memory/x.json") + msg = str(ei.value) + assert "auth" in msg.lower() + assert "HF_TOKEN" in msg diff --git a/sdk/tests/test_messaging.py b/sdk/tests/test_messaging.py new file mode 100644 index 0000000..e3ed7f1 --- /dev/null +++ b/sdk/tests/test_messaging.py @@ -0,0 +1,127 @@ +"""Tests for agent-to-agent messaging — especially the same-instant key collision. 
+ +The bug these guard against: message keys were `messages/<recipient>/<ts>_<sender>.json`, +so two messages from one sender to one recipient in the same wall-clock second collided on +the same key and the later one silently overwrote the earlier (a 5-message burst kept 1). +The fix uses nanosecond resolution + a uuid suffix, so every send is a distinct key. +""" + +from __future__ import annotations + +import json + +import boto3 +import pytest +from click.testing import CliRunner +from moto import mock_aws + +from tracecraft.cli import cli + + +BUCKET = "tc-msg-test" +PROJECT = "demo" + + +@pytest.fixture +def env(tmp_path, monkeypatch): + monkeypatch.setenv("AWS_ACCESS_KEY_ID", "testing") + monkeypatch.setenv("AWS_SECRET_ACCESS_KEY", "testing") + monkeypatch.setenv("AWS_DEFAULT_REGION", "us-east-1") + work = tmp_path / "work" + work.mkdir() + monkeypatch.chdir(work) + cfg = { + "backend": "s3", + "endpoint": None, + "bucket": BUCKET, + "project": PROJECT, + "agent_id": "designer", + "access_key": "testing", + "secret_key": "testing", + } + (work / ".tracecraft.json").write_text(json.dumps(cfg)) + fake_home = tmp_path / "home" + (fake_home / ".tracecraft").mkdir(parents=True) + (fake_home / ".tracecraft" / "config.json").write_text(json.dumps(cfg)) + monkeypatch.setenv("HOME", str(fake_home)) + with mock_aws(): + boto3.client("s3").create_bucket(Bucket=BUCKET) + yield CliRunner() + + +def _keys(prefix): + c = boto3.client("s3") + out = c.list_objects_v2(Bucket=BUCKET, Prefix=f"{PROJECT}/{prefix}") + return [o["Key"] for o in out.get("Contents", [])] + + +def test_burst_to_same_recipient_keeps_every_message(env): + """The regression: many messages from one sender to one recipient, sent back to + back (same second), must ALL survive — not collapse onto one overwritten key.""" + n = 8 + for i in range(n): + r = env.invoke(cli, ["send", "reviewer", f"update {i}"]) + assert r.exit_code == 0, r.output + keys = _keys("messages/reviewer/") + assert len(keys) == n, f"expected {n} distinct 
message keys, got {len(keys)}: {keys}" + # and the bodies are all distinct (no overwrite) + c = boto3.client("s3") + bodies = { + json.loads(c.get_object(Bucket=BUCKET, Key=k)["Body"].read())["message"] for k in keys + } + assert bodies == {f"update {i}" for i in range(n)} + + +def test_inbox_reads_the_whole_burst(env): + """End-to-end: a burst sent by one agent is fully readable by the recipient.""" + for i in range(5): + env.invoke(cli, ["send", "reviewer", f"msg {i}"]) + r = env.invoke(cli, ["inbox"], env={"TRACECRAFT_AGENT": "reviewer"}) + assert r.exit_code == 0, r.output + for i in range(5): + assert f"msg {i}" in r.output + + +def test_key_shape_is_unique_per_send(env): + """Two sends to the same recipient produce two different keys even with no delay.""" + env.invoke(cli, ["send", "reviewer", "a"]) + env.invoke(cli, ["send", "reviewer", "b"]) + keys = _keys("messages/reviewer/") + assert len(set(keys)) == 2 + + +def test_broadcast_and_direct_are_separate(env): + """A broadcast lands under _broadcast, a direct message under the recipient.""" + env.invoke(cli, ["send", "_broadcast", "hello all"]) + env.invoke(cli, ["send", "reviewer", "hello you"]) + assert len(_keys("messages/_broadcast/")) == 1 + assert len(_keys("messages/reviewer/")) == 1 + + +def test_inbox_merges_direct_and_broadcast_chronologically(env): + """inbox must interleave direct + broadcast messages by sent_at, not print + one prefix's raw list order after the other.""" + import time as _time + + env.invoke(cli, ["send", "reviewer", "first-direct"]) + _time.sleep(0.01) + env.invoke(cli, ["send", "_broadcast", "second-broadcast"]) + _time.sleep(0.01) + env.invoke(cli, ["send", "reviewer", "third-direct"]) + r = env.invoke(cli, ["inbox"], env={"TRACECRAFT_AGENT": "reviewer"}) + assert r.exit_code == 0, r.output + out = r.output + assert out.index("first-direct") < out.index("second-broadcast") < out.index("third-direct") + + +def test_message_body_carries_sender_and_recipient(env): + """The body 
(not the filename) is the source of truth for from/to — readers parse + the body, so the key shape can change freely without breaking inbox or replay.""" + env.invoke(cli, ["send", "reviewer", "check"]) + c = boto3.client("s3") + k = _keys("messages/reviewer/")[0] + doc = json.loads(c.get_object(Bucket=BUCKET, Key=k)["Body"].read()) + assert doc["from"] == "designer" + assert doc["to"] == "reviewer" + assert doc["message"] == "check" + assert "sent_at" in doc diff --git a/sdk/tests/test_session_cli.py b/sdk/tests/test_session_cli.py index f2b2f7b..bc9ce73 100644 --- a/sdk/tests/test_session_cli.py +++ b/sdk/tests/test_session_cli.py @@ -20,8 +20,6 @@ from __future__ import annotations import json -import os -from pathlib import Path import boto3 import pytest diff --git a/sdk/tracecraft/cli/init_cmd.py b/sdk/tracecraft/cli/init_cmd.py index d1a9762..7514a36 100644 --- a/sdk/tracecraft/cli/init_cmd.py +++ b/sdk/tracecraft/cli/init_cmd.py @@ -1,6 +1,5 @@ """tracecraft init — configure and register agent.""" -import os from datetime import datetime, timezone from pathlib import Path @@ -39,7 +38,14 @@ @click.option( "--hf-token", default=None, envvar="HF_TOKEN", help="HuggingFace token (env: HF_TOKEN)" ) -def init_cmd(backend, endpoint, bucket, project, agent, access_key, secret_key, hf_token): +@click.option( + "--private/--public", + "private", + default=True, + help="Create the bucket private (default) or public. HF only. 
" + "Internal memory/transcripts should stay private.", +) +def init_cmd(backend, endpoint, bucket, project, agent, access_key, secret_key, hf_token, private): """Initialize tracecraft config, create bucket, and register agent.""" cfg = { "backend": backend, @@ -66,7 +72,7 @@ def init_cmd(backend, endpoint, bucket, project, agent, access_key, secret_key, save_config(cfg) _ensure_gitignore_entry() - store = _get_store(cfg) + store = _get_store(cfg, private=private) store.ensure_bucket() now = datetime.now(timezone.utc).isoformat() @@ -86,7 +92,39 @@ def init_cmd(backend, endpoint, bucket, project, agent, access_key, secret_key, if backend == "s3": click.echo(f"Backend: S3 Endpoint: {endpoint} Bucket: {bucket}") else: - click.echo(f"Backend: HuggingFace Buckets Bucket: {bucket}") + # Report the bucket's *actual* visibility, read back from the Hub — + # create_bucket(exist_ok=True) keeps a pre-existing bucket's setting, + # so the --private/--public flag and reality can disagree. + actual_private = store.bucket_privacy() + if actual_private is None: + visibility = "visibility unknown" + else: + visibility = "private" if actual_private else "PUBLIC" + click.echo(f"Backend: HuggingFace Buckets Bucket: {bucket} ({visibility})") + if actual_private is False and private: + # The bucket pre-existed as public; we asked for private but + # create_bucket(exist_ok=True) never changes an existing bucket. + click.echo( + "\n" + " WARNING: bucket already exists and is PUBLIC.\n" + f" Everything tracecraft writes to '{bucket}' — shared memory, messages,\n" + " handoffs, and mirrored session transcripts — will be publicly visible\n" + " on the Hub. 
huggingface_hub has no update_bucket, so visibility cannot\n" + " be flipped in place: the only remedy is to delete the bucket and\n" + " re-run init so tracecraft recreates it private.\n" + " If public was intentional, pass --public to silence this warning.\n", + err=True, + ) + # Be honest about the core-promise gap on this backend (see hf.py put_json): + # HF has no conditional-write, so atomic claims are best-effort there. + click.echo( + "Note: HuggingFace buckets have no conditional-write primitive, so " + "`tracecraft claim` is best-effort (racy) here — two agents can both think " + "they won. For safe atomic claims, use an S3-compatible backend (AWS, R2, " + "MinIO, B2, Wasabi). Memory, messaging, handoffs, and session mirroring are " + "unaffected.", + err=True, + ) click.echo("Note: .tracecraft.json contains credentials. Keep it out of version control.") @@ -115,13 +153,18 @@ def _ensure_gitignore_entry(): ) -def _get_store(cfg): +def _get_store(cfg, private=True): """Create the right storage backend from config.""" backend = cfg.get("backend", "s3") if backend == "hf": from tracecraft.hf import HF - return HF(bucket=cfg["bucket"], project=cfg["project"], token=cfg.get("hf_token")) + return HF( + bucket=cfg["bucket"], + project=cfg["project"], + token=cfg.get("hf_token"), + private=private, + ) else: from tracecraft.s3 import S3 diff --git a/sdk/tracecraft/cli/messages.py b/sdk/tracecraft/cli/messages.py index efba3c4..fef94fd 100644 --- a/sdk/tracecraft/cli/messages.py +++ b/sdk/tracecraft/cli/messages.py @@ -1,6 +1,7 @@ """tracecraft send/inbox — agent-to-agent messaging via S3.""" import time +import uuid from datetime import datetime, timezone import click @@ -17,10 +18,17 @@ def send(recipient, message): raise click.ClickException("Recipient cannot be empty") store, cfg = get_store() sender = cfg["agent_id"] - ts = int(time.time()) now = datetime.now(timezone.utc).isoformat() - key = f"messages/{recipient}/{ts}_{sender}.json" + # Message keys MUST be 
unique per send. A whole-second timestamp collides when + # one sender fires two messages to the same recipient in the same second — the + # second silently overwrites the first (measured: a 5-message burst kept only 1). + # Use nanosecond resolution for rough chronological ordering PLUS a uuid suffix + # (8 hex chars = 32 random bits) that makes a key collision practically impossible + # even when two sends land on the same timestamp (coarse clocks, clock ties). + # (Same approach the session mirror uses for its part keys.) + ts_ns = time.time_ns() + uniq = uuid.uuid4().hex[:8] + key = f"messages/{recipient}/{ts_ns}_{sender}_{uniq}.json" store.put_json( key, { @@ -48,25 +56,30 @@ def inbox(delete): click.echo("No messages.") return - count = 0 + # Merge direct + broadcast and sort by sent_at — raw list order interleaves + # the two prefixes, so a broadcast could print before the direct message + # that preceded it. + messages = [] for key in all_keys: data = store.get_json(key) if data is None: continue - sender = data.get("from", "?") # Skip own broadcasts - if "_broadcast/" in key and sender == my_id: + if "_broadcast/" in key and data.get("from", "?") == my_id: continue + messages.append((key, data)) + messages.sort(key=lambda kd: kd[1].get("sent_at", "")) + + for key, data in messages: + sender = data.get("from", "?") msg = data.get("message", "") sent_at = data.get("sent_at", "?") target = "broadcast" if "_broadcast/" in key else "direct" click.echo(f"[{sent_at}] ({target}) {sender}: {msg}") - count += 1 - if delete: store.delete(key) - if count == 0: + if not messages: click.echo("No messages.") elif delete: - click.echo(f"Deleted {count} message(s).") diff --git a/sdk/tracecraft/cli/steps.py b/sdk/tracecraft/cli/steps.py index c0c34dd..0d42ac9 100644 --- a/sdk/tracecraft/cli/steps.py +++ b/sdk/tracecraft/cli/steps.py @@ -62,6 +62,15 @@ def claim(step_id): }, ) click.echo(f"Claimed step {step_id} as {agent}") + if cfg.get("backend") == "hf": + # The claim is best-effort on HF (no 
conditional-write); don't let the + # success message imply the race was atomically arbitrated. + click.echo( + "warning: claims on the HuggingFace backend are best-effort (racy) — " + "another agent may also believe it won this step. Use an S3-compatible " + "backend for atomic claims.", + err=True, + ) @click.command() @@ -78,7 +87,14 @@ def claim(step_id): is_flag=True, help="Record files changed (from `git diff`), so the next agent knows what moved. No-op outside a git repo.", ) -def complete(step_id, note, next_agent, next_action, blocked, needs_review, changed_files_from_git): +@click.option( + "--force", + is_flag=True, + help="Complete a step claimed by a different agent (e.g. the claim-holder crashed).", +) +def complete( + step_id, note, next_agent, next_action, blocked, needs_review, changed_files_from_git, force +): """Mark a step complete (or blocked / needs-review) and write a handoff record. The handoff record is what the next agent sees instead of a shared @@ -95,6 +111,15 @@ def complete(step_id, note, next_agent, next_action, blocked, needs_review, chan sid = step_id.lower().replace(".", "-") now = datetime.now(timezone.utc).isoformat() + # A step belongs to whoever claimed it — without this check any agent + # could mark any step complete and silently steal/clobber someone's work. + claim_doc = store.get_json(f"steps/{sid}/claim.json") + if claim_doc and claim_doc.get("agent") not in (None, agent) and not force: + raise click.ClickException( + f"Step {step_id} is claimed by '{claim_doc['agent']}', not '{agent}'. " + f"Pass --force to complete it anyway (e.g. if the claim-holder crashed)." + ) + state = "blocked" if blocked else "needs_review" if needs_review else "complete" # Status reflects the real outcome (not always "complete"). 
@@ -135,19 +160,34 @@ def complete(step_id, note, next_agent, next_action, blocked, needs_review, chan click.echo(msg) +def _effective_status(store, sid): + """Resolve a step's status, tolerating the claim/status crash window. + + claim.json (atomic) and status.json are two separate writes; a crash + between them leaves a claim with no status. Readers treat that state as + in_progress by the claiming agent — the claim is the authoritative write. + Returns (status, agent); status is 'pending' when neither file exists. + """ + data = store.get_json(f"steps/{sid}/status.json") + if data is not None: + return data.get("status", "unknown"), data.get("agent", "?") + claim_doc = store.get_json(f"steps/{sid}/claim.json") + if claim_doc is not None: + return "in_progress", claim_doc.get("agent", "?") + return "pending", None + + @click.command() @click.argument("step_id") def step_status(step_id): """Check the status of a step.""" store, _ = get_store() sid = step_id.lower().replace(".", "-") - data = store.get_json(f"steps/{sid}/status.json") - if data is None: - click.echo(f"{step_id}: pending") - return - status = data.get("status", "unknown") - agent = data.get("agent", "?") - click.echo(f"{step_id}: {status} (agent: {agent})") + status, agent = _effective_status(store, sid) + if agent is None: + click.echo(f"{step_id}: {status}") + else: + click.echo(f"{step_id}: {status} (agent: {agent})") @click.command() @@ -160,19 +200,31 @@ def wait_for(step_ids, timeout): while time.time() < deadline: all_done = True + needs_review = [] for step_id in step_ids: sid = step_id.lower().replace(".", "-") - data = store.get_json(f"steps/{sid}/status.json") - if data is None or data.get("status") != "complete": + status, agent = _effective_status(store, sid) + if status == "blocked": + # A blocked step won't complete on its own — failing fast beats + # spinning until the full timeout. 
+ raise click.ClickException( + f"Step {step_id} is blocked (agent: {agent}) — it will not " + f"complete without intervention. Resolve it and re-run wait-for." + ) + if status == "needs_review": + needs_review.append(step_id) + if status != "complete": all_done = False - break if all_done: click.echo(f"All steps complete: {', '.join(step_ids)}") return remaining = int(deadline - time.time()) - click.echo(f"Waiting... ({remaining}s remaining)", err=True) + progress = f"Waiting... ({remaining}s remaining)" + if needs_review: + progress += f" — needs review: {', '.join(needs_review)}" + click.echo(progress, err=True) time.sleep(5) raise click.ClickException( diff --git a/sdk/tracecraft/hf.py b/sdk/tracecraft/hf.py index ca2756b..7c67969 100644 --- a/sdk/tracecraft/hf.py +++ b/sdk/tracecraft/hf.py @@ -8,12 +8,14 @@ class HF: - def __init__(self, bucket, project, token=None): + def __init__(self, bucket, project, token=None, private=True): from huggingface_hub import HfFileSystem self.fs = HfFileSystem(token=token) self.bucket = bucket # e.g. "username/my-bucket" self.project = project + self.token = token + self.private = private # safe default: private (these hold internal traces) self.base = f"hf://buckets/{bucket}" @classmethod @@ -28,6 +30,24 @@ def from_config(cls): def _path(self, key): return f"{self.base}/{self.project}/{key}" + def _raise_write_error(self, e): + """Translate raw HfFileSystem write errors into actionable ones. + + A put against a bucket that doesn't exist surfaces as a cryptic + 'repository and revision' / 404 resolution error from HfFileSystem — + name the bucket and say what to do instead. 
+ """ + msg = str(e) + if isinstance(e, FileNotFoundError) or ( + "Repository Not Found" in msg or "repository and revision" in msg or "404" in msg + ): + raise click.ClickException( + f"HF write failed: bucket '{self.bucket}' was not found.\n" + f"Run `tracecraft init --backend hf --bucket {self.bucket} ...` to create it, " + f"and check the bucket handle is 'username/bucket-name'." + ) + raise click.ClickException(f"HF write failed: {e}") + def put_json(self, key, data, if_none_match=False): try: path = self._path(key) @@ -45,7 +65,7 @@ def put_json(self, key, data, if_none_match=False): if isinstance(e, PreconditionFailed): raise - raise click.ClickException(f"HF put failed: {e}") + self._raise_write_error(e) def get_json(self, key): try: @@ -80,7 +100,19 @@ def list_keys(self, prefix=""): def exists(self, key): try: return self.fs.exists(self._path(key)) - except Exception: + except FileNotFoundError: + return False + except Exception as e: + # "Not found" is a legitimate False; "unauthorized" is not — swallowing + # it makes a bad token look like an empty bucket (and lets a best-effort + # claim race past its check-then-write guard). + status = getattr(getattr(e, "response", None), "status_code", None) + if status in (401, 403) or "unauthorized" in str(e).lower(): + raise click.ClickException( + f"HF auth error while checking '{key}': {e}\n" + f"Check that your token (--hf-token / HF_TOKEN) has read access " + f"to '{self.bucket}'." 
+ ) return False def delete(self, key): @@ -95,7 +127,7 @@ def put_file(self, key, local_path): try: self.fs.put(local_path, self._path(key)) except Exception as e: - raise click.ClickException(f"HF upload failed: {e}") + self._raise_write_error(e) def get_file(self, key, local_path): try: @@ -104,6 +136,38 @@ def get_file(self, key, local_path): raise click.ClickException(f"HF download failed: {e}") def ensure_bucket(self): - # HF buckets are created via CLI or web — verify by checking exists or listing - # Empty buckets fail on ls(), so we just pass and let first write validate access - pass + """Create the HF bucket if it doesn't exist (private by default). + + Previously a no-op, which made `init` against a brand-new bucket fail with a + cryptic error on the first write (issue #7). HF buckets default to *public* + on creation, which is a privacy footgun for a tool that stores internal + memory/transcripts (issue #8) — so we create them private unless the caller + opts out via `private=False`. + """ + try: + from huggingface_hub import HfApi + + HfApi(token=self.token).create_bucket(self.bucket, private=self.private, exist_ok=True) + except Exception as e: + # No silent fallback to the old no-op: abort here with an actionable + # hint rather than letting the first write fail with a cryptic error. + raise click.ClickException( + f"Could not ensure HF bucket '{self.bucket}' exists: {e}\n" + f"Create it first at https://huggingface.co/new-bucket (set it Private), " + f"or check your --hf-token has write access." + ) + + def bucket_privacy(self): + """Return the bucket's *actual* visibility: True=private, False=public, + None if it can't be determined (network error, no permission). + + Read back from bucket_info() rather than assumed from the flag we passed — + create_bucket(exist_ok=True) silently keeps a pre-existing bucket's + visibility, so the flag and reality can disagree. 
+ """ + try: + from huggingface_hub import HfApi + + return bool(HfApi(token=self.token).bucket_info(self.bucket).private) + except Exception: + return None