Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,27 @@ permissions:
contents: read

jobs:
lint:
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
with:
submodules: false

- name: Install uv
uses: astral-sh/setup-uv@v5
with:
enable-cache: true

- name: Set up Python 3.12
run: uv python install 3.12

- name: Sync dependencies (dev extras)
run: uv sync --extra dev --python 3.12

- name: Run pre-commit hooks
run: uv run pre-commit run --all-files

test:
runs-on: ubuntu-latest
strategy:
Expand Down
7 changes: 7 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.4
hooks:
- id: ruff
args: [--fix, --select, "E,F,I"]
- id: ruff-format
19 changes: 18 additions & 1 deletion CONTRIBUTING.md
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,24 @@ private modules.
- Add docstrings to public classes and functions
- Use `from __future__ import annotations` for forward references
- Use `Protocol` for interfaces, `@dataclass` for value types
- No linter is enforced yet — just keep it consistent with surrounding code

We use [ruff](https://docs.astral.sh/ruff/) for formatting and linting
(rules `E` — pycodestyle errors, `F` — Pyflakes, `I` — isort-style import
sorting). Install the pre-commit hook once per clone:

```bash
uv sync --extra dev
uv run pre-commit install
```

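After that, `ruff format` and `ruff check --fix` run automatically on
`git commit`. If a hook rewrites a file, the commit is aborted; re-stage the
changes with `git add` and commit again.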

To run manually (unlike the hook, this `ruff check` invocation only reports problems; add `--fix` to apply fixes):

```bash
uv run ruff format clawloop tests examples
uv run ruff check --select E,F,I clawloop tests examples
```

## Commits

Expand Down
15 changes: 10 additions & 5 deletions clawloop/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from dataclasses import dataclass, field
from typing import Any

from clawloop.core.env import EvalResult, Sample, TaskEnvironment
from clawloop.core.env import Sample, TaskEnvironment
from clawloop.core.episode import Episode, EpisodeSummary, Message, StepMeta
from clawloop.core.intensity import AdaptiveIntensity
from clawloop.core.paradigm import ParadigmBreakthrough
Expand All @@ -37,7 +37,9 @@ def _sanitize_input(text: str) -> str:
text = text.replace("\x00", "")
if len(text) > _MAX_INPUT_LENGTH:
log.warning(
"Truncating agent input from %d to %d chars", len(text), _MAX_INPUT_LENGTH,
"Truncating agent input from %d to %d chars",
len(text),
_MAX_INPUT_LENGTH,
)
text = text[:_MAX_INPUT_LENGTH]
return text
Expand Down Expand Up @@ -227,9 +229,12 @@ def _run_one(self, sample: Sample, env: TaskEnvironment) -> Episode:
meta_id = ""
if hasattr(sample, "metadata") and isinstance(sample.metadata, dict):
meta_id = sample.metadata.get("id", "")
task_id = meta_id or hashlib.sha256(
f"{self.bench}:{sample.question}:{context}".encode(),
).hexdigest()[:16]
task_id = (
meta_id
or hashlib.sha256(
f"{self.bench}:{sample.question}:{context}".encode(),
).hexdigest()[:16]
)

# Build episode
ep_messages = [
Expand Down
12 changes: 2 additions & 10 deletions clawloop/archive/jsonl_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,7 @@

def _safe_run_id(run_id: str) -> str:
"""Reject run_id values that could escape the archive directory."""
if (
not run_id
or "/" in run_id
or "\\" in run_id
or ".." in run_id
or run_id.startswith(".")
):
if not run_id or "/" in run_id or "\\" in run_id or ".." in run_id or run_id.startswith("."):
raise ValueError(f"unsafe run_id for filesystem path: {run_id!r}")
return run_id

Expand Down Expand Up @@ -248,9 +242,7 @@ def get_similar_runs(
if completion is not None:
start = {
**start,
"best_reward": completion.get(
"best_reward", start.get("best_reward", 0.0)
),
"best_reward": completion.get("best_reward", start.get("best_reward", 0.0)),
"improvement_delta": completion.get(
"improvement_delta", start.get("improvement_delta", 0.0)
),
Expand Down
5 changes: 4 additions & 1 deletion clawloop/archive/null_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@ def log_variant(self, variant: AgentVariant) -> None:
pass

def log_run_complete(
self, run_id: str, best_reward: float, improvement_delta: float,
self,
run_id: str,
best_reward: float,
improvement_delta: float,
total_cost_tokens: int = 0,
) -> None:
pass
Expand Down
5 changes: 4 additions & 1 deletion clawloop/archive/store.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ def log_episodes(self, episodes: list[EpisodeRecord]) -> None: ...
def log_variant(self, variant: AgentVariant) -> None: ...

def log_run_complete(
self, run_id: str, best_reward: float, improvement_delta: float,
self,
run_id: str,
best_reward: float,
improvement_delta: float,
total_cost_tokens: int = 0,
) -> None: ...

Expand Down
26 changes: 16 additions & 10 deletions clawloop/callbacks/litellm_cb.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@

from clawloop.collector import EpisodeCollector
from clawloop.core.episode import Message, TokenLogProb, TokenUsage, ToolCall, cap_logprobs
from clawloop.core.parse import parse_tool_calls, resolve_oi_span_kind, _safe_session_hash
from clawloop.core.parse import _safe_session_hash, parse_tool_calls, resolve_oi_span_kind

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -95,7 +95,11 @@ async def async_log_failure_event(
) -> None:
"""Async variant — delegates to sync."""
self.log_failure_event(
kwargs, response_obj, start_time, end_time, exception,
kwargs,
response_obj,
start_time,
end_time,
exception,
)

def _process(
Expand Down Expand Up @@ -151,14 +155,16 @@ def _process(
logprobs = None
raw_logprobs = getattr(choice, "logprobs", None)
if raw_logprobs and hasattr(raw_logprobs, "content") and raw_logprobs.content:
logprobs = cap_logprobs([
TokenLogProb(
token=lp.token,
token_id=getattr(lp, "token_id", None),
logprob=lp.logprob,
)
for lp in raw_logprobs.content
])
logprobs = cap_logprobs(
[
TokenLogProb(
token=lp.token,
token_id=getattr(lp, "token_id", None),
logprob=lp.logprob,
)
for lp in raw_logprobs.content
]
)

# Build assistant message
ep_messages.append(
Expand Down
46 changes: 28 additions & 18 deletions clawloop/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,38 +19,35 @@ def _build_parser() -> argparse.ArgumentParser:
prog="clawloop",
description="ClawLoop — Learning from Experience unified learning API",
)
parser.add_argument(
"-v", "--verbose", action="store_true", help="Enable debug logging"
)
parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
sub = parser.add_subparsers(dest="command", required=True)

# -- run --
run_p = sub.add_parser("run", help="Run the learning loop")
run_p.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
run_p.add_argument("--bench", required=True, help="Benchmark name")
run_p.add_argument("--iterations", type=int, default=1, help="Number of learning iterations")
run_p.add_argument("--episodes", type=int, default=10, help="Episodes per iteration")
run_p.add_argument("--config", type=str, default=None, help="Config JSON file")
run_p.add_argument("--model", type=str, default=None, help="LLM model (litellm format)")
run_p.add_argument(
"--iterations", type=int, default=1, help="Number of learning iterations"
"--api-base", type=str, default=None, help="LLM API base URL (OpenAI-compatible endpoint)"
)
run_p.add_argument(
"--episodes", type=int, default=10, help="Episodes per iteration"
"--task-type",
type=str,
default="base",
help="Task type: base, hallucination, disambiguation",
)
run_p.add_argument("--config", type=str, default=None, help="Config JSON file")
run_p.add_argument("--model", type=str, default=None, help="LLM model (litellm format)")
run_p.add_argument("--api-base", type=str, default=None, help="LLM API base URL (OpenAI-compatible endpoint)")
run_p.add_argument("--task-type", type=str, default="base",
help="Task type: base, hallucination, disambiguation")
run_p.add_argument("--task-split", type=str, default="test",
help="Data split: train, test")
run_p.add_argument("--task-split", type=str, default="test", help="Data split: train, test")
run_p.add_argument("--output", type=str, default=None, help="Output directory")
run_p.add_argument("--seed", type=int, default=None, help="Random seed")

# -- eval --
eval_p = sub.add_parser("eval", help="Evaluate current state (no learning)")
eval_p.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging")
eval_p.add_argument("--bench", required=True, help="Benchmark name")
eval_p.add_argument(
"--episodes", type=int, default=10, help="Number of episodes"
)
eval_p.add_argument("--episodes", type=int, default=10, help="Number of episodes")
eval_p.add_argument("--config", type=str, default=None, help="Config JSON file")

# -- setup-bench --
Expand All @@ -64,7 +61,9 @@ def _build_parser() -> argparse.ArgumentParser:

math_p = demo_sub.add_parser("math", help="Math learning loop demo")
math_p.add_argument("--dry-run", action="store_true", help="Use mock LLMs (no API calls)")
math_p.add_argument("--iterations", type=int, default=None, help="Number of learning iterations")
math_p.add_argument(
"--iterations", type=int, default=None, help="Number of learning iterations"
)
math_p.add_argument("--episodes", type=int, default=None, help="Episodes per iteration")
math_p.add_argument("--output", type=str, default="playbook.json", help="Playbook output path")

Expand Down Expand Up @@ -117,7 +116,9 @@ def _build_evolver(config: dict[str, Any]) -> Any | None:
from clawloop.harness_backends.local import LocalEvolver
from clawloop.llm import LiteLLMClient

model = config.get("reflector_model", config.get("model", "anthropic/claude-haiku-4-5-20251001"))
model = config.get(
"reflector_model", config.get("model", "anthropic/claude-haiku-4-5-20251001")
)
client = LiteLLMClient(
model=model,
api_base=api_base,
Expand All @@ -132,6 +133,7 @@ def _build_evolver(config: dict[str, Any]) -> Any | None:
def _ensure_output_dir(config: dict[str, Any], bench: str) -> None:
"""Set output dir if not configured. Convention: runs/<bench>/<timestamp>."""
import time

if "output" not in config or not config["output"]:
config["output"] = f"./runs/{bench}/{int(time.time())}"

Expand Down Expand Up @@ -162,6 +164,7 @@ def cmd_run(args: argparse.Namespace) -> None:

# Wire LocalEvolver (with Reflector) into harness for ICL learning
from clawloop.learning_layers.harness import Harness

evolver = _build_evolver(config)
agent_state = AgentState(harness=Harness(evolver=evolver))

Expand Down Expand Up @@ -226,7 +229,14 @@ def cmd_eval(args: argparse.Namespace) -> None:
"car": {
"bench_dir": "benchmarks/a2a/car-bench",
"data_setup": "scenarios/car-bench/setup.sh",
"uv_sync_cmd": ["uv", "sync", "--extra", "car-bench-agent", "--extra", "car-bench-evaluator"],
"uv_sync_cmd": [
"uv",
"sync",
"--extra",
"car-bench-agent",
"--extra",
"car-bench-evaluator",
],
},
"entropic": {
"bench_dir": "benchmarks/a2a/entropic-crmarenapro",
Expand Down
4 changes: 3 additions & 1 deletion clawloop/collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,9 @@ def ingest(
done=True,
timing_ms=timing_ms or 0.0,
)
] if messages else [],
]
if messages
else [],
summary=EpisodeSummary(
token_usage=usage,
timing=Timing(total_ms=timing_ms or 0.0) if timing_ms else None,
Expand Down
1 change: 1 addition & 0 deletions clawloop/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
Missing files are skipped silently. Existing environment variables are never
overridden (so CI/CD injected secrets always win over local ``.env``).
"""

from __future__ import annotations

import os
Expand Down
26 changes: 14 additions & 12 deletions clawloop/core/background.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
import json
import logging
import time
from dataclasses import dataclass, field
from dataclasses import dataclass
from typing import Any, Protocol, runtime_checkable

from clawloop.core.episode import Episode
Expand Down Expand Up @@ -73,7 +73,10 @@ def run(self, state: BackgroundState) -> None:
report = self.curator.consolidate(state.playbook)
log.info(
"Consolidation: %d->%d entries (merged=%d, pruned=%d)",
report.before, report.after, report.merged, report.pruned,
report.before,
report.after,
report.merged,
report.pruned,
)
except Exception:
log.exception("PlaybookConsolidation failed")
Expand Down Expand Up @@ -111,9 +114,7 @@ def run(self, state: BackgroundState) -> None:
reward = ep.summary.effective_reward()
task = ep.task_id
msgs = len(ep.messages)
episode_summaries.append(
f"- Task={task} reward={reward:.2f} messages={msgs}"
)
episode_summaries.append(f"- Task={task} reward={reward:.2f} messages={msgs}")

if not episode_summaries:
return
Expand All @@ -136,8 +137,7 @@ def run(self, state: BackgroundState) -> None:
+ "\n".join(episode_summaries)
+ "\n\n## Current Playbook Entries\n"
+ "\n".join(
f"- {e.content[:100]}"
for e in state.playbook.active_entries()[:10]
f"- {e.content[:100]}" for e in state.playbook.active_entries()[:10]
)
+ "\n\nWhat meta-patterns do you see across these episodes?"
),
Expand All @@ -155,11 +155,13 @@ def run(self, state: BackgroundState) -> None:
tags.append("meta-pattern")
content = item.get("content", "")
if content:
insights.append(Insight(
action=item.get("action", "add"),
content=content,
tags=tags,
))
insights.append(
Insight(
action=item.get("action", "add"),
content=content,
tags=tags,
)
)

if not insights:
return
Expand Down
Loading
Loading