Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
111 changes: 111 additions & 0 deletions examples/skillsbench-task-source-preflight-smoke.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
#!/usr/bin/env python3
"""Smoke-test SkillsBench canonical task-source preflight."""

from __future__ import annotations

import contextlib
import io
import json
import sys
import tempfile
from pathlib import Path


# Put the repository root (one directory above this script's directory) at the
# front of sys.path so project packages import when the file is run directly.
REPO_ROOT = Path(__file__).resolve().parents[1]
_repo_root_entry = str(REPO_ROOT)
if _repo_root_entry not in sys.path:
    sys.path.insert(0, _repo_root_entry)

from loopx.benchmark_ledger import load_benchmark_run_ledger # noqa: E402
from scripts.skillsbench_automation_loop import ( # noqa: E402
build_plan,
main as skillsbench_automation_loop_main,
parse_args,
)


def _write_task(root: Path, relative: str) -> None:
task = root / relative
dockerfile = task / "environment" / "Dockerfile"
dockerfile.parent.mkdir(parents=True, exist_ok=True)
dockerfile.write_text("FROM scratch\n", encoding="utf-8")
(task / "task.toml").write_text('version = "1.1"\n', encoding="utf-8")


def test_sanity_task_source_fails_before_runner_spend() -> None:
    """Check that a sanity-only task is blocked by the task-source preflight.

    A throwaway SkillsBench tree is built in which ``hello-world`` exists only
    under ``experiments/sanity-tasks`` while two other tasks live under
    ``tasks``.  The built plan, the compact run file written by the automation
    loop, and the updated ledger are then each checked for the task-source
    preflight block.
    """
    job_name = "skillsbench-hello-world-task-source-preflight"
    missing_status = "task_missing_from_canonical_tasks"
    sanity_kind = "experiments_sanity_tasks"
    blocked_label = "skillsbench_task_source_preflight_blocked"

    # Everything below reads files under the temporary tree, so it all stays
    # inside the context manager.
    with tempfile.TemporaryDirectory(prefix="skillsbench-task-source-") as tmp:
        root = Path(tmp)
        skillsbench_root = root / "skillsbench"
        for relative in (
            "experiments/sanity-tasks/hello-world",
            "tasks/citation-check",
            "tasks/powerlifting-coef-calc",
        ):
            _write_task(skillsbench_root, relative)

        jobs = root / "jobs"
        ledger = root / "ledger.json"
        args = [
            "--task-id",
            "hello-world",
            "--route",
            "raw-codex-autonomous-max5",
            "--skillsbench-root",
            str(skillsbench_root),
            "--jobs-dir",
            str(jobs),
            "--job-name",
            job_name,
            "--run-group-id",
            job_name,
            "--ledger-path",
            str(ledger),
            "--update-ledger",
        ]

        # Stage 1: the plan alone must already report the missing task.
        parsed_args = parse_args(args)
        plan = build_plan(parsed_args)
        preflight = plan["task_setup_preflight"]
        assert preflight["status"] == missing_status, preflight
        assert preflight["canonical_task_present"] is False, preflight
        assert preflight["alternate_source_kind"] == sanity_kind, preflight
        assert preflight["task_source_path_recorded"] is False, preflight
        assert preflight["task_source_content_recorded"] is False, preflight
        expected_nearest = ["citation-check", "powerlifting-coef-calc"]
        assert preflight["nearest_canonical_task_ids"] == expected_nearest, preflight

        # Stage 2: run the loop entry point, capturing stderr so it can be
        # shown if the exit code is unexpected.
        captured_stderr = io.StringIO()
        with contextlib.redirect_stderr(captured_stderr):
            exit_code = skillsbench_automation_loop_main(args)
        assert exit_code == 0, captured_stderr.getvalue()

        compact_path = jobs.joinpath(
            job_name,
            "hello-world__raw_codex_autonomous_max5",
            "benchmark_run.compact.json",
        )
        compact = json.loads(compact_path.read_text(encoding="utf-8"))
        assert compact["first_blocker"] == blocked_label
        assert compact["score_failure_attribution"] == blocked_label
        assert compact["task_setup_preflight"]["status"] == missing_status
        assert compact["task_setup_preflight"]["alternate_source_kind"] == sanity_kind
        assert compact["validation"]["no_raw_task_text_read"] is True, compact

        # Stage 3: the ledger written via --update-ledger must carry the
        # matching decision and repair class.
        update = load_benchmark_run_ledger(ledger)
        case = update["benchmarks"]["skillsbench@1.1"]["cases"]["hello-world"]
        expected_decision = "baseline_task_source_preflight_selection_required"
        assert case["latest_decision"]["decision"] == expected_decision, case
        first_run = case["runs"][0]
        assert first_run["repair_class"] == (
            "skillsbench_task_source_preflight_selection"
        )


if __name__ == "__main__":
    # Script entry point: run the single smoke check, then report success.
    # A failing assertion propagates and the success line is never printed.
    test_sanity_task_source_fails_before_runner_spend()
    print("skillsbench-task-source-preflight-smoke ok")
10 changes: 10 additions & 0 deletions loopx/benchmark_adapters/skillsbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -1352,6 +1352,16 @@ def skillsbench_runner_error_attribution(error_text: str) -> tuple[str, str, lis
if "benchflow result.json not found" in text:
label = "skillsbench_result_json_missing_after_runner_exit"
return label, label, [label, "skillsbench_runner_setup_error"]
if (
"skillsbench task source preflight blocked" in text
or "task missing from canonical tasks source" in text
):
label = "skillsbench_task_source_preflight_blocked"
return label, label, [
label,
"skillsbench_runner_setup_error",
"skillsbench_task_source_preflight",
]
if (
"could not find the file /app" in text
or "main:/app/skills" in text
Expand Down
71 changes: 71 additions & 0 deletions loopx/benchmark_ledger.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,48 @@ def _compact_task_staging(value: Any) -> dict[str, Any]:
return compact


def _compact_task_setup_preflight(value: Any) -> dict[str, Any]:
if not isinstance(value, dict):
return {}
compact: dict[str, Any] = {}
for field in (
"schema_version",
"status",
"sandbox",
"task_id",
"first_blocker",
"alternate_source_kind",
"selection_recommendation",
):
text = _compact_text(value.get(field), limit=140)
if text:
compact[field] = text
for field in (
"raw_task_text_read",
"raw_logs_read",
"raw_trajectory_read",
"apt_setup_risk_detected",
"apt_retry_patch_required",
"dockerfile_present",
"canonical_task_present",
"alternate_source_supported_by_runner",
"task_source_path_recorded",
"task_source_content_recorded",
):
if isinstance(value.get(field), bool):
compact[field] = value[field]
nearest_ids = value.get("nearest_canonical_task_ids")
if isinstance(nearest_ids, list):
compact_nearest: list[str] = []
for item in nearest_ids[:5]:
text = _compact_text(item, limit=120)
if text:
compact_nearest.append(text)
if compact_nearest:
compact["nearest_canonical_task_ids"] = compact_nearest
return compact


def _compact_compose_setup_diagnostic(value: Any) -> dict[str, Any]:
if not isinstance(value, dict):
return {}
Expand Down Expand Up @@ -907,6 +949,28 @@ def _repair_route(
"raw_task_text_required": False,
},
}
if failure_class == "skillsbench_task_source_preflight_blocked":
return {
"repair_priority": "P1",
"repair_class": "skillsbench_task_source_preflight_selection",
"next_action": (
"select a SkillsBench task from the canonical tasks source, or "
"use an explicit sanity-source runner before spending a full "
"baseline/treatment arm"
),
"repair_profile": {
"schema_version": "benchmark_repair_profile_v0",
"repair_class": "skillsbench_task_source_preflight_selection",
"rerun_allowed_after_profile_applied": True,
"required_preflight": [
"skillsbench_task_setup_preflight",
"canonical_task_present",
"nearest_canonical_task_ids",
],
"raw_logs_required": False,
"raw_task_text_required": False,
},
}
if failure_class == "score_missing":
return {
"repair_priority": "P0",
Expand Down Expand Up @@ -1472,6 +1536,11 @@ def build_benchmark_run_ledger_entry(
}
)
entry.update(repair_route)
task_setup_preflight = _compact_task_setup_preflight(
benchmark_run.get("task_setup_preflight")
)
if task_setup_preflight:
entry["task_setup_preflight"] = task_setup_preflight
task_staging = _compact_task_staging(benchmark_run.get("task_staging"))
if task_staging:
entry["task_staging"] = task_staging
Expand Down Expand Up @@ -1716,6 +1785,8 @@ def repair_decision(prefix: str, run: dict[str, Any]) -> dict[str, Any]:
decision = f"{prefix}_codex_acp_post_success_finalization_required"
elif repair_class == "skillsbench_setup_preflight_selection":
decision = f"{prefix}_setup_preflight_selection_required"
elif repair_class == "skillsbench_task_source_preflight_selection":
decision = f"{prefix}_task_source_preflight_selection_required"
elif repair_class == "worker_verifier_alignment":
decision = f"{prefix}_worker_verifier_alignment_required"
elif repair_class == "verifier_or_infra_repair":
Expand Down
45 changes: 45 additions & 0 deletions loopx/status.py
Original file line number Diff line number Diff line change
Expand Up @@ -1709,6 +1709,46 @@ def _compact_benchmark_task_staging(value: Any) -> dict[str, Any]:
return compact


def _compact_benchmark_task_setup_preflight(value: Any) -> dict[str, Any]:
if not isinstance(value, dict):
return {}

compact: dict[str, Any] = {}
for field in (
"schema_version",
"status",
"sandbox",
"task_id",
"first_blocker",
"alternate_source_kind",
"selection_recommendation",
):
text = public_safe_compact_text(value.get(field), limit=180)
if text:
compact[field] = text
for field in (
"raw_task_text_read",
"raw_logs_read",
"raw_trajectory_read",
"apt_setup_risk_detected",
"apt_retry_patch_required",
"dockerfile_present",
"canonical_task_present",
"alternate_source_supported_by_runner",
"task_source_path_recorded",
"task_source_content_recorded",
):
if isinstance(value.get(field), bool):
compact[field] = value[field]
nearest_task_ids = public_safe_compact_list(
value.get("nearest_canonical_task_ids"),
limit=MAX_BENCHMARK_RUN_LIST_ITEMS,
)
if nearest_task_ids:
compact["nearest_canonical_task_ids"] = nearest_task_ids
return compact


def _compact_benchmark_compose_setup_diagnostic(value: Any) -> dict[str, Any]:
if not isinstance(value, dict):
return {}
Expand Down Expand Up @@ -2918,6 +2958,11 @@ def compact_benchmark_run(run: dict[str, Any]) -> dict[str, Any] | None:
)
if runner_prerequisites:
compact["runner_prerequisites"] = runner_prerequisites
task_setup_preflight = _compact_benchmark_task_setup_preflight(
source.get("task_setup_preflight")
)
if task_setup_preflight:
compact["task_setup_preflight"] = task_setup_preflight
task_staging = _compact_benchmark_task_staging(source.get("task_staging"))
if task_staging:
compact["task_staging"] = task_staging
Expand Down
Loading