Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -1926,6 +1926,52 @@
"reward_feedback_forwarded": false,
"conclusion": "The every-round canonical product-mode rerun produced public-safe soft-verify counters and full LoopX lifecycle evidence. It stopped after round 1 because the agent declared done and final official scoring returned 0.0; attribution is solution/case failure, not missing lifecycle, transport, or final-only reward sampling.",
"product_fix_status": "Every-round private soft verify is now the canonical product-mode default; the separate soft-timeout cleanup path is covered by runner smoke and did not need to trigger in this completed run."
},
"goal_start_bridge_timeout_recheck": {
"schema_version": "skillsbench_goal_start_bridge_timeout_recheck_v0",
"route": "loopx_goal_start_product_mode",
"official_score": 0.0,
"score_failure_attribution": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
"first_blocker": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
"every_round_reward_observation_count": 13,
"controller_followup_prompt_count": 13,
"controller_stop_decision_count": 0,
"controller_official_success_observed": false,
"trajectory_public_summary": {
"schema_version": "skillsbench_acp_trajectory_summary_v0",
"attribution_conclusion": "The goal-start citation-check treatment reached thirteen controller rounds with every-round reward observations but never produced tool calls, LoopX CLI calls, or LoopX state reads/writes in the ACP trajectory; compact runner evidence attributes the 0.0 score to a host-local Codex exec bridge idle timeout rather than task-solving content or verifier feedback being exposed to the agent.",
"private_trajectory_present": true,
"raw_text_copied_to_public": false,
"raw_task_text_copied_to_public": false,
"raw_verifier_output_copied_to_public": false,
"host_path_recorded": false,
"round_count": 13,
"event_count": 39,
"agent_message_count": 13,
"tool_call_count": 0,
"loopx_cli_call_count": 0,
"loopx_cli_calls": [],
"loopx_cli_state_usage_counts": {},
"loopx_cli_state_read_count": 0,
"loopx_cli_state_write_count": 0,
"loopx_case_state_path_count": 2,
"loopx_case_state_read_count": 0,
"loopx_case_state_write_count": 0,
"protected_path_edit_signal_count": 0,
"protected_path_mention_count": 0,
"codex_acp_text_present": false,
"codex_acp_text_bytes": 0,
"source_compact_run": {
"case_id": "citation-check",
"route": "skillsbench_loopx_goal_start_product_mode_treatment",
"official_score": 0.0,
"score_failure_attribution": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
"first_blocker": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
"official_feedback_blinded": true,
"reward_feedback_forwarded": false,
"case_semantics_changed_by_harness": false
}
}
}
},
{
Expand Down Expand Up @@ -4014,7 +4060,7 @@
"trajectory_recorded": false,
"update_rule": "add one case analysis record after compact result ingest and benchmark-run-ledger update"
},
"updated_at": "2026-06-24T12:31:00Z",
"updated_at": "2026-06-28T04:27:53+08:00",
"terminal_bench_current_protocol_coverage": {
"schema_version": "terminal_bench_current_protocol_coverage_v0",
"source_ledger": "benchmark-run-ledger.json",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ It is intentionally separate from `benchmark-run-ledger.md`. The run ledger
records compact attempts and scores; this file records why a result matters.

- schema_version: `benchmark_case_analysis_v0`
- updated_at: `2026-06-24T12:31:00Z`
- updated_at: `2026-06-28T04:27:53+08:00`
- machine_source: `benchmark-case-analysis.json`
- ledger-only migration audit:
`benchmark-case-analysis-ledger-only-migration-audit-20260618.md`
Expand Down Expand Up @@ -67,13 +67,14 @@ public counters; absence from this table means the durable case record does
not yet contain a public trajectory summary.

- schema_version: `trajectory_public_summary_coverage_v0`
- summary_count: `5`
- attribution_conclusion_count: `5`
- summary_count: `6`
- attribution_conclusion_count: `6`

| Benchmark | Case | Summary | Rounds | Tools | LoopX CLI | Protected Edits | Attribution |
| --- | --- | --- | --- | --- | --- | --- | --- |
| `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.baseline.trajectory_public_summary` (public-safe) | `1` | `7` | `0` | `0` | `yes` |
| `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary` (public-safe) | `8` | `138` | `73` | `0` | `yes` |
| `skillsbench@1.1` | `citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary` (public-safe) | `13` | `0` | `0` | `0` | `yes` |
| `skillsbench@1.1` | `citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary` (public-safe) | `1` | `26` | `0` | `0` | `yes` |
| `skillsbench@1.1` | `debug-trl-grpo` | `trajectory_public_summary` (public-safe) | `5` | `112` | `0` | `2` | `yes` |
| `skillsbench@1.1` | `paratransit-routing` | `legacy_blind_loop_positive_result.trajectory_public_summary` (public-safe) | `1` | `16` | `0` | `0` | `yes` |
Expand All @@ -87,13 +88,14 @@ benchmark case records that expose the same public fields. They do not read
or copy raw trajectories, task text, verifier output, logs, or local paths.

- schema_version: `harness_interaction_public_summary_coverage_v0`
- summary_count: `14`
- summary_count: `15`
- benchmark_ids: `skillsbench@1.1, swe-marathon, terminal-bench@2.0`

| Benchmark | Case | Source | Kind | LoopX CLI | Rounds | Tools | Events | Controller Trace | Lifecycle |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.baseline.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `1` | `7` | `7` | `no` | `no` |
| `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `73` | `8` | `138` | `211` | `no` | `no` |
| `skillsbench@1.1` | `citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `13` | `0` | `0` | `no` | `no` |
| `skillsbench@1.1` | `citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `1` | `26` | `0` | `no` | `no` |
| `skillsbench@1.1` | `debug-trl-grpo` | `trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `5` | `112` | `0` | `no` | `no` |
| `skillsbench@1.1` | `llm-prefix-cache-replay` | `native_goal_route_observations` (public-safe) | `native_goal_route_observation` | `0` | `0` | `0` | `0` | `yes` | `no` |
Expand Down
24 changes: 21 additions & 3 deletions examples/benchmark-case-analysis-smoke.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def test_case_analysis_json() -> None:
assert trajectory_coverage["schema_version"] == (
"trajectory_public_summary_coverage_v0"
), trajectory_coverage
assert trajectory_coverage["summary_count"] == 5, trajectory_coverage
assert trajectory_coverage["public_safe_count"] == 5, trajectory_coverage
assert trajectory_coverage["attribution_conclusion_count"] == 5, (
assert trajectory_coverage["summary_count"] == 6, trajectory_coverage
assert trajectory_coverage["public_safe_count"] == 6, trajectory_coverage
assert trajectory_coverage["attribution_conclusion_count"] == 6, (
trajectory_coverage
)
coverage_rows = {
Expand All @@ -83,6 +83,22 @@ def test_case_analysis_json() -> None:
"citation-check",
"post_stop_policy_raw_rerun.trajectory_public_summary",
) in coverage_rows, trajectory_coverage
goal_start_bridge_timeout = coverage_rows[
(
"skillsbench@1.1",
"citation-check",
"goal_start_bridge_timeout_recheck.trajectory_public_summary",
)
]
assert goal_start_bridge_timeout["round_count"] == 13, (
goal_start_bridge_timeout
)
assert goal_start_bridge_timeout["tool_call_count"] == 0, (
goal_start_bridge_timeout
)
assert goal_start_bridge_timeout["loopx_cli_call_count"] == 0, (
goal_start_bridge_timeout
)
assert (
"skillsbench@1.1",
"3d-scan-calc",
Expand Down Expand Up @@ -1269,7 +1285,9 @@ def test_case_analysis_markdown() -> None:
assert "native_goal_route_observations" in text, text
assert "legacy_blind_loop_positive_result.trajectory_public_summary" in text, text
assert "post_stop_policy_raw_rerun.trajectory_public_summary" in text, text
assert "goal_start_bridge_timeout_recheck.trajectory_public_summary" in text, text
assert "`citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary`" in text, text
assert "`citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary`" in text, text
assert "historical_final_only_lifecycle_trajectory_summaries" in text, text
assert "`3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary`" in text, text
assert "`debug-trl-grpo` | `trajectory_public_summary`" in text, text
Expand Down