diff --git a/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.json b/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.json index 1640bf41..f24934c9 100644 --- a/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.json +++ b/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.json @@ -1964,6 +1964,52 @@ "reward_feedback_forwarded": false, "conclusion": "The every-round canonical product-mode rerun produced public-safe soft-verify counters and full LoopX lifecycle evidence. It stopped after round 1 because the agent declared done and final official scoring returned 0.0; attribution is solution/case failure, not missing lifecycle, transport, or final-only reward sampling.", "product_fix_status": "Every-round private soft verify is now the canonical product-mode default; the separate soft-timeout cleanup path is covered by runner smoke and did not need to trigger in this completed run." + }, + "goal_start_bridge_timeout_recheck": { + "schema_version": "skillsbench_goal_start_bridge_timeout_recheck_v0", + "route": "loopx_goal_start_product_mode", + "official_score": 0.0, + "score_failure_attribution": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout", + "first_blocker": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout", + "every_round_reward_observation_count": 13, + "controller_followup_prompt_count": 13, + "controller_stop_decision_count": 0, + "controller_official_success_observed": false, + "trajectory_public_summary": { + "schema_version": "skillsbench_acp_trajectory_summary_v0", + "attribution_conclusion": "The goal-start citation-check treatment reached thirteen controller rounds with every-round reward observations but never produced tool calls, LoopX CLI calls, or LoopX state reads/writes in the ACP trajectory; compact runner evidence attributes the 0.0 score to a host-local Codex exec bridge idle timeout rather than task-solving content or verifier feedback being exposed to the agent.", + "private_trajectory_present": true, + "raw_text_copied_to_public": false, + "raw_task_text_copied_to_public": false, + "raw_verifier_output_copied_to_public": false, + "host_path_recorded": false, + "round_count": 13, + "event_count": 39, + "agent_message_count": 13, + "tool_call_count": 0, + "loopx_cli_call_count": 0, + "loopx_cli_calls": [], + "loopx_cli_state_usage_counts": {}, + "loopx_cli_state_read_count": 0, + "loopx_cli_state_write_count": 0, + "loopx_case_state_path_count": 2, + "loopx_case_state_read_count": 0, + "loopx_case_state_write_count": 0, + "protected_path_edit_signal_count": 0, + "protected_path_mention_count": 0, + "codex_acp_text_present": false, + "codex_acp_text_bytes": 0, + "source_compact_run": { + "case_id": "citation-check", + "route": "skillsbench_loopx_goal_start_product_mode_treatment", + "official_score": 0.0, + "score_failure_attribution": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout", + "first_blocker": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout", + "official_feedback_blinded": true, + "reward_feedback_forwarded": false, + "case_semantics_changed_by_harness": false + } + } } }, { @@ -4052,7 +4098,7 @@ "trajectory_recorded": false, "update_rule": "add one case analysis record after compact result ingest and benchmark-run-ledger update" }, - "updated_at": "2026-06-27T23:32:28Z", + "updated_at": "2026-06-28T17:56:06+08:00", "terminal_bench_current_protocol_coverage": { "schema_version": "terminal_bench_current_protocol_coverage_v0", "source_ledger": "benchmark-run-ledger.json", diff --git a/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.md b/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.md index 0e769a29..49ba8e02 100644 --- a/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.md +++ b/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.md @@ -7,7 +7,7 @@ It is intentionally separate from `benchmark-run-ledger.md`. The run ledger records compact attempts and scores; this file records why a result matters. - schema_version: `benchmark_case_analysis_v0` -- updated_at: `2026-06-27T23:32:28Z` +- updated_at: `2026-06-28T17:56:06+08:00` - machine_source: `benchmark-case-analysis.json` - ledger-only migration audit: `benchmark-case-analysis-ledger-only-migration-audit-20260618.md` @@ -67,13 +67,14 @@ public counters; absence from this table means the durable case record does not yet contain a public trajectory summary. - schema_version: `trajectory_public_summary_coverage_v0` -- summary_count: `5` -- attribution_conclusion_count: `5` +- summary_count: `6` +- attribution_conclusion_count: `6` | Benchmark | Case | Summary | Rounds | Tools | LoopX CLI | Protected Edits | Attribution | | --- | --- | --- | --- | --- | --- | --- | --- | | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.baseline.trajectory_public_summary` (public-safe) | `1` | `7` | `0` | `0` | `yes` | | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary` (public-safe) | `8` | `138` | `73` | `0` | `yes` | +| `skillsbench@1.1` | `citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary` (public-safe) | `13` | `0` | `0` | `0` | `yes` | | `skillsbench@1.1` | `citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary` (public-safe) | `1` | `26` | `0` | `0` | `yes` | | `skillsbench@1.1` | `debug-trl-grpo` | `trajectory_public_summary` (public-safe) | `5` | `112` | `0` | `2` | `yes` | | `skillsbench@1.1` | `paratransit-routing` | `legacy_blind_loop_positive_result.trajectory_public_summary` (public-safe) | `1` | `16` | `0` | `0` | `yes` | @@ -94,6 +95,7 @@ or copy raw trajectories, task text, verifier output, logs, or local paths. | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.baseline.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `1` | `7` | `7` | `no` | `no` | | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `73` | `8` | `138` | `211` | `no` | `no` | +| `skillsbench@1.1` | `citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `13` | `0` | `0` | `no` | `no` | | `skillsbench@1.1` | `citation-check` | `goal_start_transport_monitor` (public-safe) | `compact_harness_interaction` | `0` | `0` | `0` | `0` | `yes` | `no` | | `skillsbench@1.1` | `citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `1` | `26` | `0` | `no` | `no` | | `skillsbench@1.1` | `debug-trl-grpo` | `trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `5` | `112` | `0` | `no` | `no` | diff --git a/examples/benchmark-case-analysis-smoke.py b/examples/benchmark-case-analysis-smoke.py index a91f3aea..914911b1 100644 --- a/examples/benchmark-case-analysis-smoke.py +++ b/examples/benchmark-case-analysis-smoke.py @@ -69,9 +69,9 @@ def test_case_analysis_json() -> None: assert trajectory_coverage["schema_version"] == ( "trajectory_public_summary_coverage_v0" ), trajectory_coverage - assert trajectory_coverage["summary_count"] == 5, trajectory_coverage - assert trajectory_coverage["public_safe_count"] == 5, trajectory_coverage - assert trajectory_coverage["attribution_conclusion_count"] == 5, ( + assert trajectory_coverage["summary_count"] == 6, trajectory_coverage + assert trajectory_coverage["public_safe_count"] == 6, trajectory_coverage + assert trajectory_coverage["attribution_conclusion_count"] == 6, ( trajectory_coverage ) coverage_rows = { @@ -83,6 +83,22 @@ def test_case_analysis_json() -> None: "citation-check", "post_stop_policy_raw_rerun.trajectory_public_summary", ) in coverage_rows, trajectory_coverage + goal_start_bridge_timeout = coverage_rows[ + ( + "skillsbench@1.1", + "citation-check", + "goal_start_bridge_timeout_recheck.trajectory_public_summary", + ) + ] + assert goal_start_bridge_timeout["round_count"] == 13, ( + goal_start_bridge_timeout + ) + assert goal_start_bridge_timeout["tool_call_count"] == 0, ( + goal_start_bridge_timeout + ) + assert goal_start_bridge_timeout["loopx_cli_call_count"] == 0, ( + goal_start_bridge_timeout + ) assert ( "skillsbench@1.1", "3d-scan-calc", @@ -1269,7 +1285,9 @@ def test_case_analysis_markdown() -> None: assert "native_goal_route_observations" in text, text assert "legacy_blind_loop_positive_result.trajectory_public_summary" in text, text assert "post_stop_policy_raw_rerun.trajectory_public_summary" in text, text + assert "goal_start_bridge_timeout_recheck.trajectory_public_summary" in text, text assert "`citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary`" in text, text + assert "`citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary`" in text, text assert "historical_final_only_lifecycle_trajectory_summaries" in text, text assert "`3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary`" in text, text assert "`debug-trl-grpo` | `trajectory_public_summary`" in text, text