huangruiteng · huangruiteng · Jun 27, 2026
diff --git a/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.json b/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.json
@@ -1926,6 +1926,52 @@
         "reward_feedback_forwarded": false,
         "conclusion": "The every-round canonical product-mode rerun produced public-safe soft-verify counters and full LoopX lifecycle evidence. It stopped after round 1 because the agent declared done and final official scoring returned 0.0; attribution is solution/case failure, not missing lifecycle, transport, or final-only reward sampling.",
         "product_fix_status": "Every-round private soft verify is now the canonical product-mode default; the separate soft-timeout cleanup path is covered by runner smoke and did not need to trigger in this completed run."
+      },
+      "goal_start_bridge_timeout_recheck": {
+        "schema_version": "skillsbench_goal_start_bridge_timeout_recheck_v0",
+        "route": "loopx_goal_start_product_mode",
+        "official_score": 0.0,
+        "score_failure_attribution": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
+        "first_blocker": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
+        "every_round_reward_observation_count": 13,
+        "controller_followup_prompt_count": 13,
+        "controller_stop_decision_count": 0,
+        "controller_official_success_observed": false,
+        "trajectory_public_summary": {
+          "schema_version": "skillsbench_acp_trajectory_summary_v0",
+          "attribution_conclusion": "The goal-start citation-check treatment reached thirteen controller rounds with every-round reward observations but never produced tool calls, LoopX CLI calls, or LoopX state reads/writes in the ACP trajectory; compact runner evidence attributes the 0.0 score to a host-local Codex exec bridge idle timeout, not to the agent's task-solving content and not to verifier feedback being exposed to the agent.",
+          "private_trajectory_present": true,
+          "raw_text_copied_to_public": false,
+          "raw_task_text_copied_to_public": false,
+          "raw_verifier_output_copied_to_public": false,
+          "host_path_recorded": false,
+          "round_count": 13,
+          "event_count": 39,
+          "agent_message_count": 13,
+          "tool_call_count": 0,
+          "loopx_cli_call_count": 0,
+          "loopx_cli_calls": [],
+          "loopx_cli_state_usage_counts": {},
+          "loopx_cli_state_read_count": 0,
+          "loopx_cli_state_write_count": 0,
+          "loopx_case_state_path_count": 2,
+          "loopx_case_state_read_count": 0,
+          "loopx_case_state_write_count": 0,
+          "protected_path_edit_signal_count": 0,
+          "protected_path_mention_count": 0,
+          "codex_acp_text_present": false,
+          "codex_acp_text_bytes": 0,
+          "source_compact_run": {
+            "case_id": "citation-check",
+            "route": "skillsbench_loopx_goal_start_product_mode_treatment",
+            "official_score": 0.0,
+            "score_failure_attribution": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
+            "first_blocker": "skillsbench_host_local_acp_codex_exec_failed_codex_exec_bridge_idle_timeout",
+            "official_feedback_blinded": true,
+            "reward_feedback_forwarded": false,
+            "case_semantics_changed_by_harness": false
+          }
+        }
       }
     },
     {
@@ -4014,7 +4060,7 @@
     "trajectory_recorded": false,
     "update_rule": "add one case analysis record after compact result ingest and benchmark-run-ledger update"
   },
-  "updated_at": "2026-06-24T12:31:00Z",
+  "updated_at": "2026-06-27T20:27:53Z",
   "terminal_bench_current_protocol_coverage": {
     "schema_version": "terminal_bench_current_protocol_coverage_v0",
     "source_ledger": "benchmark-run-ledger.json",

diff --git a/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.md b/docs/research/long-horizon-agent-benchmarks/benchmark-case-analysis.md
@@ -7,7 +7,7 @@ It is intentionally separate from `benchmark-run-ledger.md`. The run ledger
 records compact attempts and scores; this file records why a result matters.
 
 - schema_version: `benchmark_case_analysis_v0`
-- updated_at: `2026-06-24T12:31:00Z`
+- updated_at: `2026-06-27T20:27:53Z`
 - machine_source: `benchmark-case-analysis.json`
 - ledger-only migration audit:
   `benchmark-case-analysis-ledger-only-migration-audit-20260618.md`
@@ -67,13 +67,14 @@ public counters; absence from this table means the durable case record does
 not yet contain a public trajectory summary.
 
 - schema_version: `trajectory_public_summary_coverage_v0`
-- summary_count: `5`
-- attribution_conclusion_count: `5`
+- summary_count: `6`
+- attribution_conclusion_count: `6`
 
 | Benchmark | Case | Summary | Rounds | Tools | LoopX CLI | Protected Edits | Attribution |
 | --- | --- | --- | --- | --- | --- | --- | --- |
 | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.baseline.trajectory_public_summary` (public-safe) | `1` | `7` | `0` | `0` | `yes` |
 | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary` (public-safe) | `8` | `138` | `73` | `0` | `yes` |
+| `skillsbench@1.1` | `citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary` (public-safe) | `13` | `0` | `0` | `0` | `yes` |
 | `skillsbench@1.1` | `citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary` (public-safe) | `1` | `26` | `0` | `0` | `yes` |
 | `skillsbench@1.1` | `debug-trl-grpo` | `trajectory_public_summary` (public-safe) | `5` | `112` | `0` | `2` | `yes` |
 | `skillsbench@1.1` | `paratransit-routing` | `legacy_blind_loop_positive_result.trajectory_public_summary` (public-safe) | `1` | `16` | `0` | `0` | `yes` |
@@ -87,13 +88,14 @@ benchmark case records that expose the same public fields. They do not read
 or copy raw trajectories, task text, verifier output, logs, or local paths.
 
 - schema_version: `harness_interaction_public_summary_coverage_v0`
-- summary_count: `14`
+- summary_count: `15`
 - benchmark_ids: `skillsbench@1.1, swe-marathon, terminal-bench@2.0`
 
 | Benchmark | Case | Source | Kind | LoopX CLI | Rounds | Tools | Events | Controller Trace | Lifecycle |
 | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
 | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.baseline.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `1` | `7` | `7` | `no` | `no` |
 | `skillsbench@1.1` | `3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `73` | `8` | `138` | `211` | `no` | `no` |
+| `skillsbench@1.1` | `citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `13` | `0` | `0` | `no` | `no` |
 | `skillsbench@1.1` | `citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `1` | `26` | `0` | `no` | `no` |
 | `skillsbench@1.1` | `debug-trl-grpo` | `trajectory_public_summary` (public-safe) | `trajectory_public_summary` | `0` | `5` | `112` | `0` | `no` | `no` |
 | `skillsbench@1.1` | `llm-prefix-cache-replay` | `native_goal_route_observations` (public-safe) | `native_goal_route_observation` | `0` | `0` | `0` | `0` | `yes` | `no` |

diff --git a/examples/benchmark-case-analysis-smoke.py b/examples/benchmark-case-analysis-smoke.py
@@ -69,9 +69,9 @@ def test_case_analysis_json() -> None:
     assert trajectory_coverage["schema_version"] == (
         "trajectory_public_summary_coverage_v0"
     ), trajectory_coverage
-    assert trajectory_coverage["summary_count"] == 5, trajectory_coverage
-    assert trajectory_coverage["public_safe_count"] == 5, trajectory_coverage
-    assert trajectory_coverage["attribution_conclusion_count"] == 5, (
+    assert trajectory_coverage["summary_count"] == 6, trajectory_coverage
+    assert trajectory_coverage["public_safe_count"] == 6, trajectory_coverage
+    assert trajectory_coverage["attribution_conclusion_count"] == 6, (
         trajectory_coverage
     )
     coverage_rows = {
@@ -83,6 +83,22 @@ def test_case_analysis_json() -> None:
         "citation-check",
         "post_stop_policy_raw_rerun.trajectory_public_summary",
     ) in coverage_rows, trajectory_coverage
+    goal_start_bridge_timeout = coverage_rows[
+        (
+            "skillsbench@1.1",
+            "citation-check",
+            "goal_start_bridge_timeout_recheck.trajectory_public_summary",
+        )
+    ]
+    assert goal_start_bridge_timeout["round_count"] == 13, (
+        goal_start_bridge_timeout
+    )
+    assert goal_start_bridge_timeout["tool_call_count"] == 0, (
+        goal_start_bridge_timeout
+    )
+    assert goal_start_bridge_timeout["loopx_cli_call_count"] == 0, (
+        goal_start_bridge_timeout
+    )
     assert (
         "skillsbench@1.1",
         "3d-scan-calc",
@@ -1269,7 +1285,9 @@ def test_case_analysis_markdown() -> None:
     assert "native_goal_route_observations" in text, text
     assert "legacy_blind_loop_positive_result.trajectory_public_summary" in text, text
     assert "post_stop_policy_raw_rerun.trajectory_public_summary" in text, text
+    assert "goal_start_bridge_timeout_recheck.trajectory_public_summary" in text, text
     assert "`citation-check` | `post_stop_policy_raw_rerun.trajectory_public_summary`" in text, text
+    assert "`citation-check` | `goal_start_bridge_timeout_recheck.trajectory_public_summary`" in text, text
     assert "historical_final_only_lifecycle_trajectory_summaries" in text, text
     assert "`3d-scan-calc` | `historical_final_only_lifecycle_trajectory_summaries.treatment.trajectory_public_summary`" in text, text
     assert "`debug-trl-grpo` | `trajectory_public_summary`" in text, text