From 9e8e71763f7b4313aa07467bf3273df14883e3df Mon Sep 17 00:00:00 2001 From: Forketyfork Date: Thu, 28 May 2026 22:44:43 +0200 Subject: [PATCH 1/3] fix(runtime): keep architect alive when a session's terminal output errors Issue: Sending messages to codex would cause architect to exit silently, with no crash dialog, no confirmation, and no persistence. Logs showed ghostty-vt returning HyperlinkSetOutOfMemory from stream.nextSlice once codex had emitted enough OSC 8 hyperlinks to exhaust the per-page hyperlink set. The error propagated through processOutput, up through the runtime main loop, and out of main(), where Zig's default !void handler printed a stack trace and called exit(1) before any teardown ran. Solution: Catch per-session processOutput and flushPendingWrites errors in the main loop. Log them, mark the session dead via a new failSession helper, and continue iterating instead of unwinding the whole app. The session lands in the same state as a normal child exit, so the user can restart it or quit cleanly with persistence intact. --- src/app/runtime.zig | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/app/runtime.zig b/src/app/runtime.zig index 8f74c14..34fec86 100644 --- a/src/app/runtime.zig +++ b/src/app/runtime.zig @@ -102,6 +102,15 @@ fn countSpawnedSessions(sessions: []const *SessionState) usize { return count; } +/// Mark a session as dead and dirty after a non-recoverable I/O or terminal-state error. +/// Used to keep the runtime loop alive when a single session's output processing fails +/// (e.g. ghostty-vt resource exhaustion like `HyperlinkSetOutOfMemory`). The user can +/// then restart the session or quit cleanly, preserving persistence. +fn failSession(session: *SessionState) void { + session.dead = true; + session.markDirty(); +} + fn remainingFrameBudgetNs(target_frame_ns: i128, frame_ns: i128) u64 { if (frame_ns >= target_frame_ns) return 0; return @intCast(target_frame_ns - frame_ns); @@ -2291,12 +2300,14 @@ pub fn run() !void { } session.checkAlive(); session.processOutput() catch |err| { - log.err("session {d}: process output failed: {}", .{ session.id, err }); - return err; + log.err("session {d}: process output failed, marking session dead: {}", .{ session.id, err }); + failSession(session); + continue; }; session.flushPendingWrites() catch |err| { - log.err("session {d}: flush pending writes failed: {}", .{ session.id, err }); - return err; + log.err("session {d}: flush pending writes failed, marking session dead: {}", .{ session.id, err }); + failSession(session); + continue; }; const prev_cwd_ptr = if (session.cwd_path) |p| p.ptr else null; session.updateCwd(now); @@ -3133,6 +3144,17 @@ test "planExternalSpawnSlot reports full grid" { try std.testing.expect(planExternalSpawnSlot(&sessions, grid_layout.max_grid_size, grid_layout.max_grid_size, 0) == null); } +test "failSession marks the session dead and bumps render epoch" { + var session: SessionState = undefined; + session.dead = false; + session.render_epoch = 7; + + failSession(&session); + + try std.testing.expect(session.dead); + try std.testing.expectEqual(@as(u64, 8), session.render_epoch); +} + test "validateExternalSpawnCwd accepts directories and rejects relative paths" { try std.testing.expect(validateExternalSpawnCwd("/tmp") == null); From 83158cc8dfb08dbfede8b56b9269a3d5d1b0f441 Mon Sep 17 00:00:00 2001 From: Forketyfork Date: Fri, 29 May 2026 06:06:04 +0200 Subject: [PATCH 2/3] fix(session): terminate child and drop queued writes when failing a session Addresses unresolved PR review threads (codex P1; copilot x2) on #322. Before this change, the runtime's failSession helper only set dead=true. Three problems followed: the still-spawned child kept running invisibly behind a [Process completed] UI and could go on touching files; the normal teardown SIGTERM was gated on spawned && !dead so app quit no longer killed the child either; and pending_write was left populated, so flushPendingWrites retried and re-logged every frame on a permanent PTY error. Move the failure logic to SessionState.failAndTerminate. It now mirrors teardown's kill: SIGTERM the child while spawned && !dead, clear pending_write so flushPendingWrites short-circuits on its empty-buffer guard, then flip dead and bump the render epoch. The shell, terminal, and stream wrappers stay alive so scrollback remains visible and the existing restart button path keeps working. --- src/app/runtime.zig | 24 ++---------------------- src/session/state.zig | 39 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 22 deletions(-) diff --git a/src/app/runtime.zig b/src/app/runtime.zig index 34fec86..5f12711 100644 --- a/src/app/runtime.zig +++ b/src/app/runtime.zig @@ -102,15 +102,6 @@ fn countSpawnedSessions(sessions: []const *SessionState) usize { return count; } -/// Mark a session as dead and dirty after a non-recoverable I/O or terminal-state error. -/// Used to keep the runtime loop alive when a single session's output processing fails -/// (e.g. ghostty-vt resource exhaustion like `HyperlinkSetOutOfMemory`). The user can -/// then restart the session or quit cleanly, preserving persistence. -fn failSession(session: *SessionState) void { - session.dead = true; - session.markDirty(); -} - fn remainingFrameBudgetNs(target_frame_ns: i128, frame_ns: i128) u64 { if (frame_ns >= target_frame_ns) return 0; return @intCast(target_frame_ns - frame_ns); @@ -2301,12 +2292,12 @@ pub fn run() !void { session.checkAlive(); session.processOutput() catch |err| { log.err("session {d}: process output failed, marking session dead: {}", .{ session.id, err }); - failSession(session); + session.failAndTerminate(); continue; }; session.flushPendingWrites() catch |err| { log.err("session {d}: flush pending writes failed, marking session dead: {}", .{ session.id, err }); - failSession(session); + session.failAndTerminate(); continue; }; const prev_cwd_ptr = if (session.cwd_path) |p| p.ptr else null; @@ -3144,17 +3135,6 @@ test "planExternalSpawnSlot reports full grid" { try std.testing.expect(planExternalSpawnSlot(&sessions, grid_layout.max_grid_size, grid_layout.max_grid_size, 0) == null); } -test "failSession marks the session dead and bumps render epoch" { - var session: SessionState = undefined; - session.dead = false; - session.render_epoch = 7; - - failSession(&session); - - try std.testing.expect(session.dead); - try std.testing.expectEqual(@as(u64, 8), session.render_epoch); -} - test "validateExternalSpawnCwd accepts directories and rejects relative paths" { try std.testing.expect(validateExternalSpawnCwd("/tmp") == null); diff --git a/src/session/state.zig b/src/session/state.zig index 47b6d77..a1be2d9 100644 --- a/src/session/state.zig +++ b/src/session/state.zig @@ -610,6 +610,24 @@ pub const SessionState = struct { return quit_capture_active; } + /// Mark the session as failed after an unrecoverable output-processing error + /// (e.g. ghostty-vt resource exhaustion like `HyperlinkSetOutOfMemory`). Sends + /// SIGTERM to the still-running child so it stops consuming resources behind a + /// "[Process completed]" UI, drops queued stdin so `flushPendingWrites` does + /// not retry every frame, then flips `dead` and bumps the render epoch. The + /// shell/terminal/stream wrappers stay alive so scrollback remains visible and + /// the session can be restarted through the existing dead-state UI path. + pub fn failAndTerminate(self: *SessionState) void { + if (self.shell) |shell| { + if (self.spawned and !self.dead) { + _ = std.c.kill(shell.child_pid, std.c.SIG.TERM); + } + } + self.pending_write.clearAndFree(self.allocator); + self.dead = true; + self.markDirty(); + } + /// Try to flush any queued stdin data; preserves ordering relative to new input. pub fn flushPendingWrites(self: *SessionState) !void { if (self.pending_write.items.len == 0) return; @@ -1164,6 +1182,27 @@ test "shouldProcessOutput drains dead sessions only during quit capture" { try std.testing.expect(SessionState.shouldProcessOutput(true, true, true)); } +test "failAndTerminate marks dead, bumps render epoch, and drops pending writes" { + const allocator = std.testing.allocator; + + var session: SessionState = undefined; + session.shell = null; + session.spawned = false; + session.dead = false; + session.render_epoch = 4; + session.allocator = allocator; + session.pending_write = .empty; + try session.pending_write.appendSlice(allocator, "queued"); + defer session.pending_write.deinit(allocator); + + session.failAndTerminate(); + + try std.testing.expect(session.dead); + try std.testing.expectEqual(@as(u64, 5), session.render_epoch); + try std.testing.expectEqual(@as(usize, 0), session.pending_write.items.len); + try std.testing.expectEqual(@as(usize, 0), session.pending_write.capacity); +} + test "AgentKind.fromComm recognises known agent names" { try std.testing.expectEqual(AgentKind.claude, AgentKind.fromComm("claude").?); try std.testing.expectEqual(AgentKind.codex, AgentKind.fromComm("codex").?); From ce16eca5208581a33920c200e8567bab582ed2e8 Mon Sep 17 00:00:00 2001 From: Forketyfork Date: Fri, 29 May 2026 12:51:29 +0200 Subject: [PATCH 3/3] feat(runtime): log session_failed event with agent and cwd context Adds two pieces of diagnostic context next to the existing process-output error path so that future investigations of similar failures can move faster: 1. The existing log.err line now includes the detected agent name (codex / claude / gemini / none) and the session cwd. The original incident took a while to attribute to codex because the line only carried the session id and error name. 2. A structured session_failed event is emitted via the existing writeRuntimeEvent helper, with key=value fields session, agent, error, and source. Grepping event=session_failed in architect.log now lists every session that hit an unrecoverable runtime error and which call site (process_output or flush_pending_writes) tripped it. --- src/app/runtime.zig | 43 +++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 2 deletions(-) diff --git a/src/app/runtime.zig b/src/app/runtime.zig index 300c9ec..e3adb12 100644 --- a/src/app/runtime.zig +++ b/src/app/runtime.zig @@ -144,6 +144,33 @@ fn writeRuntimeEvent(message: []const u8, event_name: []const u8, extra_data: [] }; } +fn agentLabel(session: *const SessionState) []const u8 { + if (session.agent_icon) |kind| return kind.name(); + return "none"; +} + +/// Log a structured `session_failed` event plus a human-readable error line +/// when a session is terminated due to an unrecoverable runtime error. The +/// `source` argument names the failing call site (e.g. `process_output`) and +/// becomes the `source=` field on the event so a future investigation can +/// grep `event=session_failed source=process_output` and find every instance. +fn reportSessionFailure(session: *const SessionState, err: anyerror, source: []const u8) void { + const agent = agentLabel(session); + const cwd = session.cwd_path orelse ""; + log.err( + "session {d}: {s} failed, marking session dead: {} (agent={s} cwd={s})", + .{ session.id, source, err, agent, cwd }, + ); + var extra_buf: [192]u8 = undefined; + const extra = std.fmt.bufPrint(&extra_buf, "session={d} agent={s} error={s} source={s}", .{ + session.id, agent, @errorName(err), source, + }) catch |fmt_err| { + log.warn("failed to format session_failed event payload: {}", .{fmt_err}); + return; + }; + writeRuntimeEvent("session terminated after unrecoverable error", "session_failed", extra); +} + fn emitViewModeTransitionEvents( previous_mode: app_state.ViewMode, next_mode: app_state.ViewMode, @@ -2294,12 +2321,12 @@ pub fn run() !void { } session.checkAlive(); session.processOutput() catch |err| { - log.err("session {d}: process output failed, marking session dead: {}", .{ session.id, err }); + reportSessionFailure(session, err, "process_output"); session.failAndTerminate(); continue; }; session.flushPendingWrites() catch |err| { - log.err("session {d}: flush pending writes failed, marking session dead: {}", .{ session.id, err }); + reportSessionFailure(session, err, "flush_pending_writes"); session.failAndTerminate(); continue; }; @@ -3141,6 +3168,18 @@ test "planExternalSpawnSlot reports full grid" { try std.testing.expect(planExternalSpawnSlot(&sessions, grid_layout.max_grid_size, grid_layout.max_grid_size, 0) == null); } +test "agentLabel reports the detected agent name or 'none'" { + var session: SessionState = undefined; + session.agent_icon = null; + try std.testing.expectEqualStrings("none", agentLabel(&session)); + + session.agent_icon = .codex; + try std.testing.expectEqualStrings("codex", agentLabel(&session)); + + session.agent_icon = .claude; + try std.testing.expectEqualStrings("claude", agentLabel(&session)); +} + test "validateExternalSpawnCwd accepts directories and rejects relative paths" { try std.testing.expect(validateExternalSpawnCwd("/tmp") == null);