diff --git a/cmd/agent/main.go b/cmd/agent/main.go index 0c30db26..7d356ad7 100644 --- a/cmd/agent/main.go +++ b/cmd/agent/main.go @@ -95,6 +95,14 @@ func main() { } d.Register("hook.getInfo", newHookGetInfoHandler(hookProvider)) + // ping — client keepalive. The TS client calls this periodically so the + // idle watchdog (StartIdleWatchdog below) can tell a live-but-idle session + // from a vanished client. The handler is a no-op; merely receiving the line + // resets the agent's lastInbound timestamp. + d.Register("ping", func(_ context.Context, _ json.RawMessage) (any, error) { + return struct{}{}, nil + }) + host := stdioserver.New(d, os.Stdin, os.Stdout, agentLogger) fsys.SetEventSink(func(event string, payload any) error { err := host.EmitEvent(event, payload) @@ -143,6 +151,13 @@ func main() { // 일치해야 한다. ctx 취소(드레인) 시 자동 정지한다. host.StartHeartbeat(10 * time.Second) + // Idle watchdog: self-terminate if the client sends nothing for 60s. The + // client pings every ~20s (KEEPALIVE_PING_INTERVAL_MS in pipe.ts), so a + // healthy idle session resets the timer ~3× per window; only a vanished + // client (half-open TCP, hung process, sleep) with no stdin EOF trips it, + // preventing an orphaned remote agent from holding its binary. + host.StartIdleWatchdog(60 * time.Second) + host.Run() } diff --git a/internal/fs/service.go b/internal/fs/service.go index 1811dc00..51b638c2 100644 --- a/internal/fs/service.go +++ b/internal/fs/service.go @@ -20,8 +20,10 @@ import ( // MaxReadableFileSize caps how many bytes one fs.readFile / fs.writeFile call // may move. The threshold matches the renderer's editor capacity so we never -// produce a file that we couldn't reload. -const MaxReadableFileSize = 5 * 1024 * 1024 +// produce a file that we couldn't reload. Raising this requires raising the +// inbound scanner cap in stdioserver.Host.Run accordingly — writeFile content +// travels inbound as one NDJSON line and is rejected by that cap first. +const MaxReadableFileSize = 50 * 1024 * 1024 // EventSink is the callback fs uses to push agent events back to Electron. type EventSink func(event string, payload any) error diff --git a/internal/stdioserver/host.go b/internal/stdioserver/host.go index 4ad55ad3..a4098ded 100644 --- a/internal/stdioserver/host.go +++ b/internal/stdioserver/host.go @@ -63,6 +63,16 @@ type Host struct { // 종료된다 — 멈춘 hook이 셧다운을 막을 수 없다. hooksMu sync.Mutex shutdownHooks []func() + + // lastInbound is the UnixNano timestamp of the most recently received + // request line, read by the idle watchdog (StartIdleWatchdog) to detect a + // vanished client. Written from Run's single reader goroutine, read from + // the watchdog goroutine — atomic keeps that race-free. + lastInbound atomic.Int64 + + // exit terminates the process. Defaults to os.Exit; tests inject a fake so + // drain/watchdog termination can be observed without killing the runner. + exit func(int) } // New constructs a Host bound to the given dispatcher and stdio streams. @@ -81,6 +91,7 @@ func New(d *dispatch.Dispatcher, in io.Reader, out io.Writer, logger *slog.Logge ctx: ctx, cancel: cancel, accepting: true, + exit: os.Exit, } } @@ -120,15 +131,23 @@ func (h *Host) EmitEvent(event string, payload any) error { // the parent's shutdown signal is honored. func (h *Host) Run() { scanner := bufio.NewScanner(h.in) - // 4 MiB cap matches the largest request shape we expect (writeFile - // content up to MaxReadableFileSize plus envelope overhead). - scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024) + // Cap matches the largest request shape we expect: a writeFile whose + // content is up to MaxReadableFileSize (50 MiB), JSON-escaped, plus + // envelope overhead. 64 MiB leaves headroom for escaping without + // allocating eagerly — Scanner grows its buffer lazily from 64 KiB up to + // this ceiling. + scanner.Buffer(make([]byte, 0, 64*1024), 64*1024*1024) for scanner.Scan() { // Copy the slice — scanner reuses its internal buffer between // calls and the line escapes into a goroutine below. line := append([]byte(nil), scanner.Bytes()...) - if len(line) == 0 || !h.isAccepting() { + if len(line) == 0 { + continue + } + // Any inbound line proves the client is alive — reset the idle watchdog. + h.lastInbound.Store(time.Now().UnixNano()) + if !h.isAccepting() { continue } h.wg.Add(1) @@ -178,6 +197,41 @@ func (h *Host) StartHeartbeat(interval time.Duration) { }() } +// StartIdleWatchdog self-terminates the agent (via drainAndExit) when no +// inbound request line arrives within `limit`. This reaps the agent — and, +// through it, every PTY child — when the client has vanished but the SSH +// connection lingers without delivering stdin EOF: a half-open TCP link, a +// hung client process, or a suspended laptop. The client sends a periodic +// `ping` so a healthy but idle session keeps resetting lastInbound; only a +// genuinely absent client trips the limit. +// +// `limit` must be comfortably larger than the client's keepalive ping interval +// (KEEPALIVE_PING_INTERVAL_MS in pipe.ts) so normal jitter never false-fires. +// A non-positive limit disables the watchdog. Call before Run(); the goroutine +// stops when h.ctx is cancelled (drain). +func (h *Host) StartIdleWatchdog(limit time.Duration) { + if limit <= 0 { + return + } + h.lastInbound.Store(time.Now().UnixNano()) + go func() { + ticker := time.NewTicker(limit / 3) + defer ticker.Stop() + for { + select { + case <-ticker.C: + last := time.Unix(0, h.lastInbound.Load()) + if time.Since(last) >= limit { + h.drainAndExit(0) + return + } + case <-h.ctx.Done(): + return + } + } + }() +} + // RegisterShutdownHook 는 drainAndExit가 os.Exit 호출 직전에 등록 순서대로 // 실행할 cleanup 콜백을 추가한다. SIGTERM 시 defer가 우회되므로, hookserver // 소켓 파일 정리 등 명시적 정리가 필요한 자원은 이 훅을 사용한다. @@ -266,7 +320,7 @@ func (h *Host) drainAndExit(code int) { // even if Wait() itself blocks", which Wait can do when a // handler is stuck in a syscall. The same forceExit covers // the shutdown-hook execution window below. - forceExit := time.AfterFunc(forceExitAfter, func() { os.Exit(code) }) + forceExit := time.AfterFunc(forceExitAfter, func() { h.exit(code) }) done := make(chan struct{}) go func() { h.wg.Wait() @@ -276,14 +330,15 @@ func (h *Host) drainAndExit(code int) { case <-done: // Continue past the select to run shutdown hooks before exit. case <-time.After(forceExitAfter): - os.Exit(code) + h.exit(code) + return } // Hooks run synchronously under the same forceExit timer. - // If a hook hangs the AfterFunc above still trips os.Exit on time. + // If a hook hangs the AfterFunc above still trips h.exit on time. h.runShutdownHooks() forceExit.Stop() h.cancel() - os.Exit(code) + h.exit(code) }) } diff --git a/internal/stdioserver/host_idle_test.go b/internal/stdioserver/host_idle_test.go new file mode 100644 index 00000000..9916705d --- /dev/null +++ b/internal/stdioserver/host_idle_test.go @@ -0,0 +1,94 @@ +// Package stdioserver — idle watchdog tests. +// +// StartIdleWatchdog must self-terminate the agent when the client stops +// sending (vanished client, half-open link) but must never fire while inbound +// traffic keeps arriving. We inject a fake exit so termination is observable +// without killing the test runner. +package stdioserver + +import ( + "io" + "log/slog" + "strings" + "testing" + "time" + + "github.com/nexus-code/nexus-code/internal/dispatch" +) + +func newTestHost() *Host { + logger := slog.New(slog.NewTextHandler(io.Discard, nil)) + return New(dispatch.New(), strings.NewReader(""), io.Discard, logger) +} + +// Watchdog fires when no inbound line arrives within the limit. +func TestIdleWatchdogExitsWhenClientVanishes(t *testing.T) { + host := newTestHost() + exited := make(chan int, 1) + host.exit = func(code int) { + select { + case exited <- code: + default: + } + } + + host.StartIdleWatchdog(30 * time.Millisecond) + + select { + case code := <-exited: + if code != 0 { + t.Fatalf("exit code = %d, want 0", code) + } + case <-time.After(2 * time.Second): + t.Fatal("idle watchdog did not terminate within 2s of client silence") + } +} + +// Watchdog must NOT fire while inbound lines keep resetting lastInbound. +func TestIdleWatchdogStaysAliveWithTraffic(t *testing.T) { + host := newTestHost() + exited := make(chan int, 1) + host.exit = func(code int) { + select { + case exited <- code: + default: + } + } + + host.StartIdleWatchdog(60 * time.Millisecond) + + stop := time.After(240 * time.Millisecond) + tick := time.NewTicker(20 * time.Millisecond) + defer tick.Stop() + for { + select { + case <-tick.C: + host.lastInbound.Store(time.Now().UnixNano()) + case code := <-exited: + t.Fatalf("watchdog fired during active traffic (code=%d)", code) + case <-stop: + return // survived the window without firing + } + } +} + +// A non-positive limit disables the watchdog entirely. +func TestIdleWatchdogDisabledWhenLimitNonPositive(t *testing.T) { + host := newTestHost() + exited := make(chan int, 1) + host.exit = func(code int) { + select { + case exited <- code: + default: + } + } + + host.StartIdleWatchdog(0) + + select { + case <-exited: + t.Fatal("watchdog fired despite a non-positive (disabled) limit") + case <-time.After(100 * time.Millisecond): + // expected: no termination + } +} diff --git a/package.json b/package.json index 320f2702..ef304e8d 100644 --- a/package.json +++ b/package.json @@ -1,7 +1,7 @@ { "name": "nexus-code", "productName": "NexusCode", - "version": "0.5.1", + "version": "0.5.2", "description": "Multi-workspace VSCode-style editor for macOS. Monaco editor + terminal in one window.", "license": "MIT", "private": true, diff --git a/src/main/features/ssh/ipc.ts b/src/main/features/ssh/ipc.ts index bb9ebbcf..7f5ab258 100644 --- a/src/main/features/ssh/ipc.ts +++ b/src/main/features/ssh/ipc.ts @@ -15,6 +15,7 @@ import type { SshControlMaster } from "../../infra/agent/ssh/master"; import { type EnsureRemoteAgentOptions, ensureRemoteAgent, + type LspBootstrapProgressEvent, } from "../../infra/agent/ssh/ssh-bootstrap/index"; import { register, validateArgs } from "../../infra/ipc-router"; import { BROWSE_MAX_ENTRIES, type SshBrowseSessionRegistry } from "./browse-session-registry"; @@ -64,6 +65,10 @@ export function registerSshChannel(configPath = path.join(os.homedir(), ".ssh", export function registerSshBrowseHandlers( registry: SshBrowseSessionRegistry, promptHandler: SshAuthPromptHandler, + // Optional so existing callers/tests that don't care about progress keep + // working; when supplied, openBrowseSession streams bootstrap progress to + // the renderer via the `ssh.browseProgress` event keyed by progressId. + broadcast?: BrowseProgressBroadcast, ): () => void { register("ssh", { call: { @@ -72,7 +77,7 @@ export function registerSshBrowseHandlers( // cancellation arrives at the renderer as ipcErr("cancelled") — the // router passes the envelope silently without logging, and the // renderer uses ipcCallResult to branch on result.kind. - openBrowseSession: openBrowseSessionResultHandler(registry, promptHandler), + openBrowseSession: openBrowseSessionResultHandler(registry, promptHandler, broadcast), browseSession: browseSessionHandler(registry), closeBrowseSession: closeBrowseSessionHandler(registry), }, @@ -81,6 +86,12 @@ export function registerSshBrowseHandlers( return () => registry.dispose(); } +/** + * Broadcast fn shape used to push browse-session bootstrap progress to the + * renderer — matches the main-process forwardBroadcast signature. + */ +export type BrowseProgressBroadcast = (channelName: string, event: string, args: unknown) => void; + // --------------------------------------------------------------------------- // listConfigHosts // --------------------------------------------------------------------------- @@ -138,13 +149,15 @@ function isMissingOrPermissionError(error: unknown): boolean { export function openBrowseSessionHandler( registry: SshBrowseSessionRegistry, promptHandler: SshAuthPromptHandler, - bootstrap: (options: EnsureRemoteAgentOptions) => ReturnType = ( - options, - ) => + broadcast?: BrowseProgressBroadcast, + bootstrap: ( + options: EnsureRemoteAgentOptions, + onProgress?: (event: LspBootstrapProgressEvent) => void, + ) => ReturnType = (options, onProgress) => // The promptHandler MUST be forwarded to ensureRemoteAgent — without it // createBootstrapContext skips interactive auth and password-only hosts // fail before the agent channel is ever opened. - ensureRemoteAgent(options, { promptHandler }), + ensureRemoteAgent(options, { promptHandler, onProgress }), ): (args: unknown) => Promise<{ sessionId: string; initialPath: string; user: string }> { return async ( args: unknown, @@ -155,6 +168,23 @@ export function openBrowseSessionHandler( // name so the connection (and the saved profile) has a concrete user. const user = resolveSshUser(params.user); + // Stream agent-bootstrap progress to the renderer for this connect attempt. + // Keyed by the caller's progressId so the "add workspace" dialog can show + // the same upload/verify progress that registered workspaces already get, + // even though no workspaceId exists yet. No progressId or broadcast → no-op. + const onProgress = + params.progressId && broadcast + ? (event: LspBootstrapProgressEvent): void => { + broadcast("ssh", "browseProgress", { + progressId: params.progressId, + name: event.name, + phase: event.phase, + bytesDone: event.bytesDone, + bytesTotal: event.bytesTotal, + }); + } + : undefined; + const bootstrapOptions: EnsureRemoteAgentOptions = { host: params.host, user, @@ -181,7 +211,7 @@ export function openBrowseSessionHandler( let bootstrapResult: Awaited> | null = null; let channel: ReturnType | null = null; try { - bootstrapResult = await bootstrap(bootstrapOptions); + bootstrapResult = await bootstrap(bootstrapOptions, onProgress); if (timedOut) { bootstrapResult.dispose?.(); @@ -256,9 +286,13 @@ export function openBrowseSessionHandler( export function openBrowseSessionResultHandler( registry: SshBrowseSessionRegistry, promptHandler: SshAuthPromptHandler, - bootstrap?: (options: EnsureRemoteAgentOptions) => ReturnType, + broadcast?: BrowseProgressBroadcast, + bootstrap?: ( + options: EnsureRemoteAgentOptions, + onProgress?: (event: LspBootstrapProgressEvent) => void, + ) => ReturnType, ): (args: unknown) => Promise | ReturnType> { - const inner = openBrowseSessionHandler(registry, promptHandler, bootstrap); + const inner = openBrowseSessionHandler(registry, promptHandler, broadcast, bootstrap); return async (args: unknown) => { try { const result = await inner(args); diff --git a/src/main/features/workspace/manager.ts b/src/main/features/workspace/manager.ts index bbb82114..06344c53 100644 --- a/src/main/features/workspace/manager.ts +++ b/src/main/features/workspace/manager.ts @@ -72,6 +72,7 @@ type SshWorkspaceLocation = Extract; export type WorkspaceSshChannelFactory = (options: CreateSshChannelOptions) => SshChannel; export type WorkspaceSshBootstrap = ( options: EnsureRemoteAgentOptions, + dependencies?: Pick, ) => Promise; export type WorkspaceSshLspBootstrap = ( options: EnsureRemoteLspServerOptions, @@ -1069,21 +1070,24 @@ export class WorkspaceManager { this.adoptedSshMasters.delete(meta.id); let bootstrap: EnsureRemoteAgentResult; try { - bootstrap = await this.sshBootstrap({ - host: meta.location.host, - user: meta.location.user, - port: meta.location.port, - identityFile: meta.location.identityFile, - authMode: meta.location.authMode, - remotePath: meta.location.remotePath, - cachedRemoteArch: meta.location.remoteArch, - controlPath: adoptedMaster?.controlPath, - // Pass workspaceId so the bootstrap also uploads the per-workspace - // shim rc files (`.zshrc`/`.zshenv`/`bashrc`) into the remote's - // `~/.nexus-code/shim//`, making them available to the - // remote PTY's zsh `ZDOTDIR` / bash `--rcfile` activation. - workspaceId: meta.id, - }); + bootstrap = await this.sshBootstrap( + { + host: meta.location.host, + user: meta.location.user, + port: meta.location.port, + identityFile: meta.location.identityFile, + authMode: meta.location.authMode, + remotePath: meta.location.remotePath, + cachedRemoteArch: meta.location.remoteArch, + controlPath: adoptedMaster?.controlPath, + // Pass workspaceId so the bootstrap also uploads the per-workspace + // shim rc files (`.zshrc`/`.zshenv`/`bashrc`) into the remote's + // `~/.nexus-code/shim//`, making them available to the + // remote PTY's zsh `ZDOTDIR` / bash `--rcfile` activation. + workspaceId: meta.id, + }, + { onProgress: (event) => this.broadcastConnectionProgress(meta.id, event) }, + ); } catch (error) { // Bootstrap failed before any channel existed. Release the adopted // master (we own it now) and surface the error state instead of @@ -1217,6 +1221,23 @@ export class WorkspaceManager { this.broadcastFn("workspace", "connectionChanged", { workspaceId, status }); } + /** + * 에이전트 부트스트랩 진행 이벤트를 렌더러로 전달한다. + * workspaceId 범위로 scoped되므로 "새 워크스페이스 추가"와 "앱 시작 시 재연결" 양쪽 흐름 모두 커버한다. + */ + private broadcastConnectionProgress( + workspaceId: string, + event: LspBootstrapProgressEvent, + ): void { + this.broadcastFn("workspace", "connectionProgress", { + workspaceId, + name: event.name, + phase: event.phase, + bytesDone: event.bytesDone, + bytesTotal: event.bytesTotal, + }); + } + /** * Handles terminal SSH channel lifecycle events and restores the inert SSH provider. */ diff --git a/src/main/index.ts b/src/main/index.ts index 12ed1c19..edf6b7a1 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -150,9 +150,10 @@ const workspaceManager = new WorkspaceManager( createSshChannel(options, { promptHandler: (prompt) => sshAuthPromptHub.request(prompt), }), - (options) => + (options, deps) => ensureRemoteAgent(options, { promptHandler: (prompt) => sshAuthPromptHub.request(prompt), + onProgress: deps?.onProgress, }), ); @@ -218,7 +219,11 @@ registerAppStateChannel(stateService, { registerFsChannel(workspaceManager, agentFsWatcher, workspaceStorage); registerPanelChannel(workspaceStorage); registerSshChannel(); -registerSshBrowseHandlers(sshBrowseRegistry, (prompt) => sshAuthPromptHub.request(prompt)); +registerSshBrowseHandlers( + sshBrowseRegistry, + (prompt) => sshAuthPromptHub.request(prompt), + forwardBroadcast, +); registerSshAuthPromptIpcChannels(sshAuthPromptHub); registerSystemChannel({ openNewWindow: () => createMainWindow(stateService.getState()) }); registerClipboardChannel(); diff --git a/src/main/infra/agent/pipe.ts b/src/main/infra/agent/pipe.ts index 8695b297..e4ee8d4d 100644 --- a/src/main/infra/agent/pipe.ts +++ b/src/main/infra/agent/pipe.ts @@ -110,6 +110,11 @@ const STDOUT_BACKPRESSURE_LWM = 64 * 1024; // 64 KiB /** Event name emitted by the Go agent at a regular heartbeat interval. */ const AGENT_HEARTBEAT_EVENT = "agent.heartbeat"; +// How often the client pings the agent so the agent's idle watchdog +// (StartIdleWatchdog in host.go, 60s limit) can tell a live-but-idle session +// from a vanished client. Must stay well under that limit — ~3 pings/window. +const KEEPALIVE_PING_INTERVAL_MS = 20_000; + const ReadyFrameSchema = z .object({ type: z.literal("ready"), @@ -235,6 +240,11 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { let heartbeatWarned = false; let heartbeatWatchdogTimer: ReturnType | null = null; + // Client keepalive sender — pings the agent so its idle watchdog keeps a + // live-but-idle session alive. Started on ready (alongside the heartbeat + // watchdog), cleared on dispose/fail. + let keepaliveTimer: ReturnType | null = null; + const ready = new Promise((resolve, reject) => { resolveReady = () => { if (readySettled) return; @@ -259,6 +269,14 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { heartbeatWarned = false; } + /** Clears the keepalive ping timer. */ + function clearKeepalive(): void { + if (keepaliveTimer !== null) { + clearInterval(keepaliveTimer); + keepaliveTimer = null; + } + } + /** Rejects all in-flight requests with the same terminal transport error. */ function rejectPendingRequests(error: Error): void { for (const requestId of Array.from(activeRequestIds)) { @@ -329,6 +347,19 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { (timer as NodeJS.Timeout).unref(); } heartbeatWatchdogTimer = timer; + + // Start the keepalive sender on the same condition: the agent only + // runs its idle watchdog when heartbeat is enabled, so pinging is only + // needed then. Best-effort (fire) — a failed ping just means the + // channel is already tearing down. + const ping = setInterval(() => { + if (disposed || terminalError) return; + pipe.fire("ping"); + }, KEEPALIVE_PING_INTERVAL_MS); + if (typeof (ping as NodeJS.Timeout).unref === "function") { + (ping as NodeJS.Timeout).unref(); + } + keepaliveTimer = ping; } resolveReady(); return; @@ -448,7 +479,7 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { stderrLines.push(chunk); }); - return { + const pipe: NdjsonPipe = { ready, get methods(): readonly string[] | undefined { return capabilityMethods; @@ -543,6 +574,7 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { if (disposed) return; disposed = true; clearHeartbeatWatchdog(); + clearKeepalive(); const error = createDisposedError(); rejectReady(error); rejectPendingRequests(error); @@ -552,6 +584,7 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { if (terminalError) return; terminalError = error; clearHeartbeatWatchdog(); + clearKeepalive(); rejectReady(error); rejectPendingRequests(error); }, @@ -561,6 +594,7 @@ export function createNdjsonPipe(deps: NdjsonPipeDependencies): NdjsonPipe { return { wasReady: readySettled, stderrTail: recentStderr.join(" | ") }; }, }; + return pipe; } // === error helpers (exported for orchestrator use) === diff --git a/src/main/infra/agent/ssh/master.ts b/src/main/infra/agent/ssh/master.ts index a97fa461..b6434872 100644 --- a/src/main/infra/agent/ssh/master.ts +++ b/src/main/infra/agent/ssh/master.ts @@ -38,6 +38,22 @@ export interface SshControlMaster { const CONTROL_EXIT_UNLINK_FALLBACK_MS = 5_000; +/** + * Keepalive options applied to every long-lived ssh invocation (the agent + * channel and the persistent ControlMaster). Without these, a client that dies + * abnormally (force-kill, sleep, network drop) leaves the remote agent — and + * the binary it holds — alive until the kernel's default TCP timeout (hours), + * which then blocks the next launch's re-upload. ServerAliveInterval probes the + * peer at the SSH layer; after ServerAliveCountMax unanswered probes ssh exits, + * the remote session tears down, and the agent gets stdin EOF. ~15s × 3 ≈ 45s. + */ +const SSH_KEEPALIVE_ARGS: readonly string[] = [ + "-o", + "ServerAliveInterval=15", + "-o", + "ServerAliveCountMax=3", +]; + /** * Spawns the SSH client for direct stdin/stdout NDJSON exchange. Interactive * ControlMaster authentication and socket reuse live in `ssh-master`'s @@ -59,7 +75,7 @@ export function spawnSshMaster( * Creates the OpenSSH argument list without invoking a shell locally. */ export function buildSshArgs(options: SshMasterOptions): string[] { - const args = ["-o", "BatchMode=yes"]; + const args = ["-o", "BatchMode=yes", ...SSH_KEEPALIVE_ARGS]; if (options.controlPath) { args.push("-S", options.controlPath, "-o", "ControlMaster=no"); } @@ -150,6 +166,7 @@ export function buildSshControlMasterArgs( "ControlMaster=yes", "-o", "ControlPersist=60", + ...SSH_KEEPALIVE_ARGS, "-f", "-N", ]; diff --git a/src/main/infra/agent/ssh/ssh-bootstrap/index.ts b/src/main/infra/agent/ssh/ssh-bootstrap/index.ts index 9d56e452..07879188 100644 --- a/src/main/infra/agent/ssh/ssh-bootstrap/index.ts +++ b/src/main/infra/agent/ssh/ssh-bootstrap/index.ts @@ -22,18 +22,14 @@ import { type NodeRuntimeManifestEntry, type WrapperManifestEntry, } from "../../../../../shared/agent/manifest"; +import { getAgentDistDir } from "../../getAgentBinDir"; +import { createSshError } from "../../pipe"; +import { BASHRC_CONTENT, ZSHENV_CONTENT, ZSHRC_CONTENT } from "../../runtimeDirs"; import { - BASHRC_CONTENT, - ZSHENV_CONTENT, - ZSHRC_CONTENT, -} from "../../runtimeDirs"; -import { - authenticateSshControlMaster, type AuthenticateSshControlMasterDependencies, + authenticateSshControlMaster, type SshAuthPromptHandler, } from "../auth-pty"; -import { createSshError } from "../../pipe"; -import { getAgentDistDir } from "../../getAgentBinDir"; import { absoluteRemotePath, agentArtifactKey, @@ -58,36 +54,24 @@ import { uploadAndVerifyFile, } from "./transport"; import { - LOCAL_AGENT_DIST_DIR, - LSP_BOOTSTRAP_PROGRESS_EVENT, - REMOTE_AGENT_MANIFEST, - REMOTE_AGENT_PROTOCOL_MAJOR, - REMOTE_AGENT_ROOT, - REMOTE_AGENT_VERSION, type EnsureRemoteAgentOptions, type EnsureRemoteAgentResult, type EnsureRemoteLspServerOptions, type EnsureRemoteLspServerResult, + LOCAL_AGENT_DIST_DIR, + LSP_BOOTSTRAP_PROGRESS_EVENT, type LspBootstrapProgressEvent, type LspBootstrapProgressPhase, + REMOTE_AGENT_MANIFEST, + REMOTE_AGENT_PROTOCOL_MAJOR, + REMOTE_AGENT_ROOT, + REMOTE_AGENT_VERSION, type RemoteAgentPlatform, type SshBootstrapDependencies, type SshBootstrapRunner, type SshBootstrapRunnerResult, } from "./types"; -// Re-export the stable public surface so existing callers keep their import -// paths (`"./ssh-bootstrap"`) unchanged. -export { - LOCAL_AGENT_DIST_DIR, - LSP_BOOTSTRAP_PROGRESS_EVENT, - REMOTE_AGENT_MANIFEST, - REMOTE_AGENT_PROTOCOL_MAJOR, - REMOTE_AGENT_ROOT, - REMOTE_AGENT_VERSION, - parseUname, - remoteAgentBinaryPath, -}; export type { EnsureRemoteAgentOptions, EnsureRemoteAgentResult, @@ -100,6 +84,18 @@ export type { SshBootstrapRunner, SshBootstrapRunnerResult, }; +// Re-export the stable public surface so existing callers keep their import +// paths (`"./ssh-bootstrap"`) unchanged. +export { + LOCAL_AGENT_DIST_DIR, + LSP_BOOTSTRAP_PROGRESS_EVENT, + parseUname, + REMOTE_AGENT_MANIFEST, + REMOTE_AGENT_PROTOCOL_MAJOR, + REMOTE_AGENT_ROOT, + REMOTE_AGENT_VERSION, + remoteAgentBinaryPath, +}; interface ArtifactInstallRequest { readonly key: string; @@ -327,12 +323,23 @@ export function buildRemoteAgentCommand(binaryPath: string, remotePath: string): if (!remotePath.startsWith("/")) { throw createSshError( "server.protocol-error", - new Error( - `remotePath must be an absolute path, got: ${JSON.stringify(remotePath)}`, - ), + new Error(`remotePath must be an absolute path, got: ${JSON.stringify(remotePath)}`), ); } - const script = `exec ${quoteShellArg(binaryPath)} ${quoteShellArg(remotePath)}`; + // Retry the exec on ETXTBSY ("Text file busy"): immediately after a fresh + // install (sftp/cat write + `mv` into place) the kernel can briefly refuse to + // execute the binary while a writer fd from the upload — or a lingering writer + // from a previous, half-dead bootstrap connection — is still open. `exec` + // replaces the shell on success (so the loop body never runs twice in the + // healthy case); it only returns on failure, where we retry a handful of + // times over ~5s before giving up with the conventional 126 "cannot execute". + // `shopt -s execfail` is REQUIRED: without it a failed `exec` in a + // non-interactive bash terminates the shell immediately (so the loop would + // never retry). With it, a failed exec returns control and the loop runs. + const exec = `exec ${quoteShellArg(binaryPath)} ${quoteShellArg(remotePath)}`; + const script = + `shopt -s execfail; n=0; while :; do ${exec}; n=$((n+1)); ` + + `if [ "$n" -ge 25 ]; then exit 126; fi; sleep 0.2; done`; return `bash -lc ${singleQuoteShellArg(script)}`; } @@ -431,17 +438,13 @@ async function ensureRemoteArtifact( const existing = artifactLocks.get(lockKey); if (existing) return existing; - const pending = ensureRemoteArtifactUnlocked( - options, - runner, - now, - request, - onProgress, - ).finally(() => { - if (artifactLocks.get(lockKey) === pending) { - artifactLocks.delete(lockKey); - } - }); + const pending = ensureRemoteArtifactUnlocked(options, runner, now, request, onProgress).finally( + () => { + if (artifactLocks.get(lockKey) === pending) { + artifactLocks.delete(lockKey); + } + }, + ); artifactLocks.set(lockKey, pending); return pending; } @@ -614,10 +617,7 @@ async function ensureRemoteShimFiles( remoteHome: string, workspaceId: string, ): Promise { - const remoteShimDir = absoluteRemotePath( - remoteHome, - `${REMOTE_AGENT_ROOT}/shim/${workspaceId}`, - ); + const remoteShimDir = absoluteRemotePath(remoteHome, `${REMOTE_AGENT_ROOT}/shim/${workspaceId}`); // mkdir -p first; subsequent `cat > ` will inherit the 0o700-ish // umask of the remote user. We do not chmod the files explicitly — // they are plain rc files, not executables, and the default user-owned @@ -632,12 +632,7 @@ async function ensureRemoteShimFiles( for (const file of files) { const remotePath = `${remoteShimDir}/${file.name}`; - await runSsh( - options, - runner, - `cat > ${quoteShellArg(remotePath)}`, - file.content, - ); + await runSsh(options, runner, `cat > ${quoteShellArg(remotePath)}`, file.content); } return remoteShimDir; diff --git a/src/main/infra/agent/ssh/ssh-bootstrap/transport.ts b/src/main/infra/agent/ssh/ssh-bootstrap/transport.ts index bf460c82..ad492899 100644 --- a/src/main/infra/agent/ssh/ssh-bootstrap/transport.ts +++ b/src/main/infra/agent/ssh/ssh-bootstrap/transport.ts @@ -4,10 +4,11 @@ * uploading and verifying files, and quoting arbitrary strings into safe * shell or sftp tokens. */ -import { createHash } from "node:crypto"; + +import { spawn as defaultSpawn, type SpawnOptionsWithoutStdio } from "node:child_process"; +import { createHash, randomBytes } from "node:crypto"; import fs from "node:fs/promises"; import path from "node:path"; -import { type SpawnOptionsWithoutStdio, spawn as defaultSpawn } from "node:child_process"; import { createSshError } from "../../pipe"; import type { EnsureRemoteAgentOptions, @@ -73,7 +74,16 @@ export function buildSftpArgs(options: EnsureRemoteAgentOptions): string[] { /** Builds the ssh transport args shared by every remote command we run. */ export function buildSshTransportArgs(options: EnsureRemoteAgentOptions): string[] { - const args = ["-o", "BatchMode=yes"]; + // Keepalive so a bootstrap command over a dead connection fails fast (~45s) + // rather than hanging on the kernel's default TCP timeout. + const args = [ + "-o", + "BatchMode=yes", + "-o", + "ServerAliveInterval=15", + "-o", + "ServerAliveCountMax=3", + ]; if (options.controlPath) args.push("-S", options.controlPath, "-o", "ControlMaster=no"); if (options.port !== undefined) args.push("-p", String(options.port)); if (options.identityFile) args.push("-i", options.identityFile); @@ -103,33 +113,90 @@ export async function uploadAndVerifyFile(args: { args.runner, `mkdir -p ${quoteShellArg(remoteDir)} && chmod 755 ${args.remoteAgentRoot} ${quoteShellArg(remoteDir)}`, ); + // Best-effort sweep of *stale* `.tmp.` files left by earlier interrupted + // installs (a connection dropped after upload-to-temp but before the rename, + // or before our per-attempt rm could run over the now-dead connection). + // + // `-mmin +5` is critical for multi-workspace / multi-user hosts: several + // bootstraps can run against the same shared binary path concurrently (each + // writes its own `.tmp.`), so we must NEVER delete a temp file that a + // concurrent upload is still writing. A genuine orphan is minutes old; an + // in-flight upload is seconds old, so the age filter leaves it untouched. + // + // `find -delete` (not a shell glob) makes an empty match a clean no-op under + // any login shell — zsh aborts on an unmatched glob, find does not. The + // pattern is single-quoted so the login shell passes it to find verbatim. + await runSsh( + args.options, + args.runner, + `find ${quoteShellArg(remoteDir)} -maxdepth 1 -name ${singleQuoteShellArg(`${path.posix.basename(args.remotePath)}.tmp.*`)} -mmin +5 -delete`, + ).catch(() => undefined); const payload = await fs.readFile(args.localPath); if (sha256(payload) !== args.sha256) { throw createSshError("server.protocol-error", new Error("local artifact sha256 mismatch")); } const progressName = args.progressName ?? path.basename(args.remotePath); + let lastError: unknown; for (let attempt = 0; attempt < 2; attempt += 1) { - args.onProgress?.({ - name: progressName, - phase: "uploading", - bytesDone: 0, - bytesTotal: payload.byteLength, - }); - await uploadFile(args.options, args.runner, args.localPath, args.remotePath, payload, { - executable: args.executable, - }); - args.onProgress?.({ - name: progressName, - phase: "uploading", - bytesDone: payload.byteLength, - bytesTotal: payload.byteLength, - }); - args.onProgress?.({ name: progressName, phase: "verifying" }); - const remoteSha = await remoteSha256(args.options, args.runner, args.remotePath); - if (remoteSha === args.sha256) return; + // Upload to a unique temp path in the same directory, then atomically + // rename it into place. `mv -f` over a file that a lingering OLD agent is + // still executing succeeds — the running process keeps the old inode while + // the name is repointed to the new one — so a stale remote agent never + // blocks reinstall with ETXTBSY ("Text file busy"). Same-dir rename keeps + // it on one filesystem, so the swap is atomic with no missing-file window. + const tmpRemotePath = `${args.remotePath}.tmp.${randomBytes(6).toString("hex")}`; + try { + args.onProgress?.({ + name: progressName, + phase: "uploading", + bytesDone: 0, + bytesTotal: payload.byteLength, + }); + await uploadFile(args.options, args.runner, args.localPath, tmpRemotePath, payload, { + executable: args.executable, + }); + // `sftp` exits 0 even when an individual `put` fails (a failed transfer + // is reported on stderr but never sets a nonzero exit code), so a + // transient upload error is invisible to uploadFile() — the temp file + // may be missing or truncated. The `mv` below would then throw + // ("no such file"), aborting the whole bootstrap. We therefore treat the + // entire upload→rename→verify sequence as one fallible attempt: any + // failure (missing temp, rename error, or sha mismatch) retries the full + // upload instead of propagating, restoring the pre-atomic-install + // resilience where the sha check alone gated correctness. + await runSsh( + args.options, + args.runner, + `mv -f ${quoteShellArg(tmpRemotePath)} ${quoteShellArg(args.remotePath)}`, + ); + args.onProgress?.({ + name: progressName, + phase: "uploading", + bytesDone: payload.byteLength, + bytesTotal: payload.byteLength, + }); + args.onProgress?.({ name: progressName, phase: "verifying" }); + const remoteSha = await remoteSha256(args.options, args.runner, args.remotePath); + if (remoteSha === args.sha256) return; + lastError = createSshError( + "server.protocol-error", + new Error("remote artifact sha256 mismatch"), + ); + } catch (error) { + lastError = error; + } + // Best-effort: drop a temp file orphaned by a failed attempt so retries + // (and future bootstraps) never accumulate `.tmp.` litter alongside + // the installed binary. + await runSsh(args.options, args.runner, `rm -f ${quoteShellArg(tmpRemotePath)}`).catch( + () => undefined, + ); } - throw createSshError("server.protocol-error", new Error("remote artifact sha256 mismatch")); + throw ( + lastError ?? + createSshError("server.protocol-error", new Error("remote artifact sha256 mismatch")) + ); } /** sftp put with cat-pipe fallback when sftp is unavailable on the remote. */ diff --git a/src/renderer/components/editor/preview/image-preview.tsx b/src/renderer/components/editor/preview/image-preview.tsx index 94b7c171..bed39750 100644 --- a/src/renderer/components/editor/preview/image-preview.tsx +++ b/src/renderer/components/editor/preview/image-preview.tsx @@ -38,6 +38,8 @@ import { useEffect, useLayoutEffect, useRef, useState } from "react"; import { useTranslation } from "react-i18next"; +import { MAX_READABLE_FILE_SIZE } from "../../../../shared/fs/defaults"; +import { ipcCallResult } from "../../../ipc/client"; import { buildWorkspaceUrl } from "../../../services/editor/preview/workspace-url"; import { useWorkspacesStore } from "../../../state/stores/workspaces"; import { relPath } from "../../../utils/path"; @@ -89,12 +91,24 @@ export function ImagePreview({ workspaceId, filePath, onNaturalSize }: ImagePrev } const url = buildWorkspaceUrl(workspaceId, rel); - return ; + return ( + + ); } interface ImageCanvasProps { url: string; alt: string; + /** Workspace owning the file — used to stat the file on a load error. */ + workspaceId: string; + /** Workspace-relative path — used to stat the file on a load error. */ + relPath: string; onNaturalSize?: (size: { w: number; h: number }) => void; } @@ -129,14 +143,16 @@ interface ZoomAnchor { * - File deleted/moved on disk after the tab opened (404 from protocol). * - Format the OS/Chromium can't decode (rare for the supported set). */ -function ImageCanvas({ url, alt, onNaturalSize }: ImageCanvasProps) { +function ImageCanvas({ url, alt, workspaceId, relPath, onNaturalSize }: ImageCanvasProps) { const { t } = useTranslation(); const containerRef = useRef(null); const imgRef = useRef(null); const [naturalSize, setNaturalSize] = useState<{ w: number; h: number } | null>(null); const [fitScale, setFitScale] = useState(1); const [userScale, setUserScale] = useState(1); - const [errored, setErrored] = useState(false); + // null = loaded OK. "generic" = decode/missing failure. "too_large" = the + // file exceeds the read cap (resolved via a stat round-trip in onError). + const [errorKind, setErrorKind] = useState<"generic" | "too_large" | null>(null); // Anchor for the in-flight zoom: applied in useLayoutEffect after the new // dimensions commit, then cleared. Held in a ref so the wheel callback @@ -226,8 +242,18 @@ function ImageCanvas({ url, alt, onNaturalSize }: ImageCanvasProps) { return () => el.removeEventListener("wheel", onWheel); }, []); - if (errored) { - return ; + if (errorKind) { + return ( + + ); } // Until the image loads we don't know its natural size yet, so render @@ -244,7 +270,7 @@ function ImageCanvas({ url, alt, onNaturalSize }: ImageCanvasProps) { ref={containerRef} className="app-scrollbar flex-1 min-h-0 overflow-auto bg-[var(--surface-backdrop-bg)]" > -
+
setErrored(true)} + onError={() => { + // A load failure on an over-cap file is a size rejection — the + // workspace protocol 404s reads above MAX_READABLE_FILE_SIZE — not + // a missing or undecodable image. stat to disambiguate so the user + // sees the real reason instead of a generic "could not load". + setErrorKind("generic"); + void ipcCallResult("fs", "stat", { workspaceId, relPath }).then((result) => { + if (result.ok && result.value.size > MAX_READABLE_FILE_SIZE) { + setErrorKind("too_large"); + } + }); + }} // maxWidth/maxHeight: 'none' override Tailwind Preflight's global // `img { max-width: 100%; height: auto; }`. Without this, our // explicit width is silently capped at the inner wrapper's width diff --git a/src/renderer/components/workbench/sidebar.tsx b/src/renderer/components/workbench/sidebar.tsx index 4c9801e3..411a6b79 100644 --- a/src/renderer/components/workbench/sidebar.tsx +++ b/src/renderer/components/workbench/sidebar.tsx @@ -629,6 +629,7 @@ export function Sidebar({ function ConnectionStatusDot({ status }: { status: WorkspaceConnectionStatus }) { const { t } = useTranslation(); const label = t("sidebar.ssh_status", { status }); + const isTransient = status === "connecting" || status === "reconnecting"; return ( ); diff --git a/src/renderer/components/workspace/add-workspace/ssh-connection-list-view.tsx b/src/renderer/components/workspace/add-workspace/ssh-connection-list-view.tsx index 71b7e938..ebd7bc69 100644 --- a/src/renderer/components/workspace/add-workspace/ssh-connection-list-view.tsx +++ b/src/renderer/components/workspace/add-workspace/ssh-connection-list-view.tsx @@ -2,6 +2,7 @@ import { ChevronRight, LoaderCircle, Plus, Server, Star, Trash2 } from "lucide-r import { useCallback, useEffect, useState } from "react"; import { useTranslation } from "react-i18next"; import type { ConnectionProfile } from "../../../../shared/types/entry-points"; +import type { SshBrowseProgressEvent } from "../../../../shared/types/workspace"; import { listConnectionProfiles, openSshBrowseSession, @@ -11,9 +12,11 @@ import { } from "../../../services/workspace"; import { EmptyState } from "../../ui/empty-state"; import { Skeleton, SkeletonLine } from "../../ui/skeleton"; +import { BootstrapProgressBar } from "../bootstrap-progress-bar"; import { ErrorNotice } from "./error-notice"; import { formatProfileSubtitle } from "./ssh-helpers"; import type { SshBrowseSession, SshConnectionListViewProps } from "./types"; +import { useBrowseProgress } from "./use-browse-progress"; // --------------------------------------------------------------------------- // SshConnectionListView — T4 implementation @@ -31,6 +34,13 @@ export function SshConnectionListView({ const [connectingId, setConnectingId] = useState(null); const [errorId, setErrorId] = useState(null); const [errorHuman, setErrorHuman] = useState(null); + // Agent-bootstrap progress for the in-flight connect (keyed by a client-minted + // progressId, since no sessionId/workspaceId exists yet). + const { + progress: browseProgress, + begin: beginProgress, + clear: clearProgress, + } = useBrowseProgress(); const loadProfiles = useCallback((): (() => void) => { let cancelled = false; @@ -76,6 +86,7 @@ export function SshConnectionListView({ port: profile.port, identityFile: profile.identityFile ?? undefined, authMode: profile.authMode as "interactive" | "key-only", + progressId: beginProgress(), }); if (!result.ok) { // User cancelled the SSH auth prompt — silent stop, no error banner. @@ -107,6 +118,7 @@ export function SshConnectionListView({ onConnected(session); } finally { setConnectingId(null); + clearProgress(); } } @@ -166,7 +178,9 @@ export function SshConnectionListView({ {favorites.length > 0 ? (
- {t("workspace.favorites")} + + {t("workspace.favorites")} +
    {favorites.map((profile) => ( @@ -174,6 +188,7 @@ export function SshConnectionListView({ key={profile.id} profile={profile} connecting={connectingId === profile.id} + progress={connectingId === profile.id ? browseProgress : null} disabled={busy} errorHuman={errorId === profile.id ? (errorHuman ?? undefined) : undefined} onConnect={() => void connectProfile(profile)} @@ -187,9 +202,14 @@ export function SshConnectionListView({ {/* Recent section */} {recents.length > 0 ? ( -
    0 ? "mt-3" : undefined}> +
    0 ? "mt-3" : undefined} + >
    - {t("workspace.recent")} + + {t("workspace.recent")} +
      {recents.map((profile) => ( @@ -197,6 +217,7 @@ export function SshConnectionListView({ key={profile.id} profile={profile} connecting={connectingId === profile.id} + progress={connectingId === profile.id ? browseProgress : null} disabled={busy} errorHuman={errorId === profile.id ? (errorHuman ?? undefined) : undefined} onConnect={() => void connectProfile(profile)} @@ -233,6 +254,8 @@ export function SshConnectionListView({ interface ConnectionProfileRowProps { readonly profile: ConnectionProfile; readonly connecting: boolean; + /** Live bootstrap progress for this row while connecting (null until events arrive). */ + readonly progress: SshBrowseProgressEvent | null; readonly disabled: boolean; readonly errorHuman: string | undefined; readonly onConnect: () => void; @@ -243,6 +266,7 @@ interface ConnectionProfileRowProps { function ConnectionProfileRow({ profile, connecting, + progress, disabled, errorHuman, onConnect, @@ -294,12 +318,30 @@ function ConnectionProfileRow({