Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
69 changes: 50 additions & 19 deletions cmd/agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ func main() {
// classifier text that also arrive on the same stderr stream.
agentLogger := slog.New(slog.NewJSONHandler(os.Stderr, nil)).With("src", "agent-log")

root := rootPathFromArgv(os.Args)
root, idleWatchdog := parseRunArgs(os.Args)
if root == "" {
agentLogger.Error("Usage: agent <rootPath>")
agentLogger.Error("Usage: agent [--idle-watchdog] <rootPath>")
os.Exit(2)
}

Expand Down Expand Up @@ -127,7 +127,13 @@ func main() {
defer fsys.Close()
defer git.Close()
defer lsp.Close()
defer pty.Close()
// PTY cleanup runs as a shutdown hook, not a defer: every termination path
// (EOF, SIGTERM, idle watchdog) ends in drainAndExit → os.Exit, which skips
// defers. Pdeathsig (Linux only) and the master-fd-close SIGHUP are not
// enough on their own — a non-Linux remote has no Pdeathsig, and a child
// that ignores SIGHUP would be orphaned. The hook sends SIGKILL to each PTY
// process group, so children are reaped deterministically on every OS.
host.RegisterShutdownHook(pty.Close)
if hooksrv != nil {
// SIGTERM 경로는 os.Exit가 defer를 우회하므로 shutdown hook으로 등록한다.
// hookserver는 /tmp/nexus-h-*.sock 파일을 생성하므로 정리하지 않으면
Expand All @@ -137,12 +143,21 @@ func main() {
}
host.InstallSigtermHandler()

// Idle watchdog limit. Advertised to the client in the Ready frame so it
// pings every limit/6; only enabled for SSH (idleWatchdog flag), where a
// vanished client may leave the connection lingering without stdin EOF. 0
// when disabled — the client reads that as "do not ping".
idleWatchdogMs := 0
if idleWatchdog {
idleWatchdogMs = int((90 * time.Second) / time.Millisecond)
}

// Ready frame must reach the client before any other output so the
// channel handshake on the TS side can settle. A write failure here
// is unrecoverable — without a Ready, the client will time out.
// methods 목록과 heartbeat 간격(10s)을 함께 전달해 클라이언트가 pull 기반으로
// hook.getInfo를 호출할 수 있음을 알린다.
if err := host.WriteFrame(proto.Ready(d.Methods(), 10_000)); err != nil {
// The methods list, the heartbeat interval (10s), and the idle watchdog limit
// are sent together so the client can decide on pull-based hook.getInfo calls
// and on keepalive pings.
if err := host.WriteFrame(proto.Ready(d.Methods(), 10_000, idleWatchdogMs)); err != nil {
agentLogger.Error("failed to write ready frame", "err", err)
os.Exit(1)
}
Expand All @@ -151,12 +166,15 @@ func main() {
// 일치해야 한다. ctx 취소(드레인) 시 자동 정지한다.
host.StartHeartbeat(10 * time.Second)

// Idle watchdog: self-terminate if the client sends nothing for 60s. The
// client pings every ~20s (KEEPALIVE_PING_INTERVAL_MS in pipe.ts), so a
// healthy idle session resets the timer ~3× per window; only a vanished
// client (half-open TCP, hung process, sleep) with no stdin EOF trips it,
// preventing an orphaned remote agent from holding its binary.
host.StartIdleWatchdog(60 * time.Second)
// Idle watchdog (SSH only): self-terminate if the client sends nothing for
// the limit. The client pings every limit/6 (derived from the advertised
// idleWatchdogMs in pipe.ts), so a healthy idle session resets the timer ~6×
// per window; only a vanished client (half-open TCP, hung process, sleep)
// with no stdin EOF trips it, preventing an orphaned remote agent from
// holding its binary. Disabled locally, where parent death arrives as EOF.
if idleWatchdog {
host.StartIdleWatchdog(90 * time.Second)
}

host.Run()
}
Expand Down Expand Up @@ -190,14 +208,27 @@ func newHookGetInfoHandler(hs hookInfoProvider) dispatch.Handler {
}
}

// rootPathFromArgv extracts the workspace root from argv. We accept
// exactly one positional argument and return "" when it is missing so
// the caller can print usage and exit non-zero.
func rootPathFromArgv(argv []string) string {
if len(argv) > 1 {
return argv[1]
// parseRunArgs extracts the workspace root and option flags from argv for the
// long-lived stdio agent (after the hook/askpass subcommands have been ruled
// out). The first non-flag positional is the root; "" when absent so the caller
// can print usage and exit non-zero.
//
// --idle-watchdog is set only by the SSH remote launch command
// (buildRemoteAgentCommand in ssh-bootstrap), never by the local launch. It is
// what gates the idle watchdog so a local agent — whose parent death already
// arrives as stdin EOF (plus Pdeathsig on Linux) — never self-terminates on a
// transient main-thread stall or a laptop wake.
func parseRunArgs(argv []string) (root string, idleWatchdog bool) {
for _, arg := range argv[1:] {
if arg == "--idle-watchdog" {
idleWatchdog = true
continue
}
if root == "" {
root = arg
}
}
return ""
return root, idleWatchdog
}

// askpassExitFromArgv detects both the explicit `agent --askpass <socket>`
Expand Down
26 changes: 26 additions & 0 deletions cmd/agent/main_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,29 @@ func TestServerNDJSONAndSIGTERM(t *testing.T) {
t.Fatal("server did not exit after SIGTERM")
}
}

// TestParseRunArgs is a table-driven check of parseRunArgs. It covers a bare
// root, the --idle-watchdog flag placed before and after the root, a missing
// root, and the flag given with no root at all.
func TestParseRunArgs(t *testing.T) {
	type scenario struct {
		name         string
		argv         []string
		wantRoot     string
		wantWatchdog bool
	}

	table := []scenario{
		{
			name:         "root only",
			argv:         []string{"agent", "/repo"},
			wantRoot:     "/repo",
			wantWatchdog: false,
		},
		{
			name:         "watchdog before root",
			argv:         []string{"agent", "--idle-watchdog", "/repo"},
			wantRoot:     "/repo",
			wantWatchdog: true,
		},
		{
			name:         "watchdog after root",
			argv:         []string{"agent", "/repo", "--idle-watchdog"},
			wantRoot:     "/repo",
			wantWatchdog: true,
		},
		{
			name:         "missing root",
			argv:         []string{"agent"},
			wantRoot:     "",
			wantWatchdog: false,
		},
		{
			name:         "watchdog only",
			argv:         []string{"agent", "--idle-watchdog"},
			wantRoot:     "",
			wantWatchdog: true,
		},
	}

	for i := range table {
		sc := table[i]
		t.Run(sc.name, func(t *testing.T) {
			gotRoot, gotWatchdog := parseRunArgs(sc.argv)
			// Both fields are checked independently so one failure does not
			// hide the other.
			if gotRoot != sc.wantRoot {
				t.Errorf("root = %q, want %q", gotRoot, sc.wantRoot)
			}
			if gotWatchdog != sc.wantWatchdog {
				t.Errorf("idleWatchdog = %v, want %v", gotWatchdog, sc.wantWatchdog)
			}
		})
	}
}
13 changes: 12 additions & 1 deletion internal/proto/proto.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,12 +103,20 @@ type Response struct {
//
// HeartbeatIntervalMs is the interval at which the server will emit
// "agent.heartbeat" events. A value of 0 means heartbeat is disabled.
//
// IdleWatchdogMs is the agent's idle-watchdog limit in milliseconds: if no
// inbound line arrives within it, the agent self-terminates. A value of 0 means
// the watchdog is disabled (local agents), which is the client's signal NOT to
// send keepalive pings. When positive, the client pings every IdleWatchdogMs/6
// so a live-but-idle session keeps resetting the limit. Tying the client's ping
// behavior to this single advertised value keeps the two ends from drifting.
type ReadyFrame struct {
	// Type is the frame's "type" field on the wire.
	Type string `json:"type"`
	// ProtocolVersion is serialized as "protocolVersion".
	ProtocolVersion string `json:"protocolVersion"`
	// ServerVersion is serialized as "serverVersion".
	ServerVersion string `json:"serverVersion"`
	// Methods lists the RPC method names the server has registered.
	Methods []string `json:"methods"`
	// HeartbeatIntervalMs is the interval, in milliseconds, at which the
	// server emits "agent.heartbeat" events; 0 means heartbeat is disabled.
	HeartbeatIntervalMs int `json:"heartbeatIntervalMs"`
	// IdleWatchdogMs is the idle-watchdog limit in milliseconds; 0 means the
	// watchdog is disabled and the client should not send keepalive pings.
	IdleWatchdogMs int `json:"idleWatchdogMs"`
}

// EventFrame is a server → client broadcast frame. It deliberately has no id:
Expand All @@ -123,7 +131,9 @@ type EventFrame struct {
// methods is the list of RPC method names the server has registered;
// an empty (non-nil) slice is valid. heartbeatIntervalMs is the
// advertised heartbeat interval in milliseconds; 0 means disabled.
func Ready(methods []string, heartbeatIntervalMs int) ReadyFrame {
// idleWatchdogMs is the advertised idle-watchdog limit in milliseconds;
// 0 means the agent runs no watchdog (and the client should not ping).
func Ready(methods []string, heartbeatIntervalMs int, idleWatchdogMs int) ReadyFrame {
if methods == nil {
methods = []string{}
}
Expand All @@ -133,6 +143,7 @@ func Ready(methods []string, heartbeatIntervalMs int) ReadyFrame {
ServerVersion: ServerVersion,
Methods: methods,
HeartbeatIntervalMs: heartbeatIntervalMs,
IdleWatchdogMs: idleWatchdogMs,
}
}

Expand Down
10 changes: 5 additions & 5 deletions internal/proto/proto_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -48,27 +48,27 @@ func TestSuccessCoercesNilResultToExplicitNull(t *testing.T) {

func TestReadyFrameIncludesVersions(t *testing.T) {
// methods 슬라이스와 heartbeat 간격을 함께 전달한 경우 wire 포맷 확인.
data, err := json.Marshal(Ready([]string{"fs.readFile", "git.log"}, 10_000))
data, err := json.Marshal(Ready([]string{"fs.readFile", "git.log"}, 10_000, 90_000))
if err != nil {
t.Fatalf("Marshal ready: %v", err)
}
want := `{"type":"ready","protocolVersion":"1","serverVersion":"0.1.0","methods":["fs.readFile","git.log"],"heartbeatIntervalMs":10000}`
want := `{"type":"ready","protocolVersion":"1","serverVersion":"0.1.0","methods":["fs.readFile","git.log"],"heartbeatIntervalMs":10000,"idleWatchdogMs":90000}`
if string(data) != want {
t.Fatalf("ready frame = %s, want %s", data, want)
}
}

func TestReadyFrameNilMethodsCoercedToEmptySlice(t *testing.T) {
// nil methods는 빈 슬라이스로 변환되어 JSON "methods":[] 로 직렬화된다.
f := Ready(nil, 0)
f := Ready(nil, 0, 0)
if f.Methods == nil {
t.Fatal("Ready(nil, 0).Methods must not be nil — want empty slice")
t.Fatal("Ready(nil, 0, 0).Methods must not be nil — want empty slice")
}
data, err := json.Marshal(f)
if err != nil {
t.Fatalf("Marshal: %v", err)
}
want := `{"type":"ready","protocolVersion":"1","serverVersion":"0.1.0","methods":[],"heartbeatIntervalMs":0}`
want := `{"type":"ready","protocolVersion":"1","serverVersion":"0.1.0","methods":[],"heartbeatIntervalMs":0,"idleWatchdogMs":0}`
if string(data) != want {
t.Fatalf("ready frame = %s, want %s", data, want)
}
Expand Down
71 changes: 57 additions & 14 deletions internal/stdioserver/host.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,13 @@ import (
// process alive past the parent's expected shutdown window.
const forceExitAfter = 75 * time.Millisecond

// idleWatchdogExitCode is the process exit code used when the idle watchdog
// reaps the agent. It is deliberately non-zero so the client reconnects rather
// than treating the close as a clean shutdown (see StartIdleWatchdog). 75 is
// EX_TEMPFAIL from sysexits.h — "temporary failure, the user is invited to
// retry" — which matches the intent exactly.
const idleWatchdogExitCode = 75

// Host owns the stdio NDJSON server lifecycle. One Host per process —
// stdin / stdout are not multiplexable, and the SIGTERM handler is a
// process-global side effect.
Expand Down Expand Up @@ -64,10 +71,18 @@ type Host struct {
hooksMu sync.Mutex
shutdownHooks []func()

// lastInbound is the UnixNano timestamp of the most recently received
// request line, read by the idle watchdog (StartIdleWatchdog) to detect a
// vanished client. Written from Run's single reader goroutine, read from
// the watchdog goroutine — atomic keeps that race-free.
// startMono anchors the monotonic clock for idle accounting. lastInbound is
// stored as a duration relative to this anchor (not a wall-clock UnixNano),
// so the idle watchdog is immune to wall-clock jumps — NTP steps on the
// remote, or a laptop waking from sleep with a local agent. time.Since on a
// Time that carries a monotonic reading (which startMono does) uses the
// monotonic clock; a bare time.Unix value would silently fall back to wall.
startMono time.Time

// lastInbound is time.Since(startMono) in nanoseconds at the most recently
// received request line, read by the idle watchdog (StartIdleWatchdog) to
// detect a vanished client. Written from Run's single reader goroutine, read
// from the watchdog goroutine — atomic keeps that race-free.
lastInbound atomic.Int64

// exit terminates the process. Defaults to os.Exit; tests inject a fake so
Expand All @@ -92,9 +107,23 @@ func New(d *dispatch.Dispatcher, in io.Reader, out io.Writer, logger *slog.Logge
cancel: cancel,
accepting: true,
exit: os.Exit,
startMono: time.Now(),
}
}

// stampInbound marks the current moment as the most recent inbound activity.
// It stores the time elapsed since startMono, in nanoseconds, into
// lastInbound. This is the only place that value is encoded, so readers of
// lastInbound can rely on that unit.
func (h *Host) stampInbound() {
	sinceStart := time.Since(h.startMono)
	h.lastInbound.Store(sinceStart.Nanoseconds())
}

// idleElapsed returns the time that has passed since the last recorded inbound
// line. Both operands are offsets from startMono, so the result is a
// difference of elapsed durations rather than of wall-clock timestamps.
func (h *Host) idleElapsed() time.Duration {
	// Read the clock before loading the stamp, matching the original order.
	sinceStart := time.Since(h.startMono)
	lastSeen := time.Duration(h.lastInbound.Load())
	return sinceStart - lastSeen
}

// WriteFrame serializes one frame as NDJSON onto `out`. Used by the
// caller to emit the boot Ready frame before Run begins; internal
// response writes use the same path.
Expand Down Expand Up @@ -146,7 +175,7 @@ func (h *Host) Run() {
continue
}
// Any inbound line proves the client is alive — reset the idle watchdog.
h.lastInbound.Store(time.Now().UnixNano())
h.stampInbound()
if !h.isAccepting() {
continue
}
Expand Down Expand Up @@ -205,24 +234,38 @@ func (h *Host) StartHeartbeat(interval time.Duration) {
// `ping` so a healthy but idle session keeps resetting lastInbound; only a
// genuinely absent client trips the limit.
//
// `limit` must be comfortably larger than the client's keepalive ping interval
// (KEEPALIVE_PING_INTERVAL_MS in pipe.ts) so normal jitter never false-fires.
// A non-positive limit disables the watchdog. Call before Run(); the goroutine
// stops when h.ctx is cancelled (drain).
// The client pings every limit/6 (it derives that from the idleWatchdogMs the
// agent advertises in its Ready frame), so a healthy session lands ~6 pings per
// window and tolerates several missed ticks before the limit trips — chosen
// because a false fire kills live PTY children, while a slow reap merely lets an
// orphan linger. A non-positive limit disables the watchdog. Call before Run();
// the goroutine stops when h.ctx is cancelled (drain).
func (h *Host) StartIdleWatchdog(limit time.Duration) {
if limit <= 0 {
return
}
h.lastInbound.Store(time.Now().UnixNano())
h.stampInbound()
// Check at limit/6 (independent of the old limit/3) so raising the limit
// keeps the kill window tight: silence trips between limit and limit+limit/6.
check := limit / 6
if check <= 0 {
check = limit
}
go func() {
ticker := time.NewTicker(limit / 3)
ticker := time.NewTicker(check)
defer ticker.Stop()
for {
select {
case <-ticker.C:
last := time.Unix(0, h.lastInbound.Load())
if time.Since(last) >= limit {
h.drainAndExit(0)
if h.idleElapsed() >= limit {
// Exit non-zero so the client distinguishes a watchdog reap
// from a clean shutdown: its handleClose treats code 0 as a
// terminal exit (no reconnect) but reconnects on any other
// code. This only reaches a client in the false-positive case
// (client alive but stalled past the limit) — exactly when an
// automatic reconnect is the desired recovery. When the client
// is genuinely gone, no one observes the code.
h.drainAndExit(idleWatchdogExitCode)
return
}
case <-h.ctx.Done():
Expand Down
8 changes: 5 additions & 3 deletions internal/stdioserver/host_idle_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,10 @@ func TestIdleWatchdogExitsWhenClientVanishes(t *testing.T) {

select {
case code := <-exited:
if code != 0 {
t.Fatalf("exit code = %d, want 0", code)
// Non-zero (EX_TEMPFAIL) so the client reconnects instead of treating
// the close as a clean shutdown.
if code != idleWatchdogExitCode {
t.Fatalf("exit code = %d, want %d", code, idleWatchdogExitCode)
}
case <-time.After(2 * time.Second):
t.Fatal("idle watchdog did not terminate within 2s of client silence")
Expand All @@ -63,7 +65,7 @@ func TestIdleWatchdogStaysAliveWithTraffic(t *testing.T) {
for {
select {
case <-tick.C:
host.lastInbound.Store(time.Now().UnixNano())
host.stampInbound()
case code := <-exited:
t.Fatalf("watchdog fired during active traffic (code=%d)", code)
case <-stop:
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"name": "nexus-code",
"productName": "NexusCode",
"version": "0.5.2",
"version": "0.5.3",
"description": "Multi-workspace VSCode-style editor for macOS. Monaco editor + terminal in one window.",
"license": "MIT",
"private": true,
Expand Down
17 changes: 15 additions & 2 deletions src/main/features/ssh/ipc.ts
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,13 @@ function mapToBrowseError(error: unknown): Error {
}

function createSshErrorObject(code: SshErrorCode): Error & { code: SshErrorCode } {
const err = new Error(messageForSshErrorCode(code)) as Error & { code: SshErrorCode };
// The `.code` property does not survive Electron's IPC error serialization
// (only name/message/stack cross the boundary), so we also embed the code in
// the message text. The renderer classifies the failure by matching the code
// substring in error.message (see humanizeSshError / extractSshErrorKind).
const err = new Error(`${messageForSshErrorCode(code)} [${code}]`) as Error & {
code: SshErrorCode;
};
err.code = code;
return err;
}
Expand All @@ -457,7 +463,12 @@ function sshErrorCodeFromError(error: unknown): SshErrorCode | undefined {
if (typeof error !== "object" || error === null || !("code" in error)) {
return undefined;
}
const parsed = SshErrorCodeSchema.safeParse((error as { code?: unknown }).code);
const rawCode = (error as { code?: unknown }).code;
// Agent filesystem failures surface domain codes (NOT_FOUND, …) that are not
// SSH codes. Map the ones with dedicated UX so a missing path is reported as
// "path not found" instead of the generic ssh.unknown.
if (rawCode === "NOT_FOUND") return "ssh.path-not-found";
const parsed = SshErrorCodeSchema.safeParse(rawCode);
return parsed.success ? parsed.data : undefined;
}

Expand All @@ -471,6 +482,8 @@ function messageForSshErrorCode(code: SshErrorCode): string {
return "SSH authentication cancelled";
case "ssh.session-expired":
return "SSH browse session expired";
case "ssh.path-not-found":
return "Remote path not found";
case "server.spawn-failed":
return "Remote agent failed to start";
case "server.protocol-error":
Expand Down
Loading