From 231fa45ded7415d0a78a32af5850fb370495958e Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Thu, 11 Jun 2026 07:58:04 +0000 Subject: [PATCH 1/2] feat: restart a session's agent in place There was no way to recycle a wedged or stale agent without losing the session: stop + dispatch creates a new identity and drops the provider conversation. Add Service.Restart: stop the running backend session (soft close, the existing HUP-TERM-KILL escalation, record kept), then resume it under the same name - same id, label, store record, and the provider's resume args (claude/omp/opencode -c and equivalents) so the agent picks its conversation back up. A session that is already stopped simply resumes (idempotent). Surfaces: - TUI: the Ctrl+X confirm dialog gains an `r` answer ("y / restart r / N"), with the same id-snapshot semantics as stop (F29) so a refresh reorder cannot retarget it - CLI: `uam restart <id>` Known limits, accepted: the emulator scrollback resets (new host) and an attached client sees "session ended" and re-attaches; hermes has no resume mechanism, so its restart re-sends the original prompt - the same behavior its resume always had. 
--- README.md | 3 +- internal/app/app.go | 26 +++++++++++- internal/app/rename_input_test.go | 50 +++++++++++++++++++++++ internal/app/service.go | 17 ++++++++ internal/app/service_test.go | 66 +++++++++++++++++++++++++++++++ internal/cli/cli.go | 13 ++++++ internal/cli/cli_test.go | 30 +++++++++++++- 7 files changed, 201 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 77ec2b6..be28d4c 100644 --- a/README.md +++ b/README.md @@ -93,6 +93,7 @@ uam peek uam attach uam last uam stop # kill the session, keep record +uam restart # stop the agent and resume it in place uam rm # kill the session and remove record uam kill-all # stop every managed session uam version @@ -111,7 +112,7 @@ uam version | `Space` | Toggle peek panel | | `Ctrl+T` | Pin selected session | | `Ctrl+R` | Rename selected session | -| `Ctrl+X` | Stop or remove the selected session with confirmation | +| `Ctrl+X` | Stop, restart, or remove the selected session with confirmation | | `Ctrl+S` | Toggle group-by-directory | | `Shift+↑/↓` | Reorder rows | | `e` | Open the guided dispatch wizard | diff --git a/internal/app/app.go b/internal/app/app.go index f3d25a8..7f189b6 100644 --- a/internal/app/app.go +++ b/internal/app/app.go @@ -389,6 +389,12 @@ func (m Model) handleModalKey(msg tea.KeyMsg, key string) (bool, tea.Model, tea. m.confirmStopID = "" return true, m, m.stopTargetCmd(id, true) } + if key == "r" { + m.confirmStop = false + id := m.confirmStopID + m.confirmStopID = "" + return true, m, m.restartTargetCmd(id) + } if key == "n" || key == "esc" { m.confirmStop = false m.confirmStopID = "" @@ -1073,6 +1079,22 @@ func (m Model) stopTargetCmd(id string, remove bool) tea.Cmd { return sessionsLoadedMsg{err: err} } } + +// restartTargetCmd restarts the session with the snapshotted id (same F29 +// fallback as stopTargetCmd): the agent process is stopped and resumed in +// place, keeping the session's name and provider conversation. 
+func (m Model) restartTargetCmd(id string) tea.Cmd { + sess, ok := m.sessionByID(id) + if !ok { + return nil + } + return func() tea.Msg { + if err := m.service.Restart(context.Background(), sess.ID); err != nil { + return sessionsLoadedMsg{err: err} + } + return m.loadSessionsCmd()() + } +} func (m Model) pinSelectedCmd() tea.Cmd { sess, ok := m.selectedSession() if !ok { @@ -1467,7 +1489,7 @@ func (m Model) renderHelp() string { rows := []string{ "↑/↓ move Enter/→ attach Space peek", "Tab cycle agent Ctrl+T pin Ctrl+R rename", - "Ctrl+X stop/remove Ctrl+S group-by-dir", + "Ctrl+X stop/restart/remove Ctrl+S group-by-dir", "e new session Esc quit", "in session: ← detach (when input empty) Ctrl+B d detach", "dispatch: @agent:alias #name prompt (alias, name & prompt optional)", @@ -1485,7 +1507,7 @@ func (m Model) renderConfirm() string { name := firstNonEmpty(sess.DisplayName, sess.ID, "session") return "\n " + sectionStyle.Render("Stop session") + "\n " + hintStyle.Render("Stop and remove ") + titleStyle.Render(name) + hintStyle.Render("?") + - " " + brandStyle.Render("y") + hintStyle.Render(" / ") + titleStyle.Render("N") + "\n" + " " + brandStyle.Render("y") + hintStyle.Render(" / restart ") + brandStyle.Render("r") + hintStyle.Render(" / ") + titleStyle.Render("N") + "\n" } func (m Model) renderWizard() string { diff --git a/internal/app/rename_input_test.go b/internal/app/rename_input_test.go index 5ad6565..f7b88b0 100644 --- a/internal/app/rename_input_test.go +++ b/internal/app/rename_input_test.go @@ -199,3 +199,53 @@ func TestStopConfirmTargetsOriginalSessionAfterReorder(t *testing.T) { t.Fatalf("stop targeted the reordered session beta instead of alpha") } } + +// The stop-confirm dialog also offers restart: `r` stops the agent process +// and resumes it in place, targeting the originally-confirmed session (same +// F29 snapshot semantics as stop). 
+func TestStopConfirmRestartsOriginalSessionAfterReorder(t *testing.T) { + live := []adapter.Session{ + {ID: "alpha", AgentType: "fake", DisplayName: "alpha", SessionName: "uam-fake-alpha", State: adapter.Active, ProcAlive: adapter.Alive, CreatedAt: time.Now()}, + {ID: "beta", AgentType: "fake", DisplayName: "beta", SessionName: "uam-fake-beta", State: adapter.Active, ProcAlive: adapter.Alive, CreatedAt: time.Now()}, + } + dir := t.TempDir() + st, err := store.Open(filepath.Join(dir, "sessions.json")) + if err != nil { + t.Fatal(err) + } + fake := &svcFakeAdapter{name: "fake", available: true, sessions: live, stopRemoves: true} + m := NewWithDeps(st, adapter.NewRegistry([]adapter.AgentAdapter{fake})) + m.sessions = append([]adapter.Session(nil), live...) + m.selected = 0 + if err := st.Update(func(cfg *store.Config) error { + cfg.Sessions[store.Key("fake", "alpha")] = RecordFromSession(live[0], store.ModeYolo) + cfg.Sessions[store.Key("fake", "beta")] = RecordFromSession(live[1], store.ModeYolo) + return nil + }); err != nil { + t.Fatal(err) + } + + // Open the confirm dialog on alpha (index 0), then a refresh reorders. 
+ if handled, _ := m.handleActionKey("ctrl+x"); !handled || !m.confirmStop { + t.Fatal("ctrl+x should open confirm") + } + m.sessions = []adapter.Session{live[1], live[0]} + m.selected = 0 + + _, model, cmd := m.handleModalKey(tea.KeyMsg{Type: tea.KeyRunes, Runes: []rune("r")}, "r") + m = model.(Model) + if cmd == nil { + t.Fatal("expected a restart command from confirm r") + } + if m.confirmStop { + t.Fatal("confirm dialog must close after r") + } + cmd() + + if !fake.stopped { + t.Fatal("restart must stop the live agent process") + } + if fake.resumed == nil || fake.resumed.ID != "alpha" { + t.Fatalf("restart must resume the originally-confirmed session alpha, got %+v", fake.resumed) + } +} diff --git a/internal/app/service.go b/internal/app/service.go index f3e6c83..c143cfb 100644 --- a/internal/app/service.go +++ b/internal/app/service.go @@ -549,6 +549,23 @@ func (s *Service) AttachSpec(ctx context.Context, id string) (adapter.AttachSpec return a.Attach(sess.ID) } +// Restart replaces a session's agent process while keeping its identity: a +// running backend session is stopped (soft close, record kept), then resumed +// under the same name with the provider's resume args so the agent picks its +// conversation back up. A session that is already stopped simply resumes. +func (s *Service) Restart(ctx context.Context, id string) error { + sess, _, err := s.Find(ctx, id) + if err != nil { + return err + } + if sess.ProcAlive == adapter.Alive { + if err := s.Stop(ctx, id, false); err != nil { + return err + } + } + return s.ResumeBackground(ctx, id) +} + // ResumeBackground restarts a stopped session's backend session without // attaching to it. It is a no-op when the session is already running. 
func (s *Service) ResumeBackground(ctx context.Context, id string) error { diff --git a/internal/app/service_test.go b/internal/app/service_test.go index 55f8cba..6321421 100644 --- a/internal/app/service_test.go +++ b/internal/app/service_test.go @@ -29,6 +29,10 @@ type svcFakeAdapter struct { // F12: simulate a per-adapter List failure so liveSessions can be tested for // logging-then-continue (one bad adapter must not blank the dashboard). listErr error + // stopRemoves makes Stop drop the live sessions, mirroring the real + // backend where Kill returns only after the host is fully gone — so a + // restart's resume step sees the session as dead. + stopRemoves bool } func (f *svcFakeAdapter) Name() string { return f.name } @@ -64,6 +68,9 @@ func (f *svcFakeAdapter) Attach(id string) (adapter.AttachSpec, error) { } func (f *svcFakeAdapter) Stop(ctx adapter.Context, id string) error { f.stopped = true + if f.stopRemoves { + f.sessions = nil + } return f.stopErr } func (f *svcFakeAdapter) HasSession(ctx adapter.Context, id string) bool { return f.alive } @@ -639,6 +646,65 @@ func TestResumeBackgroundClearsClosedStatus(t *testing.T) { } } +// Restart replaces a live session's agent process in place: stop the backend +// session, then resume it under the same identity (id, session name, record) +// with the provider's resume args. 
+func TestRestartStopsThenResumesLiveSession(t *testing.T) { + dir := t.TempDir() + st, err := store.Open(filepath.Join(dir, "sessions.json")) + if err != nil { + t.Fatal(err) + } + fake := &svcFakeAdapter{name: "fake", available: true, stopRemoves: true} + svc := NewService(st, adapter.NewRegistry([]adapter.AgentAdapter{fake})) + live, err := svc.DispatchNamed(context.Background(), "fake", "tracker", "hello", "/tmp", "yolo") + if err != nil { + t.Fatal(err) + } + fake.sessions = []adapter.Session{live} + if err := svc.Restart(context.Background(), "12345678"); err != nil { + t.Fatalf("Restart: %v", err) + } + if !fake.stopped { + t.Fatal("restart must stop the live backend session first") + } + if fake.resumed == nil { + t.Fatal("restart must resume the session after stopping it") + } + if fake.resumed.ID != "12345678" || fake.resumed.SessionName != "uam-fake-12345678" { + t.Fatalf("restart must keep the session identity, resumed %+v", fake.resumed) + } + cfg, _ := st.Load() + if cfg.Sessions[store.Key("fake", "12345678")].Status != store.StatusActive { + t.Fatalf("record must stay active after restart, got %q", cfg.Sessions[store.Key("fake", "12345678")].Status) + } +} + +// Restarting a session that is already stopped skips the stop and just +// resumes it — an idempotent restart. +func TestRestartOfStoppedSessionJustResumes(t *testing.T) { + dir := t.TempDir() + st, err := store.Open(filepath.Join(dir, "sessions.json")) + if err != nil { + t.Fatal(err) + } + fake := &svcFakeAdapter{name: "fake", available: true} + svc := NewService(st, adapter.NewRegistry([]adapter.AgentAdapter{fake})) + if _, err := svc.DispatchNamed(context.Background(), "fake", "tracker", "hello", "/tmp", "yolo"); err != nil { + t.Fatal(err) + } + // No live session listed: the agent already exited. 
+ if err := svc.Restart(context.Background(), "12345678"); err != nil { + t.Fatalf("Restart: %v", err) + } + if fake.stopped { + t.Fatal("restart of a stopped session must not call Stop") + } + if fake.resumed == nil { + t.Fatal("restart of a stopped session must resume it") + } +} + func captureStdout(t *testing.T, fn func()) string { t.Helper() old := os.Stdout diff --git a/internal/cli/cli.go b/internal/cli/cli.go index f7439e5..2f738bd 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -56,6 +56,7 @@ func Usage() { fmt.Fprintln(os.Stderr, " uam ls [--json]") fmt.Fprintln(os.Stderr, " uam peek ") fmt.Fprintln(os.Stderr, " uam stop ") + fmt.Fprintln(os.Stderr, " uam restart stop the agent and resume it in place") fmt.Fprintln(os.Stderr, " uam rm ") fmt.Fprintln(os.Stderr, " uam kill-all stop every managed session") fmt.Fprintln(os.Stderr, " uam notify-closed (internal: flag a record user-closed)") @@ -103,6 +104,8 @@ func runCommand(ctx context.Context, svc *app.Service, args []string, runTUI fun return runPeek(ctx, svc, args[1:]) case "stop", "rm": return runStop(ctx, svc, args[0], args[1:]) + case "restart": + return runRestart(ctx, svc, args[1:]) case "notify-closed": return runNotifyClosed(svc, args[1:]) case "kill-all": @@ -157,6 +160,16 @@ func runStop(ctx context.Context, svc *app.Service, cmd string, args []string) e return svc.Stop(ctx, id, cmd == "rm") } +// runRestart stops the session's agent process and resumes it in place: same +// session name and record, with the provider's resume args. +func runRestart(ctx context.Context, svc *app.Service, args []string) error { + id, err := requireArg(args, "restart requires ") + if err != nil { + return err + } + return svc.Restart(ctx, id) +} + // runNotifyClosed flags the matching record as user-closed. 
Session hosts // mark records closed in-process when their agent exits, so uam itself no // longer shells out to this; it stays for scripts and older tmux hooks that diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index c7ac587..0084865 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -21,6 +21,7 @@ import ( type cliFakeAdapter struct { sessions []adapter.Session stopped bool + resumed bool } func (f *cliFakeAdapter) Name() string { return "fake" } @@ -44,7 +45,21 @@ func (f *cliFakeAdapter) Reply(ctx adapter.Context, id, text string) error { ret func (f *cliFakeAdapter) Attach(id string) (adapter.AttachSpec, error) { return adapter.AttachSpec{Argv: []string{"echo", id}}, nil } -func (f *cliFakeAdapter) Stop(ctx adapter.Context, id string) error { f.stopped = true; return nil } + +// Stop drops the live sessions, mirroring the real backend where Kill +// returns only after the host is fully gone. +func (f *cliFakeAdapter) Stop(ctx adapter.Context, id string) error { + f.stopped = true + f.sessions = nil + return nil +} + +func (f *cliFakeAdapter) Resume(ctx adapter.Context, req adapter.ResumeRequest) (adapter.Session, error) { + f.resumed = true + sess := adapter.Session{ID: req.ID, AgentType: "fake", DisplayName: req.Name, Cwd: req.Cwd, SessionName: req.SessionName, State: adapter.Active, ProcAlive: adapter.Alive, CreatedAt: time.Now()} + f.sessions = append(f.sessions, sess) + return sess, nil +} func TestRunDispatchListPeekAndStop(t *testing.T) { svc, fake := newCLITestService(t) @@ -64,6 +79,19 @@ func TestRunDispatchListPeekAndStop(t *testing.T) { } } +// restart stops the live agent process and resumes the session in place. 
+func TestRunRestart(t *testing.T) { + svc, fake := newCLITestService(t) + id := dispatchAndCaptureID(t, svc, []string{"--cwd", "/tmp", "fake", "#bugfix", "fix", "thing"}) + must(t, runRestart(context.Background(), svc, []string{id})) + if !fake.stopped || !fake.resumed { + t.Fatalf("restart must stop then resume: stopped=%v resumed=%v", fake.stopped, fake.resumed) + } + if err := runRestart(context.Background(), svc, nil); err == nil { + t.Fatal("restart without id should fail") + } +} + func TestCLIArgumentValidationAndParsing(t *testing.T) { svc, _ := newCLITestService(t) if err := RunDispatch(context.Background(), svc, nil); err == nil { From d481f9e362afae8bfc84d577d5c3f962dc25ce4e Mon Sep 17 00:00:00 2001 From: Amit Kumar Date: Thu, 11 Jun 2026 07:59:29 +0000 Subject: [PATCH 2/2] fix(host): release the socket path before Kill returns The host's deferred listener Close unlinked the unix socket path when the process exited - 50ms AFTER the kill response had signalled `cleaned`. A replacement host created in that window (exactly what restart does) had its freshly-bound socket deleted out from under it, leaving a running but unreachable session: state file present, no socket, every dial failing with "session is not running". Close the listener while the path is still ours - before signalling cleaned - so the unlink lands on the old socket and the path is free by the time Kill returns. Established connections (the kill responder) are unaffected; the deferred Close becomes a no-op. Pinned by an end-to-end regression test that kills and immediately recreates a session under the same name, then asserts the replacement still answers after the old host has fully exited. 
--- internal/session/host.go | 7 +++++++ internal/session/session_test.go | 36 ++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) diff --git a/internal/session/host.go b/internal/session/host.go index e45eef4..5e92fe0 100644 --- a/internal/session/host.go +++ b/internal/session/host.go @@ -194,6 +194,13 @@ func runHost(dir, name, cwd, label string, envs, command []string, ready *os.Fil } } close(h.exited) + // Release the socket path while it is still ours: closing the listener + // unlinks it, and leaving that to the deferred Close would unlink AFTER + // cleaned has signalled — i.e. after Kill has returned and a replacement + // host (restart) may have created its own socket at the same path, + // leaving that new host running but unreachable. Established connections + // (the kill responder) are unaffected. + _ = ln.Close() h.shutdown(exitCode) close(h.cleaned) // Give pending kill responders a moment to flush their replies before the diff --git a/internal/session/session_test.go b/internal/session/session_test.go index ea24acc..eb29fea 100644 --- a/internal/session/session_test.go +++ b/internal/session/session_test.go @@ -63,6 +63,42 @@ func TestValidateName(t *testing.T) { } } +// Killing a session and immediately recreating it under the same name (the +// restart flow) must leave the replacement's socket intact: the old host's +// deferred listener Close used to unlink the socket path when its process +// exited ~50ms AFTER Kill had already returned — deleting the socket the +// replacement host had just created, leaving a running but unreachable host. 
+func TestRecreateAfterKillKeepsSocket(t *testing.T) { + c := newTestClient(t) + ctx := context.Background() + name := "uam-fake-cccc1111" + if err := c.CreateSession(ctx, name, t.TempDir(), nil, []string{"/bin/sh", "-c", "sleep 60"}); err != nil { + t.Fatalf("CreateSession: %v", err) + } + st, err := readState(c.Dir, name) + if err != nil { + t.Fatalf("readState: %v", err) + } + oldHost, oldStart := st.HostPID, st.HostStart + if err := c.Kill(ctx, name); err != nil { + t.Fatalf("Kill: %v", err) + } + if err := c.CreateSession(ctx, name, t.TempDir(), nil, []string{"/bin/sh", "-c", "sleep 60"}); err != nil { + t.Fatalf("recreate: %v", err) + } + // Wait for the OLD host process to fully exit — its deferred cleanup is + // what used to unlink the new socket — then the socket must still exist + // and answer. + waitFor(t, "old host exit", func() bool { return !procAliveWithStart(oldHost, oldStart) }) + time.Sleep(20 * time.Millisecond) // let any buggy deferred unlink land + if _, err := os.Stat(SocketPath(c.Dir, name)); err != nil { + t.Fatalf("replacement socket gone after old host exit: %v", err) + } + if _, err := c.Capture(ctx, name, 5); err != nil { + t.Fatalf("peek after recreate: %v", err) + } +} + func TestCreateListCaptureSendKill(t *testing.T) { c := newTestClient(t) ctx := context.Background()