diff --git a/.github/workflows/frontend-release.yml b/.github/workflows/frontend-release.yml index cdd1610b..1ab4012f 100644 --- a/.github/workflows/frontend-release.yml +++ b/.github/workflows/frontend-release.yml @@ -4,6 +4,12 @@ name: Desktop release # Generates a GitHub Release (draft) with installers + update manifests. # Triggered by a `desktop-v*` tag or manually. # +# Each target OS builds on its own runner so the bundled `ao` daemon is compiled +# natively for that platform. build-daemon.mjs keys the binary off the build +# host's platform, so cross-OS packaging (e.g. building the Windows installer on +# macOS) would ship a non-Windows binary named `ao` and the app could not launch +# the daemon (issues #235/#256). The per-OS matrix keeps host == target. +# # ⚠️ Until macOS code signing + notarization secrets are configured (see # frontend/docs/desktop-release.md), published builds are UNSIGNED and will # NOT auto-update on macOS. The workflow still produces installable artifacts. @@ -16,7 +22,11 @@ on: jobs: release: - runs-on: macos-latest + strategy: + fail-fast: false + matrix: + os: [macos-latest, windows-latest] + runs-on: ${{ matrix.os }} permissions: contents: write defaults: @@ -29,6 +39,12 @@ jobs: node-version: 20 cache: npm cache-dependency-path: frontend/package-lock.json + # The daemon is compiled by build-daemon.mjs during prepackage/premake, so + # the Go toolchain must be present and pinned on every runner. + - uses: actions/setup-go@v5 + with: + go-version-file: backend/go.mod + cache-dependency-path: backend/go.sum - run: npm ci - name: Publish run: npm run publish diff --git a/backend/internal/daemon/daemon.go b/backend/internal/daemon/daemon.go index 747f5251..32dd4ee5 100644 --- a/backend/internal/daemon/daemon.go +++ b/backend/internal/daemon/daemon.go @@ -7,6 +7,7 @@ import ( "context" "fmt" "log/slog" + "net/http" "os" "os/signal" "syscall" @@ -32,12 +33,17 @@ func Run() error { log := newLogger() - // Fail fast if a live daemon already owns the handshake file. A run-file - // left by a crashed predecessor (dead PID) is treated as stale and - // overwritten when the new server starts. + // Fail fast only if a daemon is genuinely still serving the recorded port. + // CheckStale confirms the run-file's PID is alive, but that alone is not + // proof a predecessor owns the port: the file leaks when the daemon is hard + // killed without a graceful shutdown (the norm on Windows, where the desktop + // supervisor can only TerminateProcess it), and Windows reuses the recorded + // PID for unrelated processes. So a "live" PID is verified against an actual + // /healthz probe; a run-file left by a crashed/hard-killed/reused-PID + // predecessor is treated as stale and overwritten when the new server starts. if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil { return fmt.Errorf("inspect run-file: %w", err) - } else if live != nil { + } else if live != nil && runFileOwnerServing(&http.Client{Timeout: staleProbeTimeout}, config.LoopbackHost, live) { return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port) } diff --git a/backend/internal/daemon/stale.go b/backend/internal/daemon/stale.go new file mode 100644 index 00000000..e8f7963c --- /dev/null +++ b/backend/internal/daemon/stale.go @@ -0,0 +1,58 @@ +package daemon + +import ( + "encoding/json" + "fmt" + "net/http" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +// staleProbeTimeout bounds the startup ownership probe so a run-file pointing at +// an unreachable port cannot stall daemon startup. +const staleProbeTimeout = 2 * time.Second + +// runFileOwnerServing reports whether an AO daemon matching info is actually +// serving on the recorded loopback port. +// +// runfile.CheckStale only confirms the recorded PID is alive, which is not +// enough to conclude a predecessor still owns the port. On Windows the desktop +// supervisor can only TerminateProcess the daemon (no POSIX signal reaches it), +// so the daemon's graceful shutdown never runs and running.json is never +// removed; the leaked file then survives into the next launch. Because Windows +// reuses PIDs aggressively, the recorded PID routinely belongs to an unrelated +// process, making the PID-only check report "alive" for a daemon that is long +// gone — which is what made the daemon refuse to start (issue #256). +// +// Probing /healthz and matching both the service name and the PID is the ground +// truth that a real predecessor is still listening. When it is not, the +// run-file is stale and the caller should overwrite it instead of refusing. +func runFileOwnerServing(client *http.Client, host string, info *runfile.Info) bool { + if info == nil || info.Port <= 0 { + return false + } + + req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s:%d/healthz", host, info.Port), http.NoBody) + if err != nil { + return false + } + resp, err := client.Do(req) + if err != nil { + return false + } + defer func() { _ = resp.Body.Close() }() + if resp.StatusCode < 200 || resp.StatusCode >= 300 { + return false + } + + var body struct { + Service string `json:"service"` + PID int `json:"pid"` + } + if err := json.NewDecoder(resp.Body).Decode(&body); err != nil { + return false + } + return body.Service == daemonmeta.ServiceName && body.PID == info.PID +} diff --git a/backend/internal/daemon/stale_test.go b/backend/internal/daemon/stale_test.go new file mode 100644 index 00000000..79939ceb --- /dev/null +++ b/backend/internal/daemon/stale_test.go @@ -0,0 +1,116 @@ +package daemon + +import ( + "fmt" + "net/http" + "net/http/httptest" + "net/url" + "strconv" + "testing" + "time" + + "github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta" + "github.com/aoagents/agent-orchestrator/backend/internal/runfile" +) + +// healthzBody returns a handler that answers /healthz with the given service +// name and pid, mimicking the daemon's real liveness probe. +func healthzBody(service string, pid int) http.HandlerFunc { + return func(w http.ResponseWriter, r *http.Request) { + if r.URL.Path != "/healthz" { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, service, pid) + } +} + +func hostPort(t *testing.T, rawURL string) (string, int) { + t.Helper() + u, err := url.Parse(rawURL) + if err != nil { + t.Fatalf("parse %q: %v", rawURL, err) + } + port, err := strconv.Atoi(u.Port()) + if err != nil { + t.Fatalf("port from %q: %v", rawURL, err) + } + return u.Hostname(), port +} + +func TestRunFileOwnerServing(t *testing.T) { + const pid = 4242 + + tests := []struct { + name string + handler http.HandlerFunc + want bool + }{ + { + name: "matching service and pid is the live owner", + handler: healthzBody(daemonmeta.ServiceName, pid), + want: true, + }, + { + name: "reused pid: same port, different process pid", + handler: healthzBody(daemonmeta.ServiceName, pid+1), + want: false, + }, + { + name: "foreign service occupying the port", + handler: healthzBody("some-other-service", pid), + want: false, + }, + { + name: "non-2xx response", + handler: func(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusInternalServerError) + }, + want: false, + }, + { + name: "unparseable body", + handler: func(w http.ResponseWriter, _ *http.Request) { + _, _ = w.Write([]byte("not json")) + }, + want: false, + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + srv := httptest.NewServer(tc.handler) + defer srv.Close() + host, port := hostPort(t, srv.URL) + + got := runFileOwnerServing(srv.Client(), host, &runfile.Info{PID: pid, Port: port}) + if got != tc.want { + t.Errorf("runFileOwnerServing = %v, want %v", got, tc.want) + } + }) + } +} + +func TestRunFileOwnerServingNoListener(t *testing.T) { + // Bind then immediately close to obtain a port nothing is listening on, so + // the probe hits a refused connection — the leaked-run-file case. + srv := httptest.NewServer(http.NotFoundHandler()) + host, port := hostPort(t, srv.URL) + srv.Close() + + client := &http.Client{Timeout: time.Second} + if runFileOwnerServing(client, host, &runfile.Info{PID: 4242, Port: port}) { + t.Error("runFileOwnerServing on a dead port = true, want false (stale, safe to overwrite)") + } +} + +func TestRunFileOwnerServingNilOrZeroPort(t *testing.T) { + client := &http.Client{Timeout: time.Second} + if runFileOwnerServing(client, "127.0.0.1", nil) { + t.Error("runFileOwnerServing(nil) = true, want false") + } + if runFileOwnerServing(client, "127.0.0.1", &runfile.Info{PID: 1, Port: 0}) { + t.Error("runFileOwnerServing(port 0) = true, want false") + } +}