Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion .github/workflows/frontend-release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,12 @@ name: Desktop release
# Generates a GitHub Release (draft) with installers + update manifests.
# Triggered by a `desktop-v*` tag or manually.
#
# Each target OS builds on its own runner so the bundled `ao` daemon is compiled
# natively for that platform. build-daemon.mjs keys the binary off the build
# host's platform, so cross-OS packaging (e.g. building the Windows installer on
# macOS) would ship a non-Windows binary named `ao` and the app could not launch
# the daemon (issues #235/#256). The per-OS matrix keeps host == target.
#
# ⚠️ Until macOS code signing + notarization secrets are configured (see
# frontend/docs/desktop-release.md), published builds are UNSIGNED and will
# NOT auto-update on macOS. The workflow still produces installable artifacts.
Expand All @@ -16,7 +22,11 @@ on:

jobs:
release:
runs-on: macos-latest
strategy:
fail-fast: false
matrix:
os: [macos-latest, windows-latest]
runs-on: ${{ matrix.os }}
permissions:
contents: write
defaults:
Expand All @@ -29,6 +39,12 @@ jobs:
node-version: 20
cache: npm
cache-dependency-path: frontend/package-lock.json
# The daemon is compiled by build-daemon.mjs during prepackage/premake, so
# the Go toolchain must be present and pinned on every runner.
- uses: actions/setup-go@v5
with:
go-version-file: backend/go.mod
cache-dependency-path: backend/go.sum
- run: npm ci
- name: Publish
run: npm run publish
Expand Down
14 changes: 10 additions & 4 deletions backend/internal/daemon/daemon.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"context"
"fmt"
"log/slog"
"net/http"
"os"
"os/signal"
"syscall"
Expand All @@ -32,12 +33,17 @@ func Run() error {

log := newLogger()

// Fail fast if a live daemon already owns the handshake file. A run-file
// left by a crashed predecessor (dead PID) is treated as stale and
// overwritten when the new server starts.
// Fail fast only if a daemon is genuinely still serving the recorded port.
// CheckStale confirms the run-file's PID is alive, but that alone is not
// proof a predecessor owns the port: the file leaks when the daemon is hard
// killed without a graceful shutdown (the norm on Windows, where the desktop
// supervisor can only TerminateProcess it), and Windows reuses the recorded
// PID for unrelated processes. So a "live" PID is verified against an actual
// /healthz probe; a run-file left by a crashed/hard-killed/reused-PID
// predecessor is treated as stale and overwritten when the new server starts.
if live, err := runfile.CheckStale(cfg.RunFilePath); err != nil {
return fmt.Errorf("inspect run-file: %w", err)
} else if live != nil {
} else if live != nil && runFileOwnerServing(&http.Client{Timeout: staleProbeTimeout}, config.LoopbackHost, live) {
return fmt.Errorf("daemon already running (pid %d, port %d); refusing to start", live.PID, live.Port)
}

Expand Down
58 changes: 58 additions & 0 deletions backend/internal/daemon/stale.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package daemon

import (
"encoding/json"
"fmt"
"net/http"
"time"

"github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta"
"github.com/aoagents/agent-orchestrator/backend/internal/runfile"
)

// staleProbeTimeout bounds the startup ownership probe so a run-file pointing at
// an unreachable port cannot stall daemon startup.
const staleProbeTimeout = 2 * time.Second

// runFileOwnerServing reports whether an AO daemon matching info is actually
// serving on the recorded loopback port.
//
// runfile.CheckStale only confirms the recorded PID is alive, which is not
// enough to conclude a predecessor still owns the port. On Windows the desktop
// supervisor can only TerminateProcess the daemon (no POSIX signal reaches it),
// so the daemon's graceful shutdown never runs and running.json is never
// removed; the leaked file then survives into the next launch. Because Windows
// reuses PIDs aggressively, the recorded PID routinely belongs to an unrelated
// process, making the PID-only check report "alive" for a daemon that is long
// gone — which is what made the daemon refuse to start (issue #256).
//
// Probing /healthz and matching both the service name and the PID is the ground
// truth that a real predecessor is still listening. When it is not, the
// run-file is stale and the caller should overwrite it instead of refusing.
func runFileOwnerServing(client *http.Client, host string, info *runfile.Info) bool {
if info == nil || info.Port <= 0 {
return false
}

req, err := http.NewRequest(http.MethodGet, fmt.Sprintf("http://%s:%d/healthz", host, info.Port), http.NoBody)
if err != nil {
return false
}
resp, err := client.Do(req)
if err != nil {
return false
}
defer func() { _ = resp.Body.Close() }()
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
return false
}

var body struct {
Service string `json:"service"`
PID int `json:"pid"`
}
if err := json.NewDecoder(resp.Body).Decode(&body); err != nil {
return false
}
return body.Service == daemonmeta.ServiceName && body.PID == info.PID
}
116 changes: 116 additions & 0 deletions backend/internal/daemon/stale_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
package daemon

import (
"fmt"
"net/http"
"net/http/httptest"
"net/url"
"strconv"
"testing"
"time"

"github.com/aoagents/agent-orchestrator/backend/internal/daemonmeta"
"github.com/aoagents/agent-orchestrator/backend/internal/runfile"
)

// healthzBody returns a handler that answers /healthz with the given service
// name and pid, mimicking the daemon's real liveness probe.
func healthzBody(service string, pid int) http.HandlerFunc {
return func(w http.ResponseWriter, r *http.Request) {
if r.URL.Path != "/healthz" {
http.NotFound(w, r)
return
}
w.Header().Set("Content-Type", "application/json")
_, _ = fmt.Fprintf(w, `{"status":"ok","service":%q,"pid":%d}`, service, pid)
}
}

func hostPort(t *testing.T, rawURL string) (string, int) {
t.Helper()
u, err := url.Parse(rawURL)
if err != nil {
t.Fatalf("parse %q: %v", rawURL, err)
}
port, err := strconv.Atoi(u.Port())
if err != nil {
t.Fatalf("port from %q: %v", rawURL, err)
}
return u.Hostname(), port
}

func TestRunFileOwnerServing(t *testing.T) {
const pid = 4242

tests := []struct {
name string
handler http.HandlerFunc
want bool
}{
{
name: "matching service and pid is the live owner",
handler: healthzBody(daemonmeta.ServiceName, pid),
want: true,
},
{
name: "reused pid: same port, different process pid",
handler: healthzBody(daemonmeta.ServiceName, pid+1),
want: false,
},
{
name: "foreign service occupying the port",
handler: healthzBody("some-other-service", pid),
want: false,
},
{
name: "non-2xx response",
handler: func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusInternalServerError)
},
want: false,
},
{
name: "unparseable body",
handler: func(w http.ResponseWriter, _ *http.Request) {
_, _ = w.Write([]byte("not json"))
},
want: false,
},
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
srv := httptest.NewServer(tc.handler)
defer srv.Close()
host, port := hostPort(t, srv.URL)

got := runFileOwnerServing(srv.Client(), host, &runfile.Info{PID: pid, Port: port})
if got != tc.want {
t.Errorf("runFileOwnerServing = %v, want %v", got, tc.want)
}
})
}
}

func TestRunFileOwnerServingNoListener(t *testing.T) {
// Bind then immediately close to obtain a port nothing is listening on, so
// the probe hits a refused connection — the leaked-run-file case.
srv := httptest.NewServer(http.NotFoundHandler())
host, port := hostPort(t, srv.URL)
srv.Close()

client := &http.Client{Timeout: time.Second}
if runFileOwnerServing(client, host, &runfile.Info{PID: 4242, Port: port}) {
t.Error("runFileOwnerServing on a dead port = true, want false (stale, safe to overwrite)")
}
}

func TestRunFileOwnerServingNilOrZeroPort(t *testing.T) {
client := &http.Client{Timeout: time.Second}
if runFileOwnerServing(client, "127.0.0.1", nil) {
t.Error("runFileOwnerServing(nil) = true, want false")
}
if runFileOwnerServing(client, "127.0.0.1", &runfile.Info{PID: 1, Port: 0}) {
t.Error("runFileOwnerServing(port 0) = true, want false")
}
}
Loading