Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions ARCHITECTURE.md
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,12 @@ When a PR changes any of these fields for a package, update its section.
- `GetStatus` reads the live holder socket *before* on-disk `config.json`, so a
still-starting member reports correctly. The image/VMM pull is the client's job before
spawn now; `StateDownloading` covers only the helper's VMM-binary fetch (ADR-0020).
- **Cross-uid liveness (ADR-0023).** `IsRunning` probes the holder with `kill(pid, 0)`
and treats `EPERM` as ALIVE, not dead (`signalMeansAlive`): on Linux the holder an
elevated `up` spawned is root-owned, so a non-root `ls`/`ssh` cannot signal it and gets
`EPERM` though the process exists. Only `ESRCH` (absent) is "not running". Without this,
`GetStatus` short-circuited to `StateStopped` before ever dialing the (0666) socket, so
non-root read-only commands reported a running VM as stopped.

### §5.12 `internal/backend/cloudhypervisor`

Expand Down
28 changes: 28 additions & 0 deletions cmd/fleetbox/elevate.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package main

import "strings"

const (
// elevateProceed: already root (or nothing to do) — run the command in-process.
elevateProceed elevateAction = iota
Expand Down Expand Up @@ -40,3 +42,29 @@ func decideElevation(euid int, alreadyElevated, ttyOpenable, sudoFound bool) ele
}
return elevatePrint
}

// ensureSbinInPath returns path with the standard sbin directories appended if
// missing. The elevated holder shells out to `ip`/`iptables`, which live in
// /sbin and /usr/sbin — directories a regular user's PATH often omits (a stock
// Debian cloud image's login PATH has no sbin). Auto-elevation forwards the user's
// PATH through an `env` wrapper that overrides sudo's secure_path, so without this
// the holder's `iptables` lookup fails and network setup dies (ADR-0023). Empty
// path entries are dropped so a leading/trailing ":" never injects the cwd. Pure
// and un-tagged so it is table-testable on any platform.
func ensureSbinInPath(path string) string {
var dirs []string
seen := map[string]bool{}
add := func(d string) {
if d != "" && !seen[d] {
seen[d] = true
dirs = append(dirs, d)
}
}
for d := range strings.SplitSeq(path, ":") {
add(d)
}
for _, d := range []string{"/usr/local/sbin", "/usr/sbin", "/sbin"} {
add(d)
}
return strings.Join(dirs, ":")
}
2 changes: 1 addition & 1 deletion cmd/fleetbox/elevate_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -120,7 +120,7 @@ func elevatedArgv() ([]string, error) {
return nil, fmt.Errorf("resolve own path: %w", err)
}
argv := make([]string, 0, 5+len(os.Args)-1)
argv = append(argv, "sudo", "env", envElevated+"=1", "PATH="+os.Getenv("PATH"), self)
argv = append(argv, "sudo", "env", envElevated+"=1", "PATH="+ensureSbinInPath(os.Getenv("PATH")), self)
argv = append(argv, os.Args[1:]...)
return argv, nil
}
Expand Down
40 changes: 40 additions & 0 deletions cmd/fleetbox/elevate_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,3 +49,43 @@ func TestDecideElevation(t *testing.T) {
})
}
}

// TestEnsureSbinInPath pins the PATH fix (ADR-0023): the elevated holder needs
// /sbin and /usr/sbin (where ip/iptables live) even when the invoking user's PATH
// omits them — as a stock Debian cloud image's does. Surfaced by dogfooding the
// Linux path inside a fleetbox-booted VM.
func TestEnsureSbinInPath(t *testing.T) {
cases := []struct {
name string
in string
want string
}{
{
name: "sbin-less user PATH gets sbin appended",
in: "/usr/local/bin:/usr/bin:/bin",
want: "/usr/local/bin:/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin",
},
{
name: "already-present sbin dirs are not duplicated",
in: "/usr/sbin:/usr/bin:/sbin:/bin",
want: "/usr/sbin:/usr/bin:/sbin:/bin:/usr/local/sbin",
},
{
name: "empty PATH yields just the sbin dirs (no cwd-injecting empty entry)",
in: "",
want: "/usr/local/sbin:/usr/sbin:/sbin",
},
{
name: "empty entries are dropped",
in: "/usr/bin::/bin:",
want: "/usr/bin:/bin:/usr/local/sbin:/usr/sbin:/sbin",
},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := ensureSbinInPath(tc.in); got != tc.want {
t.Errorf("ensureSbinInPath(%q) = %q, want %q", tc.in, got, tc.want)
}
})
}
}
13 changes: 11 additions & 2 deletions docs/adr/0023-linux-sudo-ux.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,12 @@ itself booby-trapped:
key, so ownership is the only fix. The root holder `chmod 0666`s each unix socket it
creates so a non-root client can connect. Disks/logs stay root-owned; only the holder
touches them.
- **Cross-uid liveness probe.** `control.IsRunning` probes the holder with
`kill(pid, 0)` and treats `EPERM` as alive, not dead: the holder an elevated `up`
spawned is root-owned, so a non-root `ls`/`ssh` is not permitted to signal it and gets
`EPERM` although the process exists. Only `ESRCH` means gone. Without this the status
path short-circuited to "stopped" before ever reaching the `0666` socket — the two
fixups above are necessary but were not sufficient on their own.

Privilege classes: **privileged (auto-elevate)** = `up`, `down`, `rm`; **user-level
(never elevate)** = `ls`, `ssh`, `cp`, `ssh-config`, `version`, `completion`.
Expand Down Expand Up @@ -101,8 +107,11 @@ and the socket block read-only commands; disks/logs are root-only by design.
- Behavioral coverage is mostly manual on a Linux host: the macOS CI runs the
`fleetbox_fake` path (preflight is a no-op), and the Linux VM CI runs as root via
sudo, so `euid==0` short-circuits the elevation and ownership paths. The automated
coverage is the pure functions (`requireRoot`, `resolveBaseHome`, `decideElevation`),
unit-tested off-root on darwin/arm64.
coverage is the pure functions (`requireRoot`, `resolveBaseHome`, `decideElevation`,
`signalMeansAlive`), unit-tested off-root on darwin/arm64. The cross-uid liveness bug
is exactly what slipped past that gap — it surfaced only on the first real non-root run
on a Linux box, which is why that manual run is the true acceptance gate, not the
off-root unit tests.
- References ADR-0011 (Linux backend's privileged shell-outs), ADR-0014 (store layout),
ADR-0020 (self-reexec holder that inherits `SUDO_*`), ADR-0022 (the cobra CLI this
wires into). Amends none of them; it adds the privilege model they assumed but left
Expand Down
15 changes: 14 additions & 1 deletion internal/control/control.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,7 +153,20 @@ func IsRunning(st *store.Store, name string) bool {
if err != nil {
return false
}
return proc.Signal(syscall.Signal(0)) == nil
return signalMeansAlive(proc.Signal(syscall.Signal(0)))
}

// signalMeansAlive classifies a kill(pid, 0) liveness probe. The process is alive
// if the probe succeeded (nil) OR failed with EPERM — EPERM means the process
// EXISTS but is owned by another user, so we are not permitted to signal it. That
// is exactly a non-root `ls`/`ssh` probing the root-owned holder an elevated `up`
// spawned (ADR-0023): the holder is alive, we just can't signal across the uid
// boundary. Only a genuinely absent process (ESRCH, or any other error) is "not
// running". Treating EPERM as dead was why non-root read-only commands reported a
// running VM as stopped. Split out as a pure function so the cross-uid case is
// unit-testable without spawning a foreign-owned process.
func signalMeansAlive(err error) bool {
return err == nil || errors.Is(err, syscall.EPERM)
}

// GetStatus returns the status of a member. A live holder is authoritative — even
Expand Down
34 changes: 34 additions & 0 deletions internal/control/liveness_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package control

import (
"errors"
"fmt"
"syscall"
"testing"
)

// TestSignalMeansAlive pins the cross-uid liveness classification (ADR-0023): a
// kill(pid, 0) probe means the process is alive on nil OR EPERM (it exists but is
// owned by another user — the non-root `ls`/`ssh` probing the root-owned holder),
// and dead only on ESRCH or any other error. The EPERM row is the regression: it
// used to be read as "stopped", hiding a running VM from non-root commands.
func TestSignalMeansAlive(t *testing.T) {
cases := []struct {
name string
err error
want bool
}{
{name: "nil → alive (we may signal it)", err: nil, want: true},
{name: "EPERM → alive (exists, foreign-owned)", err: syscall.EPERM, want: true},
{name: "EPERM wrapped → alive", err: fmt.Errorf("signal: %w", syscall.EPERM), want: true},
{name: "ESRCH → dead (no such process)", err: syscall.ESRCH, want: false},
{name: "other error → dead", err: errors.New("boom"), want: false},
}
for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
if got := signalMeansAlive(tc.err); got != tc.want {
t.Errorf("signalMeansAlive(%v) = %v, want %v", tc.err, got, tc.want)
}
})
}
}