From d103cee3a45578dfcf822310487d8d9c40b41873 Mon Sep 17 00:00:00 2001
From: Remylus Losius <remyluslosius@gonaibo.com>
Date: Tue, 16 Jun 2026 14:36:46 -0400
Subject: [PATCH] test(ssh): add opt-in live-host SSH/sudo integration test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes the project's biggest test blind spot: the dial, auth-ordering,
and sudo -n/-S paths were only unit-tested at the command-construction
level (stubbed transport), never against a real box. A wired-up host
could regress and every test stay green.

internal/ssh/livehost_test.go drives the REAL ssh.Dial + ssh.RunSudo —
the primitives every host-talking path (scan, discovery, collector,
liveness) shares — against an operator-supplied inventory:

  OPENWATCH_LIVE_HOSTS=/path/to/test_hosts.csv  (hostname,ip,username,credential)
  OPENWATCH_LIVE_KEY=/path/to/id_rsa

With either unset the test t.Skip()s, so it never gates normal CI; the
inventory + key stay on the operator's workstation, never in the repo.

The fleet is heterogeneous, so the test DISCOVERS each host's
capabilities rather than demanding every method everywhere. Per host it
asserts the machinery for whatever the host supports:

  - key auth dials      -> ObservedAuth == "key"      (the value the memo records)
  - password auth dials -> ObservedAuth == "password"
  - sudo mode confirmed via the `true` sentinel (nopasswd | password)
  - the real `sudo -S -k -p '' true` password-on-stdin path executes

A server-side auth rejection (key not authorized, or PasswordAuthentication
off) is a tolerated host-config fact; an unreachable host is skipped; only
an unexpected protocol-level error or a wrong ObservedAuth/sudo result
fails the test. A host with no usable auth is skipped.

Validated against the dev fleet: 5 key+NOPASSWD hosts pass (real key dial,
sudo -n, and sudo -S all exercised), key-rejecting and unreachable hosts
skip. The password-AUTH assertion is live-unverified only because the dev
fleet runs PasswordAuthentication=no everywhere (noted in BACKLOG); it
runs as soon as one password-enabled host is in the inventory.

Also drops the completed "wire SSH auth/sudo learning" backlog entry
(shipped in #575 + #576).
---
 BACKLOG.md                    |   3 +-
 internal/ssh/livehost_test.go | 322 ++++++++++++++++++++++++++++++++++
 2 files changed, 323 insertions(+), 2 deletions(-)
 create mode 100644 internal/ssh/livehost_test.go

diff --git a/BACKLOG.md b/BACKLOG.md
index 99881815..e903a41d 100644
--- a/BACKLOG.md
+++ b/BACKLOG.md
@@ -96,7 +96,6 @@ Gaps identified comparing `docs/KENSA_OPENWATCH_BOUNDARY.md` against current Ope
 
 | Item | Priority | Status | Notes |
 |------|----------|--------|-------|
-| Wire SSH auth/sudo learning into discovery / intelligence / liveness | P2 | Not started | #566 landed the substrate (`internal/connprofile` + migration `0035`) and the dial-layer mechanism (`ssh.DialOptions.PreferAuth`/`ObservedAuth`), and wired the **scan** path (auth ordering + per-connection sudo-mode probe, recorded per host). The other three SSH paths — OS discovery, OS intelligence collector, liveness privilege probe — still re-probe each cycle instead of leading with the remembered choice. Their `SSHTransport.Dial` (`internal/intelligence/discovery`, shared by the collector) and the `internal/sshprivilege` dialer take no `hostID`, so thread it + a `connprofile` reader/recorder through. Mechanical — reuses everything from #566 |
 | Retention sweep for soft-deleted hosts | P3 | Not started | Soft-deleted hosts (`hosts.deleted_at` set by `internal/host/host.go:298 SoftDelete`) are retained **indefinitely** — no purge job exists anywhere, confirmed by a soft-deleted row from 2026-05-25 still present in the dev DB. The row stays for scan-history/audit integrity but is hidden from every query (`WHERE deleted_at IS NULL`). Add an optional retention sweep (a daemon-orchestration tick that hard-deletes `hosts WHERE deleted_at < now() - $window`, cascading scan history), with an operator-configurable window that defaults to disabled (keep forever). Low urgency — host volume is trivial — but closes unbounded soft-deleted-row growth and gives operators a real "forget this host" path |
 | `PATCH /api/v1/credentials/{id}` — in-place credential update | P2 | Deferred | Frontend uses replace-on-save (`<ReplaceCredentialModal>` runs `POST → DELETE`). Real PATCH would close the orphan-credential failure mode |
 | `POST /api/v1/bulk/hosts/analyze-csv` + `import-with-mapping` | P2 | Deferred | Today the wizard runs CSV analysis client-side and submits row-by-row — no atomic semantics, no "update existing", no row caps |
@@ -153,7 +152,7 @@ Gaps where working functionality can break without an automated test failing (id
 
 | Item | Priority | Notes |
 |------|----------|-------|
-| Live-host SSH/sudo integration test (gated) | P2 | CI has **no test that actually dials a host and runs sudo** — the dial, `sudo -n`, and `sudo -S` paths are only unit-tested at the command-construction level (the `wrap`/auth-ordering output), never against a real box. Add an opt-in, env-gated test (against `~/Documents/openwatch/test_hosts.csv`) that asserts: key auth, password auth, `sudo -n` (NOPASSWD), and `sudo -S` (password) each actually authenticate/escalate and return real results. Skips when the host env var is unset, so it never gates normal CI. Closes the biggest blind spot (live execution is manual today). |
+| Live-host SSH/sudo integration test (gated) | P2 | **Mostly done** | Delivered `internal/ssh/livehost_test.go` — opt-in (`OPENWATCH_LIVE_HOSTS` CSV + `OPENWATCH_LIVE_KEY`), self-skipping, drives the REAL `ssh.Dial` + `ssh.RunSudo`. Per host it asserts the machinery for whatever the host supports: key auth → `ObservedAuth=="key"`, password auth → `"password"`, sudo mode confirmed via `true`, and the real `sudo -S` password-on-stdin path. Validated against the dev fleet (5 key+NOPASSWD hosts pass; key-rejecting / unreachable hosts skip). **Gap:** the password-AUTH branch (`ObservedAuth=="password"`) is unverified live because the dev fleet has `PasswordAuthentication no` everywhere — needs one host with password SSH enabled to exercise it. Optional next: add a `connprofile.Store` round-trip (record→read-back) assertion against a gated DB. |
 | Frontend E2E (Playwright) for critical flows | P2 | **Zero E2E tests** — `0` Playwright files, no config (the CLAUDE.md Playwright note is Python-era). Component-level vitest only, so a wired-up page can be green in vitest and broken in the browser. Stand up Playwright + cover the critical flows: login, the activated Settings pages (Users, Notifications, Security/API tokens, SSO), and host CRUD (add / edit / delete). |
 | Negative-path ACs for security gates | P2 | The scan kill-switch bug this session passed all 988 tests + the specter gate because the scan path simply had **no AC requiring it to honor the switch** — the suite tests specced behavior, so gaps *between* specs slip through. Generalize the pattern of `system-connection-profile/AC-07` (asserts "kill-switch off / key-only cred → no `sudo -S`") across the other security gates: every gate should have a spec'd AC + test for the **disallowed** path, not just the happy path. |
 
diff --git a/internal/ssh/livehost_test.go b/internal/ssh/livehost_test.go
new file mode 100644
index 00000000..53e98eb5
--- /dev/null
+++ b/internal/ssh/livehost_test.go
@@ -0,0 +1,322 @@
+// @spec system-connection-profile
+//
+// Live-host SSH/sudo integration test. Unlike every other test in this
+// package (which stubs the transport), this one opens REAL SSH connections
+// to real boxes and runs real sudo, closing the project's biggest test
+// blind spot: the dial, auth-ordering, and sudo `-n`/`-S` paths are
+// otherwise only exercised at the command-construction level, never
+// against a live host.
+//
+// It is OPT-IN and self-skipping: set both
+//
+//	OPENWATCH_LIVE_HOSTS=/path/to/test_hosts.csv   (hostname,ip,username,credential[=password])
+//	OPENWATCH_LIVE_KEY=/path/to/id_rsa             (an OpenSSH private key authorized for the user)
+//
+// to run it. With either unset it t.Skip()s, so it never gates normal CI.
+// The inventory + key live on the operator's workstation, never in the repo.
+//
+// What it asserts per reachable host:
+//
+//	AC-04  key auth dials and ObservedAuth reports "key"
+//	AC-04  password auth dials and ObservedAuth reports "password"
+//	AC-10  RunSudo confirms a sudo mode (nopasswd or password) via `true`
+//	AC-10  leading with SudoPassword exercises the real `sudo -S` wire path
+//
+// These are exactly the observations the per-host connprofile memo records,
+// so a green run is end-to-end proof that the learning inputs are real.
+
+package ssh
+
+import (
+	"bufio"
+	"context"
+	"errors"
+	"os"
+	"strings"
+	"testing"
+	"time"
+
+	cryptossh "golang.org/x/crypto/ssh"
+
+	"github.com/Hanalyx/openwatch/internal/credential"
+)
+
+type liveHost struct {
+	name     string
+	addr     string
+	user     string
+	password string
+}
+
+// loadLiveHosts reads the opt-in inventory + key, or skips the test.
+func loadLiveHosts(t *testing.T) ([]liveHost, string) {
+	t.Helper()
+	csvPath := os.Getenv("OPENWATCH_LIVE_HOSTS")
+	keyPath := os.Getenv("OPENWATCH_LIVE_KEY")
+	if csvPath == "" || keyPath == "" {
+		t.Skip("set OPENWATCH_LIVE_HOSTS (csv) and OPENWATCH_LIVE_KEY (private key) to run live-host SSH tests")
+	}
+
+	keyBytes, err := os.ReadFile(keyPath) // #nosec G304 -- operator-supplied test key path
+	if err != nil {
+		t.Fatalf("read OPENWATCH_LIVE_KEY %q: %v", keyPath, err)
+	}
+	// Sanity-check the key parses before we fan out to every host, so a bad
+	// key fails once with a clear message instead of N opaque dial errors.
+	if _, perr := cryptossh.ParsePrivateKey(keyBytes); perr != nil {
+		t.Fatalf("OPENWATCH_LIVE_KEY %q is not a usable private key: %v", keyPath, perr)
+	}
+
+	f, err := os.Open(csvPath) // #nosec G304 -- operator-supplied test inventory path
+	if err != nil {
+		t.Fatalf("open OPENWATCH_LIVE_HOSTS %q: %v", csvPath, err)
+	}
+	defer func() { _ = f.Close() }()
+
+	var hosts []liveHost
+	sc := bufio.NewScanner(f)
+	first := true
+	for sc.Scan() {
+		line := strings.TrimSpace(sc.Text())
+		if line == "" {
+			continue
+		}
+		cols := strings.Split(line, ",")
+		// Skip the header row (hostname,ip,username,credential).
+		if first {
+			first = false
+			if strings.EqualFold(strings.TrimSpace(cols[0]), "hostname") {
+				continue
+			}
+		}
+		if len(cols) < 3 {
+			continue
+		}
+		h := liveHost{
+			name: strings.TrimSpace(cols[0]),
+			addr: strings.TrimSpace(cols[1]),
+			user: strings.TrimSpace(cols[2]),
+		}
+		if len(cols) >= 4 {
+			h.password = strings.TrimSpace(cols[3])
+		}
+		if h.addr == "" || h.user == "" {
+			continue
+		}
+		hosts = append(hosts, h)
+	}
+	if err := sc.Err(); err != nil {
+		t.Fatalf("scan inventory: %v", err)
+	}
+	if len(hosts) == 0 {
+		t.Fatalf("OPENWATCH_LIVE_HOSTS %q yielded no usable rows", csvPath)
+	}
+	return hosts, string(keyBytes)
+}
+
+// liveSession adapts a real *ssh.Client to SudoSession so RunSudo (the
+// shared sudo primitive) can run against it exactly as production does.
+type liveSession struct{ client *cryptossh.Client }
+
+func (s liveSession) Run(_ context.Context, cmd string) ([]byte, int, error) {
+	sess, err := s.client.NewSession()
+	if err != nil {
+		return nil, -1, err
+	}
+	defer func() { _ = sess.Close() }()
+	out, runErr := sess.CombinedOutput(cmd)
+	return out, exitCodeOf(runErr), nonExitErr(runErr)
+}
+
+func (s liveSession) RunWithStdin(_ context.Context, cmd string, stdin []byte) ([]byte, int, error) {
+	sess, err := s.client.NewSession()
+	if err != nil {
+		return nil, -1, err
+	}
+	defer func() { _ = sess.Close() }()
+	pipe, perr := sess.StdinPipe()
+	if perr != nil {
+		return nil, -1, perr
+	}
+	go func() {
+		_, _ = pipe.Write(stdin)
+		_ = pipe.Close()
+	}()
+	out, runErr := sess.CombinedOutput(cmd)
+	return out, exitCodeOf(runErr), nonExitErr(runErr)
+}
+
+// exitCodeOf returns the remote exit status (0 on success), or -1 for a
+// transport-level error.
+func exitCodeOf(err error) int {
+	if err == nil {
+		return 0
+	}
+	var ee *cryptossh.ExitError
+	if errors.As(err, &ee) {
+		return ee.ExitStatus()
+	}
+	return -1
+}
+
+// nonExitErr passes through transport errors but swallows a clean non-zero
+// exit (that is a valid result, surfaced via the code).
+func nonExitErr(err error) error {
+	var ee *cryptossh.ExitError
+	if err == nil || errors.As(err, &ee) {
+		return nil
+	}
+	return err
+}
+
+// isAuthRejection reports a server-side auth REJECTION (the key isn't
+// authorized, or PasswordAuthentication is off) as opposed to a transport
+// fault. A real fleet is heterogeneous — not every host accepts every
+// method — so a rejection is a host-config fact to tolerate, not an
+// OpenWatch bug. (A handshake/protocol error is neither and stays fatal.)
+func isAuthRejection(err error) bool {
+	s := err.Error()
+	return strings.Contains(s, "unable to authenticate") ||
+		strings.Contains(s, "authentication failed") ||
+		strings.Contains(s, "no supported methods remain")
+}
+
+// isUnreachable reports a connection-level failure (host down / firewalled
+// / wrong subnet) — reachability is the operator's network, not our code,
+// so the host is skipped rather than failed.
+func isUnreachable(err error) bool {
+	s := err.Error()
+	return strings.Contains(s, "no route to host") ||
+		strings.Contains(s, "connection refused") ||
+		strings.Contains(s, "i/o timeout") ||
+		strings.Contains(s, "connect: ") ||
+		strings.Contains(s, "tcp connect failed")
+}
+
+// tryAuth performs one real dial with the given method. It returns a live
+// client on success (caller closes it), nil + ok=false on a tolerated auth
+// rejection, skips the whole subtest on an unreachable host, and fails the
+// test on any other (unexpected, protocol-level) error. On success it
+// asserts ObservedAuth reports the method that authenticated — the exact
+// value the connprofile memo records (spec AC-04).
+func tryAuth(t *testing.T, ctx context.Context, h liveHost, method, wantObserved string, cred *credential.Credential, store KnownHostsStore) (*cryptossh.Client, bool) {
+	t.Helper()
+	var observed string
+	client, err := Dial(ctx, h.addr, 22, cred, DialOptions{
+		Mode:         ModeTOFU,
+		Store:        store,
+		Timeout:      DefaultDialTimeout,
+		ObservedAuth: &observed,
+	})
+	if err != nil {
+		switch {
+		case isUnreachable(err):
+			t.Skipf("%s (%s) unreachable: %v", h.name, h.addr, err)
+		case isAuthRejection(err):
+			t.Logf("%s: %s auth not accepted by host (config): %v", h.name, method, err)
+			return nil, false
+		default:
+			t.Fatalf("%s: %s dial failed with an unexpected (non-auth) error: %v", h.name, method, err)
+		}
+	}
+	if observed != wantObserved {
+		t.Errorf("%s: %s auth ObservedAuth=%q, want %q", h.name, method, observed, wantObserved)
+	}
+	return client, true
+}
+
+// TestLiveHost_AuthAndSudoMatrix dials every inventory host and validates,
+// against the REAL box, the auth + sudo machinery the connprofile learning
+// rests on. The fleet is heterogeneous: a host may accept key auth, or
+// password auth, or both, and use NOPASSWD or password sudo. The test
+// discovers each host's capabilities and asserts the machinery is correct
+// for whatever the host supports — it does NOT demand every method on every
+// host. A host with no usable auth, or that is unreachable, is skipped.
+func TestLiveHost_AuthAndSudoMatrix(t *testing.T) {
+	hosts, key := loadLiveHosts(t)
+
+	for _, h := range hosts {
+		h := h
+		t.Run(h.name, func(t *testing.T) {
+			t.Parallel()
+			ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
+			defer cancel()
+
+			// One known-hosts store per host: the first dial records the
+			// host key (TOFU); any later dial verifies against it.
+			store := NewMemoryStore()
+
+			// --- AC-04: key auth ---
+			keyClient, keyOK := tryAuth(t, ctx, h, "key", PreferKey, &credential.Credential{
+				Username:   h.user,
+				AuthMethod: credential.AuthSSHKey,
+				PrivateKey: key,
+			}, store)
+			if keyClient != nil {
+				defer func() { _ = keyClient.Close() }()
+			}
+
+			// --- AC-04: password auth (only if the inventory carries one) ---
+			var pwClient *cryptossh.Client
+			pwOK := false
+			if h.password != "" {
+				pwClient, pwOK = tryAuth(t, ctx, h, "password", PreferPassword, &credential.Credential{
+					Username:   h.user,
+					AuthMethod: credential.AuthPassword,
+					Password:   h.password,
+				}, store)
+				if pwClient != nil {
+					defer func() { _ = pwClient.Close() }()
+				}
+			}
+
+			if !keyOK && !pwOK {
+				t.Skipf("%s: neither key nor password auth accepted — nothing to exercise", h.name)
+			}
+
+			// Run sudo over whichever auth succeeded.
+			sudoClient := keyClient
+			if sudoClient == nil {
+				sudoClient = pwClient
+			}
+			sess := liveSession{client: sudoClient}
+			bothCred := &credential.Credential{
+				Username:   h.user,
+				AuthMethod: credential.AuthBoth,
+				Password:   h.password,
+				PrivateKey: key,
+			}
+			policy := SudoPolicy{AllowCredentialPassword: true}
+
+			// --- AC-10: confirm the host's sudo mode via the `true` sentinel.
+			_, code, _, observed, serr := RunSudo(ctx, sess, bothCred, policy, "", "true")
+			if serr != nil {
+				t.Fatalf("sudo true on %s: transport error: %v", h.name, serr)
+			}
+			if code != 0 {
+				t.Fatalf("sudo true on %s: exit %d (sudo not usable with this credential)", h.name, code)
+			}
+			if observed != SudoNopasswd && observed != SudoPassword {
+				t.Fatalf("sudo mode not confirmed on %s: observed=%q", h.name, observed)
+			}
+			t.Logf("%s (%s): keyAuth=%v passwordAuth=%v sudo=%s", h.name, h.addr, keyOK, pwOK, observed)
+
+			// --- AC-10: exercise the real `sudo -S` wire path. Leading with
+			// SudoPassword forces `sudo -S -k -p '' true` with the password on
+			// stdin; a NOPASSWD host still accepts it, so this proves the
+			// password-on-stdin path works end to end regardless of mode.
+			// Only meaningful when the credential carries a password.
+			if h.password != "" {
+				_, codeS, usedS, _, sErr := RunSudo(ctx, sess, bothCred, policy, SudoPassword, "true")
+				switch {
+				case sErr != nil:
+					t.Errorf("sudo -S true on %s: %v", h.name, sErr)
+				case !usedS:
+					t.Errorf("sudo -S true on %s: leading with SudoPassword did not take the sudo -S path", h.name)
+				case codeS != 0 && observed == SudoNopasswd:
+					t.Errorf("sudo -S true on %s: exit %d on a NOPASSWD host (should accept regardless of password)", h.name, codeS)
+				}
+			}
+		})
+	}
+}