diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d32c397..5c969d8 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -294,7 +294,7 @@ jobs:
 
             def caveats
               <<~EOS
-                Run first-run setup to download the local model and start the daemon:
+                Run first-run setup to install the runtime and download the local model:
                   i init
 
                 For zsh users: install the shell hook so prompts containing ? * [ ]
diff --git a/README.md b/README.md
index 163cdd2..e38052f 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ It is **local-first** by default (no network required after first run, no prompt
 
 That composability applies to subcommands that consume natural language too: `i report "first problem" < extra-notes.txt` appends the piped text after the command-line text before proposing issues.
 
-> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime, daemon, and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap.
+> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime (llama.cpp's `llama-server`, with `llama-cli` as fallback) and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap.
 
 ## Building from source
 
@@ -125,7 +125,7 @@ With `--literal`, everything after the flag is treated as natural-language promp
 
 ## Managing models
 
-intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llamafile](https://github.com/mozilla-ai/llamafile). You can also point it at any public Hugging Face GGUF repo.
+intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llama.cpp](https://github.com/ggml-org/llama.cpp), which intent installs on demand through your system package manager (Homebrew, apt, dnf, …). Each `i` invocation starts a request-scoped `llama-server` child, holds the model warm for that one request (so multi-step tool calls don't reload it), and kills it on exit — there is **no background daemon**. If `llama-server` isn't available it falls back to one-shot `llama-cli`. You can also point it at any public Hugging Face GGUF repo.
 
 ```sh
 # See what's on offer and which one is current.
diff --git a/install.sh b/install.sh
index 12cf36c..f582a74 100755
--- a/install.sh
+++ b/install.sh
@@ -11,7 +11,7 @@
 #   PREFIX           install root (default /usr/local; needs sudo if not writable)
 #   INTENT_TMPDIR    where to stage downloads (default $TMPDIR or /tmp)
 #
-# This script does not auto-install the daemon, the model runtime, or the
+# This script does not auto-install the model runtime (llama.cpp) or the
 # model. Run `i init` after install.
 
 set -Eeuo pipefail
@@ -136,8 +136,8 @@ echo
 
 # Auto-run `intent init` if we have a real TTY on stdin/stderr. Without
 # this, users who curl|bash and ignore the next-steps text get a binary
-# they can't actually use until they read the docs. With it, the model
-# downloads and the daemon starts as part of the install flow.
+# they can't actually use until they read the docs. With it, the runtime
+# installs and the model downloads as part of the install flow.
 #
 # We skip it under `bash -c` / `curl | bash` (no TTY) so non-interactive
 # CI jobs aren't surprised by a 4 GB download.
diff --git a/internal/cli/backend.go b/internal/cli/backend.go
index cc27bc6..92cc682 100644
--- a/internal/cli/backend.go
+++ b/internal/cli/backend.go
@@ -3,16 +3,17 @@ package cli
 import (
 	"context"
 	"fmt"
-	"net"
-	"net/url"
+	"io"
 	"os"
-	"strings"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
 	"github.com/CoreyRDean/intent/internal/model"
+	"github.com/CoreyRDean/intent/internal/model/llamacli"
 	"github.com/CoreyRDean/intent/internal/model/llamafile"
+	"github.com/CoreyRDean/intent/internal/model/llamaserver"
 	"github.com/CoreyRDean/intent/internal/model/mock"
+	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
+	"github.com/CoreyRDean/intent/internal/state"
 	"github.com/CoreyRDean/intent/internal/verbose"
 )
 
@@ -21,10 +22,11 @@ import (
 // unavailable and we silently fell back to the mock — callers use this to
 // surface a per-invocation warning so users aren't left confused.
 //
-// In v1 we wire: mock, llamafile-local, llamafile-network, ollama (as a
-// llamafile-shaped HTTP), openai (as a llamafile-shaped HTTP). The grammar
-// constraint is the same across all of them; the only differences are the
-// endpoint and the auth header.
+// Backends: mock; llama-cli (local llama.cpp: a request-scoped llama-server
+// child, else one-shot llama-cli; legacy alias "llamafile-local");
+// llamafile-network, ollama, and openai (all OpenAI-compatible HTTP). The
+// JSON-schema grammar constraint is the same across all of them; they differ
+// only in transport (local subprocess vs. HTTP endpoint + auth header).
 func buildBackend(name string, cfg *config.Config, modelOverride string) (model.Backend, bool, error) {
 	if v := os.Getenv("INTENT_FORCE_BACKEND"); v != "" {
 		name = v
@@ -32,27 +34,48 @@ func buildBackend(name string, cfg *config.Config, modelOverride string) (model.
 	switch name {
 	case "mock":
 		return mock.New(), false, nil
-	case "llamafile-local":
-		// We expect the daemon (`intentd`) to have started llamafile on
-		// the loopback host:port from config. If nothing's listening, we
-		// fall back to the mock backend so `i hello` doesn't hard-fail
-		// for a brand-new install — instead the mock returns an honest
-		// "the local model isn't installed yet" response.
-		host, port, err := resolveLocalDaemonEndpoint(cfg)
+	case "llama-cli", "llamafile-local":
+		// Local inference runs llama.cpp. Preferred: a request-scoped
+		// `llama-server` child held warm for the whole invocation —
+		// native multi-turn messages (no flattening) and no per-step
+		// model reload across the tool-call loop. Fallback: one-shot
+		// `llama-cli` when the server binary isn't present. If neither
+		// the runtime nor the model is installed, fall back to the mock
+		// so `i hello` doesn't hard-fail for a brand-new install —
+		// ensureBackendReady / `i doctor` guide the fix.
+		// ("llamafile-local" is kept as a back-compat alias for configs
+		// written before the switch to llama.cpp.)
+		dirs, err := state.Resolve()
 		if err != nil {
 			return nil, false, err
 		}
-		endpoint := fmt.Sprintf("http://%s:%s", host, port)
-		if !endpointReachable(endpoint) {
+		rt := intentruntime.New(dirs.Cache)
+		modelPath := rt.ModelPath(selectedModelFile(dirs.State, cfg))
+		if !fileExists(modelPath) {
 			return mock.New(), true, nil
 		}
-		b := llamafile.New(endpoint)
+		tag := cfg.Model
 		if modelOverride != "" {
-			b.ModelTag = modelOverride
-		} else {
-			b.ModelTag = cfg.Model
+			tag = modelOverride
+		}
+		ctxTokens := 0
+		if m := loadCatalog(dirs.State).Get(cfg.Model); m != nil {
+			ctxTokens = m.ContextTokens
+		}
+		switch {
+		case rt.HaveLlamaServer():
+			b := llamaserver.New(rt.LlamaServerPath(), modelPath)
+			b.ModelTag = tag
+			b.ContextSize = ctxTokens
+			return b, false, nil
+		case rt.HaveLlamaCLI():
+			b := llamacli.New(rt.LlamaCLIPath(), modelPath)
+			b.ModelTag = tag
+			b.ContextSize = ctxTokens
+			return b, false, nil
+		default:
+			return mock.New(), true, nil
 		}
-		return b, false, nil
 	case "llamafile-network":
 		ep := os.Getenv("INTENT_LLAMAFILE_ENDPOINT")
 		if ep == "" {
@@ -119,6 +142,16 @@ func buildBackendCtx(ctx context.Context, name string, cfg *config.Config, model
 			l.KV("endpoint", b.Endpoint)
 			l.KV("model_tag", b.ModelTag)
 		}
+		if b, ok := be.(*llamacli.Backend); ok {
+			l.KV("binary", b.BinaryPath)
+			l.KV("model_path", b.ModelPath)
+			l.KV("model_tag", b.ModelTag)
+		}
+		if b, ok := be.(*llamaserver.Backend); ok {
+			l.KV("binary", b.BinaryPath)
+			l.KV("model_path", b.ModelPath)
+			l.KV("model_tag", b.ModelTag)
+		}
 		be = verbose.Backend(l, be)
 	}
 	return be, fb, nil
@@ -131,7 +164,21 @@ func printMockFallbackBanner(isFallback bool) {
 	if !isFallback {
 		return
 	}
-	fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor', 'i model list', or 'i daemon start' to fix.")
+	fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor' or 'i model pull' to fix.")
+}
+
+// isLocalBackend reports whether a backend name uses the local llama-cli
+// runtime (and therefore wants the runtime/model self-healing in
+// ensureBackendReady). The empty string means "use the configured
+// default", which is llama-cli. "llamafile-local" is the back-compat
+// alias for configs predating the switch.
+func isLocalBackend(name string) bool {
+	switch name {
+	case "", "llama-cli", "llamafile-local":
+		return true
+	default:
+		return false
+	}
 }
 
 // isMockBackend reports whether b is the mock backend (by name).
@@ -140,28 +187,18 @@ func isMockBackend(b model.Backend) bool {
 	return b.Name() == "mock"
 }
 
-// endpointReachable does a short-timeout TCP check on the host:port of a URL.
-func endpointReachable(rawURL string) bool {
-	u, err := url.Parse(rawURL)
-	if err != nil {
-		return false
-	}
-	host := u.Host
-	if host == "" {
-		return false
-	}
-	if !strings.Contains(host, ":") {
-		switch u.Scheme {
-		case "https":
-			host += ":443"
-		default:
-			host += ":80"
-		}
-	}
-	c, err := net.DialTimeout("tcp", host, 200*time.Millisecond)
-	if err != nil {
-		return false
+// fileExists reports whether path exists and is not a directory.
+func fileExists(path string) bool {
+	info, err := os.Stat(path)
+	return err == nil && !info.IsDir()
+}
+
+// closeBackend tears down any resources a backend holds — notably the
+// llama-server co-process, which must be killed when the invocation ends.
+// Safe to defer on every backend; a no-op for stateless ones. The verbose
+// wrapper forwards Close to its inner backend.
+func closeBackend(be model.Backend) {
+	if c, ok := be.(io.Closer); ok {
+		_ = c.Close()
 	}
-	_ = c.Close()
-	return true
 }
diff --git a/internal/cli/backend_test.go b/internal/cli/backend_test.go
index b58f9b0..faef6ed 100644
--- a/internal/cli/backend_test.go
+++ b/internal/cli/backend_test.go
@@ -40,39 +40,26 @@ func TestBuildBackend_MockIsNotFallback(t *testing.T) {
 	}
 }
 
-func TestBuildBackend_LlamafileLocalFallsBackWhenUnreachable(t *testing.T) {
+// When llama-cli or the model isn't installed, the local backend falls
+// back to mock so a fresh install doesn't hard-fail. We point the cache
+// at an empty temp dir so the model file is guaranteed absent.
+func TestBuildBackend_LlamaCLILocalFallsBackWhenNotInstalled(t *testing.T) {
 	clearBackendEnv(t)
-	// Point the daemon at a port that is definitely not listening.
-	cfg := minimalConfig()
-	cfg.Raw["daemon.host"] = "127.0.0.1"
-	cfg.Raw["daemon.port"] = "1" // port 1 is reserved; nothing listens there
-
-	be, isFallback, err := buildBackend("llamafile-local", cfg, "")
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if !isFallback {
-		t.Error("unavailable llamafile-local should set isFallback=true")
-	}
-	if be.Name() != "mock" {
-		t.Errorf("expected fallback name %q, got %q", "mock", be.Name())
-	}
-}
-
-func TestBuildBackend_LlamafileLocalRejectsNonLoopbackHost(t *testing.T) {
-	clearBackendEnv(t)
-	cfg := minimalConfig()
-	cfg.Raw["daemon.host"] = "0.0.0.0"
-
-	_, isFallback, err := buildBackend("llamafile-local", cfg, "")
-	if err == nil {
-		t.Fatal("expected error for non-loopback daemon host, got nil")
-	}
-	if isFallback {
-		t.Fatal("invalid daemon host should not silently fall back to mock")
-	}
-	if !strings.Contains(err.Error(), "loopback only") {
-		t.Fatalf("error = %q, want loopback hint", err)
+	t.Setenv("HOME", t.TempDir())
+	t.Setenv("INTENT_STATE_DIR", t.TempDir())
+	t.Setenv("INTENT_CACHE_DIR", t.TempDir())
+
+	for _, name := range []string{"llama-cli", "llamafile-local"} {
+		be, isFallback, err := buildBackend(name, minimalConfig(), "")
+		if err != nil {
+			t.Fatalf("%s: unexpected error: %v", name, err)
+		}
+		if !isFallback {
+			t.Errorf("%s: uninstalled local backend should set isFallback=true", name)
+		}
+		if be.Name() != "mock" {
+			t.Errorf("%s: expected fallback name %q, got %q", name, "mock", be.Name())
+		}
 	}
 }
 
@@ -155,7 +142,7 @@ func TestPrintMockFallbackBanner_MentionsNextSteps(t *testing.T) {
 	io.Copy(&buf, r)
 	out := buf.String()
 
-	for _, hint := range []string{"i doctor", "i daemon start"} {
+	for _, hint := range []string{"i doctor", "i model pull"} {
 		if !strings.Contains(out, hint) {
 			t.Errorf("banner should mention %q; got: %q", hint, out)
 		}
diff --git a/internal/cli/cli.go b/internal/cli/cli.go
index 64701e9..e53543c 100644
--- a/internal/cli/cli.go
+++ b/internal/cli/cli.go
@@ -23,7 +23,6 @@ var knownSubcommands = map[string]commandHandler{
 	"doctor":     cmdDoctor,
 	"config":     cmdConfig,
 	"model":      cmdModel,
-	"daemon":     cmdDaemon,
 	"history":    cmdHistory,
 	"pin":        cmdPin,
 	"run":        cmdRun,
@@ -163,12 +162,11 @@ Tip:
   double quotes for reliable shell parsing across environments.
 
 Subcommands:
-  init        First-run setup (model, daemon, completions).
+  init        First-run setup (model, runtime, completions).
   shell-init  Print shell snippet to source for natural-language quoting.
-  doctor      Diagnose installation, model, daemon, sandbox.
+  doctor      Diagnose installation, runtime, model, sandbox.
   config      Get/set/edit configuration.
   model       Manage local models.
-  daemon      Start/stop/status the background daemon.
   history     Inspect or clear the audit log.
   pin         Promote the last accepted command to a named skill.
   run         Run a pinned skill by name.
@@ -202,7 +200,7 @@ Top-level:
   --help, -h       This help.
   -v, --verbose    Log model I/O, tool calls, and gh round-trips to stderr.
                    (also enabled by INTENT_VERBOSE=1)
-  --uninstall      Remove binary, daemon, and (with consent) state.
+  --uninstall      Remove binary and (with consent) state.
   --update         Equivalent to "update".
 
 Read INTENT.md and docs/SPEC.md before contributing.
diff --git a/internal/cli/config.go b/internal/cli/config.go
index 8592dfa..8e8efac 100644
--- a/internal/cli/config.go
+++ b/internal/cli/config.go
@@ -86,14 +86,14 @@ func cmdConfig(_ context.Context, args []string) int {
 	}
 }
 
+// validateConfigValue is a hook for per-key validation on `i config set`.
+// The only validated key, daemon.host, went away with the background
+// daemon, so there are currently no keys that need rejecting; the
+// function stays as the extension point.
 func validateConfigValue(key, value string) error {
-	switch key {
-	case "daemon.host":
-		_, err := normalizeLocalDaemonHost(value)
-		return err
-	default:
-		return nil
-	}
+	_ = key
+	_ = value
+	return nil
 }
 
 func lookupKnown(c *config.Config, key string) string {
diff --git a/internal/cli/daemon.go b/internal/cli/daemon.go
deleted file mode 100644
index 1b4da4c..0000000
--- a/internal/cli/daemon.go
+++ /dev/null
@@ -1,363 +0,0 @@
-package cli
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"runtime"
-	"syscall"
-	"time"
-
-	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
-	"github.com/CoreyRDean/intent/internal/models"
-	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
-	"github.com/CoreyRDean/intent/internal/state"
-)
-
-const daemonUsage = "usage: i daemon (start | stop | status | logs | install | uninstall)"
-
-// daemonLabel is the launchd / systemd unit name. Stable across versions.
-const daemonLabel = "com.coreyrdean.intent"
-
-func cmdDaemon(ctx context.Context, args []string) int {
-	if len(args) == 0 {
-		errf(daemonUsage)
-		return 1
-	}
-	dirs, err := state.Resolve()
-	if err != nil {
-		errf("daemon: %v", err)
-		return 3
-	}
-	cfg, _ := config.Load(dirs.ConfigPath())
-
-	switch args[0] {
-	case "--help", "-h", "help":
-		fmt.Println(daemonUsage)
-		return 0
-	case "start":
-		return daemonStart(ctx, dirs, cfg, args[1:])
-	case "stop":
-		return daemonStop(dirs)
-	case "status":
-		return daemonStatus(dirs)
-	case "logs":
-		return daemonLogs(dirs)
-	case "install":
-		return daemonInstall(dirs)
-	case "uninstall":
-		return daemonUninstall(dirs)
-	default:
-		errf("unknown subcommand: %q", args[0])
-		return 1
-	}
-}
-
-// daemonStart is the user-visible `i daemon start`. By default it spawns
-// itself in the background (re-execs with --foreground), waits for the
-// control socket to come up, prints a one-line "started" message, and
-// returns — so the user gets their prompt back in well under a second.
-//
-// `--foreground` (or `--attach`) keeps the process attached to the
-// terminal, which is what launchd / systemd want and what `i daemon
-// logs -f` style debugging needs. The env var INTENTD_FOREGROUND is
-// the same switch in env form, so service files don't have to know
-// about the flag.
-func daemonStart(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []string) int {
-	foreground := os.Getenv("INTENTD_FOREGROUND") == "1"
-	for _, a := range args {
-		switch a {
-		case "--foreground", "--attach", "-f":
-			foreground = true
-		case "--background", "-b":
-			foreground = false
-		}
-	}
-	if !foreground {
-		return daemonSpawnDetached(dirs)
-	}
-	return daemonRunForeground(ctx, dirs, cfg)
-}
-
-// daemonSpawnDetached re-execs ourselves with --foreground, redirects
-// the child's stdio to a log file, decouples it from our process group
-// (Setsid), and returns once the control socket is responsive — or
-// after a sane timeout, with the log path so the user can inspect a
-// failure.
-func daemonSpawnDetached(dirs state.Dirs) int {
-	if err := os.MkdirAll(filepath.Join(dirs.State, "logs"), 0o700); err != nil {
-		errf("daemon start: %v", err)
-		return 3
-	}
-	logPath := filepath.Join(dirs.State, "logs", "intentd.log")
-	logF, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600)
-	if err != nil {
-		errf("daemon start: open log %s: %v", logPath, err)
-		return 3
-	}
-	defer logF.Close()
-
-	self, err := os.Executable()
-	if err != nil {
-		errf("daemon start: locate self: %v", err)
-		return 3
-	}
-	cmd := exec.Command(self, "daemon", "start", "--foreground")
-	cmd.Env = append(os.Environ(), "INTENTD_FOREGROUND=1")
-	cmd.Stdout = logF
-	cmd.Stderr = logF
-	cmd.Stdin = nil
-	cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
-	if err := cmd.Start(); err != nil {
-		errf("daemon start: spawn: %v", err)
-		return 3
-	}
-	// Don't wait for it — we want it to outlive us.
-	go func() { _ = cmd.Process.Release() }()
-
-	// Poll the control socket for readiness. The child has 30s to come
-	// up before we report failure; on a cold cache that's mostly
-	// llamafile loading the model.
-	deadline := time.Now().Add(30 * time.Second)
-	c := daemon.NewClient(dirs.SocketPath())
-	for time.Now().Before(deadline) {
-		if resp, err := c.Call(daemon.Request{Op: daemon.OpPing}); err == nil && resp.OK {
-			fmt.Fprintln(os.Stderr, "intentd: started in the background.")
-			fmt.Fprintf(os.Stderr, "  socket: %s\n", dirs.SocketPath())
-			fmt.Fprintf(os.Stderr, "  log:    %s\n", logPath)
-			return 0
-		}
-		time.Sleep(250 * time.Millisecond)
-	}
-	errf("daemon start: timed out waiting for control socket; tail -f %s", logPath)
-	return 3
-}
-
-func daemonRunForeground(ctx context.Context, dirs state.Dirs, cfg *config.Config) int {
-	mgr := intentruntime.New(dirs.Cache)
-	if !mgr.HaveLlamafile() {
-		errf("daemon: llamafile runtime missing — run `i model pull` first")
-		errf("  expected: %s", mgr.LlamafilePath())
-		return 3
-	}
-	// Resolve the model through the full catalog (built-in + custom)
-	// so the daemon loads exactly what `i model use` selected, even
-	// for user-added HF repos that aren't in the built-in list.
-	cat := loadCatalog(dirs.State)
-	id := cfg.Model
-	if id == "" {
-		id = models.DefaultID
-	}
-	host, port, err := resolveLocalDaemonEndpoint(cfg)
-	if err != nil {
-		errf("daemon: %v", err)
-		return 1
-	}
-	m := cat.Get(id)
-	if m == nil {
-		errf("daemon: current model %q not in catalog; run `i model list` and `i model use <id>`", id)
-		return 1
-	}
-	modelPath := mgr.ModelPath(models.ModelFilename(m))
-	if _, err := os.Stat(modelPath); err != nil {
-		errf("daemon: model %q not installed — run `i model pull %s`", id, id)
-		errf("  expected: %s", modelPath)
-		return 3
-	}
-
-	logDir := filepath.Join(dirs.State, "logs")
-	if err := os.MkdirAll(logDir, 0o700); err != nil {
-		errf("daemon: mkdir log dir: %v", err)
-		return 3
-	}
-	logPath := filepath.Join(logDir, "llamafile.log")
-	logF, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600)
-	if err != nil {
-		errf("daemon: open log: %v", err)
-		return 3
-	}
-	defer logF.Close()
-
-	portNum := 18080
-	fmt.Sscanf(port, "%d", &portNum)
-
-	launcher := daemon.NewLauncher(mgr.LlamafilePath(), modelPath, host, portNum)
-	launcher.StdoutLog = logF
-	launcher.StderrLog = io.MultiWriter(logF, os.Stderr)
-
-	startCtx, cancelStart := context.WithTimeout(ctx, 90*time.Second)
-	fmt.Fprintln(os.Stderr, "intentd: starting llamafile...")
-	if err := launcher.Start(startCtx); err != nil {
-		cancelStart()
-		errf("daemon: start llamafile: %v", err)
-		return 3
-	}
-	cancelStart()
-	fmt.Fprintf(os.Stderr, "intentd: llamafile ready on %s (pid %d)\n",
-		launcher.Endpoint(), launcher.PID())
-
-	srv := daemon.New(dirs.SocketPath(), launcher)
-	if err := srv.Listen(); err != nil {
-		launcher.Stop(5 * time.Second)
-		errf("daemon: listen: %v", err)
-		return 3
-	}
-	fmt.Fprintf(os.Stderr, "intentd: control socket at %s\n", dirs.SocketPath())
-
-	sigCtx, cancelSig := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP)
-	defer cancelSig()
-	serveDone := make(chan struct{})
-	go func() {
-		_ = srv.Serve(sigCtx)
-		close(serveDone)
-	}()
-
-	// Block until any of: an OS signal, an `i daemon stop` over the
-	// socket, the parent context is canceled, or the supervised
-	// llamafile gives up entirely (Wait drains).
-	llamaDone := make(chan struct{})
-	go func() { launcher.Wait(); close(llamaDone) }()
-	select {
-	case <-sigCtx.Done():
-	case <-srv.Stopped():
-	case <-serveDone:
-	case <-llamaDone:
-	}
-	fmt.Fprintln(os.Stderr, "intentd: shutting down...")
-	srv.SignalStop()
-	launcher.Stop(10 * time.Second)
-	fmt.Fprintln(os.Stderr, "intentd: stopped.")
-	return 0
-}
-
-func daemonStop(dirs state.Dirs) int {
-	c := daemon.NewClient(dirs.SocketPath())
-	resp, err := c.Call(daemon.Request{Op: daemon.OpStop})
-	if err != nil {
-		errf("daemon stop: %v (is the daemon running?)", err)
-		return 1
-	}
-	if !resp.OK {
-		errf("daemon stop: %s", resp.Error)
-		return 1
-	}
-	fmt.Println("daemon: stop requested")
-	return 0
-}
-
-func daemonStatus(dirs state.Dirs) int {
-	c := daemon.NewClient(dirs.SocketPath())
-	resp, err := c.Call(daemon.Request{Op: daemon.OpStatus})
-	if err != nil {
-		fmt.Println("daemon: not running")
-		fmt.Println("  socket:", dirs.SocketPath())
-		fmt.Println("  installed as service:", daemon.IsInstalled(daemonLabel))
-		return 1
-	}
-	if !resp.OK {
-		errf("daemon status: %s", resp.Error)
-		return 1
-	}
-	fmt.Println("daemon: running")
-	for k, v := range resp.Data {
-		fmt.Printf("  %s: %v\n", k, v)
-	}
-	return 0
-}
-
-func daemonLogs(dirs state.Dirs) int {
-	logPath := filepath.Join(dirs.State, "logs", "llamafile.log")
-	if runtime.GOOS == "linux" && daemon.IsInstalled(daemonLabel) {
-		fmt.Fprintln(os.Stderr, "Tip: run `journalctl --user -u "+daemonLabel+".service -f` for the systemd-managed log.")
-		fmt.Fprintln(os.Stderr, "Showing the llamafile subprocess log:", logPath)
-	}
-	f, err := os.Open(logPath)
-	if err != nil {
-		errf("logs: %v", err)
-		return 1
-	}
-	defer f.Close()
-	if _, err := io.Copy(os.Stdout, f); err != nil {
-		errf("logs: %v", err)
-		return 1
-	}
-	return 0
-}
-
-func daemonInstall(dirs state.Dirs) int {
-	bin, err := os.Executable()
-	if err != nil {
-		errf("daemon install: locate self: %v", err)
-		return 3
-	}
-	bin, _ = filepath.EvalSymlinks(bin)
-	res, err := daemon.Install(daemon.InstallParams{
-		Binary: bin,
-		Label:  daemonLabel,
-		LogDir: filepath.Join(dirs.State, "logs"),
-		Socket: dirs.SocketPath(),
-		Cache:  dirs.Cache,
-		State:  dirs.State,
-	})
-	if err != nil {
-		errf("daemon install: %v", err)
-		return 3
-	}
-	fmt.Println("daemon installed as a system service.")
-	fmt.Println("  unit:    ", res.UnitPath)
-	fmt.Println("  start:   ", strJoin(res.StartCmd))
-	fmt.Println("  stop:    ", strJoin(res.StopCmd))
-	if res.LogPath != "" {
-		fmt.Println("  log:     ", res.LogPath)
-	}
-	if res.Notes != "" {
-		fmt.Println()
-		fmt.Println(res.Notes)
-	}
-	return 0
-}
-
-func daemonUninstall(dirs state.Dirs) int {
-	// Try a polite stop first.
-	c := daemon.NewClient(dirs.SocketPath())
-	_, _ = c.Call(daemon.Request{Op: daemon.OpStop})
-	if err := daemon.Uninstall(daemonLabel); err != nil {
-		errf("daemon uninstall: %v", err)
-		return 3
-	}
-	fmt.Println("daemon: service uninstalled.")
-	return 0
-}
-
-func strJoin(parts []string) string {
-	if len(parts) == 0 {
-		return "(none)"
-	}
-	out := ""
-	for i, p := range parts {
-		if i > 0 {
-			out += " "
-		}
-		out += p
-	}
-	return out
-}
-
-// modelFileFor maps a config model tag (e.g. "qwen2.5-coder-7b-instruct-q4_k_m")
-// to the GGUF filename we expect on disk. v1 is hard-coded to one default;
-// future versions consult a model registry.
-func modelFileFor(tag string) string {
-	if tag == intentruntime.DefaultModel.Name {
-		return intentruntime.DefaultModel.File
-	}
-	// Best-effort: assume tag + ".gguf".
-	if filepath.Ext(tag) == ".gguf" {
-		return tag
-	}
-	return tag + ".gguf"
-}
diff --git a/internal/cli/daemon_host.go b/internal/cli/daemon_host.go
deleted file mode 100644
index 871faae..0000000
--- a/internal/cli/daemon_host.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package cli
-
-import (
-	"fmt"
-	"net"
-	"strings"
-
-	"github.com/CoreyRDean/intent/internal/config"
-)
-
-const defaultLocalDaemonHost = "127.0.0.1"
-const defaultLocalDaemonPort = "18080"
-
-// normalizeLocalDaemonHost accepts only loopback hosts for the local daemon.
-// Any accepted value is canonicalized to 127.0.0.1 so the local backend never
-// accidentally exposes the model server on a broader interface.
-func normalizeLocalDaemonHost(raw string) (string, error) {
-	host := strings.TrimSpace(raw)
-	if host == "" {
-		return defaultLocalDaemonHost, nil
-	}
-	if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
-		host = strings.TrimSuffix(strings.TrimPrefix(host, "["), "]")
-	}
-	if strings.EqualFold(host, "localhost") {
-		return defaultLocalDaemonHost, nil
-	}
-	if ip := net.ParseIP(host); ip != nil && ip.IsLoopback() {
-		return defaultLocalDaemonHost, nil
-	}
-	return "", fmt.Errorf("daemon.host %q must resolve to loopback only", strings.TrimSpace(raw))
-}
-
-func resolveLocalDaemonHost(cfg *config.Config) (string, error) {
-	if cfg == nil {
-		return normalizeLocalDaemonHost("")
-	}
-	return normalizeLocalDaemonHost(cfg.Raw["daemon.host"])
-}
-
-func resolveLocalDaemonPort(cfg *config.Config) string {
-	if cfg == nil {
-		return defaultLocalDaemonPort
-	}
-	if port := strings.TrimSpace(cfg.Raw["daemon.port"]); port != "" {
-		return port
-	}
-	return defaultLocalDaemonPort
-}
-
-func resolveLocalDaemonEndpoint(cfg *config.Config) (host, port string, err error) {
-	host, err = resolveLocalDaemonHost(cfg)
-	if err != nil {
-		return "", "", err
-	}
-	return host, resolveLocalDaemonPort(cfg), nil
-}
diff --git a/internal/cli/daemon_host_test.go b/internal/cli/daemon_host_test.go
deleted file mode 100644
index 22d7ade..0000000
--- a/internal/cli/daemon_host_test.go
+++ /dev/null
@@ -1,75 +0,0 @@
-package cli
-
-import (
-	"strings"
-	"testing"
-
-	"github.com/CoreyRDean/intent/internal/config"
-)
-
-func TestNormalizeLocalDaemonHost(t *testing.T) {
-	tests := []struct {
-		name    string
-		raw     string
-		want    string
-		wantErr string
-	}{
-		{name: "default empty host", raw: "", want: "127.0.0.1"},
-		{name: "localhost", raw: "localhost", want: "127.0.0.1"},
-		{name: "ipv4 loopback", raw: "127.0.0.1", want: "127.0.0.1"},
-		{name: "ipv6 loopback", raw: "::1", want: "127.0.0.1"},
-		{name: "bracketed ipv6 loopback", raw: "[::1]", want: "127.0.0.1"},
-		{name: "non-loopback wildcard rejected", raw: "0.0.0.0", wantErr: "loopback only"},
-		{name: "non-loopback ip rejected", raw: "192.168.1.10", wantErr: "loopback only"},
-		{name: "hostname rejected", raw: "example.com", wantErr: "loopback only"},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got, err := normalizeLocalDaemonHost(tt.raw)
-			if tt.wantErr != "" {
-				if err == nil {
-					t.Fatalf("expected error containing %q, got nil", tt.wantErr)
-				}
-				if !strings.Contains(err.Error(), tt.wantErr) {
-					t.Fatalf("error = %q, want substring %q", err, tt.wantErr)
-				}
-				return
-			}
-			if err != nil {
-				t.Fatalf("unexpected error: %v", err)
-			}
-			if got != tt.want {
-				t.Fatalf("host = %q, want %q", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestResolveLocalDaemonEndpoint(t *testing.T) {
-	cfg := &config.Config{Raw: map[string]string{
-		"daemon.host": " localhost ",
-		"daemon.port": " 19090 ",
-	}}
-
-	host, port, err := resolveLocalDaemonEndpoint(cfg)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if host != "127.0.0.1" {
-		t.Fatalf("host = %q, want %q", host, "127.0.0.1")
-	}
-	if port != "19090" {
-		t.Fatalf("port = %q, want %q", port, "19090")
-	}
-}
-
-func TestValidateConfigValueRejectsRemoteDaemonHost(t *testing.T) {
-	err := validateConfigValue("daemon.host", "0.0.0.0")
-	if err == nil {
-		t.Fatal("expected daemon.host validation error, got nil")
-	}
-	if !strings.Contains(err.Error(), "loopback only") {
-		t.Fatalf("error = %q, want loopback hint", err)
-	}
-}
diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go
index 5d67363..9bc1240 100644
--- a/internal/cli/doctor.go
+++ b/internal/cli/doctor.go
@@ -7,22 +7,11 @@ import (
 	"runtime"
 
 	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
 	"github.com/CoreyRDean/intent/internal/state"
 	"github.com/CoreyRDean/intent/internal/version"
 )
 
-type daemonStatusCaller interface {
-	Call(req daemon.Request) (*daemon.Response, error)
-}
-
-var newDaemonStatusClient = func(socket string) daemonStatusCaller {
-	return daemon.NewClient(socket)
-}
-
-var daemonServiceInstalled = daemon.IsInstalled
-
 func cmdDoctor(_ context.Context, _ []string) int {
 	ok := true
 	check := func(name, status string, good bool) {
@@ -54,15 +43,17 @@ func cmdDoctor(_ context.Context, _ []string) int {
 		}
 
 		rt := intentruntime.New(dirs.Cache)
-		check("llamafile runtime",
-			fmt.Sprintf("expected at %s", rt.LlamafilePath()),
-			rt.HaveLlamafile())
+		runtimeStatus := "missing — run `i model pull` to install via your package manager"
+		switch {
+		case rt.HaveLlamaServer():
+			runtimeStatus = "found at " + rt.LlamaServerPath()
+		case rt.HaveLlamaCLI():
+			runtimeStatus = "llama-server missing; using one-shot fallback at " + rt.LlamaCLIPath()
+		}
+		check("llama.cpp runtime", runtimeStatus, rt.HaveLlamaRuntime())
 
 		modelFile, modelStatus := resolveModelCheck(cfg)
 		check("model", fmt.Sprintf("%s — %s", modelStatus, rt.ModelPath(modelFile)), rt.HaveModel(modelFile))
-
-		daemonStatus, daemonOK := doctorDaemonStatus(dirs)
-		check("daemon", daemonStatus, daemonOK)
 	}
 
 	// Sandbox tooling.
@@ -107,26 +98,3 @@ func okStr(err error) string {
 	}
 	return "missing"
 }
-
-func doctorDaemonStatus(dirs state.Dirs) (string, bool) {
-	installed := daemonServiceInstalled(daemonLabel)
-	resp, err := newDaemonStatusClient(dirs.SocketPath()).Call(daemon.Request{Op: daemon.OpStatus})
-	if err != nil {
-		if installed {
-			return "installed but not responding", false
-		}
-		return "not running (optional)", true
-	}
-	if !resp.OK {
-		return "unhealthy: " + resp.Error, false
-	}
-
-	serviceState := "no"
-	if installed {
-		serviceState = "yes"
-	}
-	if endpoint, _ := resp.Data["llamafile_endpoint"].(string); endpoint != "" {
-		return fmt.Sprintf("running (service installed: %s, endpoint: %s)", serviceState, endpoint), true
-	}
-	return fmt.Sprintf("running (service installed: %s)", serviceState), true
-}
diff --git a/internal/cli/doctor_test.go b/internal/cli/doctor_test.go
index 80010f1..2f8c9c0 100644
--- a/internal/cli/doctor_test.go
+++ b/internal/cli/doctor_test.go
@@ -2,14 +2,11 @@ package cli
 
 import (
 	"context"
-	"errors"
 	"strings"
 	"testing"
 
 	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
-	"github.com/CoreyRDean/intent/internal/state"
 )
 
 func TestResolveModelCheck(t *testing.T) {
@@ -58,100 +55,17 @@ func TestResolveModelCheck(t *testing.T) {
 	}
 }
 
-type stubDaemonStatusClient struct {
-	resp *daemon.Response
-	err  error
-}
-
-func (s stubDaemonStatusClient) Call(_ daemon.Request) (*daemon.Response, error) {
-	return s.resp, s.err
-}
-
-func TestDoctorDaemonStatus(t *testing.T) {
-	origNewClient := newDaemonStatusClient
-	origInstalled := daemonServiceInstalled
-	t.Cleanup(func() {
-		newDaemonStatusClient = origNewClient
-		daemonServiceInstalled = origInstalled
-	})
-
-	dirs := state.Dirs{State: t.TempDir()}
-
-	tests := []struct {
-		name      string
-		installed bool
-		client    stubDaemonStatusClient
-		want      string
-		wantOK    bool
-	}{
-		{
-			name:      "missing optional daemon is informational",
-			installed: false,
-			client:    stubDaemonStatusClient{err: errors.New("dial unix: no such file or directory")},
-			want:      "not running (optional)",
-			wantOK:    true,
-		},
-		{
-			name:      "installed daemon that does not respond is unhealthy",
-			installed: true,
-			client:    stubDaemonStatusClient{err: errors.New("connection refused")},
-			want:      "installed but not responding",
-			wantOK:    false,
-		},
-		{
-			name:      "running daemon reports endpoint",
-			installed: false,
-			client: stubDaemonStatusClient{resp: &daemon.Response{
-				OK: true,
-				Data: map[string]any{
-					"llamafile_endpoint": "http://127.0.0.1:18080",
-				},
-			}},
-			want:   "running (service installed: no, endpoint: http://127.0.0.1:18080)",
-			wantOK: true,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			newDaemonStatusClient = func(string) daemonStatusCaller { return tt.client }
-			daemonServiceInstalled = func(string) bool { return tt.installed }
-
-			got, gotOK := doctorDaemonStatus(dirs)
-			if got != tt.want {
-				t.Fatalf("status = %q, want %q", got, tt.want)
-			}
-			if gotOK != tt.wantOK {
-				t.Fatalf("ok = %v, want %v", gotOK, tt.wantOK)
-			}
-		})
-	}
-}
-
-func TestDoctorPrintsDaemonStatus(t *testing.T) {
-	origNewClient := newDaemonStatusClient
-	origInstalled := daemonServiceInstalled
-	t.Cleanup(func() {
-		newDaemonStatusClient = origNewClient
-		daemonServiceInstalled = origInstalled
-	})
-
+// TestDoctorReportsLlamaRuntime verifies that doctor's output includes
+// the "llama.cpp runtime" check line that replaced the daemon status.
+func TestDoctorReportsLlamaRuntime(t *testing.T) {
 	t.Setenv("HOME", t.TempDir())
 	t.Setenv("INTENT_STATE_DIR", t.TempDir())
 	t.Setenv("INTENT_CACHE_DIR", t.TempDir())
 
-	newDaemonStatusClient = func(string) daemonStatusCaller {
-		return stubDaemonStatusClient{err: errors.New("dial unix: no such file or directory")}
-	}
-	daemonServiceInstalled = func(string) bool { return false }
-
 	out := captureStdout(func() {
 		_ = cmdDoctor(context.Background(), nil)
 	})
-	if !strings.Contains(out, "daemon") {
-		t.Fatalf("doctor output missing daemon line: %q", out)
-	}
-	if !strings.Contains(out, "not running (optional)") {
-		t.Fatalf("doctor output missing optional daemon status: %q", out)
+	if !strings.Contains(out, "llama.cpp runtime") {
+		t.Fatalf("doctor output missing llama.cpp runtime line: %q", out)
 	}
 }
diff --git a/internal/cli/ensure.go b/internal/cli/ensure.go
index f45b399..7cf354a 100644
--- a/internal/cli/ensure.go
+++ b/internal/cli/ensure.go
@@ -4,14 +4,11 @@ import (
 	"bufio"
 	"context"
 	"fmt"
-	"net"
 	"os"
 	"path/filepath"
 	"strings"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
 	"github.com/CoreyRDean/intent/internal/models"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
 	"github.com/CoreyRDean/intent/internal/state"
@@ -19,36 +16,31 @@ import (
 )
 
 // ensureBackendReady is the self-healing precondition for any subcommand
-// that wants to talk to the local model. It checks (in order):
+// that wants to talk to the local model. Local inference runs llama.cpp's
+// `llama-cli` one-shot — there is no daemon to start — so "ready" means:
 //
-//  1. The daemon is reachable. If yes, we're done.
-//  2. The runtime + model are present on disk.
-//     - If not and stdin is a TTY: ask permission, then download.
-//     - If not and we're non-interactive: fail with a clear, copyable
-//     command that fixes it.
-//  3. With files in place but no daemon, start one in the background.
-//  4. Wait briefly for the daemon's control socket to come up.
+//  1. The llama.cpp runtime is installed (via the system package manager).
+//  2. The selected GGUF model is downloaded.
 //
-// Returns true if the call site should proceed, false if it should
-// bail out (we already printed the failure reason).
+// If either is missing:
+//   - interactive TTY: ask permission, then install/download.
+//   - non-interactive: fail with a clear, copyable command that fixes it.
 //
-// Backend-name guard: this only fires for the local llamafile backend.
-// Users on `openai`, `ollama`, or `mock` get no prompts and no startup
-// attempts — we're not their package manager.
+// Returns true if the call site should proceed, false if it should bail
+// out (we already printed the failure reason).
+//
+// Backend-name guard: this only fires for the local llama-cli backend.
+// Users on `openai`, `ollama`, `llamafile-network`, or `mock` get no
+// prompts and no install attempts — we're not their package manager.
 func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config) bool {
-	if cfg.Backend != "" && cfg.Backend != "llamafile-local" {
+	if !isLocalBackend(cfg.Backend) {
 		return true
 	}
 
-	// (1) Daemon already up?
-	if pingDaemon(dirs) {
-		return true
-	}
-
-	mgr := intentruntime.New(dirs.Cache)
-	// Resolve the *selected* model through the catalog so self-
-	// healing downloads the right thing when the user has switched
-	// to a custom HF repo or a non-default built-in.
+	rt := intentruntime.New(dirs.Cache)
+	// Resolve the *selected* model through the catalog so self-healing
+	// downloads the right thing when the user has switched to a custom
+	// HF repo or a non-default built-in.
 	cat := loadCatalog(dirs.State)
 	id := cfg.Model
 	if id == "" {
@@ -56,91 +48,62 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config
 	}
 	selected := cat.Get(id)
 	if selected == nil {
-		// Fall back to the catalog default to at least make progress;
-		// the daemon will complain later if this mismatches config.
 		selected = cat.Default()
 	}
-	haveLF := mgr.HaveLlamafile()
-	haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected))
-	interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr)
+	haveRuntime := rt.HaveLlamaRuntime()
+	haveModel := selected != nil && rt.HaveModel(models.ModelFilename(selected))
+	if haveRuntime && haveModel {
+		return true
+	}
 
-	// (2) Missing artifacts.
-	if !haveLF || !haveModel {
-		if !interactive {
-			fmt.Fprintln(os.Stderr, "intent: local model isn't installed yet.")
-			fmt.Fprintln(os.Stderr, "  run: i model pull")
-			return false
-		}
-		fmt.Fprintln(os.Stderr, "intent: the local model isn't installed yet.")
-		if !haveLF {
-			fmt.Fprintln(os.Stderr, "  missing runtime: llamafile-"+intentruntime.LlamafileVersion)
-		}
-		if !haveModel && selected != nil {
-			fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n",
-				selected.ID, selected.SizeMB)
-		}
-		if !confirmYes("Download now?") {
-			fmt.Fprintln(os.Stderr, "intent: skipped. Run `i model pull` later.")
-			return false
-		}
-		if !haveLF {
-			fmt.Fprintln(os.Stderr, "downloading runtime...")
-			if err := mgr.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil {
-				fmt.Fprintln(os.Stderr)
-				errf("runtime: %v", err)
-				return false
-			}
-			fmt.Fprintln(os.Stderr)
+	interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr)
+	if !interactive {
+		fmt.Fprintln(os.Stderr, "intent: local model isn't ready yet.")
+		if !haveRuntime {
+			fmt.Fprintln(os.Stderr, "  missing runtime: llama.cpp (llama-server)")
 		}
 		if !haveModel && selected != nil {
-			fmt.Fprintf(os.Stderr, "downloading model (~%d MB)...\n", selected.SizeMB)
-			mi := intentruntime.FromCatalog(selected)
-			if err := mgr.EnsureModel(ctx, mi, progressCB("model")); err != nil {
-				fmt.Fprintln(os.Stderr)
-				errf("model: %v", err)
-				return false
-			}
-			fmt.Fprintln(os.Stderr)
+			fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n", selected.ID, selected.SizeMB)
 		}
+		fmt.Fprintln(os.Stderr, "  run: i model pull")
+		return false
 	}
 
-	// (3) Start the daemon. We use the same `i daemon start` code path
-	// as the user would, so behaviour matches and bugs are shared.
-	fmt.Fprintln(os.Stderr, "intent: starting daemon in the background...")
-	if rc := daemonSpawnDetached(dirs); rc != 0 {
-		fmt.Fprintln(os.Stderr, "intent: daemon failed to start; falling back to mock.")
+	fmt.Fprintln(os.Stderr, "intent: the local model isn't ready yet.")
+	if !haveRuntime {
+		fmt.Fprintln(os.Stderr, "  missing runtime: llama.cpp (llama-server)")
+	}
+	if !haveModel && selected != nil {
+		fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n", selected.ID, selected.SizeMB)
+	}
+	if !confirmYes("Set up now?") {
+		fmt.Fprintln(os.Stderr, "intent: skipped. Run `i model pull` later.")
 		return false
 	}
 
-	// (4) Confirm it's actually responsive (daemonSpawnDetached already
-	// polls, but be defensive — the socket might be ready while
-	// llamafile is still warming up its first inference).
-	deadline := time.Now().Add(60 * time.Second)
-	for time.Now().Before(deadline) {
-		if pingDaemon(dirs) {
-			return true
+	if !haveRuntime {
+		fmt.Fprintln(os.Stderr, "installing llama.cpp...")
+		if err := rt.EnsureLlamaRuntime(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
+			errf("runtime: %v", err)
+			return false
 		}
-		time.Sleep(200 * time.Millisecond)
 	}
-	fmt.Fprintln(os.Stderr, "intent: daemon started but isn't responding yet; try again in a few seconds.")
-	return false
-}
-
-// pingDaemon checks both that the control socket exists and that the
-// daemon answers a ping. Either an unreachable socket or a sad daemon
-// returns false.
-func pingDaemon(dirs state.Dirs) bool {
-	if _, err := os.Stat(dirs.SocketPath()); err != nil {
-		return false
+	if !haveModel && selected != nil {
+		fmt.Fprintf(os.Stderr, "downloading model (~%d MB)...\n", selected.SizeMB)
+		mi := intentruntime.FromCatalog(selected)
+		if err := rt.EnsureModel(ctx, mi, progressCB("model")); err != nil {
+			fmt.Fprintln(os.Stderr)
+			errf("model: %v", err)
+			return false
+		}
+		fmt.Fprintln(os.Stderr)
 	}
-	c, err := net.DialTimeout("unix", dirs.SocketPath(), 200*time.Millisecond)
-	if err != nil {
+
+	if !rt.HaveLlamaRuntime() {
+		fmt.Fprintln(os.Stderr, "intent: llama.cpp still not available; falling back to mock.")
 		return false
 	}
-	_ = c.Close()
-	cli := daemon.NewClient(dirs.SocketPath())
-	resp, err := cli.Call(daemon.Request{Op: daemon.OpPing})
-	return err == nil && resp.OK
+	return true
 }
 
 // cfgModelFile turns the configured model tag into a GGUF filename,
@@ -188,25 +151,3 @@ func confirmYes(prompt string) bool {
 	line = strings.TrimSpace(strings.ToLower(line))
 	return line == "" || line == "y" || line == "yes"
 }
-
-// startDaemonAndWait is a small helper used by `i init` after a model
-// pull, to bring the daemon up without making the user run a third
-// command. It mirrors ensureBackendReady's daemon-startup half but
-// with louder logging since this is an explicit setup step.
-func startDaemonAndWait(dirs state.Dirs) error {
-	if pingDaemon(dirs) {
-		return nil
-	}
-	if rc := daemonSpawnDetached(dirs); rc != 0 {
-		return fmt.Errorf("daemon failed to start (see %s)",
-			filepath.Join(dirs.State, "logs", "intentd.log"))
-	}
-	deadline := time.Now().Add(60 * time.Second)
-	for time.Now().Before(deadline) {
-		if pingDaemon(dirs) {
-			return nil
-		}
-		time.Sleep(200 * time.Millisecond)
-	}
-	return fmt.Errorf("daemon started but didn't become responsive in 60s")
-}
diff --git a/internal/cli/explain.go b/internal/cli/explain.go
index d5afc99..9ed065c 100644
--- a/internal/cli/explain.go
+++ b/internal/cli/explain.go
@@ -37,6 +37,7 @@ func cmdExplain(ctx context.Context, args []string) int {
 		errf("explain: %v", err)
 		return 3
 	}
+	defer closeBackend(be)
 	printMockFallbackBanner(isFallback)
 
 	vl := verbose.FromContext(ctx)
diff --git a/internal/cli/init.go b/internal/cli/init.go
index 6ecc8aa..082d570 100644
--- a/internal/cli/init.go
+++ b/internal/cli/init.go
@@ -44,20 +44,6 @@ func cmdInit(ctx context.Context, args []string) int {
 	fmt.Printf("  cache dir: %s\n", dirs.Cache)
 	fmt.Println()
 
-	// Daemon prompt — default Yes, per D-004.
-	fmt.Print("Keep intent warm in the background so it never has to load? [Y/n] ")
-	answer := "y"
-	if !autoYes {
-		r := bufio.NewReader(os.Stdin)
-		line, _ := r.ReadString('\n')
-		line = strings.TrimSpace(strings.ToLower(line))
-		if line == "" {
-			line = "y"
-		}
-		answer = line
-	}
-	cfg.DaemonEnabled = answer == "y" || answer == "yes"
-
 	// Shell integration prompt — default Yes. Without it, zsh users
 	// hit "no matches found" the first time they type a prompt with
 	// a literal `?` in it, which is a brutal first impression.
@@ -81,11 +67,7 @@ func cmdInit(ctx context.Context, args []string) int {
 
 	fmt.Println()
 	fmt.Println("Wrote", dirs.ConfigPath())
-	if cfg.DaemonEnabled {
-		fmt.Println("Daemon: enabled. Run `i daemon install` to register it as a launchd/systemd service.")
-	} else {
-		fmt.Println("Daemon: disabled. Each invocation will cold-load the model.")
-	}
+	fmt.Println("Local inference runs llama.cpp's `llama-cli` on demand (no background daemon).")
 
 	if installHook {
 		writeShellHook()
@@ -94,9 +76,9 @@ func cmdInit(ctx context.Context, args []string) int {
 		fmt.Println("            ? * [ ] characters, or run `i shell-init zsh >> ~/.zshrc` later.")
 	}
 
-	// Model pull + daemon start. This is the difference between
-	// "config written, now go figure out three more commands" and
-	// "open a new shell and you're working." Default Yes.
+	// Runtime install + model pull. This is the difference between
+	// "config written, now go figure out more commands" and "open a
+	// new shell and you're working." Default Yes.
 	mgr := intentruntime.New(dirs.Cache)
 	cat := loadCatalog(dirs.State)
 	// Prefer whatever the user already selected in config over the
@@ -106,14 +88,14 @@ func cmdInit(ctx context.Context, args []string) int {
 	if selected == nil {
 		selected = cat.Default()
 	}
-	haveLF := mgr.HaveLlamafile()
+	haveRuntime := mgr.HaveLlamaRuntime()
 	haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected))
-	if !haveLF || !haveModel {
+	if !haveRuntime || !haveModel {
 		fmt.Println()
 		if selected != nil {
-			fmt.Printf("Download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB)
+			fmt.Printf("Install llama.cpp and download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB)
 		} else {
-			fmt.Printf("Download the default local model now? [Y/n] ")
+			fmt.Printf("Install llama.cpp and the default local model now? [Y/n] ")
 		}
 		pullAnswer := "y"
 		if !autoYes {
@@ -126,11 +108,11 @@ func cmdInit(ctx context.Context, args []string) int {
 			pullAnswer = line
 		}
 		if pullAnswer == "y" || pullAnswer == "yes" {
-			if !haveLF {
-				fmt.Println("downloading runtime...")
-				if err := mgr.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil {
+			if !haveRuntime {
+				fmt.Println("installing llama.cpp via your package manager...")
+				if err := mgr.EnsureLlamaRuntime(ctx, func(s string) { fmt.Println("  " + s) }); err != nil {
 					fmt.Println()
-					errf("init: download runtime: %v", err)
+					errf("init: install runtime: %v", err)
 					fmt.Println("you can retry with `i model pull`.")
 					return 0
 				}
@@ -152,20 +134,10 @@ func cmdInit(ctx context.Context, args []string) int {
 		}
 	} else {
 		fmt.Println()
+		fmt.Println("Runtime:     llama.cpp already installed.")
 		fmt.Println("Model:       already installed.")
 	}
 
-	if cfg.DaemonEnabled {
-		fmt.Println("Starting daemon...")
-		if err := startDaemonAndWait(dirs); err != nil {
-			errf("init: %v", err)
-			fmt.Println("you can retry with `i daemon start` (and inspect logs at",
-				filepath.Join(dirs.State, "logs", "intentd.log")+").")
-		} else {
-			fmt.Println("Daemon:      running.")
-		}
-	}
-
 	fmt.Println()
 	fmt.Println("All set. Try:")
 	fmt.Println("  i hello              # smoke test")
diff --git a/internal/cli/intent.go b/internal/cli/intent.go
index 71bdee4..0966ef8 100644
--- a/internal/cli/intent.go
+++ b/internal/cli/intent.go
@@ -213,10 +213,10 @@ func cmdIntent(ctx context.Context, args []string) int {
 		return 3
 	}
 
-	// Self-heal: if the backend is local-llamafile and the daemon
-	// isn't reachable, offer to download the model and start it.
-	// This collapses what used to be three commands the user had to
-	// guess (`i model pull`, `i daemon install`, retry) into one
+	// Self-heal: if the backend is the local llama-cli runtime and it
+	// (or the selected model) isn't installed yet, offer to install the
+	// runtime via the system package manager and download the model.
+	// This collapses what used to be several setup commands into one
 	// prompt or, if `--yes` is set, zero. See ensure.go.
 	backendForCheck := cfg.Backend
 	if fl.backend != "" {
@@ -251,6 +251,7 @@ func cmdIntent(ctx context.Context, args []string) int {
 		errf("backend: %v", err)
 		return 3
 	}
+	defer closeBackend(be)
 	printMockFallbackBanner(isFallback)
 
 	// Top-level verbose breadcrumbs. Safe no-op when -v is off.
diff --git a/internal/cli/model.go b/internal/cli/model.go
index 580479e..7bb808a 100644
--- a/internal/cli/model.go
+++ b/internal/cli/model.go
@@ -7,7 +7,6 @@ import (
 	"path/filepath"
 	"strings"
 	"text/tabwriter"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
 	"github.com/CoreyRDean/intent/internal/models"
@@ -268,8 +267,9 @@ func inferQuantFromFilename(filename string) string {
 
 // modelUse switches the current model. Resolves the reference, persists
 // it as a custom entry if it's an HF repo we haven't seen, downloads
-// the model if it's not installed, and finally updates cfg.Model +
-// restarts the daemon so subsequent `i` calls use the new model.
+// the model if it's not installed, and updates cfg.Model. The next `i`
+// call picks up the new model automatically — llama-cli loads it fresh
+// each invocation, so there is no daemon to restart.
 func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []string) int {
 	if len(args) == 0 {
 		errf("usage: i model use <id>")
@@ -308,7 +308,7 @@ func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []s
 	}
 
 	if !models.ValidGGUFQuant(m.Quant) && m.Quant != "" {
-		fmt.Fprintf(os.Stderr, "warning: quant %q is unusual; llamafile may or may not load it.\n", m.Quant)
+		fmt.Fprintf(os.Stderr, "warning: quant %q is unusual; llama.cpp may or may not load it.\n", m.Quant)
 	}
 
 	// Download if missing.
@@ -337,20 +337,6 @@ func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []s
 		return 3
 	}
 	fmt.Printf("current model: %s\n", m.ID)
-
-	// Nudge the daemon to pick up the new model. If it's running,
-	// restart it; if not, leave it alone (user will start it on next
-	// `i` call via ensureBackendReady).
-	if pingDaemon(dirs) {
-		fmt.Fprintln(os.Stderr, "restarting daemon with new model...")
-		_ = daemonStop(dirs)
-		time.Sleep(500 * time.Millisecond)
-		if err := startDaemonAndWait(dirs); err != nil {
-			errf("daemon restart: %v (run `i daemon start` manually)", err)
-			return 3
-		}
-		fmt.Fprintln(os.Stderr, "daemon: ready.")
-	}
 	return 0
 }
 
@@ -383,11 +369,11 @@ func modelPull(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []
 	}
 
 	rt := intentruntime.New(dirs.Cache)
-	if !rt.HaveLlamafile() {
-		fmt.Fprintln(os.Stderr, "downloading runtime...")
-		if err := rt.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil {
+	if !rt.HaveLlamaRuntime() {
+		fmt.Fprintln(os.Stderr, "installing llama.cpp via your package manager...")
+		if err := rt.EnsureLlamaRuntime(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
 			fmt.Fprintln(os.Stderr)
-			errf("llamafile: %v", err)
+			errf("llama.cpp: %v", err)
 			return 3
 		}
 		fmt.Fprintln(os.Stderr)
diff --git a/internal/cli/report.go b/internal/cli/report.go
index dbcf353..722bad0 100644
--- a/internal/cli/report.go
+++ b/internal/cli/report.go
@@ -53,6 +53,7 @@ func cmdReport(ctx context.Context, args []string) int {
 		errf("report: %v", err)
 		return 3
 	}
+	defer closeBackend(be)
 	if isMockBackend(be) {
 		errf("i report requires a real backend — run 'i doctor' to diagnose")
 		return 3
diff --git a/internal/cli/smoke_test.go b/internal/cli/smoke_test.go
index 6aedcab..73f3cba 100644
--- a/internal/cli/smoke_test.go
+++ b/internal/cli/smoke_test.go
@@ -374,27 +374,6 @@ func TestConfigRoundTripSectionedKnownKey(t *testing.T) {
 	}
 }
 
-func TestConfigSetRejectsRemoteDaemonHost(t *testing.T) {
-	stateDir := t.TempDir()
-	cacheDir := t.TempDir()
-	baseEnv := []string{
-		"HOME=" + os.Getenv("HOME"),
-		"PATH=" + os.Getenv("PATH"),
-		"INTENT_STATE_DIR=" + stateDir,
-		"INTENT_CACHE_DIR=" + cacheDir,
-	}
-
-	cmd := exec.Command(testBinary, "config", "set", "daemon.host", "0.0.0.0")
-	cmd.Env = baseEnv
-	out, err := cmd.CombinedOutput()
-	if err == nil {
-		t.Fatal("expected config set daemon.host to fail, got nil error")
-	}
-	if !strings.Contains(string(out), "loopback only") {
-		t.Fatalf("expected loopback validation error, got %q", string(out))
-	}
-}
-
 func TestConfigPath(t *testing.T) {
 	stdout, _, exitCode := run(t, nil, "config", "path")
 	if exitCode != 0 {
diff --git a/internal/config/config.go b/internal/config/config.go
index 216f1c1..c90f59d 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -35,7 +35,11 @@ type Config struct {
 // Defaults returns the project's chosen defaults.
 func Defaults() *Config {
 	return &Config{
-		Backend: "llamafile-local",
+		// Local inference runs llama.cpp's `llama-cli` one-shot. The
+		// legacy "llamafile-local" name is still accepted as an alias by
+		// the backend resolver, so configs written before the switch keep
+		// working without migration.
+		Backend: "llama-cli",
 		// Catalog short-id. See internal/models.DefaultID. Defaults to
 		// the 3B model as the balanced "just works" option: strong
 		// enough that `i report` doesn't routinely hit the fallback
diff --git a/internal/daemon/client.go b/internal/daemon/client.go
deleted file mode 100644
index 5856738..0000000
--- a/internal/daemon/client.go
+++ /dev/null
@@ -1,49 +0,0 @@
-package daemon
-
-import (
-	"bufio"
-	"encoding/json"
-	"fmt"
-	"net"
-	"time"
-)
-
-// Client is a one-shot connection to a daemon socket.
-type Client struct {
-	Socket  string
-	Timeout time.Duration
-}
-
-// NewClient returns a Client for the socket. Timeout defaults to 2s.
-func NewClient(socket string) *Client {
-	return &Client{Socket: socket, Timeout: 2 * time.Second}
-}
-
-// Call sends one request and returns the response.
-func (c *Client) Call(req Request) (*Response, error) {
-	conn, err := net.DialTimeout("unix", c.Socket, c.Timeout)
-	if err != nil {
-		return nil, err
-	}
-	defer conn.Close()
-	if c.Timeout > 0 {
-		_ = conn.SetDeadline(time.Now().Add(c.Timeout))
-	}
-	body, err := json.Marshal(req)
-	if err != nil {
-		return nil, err
-	}
-	if _, err := conn.Write(append(body, '\n')); err != nil {
-		return nil, err
-	}
-	r := bufio.NewReader(conn)
-	line, err := r.ReadBytes('\n')
-	if err != nil {
-		return nil, err
-	}
-	var resp Response
-	if err := json.Unmarshal(line, &resp); err != nil {
-		return nil, fmt.Errorf("decode response: %w", err)
-	}
-	return &resp, nil
-}
diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
deleted file mode 100644
index 2853ef1..0000000
--- a/internal/daemon/daemon.go
+++ /dev/null
@@ -1,201 +0,0 @@
-// Package daemon implements intentd: a small supervisor that keeps a
-// llamafile --server process warm so the CLI doesn't pay a model-load
-// cost on every invocation.
-//
-// Architecture (v1):
-//
-//   - The daemon spawns llamafile as a subprocess and watches it.
-//   - llamafile exposes its OpenAI-compatible HTTP API on the loopback
-//     port from config. The CLI talks to that port directly. The daemon
-//     does NOT proxy inference traffic through its Unix socket — that
-//     would add a hop for no benefit, since the heavy lifting is the
-//     model, not the network.
-//   - The daemon owns a Unix socket on which it speaks a tiny line-
-//     delimited JSON control protocol: ping / status / stop. That's
-//     also how `i daemon status` and `i daemon stop` work.
-//
-// Idle unload (kill the llamafile subprocess after N minutes of HTTP
-// inactivity, respawn on next CLI request) is a v1.x follow-up. In v1
-// the daemon stays warm until the user stops it.
-package daemon
-
-import (
-	"bufio"
-	"context"
-	"encoding/json"
-	"fmt"
-	"net"
-	"os"
-	"sync"
-	"time"
-)
-
-// Op is the daemon control-protocol operation discriminator.
-type Op string
-
-const (
-	OpPing   Op = "ping"
-	OpStatus Op = "status"
-	OpStop   Op = "stop"
-)
-
-// Request is one daemon control request.
-type Request struct {
-	Op Op     `json:"op"`
-	ID string `json:"id,omitempty"`
-}
-
-// Response is the daemon's reply.
-type Response struct {
-	ID    string         `json:"id,omitempty"`
-	OK    bool           `json:"ok"`
-	Error string         `json:"error,omitempty"`
-	Data  map[string]any `json:"data,omitempty"`
-}
-
-// Server is the Unix-socket control-plane server.
-type Server struct {
-	Socket    string
-	Launcher  *Launcher
-	Started   time.Time
-	mu        sync.Mutex
-	ln        net.Listener
-	stopCh    chan struct{}
-	stopOnce  sync.Once
-	clientCtx context.Context
-}
-
-// New constructs a Server bound to socket and supervising launcher.
-func New(socket string, l *Launcher) *Server {
-	return &Server{
-		Socket:   socket,
-		Launcher: l,
-		stopCh:   make(chan struct{}),
-	}
-}
-
-// Listen binds the Unix socket. Any pre-existing socket file is removed.
-func (s *Server) Listen() error {
-	_ = os.Remove(s.Socket)
-	if err := os.MkdirAll(parentDir(s.Socket), 0o700); err != nil {
-		return fmt.Errorf("mkdir socket parent: %w", err)
-	}
-	ln, err := net.Listen("unix", s.Socket)
-	if err != nil {
-		return fmt.Errorf("listen on %s: %w", s.Socket, err)
-	}
-	if err := os.Chmod(s.Socket, 0o600); err != nil {
-		_ = ln.Close()
-		return fmt.Errorf("chmod socket: %w", err)
-	}
-	s.mu.Lock()
-	s.ln = ln
-	s.Started = time.Now()
-	s.mu.Unlock()
-	return nil
-}
-
-// Serve accepts connections until ctx is canceled OR an OpStop is received.
-func (s *Server) Serve(ctx context.Context) error {
-	if s.ln == nil {
-		return fmt.Errorf("server not listening")
-	}
-	s.clientCtx = ctx
-	go func() {
-		select {
-		case <-ctx.Done():
-		case <-s.stopCh:
-		}
-		_ = s.ln.Close()
-	}()
-	for {
-		conn, err := s.ln.Accept()
-		if err != nil {
-			select {
-			case <-s.stopCh:
-				return nil
-			default:
-			}
-			if ctx.Err() != nil {
-				return nil
-			}
-			return err
-		}
-		go s.handle(conn)
-	}
-}
-
-// SignalStop tells Serve to return. Idempotent.
-func (s *Server) SignalStop() {
-	s.stopOnce.Do(func() { close(s.stopCh) })
-}
-
-// Stopped returns a channel closed when SignalStop has been called.
-// `i daemon start` blocks on this AND on its OS-signal context, so
-// either source can shut the daemon down.
-func (s *Server) Stopped() <-chan struct{} { return s.stopCh }
-
-func (s *Server) handle(conn net.Conn) {
-	defer conn.Close()
-	_ = conn.SetReadDeadline(time.Now().Add(5 * time.Second))
-	r := bufio.NewReader(conn)
-	w := bufio.NewWriter(conn)
-	line, err := r.ReadBytes('\n')
-	if err != nil {
-		return
-	}
-	var req Request
-	if err := json.Unmarshal(line, &req); err != nil {
-		_ = writeJSONLine(w, Response{ID: req.ID, OK: false, Error: "bad json: " + err.Error()})
-		return
-	}
-	resp := s.dispatch(req)
-	resp.ID = req.ID
-	_ = writeJSONLine(w, resp)
-}
-
-func (s *Server) dispatch(req Request) Response {
-	switch req.Op {
-	case OpPing:
-		return Response{OK: true, Data: map[string]any{"pong": true}}
-	case OpStatus:
-		data := map[string]any{
-			"socket":     s.Socket,
-			"started_at": s.Started.UTC().Format(time.RFC3339),
-			"uptime_sec": int64(time.Since(s.Started).Seconds()),
-		}
-		if s.Launcher != nil {
-			data["llamafile_running"] = s.Launcher.Running()
-			data["llamafile_endpoint"] = s.Launcher.Endpoint()
-			data["llamafile_pid"] = s.Launcher.PID()
-			data["llamafile_restarts"] = s.Launcher.Restarts()
-			data["model"] = s.Launcher.ModelPath
-		}
-		return Response{OK: true, Data: data}
-	case OpStop:
-		s.SignalStop()
-		return Response{OK: true, Data: map[string]any{"stopping": true}}
-	default:
-		return Response{OK: false, Error: "unknown op: " + string(req.Op)}
-	}
-}
-
-func writeJSONLine(w *bufio.Writer, r Response) error {
-	b, err := json.Marshal(r)
-	if err != nil {
-		return err
-	}
-	if _, err := w.Write(append(b, '\n')); err != nil {
-		return err
-	}
-	return w.Flush()
-}
-
-func parentDir(p string) string {
-	for i := len(p) - 1; i >= 0; i-- {
-		if p[i] == '/' {
-			return p[:i]
-		}
-	}
-	return "."
-}
diff --git a/internal/daemon/install.go b/internal/daemon/install.go
deleted file mode 100644
index f8265f1..0000000
--- a/internal/daemon/install.go
+++ /dev/null
@@ -1,237 +0,0 @@
-package daemon
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-)
-
-// InstallParams are everything Install needs to write a system service file.
-type InstallParams struct {
-	Binary string // absolute path to the intent binary
-	Label  string // service label, e.g. "com.coreyrdean.intent"
-	LogDir string // directory the service writes stdout/stderr to
-	Socket string // daemon control socket path (informational)
-	Cache  string // cache root (so the service knows where llamafile lives)
-	State  string // state root
-}
-
-// InstallResult describes what was written and how to control it.
-type InstallResult struct {
-	UnitPath string   // path to the launchd plist or systemd unit
-	StartCmd []string // command to start the unit
-	StopCmd  []string // command to stop the unit
-	LogPath  string   // path to the stdout log
-	Notes    string   // human-readable post-install hint
-}
-
-// Install writes the platform-appropriate service file and starts it.
-// On macOS, returns the LaunchAgent plist path.
-// On Linux, returns the user systemd unit path.
-// Other platforms return an error.
-func Install(p InstallParams) (*InstallResult, error) {
-	switch runtime.GOOS {
-	case "darwin":
-		return installLaunchd(p)
-	case "linux":
-		return installSystemd(p)
-	default:
-		return nil, fmt.Errorf("daemon install not supported on %s yet", runtime.GOOS)
-	}
-}
-
-// Uninstall removes the platform-appropriate service file (and stops it).
-func Uninstall(label string) error {
-	switch runtime.GOOS {
-	case "darwin":
-		return uninstallLaunchd(label)
-	case "linux":
-		return uninstallSystemd(label)
-	default:
-		return fmt.Errorf("daemon uninstall not supported on %s yet", runtime.GOOS)
-	}
-}
-
-// IsInstalled reports whether the platform-appropriate service file exists.
-func IsInstalled(label string) bool {
-	switch runtime.GOOS {
-	case "darwin":
-		path, _ := launchdPlistPath(label)
-		_, err := os.Stat(path)
-		return err == nil
-	case "linux":
-		path, _ := systemdUnitPath(label)
-		_, err := os.Stat(path)
-		return err == nil
-	}
-	return false
-}
-
-// --- macOS / launchd ---
-
-func launchdPlistPath(label string) (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	return filepath.Join(home, "Library", "LaunchAgents", label+".plist"), nil
-}
-
-func installLaunchd(p InstallParams) (*InstallResult, error) {
-	plistPath, err := launchdPlistPath(p.Label)
-	if err != nil {
-		return nil, err
-	}
-	if err := os.MkdirAll(filepath.Dir(plistPath), 0o755); err != nil {
-		return nil, err
-	}
-	if err := os.MkdirAll(p.LogDir, 0o700); err != nil {
-		return nil, err
-	}
-	logOut := filepath.Join(p.LogDir, "intentd.out.log")
-	logErr := filepath.Join(p.LogDir, "intentd.err.log")
-	plist := fmt.Sprintf(`<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-  <key>Label</key>            <string>%s</string>
-  <key>ProgramArguments</key> <array>
-    <string>%s</string>
-    <string>daemon</string>
-    <string>start</string>
-    <string>--foreground</string>
-  </array>
-  <key>RunAtLoad</key>        <true/>
-  <key>KeepAlive</key>        <true/>
-  <key>ProcessType</key>      <string>Background</string>
-  <key>StandardOutPath</key>  <string>%s</string>
-  <key>StandardErrorPath</key><string>%s</string>
-  <key>EnvironmentVariables</key>
-  <dict>
-    <key>HOME</key>           <string>%s</string>
-    <key>PATH</key>           <string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
-  </dict>
-</dict>
-</plist>
-`, p.Label, p.Binary, logOut, logErr, mustHome())
-	if err := os.WriteFile(plistPath, []byte(plist), 0o644); err != nil {
-		return nil, fmt.Errorf("write plist: %w", err)
-	}
-	// Best-effort start. launchctl load is the right verb on macOS LaunchAgents
-	// even though it's been deprecated in favor of bootstrap. bootstrap requires
-	// a target like `gui/$UID` and is awkward; load still works.
-	_, _ = exec.Command("launchctl", "unload", plistPath).CombinedOutput()
-	if out, err := exec.Command("launchctl", "load", plistPath).CombinedOutput(); err != nil {
-		return &InstallResult{
-			UnitPath: plistPath,
-			StartCmd: []string{"launchctl", "load", plistPath},
-			StopCmd:  []string{"launchctl", "unload", plistPath},
-			LogPath:  logOut,
-			Notes: fmt.Sprintf("plist installed but launchctl load failed: %s\n"+
-				"start manually with: launchctl load %s", string(out), plistPath),
-		}, nil
-	}
-	return &InstallResult{
-		UnitPath: plistPath,
-		StartCmd: []string{"launchctl", "load", plistPath},
-		StopCmd:  []string{"launchctl", "unload", plistPath},
-		LogPath:  logOut,
-		Notes:    "intentd is now running and will start at login.",
-	}, nil
-}
-
-func uninstallLaunchd(label string) error {
-	plistPath, err := launchdPlistPath(label)
-	if err != nil {
-		return err
-	}
-	if _, err := os.Stat(plistPath); os.IsNotExist(err) {
-		return nil
-	}
-	_, _ = exec.Command("launchctl", "unload", plistPath).CombinedOutput()
-	return os.Remove(plistPath)
-}
-
-// --- Linux / systemd user unit ---
-
-func systemdUnitPath(label string) (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	// systemd allows arbitrary unit names; we use the label suffix as the
-	// unit basename to avoid collisions with system units.
-	return filepath.Join(home, ".config", "systemd", "user", label+".service"), nil
-}
-
-func installSystemd(p InstallParams) (*InstallResult, error) {
-	unitPath, err := systemdUnitPath(p.Label)
-	if err != nil {
-		return nil, err
-	}
-	if err := os.MkdirAll(filepath.Dir(unitPath), 0o755); err != nil {
-		return nil, err
-	}
-	unit := fmt.Sprintf(`[Unit]
-Description=intent daemon (keeps a local LLM warm)
-After=default.target
-
-[Service]
-Type=simple
-ExecStart=%s daemon start --foreground
-Restart=on-failure
-RestartSec=2
-Environment=PATH=/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
-NoNewPrivileges=yes
-PrivateTmp=yes
-
-[Install]
-WantedBy=default.target
-`, p.Binary)
-	if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil {
-		return nil, fmt.Errorf("write unit: %w", err)
-	}
-	_, _ = exec.Command("systemctl", "--user", "daemon-reload").CombinedOutput()
-	unitName := p.Label + ".service"
-	if out, err := exec.Command("systemctl", "--user", "enable", "--now", unitName).CombinedOutput(); err != nil {
-		return &InstallResult{
-			UnitPath: unitPath,
-			StartCmd: []string{"systemctl", "--user", "start", unitName},
-			StopCmd:  []string{"systemctl", "--user", "stop", unitName},
-			LogPath:  "journalctl --user -u " + unitName,
-			Notes: fmt.Sprintf("unit installed but `systemctl --user enable --now` failed: %s\n"+
-				"start manually with: systemctl --user start %s", string(out), unitName),
-		}, nil
-	}
-	return &InstallResult{
-		UnitPath: unitPath,
-		StartCmd: []string{"systemctl", "--user", "start", unitName},
-		StopCmd:  []string{"systemctl", "--user", "stop", unitName},
-		LogPath:  "journalctl --user -u " + unitName,
-		Notes:    "intentd is enabled and running. Logs: journalctl --user -u " + unitName,
-	}, nil
-}
-
-func uninstallSystemd(label string) error {
-	unitPath, err := systemdUnitPath(label)
-	if err != nil {
-		return err
-	}
-	if _, err := os.Stat(unitPath); os.IsNotExist(err) {
-		return nil
-	}
-	unitName := label + ".service"
-	_, _ = exec.Command("systemctl", "--user", "disable", "--now", unitName).CombinedOutput()
-	if err := os.Remove(unitPath); err != nil {
-		return err
-	}
-	_, _ = exec.Command("systemctl", "--user", "daemon-reload").CombinedOutput()
-	return nil
-}
-
-func mustHome() string {
-	h, _ := os.UserHomeDir()
-	return h
-}
diff --git a/internal/daemon/launcher.go b/internal/daemon/launcher.go
deleted file mode 100644
index b6e8e25..0000000
--- a/internal/daemon/launcher.go
+++ /dev/null
@@ -1,388 +0,0 @@
-package daemon
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"net/http"
-	"os"
-	"os/exec"
-	"sync"
-	"sync/atomic"
-	"syscall"
-	"time"
-)
-
-// Launcher supervises a `llamafile --server` subprocess. It exposes the
-// HTTP endpoint llamafile is bound to so the CLI can dial it directly.
-//
-// Restart policy: if llamafile exits with a non-zero code or is killed
-// by anything other than us, we restart it up to MaxRestarts times within
-// RestartWindow. Beyond that we give up; the user gets an honest "daemon
-// died, see the logs" rather than a thrashing supervisor.
-type Launcher struct {
-	BinaryPath    string        // path to llamafile-VERSION
-	ModelPath     string        // path to .gguf
-	Host          string        // 127.0.0.1
-	Port          int           // 18080
-	ContextSize   int           // -c, 0 = llamafile default
-	GPULayers     int           // -ngl, -1 = let llamafile decide
-	StdoutLog     io.Writer     // where llamafile's stdout goes
-	StderrLog     io.Writer     // where llamafile's stderr goes
-	MaxRestarts   int           // default 5
-	RestartWindow time.Duration // default 60s
-	StartupGrace  time.Duration // how long to wait for /v1/models to respond
-
-	mu        sync.Mutex
-	cmd       *exec.Cmd
-	pid       int32
-	restarts  atomic.Int32
-	stopped   atomic.Bool
-	doneCh    chan struct{}
-	restartTs []time.Time
-}
-
-// NewLauncher constructs a Launcher with sensible defaults.
-func NewLauncher(binary, model string, host string, port int) *Launcher {
-	return &Launcher{
-		BinaryPath:    binary,
-		ModelPath:     model,
-		Host:          host,
-		Port:          port,
-		StdoutLog:     io.Discard,
-		StderrLog:     os.Stderr,
-		MaxRestarts:   5,
-		RestartWindow: 60 * time.Second,
-		StartupGrace:  60 * time.Second,
-		GPULayers:     -1,
-		doneCh:        make(chan struct{}),
-	}
-}
-
-// Endpoint returns the http://host:port the supervised llamafile listens on.
-func (l *Launcher) Endpoint() string {
-	return fmt.Sprintf("http://%s:%d", l.Host, l.Port)
-}
-
-// PID returns the current llamafile PID (0 if not running).
-func (l *Launcher) PID() int { return int(atomic.LoadInt32(&l.pid)) }
-
-// Running reports whether the subprocess is alive.
-func (l *Launcher) Running() bool { return l.PID() != 0 }
-
-// Restarts returns the cumulative restart count.
-func (l *Launcher) Restarts() int { return int(l.restarts.Load()) }
-
-// Start launches llamafile and blocks until either:
-//   - the HTTP /v1/models endpoint answers (success), OR
-//   - StartupGrace expires (failure), OR
-//   - llamafile exits before becoming ready (failure)
-//
-// On success, the Launcher's supervise goroutine is also running.
-func (l *Launcher) Start(ctx context.Context) error {
-	if err := l.spawn(ctx); err != nil {
-		return err
-	}
-	if err := l.waitReady(ctx); err != nil {
-		l.stop(syscall.SIGTERM)
-		return fmt.Errorf("llamafile did not become ready: %w", err)
-	}
-	go l.supervise(ctx)
-	return nil
-}
-
-// Wait blocks until the launcher's supervise loop exits.
-func (l *Launcher) Wait() { <-l.doneCh }
-
-// Stop signals the launcher to terminate and waits for it. Idempotent.
-func (l *Launcher) Stop(timeout time.Duration) {
-	if !l.stopped.CompareAndSwap(false, true) {
-		return
-	}
-	l.stop(syscall.SIGTERM)
-	select {
-	case <-l.doneCh:
-	case <-time.After(timeout):
-		l.stop(syscall.SIGKILL)
-		<-l.doneCh
-	}
-}
-
-func (l *Launcher) spawn(ctx context.Context) error {
-	args := []string{
-		"--server",
-		"-m", l.ModelPath,
-		"--host", l.Host,
-		"--port", fmt.Sprintf("%d", l.Port),
-	}
-	if l.ContextSize > 0 {
-		args = append(args, "-c", fmt.Sprintf("%d", l.ContextSize))
-	}
-	if l.GPULayers >= 0 {
-		args = append(args, "-ngl", fmt.Sprintf("%d", l.GPULayers))
-	}
-
-	// llamafile is an Actually Portable Executable (APE). On macOS the
-	// kernel rejects APE binaries directly with "exec format error" —
-	// the file's leading shell-script trampoline only fires when the
-	// shell loads it. So we run it via /bin/sh on every Unix to keep the
-	// invocation consistent and let the shell pick the right loader.
-	//
-	// Note: not exec.CommandContext — we manage lifecycle explicitly so
-	// that a CLI command's ctx cancellation doesn't kill the daemon's
-	// supervised subprocess.
-	shArgs := append([]string{l.BinaryPath}, args...)
-	cmd := exec.Command("/bin/sh", "-c", quoteShellArgs(shArgs), "intentd-llamafile")
-	cmd.Stdout = l.StdoutLog
-	cmd.Stderr = l.StderrLog
-	// New process group so llamafile doesn't catch terminal signals
-	// directed at the daemon.
-	cmd.SysProcAttr = procAttrNewGroup()
-
-	if err := cmd.Start(); err != nil {
-		return fmt.Errorf("start %s: %w", l.BinaryPath, err)
-	}
-	l.mu.Lock()
-	l.cmd = cmd
-	atomic.StoreInt32(&l.pid, int32(cmd.Process.Pid))
-	l.mu.Unlock()
-	return nil
-}
-
-func (l *Launcher) waitReady(ctx context.Context) error {
-	deadline := time.Now().Add(l.StartupGrace)
-	url := l.Endpoint() + "/v1/models"
-	cli := &http.Client{Timeout: 1 * time.Second}
-	for time.Now().Before(deadline) {
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		default:
-		}
-		// If the subprocess died on us, fail fast.
-		if !l.processAlive() {
-			return fmt.Errorf("subprocess exited before ready")
-		}
-		req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
-		resp, err := cli.Do(req)
-		if err == nil {
-			_ = resp.Body.Close()
-			if resp.StatusCode < 500 {
-				return nil
-			}
-		}
-		time.Sleep(250 * time.Millisecond)
-	}
-	return fmt.Errorf("timeout after %s", l.StartupGrace)
-}
-
-func (l *Launcher) processAlive() bool {
-	l.mu.Lock()
-	cmd := l.cmd
-	l.mu.Unlock()
-	if cmd == nil || cmd.Process == nil {
-		return false
-	}
-	// Signal 0: existence check, no actual signal delivered.
-	return cmd.Process.Signal(syscall.Signal(0)) == nil
-}
-
-func (l *Launcher) supervise(ctx context.Context) {
-	defer close(l.doneCh)
-	for {
-		l.mu.Lock()
-		cmd := l.cmd
-		l.mu.Unlock()
-		if cmd == nil {
-			return
-		}
-		err := cmd.Wait()
-		atomic.StoreInt32(&l.pid, 0)
-
-		if l.stopped.Load() || ctx.Err() != nil {
-			return
-		}
-
-		// Crash. Decide whether to restart.
-		if err != nil {
-			fmt.Fprintf(l.StderrLog, "intentd: llamafile exited: %v\n", err)
-		}
-		if !l.shouldRestart() {
-			fmt.Fprintf(l.StderrLog, "intentd: too many restarts in %s; giving up\n", l.RestartWindow)
-			return
-		}
-		l.restarts.Add(1)
-		fmt.Fprintf(l.StderrLog, "intentd: restarting llamafile (attempt %d)\n", l.restarts.Load())
-		// Brief backoff so we don't hot-loop.
-		time.Sleep(time.Second)
-		if err := l.spawn(ctx); err != nil {
-			fmt.Fprintf(l.StderrLog, "intentd: respawn failed: %v\n", err)
-			return
-		}
-		if err := l.waitReady(ctx); err != nil {
-			fmt.Fprintf(l.StderrLog, "intentd: respawn not ready: %v\n", err)
-			l.stop(syscall.SIGTERM)
-			return
-		}
-	}
-}
-
-// shouldRestart returns true if we are within the restart budget.
-func (l *Launcher) shouldRestart() bool {
-	now := time.Now()
-	cutoff := now.Add(-l.RestartWindow)
-	kept := l.restartTs[:0]
-	for _, t := range l.restartTs {
-		if t.After(cutoff) {
-			kept = append(kept, t)
-		}
-	}
-	l.restartTs = append(kept, now)
-	return len(l.restartTs) <= l.MaxRestarts
-}
-
-// quoteShellArgs renders argv as a single shell command string with
-// each argument single-quoted. We never embed user-supplied unescaped
-// strings here, but doing it correctly is cheap insurance.
-func quoteShellArgs(argv []string) string {
-	out := ""
-	for i, a := range argv {
-		if i > 0 {
-			out += " "
-		}
-		// Replace each ' with '\''.
-		escaped := ""
-		for _, r := range a {
-			if r == '\'' {
-				escaped += `'\''`
-			} else {
-				escaped += string(r)
-			}
-		}
-		out += "'" + escaped + "'"
-	}
-	return out
-}
-
-func (l *Launcher) stop(sig syscall.Signal) {
-	l.mu.Lock()
-	cmd := l.cmd
-	l.mu.Unlock()
-	if cmd == nil || cmd.Process == nil {
-		return
-	}
-	pid := cmd.Process.Pid
-
-	// llamafile is an Actually Portable Executable. APE binaries on
-	// macOS work like this: the bytes are simultaneously a PE header
-	// (rejected by the kernel) and a shell script (interpreted by sh
-	// when execve fails). The script then mmaps a temp-extracted
-	// Mach-O and re-execs into it via `posix_spawn`, which in practice
-	// FORKS off a worker process whose parent becomes our intent
-	// daemon (the original sh wrapper exits). So:
-	//
-	// - cmd.Process.Pid points at the long-dead sh wrapper.
-	// - The actual llamafile is reparented to our daemon (os.Getpid()).
-	// - It also lives in its own process group via setsid.
-	//
-	// We therefore signal four populations to be sure:
-	//   1) the original spawned PID (no-op if already gone),
-	//   2) the spawned PID's process group (no-op if separated),
-	//   3) every descendant of *us* that runs our llamafile binary,
-	//   4) every such descendant's own process group.
-	_ = cmd.Process.Signal(sig)
-	if pgid, err := syscall.Getpgid(pid); err == nil && pgid != 0 && pgid != os.Getpid() {
-		_ = syscall.Kill(-pgid, sig)
-	}
-	for _, p := range descendantsRunning(os.Getpid(), l.BinaryPath) {
-		_ = syscall.Kill(p, sig)
-		if pgid, err := syscall.Getpgid(p); err == nil && pgid != 0 && pgid != os.Getpid() {
-			_ = syscall.Kill(-pgid, sig)
-		}
-	}
-}
-
-// descendantsRunning returns every descendant of root whose command
-// line contains needle. We filter on needle so that signaling does
-// not accidentally hit unrelated processes that happen to share the
-// daemon as an ancestor (e.g. user shells launched from `i daemon
-// start` in a terminal).
-func descendantsRunning(root int, needle string) []int {
-	if _, err := exec.LookPath("pgrep"); err != nil {
-		return nil
-	}
-	candidates := allDescendants(root)
-	if len(candidates) == 0 || needle == "" {
-		return candidates
-	}
-	out := candidates[:0]
-	for _, p := range candidates {
-		// `ps -o command= -p PID` prints just the command line.
-		b, err := exec.Command("ps", "-o", "command=", "-p", fmt.Sprintf("%d", p)).Output()
-		if err != nil {
-			continue
-		}
-		if bytesContains(b, needle) {
-			out = append(out, p)
-		}
-	}
-	return out
-}
-
-func allDescendants(root int) []int {
-	seen := map[int]struct{}{root: {}}
-	queue := []int{root}
-	var out []int
-	for len(queue) > 0 {
-		p := queue[0]
-		queue = queue[1:]
-		b, err := exec.Command("pgrep", "-P", fmt.Sprintf("%d", p)).Output()
-		if err != nil {
-			continue
-		}
-		for _, line := range bytesLines(b) {
-			var child int
-			_, _ = fmt.Sscanf(line, "%d", &child)
-			if child <= 0 {
-				continue
-			}
-			if _, ok := seen[child]; ok {
-				continue
-			}
-			seen[child] = struct{}{}
-			out = append(out, child)
-			queue = append(queue, child)
-		}
-	}
-	return out
-}
-
-func bytesContains(b []byte, needle string) bool {
-	if len(needle) == 0 {
-		return true
-	}
-	n := []byte(needle)
-	for i := 0; i+len(n) <= len(b); i++ {
-		if string(b[i:i+len(n)]) == needle {
-			return true
-		}
-	}
-	return false
-}
-
-func bytesLines(b []byte) []string {
-	var out []string
-	start := 0
-	for i, c := range b {
-		if c == '\n' {
-			if i > start {
-				out = append(out, string(b[start:i]))
-			}
-			start = i + 1
-		}
-	}
-	if start < len(b) {
-		out = append(out, string(b[start:]))
-	}
-	return out
-}
diff --git a/internal/daemon/procattr_other.go b/internal/daemon/procattr_other.go
deleted file mode 100644
index d7baa02..0000000
--- a/internal/daemon/procattr_other.go
+++ /dev/null
@@ -1,9 +0,0 @@
-//go:build !unix
-
-package daemon
-
-import "syscall"
-
-// procAttrNewGroup is a no-op on non-unix platforms (Windows). The
-// daemon isn't supported there yet anyway.
-func procAttrNewGroup() *syscall.SysProcAttr { return nil }
diff --git a/internal/daemon/procattr_unix.go b/internal/daemon/procattr_unix.go
deleted file mode 100644
index 696d4b9..0000000
--- a/internal/daemon/procattr_unix.go
+++ /dev/null
@@ -1,12 +0,0 @@
-//go:build unix
-
-package daemon
-
-import "syscall"
-
-// procAttrNewGroup returns SysProcAttr that puts the child in its own
-// process group, so signals to our PID don't reach it (and so we can
-// signal -PGID to take down the entire subtree).
-func procAttrNewGroup() *syscall.SysProcAttr {
-	return &syscall.SysProcAttr{Setpgid: true}
-}
diff --git a/internal/model/llamacli/llamacli.go b/internal/model/llamacli/llamacli.go
new file mode 100644
index 0000000..3f7439e
--- /dev/null
+++ b/internal/model/llamacli/llamacli.go
@@ -0,0 +1,313 @@
+// Package llamacli runs local inference by shelling out to llama.cpp's
+// `llama-cli` binary one-shot, instead of talking to a long-lived server.
+//
+// Each Complete/CompleteStructured call spawns `llama-cli` with the model,
+// a JSON-schema grammar constraint, and the flattened conversation, then
+// parses the single JSON object the model prints to stdout. There is no
+// daemon, no HTTP, and no warm process: the OS process *is* the request.
+//
+// Trade-off vs. the old `llamafile --server` path: every call pays the
+// model-load cost. In exchange there is nothing to supervise, nothing
+// bound to a socket, and nothing to leave running. The grammar constraint
+// (`--json-schema`) is the same mechanism llama.cpp's server used, so the
+// output contract is unchanged.
+package llamacli
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+
+	"github.com/CoreyRDean/intent/internal/model"
+)
+
+// Backend drives one-shot `llama-cli` inference.
+type Backend struct {
+	// BinaryPath is the resolved llama-cli executable. May be a bare
+	// "llama-cli" if it's expected to be found on PATH at exec time.
+	BinaryPath string
+	// ModelPath is the absolute path to the .gguf to load.
+	ModelPath string
+	// ModelTag is a label that is not passed to llama-cli; it only feeds
+	// the cache identity so switching models invalidates cached proposals.
+	ModelTag string
+	// ContextSize maps to -c (0 = let llama.cpp use the model default).
+	ContextSize int
+	// GPULayers maps to -ngl (-1 = let llama.cpp decide).
+	GPULayers int
+	// ExtraArgs are appended verbatim, for power users / debugging.
+	ExtraArgs []string
+	// Timeout caps a single inference. 0 = no deadline beyond ctx.
+	Timeout time.Duration
+}
+
+// New constructs a Backend for the given binary and model.
+func New(binary, modelPath string) *Backend {
+	if binary == "" {
+		binary = "llama-cli"
+	}
+	return &Backend{
+		BinaryPath: binary,
+		ModelPath:  modelPath,
+		GPULayers:  -1,
+		Timeout:    5 * time.Minute,
+	}
+}
+
+func (b *Backend) Name() string { return "llama-cli" }
+
+func (b *Backend) CacheIdentity() string {
+	return strings.Join([]string{b.Name(), b.ModelPath, b.ModelTag}, "|")
+}
+
+// Available checks that a model path is set and the binary resolves; it does not stat the model file.
+func (b *Backend) Available(ctx context.Context) error {
+	if b.ModelPath == "" {
+		return fmt.Errorf("llama-cli: no model path configured")
+	}
+	if _, err := exec.LookPath(b.BinaryPath); err != nil {
+		return fmt.Errorf("llama-cli not found (%s): %w", b.BinaryPath, err)
+	}
+	return nil
+}
+
+// Complete runs inference constrained to the standard Response envelope.
+func (b *Backend) Complete(ctx context.Context, in model.CompleteRequest) (*model.Response, error) {
+	content, err := b.run(ctx, in.Messages, []byte(model.SchemaJSON), in.Temperature, in.MaxTokens, in.Seed)
+	if err != nil {
+		return nil, err
+	}
+	var out model.Response
+	if err := json.Unmarshal([]byte(content), &out); err != nil {
+		return nil, fmt.Errorf("model output not valid JSON: %w (got %q)", err, truncate(content, 200))
+	}
+	backfillRequiredFields(&out)
+	if err := out.Validate(); err != nil {
+		return nil, fmt.Errorf("model response failed schema: %w (got %q)", err, truncate(content, 400))
+	}
+	return &out, nil
+}
+
+// CompleteStructured implements model.StructuredBackend: the caller-
+// supplied schema is enforced by llama.cpp's grammar (--json-schema);
+// this method itself only checks that the output parses as JSON.
+func (b *Backend) CompleteStructured(ctx context.Context, in model.StructuredRequest) ([]byte, error) {
+	if len(in.SchemaJSON) == 0 {
+		return nil, fmt.Errorf("CompleteStructured: SchemaJSON is required")
+	}
+	content, err := b.run(ctx, in.Messages, in.SchemaJSON, in.Temperature, in.MaxTokens, in.Seed)
+	if err != nil {
+		return nil, err
+	}
+	var any json.RawMessage
+	if err := json.Unmarshal([]byte(content), &any); err != nil {
+		return nil, fmt.Errorf("structured output not valid JSON: %w (got %q)", err, truncate(content, 200))
+	}
+	return []byte(content), nil
+}
+
+// run spawns llama-cli once and returns the JSON object it produced.
+func (b *Backend) run(ctx context.Context, messages []model.Message, schema []byte, temp float64, maxTok int, seed *int64) (string, error) {
+	if b.ModelPath == "" {
+		return "", fmt.Errorf("llama-cli: no model path configured")
+	}
+	system, prompt := flattenMessages(messages)
+
+	if b.Timeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, b.Timeout)
+		defer cancel()
+	}
+
+	args := []string{
+		"-m", b.ModelPath,
+		// Conversation + single-turn: apply the model's chat template,
+		// process exactly one user turn, then exit. This is the supported
+		// scripting mode — no interactive loop, no hanging on stdin.
+		"-cnv", "-st",
+		"--no-display-prompt",
+		"--json-schema", string(schema),
+		"--temp", fmt.Sprintf("%g", temp),
+	}
+	if maxTok > 0 {
+		args = append(args, "-n", fmt.Sprintf("%d", maxTok))
+	}
+	if seed != nil {
+		args = append(args, "-s", fmt.Sprintf("%d", *seed))
+	}
+	if b.ContextSize > 0 {
+		args = append(args, "-c", fmt.Sprintf("%d", b.ContextSize))
+	}
+	if b.GPULayers >= 0 {
+		args = append(args, "-ngl", fmt.Sprintf("%d", b.GPULayers))
+	}
+	if system != "" {
+		args = append(args, "-sys", system)
+	}
+	args = append(args, "-p", prompt)
+	args = append(args, b.ExtraArgs...)
+
+	cmd := exec.CommandContext(ctx, b.BinaryPath, args...)
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+	// Give the child an empty stdin so conversation mode sees EOF and
+	// never blocks waiting for a second turn.
+	cmd.Stdin = bytes.NewReader(nil)
+
+	if err := cmd.Run(); err != nil {
+		if ctx.Err() == context.DeadlineExceeded {
+			return "", fmt.Errorf("llama-cli timed out after %s", b.Timeout)
+		}
+		return "", fmt.Errorf("llama-cli failed: %w (stderr: %s)", err, truncate(strings.TrimSpace(stderr.String()), 400))
+	}
+
+	// Generation goes to stdout; llama.cpp logs/timings go to stderr.
+	// Extract the first balanced JSON object from stdout, falling back
+	// to stderr in case a build routes the message there.
+	if obj := extractJSONObject(stdout.String()); obj != "" {
+		return obj, nil
+	}
+	if obj := extractJSONObject(stderr.String()); obj != "" {
+		return obj, nil
+	}
+	return "", fmt.Errorf("llama-cli produced no JSON object (stdout: %q)", truncate(strings.TrimSpace(stdout.String()), 400))
+}
+
+// flattenMessages splits a conversation into a single system prompt and a
+// single user-turn transcript. llama-cli takes one -sys and one -p, so
+// multi-turn context (the engine's tool-call loop) is rendered as labeled
+// text inside the user turn. The schema grammar still pins the output
+// shape regardless of how the context is framed.
+func flattenMessages(msgs []model.Message) (system, prompt string) {
+	var sys []string
+	var convo []string
+	for _, m := range msgs {
+		switch m.Role {
+		case "system":
+			sys = append(sys, m.Content)
+		case "assistant":
+			convo = append(convo, "Assistant (previous response):\n"+m.Content)
+		case "tool":
+			name := m.Name
+			if name == "" {
+				name = "tool"
+			}
+			convo = append(convo, fmt.Sprintf("Result of %s:\n%s", name, m.Content))
+		default: // user and anything else
+			convo = append(convo, m.Content)
+		}
+	}
+	return strings.Join(sys, "\n\n"), strings.Join(convo, "\n\n")
+}
+
+// extractJSONObject returns the first balanced top-level {...} object in s,
+// tolerating ```json fences, leading log noise, and trailing EOS markers
+// that small local models and llama-cli sometimes emit around the payload.
+func extractJSONObject(s string) string {
+	s = stripFences(strings.TrimSpace(s))
+	start := strings.IndexByte(s, '{')
+	if start < 0 {
+		return ""
+	}
+	depth := 0
+	inStr := false
+	esc := false
+	for i := start; i < len(s); i++ {
+		c := s[i]
+		if inStr {
+			switch {
+			case esc:
+				esc = false
+			case c == '\\':
+				esc = true
+			case c == '"':
+				inStr = false
+			}
+			continue
+		}
+		switch c {
+		case '"':
+			inStr = true
+		case '{':
+			depth++
+		case '}':
+			depth--
+			if depth == 0 {
+				return s[start : i+1]
+			}
+		}
+	}
+	return ""
+}
+
+// stripFences tolerates a model that wraps JSON in ```json ... ``` fences.
+func stripFences(s string) string {
+	s = strings.TrimSpace(s)
+	if !strings.HasPrefix(s, "```") {
+		return s
+	}
+	s = strings.TrimPrefix(s, "```json")
+	s = strings.TrimPrefix(s, "```")
+	s = strings.TrimSuffix(s, "```")
+	return strings.TrimSpace(s)
+}
+
+// backfillRequiredFields supplies sane defaults for fields small local
+// models routinely omit despite the schema. We never invent the command
+// itself; we only fill metadata the safety guard and TUI need. Mirrors
+// the llamafile backend so behaviour is identical across local runtimes.
+func backfillRequiredFields(r *model.Response) {
+	if r == nil {
+		return
+	}
+	if r.Description == "" {
+		switch {
+		case r.Command != "":
+			r.Description = "Run: " + truncate(r.Command, 120)
+		case r.Script != nil && r.Script.Body != "":
+			first := strings.SplitN(r.Script.Body, "\n", 2)[0]
+			r.Description = "Run script (" + r.Script.Interpreter + "): " + truncate(first, 100)
+		case r.StdoutToUser != "":
+			r.Description = "Print informational answer."
+		case r.ToolCall != nil && r.ToolCall.Name != "":
+			r.Description = "Gather context via " + r.ToolCall.Name + "."
+		case r.ClarifyingQuestion != "":
+			r.Description = "Ask the user a clarifying question."
+		case r.RefusalReason != "":
+			r.Description = "Refuse this request."
+		default:
+			r.Description = "(no description provided by model)"
+		}
+	}
+	if r.Risk == "" {
+		r.Risk = model.RiskSafe
+	}
+	if r.Approach == "" {
+		switch {
+		case r.Script != nil && r.Script.Body != "":
+			r.Approach = model.ApproachScript
+		case r.Command != "":
+			r.Approach = model.ApproachCommand
+		case r.ToolCall != nil && r.ToolCall.Name != "":
+			r.Approach = model.ApproachToolCall
+		case r.StdoutToUser != "":
+			r.Approach = model.ApproachInform
+		case r.ClarifyingQuestion != "":
+			r.Approach = model.ApproachClarify
+		case r.RefusalReason != "":
+			r.Approach = model.ApproachRefuse
+		}
+	}
+}
+
+func truncate(s string, n int) string {
+	if len(s) <= n {
+		return s
+	}
+	return s[:n] + "..."
+}
diff --git a/internal/model/llamacli/llamacli_test.go b/internal/model/llamacli/llamacli_test.go
new file mode 100644
index 0000000..3df8198
--- /dev/null
+++ b/internal/model/llamacli/llamacli_test.go
@@ -0,0 +1,85 @@
+package llamacli
+
+import (
+	"testing"
+
+	"github.com/CoreyRDean/intent/internal/model"
+)
+
+func TestExtractJSONObject(t *testing.T) {
+	tests := []struct {
+		name string
+		in   string
+		want string
+	}{
+		{"plain object", `{"a":1}`, `{"a":1}`},
+		{"leading log noise", "loading model...\nllama_init\n{\"a\":1}\n", `{"a":1}`},
+		{"trailing eos marker", `{"a":1} [end of text]`, `{"a":1}`},
+		{"code fence", "```json\n{\"a\":1}\n```", `{"a":1}`},
+		{"nested braces", `{"a":{"b":2},"c":3}`, `{"a":{"b":2},"c":3}`},
+		{"brace inside string", `{"a":"}{","b":1}`, `{"a":"}{","b":1}`},
+		{"escaped quote in string", `{"a":"he said \"hi\"","b":1}`, `{"a":"he said \"hi\"","b":1}`},
+		{"no object", `just text`, ``},
+		{"trailing garbage after object", `{"a":1}garbage{`, `{"a":1}`},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := extractJSONObject(tt.in); got != tt.want {
+				t.Errorf("extractJSONObject(%q) = %q, want %q", tt.in, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestFlattenMessages(t *testing.T) {
+	msgs := []model.Message{
+		{Role: "system", Content: "you are a shell"},
+		{Role: "system", Content: "be terse"},
+		{Role: "user", Content: "list files"},
+		{Role: "assistant", Content: `{"approach":"tool_call"}`},
+		{Role: "tool", Name: "list_dir", Content: `{"files":["a","b"]}`},
+	}
+	system, prompt := flattenMessages(msgs)
+
+	if system != "you are a shell\n\nbe terse" {
+		t.Fatalf("system = %q", system)
+	}
+	for _, want := range []string{"list files", "Assistant (previous response)", "Result of list_dir"} {
+		if !contains(prompt, want) {
+			t.Errorf("prompt missing %q; got %q", want, prompt)
+		}
+	}
+}
+
+// TestFlattenMessages_UnnamedTool checks that a tool message without a
+// Name is labelled with the generic "tool" label in the prompt.
+func TestFlattenMessages_UnnamedTool(t *testing.T) {
+	msgs := []model.Message{{Role: "tool", Content: "result"}}
+	_, prompt := flattenMessages(msgs)
+	if contains(prompt, "Result of tool:") {
+		return
+	}
+	t.Fatalf("expected default tool label, got %q", prompt)
+}
+
+// TestCacheIdentity_DistinctPerModel checks that two backends pointed at
+// different model files report different cache identities, and that the
+// backend name is "llama-cli".
+func TestCacheIdentity_DistinctPerModel(t *testing.T) {
+	first := New("llama-cli", "/cache/models/a.gguf")
+	second := New("llama-cli", "/cache/models/b.gguf")
+
+	if idA, idB := first.CacheIdentity(), second.CacheIdentity(); idA == idB {
+		t.Fatal("different models should yield different cache identities")
+	}
+	if name := first.Name(); name != "llama-cli" {
+		t.Fatalf("Name() = %q", name)
+	}
+}
+
+// contains reports whether needle occurs in haystack. An empty needle is
+// always contained. indexOf already returns 0 for an empty needle and -1
+// when the haystack is shorter than the needle, so no extra guards are
+// needed here.
+func contains(haystack, needle string) bool {
+	return indexOf(haystack, needle) >= 0
+}
+
+// indexOf returns the byte offset of the first occurrence of sub in s, or
+// -1 if sub does not occur. An empty sub matches at offset 0.
+func indexOf(s, sub string) int {
+	width := len(sub)
+	last := len(s) - width // highest offset at which sub could still fit
+	for start := 0; start <= last; start++ {
+		if s[start:start+width] == sub {
+			return start
+		}
+	}
+	return -1
+}
diff --git a/internal/model/llamaserver/llamaserver.go b/internal/model/llamaserver/llamaserver.go
new file mode 100644
index 0000000..7044f16
--- /dev/null
+++ b/internal/model/llamaserver/llamaserver.go
@@ -0,0 +1,299 @@
+// Package llamaserver runs local inference through a request-scoped
+// llama.cpp `llama-server` child process.
+//
+// Unlike the one-shot `llama-cli` path, the server is started once on the
+// first inference of an `intent` invocation and held warm for the rest of
+// that invocation — so the engine's tool-call loop reuses the loaded
+// weights and KV cache instead of reloading the model on every step. And
+// because the server speaks the OpenAI-compatible /v1/chat/completions
+// API, the native messages array (system/user/assistant/tool) is sent
+// as-is: no flattening into a single prompt.
+//
+// It is *not* a daemon. The process is bound to a private loopback port,
+// owned by this one CLI invocation, and killed on Close (and, on Linux,
+// auto-killed by the kernel if intent dies — see procAttr). There is no
+// persistent listener and nothing to manage between commands.
+package llamaserver
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"sync"
+	"syscall"
+	"time"
+
+	"github.com/CoreyRDean/intent/internal/model"
+	"github.com/CoreyRDean/intent/internal/model/llamafile"
+)
+
+// Backend manages the llama-server child and delegates inference to an
+// OpenAI-compatible HTTP client pointed at it. The zero value is not
+// usable; construct with New.
+//
+// The exported fields are configuration and are read when the server is
+// first started; the unexported fields hold the running child's state.
+type Backend struct {
+	BinaryPath   string        // path to llama-server
+	ModelPath    string        // path to the .gguf to load
+	ModelTag     string        // cosmetic; feeds the cache identity
+	ContextSize  int           // -c, 0 = llama.cpp default
+	GPULayers    int           // -ngl, -1 = let llama.cpp decide
+	Host         string        // loopback only; default 127.0.0.1
+	StartupGrace time.Duration // how long to wait for /health
+
+	// startOnce makes ensureStarted run start at most once; startErr is
+	// the result of that single attempt and is returned on every call.
+	startOnce sync.Once
+	startErr  error
+
+	// mu is held whenever cmd, inner and stopped are written.
+	mu      sync.Mutex
+	cmd     *exec.Cmd          // the llama-server child; nil until start spawns it
+	inner   *llamafile.Backend // HTTP client for the child; set once it is healthy
+	stopped bool               // set by kill so the process is only torn down once
+	logBuf  *cappedBuffer      // tail of the child's stdout/stderr, for error messages
+}
+
+// New returns a Backend that will launch binary against modelPath. An
+// empty binary falls back to the bare name "llama-server". The defaults
+// are: loopback host 127.0.0.1, GPULayers -1 (flag omitted), a 120s
+// startup grace period, and an 8 KiB log tail.
+func New(binary, modelPath string) *Backend {
+	b := &Backend{
+		BinaryPath:   "llama-server",
+		ModelPath:    modelPath,
+		Host:         "127.0.0.1",
+		GPULayers:    -1,
+		StartupGrace: 120 * time.Second,
+		logBuf:       &cappedBuffer{max: 8 << 10},
+	}
+	if binary != "" {
+		b.BinaryPath = binary
+	}
+	return b
+}
+
+// Name returns the fixed backend name, "llama-server".
+func (b *Backend) Name() string {
+	return "llama-server"
+}
+
+// CacheIdentity returns "llama-server|<ModelPath>|<ModelTag>". It reads
+// only configuration fields, so it can be computed for a cache key
+// without starting the server.
+func (b *Backend) CacheIdentity() string {
+	return fmt.Sprintf("llama-server|%s|%s", b.ModelPath, b.ModelTag)
+}
+
+// Available starts the server if it has not been started yet and then
+// asks the inner HTTP client whether it is available. A startup failure
+// is returned as-is.
+func (b *Backend) Available(ctx context.Context) error {
+	startErr := b.ensureStarted(ctx)
+	if startErr != nil {
+		return startErr
+	}
+	return b.inner.Available(ctx)
+}
+
+// Complete starts the server on first use and then forwards the request,
+// unchanged, to the inner HTTP client. The inner client sends the native
+// messages array, so there is no flattening. A startup failure is
+// returned with a nil response.
+func (b *Backend) Complete(ctx context.Context, in model.CompleteRequest) (*model.Response, error) {
+	startErr := b.ensureStarted(ctx)
+	if startErr != nil {
+		return nil, startErr
+	}
+	return b.inner.Complete(ctx, in)
+}
+
+// CompleteStructured implements model.StructuredBackend. It starts the
+// server on first use and then forwards the request to the inner HTTP
+// client's grammar-constrained path. A startup failure is returned with
+// nil bytes.
+func (b *Backend) CompleteStructured(ctx context.Context, in model.StructuredRequest) ([]byte, error) {
+	startErr := b.ensureStarted(ctx)
+	if startErr != nil {
+		return nil, startErr
+	}
+	return b.inner.CompleteStructured(ctx, in)
+}
+
+// ensureStarted runs start at most once per Backend and returns that
+// attempt's result on every call. Because of the sync.Once, the ctx of the
+// first caller governs startup, and a failed start is not retried.
+func (b *Backend) ensureStarted(ctx context.Context) error {
+	launch := func() {
+		b.startErr = b.start(ctx)
+	}
+	b.startOnce.Do(launch)
+	return b.startErr
+}
+
+// start launches llama-server on a freshly probed loopback port, waits
+// for its /health endpoint to report ready, and then creates the HTTP
+// client used for inference. It is invoked once, via ensureStarted.
+//
+// The child is started with exec.Command rather than CommandContext, so
+// it is not tied to ctx and stays up after this call returns; ctx only
+// bounds the readiness wait. If the server never becomes ready it is
+// killed and the error carries the tail of its log output.
+func (b *Backend) start(ctx context.Context) error {
+	if b.ModelPath == "" {
+		return fmt.Errorf("llama-server: no model path configured")
+	}
+	host := b.Host
+	if host == "" {
+		host = "127.0.0.1"
+	}
+	port, err := freeLoopbackPort(host)
+	if err != nil {
+		return fmt.Errorf("llama-server: pick port: %w", err)
+	}
+
+	args := []string{
+		"-m", b.ModelPath,
+		"--host", host,
+		"--port", fmt.Sprintf("%d", port),
+	}
+	if b.ContextSize > 0 {
+		args = append(args, "-c", fmt.Sprintf("%d", b.ContextSize))
+	}
+	if b.GPULayers >= 0 {
+		args = append(args, "-ngl", fmt.Sprintf("%d", b.GPULayers))
+	}
+
+	cmd := exec.Command(b.BinaryPath, args...)
+	cmd.Stdout = b.logBuf
+	cmd.Stderr = b.logBuf
+	cmd.Stdin = nil
+	cmd.SysProcAttr = procAttr()
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("start llama-server (%s): %w", b.BinaryPath, err)
+	}
+
+	b.mu.Lock()
+	b.cmd = cmd
+	b.mu.Unlock()
+
+	// Reap the child as soon as it exits. Until something waits on it, an
+	// exited child lingers as a zombie and signal-0 liveness probes keep
+	// succeeding, so a crash during model load would only surface after
+	// the whole StartupGrace had elapsed. Clearing b.cmd once the process
+	// is gone makes the exit visible immediately; cmd.Wait also releases
+	// the pipes feeding logBuf. The goroutine ends when the child exits.
+	go func() {
+		_ = cmd.Wait() // exit status is irrelevant here; only the reap matters
+		b.mu.Lock()
+		if b.cmd == cmd {
+			b.cmd = nil
+		}
+		b.mu.Unlock()
+	}()
+
+	endpoint := fmt.Sprintf("http://%s:%d", host, port)
+	if err := b.waitHealthy(ctx, endpoint); err != nil {
+		b.kill()
+		return fmt.Errorf("llama-server did not become ready: %w (log tail: %s)", err, b.logBuf.String())
+	}
+
+	inner := llamafile.New(endpoint)
+	inner.ModelTag = b.ModelTag
+	b.mu.Lock()
+	b.inner = inner
+	b.mu.Unlock()
+	return nil
+}
+
+// waitHealthy polls endpoint+"/health" every 200ms until the server
+// answers 200 OK. It gives up, with an error, when the child process is
+// no longer alive, when ctx is done (cancellation such as Ctrl-C, or a
+// deadline carried by ctx), or when StartupGrace has elapsed — whichever
+// comes first. Each probe is limited to 2s.
+func (b *Backend) waitHealthy(ctx context.Context, endpoint string) error {
+	deadline := time.Now().Add(b.StartupGrace)
+	cli := &http.Client{Timeout: 2 * time.Second}
+	healthURL := endpoint + "/health"
+	for time.Now().Before(deadline) {
+		if err := ctx.Err(); err != nil {
+			return err
+		}
+		if !b.processAlive() {
+			return fmt.Errorf("process exited before becoming ready")
+		}
+		// A malformed endpoint (e.g. an odd Host value) makes request
+		// construction fail; report it instead of passing a nil request
+		// to Do, and do not retry since the URL will not change.
+		req, err := http.NewRequestWithContext(ctx, http.MethodGet, healthURL, nil)
+		if err != nil {
+			return fmt.Errorf("build health request: %w", err)
+		}
+		resp, err := cli.Do(req)
+		if err == nil {
+			_ = resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				return nil
+			}
+		}
+		// Pause between probes, but wake immediately if ctx is done so a
+		// cancellation is not delayed by the poll interval.
+		select {
+		case <-ctx.Done():
+			return ctx.Err()
+		case <-time.After(200 * time.Millisecond):
+		}
+	}
+	return fmt.Errorf("timeout after %s", b.StartupGrace)
+}
+
+// processAlive reports whether a child process is recorded on the
+// Backend and still accepts signal 0 (a probe that delivers nothing). It
+// returns false when no command or process handle is set.
+//
+// NOTE(review): on Unix, signal 0 also succeeds for a child that has
+// exited but has not been waited on yet, so this only detects an exit
+// once the process has been reaped — confirm something reaps it.
+func (b *Backend) processAlive() bool {
+	// Copy the handle under the lock; the probe itself runs unlocked.
+	b.mu.Lock()
+	cmd := b.cmd
+	b.mu.Unlock()
+	if cmd == nil || cmd.Process == nil {
+		return false
+	}
+	return cmd.Process.Signal(syscall.Signal(0)) == nil
+}
+
+// Close kills the server. Safe to call multiple times and when the
+// server never started. It implements io.Closer so the CLI can defer it.
+// The returned error is always nil.
+func (b *Backend) Close() error {
+	b.kill()
+	return nil
+}
+
+// kill stops the child process, if one is running, and waits briefly for
+// it to exit: SIGTERM first, then SIGKILL if it is still alive after 3s,
+// followed by up to 2s more for the exit to be observed.
+//
+// It tears a process down at most once. A call made while no process
+// exists is a pure no-op: it does not mark the Backend as stopped, so a
+// server started later can still be killed by a subsequent call.
+func (b *Backend) kill() {
+	b.mu.Lock()
+	cmd := b.cmd
+	if b.stopped || cmd == nil || cmd.Process == nil {
+		b.mu.Unlock()
+		return
+	}
+	b.stopped = true
+	b.mu.Unlock()
+
+	pid := cmd.Process.Pid
+	signalGroup := func(sig syscall.Signal) {
+		// Signal the child's whole process group (negative id) so any
+		// worker llama-server forked also dies. Skip the group signal if
+		// the child turns out to share our own group, so we never signal
+		// ourselves; the comparison uses Getpgrp because a group id only
+		// equals our pid when we happen to be the group leader. The direct
+		// signal below still reaches the child in that case.
+		if pgid, err := syscall.Getpgid(pid); err == nil && pgid > 0 && pgid != syscall.Getpgrp() {
+			_ = syscall.Kill(-pgid, sig)
+		}
+		_ = cmd.Process.Signal(sig)
+	}
+
+	signalGroup(syscall.SIGTERM)
+	done := make(chan struct{})
+	go func() { _, _ = cmd.Process.Wait(); close(done) }()
+	select {
+	case <-done:
+	case <-time.After(3 * time.Second):
+		// Graceful shutdown took too long; force it and give the exit a
+		// bounded amount of time to be observed.
+		signalGroup(syscall.SIGKILL)
+		<-waitOrTimeout(cmd, 2*time.Second)
+	}
+}
+
+// waitOrTimeout returns a channel that is closed when cmd's process has
+// been waited on or when d has elapsed, whichever happens first. The
+// goroutine doing the wait keeps running past the timeout until the
+// process actually exits.
+func waitOrTimeout(cmd *exec.Cmd, d time.Duration) <-chan struct{} {
+	exited := make(chan struct{})
+	go func() {
+		defer close(exited)
+		_, _ = cmd.Process.Wait()
+	}()
+
+	finished := make(chan struct{})
+	go func() {
+		defer close(finished)
+		timer := time.NewTimer(d)
+		defer timer.Stop()
+		select {
+		case <-exited:
+		case <-timer.C:
+		}
+	}()
+	return finished
+}
+
+// freeLoopbackPort finds an unused TCP port on host by listening on port
+// 0, reading back the port the OS assigned, and closing the listener
+// again. Another process could grab the port between that close and
+// llama-server binding it; for a local request-scoped child that small
+// race is accepted.
+func freeLoopbackPort(host string) (int, error) {
+	probe, err := net.Listen("tcp", net.JoinHostPort(host, "0"))
+	if err != nil {
+		return 0, err
+	}
+	defer probe.Close()
+
+	addr := probe.Addr().(*net.TCPAddr)
+	return addr.Port, nil
+}
+
+// cappedBuffer is a mutex-guarded io.Writer that keeps only the most
+// recent max bytes written to it — enough to surface a startup failure
+// without growing unbounded over a long session.
+type cappedBuffer struct {
+	mu  sync.Mutex
+	buf []byte // retained tail, never longer than max after a Write
+	max int    // maximum number of bytes to retain
+}
+
+// Write appends p and then drops the oldest bytes beyond max. It always
+// reports the full length of p and a nil error, even when part (or all)
+// of p was discarded.
+func (c *cappedBuffer) Write(p []byte) (int, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+
+	c.buf = append(c.buf, p...)
+	if overflow := len(c.buf) - c.max; overflow > 0 {
+		c.buf = c.buf[overflow:]
+	}
+	return len(p), nil
+}
+
+// String returns a copy of the retained bytes as a string.
+func (c *cappedBuffer) String() string {
+	c.mu.Lock()
+	retained := string(c.buf)
+	c.mu.Unlock()
+	return retained
+}
+
+// compile-time assertions for the optional capabilities the CLI relies on.
+// Each line fails the build if *Backend stops satisfying the named
+// interface; nothing is allocated or executed at run time.
+var (
+	_ model.Backend           = (*Backend)(nil)
+	_ model.StructuredBackend = (*Backend)(nil)
+	_ io.Closer               = (*Backend)(nil)
+)
diff --git a/internal/model/llamaserver/llamaserver_test.go b/internal/model/llamaserver/llamaserver_test.go
new file mode 100644
index 0000000..200ae2f
--- /dev/null
+++ b/internal/model/llamaserver/llamaserver_test.go
@@ -0,0 +1,63 @@
+package llamaserver
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestFreeLoopbackPort checks that a port can be probed on 127.0.0.1 and
+// that it lies in the valid TCP port range.
+func TestFreeLoopbackPort(t *testing.T) {
+	port, err := freeLoopbackPort("127.0.0.1")
+	if err != nil {
+		t.Fatalf("freeLoopbackPort: %v", err)
+	}
+	if inRange := port > 0 && port <= 65535; !inRange {
+		t.Fatalf("port out of range: %d", port)
+	}
+}
+
+func TestCappedBuffer_RetainsTail(t *testing.T) {
+	b := &cappedBuffer{max: 8}
+	if _, err := b.Write([]byte("abcdefghij")); err != nil { // 10 bytes into an 8-byte cap
+		t.Fatal(err)
+	}
+	if got := b.String(); got != "cdefghij" {
+		t.Fatalf("capped buffer = %q, want %q", got, "cdefghij")
+	}
+}
+
+// TestCappedBuffer_ReportsFullWriteLen checks that Write reports the full
+// input length even when the buffer retains fewer bytes than were written.
+func TestCappedBuffer_ReportsFullWriteLen(t *testing.T) {
+	buf := &cappedBuffer{max: 4}
+	written, err := buf.Write([]byte("hello"))
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case written != 5:
+		t.Fatalf("Write returned %d, want 5 (full input length)", written)
+	}
+}
+
+// TestCacheIdentity_DistinctPerModel checks that backends for different
+// model files get different cache identities, that the identity starts
+// with "llama-server|", and that the backend name is "llama-server".
+func TestCacheIdentity_DistinctPerModel(t *testing.T) {
+	first := New("llama-server", "/cache/models/a.gguf")
+	second := New("llama-server", "/cache/models/b.gguf")
+
+	if first.CacheIdentity() == second.CacheIdentity() {
+		t.Fatal("different models should yield different cache identities")
+	}
+	if id := first.CacheIdentity(); !strings.HasPrefix(id, "llama-server|") {
+		t.Fatalf("cache identity = %q, want llama-server prefix", id)
+	}
+	if name := first.Name(); name != "llama-server" {
+		t.Fatalf("Name() = %q", name)
+	}
+}
+
+// TestClose_BeforeStartIsNoop checks that Close succeeds on a Backend
+// whose server was never started (no process spawned), and that calling
+// it a second time succeeds as well.
+func TestClose_BeforeStartIsNoop(t *testing.T) {
+	backend := New("llama-server", "/cache/models/a.gguf")
+
+	err := backend.Close()
+	if err != nil {
+		t.Fatalf("Close before start: %v", err)
+	}
+
+	// Idempotent.
+	err = backend.Close()
+	if err != nil {
+		t.Fatalf("second Close: %v", err)
+	}
+}
diff --git a/internal/model/llamaserver/procattr_linux.go b/internal/model/llamaserver/procattr_linux.go
new file mode 100644
index 0000000..5d717c3
--- /dev/null
+++ b/internal/model/llamaserver/procattr_linux.go
@@ -0,0 +1,17 @@
+//go:build linux
+
+package llamaserver
+
+import "syscall"
+
+// procAttr returns the process attributes used to spawn llama-server on
+// Linux. Setpgid puts the child in its own process group so the whole
+// group can be signalled. Pdeathsig asks the kernel to SIGKILL the child
+// when its parent goes away — the safety net for a crash or kill -9 of
+// intent, where the deferred Close never runs and the child would
+// otherwise be orphaned with the model resident in memory.
+//
+// NOTE(review): the Go syscall documentation describes Pdeathsig as
+// firing when the creating *thread* dies, not the process, and suggests
+// runtime.LockOSThread around the spawn — confirm this is acceptable here.
+func procAttr() *syscall.SysProcAttr {
+	attr := new(syscall.SysProcAttr)
+	attr.Setpgid = true
+	attr.Pdeathsig = syscall.SIGKILL
+	return attr
+}
diff --git a/internal/model/llamaserver/procattr_other.go b/internal/model/llamaserver/procattr_other.go
new file mode 100644
index 0000000..6f7b81a
--- /dev/null
+++ b/internal/model/llamaserver/procattr_other.go
@@ -0,0 +1,13 @@
+//go:build !linux
+
+package llamaserver
+
+import "syscall"
+
+// procAttr returns the process attributes used to spawn llama-server on
+// non-Linux platforms: only Setpgid, so Close can signal the child's whole
+// process group. These platforms (notably macOS) have no Pdeathsig
+// equivalent, so a hard kill (-9) of the parent can orphan the child;
+// normal exits and signals are handled by the deferred Close.
+//
+// NOTE(review): the `!linux` constraint also matches Windows, where
+// syscall.SysProcAttr has no Setpgid field — confirm Windows is not a
+// build target for this package.
+func procAttr() *syscall.SysProcAttr {
+	attr := new(syscall.SysProcAttr)
+	attr.Setpgid = true
+	return attr
+}
diff --git a/internal/runtime/llamacli.go b/internal/runtime/llamacli.go
new file mode 100644
index 0000000..6493a65
--- /dev/null
+++ b/internal/runtime/llamacli.go
@@ -0,0 +1,183 @@
+package runtime
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+)
+
+// llama.cpp ships several frontends; intent uses two:
+//
+//   - llama-server: a request-scoped child process that holds the model
+//     warm for the duration of one `intent` invocation and speaks the
+//     OpenAI-compatible HTTP API (native multi-turn messages + grammar).
+//     This is the preferred local backend.
+//   - llama-cli: a one-shot fallback used when llama-server isn't present.
+//
+// Both come from the same package (`brew install llama.cpp`), so the
+// installer ensures the package and the resolvers find whichever binaries
+// landed on PATH (or in a known Homebrew prefix).
+//
+// The constants are the bare executable names that the resolvers look up.
+const (
+	LlamaServerBinary = "llama-server"
+	LlamaCLIBinary    = "llama-cli"
+)
+
+// binaryCandidates returns the absolute paths where bin is looked for
+// when it is not on PATH — chiefly the Homebrew prefixes on macOS and
+// Linux, where `brew install llama.cpp` drops the binaries. The per-user
+// linuxbrew location is appended only when $HOME is set.
+func binaryCandidates(bin string) []string {
+	candidates := make([]string, 0, 4)
+	for _, prefix := range []string{
+		"/opt/homebrew/bin/",              // Apple Silicon brew
+		"/usr/local/bin/",                 // Intel mac / manual
+		"/home/linuxbrew/.linuxbrew/bin/", // shared linuxbrew
+	} {
+		candidates = append(candidates, prefix+bin)
+	}
+
+	home := os.Getenv("HOME")
+	if home == "" {
+		return candidates
+	}
+	return append(candidates, filepath.Join(home, ".linuxbrew", "bin", bin))
+}
+
+// isExecutable reports whether path can be stat'ed, is not a directory,
+// and has at least one executable permission bit (user, group or other).
+func isExecutable(path string) bool {
+	info, err := os.Stat(path)
+	switch {
+	case err != nil:
+		return false
+	case info.IsDir():
+		return false
+	}
+	return info.Mode()&0o111 != 0
+}
+
+// resolveBinary picks the path to run for bin. A PATH hit wins; otherwise
+// the first executable among the known install locations is used. When
+// nothing is found the bare name is returned, so callers still have a
+// value to hand to exec (which re-resolves via PATH).
+func resolveBinary(bin string) string {
+	onPath, err := exec.LookPath(bin)
+	if err == nil {
+		return onPath
+	}
+	candidates := binaryCandidates(bin)
+	for i := range candidates {
+		if isExecutable(candidates[i]) {
+			return candidates[i]
+		}
+	}
+	return bin
+}
+
+// haveBinary reports whether bin is found on PATH or exists as an
+// executable in one of the known install locations.
+func haveBinary(bin string) bool {
+	_, err := exec.LookPath(bin)
+	if err == nil {
+		return true
+	}
+	found := false
+	for _, candidate := range binaryCandidates(bin) {
+		if isExecutable(candidate) {
+			found = true
+			break
+		}
+	}
+	return found
+}
+
+// LlamaServerPath returns the path to use for llama-server, or the bare
+// binary name when it cannot be located.
+func (m *Manager) LlamaServerPath() string {
+	return resolveBinary(LlamaServerBinary)
+}
+
+// HaveLlamaServer reports whether llama-server is on PATH or in a known
+// install location.
+func (m *Manager) HaveLlamaServer() bool {
+	return haveBinary(LlamaServerBinary)
+}
+
+// LlamaCLIPath returns the path to use for llama-cli, or the bare binary
+// name when it cannot be located.
+func (m *Manager) LlamaCLIPath() string {
+	return resolveBinary(LlamaCLIBinary)
+}
+
+// HaveLlamaCLI reports whether llama-cli is on PATH or in a known install
+// location.
+func (m *Manager) HaveLlamaCLI() bool {
+	return haveBinary(LlamaCLIBinary)
+}
+
+// HaveLlamaRuntime reports whether at least one llama.cpp frontend is
+// installed. Either binary is enough for local inference: llama-server is
+// preferred, llama-cli is the one-shot fallback. llama-cli is only probed
+// when llama-server is missing.
+func (m *Manager) HaveLlamaRuntime() bool {
+	if m.HaveLlamaServer() {
+		return true
+	}
+	return m.HaveLlamaCLI()
+}
+
+// pkgManager describes how to install llama.cpp via one system package
+// manager. cmd is the manager binary; args are the install arguments
+// (package name included); needsSudo asks us to prefix sudo when we are
+// not already root.
+type pkgManager struct {
+	name      string   // human-readable name used in progress and error messages
+	cmd       string   // executable looked up on PATH to detect the manager
+	args      []string // full argument list for the install invocation
+	needsSudo bool     // run through sudo when not root and sudo is available
+}
+
+// llamaManagers returns the package managers to try, in order of
+// preference. Homebrew always comes first because it ships an official,
+// up-to-date `llama.cpp` formula (with both llama-server and llama-cli) on
+// macOS and Linux. On Linux the native managers follow as best-effort
+// fallbacks; on every other OS Homebrew is the only entry.
+func llamaManagers() []pkgManager {
+	brew := pkgManager{name: "Homebrew", cmd: "brew", args: []string{"install", "llama.cpp"}}
+	if runtime.GOOS != "linux" {
+		return []pkgManager{brew}
+	}
+	return []pkgManager{
+		brew,
+		{name: "apt", cmd: "apt-get", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true},
+		{name: "dnf", cmd: "dnf", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true},
+		{name: "pacman", cmd: "pacman", args: []string{"-S", "--noconfirm", "llama.cpp"}, needsSudo: true},
+		{name: "zypper", cmd: "zypper", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true},
+	}
+}
+
+// EnsureLlamaRuntime installs llama.cpp via the system package manager if
+// neither llama-server nor llama-cli is already available.
+//
+// Every supported manager found on PATH is tried in preference order
+// until one succeeds and a llama.cpp binary can be found afterwards. A
+// manager that needs root is run through sudo when we are not root and
+// sudo is on PATH; otherwise it is run directly. The installer's output
+// is streamed to stderr.
+//
+// log, if non-nil, receives human-readable progress lines. ctx bounds the
+// whole operation: once it is done, no further managers are tried and the
+// context error is returned. Otherwise the error is a clear, actionable
+// message when no supported package manager is found or every install
+// attempt fails.
+func (m *Manager) EnsureLlamaRuntime(ctx context.Context, log func(string)) error {
+	if m.HaveLlamaRuntime() {
+		return nil
+	}
+	logf := func(format string, a ...any) {
+		if log != nil {
+			log(fmt.Sprintf(format, a...))
+		}
+	}
+
+	var available []pkgManager
+	for _, pm := range llamaManagers() {
+		if _, err := exec.LookPath(pm.cmd); err == nil {
+			available = append(available, pm)
+		}
+	}
+	if len(available) == 0 {
+		return fmt.Errorf("no supported package manager found to install llama.cpp.\n" +
+			"  Install Homebrew (https://brew.sh) and run `brew install llama.cpp`,\n" +
+			"  or install llama.cpp from https://github.com/ggml-org/llama.cpp")
+	}
+
+	var lastErr error
+	for _, pm := range available {
+		// A cancelled or expired ctx would make every remaining attempt
+		// fail immediately; stop instead of logging a cascade of failures.
+		if err := ctx.Err(); err != nil {
+			return fmt.Errorf("installing llama.cpp: %w", err)
+		}
+
+		name, args := pm.cmd, pm.args
+		if pm.needsSudo && os.Geteuid() != 0 {
+			if _, err := exec.LookPath("sudo"); err == nil {
+				// Build a fresh slice so pm.args is left untouched.
+				args = append([]string{name}, args...)
+				name = "sudo"
+			}
+		}
+		logf("installing llama.cpp via %s (%s %s)...", pm.name, name, strings.Join(args, " "))
+		cmd := exec.CommandContext(ctx, name, args...)
+		cmd.Stdout = os.Stderr
+		cmd.Stderr = os.Stderr
+		if err := cmd.Run(); err != nil {
+			// Report an interrupted install as a cancellation rather than
+			// as a failure of this particular package manager.
+			if ctxErr := ctx.Err(); ctxErr != nil {
+				return fmt.Errorf("installing llama.cpp via %s: %w", pm.name, ctxErr)
+			}
+			lastErr = fmt.Errorf("%s install failed: %w", pm.name, err)
+			logf("  %s failed: %v", pm.name, err)
+			continue
+		}
+		if m.HaveLlamaRuntime() {
+			logf("llama.cpp installed.")
+			return nil
+		}
+		lastErr = fmt.Errorf("%s reported success but no llama.cpp binary was found", pm.name)
+	}
+	return fmt.Errorf("could not install llama.cpp automatically: %w.\n"+
+		"  Try `brew install llama.cpp` or build from https://github.com/ggml-org/llama.cpp", lastErr)
+}
diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go
index f3e128c..8ada7b9 100644
--- a/internal/runtime/runtime.go
+++ b/internal/runtime/runtime.go
@@ -1,7 +1,8 @@
-// Package runtime manages the local llamafile binary and model files.
-// In v1 it can: report whether a runtime/model is present, and download
-// either on demand with progress callbacks. Actually starting llamafile as a
-// subprocess is wired into Phase 4 (daemon).
+// Package runtime manages the local inference runtime (llama.cpp's
+// `llama-server`, or `llama-cli` as a fallback, installed via the system
+// package manager — see llamacli.go) and the GGUF model files it loads.
+// It can report whether the runtime/model is present and download models
+// on demand with progress callbacks.
 package runtime
 
 import (
@@ -16,9 +17,6 @@ import (
 	"github.com/CoreyRDean/intent/internal/models"
 )
 
-// LlamafileVersion is the runtime version we ship against.
-const LlamafileVersion = "0.10.0"
-
 // ModelInfo is the minimal shape the runtime package needs to
 // download a model. It's a projection of models.Model kept here for
 // backward compatibility; new code should pass models.Model around.
@@ -80,25 +78,11 @@ type Manager struct {
 
 func New(cacheDir string) *Manager { return &Manager{CacheDir: cacheDir} }
 
-// LlamafilePath returns the expected path of the llamafile binary.
-func (m *Manager) LlamafilePath() string {
-	return filepath.Join(m.CacheDir, "runtime", "llamafile-"+LlamafileVersion)
-}
-
 // ModelPath returns the expected path of the named model file.
 func (m *Manager) ModelPath(file string) string {
 	return filepath.Join(m.CacheDir, "models", file)
 }
 
-// HaveLlamafile reports whether the runtime exists and is executable.
-func (m *Manager) HaveLlamafile() bool {
-	info, err := os.Stat(m.LlamafilePath())
-	if err != nil {
-		return false
-	}
-	return info.Mode()&0o111 != 0
-}
-
 // HaveModel reports whether the named model file exists.
 func (m *Manager) HaveModel(file string) bool {
 	_, err := os.Stat(m.ModelPath(file))
@@ -108,22 +92,6 @@ func (m *Manager) HaveModel(file string) bool {
 // Progress is a download progress callback.
 type Progress func(downloaded, total int64)
 
-// EnsureLlamafile downloads the runtime if missing.
-func (m *Manager) EnsureLlamafile(ctx context.Context, progress Progress) error {
-	if m.HaveLlamafile() {
-		return nil
-	}
-	if err := os.MkdirAll(filepath.Dir(m.LlamafilePath()), 0o755); err != nil {
-		return err
-	}
-	url := fmt.Sprintf("https://github.com/mozilla-ai/llamafile/releases/download/%s/llamafile-%s",
-		LlamafileVersion, LlamafileVersion)
-	if err := download(ctx, url, m.LlamafilePath(), progress); err != nil {
-		return fmt.Errorf("download llamafile: %w", err)
-	}
-	return os.Chmod(m.LlamafilePath(), 0o755)
-}
-
 // EnsureModel downloads the model if missing.
 func (m *Manager) EnsureModel(ctx context.Context, mi ModelInfo, progress Progress) error {
 	dest := m.ModelPath(mi.File)
diff --git a/internal/verbose/backend.go b/internal/verbose/backend.go
index c2cfb87..cf7b91d 100644
--- a/internal/verbose/backend.go
+++ b/internal/verbose/backend.go
@@ -2,6 +2,7 @@ package verbose
 
 import (
 	"context"
+	"io"
 	"time"
 
 	"github.com/CoreyRDean/intent/internal/model"
@@ -41,6 +42,16 @@ func (v *vb) CacheIdentity() string {
 	return v.inner.Name()
 }
 
+// Close lets callers defer cleanup through the verbose decorator. When
+// the wrapped backend implements io.Closer (e.g. it owns the llama-server
+// co-process) the call is forwarded and its error returned; otherwise
+// Close does nothing and returns nil.
+func (v *vb) Close() error {
+	closer, ok := v.inner.(io.Closer)
+	if !ok {
+		return nil
+	}
+	return closer.Close()
+}
+
 func (v *vb) Complete(ctx context.Context, req model.CompleteRequest) (*model.Response, error) {
 	v.log.Section("model request (envelope)")
 	v.log.KV("backend", v.inner.Name())