From 56444ac5601d108742bbb8e2a989c66d6a0d614f Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 20:17:12 +0000
Subject: [PATCH 1/3] feat(local): run local inference via llama-cli, drop the
 server daemon
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Local models previously ran as a long-lived `llamafile --server`
subprocess supervised by an `intentd` daemon, with the CLI talking to
it over a loopback OpenAI-compatible HTTP endpoint. This replaces that
whole path with one-shot `llama.cpp` `llama-cli` invocations: each
request spawns the binary, constrains output with the same JSON-schema
grammar, parses the JSON it prints, and exits. No daemon, no socket, no
HTTP, nothing to supervise or leave running.

The runtime is installed on demand through the system package manager
(Homebrew first — it ships an up-to-date `llama.cpp` formula on macOS
and Linux — then apt/dnf/pacman/zypper as best-effort fallbacks) when
`llama-cli` isn't already on PATH.

Changes:
- new internal/model/llamacli backend (Backend + StructuredBackend)
- new internal/runtime llama-cli resolution + package-manager install;
  drop the llamafile binary download (GGUF model management stays)
- backend resolver builds the llama-cli backend for "llama-cli" and the
  back-compat alias "llamafile-local"; default backend is now llama-cli
- remove internal/daemon and the `i daemon` subcommand; rework
  ensure/doctor/init/model self-heal around runtime+model install
- network backends (llamafile-network, ollama, openai) are unchanged

The local path no longer binds any network socket, so the loopback-host
validation is moot and has been removed. Tests are updated, and new unit
tests cover the llama-cli JSON extraction and message flattening.
---
 .github/workflows/release.yml            |   2 +-
 README.md                                |   4 +-
 install.sh                               |   6 +-
 internal/cli/backend.go                  |  91 +++---
 internal/cli/backend_test.go             |  53 ++--
 internal/cli/cli.go                      |   8 +-
 internal/cli/config.go                   |  14 +-
 internal/cli/daemon.go                   | 363 ---------------------
 internal/cli/daemon_host.go              |  57 ----
 internal/cli/daemon_host_test.go         |  75 -----
 internal/cli/doctor.go                   |  45 +--
 internal/cli/doctor_test.go              |  96 +-----
 internal/cli/ensure.go                   | 175 ++++------
 internal/cli/init.go                     |  54 +---
 internal/cli/intent.go                   |   8 +-
 internal/cli/model.go                    |  30 +-
 internal/cli/smoke_test.go               |  21 --
 internal/config/config.go                |   6 +-
 internal/daemon/client.go                |  49 ---
 internal/daemon/daemon.go                | 201 ------------
 internal/daemon/install.go               | 237 --------------
 internal/daemon/launcher.go              | 388 -----------------------
 internal/daemon/procattr_other.go        |   9 -
 internal/daemon/procattr_unix.go         |  12 -
 internal/model/llamacli/llamacli.go      | 313 ++++++++++++++++++
 internal/model/llamacli/llamacli_test.go |  85 +++++
 internal/runtime/llamacli.go             | 152 +++++++++
 internal/runtime/runtime.go              |  42 +--
 28 files changed, 737 insertions(+), 1859 deletions(-)
 delete mode 100644 internal/cli/daemon.go
 delete mode 100644 internal/cli/daemon_host.go
 delete mode 100644 internal/cli/daemon_host_test.go
 delete mode 100644 internal/daemon/client.go
 delete mode 100644 internal/daemon/daemon.go
 delete mode 100644 internal/daemon/install.go
 delete mode 100644 internal/daemon/launcher.go
 delete mode 100644 internal/daemon/procattr_other.go
 delete mode 100644 internal/daemon/procattr_unix.go
 create mode 100644 internal/model/llamacli/llamacli.go
 create mode 100644 internal/model/llamacli/llamacli_test.go
 create mode 100644 internal/runtime/llamacli.go

diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
index d32c397..5c969d8 100644
--- a/.github/workflows/release.yml
+++ b/.github/workflows/release.yml
@@ -294,7 +294,7 @@ jobs:
 
             def caveats
               <<~EOS
-                Run first-run setup to download the local model and start the daemon:
+                Run first-run setup to install the runtime and download the local model:
                   i init
 
                 For zsh users: install the shell hook so prompts containing ? * [ ]
diff --git a/README.md b/README.md
index 163cdd2..e1a1530 100644
--- a/README.md
+++ b/README.md
@@ -16,7 +16,7 @@ It is **local-first** by default (no network required after first run, no prompt
 
 That composability applies to subcommands that consume natural language too: `i report "first problem" < extra-notes.txt` appends the piped text after the command-line text before proposing issues.
 
-> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime, daemon, and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap.
+> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime (llama.cpp's `llama-cli`) and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap.
 
 ## Building from source
 
@@ -125,7 +125,7 @@ With `--literal`, everything after the flag is treated as natural-language promp
 
 ## Managing models
 
-intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llamafile](https://github.com/mozilla-ai/llamafile). You can also point it at any public Hugging Face GGUF repo.
+intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llama.cpp](https://github.com/ggml-org/llama.cpp)'s `llama-cli`, which intent installs on demand through your system package manager (Homebrew, apt, dnf, …). Each prompt runs `llama-cli` one-shot — there is no background daemon or server. You can also point it at any public Hugging Face GGUF repo.
 
 ```sh
 # See what's on offer and which one is current.
diff --git a/install.sh b/install.sh
index 12cf36c..f582a74 100755
--- a/install.sh
+++ b/install.sh
@@ -11,7 +11,7 @@
 #   PREFIX           install root (default /usr/local; needs sudo if not writable)
 #   INTENT_TMPDIR    where to stage downloads (default $TMPDIR or /tmp)
 #
-# This script does not auto-install the daemon, the model runtime, or the
+# This script does not auto-install the model runtime (llama-cli) or the
 # model. Run `i init` after install.
 
 set -Eeuo pipefail
@@ -136,8 +136,8 @@ echo
 
 # Auto-run `intent init` if we have a real TTY on stdin/stderr. Without
 # this, users who curl|bash and ignore the next-steps text get a binary
-# they can't actually use until they read the docs. With it, the model
-# downloads and the daemon starts as part of the install flow.
+# they can't actually use until they read the docs. With it, the runtime
+# installs and the model downloads as part of the install flow.
 #
 # We skip it under `bash -c` / `curl | bash` (no TTY) so non-interactive
 # CI jobs aren't surprised by a 4 GB download.
diff --git a/internal/cli/backend.go b/internal/cli/backend.go
index cc27bc6..547e3eb 100644
--- a/internal/cli/backend.go
+++ b/internal/cli/backend.go
@@ -3,16 +3,15 @@ package cli
 import (
 	"context"
 	"fmt"
-	"net"
-	"net/url"
 	"os"
-	"strings"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
 	"github.com/CoreyRDean/intent/internal/model"
+	"github.com/CoreyRDean/intent/internal/model/llamacli"
 	"github.com/CoreyRDean/intent/internal/model/llamafile"
 	"github.com/CoreyRDean/intent/internal/model/mock"
+	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
+	"github.com/CoreyRDean/intent/internal/state"
 	"github.com/CoreyRDean/intent/internal/verbose"
 )
 
@@ -21,10 +20,11 @@ import (
 // unavailable and we silently fell back to the mock — callers use this to
 // surface a per-invocation warning so users aren't left confused.
 //
-// In v1 we wire: mock, llamafile-local, llamafile-network, ollama (as a
-// llamafile-shaped HTTP), openai (as a llamafile-shaped HTTP). The grammar
-// constraint is the same across all of them; the only differences are the
-// endpoint and the auth header.
+// Backends: mock; llama-cli (local one-shot llama.cpp subprocess, also
+// reachable under the legacy alias "llamafile-local"); llamafile-network,
+// ollama, and openai (all OpenAI-compatible HTTP). The JSON-schema grammar
+// constraint is the same across all of them; they differ only in transport
+// (local subprocess vs. HTTP endpoint + auth header).
 func buildBackend(name string, cfg *config.Config, modelOverride string) (model.Backend, bool, error) {
 	if v := os.Getenv("INTENT_FORCE_BACKEND"); v != "" {
 		name = v
@@ -32,26 +32,32 @@ func buildBackend(name string, cfg *config.Config, modelOverride string) (model.
 	switch name {
 	case "mock":
 		return mock.New(), false, nil
-	case "llamafile-local":
-		// We expect the daemon (`intentd`) to have started llamafile on
-		// the loopback host:port from config. If nothing's listening, we
-		// fall back to the mock backend so `i hello` doesn't hard-fail
-		// for a brand-new install — instead the mock returns an honest
-		// "the local model isn't installed yet" response.
-		host, port, err := resolveLocalDaemonEndpoint(cfg)
+	case "llama-cli", "llamafile-local":
+		// Local inference runs llama.cpp's `llama-cli` one-shot. If the
+		// binary or the selected model isn't installed yet, fall back to
+		// the mock backend so `i hello` doesn't hard-fail for a brand-new
+		// install — the mock returns an honest "not installed yet"
+		// response, and ensureBackendReady / `i doctor` guide the fix.
+		// ("llamafile-local" is kept as a back-compat alias for configs
+		// written before the switch to llama-cli.)
+		dirs, err := state.Resolve()
 		if err != nil {
 			return nil, false, err
 		}
-		endpoint := fmt.Sprintf("http://%s:%s", host, port)
-		if !endpointReachable(endpoint) {
+		rt := intentruntime.New(dirs.Cache)
+		modelPath := rt.ModelPath(selectedModelFile(dirs.State, cfg))
+		if !rt.HaveLlamaCLI() || !fileExists(modelPath) {
 			return mock.New(), true, nil
 		}
-		b := llamafile.New(endpoint)
+		b := llamacli.New(rt.LlamaCLIPath(), modelPath)
 		if modelOverride != "" {
 			b.ModelTag = modelOverride
 		} else {
 			b.ModelTag = cfg.Model
 		}
+		if m := loadCatalog(dirs.State).Get(cfg.Model); m != nil && m.ContextTokens > 0 {
+			b.ContextSize = m.ContextTokens
+		}
 		return b, false, nil
 	case "llamafile-network":
 		ep := os.Getenv("INTENT_LLAMAFILE_ENDPOINT")
@@ -119,6 +125,11 @@ func buildBackendCtx(ctx context.Context, name string, cfg *config.Config, model
 			l.KV("endpoint", b.Endpoint)
 			l.KV("model_tag", b.ModelTag)
 		}
+		if b, ok := be.(*llamacli.Backend); ok {
+			l.KV("binary", b.BinaryPath)
+			l.KV("model_path", b.ModelPath)
+			l.KV("model_tag", b.ModelTag)
+		}
 		be = verbose.Backend(l, be)
 	}
 	return be, fb, nil
@@ -131,7 +142,21 @@ func printMockFallbackBanner(isFallback bool) {
 	if !isFallback {
 		return
 	}
-	fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor', 'i model list', or 'i daemon start' to fix.")
+	fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor' or 'i model pull' to fix.")
+}
+
+// isLocalBackend reports whether a backend name uses the local llama-cli
+// runtime (and therefore wants the runtime/model self-healing in
+// ensureBackendReady). The empty string means "use the configured
+// default", which is llama-cli. "llamafile-local" is the back-compat
+// alias for configs predating the switch.
+func isLocalBackend(name string) bool {
+	switch name {
+	case "", "llama-cli", "llamafile-local":
+		return true
+	default:
+		return false
+	}
 }
 
 // isMockBackend reports whether b is the mock backend (by name).
@@ -140,28 +165,8 @@ func isMockBackend(b model.Backend) bool {
 	return b.Name() == "mock"
 }
 
-// endpointReachable does a short-timeout TCP check on the host:port of a URL.
-func endpointReachable(rawURL string) bool {
-	u, err := url.Parse(rawURL)
-	if err != nil {
-		return false
-	}
-	host := u.Host
-	if host == "" {
-		return false
-	}
-	if !strings.Contains(host, ":") {
-		switch u.Scheme {
-		case "https":
-			host += ":443"
-		default:
-			host += ":80"
-		}
-	}
-	c, err := net.DialTimeout("tcp", host, 200*time.Millisecond)
-	if err != nil {
-		return false
-	}
-	_ = c.Close()
-	return true
+// fileExists reports whether path exists and is not a directory.
+func fileExists(path string) bool {
+	info, err := os.Stat(path)
+	return err == nil && !info.IsDir()
 }
diff --git a/internal/cli/backend_test.go b/internal/cli/backend_test.go
index b58f9b0..faef6ed 100644
--- a/internal/cli/backend_test.go
+++ b/internal/cli/backend_test.go
@@ -40,39 +40,26 @@ func TestBuildBackend_MockIsNotFallback(t *testing.T) {
 	}
 }
 
-func TestBuildBackend_LlamafileLocalFallsBackWhenUnreachable(t *testing.T) {
+// When llama-cli or the model isn't installed, the local backend falls
+// back to mock so a fresh install doesn't hard-fail. We point the cache
+// at an empty temp dir so the model file is guaranteed absent.
+func TestBuildBackend_LlamaCLILocalFallsBackWhenNotInstalled(t *testing.T) {
 	clearBackendEnv(t)
-	// Point the daemon at a port that is definitely not listening.
-	cfg := minimalConfig()
-	cfg.Raw["daemon.host"] = "127.0.0.1"
-	cfg.Raw["daemon.port"] = "1" // port 1 is reserved; nothing listens there
-
-	be, isFallback, err := buildBackend("llamafile-local", cfg, "")
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if !isFallback {
-		t.Error("unavailable llamafile-local should set isFallback=true")
-	}
-	if be.Name() != "mock" {
-		t.Errorf("expected fallback name %q, got %q", "mock", be.Name())
-	}
-}
-
-func TestBuildBackend_LlamafileLocalRejectsNonLoopbackHost(t *testing.T) {
-	clearBackendEnv(t)
-	cfg := minimalConfig()
-	cfg.Raw["daemon.host"] = "0.0.0.0"
-
-	_, isFallback, err := buildBackend("llamafile-local", cfg, "")
-	if err == nil {
-		t.Fatal("expected error for non-loopback daemon host, got nil")
-	}
-	if isFallback {
-		t.Fatal("invalid daemon host should not silently fall back to mock")
-	}
-	if !strings.Contains(err.Error(), "loopback only") {
-		t.Fatalf("error = %q, want loopback hint", err)
+	t.Setenv("HOME", t.TempDir())
+	t.Setenv("INTENT_STATE_DIR", t.TempDir())
+	t.Setenv("INTENT_CACHE_DIR", t.TempDir())
+
+	for _, name := range []string{"llama-cli", "llamafile-local"} {
+		be, isFallback, err := buildBackend(name, minimalConfig(), "")
+		if err != nil {
+			t.Fatalf("%s: unexpected error: %v", name, err)
+		}
+		if !isFallback {
+			t.Errorf("%s: uninstalled local backend should set isFallback=true", name)
+		}
+		if be.Name() != "mock" {
+			t.Errorf("%s: expected fallback name %q, got %q", name, "mock", be.Name())
+		}
 	}
 }
 
@@ -155,7 +142,7 @@ func TestPrintMockFallbackBanner_MentionsNextSteps(t *testing.T) {
 	io.Copy(&buf, r)
 	out := buf.String()
 
-	for _, hint := range []string{"i doctor", "i daemon start"} {
+	for _, hint := range []string{"i doctor", "i model pull"} {
 		if !strings.Contains(out, hint) {
 			t.Errorf("banner should mention %q; got: %q", hint, out)
 		}
diff --git a/internal/cli/cli.go b/internal/cli/cli.go
index 64701e9..e53543c 100644
--- a/internal/cli/cli.go
+++ b/internal/cli/cli.go
@@ -23,7 +23,6 @@ var knownSubcommands = map[string]commandHandler{
 	"doctor":     cmdDoctor,
 	"config":     cmdConfig,
 	"model":      cmdModel,
-	"daemon":     cmdDaemon,
 	"history":    cmdHistory,
 	"pin":        cmdPin,
 	"run":        cmdRun,
@@ -163,12 +162,11 @@ Tip:
   double quotes for reliable shell parsing across environments.
 
 Subcommands:
-  init        First-run setup (model, daemon, completions).
+  init        First-run setup (model, runtime, completions).
   shell-init  Print shell snippet to source for natural-language quoting.
-  doctor      Diagnose installation, model, daemon, sandbox.
+  doctor      Diagnose installation, runtime, model, sandbox.
   config      Get/set/edit configuration.
   model       Manage local models.
-  daemon      Start/stop/status the background daemon.
   history     Inspect or clear the audit log.
   pin         Promote the last accepted command to a named skill.
   run         Run a pinned skill by name.
@@ -202,7 +200,7 @@ Top-level:
   --help, -h       This help.
   -v, --verbose    Log model I/O, tool calls, and gh round-trips to stderr.
                    (also enabled by INTENT_VERBOSE=1)
-  --uninstall      Remove binary, daemon, and (with consent) state.
+  --uninstall      Remove binary and (with consent) state.
   --update         Equivalent to "update".
 
 Read INTENT.md and docs/SPEC.md before contributing.
diff --git a/internal/cli/config.go b/internal/cli/config.go
index 8592dfa..8e8efac 100644
--- a/internal/cli/config.go
+++ b/internal/cli/config.go
@@ -86,14 +86,14 @@ func cmdConfig(_ context.Context, args []string) int {
 	}
 }
 
+// validateConfigValue is a hook for per-key validation on `i config set`.
+// Local inference no longer binds a network socket (llama-cli runs as a
+// subprocess), so there are currently no keys that need rejecting; the
+// function stays as the extension point.
 func validateConfigValue(key, value string) error {
-	switch key {
-	case "daemon.host":
-		_, err := normalizeLocalDaemonHost(value)
-		return err
-	default:
-		return nil
-	}
+	_ = key
+	_ = value
+	return nil
 }
 
 func lookupKnown(c *config.Config, key string) string {
diff --git a/internal/cli/daemon.go b/internal/cli/daemon.go
deleted file mode 100644
index 1b4da4c..0000000
--- a/internal/cli/daemon.go
+++ /dev/null
@@ -1,363 +0,0 @@
-package cli
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"os"
-	"os/exec"
-	"os/signal"
-	"path/filepath"
-	"runtime"
-	"syscall"
-	"time"
-
-	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
-	"github.com/CoreyRDean/intent/internal/models"
-	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
-	"github.com/CoreyRDean/intent/internal/state"
-)
-
-const daemonUsage = "usage: i daemon (start | stop | status | logs | install | uninstall)"
-
-// daemonLabel is the launchd / systemd unit name. Stable across versions.
-const daemonLabel = "com.coreyrdean.intent"
-
-func cmdDaemon(ctx context.Context, args []string) int {
-	if len(args) == 0 {
-		errf(daemonUsage)
-		return 1
-	}
-	dirs, err := state.Resolve()
-	if err != nil {
-		errf("daemon: %v", err)
-		return 3
-	}
-	cfg, _ := config.Load(dirs.ConfigPath())
-
-	switch args[0] {
-	case "--help", "-h", "help":
-		fmt.Println(daemonUsage)
-		return 0
-	case "start":
-		return daemonStart(ctx, dirs, cfg, args[1:])
-	case "stop":
-		return daemonStop(dirs)
-	case "status":
-		return daemonStatus(dirs)
-	case "logs":
-		return daemonLogs(dirs)
-	case "install":
-		return daemonInstall(dirs)
-	case "uninstall":
-		return daemonUninstall(dirs)
-	default:
-		errf("unknown subcommand: %q", args[0])
-		return 1
-	}
-}
-
-// daemonStart is the user-visible `i daemon start`. By default it spawns
-// itself in the background (re-execs with --foreground), waits for the
-// control socket to come up, prints a one-line "started" message, and
-// returns — so the user gets their prompt back in well under a second.
-//
-// `--foreground` (or `--attach`) keeps the process attached to the
-// terminal, which is what launchd / systemd want and what `i daemon
-// logs -f` style debugging needs. The env var INTENTD_FOREGROUND is
-// the same switch in env form, so service files don't have to know
-// about the flag.
-func daemonStart(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []string) int {
-	foreground := os.Getenv("INTENTD_FOREGROUND") == "1"
-	for _, a := range args {
-		switch a {
-		case "--foreground", "--attach", "-f":
-			foreground = true
-		case "--background", "-b":
-			foreground = false
-		}
-	}
-	if !foreground {
-		return daemonSpawnDetached(dirs)
-	}
-	return daemonRunForeground(ctx, dirs, cfg)
-}
-
-// daemonSpawnDetached re-execs ourselves with --foreground, redirects
-// the child's stdio to a log file, decouples it from our process group
-// (Setsid), and returns once the control socket is responsive — or
-// after a sane timeout, with the log path so the user can inspect a
-// failure.
-func daemonSpawnDetached(dirs state.Dirs) int {
-	if err := os.MkdirAll(filepath.Join(dirs.State, "logs"), 0o700); err != nil {
-		errf("daemon start: %v", err)
-		return 3
-	}
-	logPath := filepath.Join(dirs.State, "logs", "intentd.log")
-	logF, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600)
-	if err != nil {
-		errf("daemon start: open log %s: %v", logPath, err)
-		return 3
-	}
-	defer logF.Close()
-
-	self, err := os.Executable()
-	if err != nil {
-		errf("daemon start: locate self: %v", err)
-		return 3
-	}
-	cmd := exec.Command(self, "daemon", "start", "--foreground")
-	cmd.Env = append(os.Environ(), "INTENTD_FOREGROUND=1")
-	cmd.Stdout = logF
-	cmd.Stderr = logF
-	cmd.Stdin = nil
-	cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true}
-	if err := cmd.Start(); err != nil {
-		errf("daemon start: spawn: %v", err)
-		return 3
-	}
-	// Don't wait for it — we want it to outlive us.
-	go func() { _ = cmd.Process.Release() }()
-
-	// Poll the control socket for readiness. The child has 30s to come
-	// up before we report failure; on a cold cache that's mostly
-	// llamafile loading the model.
-	deadline := time.Now().Add(30 * time.Second)
-	c := daemon.NewClient(dirs.SocketPath())
-	for time.Now().Before(deadline) {
-		if resp, err := c.Call(daemon.Request{Op: daemon.OpPing}); err == nil && resp.OK {
-			fmt.Fprintln(os.Stderr, "intentd: started in the background.")
-			fmt.Fprintf(os.Stderr, "  socket: %s\n", dirs.SocketPath())
-			fmt.Fprintf(os.Stderr, "  log:    %s\n", logPath)
-			return 0
-		}
-		time.Sleep(250 * time.Millisecond)
-	}
-	errf("daemon start: timed out waiting for control socket; tail -f %s", logPath)
-	return 3
-}
-
-func daemonRunForeground(ctx context.Context, dirs state.Dirs, cfg *config.Config) int {
-	mgr := intentruntime.New(dirs.Cache)
-	if !mgr.HaveLlamafile() {
-		errf("daemon: llamafile runtime missing — run `i model pull` first")
-		errf("  expected: %s", mgr.LlamafilePath())
-		return 3
-	}
-	// Resolve the model through the full catalog (built-in + custom)
-	// so the daemon loads exactly what `i model use` selected, even
-	// for user-added HF repos that aren't in the built-in list.
-	cat := loadCatalog(dirs.State)
-	id := cfg.Model
-	if id == "" {
-		id = models.DefaultID
-	}
-	host, port, err := resolveLocalDaemonEndpoint(cfg)
-	if err != nil {
-		errf("daemon: %v", err)
-		return 1
-	}
-	m := cat.Get(id)
-	if m == nil {
-		errf("daemon: current model %q not in catalog; run `i model list` and `i model use <id>`", id)
-		return 1
-	}
-	modelPath := mgr.ModelPath(models.ModelFilename(m))
-	if _, err := os.Stat(modelPath); err != nil {
-		errf("daemon: model %q not installed — run `i model pull %s`", id, id)
-		errf("  expected: %s", modelPath)
-		return 3
-	}
-
-	logDir := filepath.Join(dirs.State, "logs")
-	if err := os.MkdirAll(logDir, 0o700); err != nil {
-		errf("daemon: mkdir log dir: %v", err)
-		return 3
-	}
-	logPath := filepath.Join(logDir, "llamafile.log")
-	logF, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600)
-	if err != nil {
-		errf("daemon: open log: %v", err)
-		return 3
-	}
-	defer logF.Close()
-
-	portNum := 18080
-	fmt.Sscanf(port, "%d", &portNum)
-
-	launcher := daemon.NewLauncher(mgr.LlamafilePath(), modelPath, host, portNum)
-	launcher.StdoutLog = logF
-	launcher.StderrLog = io.MultiWriter(logF, os.Stderr)
-
-	startCtx, cancelStart := context.WithTimeout(ctx, 90*time.Second)
-	fmt.Fprintln(os.Stderr, "intentd: starting llamafile...")
-	if err := launcher.Start(startCtx); err != nil {
-		cancelStart()
-		errf("daemon: start llamafile: %v", err)
-		return 3
-	}
-	cancelStart()
-	fmt.Fprintf(os.Stderr, "intentd: llamafile ready on %s (pid %d)\n",
-		launcher.Endpoint(), launcher.PID())
-
-	srv := daemon.New(dirs.SocketPath(), launcher)
-	if err := srv.Listen(); err != nil {
-		launcher.Stop(5 * time.Second)
-		errf("daemon: listen: %v", err)
-		return 3
-	}
-	fmt.Fprintf(os.Stderr, "intentd: control socket at %s\n", dirs.SocketPath())
-
-	sigCtx, cancelSig := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP)
-	defer cancelSig()
-	serveDone := make(chan struct{})
-	go func() {
-		_ = srv.Serve(sigCtx)
-		close(serveDone)
-	}()
-
-	// Block until any of: an OS signal, an `i daemon stop` over the
-	// socket, the parent context is canceled, or the supervised
-	// llamafile gives up entirely (Wait drains).
-	llamaDone := make(chan struct{})
-	go func() { launcher.Wait(); close(llamaDone) }()
-	select {
-	case <-sigCtx.Done():
-	case <-srv.Stopped():
-	case <-serveDone:
-	case <-llamaDone:
-	}
-	fmt.Fprintln(os.Stderr, "intentd: shutting down...")
-	srv.SignalStop()
-	launcher.Stop(10 * time.Second)
-	fmt.Fprintln(os.Stderr, "intentd: stopped.")
-	return 0
-}
-
-func daemonStop(dirs state.Dirs) int {
-	c := daemon.NewClient(dirs.SocketPath())
-	resp, err := c.Call(daemon.Request{Op: daemon.OpStop})
-	if err != nil {
-		errf("daemon stop: %v (is the daemon running?)", err)
-		return 1
-	}
-	if !resp.OK {
-		errf("daemon stop: %s", resp.Error)
-		return 1
-	}
-	fmt.Println("daemon: stop requested")
-	return 0
-}
-
-func daemonStatus(dirs state.Dirs) int {
-	c := daemon.NewClient(dirs.SocketPath())
-	resp, err := c.Call(daemon.Request{Op: daemon.OpStatus})
-	if err != nil {
-		fmt.Println("daemon: not running")
-		fmt.Println("  socket:", dirs.SocketPath())
-		fmt.Println("  installed as service:", daemon.IsInstalled(daemonLabel))
-		return 1
-	}
-	if !resp.OK {
-		errf("daemon status: %s", resp.Error)
-		return 1
-	}
-	fmt.Println("daemon: running")
-	for k, v := range resp.Data {
-		fmt.Printf("  %s: %v\n", k, v)
-	}
-	return 0
-}
-
-func daemonLogs(dirs state.Dirs) int {
-	logPath := filepath.Join(dirs.State, "logs", "llamafile.log")
-	if runtime.GOOS == "linux" && daemon.IsInstalled(daemonLabel) {
-		fmt.Fprintln(os.Stderr, "Tip: run `journalctl --user -u "+daemonLabel+".service -f` for the systemd-managed log.")
-		fmt.Fprintln(os.Stderr, "Showing the llamafile subprocess log:", logPath)
-	}
-	f, err := os.Open(logPath)
-	if err != nil {
-		errf("logs: %v", err)
-		return 1
-	}
-	defer f.Close()
-	if _, err := io.Copy(os.Stdout, f); err != nil {
-		errf("logs: %v", err)
-		return 1
-	}
-	return 0
-}
-
-func daemonInstall(dirs state.Dirs) int {
-	bin, err := os.Executable()
-	if err != nil {
-		errf("daemon install: locate self: %v", err)
-		return 3
-	}
-	bin, _ = filepath.EvalSymlinks(bin)
-	res, err := daemon.Install(daemon.InstallParams{
-		Binary: bin,
-		Label:  daemonLabel,
-		LogDir: filepath.Join(dirs.State, "logs"),
-		Socket: dirs.SocketPath(),
-		Cache:  dirs.Cache,
-		State:  dirs.State,
-	})
-	if err != nil {
-		errf("daemon install: %v", err)
-		return 3
-	}
-	fmt.Println("daemon installed as a system service.")
-	fmt.Println("  unit:    ", res.UnitPath)
-	fmt.Println("  start:   ", strJoin(res.StartCmd))
-	fmt.Println("  stop:    ", strJoin(res.StopCmd))
-	if res.LogPath != "" {
-		fmt.Println("  log:     ", res.LogPath)
-	}
-	if res.Notes != "" {
-		fmt.Println()
-		fmt.Println(res.Notes)
-	}
-	return 0
-}
-
-func daemonUninstall(dirs state.Dirs) int {
-	// Try a polite stop first.
-	c := daemon.NewClient(dirs.SocketPath())
-	_, _ = c.Call(daemon.Request{Op: daemon.OpStop})
-	if err := daemon.Uninstall(daemonLabel); err != nil {
-		errf("daemon uninstall: %v", err)
-		return 3
-	}
-	fmt.Println("daemon: service uninstalled.")
-	return 0
-}
-
-func strJoin(parts []string) string {
-	if len(parts) == 0 {
-		return "(none)"
-	}
-	out := ""
-	for i, p := range parts {
-		if i > 0 {
-			out += " "
-		}
-		out += p
-	}
-	return out
-}
-
-// modelFileFor maps a config model tag (e.g. "qwen2.5-coder-7b-instruct-q4_k_m")
-// to the GGUF filename we expect on disk. v1 is hard-coded to one default;
-// future versions consult a model registry.
-func modelFileFor(tag string) string {
-	if tag == intentruntime.DefaultModel.Name {
-		return intentruntime.DefaultModel.File
-	}
-	// Best-effort: assume tag + ".gguf".
-	if filepath.Ext(tag) == ".gguf" {
-		return tag
-	}
-	return tag + ".gguf"
-}
diff --git a/internal/cli/daemon_host.go b/internal/cli/daemon_host.go
deleted file mode 100644
index 871faae..0000000
--- a/internal/cli/daemon_host.go
+++ /dev/null
@@ -1,57 +0,0 @@
-package cli
-
-import (
-	"fmt"
-	"net"
-	"strings"
-
-	"github.com/CoreyRDean/intent/internal/config"
-)
-
-const defaultLocalDaemonHost = "127.0.0.1"
-const defaultLocalDaemonPort = "18080"
-
-// normalizeLocalDaemonHost accepts only loopback hosts for the local daemon.
-// Any accepted value is canonicalized to 127.0.0.1 so the local backend never
-// accidentally exposes the model server on a broader interface.
-func normalizeLocalDaemonHost(raw string) (string, error) {
-	host := strings.TrimSpace(raw)
-	if host == "" {
-		return defaultLocalDaemonHost, nil
-	}
-	if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") {
-		host = strings.TrimSuffix(strings.TrimPrefix(host, "["), "]")
-	}
-	if strings.EqualFold(host, "localhost") {
-		return defaultLocalDaemonHost, nil
-	}
-	if ip := net.ParseIP(host); ip != nil && ip.IsLoopback() {
-		return defaultLocalDaemonHost, nil
-	}
-	return "", fmt.Errorf("daemon.host %q must resolve to loopback only", strings.TrimSpace(raw))
-}
-
-func resolveLocalDaemonHost(cfg *config.Config) (string, error) {
-	if cfg == nil {
-		return normalizeLocalDaemonHost("")
-	}
-	return normalizeLocalDaemonHost(cfg.Raw["daemon.host"])
-}
-
-func resolveLocalDaemonPort(cfg *config.Config) string {
-	if cfg == nil {
-		return defaultLocalDaemonPort
-	}
-	if port := strings.TrimSpace(cfg.Raw["daemon.port"]); port != "" {
-		return port
-	}
-	return defaultLocalDaemonPort
-}
-
-func resolveLocalDaemonEndpoint(cfg *config.Config) (host, port string, err error) {
-	host, err = resolveLocalDaemonHost(cfg)
-	if err != nil {
-		return "", "", err
-	}
-	return host, resolveLocalDaemonPort(cfg), nil
-}
diff --git a/internal/cli/daemon_host_test.go b/internal/cli/daemon_host_test.go
deleted file mode 100644
index 22d7ade..0000000
--- a/internal/cli/daemon_host_test.go
+++ /dev/null
@@ -1,75 +0,0 @@
-package cli
-
-import (
-	"strings"
-	"testing"
-
-	"github.com/CoreyRDean/intent/internal/config"
-)
-
-func TestNormalizeLocalDaemonHost(t *testing.T) {
-	tests := []struct {
-		name    string
-		raw     string
-		want    string
-		wantErr string
-	}{
-		{name: "default empty host", raw: "", want: "127.0.0.1"},
-		{name: "localhost", raw: "localhost", want: "127.0.0.1"},
-		{name: "ipv4 loopback", raw: "127.0.0.1", want: "127.0.0.1"},
-		{name: "ipv6 loopback", raw: "::1", want: "127.0.0.1"},
-		{name: "bracketed ipv6 loopback", raw: "[::1]", want: "127.0.0.1"},
-		{name: "non-loopback wildcard rejected", raw: "0.0.0.0", wantErr: "loopback only"},
-		{name: "non-loopback ip rejected", raw: "192.168.1.10", wantErr: "loopback only"},
-		{name: "hostname rejected", raw: "example.com", wantErr: "loopback only"},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			got, err := normalizeLocalDaemonHost(tt.raw)
-			if tt.wantErr != "" {
-				if err == nil {
-					t.Fatalf("expected error containing %q, got nil", tt.wantErr)
-				}
-				if !strings.Contains(err.Error(), tt.wantErr) {
-					t.Fatalf("error = %q, want substring %q", err, tt.wantErr)
-				}
-				return
-			}
-			if err != nil {
-				t.Fatalf("unexpected error: %v", err)
-			}
-			if got != tt.want {
-				t.Fatalf("host = %q, want %q", got, tt.want)
-			}
-		})
-	}
-}
-
-func TestResolveLocalDaemonEndpoint(t *testing.T) {
-	cfg := &config.Config{Raw: map[string]string{
-		"daemon.host": " localhost ",
-		"daemon.port": " 19090 ",
-	}}
-
-	host, port, err := resolveLocalDaemonEndpoint(cfg)
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if host != "127.0.0.1" {
-		t.Fatalf("host = %q, want %q", host, "127.0.0.1")
-	}
-	if port != "19090" {
-		t.Fatalf("port = %q, want %q", port, "19090")
-	}
-}
-
-func TestValidateConfigValueRejectsRemoteDaemonHost(t *testing.T) {
-	err := validateConfigValue("daemon.host", "0.0.0.0")
-	if err == nil {
-		t.Fatal("expected daemon.host validation error, got nil")
-	}
-	if !strings.Contains(err.Error(), "loopback only") {
-		t.Fatalf("error = %q, want loopback hint", err)
-	}
-}
diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go
index 5d67363..76995f3 100644
--- a/internal/cli/doctor.go
+++ b/internal/cli/doctor.go
@@ -7,22 +7,11 @@ import (
 	"runtime"
 
 	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
 	"github.com/CoreyRDean/intent/internal/state"
 	"github.com/CoreyRDean/intent/internal/version"
 )
 
-type daemonStatusCaller interface {
-	Call(req daemon.Request) (*daemon.Response, error)
-}
-
-var newDaemonStatusClient = func(socket string) daemonStatusCaller {
-	return daemon.NewClient(socket)
-}
-
-var daemonServiceInstalled = daemon.IsInstalled
-
 func cmdDoctor(_ context.Context, _ []string) int {
 	ok := true
 	check := func(name, status string, good bool) {
@@ -54,15 +43,14 @@ func cmdDoctor(_ context.Context, _ []string) int {
 		}
 
 		rt := intentruntime.New(dirs.Cache)
-		check("llamafile runtime",
-			fmt.Sprintf("expected at %s", rt.LlamafilePath()),
-			rt.HaveLlamafile())
+		cliStatus := "missing — run `i model pull` to install via your package manager"
+		if rt.HaveLlamaCLI() {
+			cliStatus = "found at " + rt.LlamaCLIPath()
+		}
+		check("llama-cli runtime", cliStatus, rt.HaveLlamaCLI())
 
 		modelFile, modelStatus := resolveModelCheck(cfg)
 		check("model", fmt.Sprintf("%s — %s", modelStatus, rt.ModelPath(modelFile)), rt.HaveModel(modelFile))
-
-		daemonStatus, daemonOK := doctorDaemonStatus(dirs)
-		check("daemon", daemonStatus, daemonOK)
 	}
 
 	// Sandbox tooling.
@@ -107,26 +95,3 @@ func okStr(err error) string {
 	}
 	return "missing"
 }
-
-func doctorDaemonStatus(dirs state.Dirs) (string, bool) {
-	installed := daemonServiceInstalled(daemonLabel)
-	resp, err := newDaemonStatusClient(dirs.SocketPath()).Call(daemon.Request{Op: daemon.OpStatus})
-	if err != nil {
-		if installed {
-			return "installed but not responding", false
-		}
-		return "not running (optional)", true
-	}
-	if !resp.OK {
-		return "unhealthy: " + resp.Error, false
-	}
-
-	serviceState := "no"
-	if installed {
-		serviceState = "yes"
-	}
-	if endpoint, _ := resp.Data["llamafile_endpoint"].(string); endpoint != "" {
-		return fmt.Sprintf("running (service installed: %s, endpoint: %s)", serviceState, endpoint), true
-	}
-	return fmt.Sprintf("running (service installed: %s)", serviceState), true
-}
diff --git a/internal/cli/doctor_test.go b/internal/cli/doctor_test.go
index 80010f1..ff9d8fb 100644
--- a/internal/cli/doctor_test.go
+++ b/internal/cli/doctor_test.go
@@ -2,14 +2,11 @@ package cli
 
 import (
 	"context"
-	"errors"
 	"strings"
 	"testing"
 
 	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
-	"github.com/CoreyRDean/intent/internal/state"
 )
 
 func TestResolveModelCheck(t *testing.T) {
@@ -58,100 +55,17 @@ func TestResolveModelCheck(t *testing.T) {
 	}
 }
 
-type stubDaemonStatusClient struct {
-	resp *daemon.Response
-	err  error
-}
-
-func (s stubDaemonStatusClient) Call(_ daemon.Request) (*daemon.Response, error) {
-	return s.resp, s.err
-}
-
-func TestDoctorDaemonStatus(t *testing.T) {
-	origNewClient := newDaemonStatusClient
-	origInstalled := daemonServiceInstalled
-	t.Cleanup(func() {
-		newDaemonStatusClient = origNewClient
-		daemonServiceInstalled = origInstalled
-	})
-
-	dirs := state.Dirs{State: t.TempDir()}
-
-	tests := []struct {
-		name      string
-		installed bool
-		client    stubDaemonStatusClient
-		want      string
-		wantOK    bool
-	}{
-		{
-			name:      "missing optional daemon is informational",
-			installed: false,
-			client:    stubDaemonStatusClient{err: errors.New("dial unix: no such file or directory")},
-			want:      "not running (optional)",
-			wantOK:    true,
-		},
-		{
-			name:      "installed daemon that does not respond is unhealthy",
-			installed: true,
-			client:    stubDaemonStatusClient{err: errors.New("connection refused")},
-			want:      "installed but not responding",
-			wantOK:    false,
-		},
-		{
-			name:      "running daemon reports endpoint",
-			installed: false,
-			client: stubDaemonStatusClient{resp: &daemon.Response{
-				OK: true,
-				Data: map[string]any{
-					"llamafile_endpoint": "http://127.0.0.1:18080",
-				},
-			}},
-			want:   "running (service installed: no, endpoint: http://127.0.0.1:18080)",
-			wantOK: true,
-		},
-	}
-
-	for _, tt := range tests {
-		t.Run(tt.name, func(t *testing.T) {
-			newDaemonStatusClient = func(string) daemonStatusCaller { return tt.client }
-			daemonServiceInstalled = func(string) bool { return tt.installed }
-
-			got, gotOK := doctorDaemonStatus(dirs)
-			if got != tt.want {
-				t.Fatalf("status = %q, want %q", got, tt.want)
-			}
-			if gotOK != tt.wantOK {
-				t.Fatalf("ok = %v, want %v", gotOK, tt.wantOK)
-			}
-		})
-	}
-}
-
-func TestDoctorPrintsDaemonStatus(t *testing.T) {
-	origNewClient := newDaemonStatusClient
-	origInstalled := daemonServiceInstalled
-	t.Cleanup(func() {
-		newDaemonStatusClient = origNewClient
-		daemonServiceInstalled = origInstalled
-	})
-
+// TestDoctorReportsLlamaCLIRuntime verifies that doctor prints the
+// llama-cli runtime line, which replaced the former daemon status line.
+func TestDoctorReportsLlamaCLIRuntime(t *testing.T) {
 	t.Setenv("HOME", t.TempDir())
 	t.Setenv("INTENT_STATE_DIR", t.TempDir())
 	t.Setenv("INTENT_CACHE_DIR", t.TempDir())
 
-	newDaemonStatusClient = func(string) daemonStatusCaller {
-		return stubDaemonStatusClient{err: errors.New("dial unix: no such file or directory")}
-	}
-	daemonServiceInstalled = func(string) bool { return false }
-
 	out := captureStdout(func() {
 		_ = cmdDoctor(context.Background(), nil)
 	})
-	if !strings.Contains(out, "daemon") {
-		t.Fatalf("doctor output missing daemon line: %q", out)
-	}
-	if !strings.Contains(out, "not running (optional)") {
-		t.Fatalf("doctor output missing optional daemon status: %q", out)
+	if !strings.Contains(out, "llama-cli runtime") {
+		t.Fatalf("doctor output missing llama-cli runtime line: %q", out)
 	}
 }
diff --git a/internal/cli/ensure.go b/internal/cli/ensure.go
index f45b399..7287c7c 100644
--- a/internal/cli/ensure.go
+++ b/internal/cli/ensure.go
@@ -4,14 +4,11 @@ import (
 	"bufio"
 	"context"
 	"fmt"
-	"net"
 	"os"
 	"path/filepath"
 	"strings"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
-	"github.com/CoreyRDean/intent/internal/daemon"
 	"github.com/CoreyRDean/intent/internal/models"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
 	"github.com/CoreyRDean/intent/internal/state"
@@ -19,36 +16,31 @@ import (
 )
 
 // ensureBackendReady is the self-healing precondition for any subcommand
-// that wants to talk to the local model. It checks (in order):
+// that wants to talk to the local model. Local inference runs llama.cpp's
+// `llama-cli` one-shot — there is no daemon to start — so "ready" means:
 //
-//  1. The daemon is reachable. If yes, we're done.
-//  2. The runtime + model are present on disk.
-//     - If not and stdin is a TTY: ask permission, then download.
-//     - If not and we're non-interactive: fail with a clear, copyable
-//     command that fixes it.
-//  3. With files in place but no daemon, start one in the background.
-//  4. Wait briefly for the daemon's control socket to come up.
+//  1. The llama-cli runtime is available (installed on demand via the system package manager).
+//  2. The selected GGUF model is downloaded.
 //
-// Returns true if the call site should proceed, false if it should
-// bail out (we already printed the failure reason).
+// If either is missing:
+//   - interactive TTY: ask permission, then install/download.
+//   - non-interactive: fail with a clear, copyable command that fixes it.
 //
-// Backend-name guard: this only fires for the local llamafile backend.
-// Users on `openai`, `ollama`, or `mock` get no prompts and no startup
-// attempts — we're not their package manager.
+// Returns true if the call site should proceed, false if it should bail
+// out (we already printed the failure reason).
+//
+// Backend-name guard: this only fires for the local llama-cli backend.
+// Users on `openai`, `ollama`, `llamafile-network`, or `mock` get no
+// prompts and no install attempts — we're not their package manager.
 func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config) bool {
-	if cfg.Backend != "" && cfg.Backend != "llamafile-local" {
+	if !isLocalBackend(cfg.Backend) {
 		return true
 	}
 
-	// (1) Daemon already up?
-	if pingDaemon(dirs) {
-		return true
-	}
-
-	mgr := intentruntime.New(dirs.Cache)
-	// Resolve the *selected* model through the catalog so self-
-	// healing downloads the right thing when the user has switched
-	// to a custom HF repo or a non-default built-in.
+	rt := intentruntime.New(dirs.Cache)
+	// Resolve the *selected* model through the catalog so self-healing
+	// downloads the right thing when the user has switched to a custom
+	// HF repo or a non-default built-in.
 	cat := loadCatalog(dirs.State)
 	id := cfg.Model
 	if id == "" {
@@ -56,91 +48,62 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config
 	}
 	selected := cat.Get(id)
 	if selected == nil {
-		// Fall back to the catalog default to at least make progress;
-		// the daemon will complain later if this mismatches config.
 		selected = cat.Default()
 	}
-	haveLF := mgr.HaveLlamafile()
-	haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected))
-	interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr)
+	haveCLI := rt.HaveLlamaCLI()
+	haveModel := selected != nil && rt.HaveModel(models.ModelFilename(selected))
+	if haveCLI && haveModel {
+		return true
+	}
 
-	// (2) Missing artifacts.
-	if !haveLF || !haveModel {
-		if !interactive {
-			fmt.Fprintln(os.Stderr, "intent: local model isn't installed yet.")
-			fmt.Fprintln(os.Stderr, "  run: i model pull")
-			return false
-		}
-		fmt.Fprintln(os.Stderr, "intent: the local model isn't installed yet.")
-		if !haveLF {
-			fmt.Fprintln(os.Stderr, "  missing runtime: llamafile-"+intentruntime.LlamafileVersion)
-		}
-		if !haveModel && selected != nil {
-			fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n",
-				selected.ID, selected.SizeMB)
-		}
-		if !confirmYes("Download now?") {
-			fmt.Fprintln(os.Stderr, "intent: skipped. Run `i model pull` later.")
-			return false
-		}
-		if !haveLF {
-			fmt.Fprintln(os.Stderr, "downloading runtime...")
-			if err := mgr.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil {
-				fmt.Fprintln(os.Stderr)
-				errf("runtime: %v", err)
-				return false
-			}
-			fmt.Fprintln(os.Stderr)
+	interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr)
+	if !interactive {
+		fmt.Fprintln(os.Stderr, "intent: local model isn't ready yet.")
+		if !haveCLI {
+			fmt.Fprintln(os.Stderr, "  missing runtime: llama-cli (llama.cpp)")
 		}
 		if !haveModel && selected != nil {
-			fmt.Fprintf(os.Stderr, "downloading model (~%d MB)...\n", selected.SizeMB)
-			mi := intentruntime.FromCatalog(selected)
-			if err := mgr.EnsureModel(ctx, mi, progressCB("model")); err != nil {
-				fmt.Fprintln(os.Stderr)
-				errf("model: %v", err)
-				return false
-			}
-			fmt.Fprintln(os.Stderr)
+			fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n", selected.ID, selected.SizeMB)
 		}
+		fmt.Fprintln(os.Stderr, "  run: i model pull")
+		return false
 	}
 
-	// (3) Start the daemon. We use the same `i daemon start` code path
-	// as the user would, so behaviour matches and bugs are shared.
-	fmt.Fprintln(os.Stderr, "intent: starting daemon in the background...")
-	if rc := daemonSpawnDetached(dirs); rc != 0 {
-		fmt.Fprintln(os.Stderr, "intent: daemon failed to start; falling back to mock.")
+	fmt.Fprintln(os.Stderr, "intent: the local model isn't ready yet.")
+	if !haveCLI {
+		fmt.Fprintln(os.Stderr, "  missing runtime: llama-cli (llama.cpp)")
+	}
+	if !haveModel && selected != nil {
+		fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n", selected.ID, selected.SizeMB)
+	}
+	if !confirmYes("Set up now?") {
+		fmt.Fprintln(os.Stderr, "intent: skipped. Run `i model pull` later.")
 		return false
 	}
 
-	// (4) Confirm it's actually responsive (daemonSpawnDetached already
-	// polls, but be defensive — the socket might be ready while
-	// llamafile is still warming up its first inference).
-	deadline := time.Now().Add(60 * time.Second)
-	for time.Now().Before(deadline) {
-		if pingDaemon(dirs) {
-			return true
+	if !haveCLI {
+		fmt.Fprintln(os.Stderr, "installing llama-cli...")
+		if err := rt.EnsureLlamaCLI(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
+			errf("runtime: %v", err)
+			return false
 		}
-		time.Sleep(200 * time.Millisecond)
 	}
-	fmt.Fprintln(os.Stderr, "intent: daemon started but isn't responding yet; try again in a few seconds.")
-	return false
-}
-
-// pingDaemon checks both that the control socket exists and that the
-// daemon answers a ping. Either an unreachable socket or a sad daemon
-// returns false.
-func pingDaemon(dirs state.Dirs) bool {
-	if _, err := os.Stat(dirs.SocketPath()); err != nil {
-		return false
+	if !haveModel && selected != nil {
+		fmt.Fprintf(os.Stderr, "downloading model (~%d MB)...\n", selected.SizeMB)
+		mi := intentruntime.FromCatalog(selected)
+		if err := rt.EnsureModel(ctx, mi, progressCB("model")); err != nil {
+			fmt.Fprintln(os.Stderr)
+			errf("model: %v", err)
+			return false
+		}
+		fmt.Fprintln(os.Stderr)
 	}
-	c, err := net.DialTimeout("unix", dirs.SocketPath(), 200*time.Millisecond)
-	if err != nil {
+	// Re-verify both preconditions, mirroring the fast path: "ready" needs the runtime and the model.
+	if !rt.HaveLlamaCLI() || selected == nil || !rt.HaveModel(models.ModelFilename(selected)) {
+		fmt.Fprintln(os.Stderr, "intent: llama-cli or model still not available; falling back to mock.")
 		return false
 	}
-	_ = c.Close()
-	cli := daemon.NewClient(dirs.SocketPath())
-	resp, err := cli.Call(daemon.Request{Op: daemon.OpPing})
-	return err == nil && resp.OK
+	return true
 }
 
 // cfgModelFile turns the configured model tag into a GGUF filename,
@@ -188,25 +151,3 @@ func confirmYes(prompt string) bool {
 	line = strings.TrimSpace(strings.ToLower(line))
 	return line == "" || line == "y" || line == "yes"
 }
-
-// startDaemonAndWait is a small helper used by `i init` after a model
-// pull, to bring the daemon up without making the user run a third
-// command. It mirrors ensureBackendReady's daemon-startup half but
-// with louder logging since this is an explicit setup step.
-func startDaemonAndWait(dirs state.Dirs) error {
-	if pingDaemon(dirs) {
-		return nil
-	}
-	if rc := daemonSpawnDetached(dirs); rc != 0 {
-		return fmt.Errorf("daemon failed to start (see %s)",
-			filepath.Join(dirs.State, "logs", "intentd.log"))
-	}
-	deadline := time.Now().Add(60 * time.Second)
-	for time.Now().Before(deadline) {
-		if pingDaemon(dirs) {
-			return nil
-		}
-		time.Sleep(200 * time.Millisecond)
-	}
-	return fmt.Errorf("daemon started but didn't become responsive in 60s")
-}
diff --git a/internal/cli/init.go b/internal/cli/init.go
index 6ecc8aa..94b7437 100644
--- a/internal/cli/init.go
+++ b/internal/cli/init.go
@@ -44,20 +44,6 @@ func cmdInit(ctx context.Context, args []string) int {
 	fmt.Printf("  cache dir: %s\n", dirs.Cache)
 	fmt.Println()
 
-	// Daemon prompt — default Yes, per D-004.
-	fmt.Print("Keep intent warm in the background so it never has to load? [Y/n] ")
-	answer := "y"
-	if !autoYes {
-		r := bufio.NewReader(os.Stdin)
-		line, _ := r.ReadString('\n')
-		line = strings.TrimSpace(strings.ToLower(line))
-		if line == "" {
-			line = "y"
-		}
-		answer = line
-	}
-	cfg.DaemonEnabled = answer == "y" || answer == "yes"
-
 	// Shell integration prompt — default Yes. Without it, zsh users
 	// hit "no matches found" the first time they type a prompt with
 	// a literal `?` in it, which is a brutal first impression.
@@ -81,11 +67,7 @@ func cmdInit(ctx context.Context, args []string) int {
 
 	fmt.Println()
 	fmt.Println("Wrote", dirs.ConfigPath())
-	if cfg.DaemonEnabled {
-		fmt.Println("Daemon: enabled. Run `i daemon install` to register it as a launchd/systemd service.")
-	} else {
-		fmt.Println("Daemon: disabled. Each invocation will cold-load the model.")
-	}
+	fmt.Println("Local inference runs llama.cpp's `llama-cli` on demand (no background daemon).")
 
 	if installHook {
 		writeShellHook()
@@ -94,9 +76,9 @@ func cmdInit(ctx context.Context, args []string) int {
 		fmt.Println("            ? * [ ] characters, or run `i shell-init zsh >> ~/.zshrc` later.")
 	}
 
-	// Model pull + daemon start. This is the difference between
-	// "config written, now go figure out three more commands" and
-	// "open a new shell and you're working." Default Yes.
+	// Runtime install + model pull. This is the difference between
+	// "config written, now go figure out more commands" and "open a
+	// new shell and you're working." Default Yes.
 	mgr := intentruntime.New(dirs.Cache)
 	cat := loadCatalog(dirs.State)
 	// Prefer whatever the user already selected in config over the
@@ -106,14 +88,14 @@ func cmdInit(ctx context.Context, args []string) int {
 	if selected == nil {
 		selected = cat.Default()
 	}
-	haveLF := mgr.HaveLlamafile()
+	haveCLI := mgr.HaveLlamaCLI()
 	haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected))
-	if !haveLF || !haveModel {
+	if !haveCLI || !haveModel {
 		fmt.Println()
 		if selected != nil {
-			fmt.Printf("Download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB)
+			fmt.Printf("Install llama-cli and download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB)
 		} else {
-			fmt.Printf("Download the default local model now? [Y/n] ")
+			fmt.Printf("Install llama-cli and the default local model now? [Y/n] ")
 		}
 		pullAnswer := "y"
 		if !autoYes {
@@ -126,11 +108,11 @@ func cmdInit(ctx context.Context, args []string) int {
 			pullAnswer = line
 		}
 		if pullAnswer == "y" || pullAnswer == "yes" {
-			if !haveLF {
-				fmt.Println("downloading runtime...")
-				if err := mgr.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil {
+			if !haveCLI {
+				fmt.Println("installing llama-cli via your package manager...")
+				if err := mgr.EnsureLlamaCLI(ctx, func(s string) { fmt.Println("  " + s) }); err != nil {
 					fmt.Println()
-					errf("init: download runtime: %v", err)
+					errf("init: install runtime: %v", err)
 					fmt.Println("you can retry with `i model pull`.")
 					return 0
 				}
@@ -152,20 +134,10 @@ func cmdInit(ctx context.Context, args []string) int {
 		}
 	} else {
 		fmt.Println()
+		fmt.Println("Runtime:     llama-cli already installed.")
 		fmt.Println("Model:       already installed.")
 	}
 
-	if cfg.DaemonEnabled {
-		fmt.Println("Starting daemon...")
-		if err := startDaemonAndWait(dirs); err != nil {
-			errf("init: %v", err)
-			fmt.Println("you can retry with `i daemon start` (and inspect logs at",
-				filepath.Join(dirs.State, "logs", "intentd.log")+").")
-		} else {
-			fmt.Println("Daemon:      running.")
-		}
-	}
-
 	fmt.Println()
 	fmt.Println("All set. Try:")
 	fmt.Println("  i hello              # smoke test")
diff --git a/internal/cli/intent.go b/internal/cli/intent.go
index 71bdee4..0d1dbfa 100644
--- a/internal/cli/intent.go
+++ b/internal/cli/intent.go
@@ -213,10 +213,10 @@ func cmdIntent(ctx context.Context, args []string) int {
 		return 3
 	}
 
-	// Self-heal: if the backend is local-llamafile and the daemon
-	// isn't reachable, offer to download the model and start it.
-	// This collapses what used to be three commands the user had to
-	// guess (`i model pull`, `i daemon install`, retry) into one
+	// Self-heal: if the backend is the local llama-cli runtime and it
+	// (or the selected model) isn't installed yet, offer to install the
+	// runtime via the system package manager and download the model.
+	// This collapses what used to be several setup commands into one
 	// prompt or, if `--yes` is set, zero. See ensure.go.
 	backendForCheck := cfg.Backend
 	if fl.backend != "" {
diff --git a/internal/cli/model.go b/internal/cli/model.go
index 580479e..efc95a5 100644
--- a/internal/cli/model.go
+++ b/internal/cli/model.go
@@ -7,7 +7,6 @@ import (
 	"path/filepath"
 	"strings"
 	"text/tabwriter"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
 	"github.com/CoreyRDean/intent/internal/models"
@@ -268,8 +267,9 @@ func inferQuantFromFilename(filename string) string {
 
 // modelUse switches the current model. Resolves the reference, persists
 // it as a custom entry if it's an HF repo we haven't seen, downloads
-// the model if it's not installed, and finally updates cfg.Model +
-// restarts the daemon so subsequent `i` calls use the new model.
+// the model if it's not installed, and updates cfg.Model. The next `i`
+// call picks up the new model automatically — llama-cli loads it fresh
+// each invocation, so there is no daemon to restart.
 func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []string) int {
 	if len(args) == 0 {
 		errf("usage: i model use <id>")
@@ -308,7 +308,7 @@ func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []s
 	}
 
 	if !models.ValidGGUFQuant(m.Quant) && m.Quant != "" {
-		fmt.Fprintf(os.Stderr, "warning: quant %q is unusual; llamafile may or may not load it.\n", m.Quant)
+		fmt.Fprintf(os.Stderr, "warning: quant %q is unusual; llama.cpp may or may not load it.\n", m.Quant)
 	}
 
 	// Download if missing.
@@ -337,20 +337,6 @@ func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []s
 		return 3
 	}
 	fmt.Printf("current model: %s\n", m.ID)
-
-	// Nudge the daemon to pick up the new model. If it's running,
-	// restart it; if not, leave it alone (user will start it on next
-	// `i` call via ensureBackendReady).
-	if pingDaemon(dirs) {
-		fmt.Fprintln(os.Stderr, "restarting daemon with new model...")
-		_ = daemonStop(dirs)
-		time.Sleep(500 * time.Millisecond)
-		if err := startDaemonAndWait(dirs); err != nil {
-			errf("daemon restart: %v (run `i daemon start` manually)", err)
-			return 3
-		}
-		fmt.Fprintln(os.Stderr, "daemon: ready.")
-	}
 	return 0
 }
 
@@ -383,11 +369,11 @@ func modelPull(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []
 	}
 
 	rt := intentruntime.New(dirs.Cache)
-	if !rt.HaveLlamafile() {
-		fmt.Fprintln(os.Stderr, "downloading runtime...")
-		if err := rt.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil {
+	if !rt.HaveLlamaCLI() {
+		fmt.Fprintln(os.Stderr, "installing llama-cli via your package manager...")
+		if err := rt.EnsureLlamaCLI(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
 			fmt.Fprintln(os.Stderr)
-			errf("llamafile: %v", err)
+			errf("llama-cli: %v", err)
 			return 3
 		}
 		fmt.Fprintln(os.Stderr)
diff --git a/internal/cli/smoke_test.go b/internal/cli/smoke_test.go
index 6aedcab..73f3cba 100644
--- a/internal/cli/smoke_test.go
+++ b/internal/cli/smoke_test.go
@@ -374,27 +374,6 @@ func TestConfigRoundTripSectionedKnownKey(t *testing.T) {
 	}
 }
 
-func TestConfigSetRejectsRemoteDaemonHost(t *testing.T) {
-	stateDir := t.TempDir()
-	cacheDir := t.TempDir()
-	baseEnv := []string{
-		"HOME=" + os.Getenv("HOME"),
-		"PATH=" + os.Getenv("PATH"),
-		"INTENT_STATE_DIR=" + stateDir,
-		"INTENT_CACHE_DIR=" + cacheDir,
-	}
-
-	cmd := exec.Command(testBinary, "config", "set", "daemon.host", "0.0.0.0")
-	cmd.Env = baseEnv
-	out, err := cmd.CombinedOutput()
-	if err == nil {
-		t.Fatal("expected config set daemon.host to fail, got nil error")
-	}
-	if !strings.Contains(string(out), "loopback only") {
-		t.Fatalf("expected loopback validation error, got %q", string(out))
-	}
-}
-
 func TestConfigPath(t *testing.T) {
 	stdout, _, exitCode := run(t, nil, "config", "path")
 	if exitCode != 0 {
diff --git a/internal/config/config.go b/internal/config/config.go
index 216f1c1..c90f59d 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -35,7 +35,11 @@ type Config struct {
 // Defaults returns the project's chosen defaults.
 func Defaults() *Config {
 	return &Config{
-		Backend: "llamafile-local",
+		// Local inference runs llama.cpp's `llama-cli` one-shot. The
+		// legacy "llamafile-local" name is still accepted as an alias by
+		// the backend resolver, so configs written before the switch keep
+		// working without migration.
+		Backend: "llama-cli",
 		// Catalog short-id. See internal/models.DefaultID. Defaults to
 		// the 3B model as the balanced "just works" option: strong
 		// enough that `i report` doesn't routinely hit the fallback
diff --git a/internal/daemon/client.go b/internal/daemon/client.go
deleted file mode 100644
index 5856738..0000000
--- a/internal/daemon/client.go
+++ /dev/null
@@ -1,49 +0,0 @@
-package daemon
-
-import (
-	"bufio"
-	"encoding/json"
-	"fmt"
-	"net"
-	"time"
-)
-
-// Client is a one-shot connection to a daemon socket.
-type Client struct {
-	Socket  string
-	Timeout time.Duration
-}
-
-// NewClient returns a Client for the socket. Timeout defaults to 2s.
-func NewClient(socket string) *Client {
-	return &Client{Socket: socket, Timeout: 2 * time.Second}
-}
-
-// Call sends one request and returns the response.
-func (c *Client) Call(req Request) (*Response, error) {
-	conn, err := net.DialTimeout("unix", c.Socket, c.Timeout)
-	if err != nil {
-		return nil, err
-	}
-	defer conn.Close()
-	if c.Timeout > 0 {
-		_ = conn.SetDeadline(time.Now().Add(c.Timeout))
-	}
-	body, err := json.Marshal(req)
-	if err != nil {
-		return nil, err
-	}
-	if _, err := conn.Write(append(body, '\n')); err != nil {
-		return nil, err
-	}
-	r := bufio.NewReader(conn)
-	line, err := r.ReadBytes('\n')
-	if err != nil {
-		return nil, err
-	}
-	var resp Response
-	if err := json.Unmarshal(line, &resp); err != nil {
-		return nil, fmt.Errorf("decode response: %w", err)
-	}
-	return &resp, nil
-}
diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go
deleted file mode 100644
index 2853ef1..0000000
--- a/internal/daemon/daemon.go
+++ /dev/null
@@ -1,201 +0,0 @@
-// Package daemon implements intentd: a small supervisor that keeps a
-// llamafile --server process warm so the CLI doesn't pay a model-load
-// cost on every invocation.
-//
-// Architecture (v1):
-//
-//   - The daemon spawns llamafile as a subprocess and watches it.
-//   - llamafile exposes its OpenAI-compatible HTTP API on the loopback
-//     port from config. The CLI talks to that port directly. The daemon
-//     does NOT proxy inference traffic through its Unix socket — that
-//     would add a hop for no benefit, since the heavy lifting is the
-//     model, not the network.
-//   - The daemon owns a Unix socket on which it speaks a tiny line-
-//     delimited JSON control protocol: ping / status / stop. That's
-//     also how `i daemon status` and `i daemon stop` work.
-//
-// Idle unload (kill the llamafile subprocess after N minutes of HTTP
-// inactivity, respawn on next CLI request) is a v1.x follow-up. In v1
-// the daemon stays warm until the user stops it.
-package daemon
-
-import (
-	"bufio"
-	"context"
-	"encoding/json"
-	"fmt"
-	"net"
-	"os"
-	"sync"
-	"time"
-)
-
-// Op is the daemon control-protocol operation discriminator.
-type Op string
-
-const (
-	OpPing   Op = "ping"
-	OpStatus Op = "status"
-	OpStop   Op = "stop"
-)
-
-// Request is one daemon control request.
-type Request struct {
-	Op Op     `json:"op"`
-	ID string `json:"id,omitempty"`
-}
-
-// Response is the daemon's reply.
-type Response struct {
-	ID    string         `json:"id,omitempty"`
-	OK    bool           `json:"ok"`
-	Error string         `json:"error,omitempty"`
-	Data  map[string]any `json:"data,omitempty"`
-}
-
-// Server is the Unix-socket control-plane server.
-type Server struct {
-	Socket    string
-	Launcher  *Launcher
-	Started   time.Time
-	mu        sync.Mutex
-	ln        net.Listener
-	stopCh    chan struct{}
-	stopOnce  sync.Once
-	clientCtx context.Context
-}
-
-// New constructs a Server bound to socket and supervising launcher.
-func New(socket string, l *Launcher) *Server {
-	return &Server{
-		Socket:   socket,
-		Launcher: l,
-		stopCh:   make(chan struct{}),
-	}
-}
-
-// Listen binds the Unix socket. Any pre-existing socket file is removed.
-func (s *Server) Listen() error {
-	_ = os.Remove(s.Socket)
-	if err := os.MkdirAll(parentDir(s.Socket), 0o700); err != nil {
-		return fmt.Errorf("mkdir socket parent: %w", err)
-	}
-	ln, err := net.Listen("unix", s.Socket)
-	if err != nil {
-		return fmt.Errorf("listen on %s: %w", s.Socket, err)
-	}
-	if err := os.Chmod(s.Socket, 0o600); err != nil {
-		_ = ln.Close()
-		return fmt.Errorf("chmod socket: %w", err)
-	}
-	s.mu.Lock()
-	s.ln = ln
-	s.Started = time.Now()
-	s.mu.Unlock()
-	return nil
-}
-
-// Serve accepts connections until ctx is canceled OR an OpStop is received.
-func (s *Server) Serve(ctx context.Context) error {
-	if s.ln == nil {
-		return fmt.Errorf("server not listening")
-	}
-	s.clientCtx = ctx
-	go func() {
-		select {
-		case <-ctx.Done():
-		case <-s.stopCh:
-		}
-		_ = s.ln.Close()
-	}()
-	for {
-		conn, err := s.ln.Accept()
-		if err != nil {
-			select {
-			case <-s.stopCh:
-				return nil
-			default:
-			}
-			if ctx.Err() != nil {
-				return nil
-			}
-			return err
-		}
-		go s.handle(conn)
-	}
-}
-
-// SignalStop tells Serve to return. Idempotent.
-func (s *Server) SignalStop() {
-	s.stopOnce.Do(func() { close(s.stopCh) })
-}
-
-// Stopped returns a channel closed when SignalStop has been called.
-// `i daemon start` blocks on this AND on its OS-signal context, so
-// either source can shut the daemon down.
-func (s *Server) Stopped() <-chan struct{} { return s.stopCh }
-
-func (s *Server) handle(conn net.Conn) {
-	defer conn.Close()
-	_ = conn.SetReadDeadline(time.Now().Add(5 * time.Second))
-	r := bufio.NewReader(conn)
-	w := bufio.NewWriter(conn)
-	line, err := r.ReadBytes('\n')
-	if err != nil {
-		return
-	}
-	var req Request
-	if err := json.Unmarshal(line, &req); err != nil {
-		_ = writeJSONLine(w, Response{ID: req.ID, OK: false, Error: "bad json: " + err.Error()})
-		return
-	}
-	resp := s.dispatch(req)
-	resp.ID = req.ID
-	_ = writeJSONLine(w, resp)
-}
-
-func (s *Server) dispatch(req Request) Response {
-	switch req.Op {
-	case OpPing:
-		return Response{OK: true, Data: map[string]any{"pong": true}}
-	case OpStatus:
-		data := map[string]any{
-			"socket":     s.Socket,
-			"started_at": s.Started.UTC().Format(time.RFC3339),
-			"uptime_sec": int64(time.Since(s.Started).Seconds()),
-		}
-		if s.Launcher != nil {
-			data["llamafile_running"] = s.Launcher.Running()
-			data["llamafile_endpoint"] = s.Launcher.Endpoint()
-			data["llamafile_pid"] = s.Launcher.PID()
-			data["llamafile_restarts"] = s.Launcher.Restarts()
-			data["model"] = s.Launcher.ModelPath
-		}
-		return Response{OK: true, Data: data}
-	case OpStop:
-		s.SignalStop()
-		return Response{OK: true, Data: map[string]any{"stopping": true}}
-	default:
-		return Response{OK: false, Error: "unknown op: " + string(req.Op)}
-	}
-}
-
-func writeJSONLine(w *bufio.Writer, r Response) error {
-	b, err := json.Marshal(r)
-	if err != nil {
-		return err
-	}
-	if _, err := w.Write(append(b, '\n')); err != nil {
-		return err
-	}
-	return w.Flush()
-}
-
-func parentDir(p string) string {
-	for i := len(p) - 1; i >= 0; i-- {
-		if p[i] == '/' {
-			return p[:i]
-		}
-	}
-	return "."
-}
diff --git a/internal/daemon/install.go b/internal/daemon/install.go
deleted file mode 100644
index f8265f1..0000000
--- a/internal/daemon/install.go
+++ /dev/null
@@ -1,237 +0,0 @@
-package daemon
-
-import (
-	"fmt"
-	"os"
-	"os/exec"
-	"path/filepath"
-	"runtime"
-)
-
-// InstallParams are everything Install needs to write a system service file.
-type InstallParams struct {
-	Binary string // absolute path to the intent binary
-	Label  string // service label, e.g. "com.coreyrdean.intent"
-	LogDir string // directory the service writes stdout/stderr to
-	Socket string // daemon control socket path (informational)
-	Cache  string // cache root (so the service knows where llamafile lives)
-	State  string // state root
-}
-
-// InstallResult describes what was written and how to control it.
-type InstallResult struct {
-	UnitPath string   // path to the launchd plist or systemd unit
-	StartCmd []string // command to start the unit
-	StopCmd  []string // command to stop the unit
-	LogPath  string   // path to the stdout log
-	Notes    string   // human-readable post-install hint
-}
-
-// Install writes the platform-appropriate service file and starts it.
-// On macOS, returns the LaunchAgent plist path.
-// On Linux, returns the user systemd unit path.
-// Other platforms return an error.
-func Install(p InstallParams) (*InstallResult, error) {
-	switch runtime.GOOS {
-	case "darwin":
-		return installLaunchd(p)
-	case "linux":
-		return installSystemd(p)
-	default:
-		return nil, fmt.Errorf("daemon install not supported on %s yet", runtime.GOOS)
-	}
-}
-
-// Uninstall removes the platform-appropriate service file (and stops it).
-func Uninstall(label string) error {
-	switch runtime.GOOS {
-	case "darwin":
-		return uninstallLaunchd(label)
-	case "linux":
-		return uninstallSystemd(label)
-	default:
-		return fmt.Errorf("daemon uninstall not supported on %s yet", runtime.GOOS)
-	}
-}
-
-// IsInstalled reports whether the platform-appropriate service file exists.
-func IsInstalled(label string) bool {
-	switch runtime.GOOS {
-	case "darwin":
-		path, _ := launchdPlistPath(label)
-		_, err := os.Stat(path)
-		return err == nil
-	case "linux":
-		path, _ := systemdUnitPath(label)
-		_, err := os.Stat(path)
-		return err == nil
-	}
-	return false
-}
-
-// --- macOS / launchd ---
-
-func launchdPlistPath(label string) (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	return filepath.Join(home, "Library", "LaunchAgents", label+".plist"), nil
-}
-
-func installLaunchd(p InstallParams) (*InstallResult, error) {
-	plistPath, err := launchdPlistPath(p.Label)
-	if err != nil {
-		return nil, err
-	}
-	if err := os.MkdirAll(filepath.Dir(plistPath), 0o755); err != nil {
-		return nil, err
-	}
-	if err := os.MkdirAll(p.LogDir, 0o700); err != nil {
-		return nil, err
-	}
-	logOut := filepath.Join(p.LogDir, "intentd.out.log")
-	logErr := filepath.Join(p.LogDir, "intentd.err.log")
-	plist := fmt.Sprintf(`<?xml version="1.0" encoding="UTF-8"?>
-<!DOCTYPE plist PUBLIC "-//Apple//DTD PLIST 1.0//EN" "http://www.apple.com/DTDs/PropertyList-1.0.dtd">
-<plist version="1.0">
-<dict>
-  <key>Label</key>            <string>%s</string>
-  <key>ProgramArguments</key> <array>
-    <string>%s</string>
-    <string>daemon</string>
-    <string>start</string>
-    <string>--foreground</string>
-  </array>
-  <key>RunAtLoad</key>        <true/>
-  <key>KeepAlive</key>        <true/>
-  <key>ProcessType</key>      <string>Background</string>
-  <key>StandardOutPath</key>  <string>%s</string>
-  <key>StandardErrorPath</key><string>%s</string>
-  <key>EnvironmentVariables</key>
-  <dict>
-    <key>HOME</key>           <string>%s</string>
-    <key>PATH</key>           <string>/usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin:/usr/sbin:/sbin</string>
-  </dict>
-</dict>
-</plist>
-`, p.Label, p.Binary, logOut, logErr, mustHome())
-	if err := os.WriteFile(plistPath, []byte(plist), 0o644); err != nil {
-		return nil, fmt.Errorf("write plist: %w", err)
-	}
-	// Best-effort start. launchctl load is the right verb on macOS LaunchAgents
-	// even though it's been deprecated in favor of bootstrap. bootstrap requires
-	// a target like `gui/$UID` and is awkward; load still works.
-	_, _ = exec.Command("launchctl", "unload", plistPath).CombinedOutput()
-	if out, err := exec.Command("launchctl", "load", plistPath).CombinedOutput(); err != nil {
-		return &InstallResult{
-			UnitPath: plistPath,
-			StartCmd: []string{"launchctl", "load", plistPath},
-			StopCmd:  []string{"launchctl", "unload", plistPath},
-			LogPath:  logOut,
-			Notes: fmt.Sprintf("plist installed but launchctl load failed: %s\n"+
-				"start manually with: launchctl load %s", string(out), plistPath),
-		}, nil
-	}
-	return &InstallResult{
-		UnitPath: plistPath,
-		StartCmd: []string{"launchctl", "load", plistPath},
-		StopCmd:  []string{"launchctl", "unload", plistPath},
-		LogPath:  logOut,
-		Notes:    "intentd is now running and will start at login.",
-	}, nil
-}
-
-func uninstallLaunchd(label string) error {
-	plistPath, err := launchdPlistPath(label)
-	if err != nil {
-		return err
-	}
-	if _, err := os.Stat(plistPath); os.IsNotExist(err) {
-		return nil
-	}
-	_, _ = exec.Command("launchctl", "unload", plistPath).CombinedOutput()
-	return os.Remove(plistPath)
-}
-
-// --- Linux / systemd user unit ---
-
-func systemdUnitPath(label string) (string, error) {
-	home, err := os.UserHomeDir()
-	if err != nil {
-		return "", err
-	}
-	// systemd allows arbitrary unit names; we use the label suffix as the
-	// unit basename to avoid collisions with system units.
-	return filepath.Join(home, ".config", "systemd", "user", label+".service"), nil
-}
-
-func installSystemd(p InstallParams) (*InstallResult, error) {
-	unitPath, err := systemdUnitPath(p.Label)
-	if err != nil {
-		return nil, err
-	}
-	if err := os.MkdirAll(filepath.Dir(unitPath), 0o755); err != nil {
-		return nil, err
-	}
-	unit := fmt.Sprintf(`[Unit]
-Description=intent daemon (keeps a local LLM warm)
-After=default.target
-
-[Service]
-Type=simple
-ExecStart=%s daemon start --foreground
-Restart=on-failure
-RestartSec=2
-Environment=PATH=/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin
-NoNewPrivileges=yes
-PrivateTmp=yes
-
-[Install]
-WantedBy=default.target
-`, p.Binary)
-	if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil {
-		return nil, fmt.Errorf("write unit: %w", err)
-	}
-	_, _ = exec.Command("systemctl", "--user", "daemon-reload").CombinedOutput()
-	unitName := p.Label + ".service"
-	if out, err := exec.Command("systemctl", "--user", "enable", "--now", unitName).CombinedOutput(); err != nil {
-		return &InstallResult{
-			UnitPath: unitPath,
-			StartCmd: []string{"systemctl", "--user", "start", unitName},
-			StopCmd:  []string{"systemctl", "--user", "stop", unitName},
-			LogPath:  "journalctl --user -u " + unitName,
-			Notes: fmt.Sprintf("unit installed but `systemctl --user enable --now` failed: %s\n"+
-				"start manually with: systemctl --user start %s", string(out), unitName),
-		}, nil
-	}
-	return &InstallResult{
-		UnitPath: unitPath,
-		StartCmd: []string{"systemctl", "--user", "start", unitName},
-		StopCmd:  []string{"systemctl", "--user", "stop", unitName},
-		LogPath:  "journalctl --user -u " + unitName,
-		Notes:    "intentd is enabled and running. Logs: journalctl --user -u " + unitName,
-	}, nil
-}
-
-func uninstallSystemd(label string) error {
-	unitPath, err := systemdUnitPath(label)
-	if err != nil {
-		return err
-	}
-	if _, err := os.Stat(unitPath); os.IsNotExist(err) {
-		return nil
-	}
-	unitName := label + ".service"
-	_, _ = exec.Command("systemctl", "--user", "disable", "--now", unitName).CombinedOutput()
-	if err := os.Remove(unitPath); err != nil {
-		return err
-	}
-	_, _ = exec.Command("systemctl", "--user", "daemon-reload").CombinedOutput()
-	return nil
-}
-
-func mustHome() string {
-	h, _ := os.UserHomeDir()
-	return h
-}
diff --git a/internal/daemon/launcher.go b/internal/daemon/launcher.go
deleted file mode 100644
index b6e8e25..0000000
--- a/internal/daemon/launcher.go
+++ /dev/null
@@ -1,388 +0,0 @@
-package daemon
-
-import (
-	"context"
-	"fmt"
-	"io"
-	"net/http"
-	"os"
-	"os/exec"
-	"sync"
-	"sync/atomic"
-	"syscall"
-	"time"
-)
-
-// Launcher supervises a `llamafile --server` subprocess. It exposes the
-// HTTP endpoint llamafile is bound to so the CLI can dial it directly.
-//
-// Restart policy: if llamafile exits with a non-zero code or is killed
-// by anything other than us, we restart it up to MaxRestarts times within
-// RestartWindow. Beyond that we give up; the user gets an honest "daemon
-// died, see the logs" rather than a thrashing supervisor.
-type Launcher struct {
-	BinaryPath    string        // path to llamafile-VERSION
-	ModelPath     string        // path to .gguf
-	Host          string        // 127.0.0.1
-	Port          int           // 18080
-	ContextSize   int           // -c, 0 = llamafile default
-	GPULayers     int           // -ngl, -1 = let llamafile decide
-	StdoutLog     io.Writer     // where llamafile's stdout goes
-	StderrLog     io.Writer     // where llamafile's stderr goes
-	MaxRestarts   int           // default 5
-	RestartWindow time.Duration // default 60s
-	StartupGrace  time.Duration // how long to wait for /v1/models to respond
-
-	mu        sync.Mutex
-	cmd       *exec.Cmd
-	pid       int32
-	restarts  atomic.Int32
-	stopped   atomic.Bool
-	doneCh    chan struct{}
-	restartTs []time.Time
-}
-
-// NewLauncher constructs a Launcher with sensible defaults.
-func NewLauncher(binary, model string, host string, port int) *Launcher {
-	return &Launcher{
-		BinaryPath:    binary,
-		ModelPath:     model,
-		Host:          host,
-		Port:          port,
-		StdoutLog:     io.Discard,
-		StderrLog:     os.Stderr,
-		MaxRestarts:   5,
-		RestartWindow: 60 * time.Second,
-		StartupGrace:  60 * time.Second,
-		GPULayers:     -1,
-		doneCh:        make(chan struct{}),
-	}
-}
-
-// Endpoint returns the http://host:port the supervised llamafile listens on.
-func (l *Launcher) Endpoint() string {
-	return fmt.Sprintf("http://%s:%d", l.Host, l.Port)
-}
-
-// PID returns the current llamafile PID (0 if not running).
-func (l *Launcher) PID() int { return int(atomic.LoadInt32(&l.pid)) }
-
-// Running reports whether the subprocess is alive.
-func (l *Launcher) Running() bool { return l.PID() != 0 }
-
-// Restarts returns the cumulative restart count.
-func (l *Launcher) Restarts() int { return int(l.restarts.Load()) }
-
-// Start launches llamafile and blocks until either:
-//   - the HTTP /v1/models endpoint answers (success), OR
-//   - StartupGrace expires (failure), OR
-//   - llamafile exits before becoming ready (failure)
-//
-// On success, the Launcher's supervise goroutine is also running.
-func (l *Launcher) Start(ctx context.Context) error {
-	if err := l.spawn(ctx); err != nil {
-		return err
-	}
-	if err := l.waitReady(ctx); err != nil {
-		l.stop(syscall.SIGTERM)
-		return fmt.Errorf("llamafile did not become ready: %w", err)
-	}
-	go l.supervise(ctx)
-	return nil
-}
-
-// Wait blocks until the launcher's supervise loop exits.
-func (l *Launcher) Wait() { <-l.doneCh }
-
-// Stop signals the launcher to terminate and waits for it. Idempotent.
-func (l *Launcher) Stop(timeout time.Duration) {
-	if !l.stopped.CompareAndSwap(false, true) {
-		return
-	}
-	l.stop(syscall.SIGTERM)
-	select {
-	case <-l.doneCh:
-	case <-time.After(timeout):
-		l.stop(syscall.SIGKILL)
-		<-l.doneCh
-	}
-}
-
-func (l *Launcher) spawn(ctx context.Context) error {
-	args := []string{
-		"--server",
-		"-m", l.ModelPath,
-		"--host", l.Host,
-		"--port", fmt.Sprintf("%d", l.Port),
-	}
-	if l.ContextSize > 0 {
-		args = append(args, "-c", fmt.Sprintf("%d", l.ContextSize))
-	}
-	if l.GPULayers >= 0 {
-		args = append(args, "-ngl", fmt.Sprintf("%d", l.GPULayers))
-	}
-
-	// llamafile is an Actually Portable Executable (APE). On macOS the
-	// kernel rejects APE binaries directly with "exec format error" —
-	// the file's leading shell-script trampoline only fires when the
-	// shell loads it. So we run it via /bin/sh on every Unix to keep the
-	// invocation consistent and let the shell pick the right loader.
-	//
-	// Note: not exec.CommandContext — we manage lifecycle explicitly so
-	// that a CLI command's ctx cancellation doesn't kill the daemon's
-	// supervised subprocess.
-	shArgs := append([]string{l.BinaryPath}, args...)
-	cmd := exec.Command("/bin/sh", "-c", quoteShellArgs(shArgs), "intentd-llamafile")
-	cmd.Stdout = l.StdoutLog
-	cmd.Stderr = l.StderrLog
-	// New process group so llamafile doesn't catch terminal signals
-	// directed at the daemon.
-	cmd.SysProcAttr = procAttrNewGroup()
-
-	if err := cmd.Start(); err != nil {
-		return fmt.Errorf("start %s: %w", l.BinaryPath, err)
-	}
-	l.mu.Lock()
-	l.cmd = cmd
-	atomic.StoreInt32(&l.pid, int32(cmd.Process.Pid))
-	l.mu.Unlock()
-	return nil
-}
-
-func (l *Launcher) waitReady(ctx context.Context) error {
-	deadline := time.Now().Add(l.StartupGrace)
-	url := l.Endpoint() + "/v1/models"
-	cli := &http.Client{Timeout: 1 * time.Second}
-	for time.Now().Before(deadline) {
-		select {
-		case <-ctx.Done():
-			return ctx.Err()
-		default:
-		}
-		// If the subprocess died on us, fail fast.
-		if !l.processAlive() {
-			return fmt.Errorf("subprocess exited before ready")
-		}
-		req, _ := http.NewRequestWithContext(ctx, "GET", url, nil)
-		resp, err := cli.Do(req)
-		if err == nil {
-			_ = resp.Body.Close()
-			if resp.StatusCode < 500 {
-				return nil
-			}
-		}
-		time.Sleep(250 * time.Millisecond)
-	}
-	return fmt.Errorf("timeout after %s", l.StartupGrace)
-}
-
-func (l *Launcher) processAlive() bool {
-	l.mu.Lock()
-	cmd := l.cmd
-	l.mu.Unlock()
-	if cmd == nil || cmd.Process == nil {
-		return false
-	}
-	// Signal 0: existence check, no actual signal delivered.
-	return cmd.Process.Signal(syscall.Signal(0)) == nil
-}
-
-func (l *Launcher) supervise(ctx context.Context) {
-	defer close(l.doneCh)
-	for {
-		l.mu.Lock()
-		cmd := l.cmd
-		l.mu.Unlock()
-		if cmd == nil {
-			return
-		}
-		err := cmd.Wait()
-		atomic.StoreInt32(&l.pid, 0)
-
-		if l.stopped.Load() || ctx.Err() != nil {
-			return
-		}
-
-		// Crash. Decide whether to restart.
-		if err != nil {
-			fmt.Fprintf(l.StderrLog, "intentd: llamafile exited: %v\n", err)
-		}
-		if !l.shouldRestart() {
-			fmt.Fprintf(l.StderrLog, "intentd: too many restarts in %s; giving up\n", l.RestartWindow)
-			return
-		}
-		l.restarts.Add(1)
-		fmt.Fprintf(l.StderrLog, "intentd: restarting llamafile (attempt %d)\n", l.restarts.Load())
-		// Brief backoff so we don't hot-loop.
-		time.Sleep(time.Second)
-		if err := l.spawn(ctx); err != nil {
-			fmt.Fprintf(l.StderrLog, "intentd: respawn failed: %v\n", err)
-			return
-		}
-		if err := l.waitReady(ctx); err != nil {
-			fmt.Fprintf(l.StderrLog, "intentd: respawn not ready: %v\n", err)
-			l.stop(syscall.SIGTERM)
-			return
-		}
-	}
-}
-
-// shouldRestart returns true if we are within the restart budget.
-func (l *Launcher) shouldRestart() bool {
-	now := time.Now()
-	cutoff := now.Add(-l.RestartWindow)
-	kept := l.restartTs[:0]
-	for _, t := range l.restartTs {
-		if t.After(cutoff) {
-			kept = append(kept, t)
-		}
-	}
-	l.restartTs = append(kept, now)
-	return len(l.restartTs) <= l.MaxRestarts
-}
-
-// quoteShellArgs renders argv as a single shell command string with
-// each argument single-quoted. We never embed user-supplied unescaped
-// strings here, but doing it correctly is cheap insurance.
-func quoteShellArgs(argv []string) string {
-	out := ""
-	for i, a := range argv {
-		if i > 0 {
-			out += " "
-		}
-		// Replace each ' with '\''.
-		escaped := ""
-		for _, r := range a {
-			if r == '\'' {
-				escaped += `'\''`
-			} else {
-				escaped += string(r)
-			}
-		}
-		out += "'" + escaped + "'"
-	}
-	return out
-}
-
-func (l *Launcher) stop(sig syscall.Signal) {
-	l.mu.Lock()
-	cmd := l.cmd
-	l.mu.Unlock()
-	if cmd == nil || cmd.Process == nil {
-		return
-	}
-	pid := cmd.Process.Pid
-
-	// llamafile is an Actually Portable Executable. APE binaries on
-	// macOS work like this: the bytes are simultaneously a PE header
-	// (rejected by the kernel) and a shell script (interpreted by sh
-	// when execve fails). The script then mmaps a temp-extracted
-	// Mach-O and re-execs into it via `posix_spawn`, which in practice
-	// FORKS off a worker process whose parent becomes our intent
-	// daemon (the original sh wrapper exits). So:
-	//
-	// - cmd.Process.Pid points at the long-dead sh wrapper.
-	// - The actual llamafile is reparented to our daemon (os.Getpid()).
-	// - It also lives in its own process group via setsid.
-	//
-	// We therefore signal four populations to be sure:
-	//   1) the original spawned PID (no-op if already gone),
-	//   2) the spawned PID's process group (no-op if separated),
-	//   3) every descendant of *us* that runs our llamafile binary,
-	//   4) every such descendant's own process group.
-	_ = cmd.Process.Signal(sig)
-	if pgid, err := syscall.Getpgid(pid); err == nil && pgid != 0 && pgid != os.Getpid() {
-		_ = syscall.Kill(-pgid, sig)
-	}
-	for _, p := range descendantsRunning(os.Getpid(), l.BinaryPath) {
-		_ = syscall.Kill(p, sig)
-		if pgid, err := syscall.Getpgid(p); err == nil && pgid != 0 && pgid != os.Getpid() {
-			_ = syscall.Kill(-pgid, sig)
-		}
-	}
-}
-
-// descendantsRunning returns every descendant of root whose command
-// line contains needle. We filter on needle so that signaling does
-// not accidentally hit unrelated processes that happen to share the
-// daemon as an ancestor (e.g. user shells launched from `i daemon
-// start` in a terminal).
-func descendantsRunning(root int, needle string) []int {
-	if _, err := exec.LookPath("pgrep"); err != nil {
-		return nil
-	}
-	candidates := allDescendants(root)
-	if len(candidates) == 0 || needle == "" {
-		return candidates
-	}
-	out := candidates[:0]
-	for _, p := range candidates {
-		// `ps -o command= -p PID` prints just the command line.
-		b, err := exec.Command("ps", "-o", "command=", "-p", fmt.Sprintf("%d", p)).Output()
-		if err != nil {
-			continue
-		}
-		if bytesContains(b, needle) {
-			out = append(out, p)
-		}
-	}
-	return out
-}
-
-func allDescendants(root int) []int {
-	seen := map[int]struct{}{root: {}}
-	queue := []int{root}
-	var out []int
-	for len(queue) > 0 {
-		p := queue[0]
-		queue = queue[1:]
-		b, err := exec.Command("pgrep", "-P", fmt.Sprintf("%d", p)).Output()
-		if err != nil {
-			continue
-		}
-		for _, line := range bytesLines(b) {
-			var child int
-			_, _ = fmt.Sscanf(line, "%d", &child)
-			if child <= 0 {
-				continue
-			}
-			if _, ok := seen[child]; ok {
-				continue
-			}
-			seen[child] = struct{}{}
-			out = append(out, child)
-			queue = append(queue, child)
-		}
-	}
-	return out
-}
-
-func bytesContains(b []byte, needle string) bool {
-	if len(needle) == 0 {
-		return true
-	}
-	n := []byte(needle)
-	for i := 0; i+len(n) <= len(b); i++ {
-		if string(b[i:i+len(n)]) == needle {
-			return true
-		}
-	}
-	return false
-}
-
-func bytesLines(b []byte) []string {
-	var out []string
-	start := 0
-	for i, c := range b {
-		if c == '\n' {
-			if i > start {
-				out = append(out, string(b[start:i]))
-			}
-			start = i + 1
-		}
-	}
-	if start < len(b) {
-		out = append(out, string(b[start:]))
-	}
-	return out
-}
diff --git a/internal/daemon/procattr_other.go b/internal/daemon/procattr_other.go
deleted file mode 100644
index d7baa02..0000000
--- a/internal/daemon/procattr_other.go
+++ /dev/null
@@ -1,9 +0,0 @@
-//go:build !unix
-
-package daemon
-
-import "syscall"
-
-// procAttrNewGroup is a no-op on non-unix platforms (Windows). The
-// daemon isn't supported there yet anyway.
-func procAttrNewGroup() *syscall.SysProcAttr { return nil }
diff --git a/internal/daemon/procattr_unix.go b/internal/daemon/procattr_unix.go
deleted file mode 100644
index 696d4b9..0000000
--- a/internal/daemon/procattr_unix.go
+++ /dev/null
@@ -1,12 +0,0 @@
-//go:build unix
-
-package daemon
-
-import "syscall"
-
-// procAttrNewGroup returns SysProcAttr that puts the child in its own
-// process group, so signals to our PID don't reach it (and so we can
-// signal -PGID to take down the entire subtree).
-func procAttrNewGroup() *syscall.SysProcAttr {
-	return &syscall.SysProcAttr{Setpgid: true}
-}
diff --git a/internal/model/llamacli/llamacli.go b/internal/model/llamacli/llamacli.go
new file mode 100644
index 0000000..3f7439e
--- /dev/null
+++ b/internal/model/llamacli/llamacli.go
@@ -0,0 +1,313 @@
+// Package llamacli runs local inference by shelling out to llama.cpp's
+// `llama-cli` binary one-shot, instead of talking to a long-lived server.
+//
+// Each Complete/CompleteStructured call spawns `llama-cli` with the model,
+// a JSON-schema grammar constraint, and the flattened conversation, then
+// parses the single JSON object the model prints to stdout. There is no
+// daemon, no HTTP, and no warm process: the OS process *is* the request.
+//
+// Trade-off vs. the old `llamafile --server` path: every call pays the
+// model-load cost. In exchange there is nothing to supervise, nothing
+// bound to a socket, and nothing to leave running. The grammar constraint
+// (`--json-schema`) is the same mechanism llama.cpp's server used, so the
+// output contract is unchanged.
+package llamacli
+
+import (
+	"bytes"
+	"context"
+	"encoding/json"
+	"fmt"
+	"os/exec"
+	"strings"
+	"time"
+
+	"github.com/CoreyRDean/intent/internal/model"
+)
+
+// Backend drives one-shot `llama-cli` inference; each call spawns the binary.
+type Backend struct {
+	// BinaryPath is the llama-cli executable to run. May be a bare
+	// "llama-cli" if it's expected to be found on PATH at exec time.
+	BinaryPath string
+	// ModelPath is the .gguf to load (passed as -m); must be non-empty.
+	ModelPath string
+	// ModelTag is cosmetic; it feeds the cache identity so switching
+	// models invalidates cached proposals.
+	ModelTag string
+	// ContextSize maps to -c; values <= 0 omit the flag (llama.cpp default).
+	ContextSize int
+	// GPULayers maps to -ngl; negative values omit the flag (New uses -1).
+	GPULayers int
+	// ExtraArgs are appended verbatim after all other flags, for debugging.
+	ExtraArgs []string
+	// Timeout caps a single inference; <= 0 adds no deadline beyond ctx.
+	Timeout time.Duration
+}
+
+// New builds a Backend for the given binary and model path.
+//
+// An empty binary falls back to the bare name "llama-cli". GPULayers
+// starts at -1 and Timeout at five minutes; every other field keeps its
+// zero value.
+func New(binary, modelPath string) *Backend {
+	exe := binary
+	if exe == "" {
+		exe = "llama-cli"
+	}
+	return &Backend{BinaryPath: exe, ModelPath: modelPath, GPULayers: -1, Timeout: 5 * time.Minute}
+}
+
+// Name reports the backend identifier, "llama-cli".
+func (b *Backend) Name() string { return "llama-cli" }
+
+// CacheIdentity joins the backend name, model path and model tag with "|".
+func (b *Backend) CacheIdentity() string { return b.Name() + "|" + b.ModelPath + "|" + b.ModelTag }
+
+// Available checks that ModelPath is non-empty and BinaryPath resolves via exec.LookPath; the model file is not stat'ed.
+func (b *Backend) Available(ctx context.Context) error {
+	if b.ModelPath == "" {
+		return fmt.Errorf("llama-cli: no model path configured")
+	}
+	if _, err := exec.LookPath(b.BinaryPath); err != nil {
+		return fmt.Errorf("llama-cli not found (%s): %w", b.BinaryPath, err)
+	}
+	return nil
+}
+
+// Complete runs one inference under model.SchemaJSON and validates the reply.
+func (b *Backend) Complete(ctx context.Context, in model.CompleteRequest) (*model.Response, error) {
+	raw, err := b.run(ctx, in.Messages, []byte(model.SchemaJSON), in.Temperature, in.MaxTokens, in.Seed)
+	if err != nil {
+		return nil, err
+	}
+	resp := new(model.Response)
+	if err := json.Unmarshal([]byte(raw), resp); err != nil {
+		return nil, fmt.Errorf("model output not valid JSON: %w (got %q)", err, truncate(raw, 200))
+	}
+	backfillRequiredFields(resp)
+	if err := resp.Validate(); err != nil {
+		return nil, fmt.Errorf("model response failed schema: %w (got %q)", err, truncate(raw, 400))
+	}
+	return resp, nil
+}
+
+// CompleteStructured implements model.StructuredBackend. The caller's
+// schema is handed to llama-cli via --json-schema; this method itself only
+// confirms that the extracted output parses as JSON before returning it.
+func (b *Backend) CompleteStructured(ctx context.Context, in model.StructuredRequest) ([]byte, error) {
+	if len(in.SchemaJSON) == 0 {
+		return nil, fmt.Errorf("CompleteStructured: SchemaJSON is required")
+	}
+	payload, err := b.run(ctx, in.Messages, in.SchemaJSON, in.Temperature, in.MaxTokens, in.Seed)
+	if err != nil {
+		return nil, err
+	}
+	var parsed json.RawMessage
+	if err := json.Unmarshal([]byte(payload), &parsed); err != nil {
+		return nil, fmt.Errorf("structured output not valid JSON: %w (got %q)", err, truncate(payload, 200))
+	}
+	return []byte(payload), nil
+}
+
+// run spawns llama-cli once (bounded by b.Timeout when > 0) and returns the first JSON object found in its output.
+func (b *Backend) run(ctx context.Context, messages []model.Message, schema []byte, temp float64, maxTok int, seed *int64) (string, error) {
+	if b.ModelPath == "" {
+		return "", fmt.Errorf("llama-cli: no model path configured")
+	}
+	system, prompt := flattenMessages(messages)
+
+	if b.Timeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, b.Timeout)
+		defer cancel()
+	}
+
+	args := []string{
+		"-m", b.ModelPath,
+		// Conversation + single-turn: apply the model's chat template,
+		// process exactly one user turn, then exit. This is the supported
+		// scripting mode — no interactive loop, no hanging on stdin.
+		"-cnv", "-st",
+		"--no-display-prompt",
+		"--json-schema", string(schema),
+		"--temp", fmt.Sprintf("%g", temp),
+	}
+	if maxTok > 0 {
+		args = append(args, "-n", fmt.Sprintf("%d", maxTok))
+	}
+	if seed != nil {
+		args = append(args, "-s", fmt.Sprintf("%d", *seed))
+	}
+	if b.ContextSize > 0 {
+		args = append(args, "-c", fmt.Sprintf("%d", b.ContextSize))
+	}
+	if b.GPULayers >= 0 {
+		args = append(args, "-ngl", fmt.Sprintf("%d", b.GPULayers))
+	}
+	if system != "" {
+		args = append(args, "-sys", system)
+	}
+	args = append(args, "-p", prompt)
+	args = append(args, b.ExtraArgs...) // last, so user-supplied flags follow the built-in ones
+
+	cmd := exec.CommandContext(ctx, b.BinaryPath, args...)
+	var stdout, stderr bytes.Buffer
+	cmd.Stdout = &stdout
+	cmd.Stderr = &stderr
+	// Give the child an empty stdin so conversation mode sees EOF and
+	// never blocks waiting for a second turn.
+	cmd.Stdin = bytes.NewReader(nil)
+
+	if err := cmd.Run(); err != nil {
+		if ctx.Err() == context.DeadlineExceeded { // NOTE(review): also true when the caller's own deadline fired, yet b.Timeout is reported
+			return "", fmt.Errorf("llama-cli timed out after %s", b.Timeout)
+		}
+		return "", fmt.Errorf("llama-cli failed: %w (stderr: %s)", err, truncate(strings.TrimSpace(stderr.String()), 400))
+	}
+
+	// Generation goes to stdout; llama.cpp logs/timings go to stderr.
+	// Extract the first balanced JSON object from stdout, falling back
+	// to stderr in case a build routes the message there.
+	if obj := extractJSONObject(stdout.String()); obj != "" {
+		return obj, nil
+	}
+	if obj := extractJSONObject(stderr.String()); obj != "" {
+		return obj, nil
+	}
+	return "", fmt.Errorf("llama-cli produced no JSON object (stdout: %q)", truncate(strings.TrimSpace(stdout.String()), 400))
+}
+
+// flattenMessages collapses a conversation into one system prompt and one
+// user-turn transcript, because llama-cli accepts a single -sys and a
+// single -p. System messages are joined together; assistant and tool
+// messages are rendered as labeled text; every other role is passed
+// through verbatim. Parts are separated by a blank line.
+func flattenMessages(msgs []model.Message) (system, prompt string) {
+	var sysParts []string
+	var turns []string
+	for _, msg := range msgs {
+		if msg.Role == "system" {
+			sysParts = append(sysParts, msg.Content)
+			continue
+		}
+		text := msg.Content // user and any unrecognised role
+		switch {
+		case msg.Role == "assistant":
+			text = "Assistant (previous response):\n" + msg.Content
+		case msg.Role == "tool" && msg.Name == "":
+			text = "Result of tool:\n" + msg.Content
+		case msg.Role == "tool":
+			text = fmt.Sprintf("Result of %s:\n%s", msg.Name, msg.Content)
+		}
+		turns = append(turns, text)
+	}
+	return strings.Join(sysParts, "\n\n"), strings.Join(turns, "\n\n")
+}
+
+// extractJSONObject returns the first balanced top-level {...} object in s,
+// or "" when s has no '{' or the braces never balance. Surrounding ```json
+// fences, leading log noise and trailing EOS markers are tolerated.
+func extractJSONObject(s string) string {
+	body := stripFences(strings.TrimSpace(s))
+	open := strings.IndexByte(body, '{')
+	if open < 0 {
+		return ""
+	}
+	var (
+		nesting  int  // count of currently open '{'
+		quoted   bool // inside a JSON string literal
+		escaping bool // previous byte was a backslash inside a string
+	)
+	for pos := open; pos < len(body); pos++ {
+		ch := body[pos]
+		if quoted {
+			if escaping {
+				escaping = false
+			} else if ch == '\\' {
+				escaping = true
+			} else if ch == '"' {
+				quoted = false
+			}
+			continue
+		}
+		if ch == '"' {
+			quoted = true
+		} else if ch == '{' {
+			nesting++
+		} else if ch == '}' {
+			nesting--
+			if nesting == 0 {
+				return body[open : pos+1]
+			}
+		}
+	}
+	return ""
+}
+
+// stripFences strips ```/```json ... ``` fencing when s starts with a fence; otherwise it only trims space.
+func stripFences(s string) string {
+	trimmed := strings.TrimSpace(s)
+	if strings.HasPrefix(trimmed, "```") {
+		for _, fence := range []string{"```json", "```"} {
+			trimmed = strings.TrimPrefix(trimmed, fence)
+		}
+		trimmed = strings.TrimSpace(strings.TrimSuffix(trimmed, "```"))
+	}
+	return trimmed
+}
+
+// backfillRequiredFields fills in Description, Risk and Approach when the
+// model omitted them despite the schema. It never invents the command
+// itself; it only derives metadata (needed by the safety guard and TUI)
+// from the fields that are set. Mirrors the llamafile backend's behaviour.
+func backfillRequiredFields(r *model.Response) {
+	if r == nil {
+		return
+	}
+	if r.Description == "" {
+		switch {
+		case r.Command != "":
+			r.Description = "Run: " + truncate(r.Command, 120)
+		case r.Script != nil && r.Script.Body != "":
+			first := strings.SplitN(r.Script.Body, "\n", 2)[0]
+			r.Description = "Run script (" + r.Script.Interpreter + "): " + truncate(first, 100)
+		case r.StdoutToUser != "":
+			r.Description = "Print informational answer."
+		case r.ToolCall != nil && r.ToolCall.Name != "":
+			r.Description = "Gather context via " + r.ToolCall.Name + "."
+		case r.ClarifyingQuestion != "":
+			r.Description = "Ask the user a clarifying question."
+		case r.RefusalReason != "":
+			r.Description = "Refuse this request."
+		default:
+			r.Description = "(no description provided by model)"
+		}
+	}
+	if r.Risk == "" {
+		r.Risk = model.RiskSafe // NOTE(review): a missing risk defaults to "safe" (fail-open) — confirm this is intended
+	}
+	if r.Approach == "" {
+		switch { // Script is tested before Command, so a response carrying both is classified as a script
+		case r.Script != nil && r.Script.Body != "":
+			r.Approach = model.ApproachScript
+		case r.Command != "":
+			r.Approach = model.ApproachCommand
+		case r.ToolCall != nil && r.ToolCall.Name != "":
+			r.Approach = model.ApproachToolCall
+		case r.StdoutToUser != "":
+			r.Approach = model.ApproachInform
+		case r.ClarifyingQuestion != "":
+			r.Approach = model.ApproachClarify
+		case r.RefusalReason != "":
+			r.Approach = model.ApproachRefuse
+		}
+	}
+}
+
+func truncate(s string, n int) string { // caps s at n bytes and appends "..." when it had to cut
+	if len(s) <= n {
+		return s
+	}
+	return strings.ToValidUTF8(s[:n], "") + "..." // drop a rune split by the byte cut so output stays valid UTF-8
+}
diff --git a/internal/model/llamacli/llamacli_test.go b/internal/model/llamacli/llamacli_test.go
new file mode 100644
index 0000000..3df8198
--- /dev/null
+++ b/internal/model/llamacli/llamacli_test.go
@@ -0,0 +1,85 @@
+package llamacli
+
+import (
+	"testing"
+
+	"github.com/CoreyRDean/intent/internal/model"
+)
+
+func TestExtractJSONObject(t *testing.T) {
+	tests := []struct {
+		name string
+		in   string
+		want string
+	}{
+		{"plain object", `{"a":1}`, `{"a":1}`},
+		{"leading log noise", "loading model...\nllama_init\n{\"a\":1}\n", `{"a":1}`},
+		{"trailing eos marker", `{"a":1} [end of text]`, `{"a":1}`},
+		{"code fence", "```json\n{\"a\":1}\n```", `{"a":1}`},
+		{"nested braces", `{"a":{"b":2},"c":3}`, `{"a":{"b":2},"c":3}`},
+		{"brace inside string", `{"a":"}{","b":1}`, `{"a":"}{","b":1}`},
+		{"escaped quote in string", `{"a":"he said \"hi\"","b":1}`, `{"a":"he said \"hi\"","b":1}`},
+		{"no object", `just text`, ``},
+		{"trailing garbage after object", `{"a":1}garbage{`, `{"a":1}`},
+	}
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := extractJSONObject(tt.in); got != tt.want {
+				t.Errorf("extractJSONObject(%q) = %q, want %q", tt.in, got, tt.want)
+			}
+		})
+	}
+}
+
+func TestFlattenMessages(t *testing.T) {
+	msgs := []model.Message{
+		{Role: "system", Content: "you are a shell"},
+		{Role: "system", Content: "be terse"},
+		{Role: "user", Content: "list files"},
+		{Role: "assistant", Content: `{"approach":"tool_call"}`},
+		{Role: "tool", Name: "list_dir", Content: `{"files":["a","b"]}`},
+	}
+	system, prompt := flattenMessages(msgs)
+
+	if system != "you are a shell\n\nbe terse" {
+		t.Fatalf("system = %q", system)
+	}
+	for _, want := range []string{"list files", "Assistant (previous response)", "Result of list_dir"} {
+		if !contains(prompt, want) {
+			t.Errorf("prompt missing %q; got %q", want, prompt)
+		}
+	}
+}
+
+func TestFlattenMessages_UnnamedTool(t *testing.T) {
+	_, prompt := flattenMessages([]model.Message{
+		{Role: "tool", Content: "result"},
+	})
+	if !contains(prompt, "Result of tool:") {
+		t.Fatalf("expected default tool label, got %q", prompt)
+	}
+}
+
+func TestCacheIdentity_DistinctPerModel(t *testing.T) {
+	a := New("llama-cli", "/cache/models/a.gguf")
+	b := New("llama-cli", "/cache/models/b.gguf")
+	if a.CacheIdentity() == b.CacheIdentity() {
+		t.Fatal("different models should yield different cache identities")
+	}
+	if a.Name() != "llama-cli" {
+		t.Fatalf("Name() = %q", a.Name())
+	}
+}
+
+func contains(haystack, needle string) bool {
+	return len(needle) == 0 || (len(haystack) >= len(needle) && indexOf(haystack, needle) >= 0)
+}
+
+func indexOf(s, sub string) int {
+	for i := 0; i+len(sub) <= len(s); i++ {
+		if s[i:i+len(sub)] == sub {
+			return i
+		}
+	}
+	return -1
+}
diff --git a/internal/runtime/llamacli.go b/internal/runtime/llamacli.go
new file mode 100644
index 0000000..b600957
--- /dev/null
+++ b/internal/runtime/llamacli.go
@@ -0,0 +1,152 @@
+package runtime
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"runtime"
+	"strings"
+)
+
+// LlamaCLIBinary is the executable name we look for and install. It is
+// the one-shot inference frontend from llama.cpp.
+const LlamaCLIBinary = "llama-cli"
+
+// llamaCLICandidates are well-known absolute locations to check when
+// llama-cli isn't on PATH — chiefly the Homebrew prefixes on macOS and
+// Linux, where `brew install llama.cpp` drops the binary.
+func llamaCLICandidates() []string {
+	paths := []string{
+		"/opt/homebrew/bin/" + LlamaCLIBinary,              // Apple Silicon brew
+		"/usr/local/bin/" + LlamaCLIBinary,                 // Intel mac / manual
+		"/home/linuxbrew/.linuxbrew/bin/" + LlamaCLIBinary, // shared linuxbrew
+	}
+	if home := os.Getenv("HOME"); home != "" {
+		paths = append(paths, filepath.Join(home, ".linuxbrew", "bin", LlamaCLIBinary))
+	}
+	return paths
+}
+
+// isExecutable reports whether path exists and has an executable bit set.
+func isExecutable(path string) bool {
+	info, err := os.Stat(path)
+	if err != nil {
+		return false
+	}
+	return !info.IsDir() && info.Mode()&0o111 != 0
+}
+
+// LlamaCLIPath resolves the llama-cli binary: PATH first, then known
+// install locations. Falls back to the bare name so callers still get a
+// value to hand exec (which will re-resolve via PATH) when nothing is
+// found yet.
+func (m *Manager) LlamaCLIPath() string {
+	if p, err := exec.LookPath(LlamaCLIBinary); err == nil {
+		return p
+	}
+	for _, p := range llamaCLICandidates() {
+		if isExecutable(p) {
+			return p
+		}
+	}
+	return LlamaCLIBinary
+}
+
+// HaveLlamaCLI reports whether a usable llama-cli is installed.
+func (m *Manager) HaveLlamaCLI() bool {
+	if _, err := exec.LookPath(LlamaCLIBinary); err == nil {
+		return true
+	}
+	for _, p := range llamaCLICandidates() {
+		if isExecutable(p) {
+			return true
+		}
+	}
+	return false
+}
+
+// pkgManager describes how to install llama.cpp via one system package
+// manager. cmd is the manager binary; args are the install arguments
+// (package name included); needsSudo asks us to prefix sudo when we are
+// not already root.
+type pkgManager struct {
+	name      string
+	cmd       string
+	args      []string
+	needsSudo bool
+}
+
+// llamaCLIManagers is the ordered preference list. Homebrew is first
+// because it ships an official, up-to-date `llama.cpp` formula on both
+// macOS and Linux; the native managers are best-effort fallbacks.
+func llamaCLIManagers() []pkgManager {
+	managers := []pkgManager{
+		{name: "Homebrew", cmd: "brew", args: []string{"install", "llama.cpp"}},
+	}
+	if runtime.GOOS == "linux" {
+		managers = append(managers,
+			pkgManager{name: "apt", cmd: "apt-get", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true},
+			pkgManager{name: "dnf", cmd: "dnf", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true},
+			pkgManager{name: "pacman", cmd: "pacman", args: []string{"-S", "--noconfirm", "llama.cpp"}, needsSudo: true},
+			pkgManager{name: "zypper", cmd: "zypper", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true},
+		)
+	}
+	return managers
+}
+
+// EnsureLlamaCLI installs llama-cli via the system package manager if it
+// isn't already available. log, if non-nil, receives human-readable
+// progress lines. It returns a clear, actionable error when no supported
+// package manager is found or the install fails.
+func (m *Manager) EnsureLlamaCLI(ctx context.Context, log func(string)) error {
+	if m.HaveLlamaCLI() {
+		return nil
+	}
+	logf := func(format string, a ...any) {
+		if log != nil {
+			log(fmt.Sprintf(format, a...))
+		}
+	}
+
+	managers := llamaCLIManagers()
+	var available []pkgManager
+	for _, pm := range managers {
+		if _, err := exec.LookPath(pm.cmd); err == nil {
+			available = append(available, pm)
+		}
+	}
+	if len(available) == 0 {
+		return fmt.Errorf("no supported package manager found to install %s.\n"+
+			"  Install Homebrew (https://brew.sh) and run `brew install llama.cpp`,\n"+
+			"  or install llama.cpp from https://github.com/ggml-org/llama.cpp", LlamaCLIBinary)
+	}
+
+	var lastErr error
+	for _, pm := range available {
+		name, args := pm.cmd, pm.args
+		if pm.needsSudo && os.Geteuid() != 0 {
+			if _, err := exec.LookPath("sudo"); err == nil {
+				args = append([]string{name}, args...)
+				name = "sudo"
+			}
+		}
+		logf("installing %s via %s (%s %s)...", LlamaCLIBinary, pm.name, name, strings.Join(args, " "))
+		cmd := exec.CommandContext(ctx, name, args...)
+		cmd.Stdout = os.Stderr
+		cmd.Stderr = os.Stderr
+		if err := cmd.Run(); err != nil {
+			lastErr = fmt.Errorf("%s install failed: %w", pm.name, err)
+			logf("  %s failed: %v", pm.name, err)
+			continue
+		}
+		if m.HaveLlamaCLI() {
+			logf("%s installed.", LlamaCLIBinary)
+			return nil
+		}
+		lastErr = fmt.Errorf("%s reported success but %s still not found", pm.name, LlamaCLIBinary)
+	}
+	return fmt.Errorf("could not install %s automatically: %w.\n"+
+		"  Try `brew install llama.cpp` or build from https://github.com/ggml-org/llama.cpp", LlamaCLIBinary, lastErr)
+}
diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go
index f3e128c..8ada7b9 100644
--- a/internal/runtime/runtime.go
+++ b/internal/runtime/runtime.go
@@ -1,7 +1,8 @@
-// Package runtime manages the local llamafile binary and model files.
-// In v1 it can: report whether a runtime/model is present, and download
-// either on demand with progress callbacks. Actually starting llamafile as a
-// subprocess is wired into Phase 4 (daemon).
+// Package runtime manages the local inference runtime (llama.cpp's
+// `llama-cli`, installed via the system package manager — see
+// llamacli.go) and the GGUF model files it loads. It can report whether
+// the runtime/model is present and download models on demand with
+// progress callbacks.
 package runtime
 
 import (
@@ -16,9 +17,6 @@ import (
 	"github.com/CoreyRDean/intent/internal/models"
 )
 
-// LlamafileVersion is the runtime version we ship against.
-const LlamafileVersion = "0.10.0"
-
 // ModelInfo is the minimal shape the runtime package needs to
 // download a model. It's a projection of models.Model kept here for
 // backward compatibility; new code should pass models.Model around.
@@ -80,25 +78,11 @@ type Manager struct {
 
 func New(cacheDir string) *Manager { return &Manager{CacheDir: cacheDir} }
 
-// LlamafilePath returns the expected path of the llamafile binary.
-func (m *Manager) LlamafilePath() string {
-	return filepath.Join(m.CacheDir, "runtime", "llamafile-"+LlamafileVersion)
-}
-
 // ModelPath returns the expected path of the named model file.
 func (m *Manager) ModelPath(file string) string {
 	return filepath.Join(m.CacheDir, "models", file)
 }
 
-// HaveLlamafile reports whether the runtime exists and is executable.
-func (m *Manager) HaveLlamafile() bool {
-	info, err := os.Stat(m.LlamafilePath())
-	if err != nil {
-		return false
-	}
-	return info.Mode()&0o111 != 0
-}
-
 // HaveModel reports whether the named model file exists.
 func (m *Manager) HaveModel(file string) bool {
 	_, err := os.Stat(m.ModelPath(file))
@@ -108,22 +92,6 @@ func (m *Manager) HaveModel(file string) bool {
 // Progress is a download progress callback.
 type Progress func(downloaded, total int64)
 
-// EnsureLlamafile downloads the runtime if missing.
-func (m *Manager) EnsureLlamafile(ctx context.Context, progress Progress) error {
-	if m.HaveLlamafile() {
-		return nil
-	}
-	if err := os.MkdirAll(filepath.Dir(m.LlamafilePath()), 0o755); err != nil {
-		return err
-	}
-	url := fmt.Sprintf("https://github.com/mozilla-ai/llamafile/releases/download/%s/llamafile-%s",
-		LlamafileVersion, LlamafileVersion)
-	if err := download(ctx, url, m.LlamafilePath(), progress); err != nil {
-		return fmt.Errorf("download llamafile: %w", err)
-	}
-	return os.Chmod(m.LlamafilePath(), 0o755)
-}
-
 // EnsureModel downloads the model if missing.
 func (m *Manager) EnsureModel(ctx context.Context, mi ModelInfo, progress Progress) error {
 	dest := m.ModelPath(mi.File)

From 76ddfe61a842b35ca2def470a4f333e162586a22 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 20:46:18 +0000
Subject: [PATCH 2/3] feat(local): hold a request-scoped llama-server warm
 across the tool loop
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Refines the local backend per review discussion: instead of spawning
`llama-cli` one-shot per engine step (which reloads the model every step
and flattens the multi-turn history into a single prompt), prefer a
request-scoped `llama-server` child.

The server starts lazily on the first inference of an `intent`
invocation and is held warm for the rest of it, so the engine's
tool-call loop reuses the loaded weights and KV cache. Because it speaks
the OpenAI-compatible /v1/chat/completions API, the native messages
array (system/user/assistant/tool) is sent as-is — no flattening. It is
not a daemon: bound to a private loopback port, owned by the single
invocation, killed on Close (and SIGKILL'd by the kernel via Pdeathsig
if intent dies on Linux). The one-shot `llama-cli` path remains as a
fallback when llama-server isn't installed.

- new internal/model/llamaserver: lazy-start co-process backend that
  delegates to the existing OpenAI-compatible HTTP client; Close() kills
  the process group
- runtime: manage both llama-server and llama-cli (same brew package);
  EnsureLlamaRuntime / HaveLlamaRuntime / Have*Server*/*CLI* resolvers
- backend resolver ladder: llama-server -> llama-cli -> mock; defer
  closeBackend at the intent/explain/report call sites; verbose wrapper
  forwards Close
- doctor/ensure/init/model report the unified llama.cpp runtime

Trade-off unchanged across separate invocations (a fresh `i` reloads the
model) — keeping weights warm between commands would require a resident
daemon, which is exactly what this design avoids.
---
 internal/cli/backend.go                       |  60 +++-
 internal/cli/doctor.go                        |  11 +-
 internal/cli/doctor_test.go                   |  10 +-
 internal/cli/ensure.go                        |  22 +-
 internal/cli/explain.go                       |   1 +
 internal/cli/init.go                          |  16 +-
 internal/cli/intent.go                        |   1 +
 internal/cli/model.go                         |   8 +-
 internal/cli/report.go                        |   1 +
 internal/model/llamaserver/llamaserver.go     | 299 ++++++++++++++++++
 .../model/llamaserver/llamaserver_test.go     |  63 ++++
 internal/model/llamaserver/procattr_linux.go  |  17 +
 internal/model/llamaserver/procattr_other.go  |  13 +
 internal/runtime/llamacli.go                  | 119 ++++---
 internal/verbose/backend.go                   |  11 +
 15 files changed, 562 insertions(+), 90 deletions(-)
 create mode 100644 internal/model/llamaserver/llamaserver.go
 create mode 100644 internal/model/llamaserver/llamaserver_test.go
 create mode 100644 internal/model/llamaserver/procattr_linux.go
 create mode 100644 internal/model/llamaserver/procattr_other.go

diff --git a/internal/cli/backend.go b/internal/cli/backend.go
index 547e3eb..92cc682 100644
--- a/internal/cli/backend.go
+++ b/internal/cli/backend.go
@@ -3,12 +3,14 @@ package cli
 import (
 	"context"
 	"fmt"
+	"io"
 	"os"
 
 	"github.com/CoreyRDean/intent/internal/config"
 	"github.com/CoreyRDean/intent/internal/model"
 	"github.com/CoreyRDean/intent/internal/model/llamacli"
 	"github.com/CoreyRDean/intent/internal/model/llamafile"
+	"github.com/CoreyRDean/intent/internal/model/llamaserver"
 	"github.com/CoreyRDean/intent/internal/model/mock"
 	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
 	"github.com/CoreyRDean/intent/internal/state"
@@ -33,32 +35,47 @@ func buildBackend(name string, cfg *config.Config, modelOverride string) (model.
 	case "mock":
 		return mock.New(), false, nil
 	case "llama-cli", "llamafile-local":
-		// Local inference runs llama.cpp's `llama-cli` one-shot. If the
-		// binary or the selected model isn't installed yet, fall back to
-		// the mock backend so `i hello` doesn't hard-fail for a brand-new
-		// install — the mock returns an honest "not installed yet"
-		// response, and ensureBackendReady / `i doctor` guide the fix.
+		// Local inference runs llama.cpp. Preferred: a request-scoped
+		// `llama-server` child held warm for the whole invocation —
+		// native multi-turn messages (no flattening) and no per-step
+		// model reload across the tool-call loop. Fallback: one-shot
+		// `llama-cli` when the server binary isn't present. If neither
+		// the runtime nor the model is installed, fall back to the mock
+		// so `i hello` doesn't hard-fail for a brand-new install —
+		// ensureBackendReady / `i doctor` guide the fix.
 		// ("llamafile-local" is kept as a back-compat alias for configs
-		// written before the switch to llama-cli.)
+		// written before the switch to llama.cpp.)
 		dirs, err := state.Resolve()
 		if err != nil {
 			return nil, false, err
 		}
 		rt := intentruntime.New(dirs.Cache)
 		modelPath := rt.ModelPath(selectedModelFile(dirs.State, cfg))
-		if !rt.HaveLlamaCLI() || !fileExists(modelPath) {
+		if !fileExists(modelPath) {
 			return mock.New(), true, nil
 		}
-		b := llamacli.New(rt.LlamaCLIPath(), modelPath)
+		tag := cfg.Model
 		if modelOverride != "" {
-			b.ModelTag = modelOverride
-		} else {
-			b.ModelTag = cfg.Model
+			tag = modelOverride
 		}
-		if m := loadCatalog(dirs.State).Get(cfg.Model); m != nil && m.ContextTokens > 0 {
-			b.ContextSize = m.ContextTokens
+		ctxTokens := 0
+		if m := loadCatalog(dirs.State).Get(cfg.Model); m != nil {
+			ctxTokens = m.ContextTokens
+		}
+		switch {
+		case rt.HaveLlamaServer():
+			b := llamaserver.New(rt.LlamaServerPath(), modelPath)
+			b.ModelTag = tag
+			b.ContextSize = ctxTokens
+			return b, false, nil
+		case rt.HaveLlamaCLI():
+			b := llamacli.New(rt.LlamaCLIPath(), modelPath)
+			b.ModelTag = tag
+			b.ContextSize = ctxTokens
+			return b, false, nil
+		default:
+			return mock.New(), true, nil
 		}
-		return b, false, nil
 	case "llamafile-network":
 		ep := os.Getenv("INTENT_LLAMAFILE_ENDPOINT")
 		if ep == "" {
@@ -130,6 +147,11 @@ func buildBackendCtx(ctx context.Context, name string, cfg *config.Config, model
 			l.KV("model_path", b.ModelPath)
 			l.KV("model_tag", b.ModelTag)
 		}
+		if b, ok := be.(*llamaserver.Backend); ok {
+			l.KV("binary", b.BinaryPath)
+			l.KV("model_path", b.ModelPath)
+			l.KV("model_tag", b.ModelTag)
+		}
 		be = verbose.Backend(l, be)
 	}
 	return be, fb, nil
@@ -170,3 +192,13 @@ func fileExists(path string) bool {
 	info, err := os.Stat(path)
 	return err == nil && !info.IsDir()
 }
+
+// closeBackend tears down any resources a backend holds — notably the
+// llama-server co-process, which must be killed when the invocation ends.
+// Safe to defer on every backend; a no-op for stateless ones. The verbose
+// wrapper forwards Close to its inner backend.
+func closeBackend(be model.Backend) {
+	if c, ok := be.(io.Closer); ok {
+		_ = c.Close()
+	}
+}
diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go
index 76995f3..9bc1240 100644
--- a/internal/cli/doctor.go
+++ b/internal/cli/doctor.go
@@ -43,11 +43,14 @@ func cmdDoctor(_ context.Context, _ []string) int {
 		}
 
 		rt := intentruntime.New(dirs.Cache)
-		cliStatus := "missing — run `i model pull` to install via your package manager"
-		if rt.HaveLlamaCLI() {
-			cliStatus = "found at " + rt.LlamaCLIPath()
+		runtimeStatus := "missing — run `i model pull` to install via your package manager"
+		switch {
+		case rt.HaveLlamaServer():
+			runtimeStatus = "found at " + rt.LlamaServerPath()
+		case rt.HaveLlamaCLI():
+			runtimeStatus = "llama-server missing; using one-shot fallback at " + rt.LlamaCLIPath()
 		}
-		check("llama-cli runtime", cliStatus, rt.HaveLlamaCLI())
+		check("llama.cpp runtime", runtimeStatus, rt.HaveLlamaRuntime())
 
 		modelFile, modelStatus := resolveModelCheck(cfg)
 		check("model", fmt.Sprintf("%s — %s", modelStatus, rt.ModelPath(modelFile)), rt.HaveModel(modelFile))
diff --git a/internal/cli/doctor_test.go b/internal/cli/doctor_test.go
index ff9d8fb..2f8c9c0 100644
--- a/internal/cli/doctor_test.go
+++ b/internal/cli/doctor_test.go
@@ -55,9 +55,9 @@ func TestResolveModelCheck(t *testing.T) {
 	}
 }
 
-// TestDoctorReportsLlamaCLIRuntime verifies doctor surfaces the local
-// runtime line (llama-cli) rather than a daemon/server status.
-func TestDoctorReportsLlamaCLIRuntime(t *testing.T) {
+// TestDoctorReportsLlamaRuntime verifies doctor surfaces the local
+// llama.cpp runtime line rather than a daemon/server status.
+func TestDoctorReportsLlamaRuntime(t *testing.T) {
 	t.Setenv("HOME", t.TempDir())
 	t.Setenv("INTENT_STATE_DIR", t.TempDir())
 	t.Setenv("INTENT_CACHE_DIR", t.TempDir())
@@ -65,7 +65,7 @@ func TestDoctorReportsLlamaCLIRuntime(t *testing.T) {
 	out := captureStdout(func() {
 		_ = cmdDoctor(context.Background(), nil)
 	})
-	if !strings.Contains(out, "llama-cli runtime") {
-		t.Fatalf("doctor output missing llama-cli runtime line: %q", out)
+	if !strings.Contains(out, "llama.cpp runtime") {
+		t.Fatalf("doctor output missing llama.cpp runtime line: %q", out)
 	}
 }
diff --git a/internal/cli/ensure.go b/internal/cli/ensure.go
index 7287c7c..7cf354a 100644
--- a/internal/cli/ensure.go
+++ b/internal/cli/ensure.go
@@ -50,17 +50,17 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config
 	if selected == nil {
 		selected = cat.Default()
 	}
-	haveCLI := rt.HaveLlamaCLI()
+	haveRuntime := rt.HaveLlamaRuntime()
 	haveModel := selected != nil && rt.HaveModel(models.ModelFilename(selected))
-	if haveCLI && haveModel {
+	if haveRuntime && haveModel {
 		return true
 	}
 
 	interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr)
 	if !interactive {
 		fmt.Fprintln(os.Stderr, "intent: local model isn't ready yet.")
-		if !haveCLI {
-			fmt.Fprintln(os.Stderr, "  missing runtime: llama-cli (llama.cpp)")
+		if !haveRuntime {
+			fmt.Fprintln(os.Stderr, "  missing runtime: llama.cpp (llama-server)")
 		}
 		if !haveModel && selected != nil {
 			fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n", selected.ID, selected.SizeMB)
@@ -70,8 +70,8 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config
 	}
 
 	fmt.Fprintln(os.Stderr, "intent: the local model isn't ready yet.")
-	if !haveCLI {
-		fmt.Fprintln(os.Stderr, "  missing runtime: llama-cli (llama.cpp)")
+	if !haveRuntime {
+		fmt.Fprintln(os.Stderr, "  missing runtime: llama.cpp (llama-server)")
 	}
 	if !haveModel && selected != nil {
 		fmt.Fprintf(os.Stderr, "  missing model:   %s (~%d MB)\n", selected.ID, selected.SizeMB)
@@ -81,9 +81,9 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config
 		return false
 	}
 
-	if !haveCLI {
-		fmt.Fprintln(os.Stderr, "installing llama-cli...")
-		if err := rt.EnsureLlamaCLI(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
+	if !haveRuntime {
+		fmt.Fprintln(os.Stderr, "installing llama.cpp...")
+		if err := rt.EnsureLlamaRuntime(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
 			errf("runtime: %v", err)
 			return false
 		}
@@ -99,8 +99,8 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config
 		fmt.Fprintln(os.Stderr)
 	}
 
-	if !rt.HaveLlamaCLI() {
-		fmt.Fprintln(os.Stderr, "intent: llama-cli still not available; falling back to mock.")
+	if !rt.HaveLlamaRuntime() {
+		fmt.Fprintln(os.Stderr, "intent: llama.cpp still not available; falling back to mock.")
 		return false
 	}
 	return true
diff --git a/internal/cli/explain.go b/internal/cli/explain.go
index d5afc99..9ed065c 100644
--- a/internal/cli/explain.go
+++ b/internal/cli/explain.go
@@ -37,6 +37,7 @@ func cmdExplain(ctx context.Context, args []string) int {
 		errf("explain: %v", err)
 		return 3
 	}
+	defer closeBackend(be)
 	printMockFallbackBanner(isFallback)
 
 	vl := verbose.FromContext(ctx)
diff --git a/internal/cli/init.go b/internal/cli/init.go
index 94b7437..082d570 100644
--- a/internal/cli/init.go
+++ b/internal/cli/init.go
@@ -88,14 +88,14 @@ func cmdInit(ctx context.Context, args []string) int {
 	if selected == nil {
 		selected = cat.Default()
 	}
-	haveCLI := mgr.HaveLlamaCLI()
+	haveRuntime := mgr.HaveLlamaRuntime()
 	haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected))
-	if !haveCLI || !haveModel {
+	if !haveRuntime || !haveModel {
 		fmt.Println()
 		if selected != nil {
-			fmt.Printf("Install llama-cli and download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB)
+			fmt.Printf("Install llama.cpp and download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB)
 		} else {
-			fmt.Printf("Install llama-cli and the default local model now? [Y/n] ")
+			fmt.Printf("Install llama.cpp and the default local model now? [Y/n] ")
 		}
 		pullAnswer := "y"
 		if !autoYes {
@@ -108,9 +108,9 @@ func cmdInit(ctx context.Context, args []string) int {
 			pullAnswer = line
 		}
 		if pullAnswer == "y" || pullAnswer == "yes" {
-			if !haveCLI {
-				fmt.Println("installing llama-cli via your package manager...")
-				if err := mgr.EnsureLlamaCLI(ctx, func(s string) { fmt.Println("  " + s) }); err != nil {
+			if !haveRuntime {
+				fmt.Println("installing llama.cpp via your package manager...")
+				if err := mgr.EnsureLlamaRuntime(ctx, func(s string) { fmt.Println("  " + s) }); err != nil {
 					fmt.Println()
 					errf("init: install runtime: %v", err)
 					fmt.Println("you can retry with `i model pull`.")
@@ -134,7 +134,7 @@ func cmdInit(ctx context.Context, args []string) int {
 		}
 	} else {
 		fmt.Println()
-		fmt.Println("Runtime:     llama-cli already installed.")
+		fmt.Println("Runtime:     llama.cpp already installed.")
 		fmt.Println("Model:       already installed.")
 	}
 
diff --git a/internal/cli/intent.go b/internal/cli/intent.go
index 0d1dbfa..0966ef8 100644
--- a/internal/cli/intent.go
+++ b/internal/cli/intent.go
@@ -251,6 +251,7 @@ func cmdIntent(ctx context.Context, args []string) int {
 		errf("backend: %v", err)
 		return 3
 	}
+	defer closeBackend(be)
 	printMockFallbackBanner(isFallback)
 
 	// Top-level verbose breadcrumbs. Safe no-op when -v is off.
diff --git a/internal/cli/model.go b/internal/cli/model.go
index efc95a5..7bb808a 100644
--- a/internal/cli/model.go
+++ b/internal/cli/model.go
@@ -369,11 +369,11 @@ func modelPull(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []
 	}
 
 	rt := intentruntime.New(dirs.Cache)
-	if !rt.HaveLlamaCLI() {
-		fmt.Fprintln(os.Stderr, "installing llama-cli via your package manager...")
-		if err := rt.EnsureLlamaCLI(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
+	if !rt.HaveLlamaRuntime() {
+		fmt.Fprintln(os.Stderr, "installing llama.cpp via your package manager...")
+		if err := rt.EnsureLlamaRuntime(ctx, func(s string) { fmt.Fprintln(os.Stderr, "  "+s) }); err != nil {
 			fmt.Fprintln(os.Stderr)
-			errf("llama-cli: %v", err)
+			errf("llama.cpp: %v", err)
 			return 3
 		}
 		fmt.Fprintln(os.Stderr)
diff --git a/internal/cli/report.go b/internal/cli/report.go
index dbcf353..722bad0 100644
--- a/internal/cli/report.go
+++ b/internal/cli/report.go
@@ -53,6 +53,7 @@ func cmdReport(ctx context.Context, args []string) int {
 		errf("report: %v", err)
 		return 3
 	}
+	defer closeBackend(be)
 	if isMockBackend(be) {
 		errf("i report requires a real backend — run 'i doctor' to diagnose")
 		return 3
diff --git a/internal/model/llamaserver/llamaserver.go b/internal/model/llamaserver/llamaserver.go
new file mode 100644
index 0000000..7044f16
--- /dev/null
+++ b/internal/model/llamaserver/llamaserver.go
@@ -0,0 +1,299 @@
+// Package llamaserver runs local inference through a request-scoped
+// llama.cpp `llama-server` child process.
+//
+// Unlike the one-shot `llama-cli` path, the server is started once on the
+// first inference of an `intent` invocation and held warm for the rest of
+// that invocation — so the engine's tool-call loop reuses the loaded
+// weights and KV cache instead of reloading the model on every step. And
+// because the server speaks the OpenAI-compatible /v1/chat/completions
+// API, the native messages array (system/user/assistant/tool) is sent
+// as-is: no flattening into a single prompt.
+//
+// It is *not* a daemon. The process is bound to a private loopback port,
+// owned by this one CLI invocation, and killed on Close (and, on Linux,
+// auto-killed by the kernel if intent dies — see procAttr). There is no
+// persistent listener and nothing to manage between commands.
+package llamaserver
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net"
+	"net/http"
+	"os"
+	"os/exec"
+	"sync"
+	"syscall"
+	"time"
+
+	"github.com/CoreyRDean/intent/internal/model"
+	"github.com/CoreyRDean/intent/internal/model/llamafile"
+)
+
+// Backend manages the llama-server child and delegates inference to an
+// OpenAI-compatible HTTP client pointed at it. The zero value is not
+// usable; construct with New.
+type Backend struct {
+	BinaryPath   string        // path to llama-server
+	ModelPath    string        // path to the .gguf to load
+	ModelTag     string        // cosmetic; feeds the cache identity
+	ContextSize  int           // -c, 0 = llama.cpp default
+	GPULayers    int           // -ngl, -1 = let llama.cpp decide
+	Host         string        // loopback only; default 127.0.0.1
+	StartupGrace time.Duration // how long to wait for /health
+
+	startOnce sync.Once
+	startErr  error
+
+	mu      sync.Mutex
+	cmd     *exec.Cmd
+	inner   *llamafile.Backend
+	stopped bool
+	logBuf  *cappedBuffer
+}
+
+// New constructs a Backend for the given binary and model.
+func New(binary, modelPath string) *Backend {
+	if binary == "" {
+		binary = "llama-server"
+	}
+	return &Backend{
+		BinaryPath:   binary,
+		ModelPath:    modelPath,
+		Host:         "127.0.0.1",
+		GPULayers:    -1,
+		StartupGrace: 120 * time.Second,
+		logBuf:       &cappedBuffer{max: 8 << 10},
+	}
+}
+
+func (b *Backend) Name() string { return "llama-server" }
+
+// CacheIdentity is derived from config alone so it can be computed for
+// the cache key without starting the server.
+func (b *Backend) CacheIdentity() string {
+	return "llama-server|" + b.ModelPath + "|" + b.ModelTag
+}
+
+// Available ensures the server is up and healthy.
+func (b *Backend) Available(ctx context.Context) error {
+	if err := b.ensureStarted(ctx); err != nil {
+		return err
+	}
+	return b.inner.Available(ctx)
+}
+
+// Complete starts the server if needed, then delegates over HTTP. The
+// inner client sends the native messages array, so there is no flattening.
+func (b *Backend) Complete(ctx context.Context, in model.CompleteRequest) (*model.Response, error) {
+	if err := b.ensureStarted(ctx); err != nil {
+		return nil, err
+	}
+	return b.inner.Complete(ctx, in)
+}
+
+// CompleteStructured implements model.StructuredBackend via the inner
+// HTTP client's grammar-constrained path.
+func (b *Backend) CompleteStructured(ctx context.Context, in model.StructuredRequest) ([]byte, error) {
+	if err := b.ensureStarted(ctx); err != nil {
+		return nil, err
+	}
+	return b.inner.CompleteStructured(ctx, in)
+}
+
+// ensureStarted spawns llama-server exactly once and waits for /health.
+func (b *Backend) ensureStarted(ctx context.Context) error {
+	b.startOnce.Do(func() { b.startErr = b.start(ctx) })
+	return b.startErr
+}
+
+func (b *Backend) start(ctx context.Context) error { // spawns llama-server on a free loopback port and blocks until /health is OK
+	if b.ModelPath == "" {
+		return fmt.Errorf("llama-server: no model path configured")
+	}
+	host := b.Host
+	if host == "" {
+		host = "127.0.0.1"
+	}
+	port, err := freeLoopbackPort(host)
+	if err != nil {
+		return fmt.Errorf("llama-server: pick port: %w", err)
+	}
+
+	args := []string{
+		"-m", b.ModelPath,
+		"--host", host,
+		"--port", fmt.Sprintf("%d", port),
+	}
+	if b.ContextSize > 0 { // 0 (or less) leaves the context size to llama.cpp
+		args = append(args, "-c", fmt.Sprintf("%d", b.ContextSize))
+	}
+	if b.GPULayers >= 0 { // negative means "let llama.cpp decide", so the flag is omitted
+		args = append(args, "-ngl", fmt.Sprintf("%d", b.GPULayers))
+	}
+
+	cmd := exec.Command(b.BinaryPath, args...) // plain Command, not CommandContext: ctx ending does not kill the server
+	cmd.Stdout = b.logBuf
+	cmd.Stderr = b.logBuf
+	cmd.Stdin = nil
+	cmd.SysProcAttr = procAttr()
+	if err := cmd.Start(); err != nil {
+		return fmt.Errorf("start llama-server (%s): %w", b.BinaryPath, err)
+	}
+
+	b.mu.Lock()
+	b.cmd = cmd // published before the health wait so processAlive and kill can see it
+	b.mu.Unlock()
+
+	endpoint := "http://" + net.JoinHostPort(host, fmt.Sprint(port)) // JoinHostPort brackets IPv6 hosts such as ::1
+	if err := b.waitHealthy(ctx, endpoint); err != nil {
+		b.kill() // do not leave a half-started server behind
+		return fmt.Errorf("llama-server did not become ready: %w (log tail: %s)", err, b.logBuf.String())
+	}
+
+	inner := llamafile.New(endpoint)
+	inner.ModelTag = b.ModelTag
+	b.mu.Lock()
+	b.inner = inner
+	b.mu.Unlock()
+	return nil
+}
+
+// waitHealthy polls GET /health every 200ms until it returns 200 OK, the
+// process stops answering the liveness probe, ctx ends, or StartupGrace
+// elapses. ctx is checked once per poll, so both cancellation (Ctrl-C) and
+// a ctx deadline shorter than StartupGrace abort startup early.
+func (b *Backend) waitHealthy(ctx context.Context, endpoint string) error {
+	deadline := time.Now().Add(b.StartupGrace)
+	cli := &http.Client{Timeout: 2 * time.Second} // per-probe timeout, separate from StartupGrace
+	for time.Now().Before(deadline) {
+		if ctx.Err() != nil {
+			return ctx.Err()
+		}
+		if !b.processAlive() {
+			return fmt.Errorf("process exited before becoming ready")
+		}
+		req, _ := http.NewRequestWithContext(ctx, http.MethodGet, endpoint+"/health", nil) // NOTE(review): construction error is discarded — confirm endpoint is always a valid URL
+		resp, err := cli.Do(req)
+		if err == nil { // a failed probe is not fatal; fall through and retry
+			_ = resp.Body.Close()
+			if resp.StatusCode == http.StatusOK {
+				return nil
+			}
+		}
+		time.Sleep(200 * time.Millisecond) // fixed poll interval; this sleep is not interrupted by ctx
+	}
+	return fmt.Errorf("timeout after %s", b.StartupGrace)
+}
+
+func (b *Backend) processAlive() bool { // reports whether the recorded child still answers a signal-0 probe
+	b.mu.Lock()
+	cmd := b.cmd // snapshot under the lock; the probe itself runs unlocked
+	b.mu.Unlock()
+	if cmd == nil || cmd.Process == nil {
+		return false
+	}
+	return cmd.Process.Signal(syscall.Signal(0)) == nil // NOTE(review): an exited child nobody has Wait()ed on may still answer signal 0 — confirm early exits are detected
+}
+
+// Close kills the server. Safe to call multiple times and when the
+// server never started. It implements io.Closer so the CLI can defer it.
+func (b *Backend) Close() error {
+	b.kill()
+	return nil
+}
+
+func (b *Backend) kill() { // stops the child once: SIGTERM, then SIGKILL after 3s; later calls are no-ops
+	b.mu.Lock()
+	cmd := b.cmd
+	if b.stopped || cmd == nil || cmd.Process == nil {
+		b.stopped = true
+		b.mu.Unlock()
+		return
+	}
+	b.stopped = true
+	b.mu.Unlock()
+
+	pid := cmd.Process.Pid
+	signalGroup := func(sig syscall.Signal) {
+		// Signal the child's whole process group (negative pid) so any worker it
+		// forked also dies — but never our own group — then the bare pid as well.
+		if pgid, err := syscall.Getpgid(pid); err == nil && pgid > 0 && pgid != syscall.Getpgrp() && pgid != os.Getpid() {
+			_ = syscall.Kill(-pgid, sig)
+		}
+		_ = cmd.Process.Signal(sig)
+	}
+
+	signalGroup(syscall.SIGTERM)
+	done := make(chan struct{})
+	go func() { _, _ = cmd.Process.Wait(); close(done) }() // reap the child so it does not linger as a zombie
+	select {
+	case <-done:
+	case <-time.After(3 * time.Second): // graceful shutdown took too long; escalate
+		signalGroup(syscall.SIGKILL)
+		<-waitOrTimeout(cmd, 2*time.Second)
+	}
+}
+
+func waitOrTimeout(cmd *exec.Cmd, d time.Duration) <-chan struct{} {
+	ch := make(chan struct{})
+	go func() {
+		_, _ = cmd.Process.Wait()
+		close(ch)
+	}()
+	out := make(chan struct{})
+	go func() {
+		select {
+		case <-ch:
+		case <-time.After(d):
+		}
+		close(out)
+	}()
+	return out
+}
+
+// freeLoopbackPort asks the OS for an unused port on host by binding :0
+// and reading back the assignment. There is a small race between closing
+// the probe listener and llama-server binding the port, but for a local
+// request-scoped child that is acceptable.
+func freeLoopbackPort(host string) (int, error) {
+	ln, err := net.Listen("tcp", net.JoinHostPort(host, "0"))
+	if err != nil {
+		return 0, err
+	}
+	defer ln.Close()
+	return ln.Addr().(*net.TCPAddr).Port, nil
+}
+
+// cappedBuffer is a goroutine-safe writer that retains only the last max
+// bytes — enough to surface a startup failure without growing unbounded
+// over a long session.
+type cappedBuffer struct {
+	mu  sync.Mutex
+	buf []byte
+	max int
+}
+
+func (c *cappedBuffer) Write(p []byte) (int, error) {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	c.buf = append(c.buf, p...)
+	if len(c.buf) > c.max {
+		c.buf = c.buf[len(c.buf)-c.max:]
+	}
+	return len(p), nil
+}
+
+func (c *cappedBuffer) String() string {
+	c.mu.Lock()
+	defer c.mu.Unlock()
+	return string(c.buf)
+}
+
+// compile-time assertions for the optional capabilities the CLI relies on.
+var (
+	_ model.Backend           = (*Backend)(nil)
+	_ model.StructuredBackend = (*Backend)(nil)
+	_ io.Closer               = (*Backend)(nil)
+)
diff --git a/internal/model/llamaserver/llamaserver_test.go b/internal/model/llamaserver/llamaserver_test.go
new file mode 100644
index 0000000..200ae2f
--- /dev/null
+++ b/internal/model/llamaserver/llamaserver_test.go
@@ -0,0 +1,63 @@
+package llamaserver
+
+import (
+	"strings"
+	"testing"
+)
+
+func TestFreeLoopbackPort(t *testing.T) {
+	port, err := freeLoopbackPort("127.0.0.1")
+	switch {
+	case err != nil:
+		t.Fatalf("freeLoopbackPort: %v", err)
+	case port <= 0 || port > 65535: // not a usable TCP port number
+		t.Fatalf("port out of range: %d", port)
+	}
+}
+
+func TestCappedBuffer_RetainsTail(t *testing.T) {
+	const input, want = "abcdefghij", "cdefghij" // 10 bytes through an 8-byte cap
+	buf := &cappedBuffer{max: len(want)}
+	if _, err := buf.Write([]byte(input)); err != nil {
+		t.Fatal(err)
+	}
+	if got := buf.String(); got != want {
+		t.Fatalf("capped buffer = %q, want %q", got, want)
+	}
+}
+
+func TestCappedBuffer_ReportsFullWriteLen(t *testing.T) {
+	buf := &cappedBuffer{max: 4}
+	written, err := buf.Write([]byte("hello")) // one byte more than the cap
+	switch {
+	case err != nil:
+		t.Fatal(err)
+	case written != 5:
+		t.Fatalf("Write returned %d, want 5 (full input length)", written)
+	}
+}
+
+func TestCacheIdentity_DistinctPerModel(t *testing.T) {
+	first := New("llama-server", "/cache/models/a.gguf")
+	second := New("llama-server", "/cache/models/b.gguf")
+	if idA, idB := first.CacheIdentity(), second.CacheIdentity(); idA == idB {
+		t.Fatal("different models should yield different cache identities")
+	}
+	if id := first.CacheIdentity(); !strings.HasPrefix(id, "llama-server|") {
+		t.Fatalf("cache identity = %q, want llama-server prefix", id)
+	}
+	if name := first.Name(); name != "llama-server" {
+		t.Fatalf("Name() = %q", name)
+	}
+}
+
+// Close must succeed when no server process was ever spawned, and keep
+// succeeding when called again (idempotent).
+func TestClose_BeforeStartIsNoop(t *testing.T) {
+	b := New("llama-server", "/cache/models/a.gguf")
+	labels := []string{"Close before start", "second Close"}
+	for _, label := range labels {
+		if err := b.Close(); err != nil {
+			t.Fatalf("%s: %v", label, err)
+		}
+	}
+}
diff --git a/internal/model/llamaserver/procattr_linux.go b/internal/model/llamaserver/procattr_linux.go
new file mode 100644
index 0000000..5d717c3
--- /dev/null
+++ b/internal/model/llamaserver/procattr_linux.go
@@ -0,0 +1,17 @@
+//go:build linux
+
+package llamaserver
+
+import "syscall"
+
+// procAttr gives llama-server its own process group (so the whole group
+// can be signalled) and sets Pdeathsig so the kernel SIGKILLs the child
+// if the parent dies — crash or -9 included, when no deferred Close runs.
+// NOTE(review): per prctl(2) the death signal tracks the creating
+// *thread*; confirm the spawning goroutine cannot retire its OS thread.
+func procAttr() *syscall.SysProcAttr {
+	attr := new(syscall.SysProcAttr)
+	attr.Setpgid = true
+	attr.Pdeathsig = syscall.SIGKILL
+	return attr
+}
diff --git a/internal/model/llamaserver/procattr_other.go b/internal/model/llamaserver/procattr_other.go
new file mode 100644
index 0000000..6f7b81a
--- /dev/null
+++ b/internal/model/llamaserver/procattr_other.go
@@ -0,0 +1,13 @@
+//go:build !linux
+
+package llamaserver
+
+import "syscall"
+
+// procAttr gives llama-server a process group of its own so Close can
+// signal the entire group. Without a Pdeathsig equivalent (e.g. macOS) a
+// kill -9 of the parent can orphan the child; other exits run Close.
+func procAttr() *syscall.SysProcAttr {
+	attr := syscall.SysProcAttr{Setpgid: true}
+	return &attr
+}
diff --git a/internal/runtime/llamacli.go b/internal/runtime/llamacli.go
index b600957..6493a65 100644
--- a/internal/runtime/llamacli.go
+++ b/internal/runtime/llamacli.go
@@ -10,21 +10,33 @@ import (
 	"strings"
 )
 
-// LlamaCLIBinary is the executable name we look for and install. It is
-// the one-shot inference frontend from llama.cpp.
-const LlamaCLIBinary = "llama-cli"
-
-// llamaCLICandidates are well-known absolute locations to check when
-// llama-cli isn't on PATH — chiefly the Homebrew prefixes on macOS and
-// Linux, where `brew install llama.cpp` drops the binary.
-func llamaCLICandidates() []string {
+// llama.cpp ships several frontends; intent uses two:
+//
+//   - llama-server: a request-scoped child process that holds the model
+//     warm for the duration of one `intent` invocation and speaks the
+//     OpenAI-compatible HTTP API (native multi-turn messages + grammar).
+//     This is the preferred local backend.
+//   - llama-cli: a one-shot fallback used when llama-server isn't present.
+//
+// Both come from the same package (`brew install llama.cpp`), so the
+// installer ensures the package and the resolvers find whichever binaries
+// landed on PATH (or in a known Homebrew prefix).
+const (
+	LlamaServerBinary = "llama-server" // executable name of the preferred server frontend
+	LlamaCLIBinary    = "llama-cli"    // executable name of the one-shot fallback frontend
+)
+
+// binaryCandidates returns absolute fallback paths for the llama.cpp
+// binary bin, tried when it isn't on PATH: the standard Homebrew and
+// Linuxbrew bin directories plus $HOME/.linuxbrew/bin when HOME is set.
+func binaryCandidates(bin string) []string {
 	paths := []string{
-		"/opt/homebrew/bin/" + LlamaCLIBinary,              // Apple Silicon brew
-		"/usr/local/bin/" + LlamaCLIBinary,                 // Intel mac / manual
-		"/home/linuxbrew/.linuxbrew/bin/" + LlamaCLIBinary, // shared linuxbrew
+		"/opt/homebrew/bin/" + bin,              // Apple Silicon brew
+		"/usr/local/bin/" + bin,                 // Intel mac / manual
+		"/home/linuxbrew/.linuxbrew/bin/" + bin, // shared linuxbrew
 	}
 	if home := os.Getenv("HOME"); home != "" {
-		paths = append(paths, filepath.Join(home, ".linuxbrew", "bin", LlamaCLIBinary))
+		paths = append(paths, filepath.Join(home, ".linuxbrew", "bin", bin)) // per-user linuxbrew
 	}
 	return paths
 }
@@ -38,28 +50,27 @@ func isExecutable(path string) bool {
 	return !info.IsDir() && info.Mode()&0o111 != 0
 }
 
-// LlamaCLIPath resolves the llama-cli binary: PATH first, then known
+// resolveBinary picks the path to run bin from: PATH first, then known
 // install locations. Falls back to the bare name so callers still get a
-// value to hand exec (which will re-resolve via PATH) when nothing is
-// found yet.
-func (m *Manager) LlamaCLIPath() string {
-	if p, err := exec.LookPath(LlamaCLIBinary); err == nil {
+// value to hand exec (which re-resolves via PATH) when nothing is found.
+func resolveBinary(bin string) string {
+	if p, err := exec.LookPath(bin); err == nil { // found on PATH
 		return p
 	}
-	for _, p := range llamaCLICandidates() {
+	for _, p := range binaryCandidates(bin) { // first executable candidate wins
 		if isExecutable(p) {
 			return p
 		}
 	}
-	return LlamaCLIBinary
+	return bin // nothing found: hand back the bare name
 }
 
-// HaveLlamaCLI reports whether a usable llama-cli is installed.
-func (m *Manager) HaveLlamaCLI() bool {
-	if _, err := exec.LookPath(LlamaCLIBinary); err == nil {
+// haveBinary reports whether bin resolves to something runnable.
+func haveBinary(bin string) bool {
+	if _, err := exec.LookPath(bin); err == nil {
 		return true
 	}
-	for _, p := range llamaCLICandidates() {
+	for _, p := range binaryCandidates(bin) {
 		if isExecutable(p) {
 			return true
 		}
@@ -67,6 +78,25 @@ func (m *Manager) HaveLlamaCLI() bool {
 	return false
 }
 
+// LlamaServerPath resolves llama-server; the bare name is returned if it is not found.
+func (m *Manager) LlamaServerPath() string { return resolveBinary(LlamaServerBinary) }
+
+// HaveLlamaServer reports whether llama-server is on PATH or in a known install location.
+func (m *Manager) HaveLlamaServer() bool { return haveBinary(LlamaServerBinary) }
+
+// LlamaCLIPath resolves llama-cli; the bare name is returned if it is not found.
+func (m *Manager) LlamaCLIPath() string { return resolveBinary(LlamaCLIBinary) }
+
+// HaveLlamaCLI reports whether llama-cli is on PATH or in a known install location.
+func (m *Manager) HaveLlamaCLI() bool { return haveBinary(LlamaCLIBinary) }
+
+// HaveLlamaRuntime reports whether local inference can run: true when
+// llama-server (preferred) or, failing that, llama-cli (the one-shot
+// fallback) is installed. The server is checked first.
+func (m *Manager) HaveLlamaRuntime() bool {
+	return m.HaveLlamaServer() || m.HaveLlamaCLI()
+}
+
 // pkgManager describes how to install llama.cpp via one system package
 // manager. cmd is the manager binary; args are the install arguments
 // (package name included); needsSudo asks us to prefix sudo when we are
@@ -78,10 +108,11 @@ type pkgManager struct {
 	needsSudo bool
 }
 
-// llamaCLIManagers is the ordered preference list. Homebrew is first
-// because it ships an official, up-to-date `llama.cpp` formula on both
-// macOS and Linux; the native managers are best-effort fallbacks.
-func llamaCLIManagers() []pkgManager {
+// llamaManagers is the ordered preference list. Homebrew is first because
+// it ships an official, up-to-date `llama.cpp` formula (with both
+// llama-server and llama-cli) on macOS and Linux; the native managers are
+// best-effort fallbacks.
+func llamaManagers() []pkgManager {
 	managers := []pkgManager{
 		{name: "Homebrew", cmd: "brew", args: []string{"install", "llama.cpp"}},
 	}
@@ -96,12 +127,13 @@ func llamaCLIManagers() []pkgManager {
 	return managers
 }
 
-// EnsureLlamaCLI installs llama-cli via the system package manager if it
-// isn't already available. log, if non-nil, receives human-readable
-// progress lines. It returns a clear, actionable error when no supported
-// package manager is found or the install fails.
-func (m *Manager) EnsureLlamaCLI(ctx context.Context, log func(string)) error {
-	if m.HaveLlamaCLI() {
+// EnsureLlamaRuntime installs llama.cpp via the system package manager if
+// neither llama-server nor llama-cli is already available. log, if
+// non-nil, receives human-readable progress lines. It returns a clear,
+// actionable error when no supported package manager is found or the
+// install fails.
+func (m *Manager) EnsureLlamaRuntime(ctx context.Context, log func(string)) error {
+	if m.HaveLlamaRuntime() {
 		return nil
 	}
 	logf := func(format string, a ...any) {
@@ -110,17 +142,16 @@ func (m *Manager) EnsureLlamaCLI(ctx context.Context, log func(string)) error {
 		}
 	}
 
-	managers := llamaCLIManagers()
 	var available []pkgManager
-	for _, pm := range managers {
+	for _, pm := range llamaManagers() {
 		if _, err := exec.LookPath(pm.cmd); err == nil {
 			available = append(available, pm)
 		}
 	}
 	if len(available) == 0 {
-		return fmt.Errorf("no supported package manager found to install %s.\n"+
-			"  Install Homebrew (https://brew.sh) and run `brew install llama.cpp`,\n"+
-			"  or install llama.cpp from https://github.com/ggml-org/llama.cpp", LlamaCLIBinary)
+		return fmt.Errorf("no supported package manager found to install llama.cpp.\n" +
+			"  Install Homebrew (https://brew.sh) and run `brew install llama.cpp`,\n" +
+			"  or install llama.cpp from https://github.com/ggml-org/llama.cpp")
 	}
 
 	var lastErr error
@@ -132,7 +163,7 @@ func (m *Manager) EnsureLlamaCLI(ctx context.Context, log func(string)) error {
 				name = "sudo"
 			}
 		}
-		logf("installing %s via %s (%s %s)...", LlamaCLIBinary, pm.name, name, strings.Join(args, " "))
+		logf("installing llama.cpp via %s (%s %s)...", pm.name, name, strings.Join(args, " "))
 		cmd := exec.CommandContext(ctx, name, args...)
 		cmd.Stdout = os.Stderr
 		cmd.Stderr = os.Stderr
@@ -141,12 +172,12 @@ func (m *Manager) EnsureLlamaCLI(ctx context.Context, log func(string)) error {
 			logf("  %s failed: %v", pm.name, err)
 			continue
 		}
-		if m.HaveLlamaCLI() {
-			logf("%s installed.", LlamaCLIBinary)
+		if m.HaveLlamaRuntime() {
+			logf("llama.cpp installed.")
 			return nil
 		}
-		lastErr = fmt.Errorf("%s reported success but %s still not found", pm.name, LlamaCLIBinary)
+		lastErr = fmt.Errorf("%s reported success but no llama.cpp binary was found", pm.name)
 	}
-	return fmt.Errorf("could not install %s automatically: %w.\n"+
-		"  Try `brew install llama.cpp` or build from https://github.com/ggml-org/llama.cpp", LlamaCLIBinary, lastErr)
+	return fmt.Errorf("could not install llama.cpp automatically: %w.\n"+
+		"  Try `brew install llama.cpp` or build from https://github.com/ggml-org/llama.cpp", lastErr)
 }
diff --git a/internal/verbose/backend.go b/internal/verbose/backend.go
index c2cfb87..cf7b91d 100644
--- a/internal/verbose/backend.go
+++ b/internal/verbose/backend.go
@@ -2,6 +2,7 @@ package verbose
 
 import (
 	"context"
+	"io"
 	"time"
 
 	"github.com/CoreyRDean/intent/internal/model"
@@ -41,6 +42,16 @@ func (v *vb) CacheIdentity() string {
 	return v.inner.Name()
 }
 
+// Close closes the wrapped backend when it is an io.Closer (e.g. it owns
+// the llama-server co-process) and is a no-op otherwise.
+func (v *vb) Close() error {
+	closer, ok := v.inner.(io.Closer)
+	if !ok {
+		return nil
+	}
+	return closer.Close()
+}
+
 func (v *vb) Complete(ctx context.Context, req model.CompleteRequest) (*model.Response, error) {
 	v.log.Section("model request (envelope)")
 	v.log.KV("backend", v.inner.Name())

From 9aadab5d9084aa4c3883d80f2be6cc0df857a1cf Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Fri, 5 Jun 2026 20:46:40 +0000
Subject: [PATCH 3/3] docs(readme): describe the request-scoped llama-server
 co-process

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e1a1530..e38052f 100644
--- a/README.md
+++ b/README.md
@@ -125,7 +125,7 @@ With `--literal`, everything after the flag is treated as natural-language promp
 
 ## Managing models
 
-intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llama.cpp](https://github.com/ggml-org/llama.cpp)'s `llama-cli`, which intent installs on demand through your system package manager (Homebrew, apt, dnf, …). Each prompt runs `llama-cli` one-shot — there is no background daemon or server. You can also point it at any public Hugging Face GGUF repo.
+intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llama.cpp](https://github.com/ggml-org/llama.cpp), which intent installs on demand through your system package manager (Homebrew, apt, dnf, …). Each `i` invocation starts a request-scoped `llama-server` child, holds the model warm for that one request (so multi-step tool calls don't reload it), and kills it on exit — there is **no background daemon**. If `llama-server` isn't available, intent falls back to one-shot `llama-cli`. You can also point it at any public Hugging Face GGUF repo.
 
 ```sh
 # See what's on offer and which one is current.