CoreyRDean · CoreyRDean · Jun 5, 2026 · Jun 5, 2026 · Jun 5, 2026
diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml
@@ -294,7 +294,7 @@ jobs:
 
             def caveats
               <<~EOS
-                Run first-run setup to download the local model and start the daemon:
+                Run first-run setup to install the runtime and download the local model:
                   i init
 
                 For zsh users: install the shell hook so prompts containing ? * [ ]

diff --git a/README.md b/README.md
@@ -16,7 +16,7 @@ It is **local-first** by default (no network required after first run, no prompt
 
 That composability applies to subcommands that consume natural language too: `i report "first problem" < extra-notes.txt` appends the piped text after the command-line text before proposing issues.
 
-> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime, daemon, and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap.
+> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime (llama.cpp's `llama-cli`) and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap.
 
 ## Building from source
 
@@ -125,7 +125,7 @@ With `--literal`, everything after the flag is treated as natural-language promp
 
 ## Managing models
 
-intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llamafile](https://github.com/mozilla-ai/llamafile). You can also point it at any public Hugging Face GGUF repo.
+intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llama.cpp](https://github.com/ggml-org/llama.cpp), which intent installs on demand through your system package manager (Homebrew, apt, dnf, …). Each `i` invocation starts a request-scoped `llama-server` child, holds the model warm for that one request (so multi-step tool calls don't reload it), and kills it on exit — there is **no background daemon**. If `llama-server` isn't available it falls back to one-shot `llama-cli`. You can also point it at any public Hugging Face GGUF repo.
 
 ```sh
 # See what's on offer and which one is current.

diff --git a/install.sh b/install.sh
@@ -11,7 +11,7 @@
 #   PREFIX           install root (default /usr/local; needs sudo if not writable)
 #   INTENT_TMPDIR    where to stage downloads (default $TMPDIR or /tmp)
 #
-# This script does not auto-install the daemon, the model runtime, or the
+# This script does not auto-install the model runtime (llama-cli) or the
 # model. Run `i init` after install.
 
 set -Eeuo pipefail
@@ -136,8 +136,8 @@ echo
 
 # Auto-run `intent init` if we have a real TTY on stdin/stderr. Without
 # this, users who curl|bash and ignore the next-steps text get a binary
-# they can't actually use until they read the docs. With it, the model
-# downloads and the daemon starts as part of the install flow.
+# they can't actually use until they read the docs. With it, the runtime
+# installs and the model downloads as part of the install flow.
 #
 # We skip it under `bash -c` / `curl | bash` (no TTY) so non-interactive
 # CI jobs aren't surprised by a 4 GB download.

diff --git a/internal/cli/backend.go b/internal/cli/backend.go
@@ -3,16 +3,17 @@ package cli
 import (
 	"context"
 	"fmt"
-	"net"
-	"net/url"
+	"io"
 	"os"
-	"strings"
-	"time"
 
 	"github.com/CoreyRDean/intent/internal/config"
 	"github.com/CoreyRDean/intent/internal/model"
+	"github.com/CoreyRDean/intent/internal/model/llamacli"
 	"github.com/CoreyRDean/intent/internal/model/llamafile"
+	"github.com/CoreyRDean/intent/internal/model/llamaserver"
 	"github.com/CoreyRDean/intent/internal/model/mock"
+	intentruntime "github.com/CoreyRDean/intent/internal/runtime"
+	"github.com/CoreyRDean/intent/internal/state"
 	"github.com/CoreyRDean/intent/internal/verbose"
 )
 
@@ -21,38 +22,60 @@ import (
 // unavailable and we silently fell back to the mock — callers use this to
 // surface a per-invocation warning so users aren't left confused.
 //
-// In v1 we wire: mock, llamafile-local, llamafile-network, ollama (as a
-// llamafile-shaped HTTP), openai (as a llamafile-shaped HTTP). The grammar
-// constraint is the same across all of them; the only differences are the
-// endpoint and the auth header.
+// Backends: mock; llama-cli (local llama.cpp — a request-scoped llama-server
+// child, else one-shot llama-cli; legacy alias "llamafile-local");
+// llamafile-network, ollama, and openai (all OpenAI-compatible HTTP). The
+// JSON-schema grammar constraint is the same across all of them; they differ
+// only in transport (local subprocess vs. HTTP endpoint + auth header).
 func buildBackend(name string, cfg *config.Config, modelOverride string) (model.Backend, bool, error) {
 	if v := os.Getenv("INTENT_FORCE_BACKEND"); v != "" {
 		name = v
 	}
 	switch name {
 	case "mock":
 		return mock.New(), false, nil
-	case "llamafile-local":
-		// We expect the daemon (`intentd`) to have started llamafile on
-		// the loopback host:port from config. If nothing's listening, we
-		// fall back to the mock backend so `i hello` doesn't hard-fail
-		// for a brand-new install — instead the mock returns an honest
-		// "the local model isn't installed yet" response.
-		host, port, err := resolveLocalDaemonEndpoint(cfg)
+	case "llama-cli", "llamafile-local":
+		// Local inference runs llama.cpp. Preferred: a request-scoped
+		// `llama-server` child held warm for the whole invocation —
+		// native multi-turn messages (no flattening) and no per-step
+		// model reload across the tool-call loop. Fallback: one-shot
+		// `llama-cli` when the server binary isn't present. If neither
+		// the runtime nor the model is installed, fall back to the mock
+		// so `i hello` doesn't hard-fail for a brand-new install —
+		// ensureBackendReady / `i doctor` guide the fix.
+		// ("llamafile-local" is kept as a back-compat alias for configs
+		// written before the switch to llama.cpp.)
+		dirs, err := state.Resolve()
 		if err != nil {
 			return nil, false, err
 		}
-		endpoint := fmt.Sprintf("http://%s:%s", host, port)
-		if !endpointReachable(endpoint) {
+		rt := intentruntime.New(dirs.Cache)
+		modelPath := rt.ModelPath(selectedModelFile(dirs.State, cfg))
+		if !fileExists(modelPath) {
 			return mock.New(), true, nil
 		}
-		b := llamafile.New(endpoint)
+		tag := cfg.Model
 		if modelOverride != "" {
-			b.ModelTag = modelOverride
-		} else {
-			b.ModelTag = cfg.Model
+			tag = modelOverride
+		}
+		ctxTokens := 0
+		if m := loadCatalog(dirs.State).Get(cfg.Model); m != nil {
+			ctxTokens = m.ContextTokens
+		}
+		switch {
+		case rt.HaveLlamaServer():
+			b := llamaserver.New(rt.LlamaServerPath(), modelPath)
+			b.ModelTag = tag
+			b.ContextSize = ctxTokens
+			return b, false, nil
+		case rt.HaveLlamaCLI():
+			b := llamacli.New(rt.LlamaCLIPath(), modelPath)
+			b.ModelTag = tag
+			b.ContextSize = ctxTokens
+			return b, false, nil
+		default:
+			return mock.New(), true, nil
 		}
-		return b, false, nil
 	case "llamafile-network":
 		ep := os.Getenv("INTENT_LLAMAFILE_ENDPOINT")
 		if ep == "" {
@@ -119,6 +142,16 @@ func buildBackendCtx(ctx context.Context, name string, cfg *config.Config, model
 			l.KV("endpoint", b.Endpoint)
 			l.KV("model_tag", b.ModelTag)
 		}
+		if b, ok := be.(*llamacli.Backend); ok {
+			l.KV("binary", b.BinaryPath)
+			l.KV("model_path", b.ModelPath)
+			l.KV("model_tag", b.ModelTag)
+		}
+		if b, ok := be.(*llamaserver.Backend); ok {
+			l.KV("binary", b.BinaryPath)
+			l.KV("model_path", b.ModelPath)
+			l.KV("model_tag", b.ModelTag)
+		}
 		be = verbose.Backend(l, be)
 	}
 	return be, fb, nil
@@ -131,7 +164,21 @@ func printMockFallbackBanner(isFallback bool) {
 	if !isFallback {
 		return
 	}
-	fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor', 'i model list', or 'i daemon start' to fix.")
+	fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor' or 'i model pull' to fix.")
+}
+
+// isLocalBackend reports whether name selects the local llama.cpp backend
+// (and therefore wants the runtime/model self-healing in
+// ensureBackendReady). The empty string means "use the configured
+// default", which is llama-cli. "llamafile-local" is the back-compat
+// alias for configs written before the move to llama.cpp.
+func isLocalBackend(name string) bool {
+	switch name {
+	case "", "llama-cli", "llamafile-local":
+		return true
+	default:
+		return false
+	}
 }
 
 // isMockBackend reports whether b is the mock backend (by name).
@@ -140,28 +187,18 @@ func isMockBackend(b model.Backend) bool {
 	return b.Name() == "mock"
 }
 
-// endpointReachable does a short-timeout TCP check on the host:port of a URL.
-func endpointReachable(rawURL string) bool {
-	u, err := url.Parse(rawURL)
-	if err != nil {
-		return false
-	}
-	host := u.Host
-	if host == "" {
-		return false
-	}
-	if !strings.Contains(host, ":") {
-		switch u.Scheme {
-		case "https":
-			host += ":443"
-		default:
-			host += ":80"
-		}
-	}
-	c, err := net.DialTimeout("tcp", host, 200*time.Millisecond)
-	if err != nil {
-		return false
+// fileExists reports whether os.Stat succeeds on path and it is not a directory.
+func fileExists(path string) bool {
+	info, err := os.Stat(path)
+	return err == nil && !info.IsDir()
+}
+
+// closeBackend tears down any resources a backend holds — notably the
+// llama-server co-process, which must be killed when the invocation ends.
+// Safe to defer on every backend; a no-op for stateless ones. The verbose
+// wrapper forwards Close to its inner backend.
+func closeBackend(be model.Backend) {
+	if c, ok := be.(io.Closer); ok {
+		_ = c.Close()
 	}
-	_ = c.Close()
-	return true
 }
diff --git a/internal/cli/backend_test.go b/internal/cli/backend_test.go
@@ -40,39 +40,26 @@ func TestBuildBackend_MockIsNotFallback(t *testing.T) {
 	}
 }
 
-func TestBuildBackend_LlamafileLocalFallsBackWhenUnreachable(t *testing.T) {
+// When llama-cli or the model isn't installed, the local backend falls
+// back to mock so a fresh install doesn't hard-fail. We point the cache
+// at an empty temp dir so the model file is guaranteed absent.
+func TestBuildBackend_LlamaCLILocalFallsBackWhenNotInstalled(t *testing.T) {
 	clearBackendEnv(t)
-	// Point the daemon at a port that is definitely not listening.
-	cfg := minimalConfig()
-	cfg.Raw["daemon.host"] = "127.0.0.1"
-	cfg.Raw["daemon.port"] = "1" // port 1 is reserved; nothing listens there
-
-	be, isFallback, err := buildBackend("llamafile-local", cfg, "")
-	if err != nil {
-		t.Fatalf("unexpected error: %v", err)
-	}
-	if !isFallback {
-		t.Error("unavailable llamafile-local should set isFallback=true")
-	}
-	if be.Name() != "mock" {
-		t.Errorf("expected fallback name %q, got %q", "mock", be.Name())
-	}
-}
-
-func TestBuildBackend_LlamafileLocalRejectsNonLoopbackHost(t *testing.T) {
-	clearBackendEnv(t)
-	cfg := minimalConfig()
-	cfg.Raw["daemon.host"] = "0.0.0.0"
-
-	_, isFallback, err := buildBackend("llamafile-local", cfg, "")
-	if err == nil {
-		t.Fatal("expected error for non-loopback daemon host, got nil")
-	}
-	if isFallback {
-		t.Fatal("invalid daemon host should not silently fall back to mock")
-	}
-	if !strings.Contains(err.Error(), "loopback only") {
-		t.Fatalf("error = %q, want loopback hint", err)
+	t.Setenv("HOME", t.TempDir())
+	t.Setenv("INTENT_STATE_DIR", t.TempDir())
+	t.Setenv("INTENT_CACHE_DIR", t.TempDir())
+
+	for _, name := range []string{"llama-cli", "llamafile-local"} {
+		be, isFallback, err := buildBackend(name, minimalConfig(), "")
+		if err != nil {
+			t.Fatalf("%s: unexpected error: %v", name, err)
+		}
+		if !isFallback {
+			t.Errorf("%s: uninstalled local backend should set isFallback=true", name)
+		}
+		if be.Name() != "mock" {
+			t.Errorf("%s: expected fallback name %q, got %q", name, "mock", be.Name())
+		}
 	}
 }
 
@@ -155,7 +142,7 @@ func TestPrintMockFallbackBanner_MentionsNextSteps(t *testing.T) {
 	io.Copy(&buf, r)
 	out := buf.String()
 
-	for _, hint := range []string{"i doctor", "i daemon start"} {
+	for _, hint := range []string{"i doctor", "i model pull"} {
 		if !strings.Contains(out, hint) {
 			t.Errorf("banner should mention %q; got: %q", hint, out)
 		}

diff --git a/internal/cli/cli.go b/internal/cli/cli.go
@@ -23,7 +23,6 @@ var knownSubcommands = map[string]commandHandler{
 	"doctor":     cmdDoctor,
 	"config":     cmdConfig,
 	"model":      cmdModel,
-	"daemon":     cmdDaemon,
 	"history":    cmdHistory,
 	"pin":        cmdPin,
 	"run":        cmdRun,
@@ -163,12 +162,11 @@ Tip:
   double quotes for reliable shell parsing across environments.
 
 Subcommands:
-  init        First-run setup (model, daemon, completions).
+  init        First-run setup (model, runtime, completions).
   shell-init  Print shell snippet to source for natural-language quoting.
-  doctor      Diagnose installation, model, daemon, sandbox.
+  doctor      Diagnose installation, runtime, model, sandbox.
   config      Get/set/edit configuration.
   model       Manage local models.
-  daemon      Start/stop/status the background daemon.
   history     Inspect or clear the audit log.
   pin         Promote the last accepted command to a named skill.
   run         Run a pinned skill by name.
@@ -202,7 +200,7 @@ Top-level:
   --help, -h       This help.
   -v, --verbose    Log model I/O, tool calls, and gh round-trips to stderr.
                    (also enabled by INTENT_VERBOSE=1)
-  --uninstall      Remove binary, daemon, and (with consent) state.
+  --uninstall      Remove binary and (with consent) state.
   --update         Equivalent to "update".
 
 Read INTENT.md and docs/SPEC.md before contributing.

diff --git a/internal/cli/config.go b/internal/cli/config.go
@@ -86,14 +86,14 @@ func cmdConfig(_ context.Context, args []string) int {
 	}
 }
 
+// validateConfigValue is a hook for per-key validation on `i config set`.
+// Local inference no longer has a configurable daemon endpoint (llama.cpp
+// runs as a child process), so there are currently no keys that need
+// rejecting; the function stays as the extension point.
 func validateConfigValue(key, value string) error {
-	switch key {
-	case "daemon.host":
-		_, err := normalizeLocalDaemonHost(value)
-		return err
-	default:
-		return nil
-	}
+	_ = key
+	_ = value
+	return nil
 }
 
 func lookupKnown(c *config.Config, key string) string {