diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index d32c397..5c969d8 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -294,7 +294,7 @@ jobs: def caveats <<~EOS - Run first-run setup to download the local model and start the daemon: + Run first-run setup to install the runtime and download the local model: i init For zsh users: install the shell hook so prompts containing ? * [ ] diff --git a/README.md b/README.md index 163cdd2..e38052f 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ It is **local-first** by default (no network required after first run, no prompt That composability applies to subcommands that consume natural language too: `i report "first problem" < extra-notes.txt` appends the piped text after the command-line text before proposing issues. -> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime, daemon, and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap. +> **Status: pre-alpha.** The binary builds and the mock backend round-trips the full prompt → propose → confirm → run loop, but the local model runtime (llama.cpp's `llama-cli`) and self-update flows are still being wired up. See [`INTENT.md`](./INTENT.md) for the full project charter, [`docs/SPEC.md`](./docs/SPEC.md) for the implementation contract, and [open issues](https://github.com/CoreyRDean/intent/issues) for the roadmap. ## Building from source @@ -125,7 +125,7 @@ With `--literal`, everything after the flag is treated as natural-language promp ## Managing models -intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llamafile](https://github.com/mozilla-ai/llamafile). You can also point it at any public Hugging Face GGUF repo. +intent ships with a curated catalog of small-to-medium GGUF models that run locally via [llama.cpp](https://github.com/ggml-org/llama.cpp), which intent installs on demand through your system package manager (Homebrew, apt, dnf, …). Each `i` invocation starts a request-scoped `llama-server` child, holds the model warm for that one request (so multi-step tool calls don't reload it), and kills it on exit — there is **no background daemon**. If `llama-server` isn't available it falls back to one-shot `llama-cli`. You can also point it at any public Hugging Face GGUF repo. ```sh # See what's on offer and which one is current. diff --git a/install.sh b/install.sh index 12cf36c..f582a74 100755 --- a/install.sh +++ b/install.sh @@ -11,7 +11,7 @@ # PREFIX install root (default /usr/local; needs sudo if not writable) # INTENT_TMPDIR where to stage downloads (default $TMPDIR or /tmp) # -# This script does not auto-install the daemon, the model runtime, or the +# This script does not auto-install the model runtime (llama-cli) or the # model. Run `i init` after install. set -Eeuo pipefail @@ -136,8 +136,8 @@ echo # Auto-run `intent init` if we have a real TTY on stdin/stderr. Without # this, users who curl|bash and ignore the next-steps text get a binary -# they can't actually use until they read the docs. With it, the model -# downloads and the daemon starts as part of the install flow. +# they can't actually use until they read the docs. With it, the runtime +# installs and the model downloads as part of the install flow. # # We skip it under `bash -c` / `curl | bash` (no TTY) so non-interactive # CI jobs aren't surprised by a 4 GB download. diff --git a/internal/cli/backend.go b/internal/cli/backend.go index cc27bc6..92cc682 100644 --- a/internal/cli/backend.go +++ b/internal/cli/backend.go @@ -3,16 +3,17 @@ package cli import ( "context" "fmt" - "net" - "net/url" + "io" "os" - "strings" - "time" "github.com/CoreyRDean/intent/internal/config" "github.com/CoreyRDean/intent/internal/model" + "github.com/CoreyRDean/intent/internal/model/llamacli" "github.com/CoreyRDean/intent/internal/model/llamafile" + "github.com/CoreyRDean/intent/internal/model/llamaserver" "github.com/CoreyRDean/intent/internal/model/mock" + intentruntime "github.com/CoreyRDean/intent/internal/runtime" + "github.com/CoreyRDean/intent/internal/state" "github.com/CoreyRDean/intent/internal/verbose" ) @@ -21,10 +22,11 @@ import ( // unavailable and we silently fell back to the mock — callers use this to // surface a per-invocation warning so users aren't left confused. // -// In v1 we wire: mock, llamafile-local, llamafile-network, ollama (as a -// llamafile-shaped HTTP), openai (as a llamafile-shaped HTTP). The grammar -// constraint is the same across all of them; the only differences are the -// endpoint and the auth header. +// Backends: mock; llama-cli (local one-shot llama.cpp subprocess, also +// reachable under the legacy alias "llamafile-local"); llamafile-network, +// ollama, and openai (all OpenAI-compatible HTTP). The JSON-schema grammar +// constraint is the same across all of them; they differ only in transport +// (local subprocess vs. HTTP endpoint + auth header). func buildBackend(name string, cfg *config.Config, modelOverride string) (model.Backend, bool, error) { if v := os.Getenv("INTENT_FORCE_BACKEND"); v != "" { name = v @@ -32,27 +34,48 @@ func buildBackend(name string, cfg *config.Config, modelOverride string) (model. switch name { case "mock": return mock.New(), false, nil - case "llamafile-local": - // We expect the daemon (`intentd`) to have started llamafile on - // the loopback host:port from config. If nothing's listening, we - // fall back to the mock backend so `i hello` doesn't hard-fail - // for a brand-new install — instead the mock returns an honest - // "the local model isn't installed yet" response. - host, port, err := resolveLocalDaemonEndpoint(cfg) + case "llama-cli", "llamafile-local": + // Local inference runs llama.cpp. Preferred: a request-scoped + // `llama-server` child held warm for the whole invocation — + // native multi-turn messages (no flattening) and no per-step + // model reload across the tool-call loop. Fallback: one-shot + // `llama-cli` when the server binary isn't present. If neither + // the runtime nor the model is installed, fall back to the mock + // so `i hello` doesn't hard-fail for a brand-new install — + // ensureBackendReady / `i doctor` guide the fix. + // ("llamafile-local" is kept as a back-compat alias for configs + // written before the switch to llama.cpp.) + dirs, err := state.Resolve() if err != nil { return nil, false, err } - endpoint := fmt.Sprintf("http://%s:%s", host, port) - if !endpointReachable(endpoint) { + rt := intentruntime.New(dirs.Cache) + modelPath := rt.ModelPath(selectedModelFile(dirs.State, cfg)) + if !fileExists(modelPath) { return mock.New(), true, nil } - b := llamafile.New(endpoint) + tag := cfg.Model if modelOverride != "" { - b.ModelTag = modelOverride - } else { - b.ModelTag = cfg.Model + tag = modelOverride + } + ctxTokens := 0 + if m := loadCatalog(dirs.State).Get(cfg.Model); m != nil { + ctxTokens = m.ContextTokens + } + switch { + case rt.HaveLlamaServer(): + b := llamaserver.New(rt.LlamaServerPath(), modelPath) + b.ModelTag = tag + b.ContextSize = ctxTokens + return b, false, nil + case rt.HaveLlamaCLI(): + b := llamacli.New(rt.LlamaCLIPath(), modelPath) + b.ModelTag = tag + b.ContextSize = ctxTokens + return b, false, nil + default: + return mock.New(), true, nil } - return b, false, nil case "llamafile-network": ep := os.Getenv("INTENT_LLAMAFILE_ENDPOINT") if ep == "" { @@ -119,6 +142,16 @@ func buildBackendCtx(ctx context.Context, name string, cfg *config.Config, model l.KV("endpoint", b.Endpoint) l.KV("model_tag", b.ModelTag) } + if b, ok := be.(*llamacli.Backend); ok { + l.KV("binary", b.BinaryPath) + l.KV("model_path", b.ModelPath) + l.KV("model_tag", b.ModelTag) + } + if b, ok := be.(*llamaserver.Backend); ok { + l.KV("binary", b.BinaryPath) + l.KV("model_path", b.ModelPath) + l.KV("model_tag", b.ModelTag) + } be = verbose.Backend(l, be) } return be, fb, nil @@ -131,7 +164,21 @@ func printMockFallbackBanner(isFallback bool) { if !isFallback { return } - fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor', 'i model list', or 'i daemon start' to fix.") + fmt.Fprintln(os.Stderr, "[MOCK] real backend unavailable — responses are simulated. Run 'i doctor' or 'i model pull' to fix.") +} + +// isLocalBackend reports whether a backend name uses the local llama-cli +// runtime (and therefore wants the runtime/model self-healing in +// ensureBackendReady). The empty string means "use the configured +// default", which is llama-cli. "llamafile-local" is the back-compat +// alias for configs predating the switch. +func isLocalBackend(name string) bool { + switch name { + case "", "llama-cli", "llamafile-local": + return true + default: + return false + } } // isMockBackend reports whether b is the mock backend (by name). @@ -140,28 +187,18 @@ func isMockBackend(b model.Backend) bool { return b.Name() == "mock" } -// endpointReachable does a short-timeout TCP check on the host:port of a URL. -func endpointReachable(rawURL string) bool { - u, err := url.Parse(rawURL) - if err != nil { - return false - } - host := u.Host - if host == "" { - return false - } - if !strings.Contains(host, ":") { - switch u.Scheme { - case "https": - host += ":443" - default: - host += ":80" - } - } - c, err := net.DialTimeout("tcp", host, 200*time.Millisecond) - if err != nil { - return false +// fileExists reports whether path exists and is a regular file. +func fileExists(path string) bool { + info, err := os.Stat(path) + return err == nil && !info.IsDir() +} + +// closeBackend tears down any resources a backend holds — notably the +// llama-server co-process, which must be killed when the invocation ends. +// Safe to defer on every backend; a no-op for stateless ones. The verbose +// wrapper forwards Close to its inner backend. +func closeBackend(be model.Backend) { + if c, ok := be.(io.Closer); ok { + _ = c.Close() } - _ = c.Close() - return true } diff --git a/internal/cli/backend_test.go b/internal/cli/backend_test.go index b58f9b0..faef6ed 100644 --- a/internal/cli/backend_test.go +++ b/internal/cli/backend_test.go @@ -40,39 +40,26 @@ func TestBuildBackend_MockIsNotFallback(t *testing.T) { } } -func TestBuildBackend_LlamafileLocalFallsBackWhenUnreachable(t *testing.T) { +// When llama-cli or the model isn't installed, the local backend falls +// back to mock so a fresh install doesn't hard-fail. We point the cache +// at an empty temp dir so the model file is guaranteed absent. +func TestBuildBackend_LlamaCLILocalFallsBackWhenNotInstalled(t *testing.T) { clearBackendEnv(t) - // Point the daemon at a port that is definitely not listening. - cfg := minimalConfig() - cfg.Raw["daemon.host"] = "127.0.0.1" - cfg.Raw["daemon.port"] = "1" // port 1 is reserved; nothing listens there - - be, isFallback, err := buildBackend("llamafile-local", cfg, "") - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if !isFallback { - t.Error("unavailable llamafile-local should set isFallback=true") - } - if be.Name() != "mock" { - t.Errorf("expected fallback name %q, got %q", "mock", be.Name()) - } -} - -func TestBuildBackend_LlamafileLocalRejectsNonLoopbackHost(t *testing.T) { - clearBackendEnv(t) - cfg := minimalConfig() - cfg.Raw["daemon.host"] = "0.0.0.0" - - _, isFallback, err := buildBackend("llamafile-local", cfg, "") - if err == nil { - t.Fatal("expected error for non-loopback daemon host, got nil") - } - if isFallback { - t.Fatal("invalid daemon host should not silently fall back to mock") - } - if !strings.Contains(err.Error(), "loopback only") { - t.Fatalf("error = %q, want loopback hint", err) + t.Setenv("HOME", t.TempDir()) + t.Setenv("INTENT_STATE_DIR", t.TempDir()) + t.Setenv("INTENT_CACHE_DIR", t.TempDir()) + + for _, name := range []string{"llama-cli", "llamafile-local"} { + be, isFallback, err := buildBackend(name, minimalConfig(), "") + if err != nil { + t.Fatalf("%s: unexpected error: %v", name, err) + } + if !isFallback { + t.Errorf("%s: uninstalled local backend should set isFallback=true", name) + } + if be.Name() != "mock" { + t.Errorf("%s: expected fallback name %q, got %q", name, "mock", be.Name()) + } } } @@ -155,7 +142,7 @@ func TestPrintMockFallbackBanner_MentionsNextSteps(t *testing.T) { io.Copy(&buf, r) out := buf.String() - for _, hint := range []string{"i doctor", "i daemon start"} { + for _, hint := range []string{"i doctor", "i model pull"} { if !strings.Contains(out, hint) { t.Errorf("banner should mention %q; got: %q", hint, out) } diff --git a/internal/cli/cli.go b/internal/cli/cli.go index 64701e9..e53543c 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -23,7 +23,6 @@ var knownSubcommands = map[string]commandHandler{ "doctor": cmdDoctor, "config": cmdConfig, "model": cmdModel, - "daemon": cmdDaemon, "history": cmdHistory, "pin": cmdPin, "run": cmdRun, @@ -163,12 +162,11 @@ Tip: double quotes for reliable shell parsing across environments. Subcommands: - init First-run setup (model, daemon, completions). + init First-run setup (model, runtime, completions). shell-init Print shell snippet to source for natural-language quoting. - doctor Diagnose installation, model, daemon, sandbox. + doctor Diagnose installation, runtime, model, sandbox. config Get/set/edit configuration. model Manage local models. - daemon Start/stop/status the background daemon. history Inspect or clear the audit log. pin Promote the last accepted command to a named skill. run Run a pinned skill by name. @@ -202,7 +200,7 @@ Top-level: --help, -h This help. -v, --verbose Log model I/O, tool calls, and gh round-trips to stderr. (also enabled by INTENT_VERBOSE=1) - --uninstall Remove binary, daemon, and (with consent) state. + --uninstall Remove binary and (with consent) state. --update Equivalent to "update". Read INTENT.md and docs/SPEC.md before contributing. diff --git a/internal/cli/config.go b/internal/cli/config.go index 8592dfa..8e8efac 100644 --- a/internal/cli/config.go +++ b/internal/cli/config.go @@ -86,14 +86,14 @@ func cmdConfig(_ context.Context, args []string) int { } } +// validateConfigValue is a hook for per-key validation on `i config set`. +// Local inference no longer binds a network socket (llama-cli runs as a +// subprocess), so there are currently no keys that need rejecting; the +// function stays as the extension point. func validateConfigValue(key, value string) error { - switch key { - case "daemon.host": - _, err := normalizeLocalDaemonHost(value) - return err - default: - return nil - } + _ = key + _ = value + return nil } func lookupKnown(c *config.Config, key string) string { diff --git a/internal/cli/daemon.go b/internal/cli/daemon.go deleted file mode 100644 index 1b4da4c..0000000 --- a/internal/cli/daemon.go +++ /dev/null @@ -1,363 +0,0 @@ -package cli - -import ( - "context" - "fmt" - "io" - "os" - "os/exec" - "os/signal" - "path/filepath" - "runtime" - "syscall" - "time" - - "github.com/CoreyRDean/intent/internal/config" - "github.com/CoreyRDean/intent/internal/daemon" - "github.com/CoreyRDean/intent/internal/models" - intentruntime "github.com/CoreyRDean/intent/internal/runtime" - "github.com/CoreyRDean/intent/internal/state" -) - -const daemonUsage = "usage: i daemon (start | stop | status | logs | install | uninstall)" - -// daemonLabel is the launchd / systemd unit name. Stable across versions. -const daemonLabel = "com.coreyrdean.intent" - -func cmdDaemon(ctx context.Context, args []string) int { - if len(args) == 0 { - errf(daemonUsage) - return 1 - } - dirs, err := state.Resolve() - if err != nil { - errf("daemon: %v", err) - return 3 - } - cfg, _ := config.Load(dirs.ConfigPath()) - - switch args[0] { - case "--help", "-h", "help": - fmt.Println(daemonUsage) - return 0 - case "start": - return daemonStart(ctx, dirs, cfg, args[1:]) - case "stop": - return daemonStop(dirs) - case "status": - return daemonStatus(dirs) - case "logs": - return daemonLogs(dirs) - case "install": - return daemonInstall(dirs) - case "uninstall": - return daemonUninstall(dirs) - default: - errf("unknown subcommand: %q", args[0]) - return 1 - } -} - -// daemonStart is the user-visible `i daemon start`. By default it spawns -// itself in the background (re-execs with --foreground), waits for the -// control socket to come up, prints a one-line "started" message, and -// returns — so the user gets their prompt back in well under a second. -// -// `--foreground` (or `--attach`) keeps the process attached to the -// terminal, which is what launchd / systemd want and what `i daemon -// logs -f` style debugging needs. The env var INTENTD_FOREGROUND is -// the same switch in env form, so service files don't have to know -// about the flag. -func daemonStart(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []string) int { - foreground := os.Getenv("INTENTD_FOREGROUND") == "1" - for _, a := range args { - switch a { - case "--foreground", "--attach", "-f": - foreground = true - case "--background", "-b": - foreground = false - } - } - if !foreground { - return daemonSpawnDetached(dirs) - } - return daemonRunForeground(ctx, dirs, cfg) -} - -// daemonSpawnDetached re-execs ourselves with --foreground, redirects -// the child's stdio to a log file, decouples it from our process group -// (Setsid), and returns once the control socket is responsive — or -// after a sane timeout, with the log path so the user can inspect a -// failure. -func daemonSpawnDetached(dirs state.Dirs) int { - if err := os.MkdirAll(filepath.Join(dirs.State, "logs"), 0o700); err != nil { - errf("daemon start: %v", err) - return 3 - } - logPath := filepath.Join(dirs.State, "logs", "intentd.log") - logF, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600) - if err != nil { - errf("daemon start: open log %s: %v", logPath, err) - return 3 - } - defer logF.Close() - - self, err := os.Executable() - if err != nil { - errf("daemon start: locate self: %v", err) - return 3 - } - cmd := exec.Command(self, "daemon", "start", "--foreground") - cmd.Env = append(os.Environ(), "INTENTD_FOREGROUND=1") - cmd.Stdout = logF - cmd.Stderr = logF - cmd.Stdin = nil - cmd.SysProcAttr = &syscall.SysProcAttr{Setsid: true} - if err := cmd.Start(); err != nil { - errf("daemon start: spawn: %v", err) - return 3 - } - // Don't wait for it — we want it to outlive us. - go func() { _ = cmd.Process.Release() }() - - // Poll the control socket for readiness. The child has 30s to come - // up before we report failure; on a cold cache that's mostly - // llamafile loading the model. - deadline := time.Now().Add(30 * time.Second) - c := daemon.NewClient(dirs.SocketPath()) - for time.Now().Before(deadline) { - if resp, err := c.Call(daemon.Request{Op: daemon.OpPing}); err == nil && resp.OK { - fmt.Fprintln(os.Stderr, "intentd: started in the background.") - fmt.Fprintf(os.Stderr, " socket: %s\n", dirs.SocketPath()) - fmt.Fprintf(os.Stderr, " log: %s\n", logPath) - return 0 - } - time.Sleep(250 * time.Millisecond) - } - errf("daemon start: timed out waiting for control socket; tail -f %s", logPath) - return 3 -} - -func daemonRunForeground(ctx context.Context, dirs state.Dirs, cfg *config.Config) int { - mgr := intentruntime.New(dirs.Cache) - if !mgr.HaveLlamafile() { - errf("daemon: llamafile runtime missing — run `i model pull` first") - errf(" expected: %s", mgr.LlamafilePath()) - return 3 - } - // Resolve the model through the full catalog (built-in + custom) - // so the daemon loads exactly what `i model use` selected, even - // for user-added HF repos that aren't in the built-in list. - cat := loadCatalog(dirs.State) - id := cfg.Model - if id == "" { - id = models.DefaultID - } - host, port, err := resolveLocalDaemonEndpoint(cfg) - if err != nil { - errf("daemon: %v", err) - return 1 - } - m := cat.Get(id) - if m == nil { - errf("daemon: current model %q not in catalog; run `i model list` and `i model use `", id) - return 1 - } - modelPath := mgr.ModelPath(models.ModelFilename(m)) - if _, err := os.Stat(modelPath); err != nil { - errf("daemon: model %q not installed — run `i model pull %s`", id, id) - errf(" expected: %s", modelPath) - return 3 - } - - logDir := filepath.Join(dirs.State, "logs") - if err := os.MkdirAll(logDir, 0o700); err != nil { - errf("daemon: mkdir log dir: %v", err) - return 3 - } - logPath := filepath.Join(logDir, "llamafile.log") - logF, err := os.OpenFile(logPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600) - if err != nil { - errf("daemon: open log: %v", err) - return 3 - } - defer logF.Close() - - portNum := 18080 - fmt.Sscanf(port, "%d", &portNum) - - launcher := daemon.NewLauncher(mgr.LlamafilePath(), modelPath, host, portNum) - launcher.StdoutLog = logF - launcher.StderrLog = io.MultiWriter(logF, os.Stderr) - - startCtx, cancelStart := context.WithTimeout(ctx, 90*time.Second) - fmt.Fprintln(os.Stderr, "intentd: starting llamafile...") - if err := launcher.Start(startCtx); err != nil { - cancelStart() - errf("daemon: start llamafile: %v", err) - return 3 - } - cancelStart() - fmt.Fprintf(os.Stderr, "intentd: llamafile ready on %s (pid %d)\n", - launcher.Endpoint(), launcher.PID()) - - srv := daemon.New(dirs.SocketPath(), launcher) - if err := srv.Listen(); err != nil { - launcher.Stop(5 * time.Second) - errf("daemon: listen: %v", err) - return 3 - } - fmt.Fprintf(os.Stderr, "intentd: control socket at %s\n", dirs.SocketPath()) - - sigCtx, cancelSig := signal.NotifyContext(ctx, os.Interrupt, syscall.SIGTERM, syscall.SIGHUP) - defer cancelSig() - serveDone := make(chan struct{}) - go func() { - _ = srv.Serve(sigCtx) - close(serveDone) - }() - - // Block until any of: an OS signal, an `i daemon stop` over the - // socket, the parent context is canceled, or the supervised - // llamafile gives up entirely (Wait drains). - llamaDone := make(chan struct{}) - go func() { launcher.Wait(); close(llamaDone) }() - select { - case <-sigCtx.Done(): - case <-srv.Stopped(): - case <-serveDone: - case <-llamaDone: - } - fmt.Fprintln(os.Stderr, "intentd: shutting down...") - srv.SignalStop() - launcher.Stop(10 * time.Second) - fmt.Fprintln(os.Stderr, "intentd: stopped.") - return 0 -} - -func daemonStop(dirs state.Dirs) int { - c := daemon.NewClient(dirs.SocketPath()) - resp, err := c.Call(daemon.Request{Op: daemon.OpStop}) - if err != nil { - errf("daemon stop: %v (is the daemon running?)", err) - return 1 - } - if !resp.OK { - errf("daemon stop: %s", resp.Error) - return 1 - } - fmt.Println("daemon: stop requested") - return 0 -} - -func daemonStatus(dirs state.Dirs) int { - c := daemon.NewClient(dirs.SocketPath()) - resp, err := c.Call(daemon.Request{Op: daemon.OpStatus}) - if err != nil { - fmt.Println("daemon: not running") - fmt.Println(" socket:", dirs.SocketPath()) - fmt.Println(" installed as service:", daemon.IsInstalled(daemonLabel)) - return 1 - } - if !resp.OK { - errf("daemon status: %s", resp.Error) - return 1 - } - fmt.Println("daemon: running") - for k, v := range resp.Data { - fmt.Printf(" %s: %v\n", k, v) - } - return 0 -} - -func daemonLogs(dirs state.Dirs) int { - logPath := filepath.Join(dirs.State, "logs", "llamafile.log") - if runtime.GOOS == "linux" && daemon.IsInstalled(daemonLabel) { - fmt.Fprintln(os.Stderr, "Tip: run `journalctl --user -u "+daemonLabel+".service -f` for the systemd-managed log.") - fmt.Fprintln(os.Stderr, "Showing the llamafile subprocess log:", logPath) - } - f, err := os.Open(logPath) - if err != nil { - errf("logs: %v", err) - return 1 - } - defer f.Close() - if _, err := io.Copy(os.Stdout, f); err != nil { - errf("logs: %v", err) - return 1 - } - return 0 -} - -func daemonInstall(dirs state.Dirs) int { - bin, err := os.Executable() - if err != nil { - errf("daemon install: locate self: %v", err) - return 3 - } - bin, _ = filepath.EvalSymlinks(bin) - res, err := daemon.Install(daemon.InstallParams{ - Binary: bin, - Label: daemonLabel, - LogDir: filepath.Join(dirs.State, "logs"), - Socket: dirs.SocketPath(), - Cache: dirs.Cache, - State: dirs.State, - }) - if err != nil { - errf("daemon install: %v", err) - return 3 - } - fmt.Println("daemon installed as a system service.") - fmt.Println(" unit: ", res.UnitPath) - fmt.Println(" start: ", strJoin(res.StartCmd)) - fmt.Println(" stop: ", strJoin(res.StopCmd)) - if res.LogPath != "" { - fmt.Println(" log: ", res.LogPath) - } - if res.Notes != "" { - fmt.Println() - fmt.Println(res.Notes) - } - return 0 -} - -func daemonUninstall(dirs state.Dirs) int { - // Try a polite stop first. - c := daemon.NewClient(dirs.SocketPath()) - _, _ = c.Call(daemon.Request{Op: daemon.OpStop}) - if err := daemon.Uninstall(daemonLabel); err != nil { - errf("daemon uninstall: %v", err) - return 3 - } - fmt.Println("daemon: service uninstalled.") - return 0 -} - -func strJoin(parts []string) string { - if len(parts) == 0 { - return "(none)" - } - out := "" - for i, p := range parts { - if i > 0 { - out += " " - } - out += p - } - return out -} - -// modelFileFor maps a config model tag (e.g. "qwen2.5-coder-7b-instruct-q4_k_m") -// to the GGUF filename we expect on disk. v1 is hard-coded to one default; -// future versions consult a model registry. -func modelFileFor(tag string) string { - if tag == intentruntime.DefaultModel.Name { - return intentruntime.DefaultModel.File - } - // Best-effort: assume tag + ".gguf". - if filepath.Ext(tag) == ".gguf" { - return tag - } - return tag + ".gguf" -} diff --git a/internal/cli/daemon_host.go b/internal/cli/daemon_host.go deleted file mode 100644 index 871faae..0000000 --- a/internal/cli/daemon_host.go +++ /dev/null @@ -1,57 +0,0 @@ -package cli - -import ( - "fmt" - "net" - "strings" - - "github.com/CoreyRDean/intent/internal/config" -) - -const defaultLocalDaemonHost = "127.0.0.1" -const defaultLocalDaemonPort = "18080" - -// normalizeLocalDaemonHost accepts only loopback hosts for the local daemon. -// Any accepted value is canonicalized to 127.0.0.1 so the local backend never -// accidentally exposes the model server on a broader interface. -func normalizeLocalDaemonHost(raw string) (string, error) { - host := strings.TrimSpace(raw) - if host == "" { - return defaultLocalDaemonHost, nil - } - if strings.HasPrefix(host, "[") && strings.HasSuffix(host, "]") { - host = strings.TrimSuffix(strings.TrimPrefix(host, "["), "]") - } - if strings.EqualFold(host, "localhost") { - return defaultLocalDaemonHost, nil - } - if ip := net.ParseIP(host); ip != nil && ip.IsLoopback() { - return defaultLocalDaemonHost, nil - } - return "", fmt.Errorf("daemon.host %q must resolve to loopback only", strings.TrimSpace(raw)) -} - -func resolveLocalDaemonHost(cfg *config.Config) (string, error) { - if cfg == nil { - return normalizeLocalDaemonHost("") - } - return normalizeLocalDaemonHost(cfg.Raw["daemon.host"]) -} - -func resolveLocalDaemonPort(cfg *config.Config) string { - if cfg == nil { - return defaultLocalDaemonPort - } - if port := strings.TrimSpace(cfg.Raw["daemon.port"]); port != "" { - return port - } - return defaultLocalDaemonPort -} - -func resolveLocalDaemonEndpoint(cfg *config.Config) (host, port string, err error) { - host, err = resolveLocalDaemonHost(cfg) - if err != nil { - return "", "", err - } - return host, resolveLocalDaemonPort(cfg), nil -} diff --git a/internal/cli/daemon_host_test.go b/internal/cli/daemon_host_test.go deleted file mode 100644 index 22d7ade..0000000 --- a/internal/cli/daemon_host_test.go +++ /dev/null @@ -1,75 +0,0 @@ -package cli - -import ( - "strings" - "testing" - - "github.com/CoreyRDean/intent/internal/config" -) - -func TestNormalizeLocalDaemonHost(t *testing.T) { - tests := []struct { - name string - raw string - want string - wantErr string - }{ - {name: "default empty host", raw: "", want: "127.0.0.1"}, - {name: "localhost", raw: "localhost", want: "127.0.0.1"}, - {name: "ipv4 loopback", raw: "127.0.0.1", want: "127.0.0.1"}, - {name: "ipv6 loopback", raw: "::1", want: "127.0.0.1"}, - {name: "bracketed ipv6 loopback", raw: "[::1]", want: "127.0.0.1"}, - {name: "non-loopback wildcard rejected", raw: "0.0.0.0", wantErr: "loopback only"}, - {name: "non-loopback ip rejected", raw: "192.168.1.10", wantErr: "loopback only"}, - {name: "hostname rejected", raw: "example.com", wantErr: "loopback only"}, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - got, err := normalizeLocalDaemonHost(tt.raw) - if tt.wantErr != "" { - if err == nil { - t.Fatalf("expected error containing %q, got nil", tt.wantErr) - } - if !strings.Contains(err.Error(), tt.wantErr) { - t.Fatalf("error = %q, want substring %q", err, tt.wantErr) - } - return - } - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if got != tt.want { - t.Fatalf("host = %q, want %q", got, tt.want) - } - }) - } -} - -func TestResolveLocalDaemonEndpoint(t *testing.T) { - cfg := &config.Config{Raw: map[string]string{ - "daemon.host": " localhost ", - "daemon.port": " 19090 ", - }} - - host, port, err := resolveLocalDaemonEndpoint(cfg) - if err != nil { - t.Fatalf("unexpected error: %v", err) - } - if host != "127.0.0.1" { - t.Fatalf("host = %q, want %q", host, "127.0.0.1") - } - if port != "19090" { - t.Fatalf("port = %q, want %q", port, "19090") - } -} - -func TestValidateConfigValueRejectsRemoteDaemonHost(t *testing.T) { - err := validateConfigValue("daemon.host", "0.0.0.0") - if err == nil { - t.Fatal("expected daemon.host validation error, got nil") - } - if !strings.Contains(err.Error(), "loopback only") { - t.Fatalf("error = %q, want loopback hint", err) - } -} diff --git a/internal/cli/doctor.go b/internal/cli/doctor.go index 5d67363..9bc1240 100644 --- a/internal/cli/doctor.go +++ b/internal/cli/doctor.go @@ -7,22 +7,11 @@ import ( "runtime" "github.com/CoreyRDean/intent/internal/config" - "github.com/CoreyRDean/intent/internal/daemon" intentruntime "github.com/CoreyRDean/intent/internal/runtime" "github.com/CoreyRDean/intent/internal/state" "github.com/CoreyRDean/intent/internal/version" ) -type daemonStatusCaller interface { - Call(req daemon.Request) (*daemon.Response, error) -} - -var newDaemonStatusClient = func(socket string) daemonStatusCaller { - return daemon.NewClient(socket) -} - -var daemonServiceInstalled = daemon.IsInstalled - func cmdDoctor(_ context.Context, _ []string) int { ok := true check := func(name, status string, good bool) { @@ -54,15 +43,17 @@ func cmdDoctor(_ context.Context, _ []string) int { } rt := intentruntime.New(dirs.Cache) - check("llamafile runtime", - fmt.Sprintf("expected at %s", rt.LlamafilePath()), - rt.HaveLlamafile()) + runtimeStatus := "missing — run `i model pull` to install via your package manager" + switch { + case rt.HaveLlamaServer(): + runtimeStatus = "found at " + rt.LlamaServerPath() + case rt.HaveLlamaCLI(): + runtimeStatus = "llama-server missing; using one-shot fallback at " + rt.LlamaCLIPath() + } + check("llama.cpp runtime", runtimeStatus, rt.HaveLlamaRuntime()) modelFile, modelStatus := resolveModelCheck(cfg) check("model", fmt.Sprintf("%s — %s", modelStatus, rt.ModelPath(modelFile)), rt.HaveModel(modelFile)) - - daemonStatus, daemonOK := doctorDaemonStatus(dirs) - check("daemon", daemonStatus, daemonOK) } // Sandbox tooling. @@ -107,26 +98,3 @@ func okStr(err error) string { } return "missing" } - -func doctorDaemonStatus(dirs state.Dirs) (string, bool) { - installed := daemonServiceInstalled(daemonLabel) - resp, err := newDaemonStatusClient(dirs.SocketPath()).Call(daemon.Request{Op: daemon.OpStatus}) - if err != nil { - if installed { - return "installed but not responding", false - } - return "not running (optional)", true - } - if !resp.OK { - return "unhealthy: " + resp.Error, false - } - - serviceState := "no" - if installed { - serviceState = "yes" - } - if endpoint, _ := resp.Data["llamafile_endpoint"].(string); endpoint != "" { - return fmt.Sprintf("running (service installed: %s, endpoint: %s)", serviceState, endpoint), true - } - return fmt.Sprintf("running (service installed: %s)", serviceState), true -} diff --git a/internal/cli/doctor_test.go b/internal/cli/doctor_test.go index 80010f1..2f8c9c0 100644 --- a/internal/cli/doctor_test.go +++ b/internal/cli/doctor_test.go @@ -2,14 +2,11 @@ package cli import ( "context" - "errors" "strings" "testing" "github.com/CoreyRDean/intent/internal/config" - "github.com/CoreyRDean/intent/internal/daemon" intentruntime "github.com/CoreyRDean/intent/internal/runtime" - "github.com/CoreyRDean/intent/internal/state" ) func TestResolveModelCheck(t *testing.T) { @@ -58,100 +55,17 @@ func TestResolveModelCheck(t *testing.T) { } } -type stubDaemonStatusClient struct { - resp *daemon.Response - err error -} - -func (s stubDaemonStatusClient) Call(_ daemon.Request) (*daemon.Response, error) { - return s.resp, s.err -} - -func TestDoctorDaemonStatus(t *testing.T) { - origNewClient := newDaemonStatusClient - origInstalled := daemonServiceInstalled - t.Cleanup(func() { - newDaemonStatusClient = origNewClient - daemonServiceInstalled = origInstalled - }) - - dirs := state.Dirs{State: t.TempDir()} - - tests := []struct { - name string - installed bool - client stubDaemonStatusClient - want string - wantOK bool - }{ - { - name: "missing optional daemon is informational", - installed: false, - client: stubDaemonStatusClient{err: errors.New("dial unix: no such file or directory")}, - want: "not running (optional)", - wantOK: true, - }, - { - name: "installed daemon that does not respond is unhealthy", - installed: true, - client: stubDaemonStatusClient{err: errors.New("connection refused")}, - want: "installed but not responding", - wantOK: false, - }, - { - name: "running daemon reports endpoint", - installed: false, - client: stubDaemonStatusClient{resp: &daemon.Response{ - OK: true, - Data: map[string]any{ - "llamafile_endpoint": "http://127.0.0.1:18080", - }, - }}, - want: "running (service installed: no, endpoint: http://127.0.0.1:18080)", - wantOK: true, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - newDaemonStatusClient = func(string) daemonStatusCaller { return tt.client } - daemonServiceInstalled = func(string) bool { return tt.installed } - - got, gotOK := doctorDaemonStatus(dirs) - if got != tt.want { - t.Fatalf("status = %q, want %q", got, tt.want) - } - if gotOK != tt.wantOK { - t.Fatalf("ok = %v, want %v", gotOK, tt.wantOK) - } - }) - } -} - -func TestDoctorPrintsDaemonStatus(t *testing.T) { - origNewClient := newDaemonStatusClient - origInstalled := daemonServiceInstalled - t.Cleanup(func() { - newDaemonStatusClient = origNewClient - daemonServiceInstalled = origInstalled - }) - +// TestDoctorReportsLlamaRuntime verifies doctor surfaces the local +// llama.cpp runtime line rather than a daemon/server status. +func TestDoctorReportsLlamaRuntime(t *testing.T) { t.Setenv("HOME", t.TempDir()) t.Setenv("INTENT_STATE_DIR", t.TempDir()) t.Setenv("INTENT_CACHE_DIR", t.TempDir()) - newDaemonStatusClient = func(string) daemonStatusCaller { - return stubDaemonStatusClient{err: errors.New("dial unix: no such file or directory")} - } - daemonServiceInstalled = func(string) bool { return false } - out := captureStdout(func() { _ = cmdDoctor(context.Background(), nil) }) - if !strings.Contains(out, "daemon") { - t.Fatalf("doctor output missing daemon line: %q", out) - } - if !strings.Contains(out, "not running (optional)") { - t.Fatalf("doctor output missing optional daemon status: %q", out) + if !strings.Contains(out, "llama.cpp runtime") { + t.Fatalf("doctor output missing llama.cpp runtime line: %q", out) } } diff --git a/internal/cli/ensure.go b/internal/cli/ensure.go index f45b399..7cf354a 100644 --- a/internal/cli/ensure.go +++ b/internal/cli/ensure.go @@ -4,14 +4,11 @@ import ( "bufio" "context" "fmt" - "net" "os" "path/filepath" "strings" - "time" "github.com/CoreyRDean/intent/internal/config" - "github.com/CoreyRDean/intent/internal/daemon" "github.com/CoreyRDean/intent/internal/models" intentruntime "github.com/CoreyRDean/intent/internal/runtime" "github.com/CoreyRDean/intent/internal/state" @@ -19,36 +16,31 @@ import ( ) // ensureBackendReady is the self-healing precondition for any subcommand -// that wants to talk to the local model. It checks (in order): +// that wants to talk to the local model. Local inference runs llama.cpp's +// `llama-cli` one-shot — there is no daemon to start — so "ready" means: // -// 1. The daemon is reachable. If yes, we're done. -// 2. The runtime + model are present on disk. -// - If not and stdin is a TTY: ask permission, then download. -// - If not and we're non-interactive: fail with a clear, copyable -// command that fixes it. -// 3. With files in place but no daemon, start one in the background. -// 4. Wait briefly for the daemon's control socket to come up. +// 1. The llama-cli runtime is installed (via the system package manager). +// 2. The selected GGUF model is downloaded. // -// Returns true if the call site should proceed, false if it should -// bail out (we already printed the failure reason). +// If either is missing: +// - interactive TTY: ask permission, then install/download. +// - non-interactive: fail with a clear, copyable command that fixes it. // -// Backend-name guard: this only fires for the local llamafile backend. -// Users on `openai`, `ollama`, or `mock` get no prompts and no startup -// attempts — we're not their package manager. +// Returns true if the call site should proceed, false if it should bail +// out (we already printed the failure reason). +// +// Backend-name guard: this only fires for the local llama-cli backend. +// Users on `openai`, `ollama`, `llamafile-network`, or `mock` get no +// prompts and no install attempts — we're not their package manager. func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config) bool { - if cfg.Backend != "" && cfg.Backend != "llamafile-local" { + if !isLocalBackend(cfg.Backend) { return true } - // (1) Daemon already up? - if pingDaemon(dirs) { - return true - } - - mgr := intentruntime.New(dirs.Cache) - // Resolve the *selected* model through the catalog so self- - // healing downloads the right thing when the user has switched - // to a custom HF repo or a non-default built-in. + rt := intentruntime.New(dirs.Cache) + // Resolve the *selected* model through the catalog so self-healing + // downloads the right thing when the user has switched to a custom + // HF repo or a non-default built-in. cat := loadCatalog(dirs.State) id := cfg.Model if id == "" { @@ -56,91 +48,62 @@ func ensureBackendReady(ctx context.Context, dirs state.Dirs, cfg *config.Config } selected := cat.Get(id) if selected == nil { - // Fall back to the catalog default to at least make progress; - // the daemon will complain later if this mismatches config. selected = cat.Default() } - haveLF := mgr.HaveLlamafile() - haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected)) - interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr) + haveRuntime := rt.HaveLlamaRuntime() + haveModel := selected != nil && rt.HaveModel(models.ModelFilename(selected)) + if haveRuntime && haveModel { + return true + } - // (2) Missing artifacts. - if !haveLF || !haveModel { - if !interactive { - fmt.Fprintln(os.Stderr, "intent: local model isn't installed yet.") - fmt.Fprintln(os.Stderr, " run: i model pull") - return false - } - fmt.Fprintln(os.Stderr, "intent: the local model isn't installed yet.") - if !haveLF { - fmt.Fprintln(os.Stderr, " missing runtime: llamafile-"+intentruntime.LlamafileVersion) - } - if !haveModel && selected != nil { - fmt.Fprintf(os.Stderr, " missing model: %s (~%d MB)\n", - selected.ID, selected.SizeMB) - } - if !confirmYes("Download now?") { - fmt.Fprintln(os.Stderr, "intent: skipped. Run `i model pull` later.") - return false - } - if !haveLF { - fmt.Fprintln(os.Stderr, "downloading runtime...") - if err := mgr.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil { - fmt.Fprintln(os.Stderr) - errf("runtime: %v", err) - return false - } - fmt.Fprintln(os.Stderr) + interactive := tui.IsTTY(os.Stdin) && tui.IsTTY(os.Stderr) + if !interactive { + fmt.Fprintln(os.Stderr, "intent: local model isn't ready yet.") + if !haveRuntime { + fmt.Fprintln(os.Stderr, " missing runtime: llama.cpp (llama-server)") } if !haveModel && selected != nil { - fmt.Fprintf(os.Stderr, "downloading model (~%d MB)...\n", selected.SizeMB) - mi := intentruntime.FromCatalog(selected) - if err := mgr.EnsureModel(ctx, mi, progressCB("model")); err != nil { - fmt.Fprintln(os.Stderr) - errf("model: %v", err) - return false - } - fmt.Fprintln(os.Stderr) + fmt.Fprintf(os.Stderr, " missing model: %s (~%d MB)\n", selected.ID, selected.SizeMB) } + fmt.Fprintln(os.Stderr, " run: i model pull") + return false } - // (3) Start the daemon. We use the same `i daemon start` code path - // as the user would, so behaviour matches and bugs are shared. - fmt.Fprintln(os.Stderr, "intent: starting daemon in the background...") - if rc := daemonSpawnDetached(dirs); rc != 0 { - fmt.Fprintln(os.Stderr, "intent: daemon failed to start; falling back to mock.") + fmt.Fprintln(os.Stderr, "intent: the local model isn't ready yet.") + if !haveRuntime { + fmt.Fprintln(os.Stderr, " missing runtime: llama.cpp (llama-server)") + } + if !haveModel && selected != nil { + fmt.Fprintf(os.Stderr, " missing model: %s (~%d MB)\n", selected.ID, selected.SizeMB) + } + if !confirmYes("Set up now?") { + fmt.Fprintln(os.Stderr, "intent: skipped. Run `i model pull` later.") return false } - // (4) Confirm it's actually responsive (daemonSpawnDetached already - // polls, but be defensive — the socket might be ready while - // llamafile is still warming up its first inference). - deadline := time.Now().Add(60 * time.Second) - for time.Now().Before(deadline) { - if pingDaemon(dirs) { - return true + if !haveRuntime { + fmt.Fprintln(os.Stderr, "installing llama.cpp...") + if err := rt.EnsureLlamaRuntime(ctx, func(s string) { fmt.Fprintln(os.Stderr, " "+s) }); err != nil { + errf("runtime: %v", err) + return false } - time.Sleep(200 * time.Millisecond) } - fmt.Fprintln(os.Stderr, "intent: daemon started but isn't responding yet; try again in a few seconds.") - return false -} - -// pingDaemon checks both that the control socket exists and that the -// daemon answers a ping. Either an unreachable socket or a sad daemon -// returns false. -func pingDaemon(dirs state.Dirs) bool { - if _, err := os.Stat(dirs.SocketPath()); err != nil { - return false + if !haveModel && selected != nil { + fmt.Fprintf(os.Stderr, "downloading model (~%d MB)...\n", selected.SizeMB) + mi := intentruntime.FromCatalog(selected) + if err := rt.EnsureModel(ctx, mi, progressCB("model")); err != nil { + fmt.Fprintln(os.Stderr) + errf("model: %v", err) + return false + } + fmt.Fprintln(os.Stderr) } - c, err := net.DialTimeout("unix", dirs.SocketPath(), 200*time.Millisecond) - if err != nil { + + if !rt.HaveLlamaRuntime() { + fmt.Fprintln(os.Stderr, "intent: llama.cpp still not available; falling back to mock.") return false } - _ = c.Close() - cli := daemon.NewClient(dirs.SocketPath()) - resp, err := cli.Call(daemon.Request{Op: daemon.OpPing}) - return err == nil && resp.OK + return true } // cfgModelFile turns the configured model tag into a GGUF filename, @@ -188,25 +151,3 @@ func confirmYes(prompt string) bool { line = strings.TrimSpace(strings.ToLower(line)) return line == "" || line == "y" || line == "yes" } - -// startDaemonAndWait is a small helper used by `i init` after a model -// pull, to bring the daemon up without making the user run a third -// command. It mirrors ensureBackendReady's daemon-startup half but -// with louder logging since this is an explicit setup step. -func startDaemonAndWait(dirs state.Dirs) error { - if pingDaemon(dirs) { - return nil - } - if rc := daemonSpawnDetached(dirs); rc != 0 { - return fmt.Errorf("daemon failed to start (see %s)", - filepath.Join(dirs.State, "logs", "intentd.log")) - } - deadline := time.Now().Add(60 * time.Second) - for time.Now().Before(deadline) { - if pingDaemon(dirs) { - return nil - } - time.Sleep(200 * time.Millisecond) - } - return fmt.Errorf("daemon started but didn't become responsive in 60s") -} diff --git a/internal/cli/explain.go b/internal/cli/explain.go index d5afc99..9ed065c 100644 --- a/internal/cli/explain.go +++ b/internal/cli/explain.go @@ -37,6 +37,7 @@ func cmdExplain(ctx context.Context, args []string) int { errf("explain: %v", err) return 3 } + defer closeBackend(be) printMockFallbackBanner(isFallback) vl := verbose.FromContext(ctx) diff --git a/internal/cli/init.go b/internal/cli/init.go index 6ecc8aa..082d570 100644 --- a/internal/cli/init.go +++ b/internal/cli/init.go @@ -44,20 +44,6 @@ func cmdInit(ctx context.Context, args []string) int { fmt.Printf(" cache dir: %s\n", dirs.Cache) fmt.Println() - // Daemon prompt — default Yes, per D-004. - fmt.Print("Keep intent warm in the background so it never has to load? [Y/n] ") - answer := "y" - if !autoYes { - r := bufio.NewReader(os.Stdin) - line, _ := r.ReadString('\n') - line = strings.TrimSpace(strings.ToLower(line)) - if line == "" { - line = "y" - } - answer = line - } - cfg.DaemonEnabled = answer == "y" || answer == "yes" - // Shell integration prompt — default Yes. Without it, zsh users // hit "no matches found" the first time they type a prompt with // a literal `?` in it, which is a brutal first impression. @@ -81,11 +67,7 @@ func cmdInit(ctx context.Context, args []string) int { fmt.Println() fmt.Println("Wrote", dirs.ConfigPath()) - if cfg.DaemonEnabled { - fmt.Println("Daemon: enabled. Run `i daemon install` to register it as a launchd/systemd service.") - } else { - fmt.Println("Daemon: disabled. Each invocation will cold-load the model.") - } + fmt.Println("Local inference runs llama.cpp's `llama-cli` on demand (no background daemon).") if installHook { writeShellHook() @@ -94,9 +76,9 @@ func cmdInit(ctx context.Context, args []string) int { fmt.Println(" ? * [ ] characters, or run `i shell-init zsh >> ~/.zshrc` later.") } - // Model pull + daemon start. This is the difference between - // "config written, now go figure out three more commands" and - // "open a new shell and you're working." Default Yes. + // Runtime install + model pull. This is the difference between + // "config written, now go figure out more commands" and "open a + // new shell and you're working." Default Yes. mgr := intentruntime.New(dirs.Cache) cat := loadCatalog(dirs.State) // Prefer whatever the user already selected in config over the @@ -106,14 +88,14 @@ func cmdInit(ctx context.Context, args []string) int { if selected == nil { selected = cat.Default() } - haveLF := mgr.HaveLlamafile() + haveRuntime := mgr.HaveLlamaRuntime() haveModel := selected != nil && mgr.HaveModel(models.ModelFilename(selected)) - if !haveLF || !haveModel { + if !haveRuntime || !haveModel { fmt.Println() if selected != nil { - fmt.Printf("Download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB) + fmt.Printf("Install llama.cpp and download %s now? (~%d MB) [Y/n] ", selected.ID, selected.SizeMB) } else { - fmt.Printf("Download the default local model now? [Y/n] ") + fmt.Printf("Install llama.cpp and the default local model now? [Y/n] ") } pullAnswer := "y" if !autoYes { @@ -126,11 +108,11 @@ func cmdInit(ctx context.Context, args []string) int { pullAnswer = line } if pullAnswer == "y" || pullAnswer == "yes" { - if !haveLF { - fmt.Println("downloading runtime...") - if err := mgr.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil { + if !haveRuntime { + fmt.Println("installing llama.cpp via your package manager...") + if err := mgr.EnsureLlamaRuntime(ctx, func(s string) { fmt.Println(" " + s) }); err != nil { fmt.Println() - errf("init: download runtime: %v", err) + errf("init: install runtime: %v", err) fmt.Println("you can retry with `i model pull`.") return 0 } @@ -152,20 +134,10 @@ func cmdInit(ctx context.Context, args []string) int { } } else { fmt.Println() + fmt.Println("Runtime: llama.cpp already installed.") fmt.Println("Model: already installed.") } - if cfg.DaemonEnabled { - fmt.Println("Starting daemon...") - if err := startDaemonAndWait(dirs); err != nil { - errf("init: %v", err) - fmt.Println("you can retry with `i daemon start` (and inspect logs at", - filepath.Join(dirs.State, "logs", "intentd.log")+").") - } else { - fmt.Println("Daemon: running.") - } - } - fmt.Println() fmt.Println("All set. Try:") fmt.Println(" i hello # smoke test") diff --git a/internal/cli/intent.go b/internal/cli/intent.go index 71bdee4..0966ef8 100644 --- a/internal/cli/intent.go +++ b/internal/cli/intent.go @@ -213,10 +213,10 @@ func cmdIntent(ctx context.Context, args []string) int { return 3 } - // Self-heal: if the backend is local-llamafile and the daemon - // isn't reachable, offer to download the model and start it. - // This collapses what used to be three commands the user had to - // guess (`i model pull`, `i daemon install`, retry) into one + // Self-heal: if the backend is the local llama-cli runtime and it + // (or the selected model) isn't installed yet, offer to install the + // runtime via the system package manager and download the model. + // This collapses what used to be several setup commands into one // prompt or, if `--yes` is set, zero. See ensure.go. backendForCheck := cfg.Backend if fl.backend != "" { @@ -251,6 +251,7 @@ func cmdIntent(ctx context.Context, args []string) int { errf("backend: %v", err) return 3 } + defer closeBackend(be) printMockFallbackBanner(isFallback) // Top-level verbose breadcrumbs. Safe no-op when -v is off. diff --git a/internal/cli/model.go b/internal/cli/model.go index 580479e..7bb808a 100644 --- a/internal/cli/model.go +++ b/internal/cli/model.go @@ -7,7 +7,6 @@ import ( "path/filepath" "strings" "text/tabwriter" - "time" "github.com/CoreyRDean/intent/internal/config" "github.com/CoreyRDean/intent/internal/models" @@ -268,8 +267,9 @@ func inferQuantFromFilename(filename string) string { // modelUse switches the current model. Resolves the reference, persists // it as a custom entry if it's an HF repo we haven't seen, downloads -// the model if it's not installed, and finally updates cfg.Model + -// restarts the daemon so subsequent `i` calls use the new model. +// the model if it's not installed, and updates cfg.Model. The next `i` +// call picks up the new model automatically — llama-cli loads it fresh +// each invocation, so there is no daemon to restart. func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []string) int { if len(args) == 0 { errf("usage: i model use ") @@ -308,7 +308,7 @@ func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []s } if !models.ValidGGUFQuant(m.Quant) && m.Quant != "" { - fmt.Fprintf(os.Stderr, "warning: quant %q is unusual; llamafile may or may not load it.\n", m.Quant) + fmt.Fprintf(os.Stderr, "warning: quant %q is unusual; llama.cpp may or may not load it.\n", m.Quant) } // Download if missing. @@ -337,20 +337,6 @@ func modelUse(ctx context.Context, dirs state.Dirs, cfg *config.Config, args []s return 3 } fmt.Printf("current model: %s\n", m.ID) - - // Nudge the daemon to pick up the new model. If it's running, - // restart it; if not, leave it alone (user will start it on next - // `i` call via ensureBackendReady). - if pingDaemon(dirs) { - fmt.Fprintln(os.Stderr, "restarting daemon with new model...") - _ = daemonStop(dirs) - time.Sleep(500 * time.Millisecond) - if err := startDaemonAndWait(dirs); err != nil { - errf("daemon restart: %v (run `i daemon start` manually)", err) - return 3 - } - fmt.Fprintln(os.Stderr, "daemon: ready.") - } return 0 } @@ -383,11 +369,11 @@ func modelPull(ctx context.Context, dirs state.Dirs, cfg *config.Config, args [] } rt := intentruntime.New(dirs.Cache) - if !rt.HaveLlamafile() { - fmt.Fprintln(os.Stderr, "downloading runtime...") - if err := rt.EnsureLlamafile(ctx, progressCB("llamafile")); err != nil { + if !rt.HaveLlamaRuntime() { + fmt.Fprintln(os.Stderr, "installing llama.cpp via your package manager...") + if err := rt.EnsureLlamaRuntime(ctx, func(s string) { fmt.Fprintln(os.Stderr, " "+s) }); err != nil { fmt.Fprintln(os.Stderr) - errf("llamafile: %v", err) + errf("llama.cpp: %v", err) return 3 } fmt.Fprintln(os.Stderr) diff --git a/internal/cli/report.go b/internal/cli/report.go index dbcf353..722bad0 100644 --- a/internal/cli/report.go +++ b/internal/cli/report.go @@ -53,6 +53,7 @@ func cmdReport(ctx context.Context, args []string) int { errf("report: %v", err) return 3 } + defer closeBackend(be) if isMockBackend(be) { errf("i report requires a real backend — run 'i doctor' to diagnose") return 3 diff --git a/internal/cli/smoke_test.go b/internal/cli/smoke_test.go index 6aedcab..73f3cba 100644 --- a/internal/cli/smoke_test.go +++ b/internal/cli/smoke_test.go @@ -374,27 +374,6 @@ func TestConfigRoundTripSectionedKnownKey(t *testing.T) { } } -func TestConfigSetRejectsRemoteDaemonHost(t *testing.T) { - stateDir := t.TempDir() - cacheDir := t.TempDir() - baseEnv := []string{ - "HOME=" + os.Getenv("HOME"), - "PATH=" + os.Getenv("PATH"), - "INTENT_STATE_DIR=" + stateDir, - "INTENT_CACHE_DIR=" + cacheDir, - } - - cmd := exec.Command(testBinary, "config", "set", "daemon.host", "0.0.0.0") - cmd.Env = baseEnv - out, err := cmd.CombinedOutput() - if err == nil { - t.Fatal("expected config set daemon.host to fail, got nil error") - } - if !strings.Contains(string(out), "loopback only") { - t.Fatalf("expected loopback validation error, got %q", string(out)) - } -} - func TestConfigPath(t *testing.T) { stdout, _, exitCode := run(t, nil, "config", "path") if exitCode != 0 { diff --git a/internal/config/config.go b/internal/config/config.go index 216f1c1..c90f59d 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -35,7 +35,11 @@ type Config struct { // Defaults returns the project's chosen defaults. func Defaults() *Config { return &Config{ - Backend: "llamafile-local", + // Local inference runs llama.cpp's `llama-cli` one-shot. The + // legacy "llamafile-local" name is still accepted as an alias by + // the backend resolver, so configs written before the switch keep + // working without migration. + Backend: "llama-cli", // Catalog short-id. See internal/models.DefaultID. Defaults to // the 3B model as the balanced "just works" option: strong // enough that `i report` doesn't routinely hit the fallback diff --git a/internal/daemon/client.go b/internal/daemon/client.go deleted file mode 100644 index 5856738..0000000 --- a/internal/daemon/client.go +++ /dev/null @@ -1,49 +0,0 @@ -package daemon - -import ( - "bufio" - "encoding/json" - "fmt" - "net" - "time" -) - -// Client is a one-shot connection to a daemon socket. -type Client struct { - Socket string - Timeout time.Duration -} - -// NewClient returns a Client for the socket. Timeout defaults to 2s. -func NewClient(socket string) *Client { - return &Client{Socket: socket, Timeout: 2 * time.Second} -} - -// Call sends one request and returns the response. -func (c *Client) Call(req Request) (*Response, error) { - conn, err := net.DialTimeout("unix", c.Socket, c.Timeout) - if err != nil { - return nil, err - } - defer conn.Close() - if c.Timeout > 0 { - _ = conn.SetDeadline(time.Now().Add(c.Timeout)) - } - body, err := json.Marshal(req) - if err != nil { - return nil, err - } - if _, err := conn.Write(append(body, '\n')); err != nil { - return nil, err - } - r := bufio.NewReader(conn) - line, err := r.ReadBytes('\n') - if err != nil { - return nil, err - } - var resp Response - if err := json.Unmarshal(line, &resp); err != nil { - return nil, fmt.Errorf("decode response: %w", err) - } - return &resp, nil -} diff --git a/internal/daemon/daemon.go b/internal/daemon/daemon.go deleted file mode 100644 index 2853ef1..0000000 --- a/internal/daemon/daemon.go +++ /dev/null @@ -1,201 +0,0 @@ -// Package daemon implements intentd: a small supervisor that keeps a -// llamafile --server process warm so the CLI doesn't pay a model-load -// cost on every invocation. -// -// Architecture (v1): -// -// - The daemon spawns llamafile as a subprocess and watches it. -// - llamafile exposes its OpenAI-compatible HTTP API on the loopback -// port from config. The CLI talks to that port directly. The daemon -// does NOT proxy inference traffic through its Unix socket — that -// would add a hop for no benefit, since the heavy lifting is the -// model, not the network. -// - The daemon owns a Unix socket on which it speaks a tiny line- -// delimited JSON control protocol: ping / status / stop. That's -// also how `i daemon status` and `i daemon stop` work. -// -// Idle unload (kill the llamafile subprocess after N minutes of HTTP -// inactivity, respawn on next CLI request) is a v1.x follow-up. In v1 -// the daemon stays warm until the user stops it. -package daemon - -import ( - "bufio" - "context" - "encoding/json" - "fmt" - "net" - "os" - "sync" - "time" -) - -// Op is the daemon control-protocol operation discriminator. -type Op string - -const ( - OpPing Op = "ping" - OpStatus Op = "status" - OpStop Op = "stop" -) - -// Request is one daemon control request. -type Request struct { - Op Op `json:"op"` - ID string `json:"id,omitempty"` -} - -// Response is the daemon's reply. -type Response struct { - ID string `json:"id,omitempty"` - OK bool `json:"ok"` - Error string `json:"error,omitempty"` - Data map[string]any `json:"data,omitempty"` -} - -// Server is the Unix-socket control-plane server. -type Server struct { - Socket string - Launcher *Launcher - Started time.Time - mu sync.Mutex - ln net.Listener - stopCh chan struct{} - stopOnce sync.Once - clientCtx context.Context -} - -// New constructs a Server bound to socket and supervising launcher. -func New(socket string, l *Launcher) *Server { - return &Server{ - Socket: socket, - Launcher: l, - stopCh: make(chan struct{}), - } -} - -// Listen binds the Unix socket. Any pre-existing socket file is removed. -func (s *Server) Listen() error { - _ = os.Remove(s.Socket) - if err := os.MkdirAll(parentDir(s.Socket), 0o700); err != nil { - return fmt.Errorf("mkdir socket parent: %w", err) - } - ln, err := net.Listen("unix", s.Socket) - if err != nil { - return fmt.Errorf("listen on %s: %w", s.Socket, err) - } - if err := os.Chmod(s.Socket, 0o600); err != nil { - _ = ln.Close() - return fmt.Errorf("chmod socket: %w", err) - } - s.mu.Lock() - s.ln = ln - s.Started = time.Now() - s.mu.Unlock() - return nil -} - -// Serve accepts connections until ctx is canceled OR an OpStop is received. -func (s *Server) Serve(ctx context.Context) error { - if s.ln == nil { - return fmt.Errorf("server not listening") - } - s.clientCtx = ctx - go func() { - select { - case <-ctx.Done(): - case <-s.stopCh: - } - _ = s.ln.Close() - }() - for { - conn, err := s.ln.Accept() - if err != nil { - select { - case <-s.stopCh: - return nil - default: - } - if ctx.Err() != nil { - return nil - } - return err - } - go s.handle(conn) - } -} - -// SignalStop tells Serve to return. Idempotent. -func (s *Server) SignalStop() { - s.stopOnce.Do(func() { close(s.stopCh) }) -} - -// Stopped returns a channel closed when SignalStop has been called. -// `i daemon start` blocks on this AND on its OS-signal context, so -// either source can shut the daemon down. -func (s *Server) Stopped() <-chan struct{} { return s.stopCh } - -func (s *Server) handle(conn net.Conn) { - defer conn.Close() - _ = conn.SetReadDeadline(time.Now().Add(5 * time.Second)) - r := bufio.NewReader(conn) - w := bufio.NewWriter(conn) - line, err := r.ReadBytes('\n') - if err != nil { - return - } - var req Request - if err := json.Unmarshal(line, &req); err != nil { - _ = writeJSONLine(w, Response{ID: req.ID, OK: false, Error: "bad json: " + err.Error()}) - return - } - resp := s.dispatch(req) - resp.ID = req.ID - _ = writeJSONLine(w, resp) -} - -func (s *Server) dispatch(req Request) Response { - switch req.Op { - case OpPing: - return Response{OK: true, Data: map[string]any{"pong": true}} - case OpStatus: - data := map[string]any{ - "socket": s.Socket, - "started_at": s.Started.UTC().Format(time.RFC3339), - "uptime_sec": int64(time.Since(s.Started).Seconds()), - } - if s.Launcher != nil { - data["llamafile_running"] = s.Launcher.Running() - data["llamafile_endpoint"] = s.Launcher.Endpoint() - data["llamafile_pid"] = s.Launcher.PID() - data["llamafile_restarts"] = s.Launcher.Restarts() - data["model"] = s.Launcher.ModelPath - } - return Response{OK: true, Data: data} - case OpStop: - s.SignalStop() - return Response{OK: true, Data: map[string]any{"stopping": true}} - default: - return Response{OK: false, Error: "unknown op: " + string(req.Op)} - } -} - -func writeJSONLine(w *bufio.Writer, r Response) error { - b, err := json.Marshal(r) - if err != nil { - return err - } - if _, err := w.Write(append(b, '\n')); err != nil { - return err - } - return w.Flush() -} - -func parentDir(p string) string { - for i := len(p) - 1; i >= 0; i-- { - if p[i] == '/' { - return p[:i] - } - } - return "." -} diff --git a/internal/daemon/install.go b/internal/daemon/install.go deleted file mode 100644 index f8265f1..0000000 --- a/internal/daemon/install.go +++ /dev/null @@ -1,237 +0,0 @@ -package daemon - -import ( - "fmt" - "os" - "os/exec" - "path/filepath" - "runtime" -) - -// InstallParams are everything Install needs to write a system service file. -type InstallParams struct { - Binary string // absolute path to the intent binary - Label string // service label, e.g. "com.coreyrdean.intent" - LogDir string // directory the service writes stdout/stderr to - Socket string // daemon control socket path (informational) - Cache string // cache root (so the service knows where llamafile lives) - State string // state root -} - -// InstallResult describes what was written and how to control it. -type InstallResult struct { - UnitPath string // path to the launchd plist or systemd unit - StartCmd []string // command to start the unit - StopCmd []string // command to stop the unit - LogPath string // path to the stdout log - Notes string // human-readable post-install hint -} - -// Install writes the platform-appropriate service file and starts it. -// On macOS, returns the LaunchAgent plist path. -// On Linux, returns the user systemd unit path. -// Other platforms return an error. -func Install(p InstallParams) (*InstallResult, error) { - switch runtime.GOOS { - case "darwin": - return installLaunchd(p) - case "linux": - return installSystemd(p) - default: - return nil, fmt.Errorf("daemon install not supported on %s yet", runtime.GOOS) - } -} - -// Uninstall removes the platform-appropriate service file (and stops it). -func Uninstall(label string) error { - switch runtime.GOOS { - case "darwin": - return uninstallLaunchd(label) - case "linux": - return uninstallSystemd(label) - default: - return fmt.Errorf("daemon uninstall not supported on %s yet", runtime.GOOS) - } -} - -// IsInstalled reports whether the platform-appropriate service file exists. -func IsInstalled(label string) bool { - switch runtime.GOOS { - case "darwin": - path, _ := launchdPlistPath(label) - _, err := os.Stat(path) - return err == nil - case "linux": - path, _ := systemdUnitPath(label) - _, err := os.Stat(path) - return err == nil - } - return false -} - -// --- macOS / launchd --- - -func launchdPlistPath(label string) (string, error) { - home, err := os.UserHomeDir() - if err != nil { - return "", err - } - return filepath.Join(home, "Library", "LaunchAgents", label+".plist"), nil -} - -func installLaunchd(p InstallParams) (*InstallResult, error) { - plistPath, err := launchdPlistPath(p.Label) - if err != nil { - return nil, err - } - if err := os.MkdirAll(filepath.Dir(plistPath), 0o755); err != nil { - return nil, err - } - if err := os.MkdirAll(p.LogDir, 0o700); err != nil { - return nil, err - } - logOut := filepath.Join(p.LogDir, "intentd.out.log") - logErr := filepath.Join(p.LogDir, "intentd.err.log") - plist := fmt.Sprintf(` - - - - Label %s - ProgramArguments - %s - daemon - start - --foreground - - RunAtLoad - KeepAlive - ProcessType Background - StandardOutPath %s - StandardErrorPath%s - EnvironmentVariables - - HOME %s - PATH /usr/local/bin:/opt/homebrew/bin:/usr/bin:/bin:/usr/sbin:/sbin - - - -`, p.Label, p.Binary, logOut, logErr, mustHome()) - if err := os.WriteFile(plistPath, []byte(plist), 0o644); err != nil { - return nil, fmt.Errorf("write plist: %w", err) - } - // Best-effort start. launchctl load is the right verb on macOS LaunchAgents - // even though it's been deprecated in favor of bootstrap. bootstrap requires - // a target like `gui/$UID` and is awkward; load still works. - _, _ = exec.Command("launchctl", "unload", plistPath).CombinedOutput() - if out, err := exec.Command("launchctl", "load", plistPath).CombinedOutput(); err != nil { - return &InstallResult{ - UnitPath: plistPath, - StartCmd: []string{"launchctl", "load", plistPath}, - StopCmd: []string{"launchctl", "unload", plistPath}, - LogPath: logOut, - Notes: fmt.Sprintf("plist installed but launchctl load failed: %s\n"+ - "start manually with: launchctl load %s", string(out), plistPath), - }, nil - } - return &InstallResult{ - UnitPath: plistPath, - StartCmd: []string{"launchctl", "load", plistPath}, - StopCmd: []string{"launchctl", "unload", plistPath}, - LogPath: logOut, - Notes: "intentd is now running and will start at login.", - }, nil -} - -func uninstallLaunchd(label string) error { - plistPath, err := launchdPlistPath(label) - if err != nil { - return err - } - if _, err := os.Stat(plistPath); os.IsNotExist(err) { - return nil - } - _, _ = exec.Command("launchctl", "unload", plistPath).CombinedOutput() - return os.Remove(plistPath) -} - -// --- Linux / systemd user unit --- - -func systemdUnitPath(label string) (string, error) { - home, err := os.UserHomeDir() - if err != nil { - return "", err - } - // systemd allows arbitrary unit names; we use the label suffix as the - // unit basename to avoid collisions with system units. - return filepath.Join(home, ".config", "systemd", "user", label+".service"), nil -} - -func installSystemd(p InstallParams) (*InstallResult, error) { - unitPath, err := systemdUnitPath(p.Label) - if err != nil { - return nil, err - } - if err := os.MkdirAll(filepath.Dir(unitPath), 0o755); err != nil { - return nil, err - } - unit := fmt.Sprintf(`[Unit] -Description=intent daemon (keeps a local LLM warm) -After=default.target - -[Service] -Type=simple -ExecStart=%s daemon start --foreground -Restart=on-failure -RestartSec=2 -Environment=PATH=/usr/local/bin:/usr/bin:/bin:/usr/sbin:/sbin -NoNewPrivileges=yes -PrivateTmp=yes - -[Install] -WantedBy=default.target -`, p.Binary) - if err := os.WriteFile(unitPath, []byte(unit), 0o644); err != nil { - return nil, fmt.Errorf("write unit: %w", err) - } - _, _ = exec.Command("systemctl", "--user", "daemon-reload").CombinedOutput() - unitName := p.Label + ".service" - if out, err := exec.Command("systemctl", "--user", "enable", "--now", unitName).CombinedOutput(); err != nil { - return &InstallResult{ - UnitPath: unitPath, - StartCmd: []string{"systemctl", "--user", "start", unitName}, - StopCmd: []string{"systemctl", "--user", "stop", unitName}, - LogPath: "journalctl --user -u " + unitName, - Notes: fmt.Sprintf("unit installed but `systemctl --user enable --now` failed: %s\n"+ - "start manually with: systemctl --user start %s", string(out), unitName), - }, nil - } - return &InstallResult{ - UnitPath: unitPath, - StartCmd: []string{"systemctl", "--user", "start", unitName}, - StopCmd: []string{"systemctl", "--user", "stop", unitName}, - LogPath: "journalctl --user -u " + unitName, - Notes: "intentd is enabled and running. Logs: journalctl --user -u " + unitName, - }, nil -} - -func uninstallSystemd(label string) error { - unitPath, err := systemdUnitPath(label) - if err != nil { - return err - } - if _, err := os.Stat(unitPath); os.IsNotExist(err) { - return nil - } - unitName := label + ".service" - _, _ = exec.Command("systemctl", "--user", "disable", "--now", unitName).CombinedOutput() - if err := os.Remove(unitPath); err != nil { - return err - } - _, _ = exec.Command("systemctl", "--user", "daemon-reload").CombinedOutput() - return nil -} - -func mustHome() string { - h, _ := os.UserHomeDir() - return h -} diff --git a/internal/daemon/launcher.go b/internal/daemon/launcher.go deleted file mode 100644 index b6e8e25..0000000 --- a/internal/daemon/launcher.go +++ /dev/null @@ -1,388 +0,0 @@ -package daemon - -import ( - "context" - "fmt" - "io" - "net/http" - "os" - "os/exec" - "sync" - "sync/atomic" - "syscall" - "time" -) - -// Launcher supervises a `llamafile --server` subprocess. It exposes the -// HTTP endpoint llamafile is bound to so the CLI can dial it directly. -// -// Restart policy: if llamafile exits with a non-zero code or is killed -// by anything other than us, we restart it up to MaxRestarts times within -// RestartWindow. Beyond that we give up; the user gets an honest "daemon -// died, see the logs" rather than a thrashing supervisor. -type Launcher struct { - BinaryPath string // path to llamafile-VERSION - ModelPath string // path to .gguf - Host string // 127.0.0.1 - Port int // 18080 - ContextSize int // -c, 0 = llamafile default - GPULayers int // -ngl, -1 = let llamafile decide - StdoutLog io.Writer // where llamafile's stdout goes - StderrLog io.Writer // where llamafile's stderr goes - MaxRestarts int // default 5 - RestartWindow time.Duration // default 60s - StartupGrace time.Duration // how long to wait for /v1/models to respond - - mu sync.Mutex - cmd *exec.Cmd - pid int32 - restarts atomic.Int32 - stopped atomic.Bool - doneCh chan struct{} - restartTs []time.Time -} - -// NewLauncher constructs a Launcher with sensible defaults. -func NewLauncher(binary, model string, host string, port int) *Launcher { - return &Launcher{ - BinaryPath: binary, - ModelPath: model, - Host: host, - Port: port, - StdoutLog: io.Discard, - StderrLog: os.Stderr, - MaxRestarts: 5, - RestartWindow: 60 * time.Second, - StartupGrace: 60 * time.Second, - GPULayers: -1, - doneCh: make(chan struct{}), - } -} - -// Endpoint returns the http://host:port the supervised llamafile listens on. -func (l *Launcher) Endpoint() string { - return fmt.Sprintf("http://%s:%d", l.Host, l.Port) -} - -// PID returns the current llamafile PID (0 if not running). -func (l *Launcher) PID() int { return int(atomic.LoadInt32(&l.pid)) } - -// Running reports whether the subprocess is alive. -func (l *Launcher) Running() bool { return l.PID() != 0 } - -// Restarts returns the cumulative restart count. -func (l *Launcher) Restarts() int { return int(l.restarts.Load()) } - -// Start launches llamafile and blocks until either: -// - the HTTP /v1/models endpoint answers (success), OR -// - StartupGrace expires (failure), OR -// - llamafile exits before becoming ready (failure) -// -// On success, the Launcher's supervise goroutine is also running. -func (l *Launcher) Start(ctx context.Context) error { - if err := l.spawn(ctx); err != nil { - return err - } - if err := l.waitReady(ctx); err != nil { - l.stop(syscall.SIGTERM) - return fmt.Errorf("llamafile did not become ready: %w", err) - } - go l.supervise(ctx) - return nil -} - -// Wait blocks until the launcher's supervise loop exits. -func (l *Launcher) Wait() { <-l.doneCh } - -// Stop signals the launcher to terminate and waits for it. Idempotent. -func (l *Launcher) Stop(timeout time.Duration) { - if !l.stopped.CompareAndSwap(false, true) { - return - } - l.stop(syscall.SIGTERM) - select { - case <-l.doneCh: - case <-time.After(timeout): - l.stop(syscall.SIGKILL) - <-l.doneCh - } -} - -func (l *Launcher) spawn(ctx context.Context) error { - args := []string{ - "--server", - "-m", l.ModelPath, - "--host", l.Host, - "--port", fmt.Sprintf("%d", l.Port), - } - if l.ContextSize > 0 { - args = append(args, "-c", fmt.Sprintf("%d", l.ContextSize)) - } - if l.GPULayers >= 0 { - args = append(args, "-ngl", fmt.Sprintf("%d", l.GPULayers)) - } - - // llamafile is an Actually Portable Executable (APE). On macOS the - // kernel rejects APE binaries directly with "exec format error" — - // the file's leading shell-script trampoline only fires when the - // shell loads it. So we run it via /bin/sh on every Unix to keep the - // invocation consistent and let the shell pick the right loader. - // - // Note: not exec.CommandContext — we manage lifecycle explicitly so - // that a CLI command's ctx cancellation doesn't kill the daemon's - // supervised subprocess. - shArgs := append([]string{l.BinaryPath}, args...) - cmd := exec.Command("/bin/sh", "-c", quoteShellArgs(shArgs), "intentd-llamafile") - cmd.Stdout = l.StdoutLog - cmd.Stderr = l.StderrLog - // New process group so llamafile doesn't catch terminal signals - // directed at the daemon. - cmd.SysProcAttr = procAttrNewGroup() - - if err := cmd.Start(); err != nil { - return fmt.Errorf("start %s: %w", l.BinaryPath, err) - } - l.mu.Lock() - l.cmd = cmd - atomic.StoreInt32(&l.pid, int32(cmd.Process.Pid)) - l.mu.Unlock() - return nil -} - -func (l *Launcher) waitReady(ctx context.Context) error { - deadline := time.Now().Add(l.StartupGrace) - url := l.Endpoint() + "/v1/models" - cli := &http.Client{Timeout: 1 * time.Second} - for time.Now().Before(deadline) { - select { - case <-ctx.Done(): - return ctx.Err() - default: - } - // If the subprocess died on us, fail fast. - if !l.processAlive() { - return fmt.Errorf("subprocess exited before ready") - } - req, _ := http.NewRequestWithContext(ctx, "GET", url, nil) - resp, err := cli.Do(req) - if err == nil { - _ = resp.Body.Close() - if resp.StatusCode < 500 { - return nil - } - } - time.Sleep(250 * time.Millisecond) - } - return fmt.Errorf("timeout after %s", l.StartupGrace) -} - -func (l *Launcher) processAlive() bool { - l.mu.Lock() - cmd := l.cmd - l.mu.Unlock() - if cmd == nil || cmd.Process == nil { - return false - } - // Signal 0: existence check, no actual signal delivered. - return cmd.Process.Signal(syscall.Signal(0)) == nil -} - -func (l *Launcher) supervise(ctx context.Context) { - defer close(l.doneCh) - for { - l.mu.Lock() - cmd := l.cmd - l.mu.Unlock() - if cmd == nil { - return - } - err := cmd.Wait() - atomic.StoreInt32(&l.pid, 0) - - if l.stopped.Load() || ctx.Err() != nil { - return - } - - // Crash. Decide whether to restart. - if err != nil { - fmt.Fprintf(l.StderrLog, "intentd: llamafile exited: %v\n", err) - } - if !l.shouldRestart() { - fmt.Fprintf(l.StderrLog, "intentd: too many restarts in %s; giving up\n", l.RestartWindow) - return - } - l.restarts.Add(1) - fmt.Fprintf(l.StderrLog, "intentd: restarting llamafile (attempt %d)\n", l.restarts.Load()) - // Brief backoff so we don't hot-loop. - time.Sleep(time.Second) - if err := l.spawn(ctx); err != nil { - fmt.Fprintf(l.StderrLog, "intentd: respawn failed: %v\n", err) - return - } - if err := l.waitReady(ctx); err != nil { - fmt.Fprintf(l.StderrLog, "intentd: respawn not ready: %v\n", err) - l.stop(syscall.SIGTERM) - return - } - } -} - -// shouldRestart returns true if we are within the restart budget. -func (l *Launcher) shouldRestart() bool { - now := time.Now() - cutoff := now.Add(-l.RestartWindow) - kept := l.restartTs[:0] - for _, t := range l.restartTs { - if t.After(cutoff) { - kept = append(kept, t) - } - } - l.restartTs = append(kept, now) - return len(l.restartTs) <= l.MaxRestarts -} - -// quoteShellArgs renders argv as a single shell command string with -// each argument single-quoted. We never embed user-supplied unescaped -// strings here, but doing it correctly is cheap insurance. -func quoteShellArgs(argv []string) string { - out := "" - for i, a := range argv { - if i > 0 { - out += " " - } - // Replace each ' with '\''. - escaped := "" - for _, r := range a { - if r == '\'' { - escaped += `'\''` - } else { - escaped += string(r) - } - } - out += "'" + escaped + "'" - } - return out -} - -func (l *Launcher) stop(sig syscall.Signal) { - l.mu.Lock() - cmd := l.cmd - l.mu.Unlock() - if cmd == nil || cmd.Process == nil { - return - } - pid := cmd.Process.Pid - - // llamafile is an Actually Portable Executable. APE binaries on - // macOS work like this: the bytes are simultaneously a PE header - // (rejected by the kernel) and a shell script (interpreted by sh - // when execve fails). The script then mmaps a temp-extracted - // Mach-O and re-execs into it via `posix_spawn`, which in practice - // FORKS off a worker process whose parent becomes our intent - // daemon (the original sh wrapper exits). So: - // - // - cmd.Process.Pid points at the long-dead sh wrapper. - // - The actual llamafile is reparented to our daemon (os.Getpid()). - // - It also lives in its own process group via setsid. - // - // We therefore signal four populations to be sure: - // 1) the original spawned PID (no-op if already gone), - // 2) the spawned PID's process group (no-op if separated), - // 3) every descendant of *us* that runs our llamafile binary, - // 4) every such descendant's own process group. - _ = cmd.Process.Signal(sig) - if pgid, err := syscall.Getpgid(pid); err == nil && pgid != 0 && pgid != os.Getpid() { - _ = syscall.Kill(-pgid, sig) - } - for _, p := range descendantsRunning(os.Getpid(), l.BinaryPath) { - _ = syscall.Kill(p, sig) - if pgid, err := syscall.Getpgid(p); err == nil && pgid != 0 && pgid != os.Getpid() { - _ = syscall.Kill(-pgid, sig) - } - } -} - -// descendantsRunning returns every descendant of root whose command -// line contains needle. We filter on needle so that signaling does -// not accidentally hit unrelated processes that happen to share the -// daemon as an ancestor (e.g. user shells launched from `i daemon -// start` in a terminal). -func descendantsRunning(root int, needle string) []int { - if _, err := exec.LookPath("pgrep"); err != nil { - return nil - } - candidates := allDescendants(root) - if len(candidates) == 0 || needle == "" { - return candidates - } - out := candidates[:0] - for _, p := range candidates { - // `ps -o command= -p PID` prints just the command line. - b, err := exec.Command("ps", "-o", "command=", "-p", fmt.Sprintf("%d", p)).Output() - if err != nil { - continue - } - if bytesContains(b, needle) { - out = append(out, p) - } - } - return out -} - -func allDescendants(root int) []int { - seen := map[int]struct{}{root: {}} - queue := []int{root} - var out []int - for len(queue) > 0 { - p := queue[0] - queue = queue[1:] - b, err := exec.Command("pgrep", "-P", fmt.Sprintf("%d", p)).Output() - if err != nil { - continue - } - for _, line := range bytesLines(b) { - var child int - _, _ = fmt.Sscanf(line, "%d", &child) - if child <= 0 { - continue - } - if _, ok := seen[child]; ok { - continue - } - seen[child] = struct{}{} - out = append(out, child) - queue = append(queue, child) - } - } - return out -} - -func bytesContains(b []byte, needle string) bool { - if len(needle) == 0 { - return true - } - n := []byte(needle) - for i := 0; i+len(n) <= len(b); i++ { - if string(b[i:i+len(n)]) == needle { - return true - } - } - return false -} - -func bytesLines(b []byte) []string { - var out []string - start := 0 - for i, c := range b { - if c == '\n' { - if i > start { - out = append(out, string(b[start:i])) - } - start = i + 1 - } - } - if start < len(b) { - out = append(out, string(b[start:])) - } - return out -} diff --git a/internal/daemon/procattr_other.go b/internal/daemon/procattr_other.go deleted file mode 100644 index d7baa02..0000000 --- a/internal/daemon/procattr_other.go +++ /dev/null @@ -1,9 +0,0 @@ -//go:build !unix - -package daemon - -import "syscall" - -// procAttrNewGroup is a no-op on non-unix platforms (Windows). The -// daemon isn't supported there yet anyway. -func procAttrNewGroup() *syscall.SysProcAttr { return nil } diff --git a/internal/daemon/procattr_unix.go b/internal/daemon/procattr_unix.go deleted file mode 100644 index 696d4b9..0000000 --- a/internal/daemon/procattr_unix.go +++ /dev/null @@ -1,12 +0,0 @@ -//go:build unix - -package daemon - -import "syscall" - -// procAttrNewGroup returns SysProcAttr that puts the child in its own -// process group, so signals to our PID don't reach it (and so we can -// signal -PGID to take down the entire subtree). -func procAttrNewGroup() *syscall.SysProcAttr { - return &syscall.SysProcAttr{Setpgid: true} -} diff --git a/internal/model/llamacli/llamacli.go b/internal/model/llamacli/llamacli.go new file mode 100644 index 0000000..3f7439e --- /dev/null +++ b/internal/model/llamacli/llamacli.go @@ -0,0 +1,313 @@ +// Package llamacli runs local inference by shelling out to llama.cpp's +// `llama-cli` binary one-shot, instead of talking to a long-lived server. +// +// Each Complete/CompleteStructured call spawns `llama-cli` with the model, +// a JSON-schema grammar constraint, and the flattened conversation, then +// parses the single JSON object the model prints to stdout. There is no +// daemon, no HTTP, and no warm process: the OS process *is* the request. +// +// Trade-off vs. the old `llamafile --server` path: every call pays the +// model-load cost. In exchange there is nothing to supervise, nothing +// bound to a socket, and nothing to leave running. The grammar constraint +// (`--json-schema`) is the same mechanism llama.cpp's server used, so the +// output contract is unchanged. +package llamacli + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "os/exec" + "strings" + "time" + + "github.com/CoreyRDean/intent/internal/model" +) + +// Backend drives one-shot `llama-cli` inference. +type Backend struct { + // BinaryPath is the resolved llama-cli executable. May be a bare + // "llama-cli" if it's expected to be found on PATH at exec time. + BinaryPath string + // ModelPath is the absolute path to the .gguf to load. + ModelPath string + // ModelTag is cosmetic; it feeds the cache identity so switching + // models invalidates cached proposals. + ModelTag string + // ContextSize maps to -c (0 = let llama.cpp use the model default). + ContextSize int + // GPULayers maps to -ngl (-1 = let llama.cpp decide). + GPULayers int + // ExtraArgs are appended verbatim, for power users / debugging. + ExtraArgs []string + // Timeout caps a single inference. 0 = no deadline beyond ctx. + Timeout time.Duration +} + +// New constructs a Backend for the given binary and model. +func New(binary, modelPath string) *Backend { + if binary == "" { + binary = "llama-cli" + } + return &Backend{ + BinaryPath: binary, + ModelPath: modelPath, + GPULayers: -1, + Timeout: 5 * time.Minute, + } +} + +func (b *Backend) Name() string { return "llama-cli" } + +func (b *Backend) CacheIdentity() string { + return strings.Join([]string{b.Name(), b.ModelPath, b.ModelTag}, "|") +} + +// Available verifies the binary resolves and the model file is present. +func (b *Backend) Available(ctx context.Context) error { + if b.ModelPath == "" { + return fmt.Errorf("llama-cli: no model path configured") + } + if _, err := exec.LookPath(b.BinaryPath); err != nil { + return fmt.Errorf("llama-cli not found (%s): %w", b.BinaryPath, err) + } + return nil +} + +// Complete runs inference constrained to the standard Response envelope. +func (b *Backend) Complete(ctx context.Context, in model.CompleteRequest) (*model.Response, error) { + content, err := b.run(ctx, in.Messages, []byte(model.SchemaJSON), in.Temperature, in.MaxTokens, in.Seed) + if err != nil { + return nil, err + } + var out model.Response + if err := json.Unmarshal([]byte(content), &out); err != nil { + return nil, fmt.Errorf("model output not valid JSON: %w (got %q)", err, truncate(content, 200)) + } + backfillRequiredFields(&out) + if err := out.Validate(); err != nil { + return nil, fmt.Errorf("model response failed schema: %w (got %q)", err, truncate(content, 400)) + } + return &out, nil +} + +// CompleteStructured implements model.StructuredBackend: the caller- +// supplied schema is enforced by llama.cpp's grammar, so the returned +// bytes are already schema-valid JSON. +func (b *Backend) CompleteStructured(ctx context.Context, in model.StructuredRequest) ([]byte, error) { + if len(in.SchemaJSON) == 0 { + return nil, fmt.Errorf("CompleteStructured: SchemaJSON is required") + } + content, err := b.run(ctx, in.Messages, in.SchemaJSON, in.Temperature, in.MaxTokens, in.Seed) + if err != nil { + return nil, err + } + var any json.RawMessage + if err := json.Unmarshal([]byte(content), &any); err != nil { + return nil, fmt.Errorf("structured output not valid JSON: %w (got %q)", err, truncate(content, 200)) + } + return []byte(content), nil +} + +// run spawns llama-cli once and returns the JSON object it produced. +func (b *Backend) run(ctx context.Context, messages []model.Message, schema []byte, temp float64, maxTok int, seed *int64) (string, error) { + if b.ModelPath == "" { + return "", fmt.Errorf("llama-cli: no model path configured") + } + system, prompt := flattenMessages(messages) + + if b.Timeout > 0 { + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(ctx, b.Timeout) + defer cancel() + } + + args := []string{ + "-m", b.ModelPath, + // Conversation + single-turn: apply the model's chat template, + // process exactly one user turn, then exit. This is the supported + // scripting mode — no interactive loop, no hanging on stdin. + "-cnv", "-st", + "--no-display-prompt", + "--json-schema", string(schema), + "--temp", fmt.Sprintf("%g", temp), + } + if maxTok > 0 { + args = append(args, "-n", fmt.Sprintf("%d", maxTok)) + } + if seed != nil { + args = append(args, "-s", fmt.Sprintf("%d", *seed)) + } + if b.ContextSize > 0 { + args = append(args, "-c", fmt.Sprintf("%d", b.ContextSize)) + } + if b.GPULayers >= 0 { + args = append(args, "-ngl", fmt.Sprintf("%d", b.GPULayers)) + } + if system != "" { + args = append(args, "-sys", system) + } + args = append(args, "-p", prompt) + args = append(args, b.ExtraArgs...) + + cmd := exec.CommandContext(ctx, b.BinaryPath, args...) + var stdout, stderr bytes.Buffer + cmd.Stdout = &stdout + cmd.Stderr = &stderr + // Give the child an empty stdin so conversation mode sees EOF and + // never blocks waiting for a second turn. + cmd.Stdin = bytes.NewReader(nil) + + if err := cmd.Run(); err != nil { + if ctx.Err() == context.DeadlineExceeded { + return "", fmt.Errorf("llama-cli timed out after %s", b.Timeout) + } + return "", fmt.Errorf("llama-cli failed: %w (stderr: %s)", err, truncate(strings.TrimSpace(stderr.String()), 400)) + } + + // Generation goes to stdout; llama.cpp logs/timings go to stderr. + // Extract the first balanced JSON object from stdout, falling back + // to stderr in case a build routes the message there. + if obj := extractJSONObject(stdout.String()); obj != "" { + return obj, nil + } + if obj := extractJSONObject(stderr.String()); obj != "" { + return obj, nil + } + return "", fmt.Errorf("llama-cli produced no JSON object (stdout: %q)", truncate(strings.TrimSpace(stdout.String()), 400)) +} + +// flattenMessages splits a conversation into a single system prompt and a +// single user-turn transcript. llama-cli takes one -sys and one -p, so +// multi-turn context (the engine's tool-call loop) is rendered as labeled +// text inside the user turn. The schema grammar still pins the output +// shape regardless of how the context is framed. +func flattenMessages(msgs []model.Message) (system, prompt string) { + var sys []string + var convo []string + for _, m := range msgs { + switch m.Role { + case "system": + sys = append(sys, m.Content) + case "assistant": + convo = append(convo, "Assistant (previous response):\n"+m.Content) + case "tool": + name := m.Name + if name == "" { + name = "tool" + } + convo = append(convo, fmt.Sprintf("Result of %s:\n%s", name, m.Content)) + default: // user and anything else + convo = append(convo, m.Content) + } + } + return strings.Join(sys, "\n\n"), strings.Join(convo, "\n\n") +} + +// extractJSONObject returns the first balanced top-level {...} object in s, +// tolerating ```json fences, leading log noise, and trailing EOS markers +// that small local models and llama-cli sometimes emit around the payload. +func extractJSONObject(s string) string { + s = stripFences(strings.TrimSpace(s)) + start := strings.IndexByte(s, '{') + if start < 0 { + return "" + } + depth := 0 + inStr := false + esc := false + for i := start; i < len(s); i++ { + c := s[i] + if inStr { + switch { + case esc: + esc = false + case c == '\\': + esc = true + case c == '"': + inStr = false + } + continue + } + switch c { + case '"': + inStr = true + case '{': + depth++ + case '}': + depth-- + if depth == 0 { + return s[start : i+1] + } + } + } + return "" +} + +// stripFences tolerates a model that wraps JSON in ```json ... ``` fences. +func stripFences(s string) string { + s = strings.TrimSpace(s) + if !strings.HasPrefix(s, "```") { + return s + } + s = strings.TrimPrefix(s, "```json") + s = strings.TrimPrefix(s, "```") + s = strings.TrimSuffix(s, "```") + return strings.TrimSpace(s) +} + +// backfillRequiredFields supplies sane defaults for fields small local +// models routinely omit despite the schema. We never invent the command +// itself; we only fill metadata the safety guard and TUI need. Mirrors +// the llamafile backend so behaviour is identical across local runtimes. +func backfillRequiredFields(r *model.Response) { + if r == nil { + return + } + if r.Description == "" { + switch { + case r.Command != "": + r.Description = "Run: " + truncate(r.Command, 120) + case r.Script != nil && r.Script.Body != "": + first := strings.SplitN(r.Script.Body, "\n", 2)[0] + r.Description = "Run script (" + r.Script.Interpreter + "): " + truncate(first, 100) + case r.StdoutToUser != "": + r.Description = "Print informational answer." + case r.ToolCall != nil && r.ToolCall.Name != "": + r.Description = "Gather context via " + r.ToolCall.Name + "." + case r.ClarifyingQuestion != "": + r.Description = "Ask the user a clarifying question." + case r.RefusalReason != "": + r.Description = "Refuse this request." + default: + r.Description = "(no description provided by model)" + } + } + if r.Risk == "" { + r.Risk = model.RiskSafe + } + if r.Approach == "" { + switch { + case r.Script != nil && r.Script.Body != "": + r.Approach = model.ApproachScript + case r.Command != "": + r.Approach = model.ApproachCommand + case r.ToolCall != nil && r.ToolCall.Name != "": + r.Approach = model.ApproachToolCall + case r.StdoutToUser != "": + r.Approach = model.ApproachInform + case r.ClarifyingQuestion != "": + r.Approach = model.ApproachClarify + case r.RefusalReason != "": + r.Approach = model.ApproachRefuse + } + } +} + +func truncate(s string, n int) string { + if len(s) <= n { + return s + } + return s[:n] + "..." +} diff --git a/internal/model/llamacli/llamacli_test.go b/internal/model/llamacli/llamacli_test.go new file mode 100644 index 0000000..3df8198 --- /dev/null +++ b/internal/model/llamacli/llamacli_test.go @@ -0,0 +1,85 @@ +package llamacli + +import ( + "testing" + + "github.com/CoreyRDean/intent/internal/model" +) + +func TestExtractJSONObject(t *testing.T) { + tests := []struct { + name string + in string + want string + }{ + {"plain object", `{"a":1}`, `{"a":1}`}, + {"leading log noise", "loading model...\nllama_init\n{\"a\":1}\n", `{"a":1}`}, + {"trailing eos marker", `{"a":1} [end of text]`, `{"a":1}`}, + {"code fence", "```json\n{\"a\":1}\n```", `{"a":1}`}, + {"nested braces", `{"a":{"b":2},"c":3}`, `{"a":{"b":2},"c":3}`}, + {"brace inside string", `{"a":"}{","b":1}`, `{"a":"}{","b":1}`}, + {"escaped quote in string", `{"a":"he said \"hi\"","b":1}`, `{"a":"he said \"hi\"","b":1}`}, + {"no object", `just text`, ``}, + {"trailing garbage after object", `{"a":1}garbage{`, `{"a":1}`}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := extractJSONObject(tt.in); got != tt.want { + t.Errorf("extractJSONObject(%q) = %q, want %q", tt.in, got, tt.want) + } + }) + } +} + +func TestFlattenMessages(t *testing.T) { + msgs := []model.Message{ + {Role: "system", Content: "you are a shell"}, + {Role: "system", Content: "be terse"}, + {Role: "user", Content: "list files"}, + {Role: "assistant", Content: `{"approach":"tool_call"}`}, + {Role: "tool", Name: "list_dir", Content: `{"files":["a","b"]}`}, + } + system, prompt := flattenMessages(msgs) + + if system != "you are a shell\n\nbe terse" { + t.Fatalf("system = %q", system) + } + for _, want := range []string{"list files", "Assistant (previous response)", "Result of list_dir"} { + if !contains(prompt, want) { + t.Errorf("prompt missing %q; got %q", want, prompt) + } + } +} + +func TestFlattenMessages_UnnamedTool(t *testing.T) { + _, prompt := flattenMessages([]model.Message{ + {Role: "tool", Content: "result"}, + }) + if !contains(prompt, "Result of tool:") { + t.Fatalf("expected default tool label, got %q", prompt) + } +} + +func TestCacheIdentity_DistinctPerModel(t *testing.T) { + a := New("llama-cli", "/cache/models/a.gguf") + b := New("llama-cli", "/cache/models/b.gguf") + if a.CacheIdentity() == b.CacheIdentity() { + t.Fatal("different models should yield different cache identities") + } + if a.Name() != "llama-cli" { + t.Fatalf("Name() = %q", a.Name()) + } +} + +func contains(haystack, needle string) bool { + return len(needle) == 0 || (len(haystack) >= len(needle) && indexOf(haystack, needle) >= 0) +} + +func indexOf(s, sub string) int { + for i := 0; i+len(sub) <= len(s); i++ { + if s[i:i+len(sub)] == sub { + return i + } + } + return -1 +} diff --git a/internal/model/llamaserver/llamaserver.go b/internal/model/llamaserver/llamaserver.go new file mode 100644 index 0000000..7044f16 --- /dev/null +++ b/internal/model/llamaserver/llamaserver.go @@ -0,0 +1,299 @@ +// Package llamaserver runs local inference through a request-scoped +// llama.cpp `llama-server` child process. +// +// Unlike the one-shot `llama-cli` path, the server is started once on the +// first inference of an `intent` invocation and held warm for the rest of +// that invocation — so the engine's tool-call loop reuses the loaded +// weights and KV cache instead of reloading the model on every step. And +// because the server speaks the OpenAI-compatible /v1/chat/completions +// API, the native messages array (system/user/assistant/tool) is sent +// as-is: no flattening into a single prompt. +// +// It is *not* a daemon. The process is bound to a private loopback port, +// owned by this one CLI invocation, and killed on Close (and, on Linux, +// auto-killed by the kernel if intent dies — see procAttr). There is no +// persistent listener and nothing to manage between commands. +package llamaserver + +import ( + "context" + "fmt" + "io" + "net" + "net/http" + "os" + "os/exec" + "sync" + "syscall" + "time" + + "github.com/CoreyRDean/intent/internal/model" + "github.com/CoreyRDean/intent/internal/model/llamafile" +) + +// Backend manages the llama-server child and delegates inference to an +// OpenAI-compatible HTTP client pointed at it. The zero value is not +// usable; construct with New. +type Backend struct { + BinaryPath string // path to llama-server + ModelPath string // path to the .gguf to load + ModelTag string // cosmetic; feeds the cache identity + ContextSize int // -c, 0 = llama.cpp default + GPULayers int // -ngl, -1 = let llama.cpp decide + Host string // loopback only; default 127.0.0.1 + StartupGrace time.Duration // how long to wait for /health + + startOnce sync.Once + startErr error + + mu sync.Mutex + cmd *exec.Cmd + inner *llamafile.Backend + stopped bool + logBuf *cappedBuffer +} + +// New constructs a Backend for the given binary and model. +func New(binary, modelPath string) *Backend { + if binary == "" { + binary = "llama-server" + } + return &Backend{ + BinaryPath: binary, + ModelPath: modelPath, + Host: "127.0.0.1", + GPULayers: -1, + StartupGrace: 120 * time.Second, + logBuf: &cappedBuffer{max: 8 << 10}, + } +} + +func (b *Backend) Name() string { return "llama-server" } + +// CacheIdentity is derived from config alone so it can be computed for +// the cache key without starting the server. +func (b *Backend) CacheIdentity() string { + return "llama-server|" + b.ModelPath + "|" + b.ModelTag +} + +// Available ensures the server is up and healthy. +func (b *Backend) Available(ctx context.Context) error { + if err := b.ensureStarted(ctx); err != nil { + return err + } + return b.inner.Available(ctx) +} + +// Complete starts the server if needed, then delegates over HTTP. The +// inner client sends the native messages array, so there is no flattening. +func (b *Backend) Complete(ctx context.Context, in model.CompleteRequest) (*model.Response, error) { + if err := b.ensureStarted(ctx); err != nil { + return nil, err + } + return b.inner.Complete(ctx, in) +} + +// CompleteStructured implements model.StructuredBackend via the inner +// HTTP client's grammar-constrained path. +func (b *Backend) CompleteStructured(ctx context.Context, in model.StructuredRequest) ([]byte, error) { + if err := b.ensureStarted(ctx); err != nil { + return nil, err + } + return b.inner.CompleteStructured(ctx, in) +} + +// ensureStarted spawns llama-server exactly once and waits for /health. +func (b *Backend) ensureStarted(ctx context.Context) error { + b.startOnce.Do(func() { b.startErr = b.start(ctx) }) + return b.startErr +} + +func (b *Backend) start(ctx context.Context) error { + if b.ModelPath == "" { + return fmt.Errorf("llama-server: no model path configured") + } + host := b.Host + if host == "" { + host = "127.0.0.1" + } + port, err := freeLoopbackPort(host) + if err != nil { + return fmt.Errorf("llama-server: pick port: %w", err) + } + + args := []string{ + "-m", b.ModelPath, + "--host", host, + "--port", fmt.Sprintf("%d", port), + } + if b.ContextSize > 0 { + args = append(args, "-c", fmt.Sprintf("%d", b.ContextSize)) + } + if b.GPULayers >= 0 { + args = append(args, "-ngl", fmt.Sprintf("%d", b.GPULayers)) + } + + cmd := exec.Command(b.BinaryPath, args...) + cmd.Stdout = b.logBuf + cmd.Stderr = b.logBuf + cmd.Stdin = nil + cmd.SysProcAttr = procAttr() + if err := cmd.Start(); err != nil { + return fmt.Errorf("start llama-server (%s): %w", b.BinaryPath, err) + } + + b.mu.Lock() + b.cmd = cmd + b.mu.Unlock() + + endpoint := fmt.Sprintf("http://%s:%d", host, port) + if err := b.waitHealthy(ctx, endpoint); err != nil { + b.kill() + return fmt.Errorf("llama-server did not become ready: %w (log tail: %s)", err, b.logBuf.String()) + } + + inner := llamafile.New(endpoint) + inner.ModelTag = b.ModelTag + b.mu.Lock() + b.inner = inner + b.mu.Unlock() + return nil +} + +// waitHealthy polls /health until the server reports ready, the process +// exits, ctx is canceled, or StartupGrace elapses. Startup is bounded by +// StartupGrace independent of any short per-request deadline, but ctx +// cancellation (e.g. Ctrl-C) still aborts immediately. +func (b *Backend) waitHealthy(ctx context.Context, endpoint string) error { + deadline := time.Now().Add(b.StartupGrace) + cli := &http.Client{Timeout: 2 * time.Second} + for time.Now().Before(deadline) { + if ctx.Err() != nil { + return ctx.Err() + } + if !b.processAlive() { + return fmt.Errorf("process exited before becoming ready") + } + req, _ := http.NewRequestWithContext(ctx, http.MethodGet, endpoint+"/health", nil) + resp, err := cli.Do(req) + if err == nil { + _ = resp.Body.Close() + if resp.StatusCode == http.StatusOK { + return nil + } + } + time.Sleep(200 * time.Millisecond) + } + return fmt.Errorf("timeout after %s", b.StartupGrace) +} + +func (b *Backend) processAlive() bool { + b.mu.Lock() + cmd := b.cmd + b.mu.Unlock() + if cmd == nil || cmd.Process == nil { + return false + } + return cmd.Process.Signal(syscall.Signal(0)) == nil +} + +// Close kills the server. Safe to call multiple times and when the +// server never started. It implements io.Closer so the CLI can defer it. +func (b *Backend) Close() error { + b.kill() + return nil +} + +func (b *Backend) kill() { + b.mu.Lock() + cmd := b.cmd + if b.stopped || cmd == nil || cmd.Process == nil { + b.stopped = true + b.mu.Unlock() + return + } + b.stopped = true + b.mu.Unlock() + + pid := cmd.Process.Pid + signalGroup := func(sig syscall.Signal) { + // Signal the whole process group (negative pid) so any worker + // llama-server forked also dies; fall back to the bare pid. + if pgid, err := syscall.Getpgid(pid); err == nil && pgid > 0 && pgid != os.Getpid() { + _ = syscall.Kill(-pgid, sig) + } + _ = cmd.Process.Signal(sig) + } + + signalGroup(syscall.SIGTERM) + done := make(chan struct{}) + go func() { _, _ = cmd.Process.Wait(); close(done) }() + select { + case <-done: + case <-time.After(3 * time.Second): + signalGroup(syscall.SIGKILL) + <-waitOrTimeout(cmd, 2*time.Second) + } +} + +func waitOrTimeout(cmd *exec.Cmd, d time.Duration) <-chan struct{} { + ch := make(chan struct{}) + go func() { + _, _ = cmd.Process.Wait() + close(ch) + }() + out := make(chan struct{}) + go func() { + select { + case <-ch: + case <-time.After(d): + } + close(out) + }() + return out +} + +// freeLoopbackPort asks the OS for an unused port on host by binding :0 +// and reading back the assignment. There is a small race between closing +// the probe listener and llama-server binding the port, but for a local +// request-scoped child that is acceptable. +func freeLoopbackPort(host string) (int, error) { + ln, err := net.Listen("tcp", net.JoinHostPort(host, "0")) + if err != nil { + return 0, err + } + defer ln.Close() + return ln.Addr().(*net.TCPAddr).Port, nil +} + +// cappedBuffer is a goroutine-safe writer that retains only the last max +// bytes — enough to surface a startup failure without growing unbounded +// over a long session. +type cappedBuffer struct { + mu sync.Mutex + buf []byte + max int +} + +func (c *cappedBuffer) Write(p []byte) (int, error) { + c.mu.Lock() + defer c.mu.Unlock() + c.buf = append(c.buf, p...) + if len(c.buf) > c.max { + c.buf = c.buf[len(c.buf)-c.max:] + } + return len(p), nil +} + +func (c *cappedBuffer) String() string { + c.mu.Lock() + defer c.mu.Unlock() + return string(c.buf) +} + +// compile-time assertions for the optional capabilities the CLI relies on. +var ( + _ model.Backend = (*Backend)(nil) + _ model.StructuredBackend = (*Backend)(nil) + _ io.Closer = (*Backend)(nil) +) diff --git a/internal/model/llamaserver/llamaserver_test.go b/internal/model/llamaserver/llamaserver_test.go new file mode 100644 index 0000000..200ae2f --- /dev/null +++ b/internal/model/llamaserver/llamaserver_test.go @@ -0,0 +1,63 @@ +package llamaserver + +import ( + "strings" + "testing" +) + +func TestFreeLoopbackPort(t *testing.T) { + p, err := freeLoopbackPort("127.0.0.1") + if err != nil { + t.Fatalf("freeLoopbackPort: %v", err) + } + if p <= 0 || p > 65535 { + t.Fatalf("port out of range: %d", p) + } +} + +func TestCappedBuffer_RetainsTail(t *testing.T) { + b := &cappedBuffer{max: 8} + if _, err := b.Write([]byte("abcdefghij")); err != nil { // 10 bytes into an 8-byte cap + t.Fatal(err) + } + if got := b.String(); got != "cdefghij" { + t.Fatalf("capped buffer = %q, want %q", got, "cdefghij") + } +} + +func TestCappedBuffer_ReportsFullWriteLen(t *testing.T) { + b := &cappedBuffer{max: 4} + n, err := b.Write([]byte("hello")) + if err != nil { + t.Fatal(err) + } + if n != 5 { + t.Fatalf("Write returned %d, want 5 (full input length)", n) + } +} + +func TestCacheIdentity_DistinctPerModel(t *testing.T) { + a := New("llama-server", "/cache/models/a.gguf") + b := New("llama-server", "/cache/models/b.gguf") + if a.CacheIdentity() == b.CacheIdentity() { + t.Fatal("different models should yield different cache identities") + } + if !strings.HasPrefix(a.CacheIdentity(), "llama-server|") { + t.Fatalf("cache identity = %q, want llama-server prefix", a.CacheIdentity()) + } + if a.Name() != "llama-server" { + t.Fatalf("Name() = %q", a.Name()) + } +} + +// Close must be safe when the server never started (no process spawned). +func TestClose_BeforeStartIsNoop(t *testing.T) { + b := New("llama-server", "/cache/models/a.gguf") + if err := b.Close(); err != nil { + t.Fatalf("Close before start: %v", err) + } + // Idempotent. + if err := b.Close(); err != nil { + t.Fatalf("second Close: %v", err) + } +} diff --git a/internal/model/llamaserver/procattr_linux.go b/internal/model/llamaserver/procattr_linux.go new file mode 100644 index 0000000..5d717c3 --- /dev/null +++ b/internal/model/llamaserver/procattr_linux.go @@ -0,0 +1,17 @@ +//go:build linux + +package llamaserver + +import "syscall" + +// procAttr puts llama-server in its own process group (so we can signal +// the whole group) and asks the kernel to SIGKILL it if the parent +// intent process dies — even on a crash or -9, where our deferred Close +// never runs. This is the safety net that keeps a request-scoped child +// from being orphaned with the model resident in memory. +func procAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{ + Setpgid: true, + Pdeathsig: syscall.SIGKILL, + } +} diff --git a/internal/model/llamaserver/procattr_other.go b/internal/model/llamaserver/procattr_other.go new file mode 100644 index 0000000..6f7b81a --- /dev/null +++ b/internal/model/llamaserver/procattr_other.go @@ -0,0 +1,13 @@ +//go:build !linux + +package llamaserver + +import "syscall" + +// procAttr puts llama-server in its own process group so Close can signal +// the whole group. Non-Linux platforms (notably macOS) have no +// Pdeathsig equivalent, so a hard kill (-9) of the parent can orphan the +// child; normal exits and signals are handled by the deferred Close. +func procAttr() *syscall.SysProcAttr { + return &syscall.SysProcAttr{Setpgid: true} +} diff --git a/internal/runtime/llamacli.go b/internal/runtime/llamacli.go new file mode 100644 index 0000000..6493a65 --- /dev/null +++ b/internal/runtime/llamacli.go @@ -0,0 +1,183 @@ +package runtime + +import ( + "context" + "fmt" + "os" + "os/exec" + "path/filepath" + "runtime" + "strings" +) + +// llama.cpp ships several frontends; intent uses two: +// +// - llama-server: a request-scoped child process that holds the model +// warm for the duration of one `intent` invocation and speaks the +// OpenAI-compatible HTTP API (native multi-turn messages + grammar). +// This is the preferred local backend. +// - llama-cli: a one-shot fallback used when llama-server isn't present. +// +// Both come from the same package (`brew install llama.cpp`), so the +// installer ensures the package and the resolvers find whichever binaries +// landed on PATH (or in a known Homebrew prefix). +const ( + LlamaServerBinary = "llama-server" + LlamaCLIBinary = "llama-cli" +) + +// binaryCandidates lists well-known absolute locations to check when a +// llama.cpp binary isn't on PATH — chiefly the Homebrew prefixes on macOS +// and Linux, where `brew install llama.cpp` drops the binaries. +func binaryCandidates(bin string) []string { + paths := []string{ + "/opt/homebrew/bin/" + bin, // Apple Silicon brew + "/usr/local/bin/" + bin, // Intel mac / manual + "/home/linuxbrew/.linuxbrew/bin/" + bin, // shared linuxbrew + } + if home := os.Getenv("HOME"); home != "" { + paths = append(paths, filepath.Join(home, ".linuxbrew", "bin", bin)) + } + return paths +} + +// isExecutable reports whether path exists and has an executable bit set. +func isExecutable(path string) bool { + info, err := os.Stat(path) + if err != nil { + return false + } + return !info.IsDir() && info.Mode()&0o111 != 0 +} + +// resolveBinary returns a usable path for bin: PATH first, then known +// install locations. Falls back to the bare name so callers still get a +// value to hand exec (which re-resolves via PATH) when nothing is found. +func resolveBinary(bin string) string { + if p, err := exec.LookPath(bin); err == nil { + return p + } + for _, p := range binaryCandidates(bin) { + if isExecutable(p) { + return p + } + } + return bin +} + +// haveBinary reports whether bin resolves to something runnable. +func haveBinary(bin string) bool { + if _, err := exec.LookPath(bin); err == nil { + return true + } + for _, p := range binaryCandidates(bin) { + if isExecutable(p) { + return true + } + } + return false +} + +// LlamaServerPath resolves the llama-server binary. +func (m *Manager) LlamaServerPath() string { return resolveBinary(LlamaServerBinary) } + +// HaveLlamaServer reports whether llama-server is installed. +func (m *Manager) HaveLlamaServer() bool { return haveBinary(LlamaServerBinary) } + +// LlamaCLIPath resolves the llama-cli binary. +func (m *Manager) LlamaCLIPath() string { return resolveBinary(LlamaCLIBinary) } + +// HaveLlamaCLI reports whether llama-cli is installed. +func (m *Manager) HaveLlamaCLI() bool { return haveBinary(LlamaCLIBinary) } + +// HaveLlamaRuntime reports whether a usable llama.cpp frontend is present. +// Either binary is enough to run local inference (server preferred, cli +// as a one-shot fallback). +func (m *Manager) HaveLlamaRuntime() bool { + return m.HaveLlamaServer() || m.HaveLlamaCLI() +} + +// pkgManager describes how to install llama.cpp via one system package +// manager. cmd is the manager binary; args are the install arguments +// (package name included); needsSudo asks us to prefix sudo when we are +// not already root. +type pkgManager struct { + name string + cmd string + args []string + needsSudo bool +} + +// llamaManagers is the ordered preference list. Homebrew is first because +// it ships an official, up-to-date `llama.cpp` formula (with both +// llama-server and llama-cli) on macOS and Linux; the native managers are +// best-effort fallbacks. +func llamaManagers() []pkgManager { + managers := []pkgManager{ + {name: "Homebrew", cmd: "brew", args: []string{"install", "llama.cpp"}}, + } + if runtime.GOOS == "linux" { + managers = append(managers, + pkgManager{name: "apt", cmd: "apt-get", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true}, + pkgManager{name: "dnf", cmd: "dnf", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true}, + pkgManager{name: "pacman", cmd: "pacman", args: []string{"-S", "--noconfirm", "llama.cpp"}, needsSudo: true}, + pkgManager{name: "zypper", cmd: "zypper", args: []string{"install", "-y", "llama.cpp"}, needsSudo: true}, + ) + } + return managers +} + +// EnsureLlamaRuntime installs llama.cpp via the system package manager if +// neither llama-server nor llama-cli is already available. log, if +// non-nil, receives human-readable progress lines. It returns a clear, +// actionable error when no supported package manager is found or the +// install fails. +func (m *Manager) EnsureLlamaRuntime(ctx context.Context, log func(string)) error { + if m.HaveLlamaRuntime() { + return nil + } + logf := func(format string, a ...any) { + if log != nil { + log(fmt.Sprintf(format, a...)) + } + } + + var available []pkgManager + for _, pm := range llamaManagers() { + if _, err := exec.LookPath(pm.cmd); err == nil { + available = append(available, pm) + } + } + if len(available) == 0 { + return fmt.Errorf("no supported package manager found to install llama.cpp.\n" + + " Install Homebrew (https://brew.sh) and run `brew install llama.cpp`,\n" + + " or install llama.cpp from https://github.com/ggml-org/llama.cpp") + } + + var lastErr error + for _, pm := range available { + name, args := pm.cmd, pm.args + if pm.needsSudo && os.Geteuid() != 0 { + if _, err := exec.LookPath("sudo"); err == nil { + args = append([]string{name}, args...) + name = "sudo" + } + } + logf("installing llama.cpp via %s (%s %s)...", pm.name, name, strings.Join(args, " ")) + cmd := exec.CommandContext(ctx, name, args...) + cmd.Stdout = os.Stderr + cmd.Stderr = os.Stderr + if err := cmd.Run(); err != nil { + lastErr = fmt.Errorf("%s install failed: %w", pm.name, err) + logf(" %s failed: %v", pm.name, err) + continue + } + if m.HaveLlamaRuntime() { + logf("llama.cpp installed.") + return nil + } + lastErr = fmt.Errorf("%s reported success but no llama.cpp binary was found", pm.name) + } + return fmt.Errorf("could not install llama.cpp automatically: %w.\n"+ + " Try `brew install llama.cpp` or build from https://github.com/ggml-org/llama.cpp", lastErr) +} diff --git a/internal/runtime/runtime.go b/internal/runtime/runtime.go index f3e128c..8ada7b9 100644 --- a/internal/runtime/runtime.go +++ b/internal/runtime/runtime.go @@ -1,7 +1,8 @@ -// Package runtime manages the local llamafile binary and model files. -// In v1 it can: report whether a runtime/model is present, and download -// either on demand with progress callbacks. Actually starting llamafile as a -// subprocess is wired into Phase 4 (daemon). +// Package runtime manages the local inference runtime (llama.cpp's +// `llama-cli`, installed via the system package manager — see +// llamacli.go) and the GGUF model files it loads. It can report whether +// the runtime/model is present and download models on demand with +// progress callbacks. package runtime import ( @@ -16,9 +17,6 @@ import ( "github.com/CoreyRDean/intent/internal/models" ) -// LlamafileVersion is the runtime version we ship against. -const LlamafileVersion = "0.10.0" - // ModelInfo is the minimal shape the runtime package needs to // download a model. It's a projection of models.Model kept here for // backward compatibility; new code should pass models.Model around. @@ -80,25 +78,11 @@ type Manager struct { func New(cacheDir string) *Manager { return &Manager{CacheDir: cacheDir} } -// LlamafilePath returns the expected path of the llamafile binary. -func (m *Manager) LlamafilePath() string { - return filepath.Join(m.CacheDir, "runtime", "llamafile-"+LlamafileVersion) -} - // ModelPath returns the expected path of the named model file. func (m *Manager) ModelPath(file string) string { return filepath.Join(m.CacheDir, "models", file) } -// HaveLlamafile reports whether the runtime exists and is executable. -func (m *Manager) HaveLlamafile() bool { - info, err := os.Stat(m.LlamafilePath()) - if err != nil { - return false - } - return info.Mode()&0o111 != 0 -} - // HaveModel reports whether the named model file exists. func (m *Manager) HaveModel(file string) bool { _, err := os.Stat(m.ModelPath(file)) @@ -108,22 +92,6 @@ func (m *Manager) HaveModel(file string) bool { // Progress is a download progress callback. type Progress func(downloaded, total int64) -// EnsureLlamafile downloads the runtime if missing. -func (m *Manager) EnsureLlamafile(ctx context.Context, progress Progress) error { - if m.HaveLlamafile() { - return nil - } - if err := os.MkdirAll(filepath.Dir(m.LlamafilePath()), 0o755); err != nil { - return err - } - url := fmt.Sprintf("https://github.com/mozilla-ai/llamafile/releases/download/%s/llamafile-%s", - LlamafileVersion, LlamafileVersion) - if err := download(ctx, url, m.LlamafilePath(), progress); err != nil { - return fmt.Errorf("download llamafile: %w", err) - } - return os.Chmod(m.LlamafilePath(), 0o755) -} - // EnsureModel downloads the model if missing. func (m *Manager) EnsureModel(ctx context.Context, mi ModelInfo, progress Progress) error { dest := m.ModelPath(mi.File) diff --git a/internal/verbose/backend.go b/internal/verbose/backend.go index c2cfb87..cf7b91d 100644 --- a/internal/verbose/backend.go +++ b/internal/verbose/backend.go @@ -2,6 +2,7 @@ package verbose import ( "context" + "io" "time" "github.com/CoreyRDean/intent/internal/model" @@ -41,6 +42,16 @@ func (v *vb) CacheIdentity() string { return v.inner.Name() } +// Close forwards to the wrapped backend if it owns resources (e.g. the +// llama-server co-process), so callers can defer Close through the +// verbose decorator. No-op when the inner backend isn't a Closer. +func (v *vb) Close() error { + if c, ok := v.inner.(io.Closer); ok { + return c.Close() + } + return nil +} + func (v *vb) Complete(ctx context.Context, req model.CompleteRequest) (*model.Response, error) { v.log.Section("model request (envelope)") v.log.KV("backend", v.inner.Name())