diff --git a/.cursor/rules/operator-fork.mdc b/.cursor/rules/operator-fork.mdc new file mode 100644 index 0000000000..d0db843630 --- /dev/null +++ b/.cursor/rules/operator-fork.mdc @@ -0,0 +1,13 @@ +--- +description: Operator fork - canonical agent guide is docs/operator/AGENTS.md only +alwaysApply: true +--- + +# heavygee/hapi operator fork + +This repo is a **fork** of `tiann/hapi`. Root **`AGENTS.md` is intentionally absent** - upstream's copy is not used here. + +**Read `docs/operator/AGENTS.md`** - single canonical guide (HAPI baseline + fork intent + upstream PR discipline). + +- Upstream PR branches: from `upstream/main` only; never include `docs/operator/`, `docs/plans/`, or `AGENTS.md` edits in the PR diff. +- Plans: `docs/plans/`. diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..5ab9ec991a --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Fork-only: keep AGENTS.md deleted when merging upstream/main (requires: git config merge.ours.driver true) +AGENTS.md merge=ours diff --git a/AGENTS.md b/AGENTS.md deleted file mode 100644 index d924ba67df..0000000000 --- a/AGENTS.md +++ /dev/null @@ -1,148 +0,0 @@ -# AGENTS.md - -Work style: telegraph; noun-phrases ok; drop grammar; - -Short guide for AI agents in this repo. Prefer progressive loading: start with the root README, then package READMEs as needed. - -## What is HAPI? - -Local-first platform for running AI coding agents (Claude Code, Codex, Gemini) with remote control via web/phone. CLI wraps agents and connects to hub; hub serves web app and handles real-time sync. - -## Repo layout - -``` -cli/ - CLI binary, agent wrappers, runner daemon -hub/ - HTTP API + Socket.IO + SSE + Telegram bot -web/ - React PWA for remote control -shared/ - Common types, schemas, utilities -docs/ - VitePress documentation site -website/ - Marketing site -``` - -Bun workspaces; `shared` consumed by cli, hub, web. 
- -## Architecture overview - -``` -┌─────────┐ Socket.IO ┌─────────┐ SSE/REST ┌─────────┐ -│ CLI │ ──────────── │ Hub │ ──────────── │ Web │ -│ (agent) │ │ (server)│ │ (PWA) │ -└─────────┘ └─────────┘ └─────────┘ - │ │ │ - ├─ Wraps Claude/Codex ├─ SQLite persistence ├─ TanStack Query - ├─ Socket.IO client ├─ Session cache ├─ SSE for updates - └─ RPC handlers ├─ RPC gateway └─ assistant-ui - └─ Telegram bot -``` - -**Data flow:** -1. CLI spawns agent (claude/codex/gemini), connects to hub via Socket.IO -2. Agent events → CLI → hub (socket `message` event) → DB + SSE broadcast -3. Web subscribes to SSE `/api/events`, receives live updates -4. User actions → Web → hub REST API → RPC to CLI → agent - -## Reference docs - -- `README.md` - User overview, quick start -- `cli/README.md` - CLI commands, config, runner -- `hub/README.md` - Hub config, HTTP API, Socket.IO events -- `web/README.md` - Routes, components, hooks -- `docs/guide/` - User guides (installation, how-it-works, FAQ) - -## Shared rules - -- No backward compatibility: breaking old formats freely -- Prioritize Pragmatism, and Avoid Overengineering. -- Write necessary tests ONLY. 
-- TypeScript strict; no untyped code -- Bun workspaces; run `bun` commands from repo root -- Path alias `@/*` maps to `./src/*` per package -- Prefer 4-space indentation -- Zod for runtime validation (schemas in `shared/src/schemas.ts`) - -## Common commands (repo root) - -```bash -bun typecheck # All packages -bun run test # cli + hub tests -bun run dev # hub + web concurrently -bun run build:single-exe # All-in-one binary -``` - -## Key source dirs - -### CLI (`cli/src/`) -- `api/` - Hub connection (Socket.IO client, auth) -- `claude/` - Claude Code integration (wrapper, hooks) -- `codex/` - Codex mode integration -- `agent/` - Multi-agent support (Gemini via ACP) -- `runner/` - Background daemon for remote spawn -- `commands/` - CLI subcommands (auth, runner, doctor) -- `modules/` - Tool implementations (ripgrep, difftastic, git) -- `ui/` - Terminal UI (Ink components) - -### Hub (`hub/src/`) -- `web/routes/` - REST API endpoints -- `socket/` - Socket.IO setup -- `socket/handlers/cli/` - CLI event handlers (session, terminal, machine, RPC) -- `sync/` - Core logic (sessionCache, messageService, rpcGateway) -- `store/` - SQLite persistence (better-sqlite3) -- `sse/` - Server-Sent Events manager -- `telegram/` - Bot commands, callbacks -- `notifications/` - Push (VAPID) and Telegram notifications -- `config/` - Settings loading, token generation -- `visibility/` - Client visibility tracking - -### Web (`web/src/`) -- `routes/` - TanStack Router pages -- `routes/sessions/` - Session views (chat, files, terminal) -- `components/` - Reusable UI (SessionList, SessionChat, NewSession/) -- `hooks/queries/` - TanStack Query hooks -- `hooks/mutations/` - Mutation hooks -- `hooks/useSSE.ts` - SSE subscription -- `api/client.ts` - API client wrapper - -### Shared (`shared/src/`) -- `types.ts` - Core types (Session, Message, Machine) -- `schemas.ts` - Zod schemas for validation -- `socket.ts` - Socket.IO event types -- `messages.ts` - Message parsing utilities -- `modes.ts` 
- Permission/model mode definitions - -## Testing - -- Test framework: Vitest (via `bun run test`) -- Test files: `*.test.ts` next to source -- Run: `bun run test` (from root) or `bun run test` (from package) -- Hub tests: `hub/src/**/*.test.ts` -- CLI tests: `cli/src/**/*.test.ts` -- No web tests currently - -## Common tasks - -| Task | Key files | -|------|-----------| -| Add CLI command | `cli/src/commands/`, `cli/src/index.ts` | -| Add API endpoint | `hub/src/web/routes/`, register in `hub/src/web/index.ts` | -| Add Socket.IO event | `hub/src/socket/handlers/cli/`, `shared/src/socket.ts` | -| Add web route | `web/src/routes/`, `web/src/router.tsx` | -| Add web component | `web/src/components/` | -| Modify session logic | `hub/src/sync/sessionCache.ts`, `hub/src/sync/syncEngine.ts` | -| Modify message handling | `hub/src/sync/messageService.ts` | -| Add notification type | `hub/src/notifications/` | -| Add shared type | `shared/src/types.ts`, `shared/src/schemas.ts` | - -## Important patterns - -- **RPC**: CLI registers handlers (`rpc-register`), hub routes requests via `rpcGateway.ts` -- **Versioned updates**: CLI sends `update-metadata`/`update-state` with version; hub rejects stale -- **Session modes**: `local` (terminal) vs `remote` (web-controlled); switchable mid-session -- **Permission modes**: `default`, `acceptEdits`, `bypassPermissions`, `plan` -- **Namespaces**: Multi-user isolation via `CLI_API_TOKEN:` suffix - -## Critical Thinking - -1. Fix root cause (not band-aid). -2. Unsure: read more code; if still stuck, ask w/ short options. -3. Conflicts: call out; pick safer path. -4. Unrecognized changes: assume other agent; keep going; focus your changes. If it causes issues, stop + ask user. 
diff --git a/docs/operator-local-tooling.md b/docs/operator-local-tooling.md new file mode 100644 index 0000000000..4187f9d6d9 --- /dev/null +++ b/docs/operator-local-tooling.md @@ -0,0 +1,54 @@ +# Operator-local tooling (not for upstream) + +Personal scripts and machine scans must **not** land in upstream PRs. This repo already ignores `localdocs/` and `execplan/` - use those instead of extending `.gitignore`. + +**Agent instructions:** canonical guide is [`docs/operator/AGENTS.md`](operator/AGENTS.md) only. Root `AGENTS.md` is **deleted on this fork** (upstream keeps theirs; we never PR either). + +## Where things live + +| Kind | Location | Committed? | +|------|----------|------------| +| Machine chat index (293 sessions, paths, summaries) | `~/.hapi/operator/reconnectable-agent-chats.{json,txt}` | No - outside repo | +| Personal batch attach presets | `localdocs/operator/attach-existing-agent-sessions.sh` | No - `localdocs/` ignored | +| Regenerate chat index | `localdocs/operator/regenerate-chat-index.sh` | No | +| Generic attach-by-id (future PR F) | `scripts/attach-agent-chat.sh` | Yes, when ready | +| Voice dogfood evidence | `docs/dogfood/*.md` (sanitized) | Yes - product evidence | + +## Commands + +```bash +# Refresh index after new chats +./localdocs/operator/regenerate-chat-index.sh + +# Lookup / attach (reads ~/.hapi/operator/ by default) +./scripts/lookup-agent-chat.sh 12 +./scripts/attach-agent-chat.sh 3054d570 + +# Your hardcoded 8-session batch +./localdocs/operator/attach-existing-agent-sessions.sh +``` + +Override index path: `HAPI_CHAT_INDEX=/path/to.json` + +## Belt-and-suspenders (no repo changes) + +**`.git/info/exclude`** - personal gitignore, never committed. See yours for `PLAN.md` and any scratch paths still in the tree. + +**Global ignore** - `~/.config/git/ignore` applies to all repos on this machine. + +**Local pre-commit hook** - `.git/hooks/pre-commit` can refuse commits that stage operator paths; template below. 
+ +```bash +#!/bin/sh +# .git/hooks/pre-commit (local only) +blocked='localdocs/|~/.hapi/operator|reconnectable-agent-chats|attach-existing-agent-sessions' +if git diff --cached --name-only | grep -Eq "$blocked"; then + echo "Blocked: operator-local paths" >&2 + exit 1 +fi +``` + +## Friction check before `git add -A` + +- `docs/dogfood/` here is for **sanitized voice dogfood**, not full-machine chat inventories. +- `PLAN.md` at repo root is operator scratch unless you explicitly want it upstream. diff --git a/docs/operator/AGENTS.md b/docs/operator/AGENTS.md new file mode 100644 index 0000000000..810b058408 --- /dev/null +++ b/docs/operator/AGENTS.md @@ -0,0 +1,296 @@ +# AGENTS.md (operator fork) + +Work style: telegraph; noun-phrases ok; drop grammar; + +**Canonical agent guide for `heavygee/hapi`.** Upstream `tiann/hapi` ships root `AGENTS.md` - **this fork deletes that file** and keeps everything here. Never PR this path or a root `AGENTS.md` change to upstream. + +Prefer progressive loading: this file → root `README.md` → package READMEs → `docs/plans/` for voice/integration depth. + +--- + +## Meta: what this fork is + +**HAPI is an agent-corralling platform** - local-first remote control for CLI coding agents (Claude Code, Codex, Cursor Agent, Gemini, OpenCode). Extension of upstream **[tiann/hapi](https://github.com/tiann/hapi)** (AGPL-3). + +| Layer | Role | +|-------|------| +| **Upstream HAPI** | Multi-agent hub, PWA, Telegram, ElevenLabs voice, session sync | +| **Our layer** | Voice-first modality, deterministic mode state, optional `AGENT_NOTIFY_SUMMARY`, multi-agent fleet ops from phone while AFK | +| **Legacy reference** | CursorVox, CursorRemote - mine patterns, do not rebuild parallel stacks | + +**North star:** *Gardening while agents work* - [`docs/plans/2026-05-23-voice-agent-state-integration.md`](../plans/2026-05-23-voice-agent-state-integration.md) §14. 
+ +--- + +## Upstream relationship + +```text +upstream → https://github.com/tiann/hapi.git +origin → https://github.com/heavygee/hapi.git +``` + +- Extend upstream; PR-sized slices; default path unchanged when new code off +- **Never modify maintainer canon** in upstream PRs - see § Upstream file boundaries +- **Upstream PR branches** start from `upstream/main` only - product code diffs, nothing fork-local + +--- + +## Strategic direction: voice-first + +ElevenLabs ConvAI today: handoff OK, readback weak, payment, no mode machine. Target: pluggable voice modality + hub-owned state. Plan: `docs/plans/2026-05-23-voice-agent-state-integration.md`. Do **not** port CursorVox `dispatch_agent.py` as state owner. + +--- + +## Operator docs map + +| Doc | Purpose | +|-----|---------| +| **`docs/operator/AGENTS.md`** | This file | +| `docs/plans/*` | Integration plans, PR A-F | +| `docs/operator-local-tooling.md` | `localdocs/`, machine indexes | +| `docs/dogfood/*.md` | Voice evidence for upstream PR bodies | + +--- + +## Upstream PR series + +| PR | Scope | +|----|-------| +| **A** | Voice readback - `contextFormatters.ts`, `voiceHooks.ts` | +| **B** | ElevenLabs archive - `hub/src/voice/`, `voice.ts` | +| **C** | Optional `AGENT_NOTIFY_SUMMARY` | +| **D** | Mode state + modality wrapper | +| **E** | Local OpenAI backend (after #401) | +| **F** | Web import picker | + +Coordinate **#401**, **#640**. Details §16 in integration plan. 
+ +--- + +## Upstream file boundaries + +### Never touch in upstream-bound PRs + +`AGENTS.md`, `CONTRIBUTING.md`, `LICENSE`, `SECURITY.md`, root `README.md`, `.github/*`, `website/`, `docs/operator/*`, `docs/plans/*` + +PR branch sanity check before push: + +```bash +git fetch upstream +git diff --name-only upstream/main...HEAD | grep -E '^(AGENTS\.md|CONTRIBUTING|docs/operator|docs/plans)' && echo STOP || echo OK +``` + +### Fork-only (stay on `origin/main`, not in upstream PRs) + +- `docs/operator/AGENTS.md` (this file) +- `docs/plans/*`, `docs/operator-local-tooling.md` +- `.cursor/rules/operator-fork.mdc` +- Root `.gitattributes` (`AGENTS.md merge=ours` - fork merge hygiene) +- Absence of root `AGENTS.md` (deleted on fork) + +### Keeping a clean tree after upstream sync + +One-time per clone: + +```bash +git config merge.ours.driver true +``` + +Fork root `.gitattributes` keeps **`AGENTS.md` deleted** when merging `upstream/main` (ours = fork side). + +If `AGENTS.md` reappears after a rebase anyway: + +```bash +git rm -f AGENTS.md +git commit -m "chore(fork): drop upstream AGENTS.md (canonical copy in docs/operator/)" +``` + +**Upstream PR branches:** branched from `upstream/main` - root `AGENTS.md` exists on the branch but **leave it untouched**; your PR diff must not include it. + +--- + +## Upstream PR voice (diffident contributor) + +PR prose = humble first-timer; work = first-class. Silent checklist: rebase, tests, default path note, no fork docs in diff. + +Never in upstream PRs: AI disclosure, fork strategy, internal plans, canon edits. + +See prior skeleton in git history or integration plan §16.9 - Summary / Problem / Approach / Testing / Related / Questions. + +--- + +## Voice mode states (gardening) + +`idle_warm|cold`, `align_intent`, `await_confirm`, `executing_async` (silence), `reporting`, `blocked`, `report_refresh`. Ack only after hub queues. Optional `AGENT_NOTIFY_SUMMARY` - parse when present; `~/coding/agent-notify/ACTUALSPEC.md`. 
+ +--- + +## Git workflow + +### Two branches, two purposes + +| Branch | Base | Purpose | +|--------|------|---------| +| **`main`** on `origin` (fork) | upstream + fork-only commits | Local dev; operator docs; deleted `AGENTS.md` - **never open a PR to tiann from this branch** | +| **`fix/…`, `feat/…`** | **`upstream/main` only** | Upstream PRs - diff must be product code only | + +Committing fork metadata on fork `main` is **fine**. It only leaks into an upstream PR if you branch wrong. + +**Safe (upstream PR):** + +```bash +git fetch upstream +git checkout -b fix/voice-ready-inline-summary upstream/main # NOT fork main +# ... edits in cli/hub/web/shared only ... +git diff --name-only upstream/main...HEAD # must not list AGENTS.md, docs/operator/, docs/plans/ +git push -u origin fix/voice-ready-inline-summary +gh pr create --repo tiann/hapi --head heavygee:fix/voice-ready-inline-summary +``` + +**Unsafe (will PR the deletion + operator docs):** + +```bash +git checkout -b fix/voice main # fork main includes fork-only commits +gh pr create --repo tiann/hapi # BAD - ancestry includes AGENTS.md deletion +``` + +If you started from fork `main` by mistake, re-cut before push: + +```bash +git fetch upstream +git checkout -b fix/voice-ready-inline-summary upstream/main +git cherry-pick <commit>... # product commits only, not fork config commits +``` + +Or: `git rebase --onto upstream/main main fix/voice` - replays only the commits added after branching from fork `main`; afterwards confirm that no fork-only commits remain in the chain. + +### Sync fork main with upstream + +```bash +git fetch upstream && git checkout main && git merge upstream/main # AGENTS.md stays deleted (merge=ours) +git rm -f AGENTS.md 2>/dev/null; true +``` + +One-time per clone: `git config merge.ours.driver true` + +Before `git add` on **PR branches**: no `localdocs/`, secrets, `docs/operator/`, `docs/plans/`. + +--- + +## HAPI baseline (from upstream `tiann/hapi` AGENTS.md) + +Inlined here so the fork does not need root `AGENTS.md`. 
When upstream updates their copy, manually port relevant technical deltas into this section. + +### What is HAPI? + +Local-first platform for running AI coding agents (Claude Code, Codex, Gemini, Cursor Agent, OpenCode) with remote control via web/phone. CLI wraps agents and connects to hub; hub serves web app and handles real-time sync. + +### Repo layout + +``` +cli/ - CLI binary, agent wrappers, runner daemon +hub/ - HTTP API + Socket.IO + SSE + Telegram bot +web/ - React PWA for remote control +shared/ - Common types, schemas, utilities +docs/ - VitePress documentation site +website/ - Marketing site +``` + +Bun workspaces; `shared` consumed by cli, hub, web. + +### Architecture overview + +``` +┌─────────┐ Socket.IO ┌─────────┐ SSE/REST ┌─────────┐ +│ CLI │ ──────────── │ Hub │ ──────────── │ Web │ +│ (agent) │ │ (server)│ │ (PWA) │ +└─────────┘ └─────────┘ └─────────┘ +``` + +**Data flow:** +1. CLI spawns agent, connects to hub via Socket.IO +2. Agent events → CLI → hub → DB + SSE broadcast +3. Web subscribes to SSE `/api/events` +4. 
User actions → Web → hub REST → RPC → CLI → agent + +**Voice path (ElevenLabs default):** + +```text +Browser WebRTC ↔ ElevenLabs ConvAI → client tools → hub queue → coding agent CLI + ↑ voiceHooks contextual updates +``` + +### Reference docs + +- `README.md`, `cli/README.md`, `hub/README.md`, `web/README.md`, `docs/guide/`, `CONTRIBUTING.md` (read only) + +### Shared rules + +- No backward compatibility required +- Pragmatism; avoid overengineering; tests only when needed +- TypeScript strict; Bun from repo root; `@/*` → `./src/*`; 4-space; Zod in `shared/src/schemas.ts` + +### Common commands + +```bash +bun typecheck +bun run test +bun run dev +bun run build:single-exe +``` + +### Key source dirs + +**CLI (`cli/src/`):** `api/`, `claude/`, `codex/`, `agent/`, `runner/`, `commands/`, `modules/`, `ui/` + +**Hub (`hub/src/`):** `web/routes/`, `socket/handlers/cli/`, `sync/`, `store/`, `sse/`, `telegram/`, `notifications/`, `config/`, `visibility/`, **`voice/`** (operator extensions) + +**Web (`web/src/`):** `routes/`, `components/`, `hooks/`, `api/client.ts`, **`realtime/`** (voice) + +**Shared (`shared/src/`):** `types.ts`, `schemas.ts`, `socket.ts`, `messages.ts`, `modes.ts`, **`voice.ts`** + +### Voice integration seams + +| Concern | Path | +|---------|------| +| Voice prompt + tools | `shared/src/voice.ts` | +| Default transport | `web/src/realtime/RealtimeVoiceSession.tsx` | +| Client tools | `web/src/realtime/realtimeClientTools.ts` | +| Context feed | `voiceHooks.ts`, `contextFormatters.ts` | +| Token API | `hub/src/web/routes/voice.ts` | +| Notify + mode hook | `hub/src/socket/handlers/cli/sessionHandlers.ts` | +| Outbound messages | `hub/src/sync/messageService.ts` | + +### Testing + +Vitest; `*.test.ts` next to source; hub + cli tests; no web tests currently. 
+ +### Common tasks + +| Task | Key files | +|------|-----------| +| Add CLI command | `cli/src/commands/`, `cli/src/index.ts` | +| Add API endpoint | `hub/src/web/routes/`, `hub/src/web/index.ts` | +| Add Socket.IO event | `hub/src/socket/handlers/cli/`, `shared/src/socket.ts` | +| Modify session logic | `sessionCache.ts`, `syncEngine.ts` | +| Modify messages | `messageService.ts` | +| Voice readback / mode | `contextFormatters.ts`, `sessionHandlers.ts`, `hub/src/voice/` | +| Attach agent chat | `machines.ts`, `scripts/attach-agent-chat.sh` | + +### Important patterns + +- **RPC:** `rpc-register` + `rpcGateway.ts` +- **Versioned updates:** stale rejected +- **Session modes:** `local` vs `remote` +- **Permission modes:** `default`, `acceptEdits`, `bypassPermissions`, `plan` +- **Namespaces:** `CLI_API_TOKEN:` + +### Critical thinking + +1. Fix root cause (not band-aid). +2. Unsure: read more code; ask w/ short options. +3. Conflicts: call out; pick safer path. +4. Unrecognized changes: assume other agent; focus your changes. +5. **Upstream first** - general fixes → upstream PR. +6. **Maintainer canon read-only** - never PR edits to `AGENTS.md`, `CONTRIBUTING.md`, root `README.md`. +7. **Fork agent doc is here only** - root `AGENTS.md` must not exist on fork `main`. diff --git a/docs/plans/2026-05-23-upstream-voice-pr-handoff.md b/docs/plans/2026-05-23-upstream-voice-pr-handoff.md new file mode 100644 index 0000000000..0648dd05ea --- /dev/null +++ b/docs/plans/2026-05-23-upstream-voice-pr-handoff.md @@ -0,0 +1,9 @@ +# Handoff plan: upstream voice PRs for `tiann/hapi` + +**This document is deprecated as a separate spec.** + +All content - dogfood findings, PR A-E series, #401/#640 coordination, local WIP map, new-agent checklist, and reproduction commands - lives in the main integration plan: + +**[2026-05-23-voice-agent-state-integration.md](./2026-05-23-voice-agent-state-integration.md) §16** + +Start there for a clean-agent handoff. 
This file remains only as a bookmark for old links. diff --git a/docs/plans/2026-05-23-voice-agent-state-integration.md b/docs/plans/2026-05-23-voice-agent-state-integration.md new file mode 100644 index 0000000000..8790e4597b --- /dev/null +++ b/docs/plans/2026-05-23-voice-agent-state-integration.md @@ -0,0 +1,1069 @@ +# Plan: HAPI + Voice Agent State Layer (coexistence with upstream) + +**Status:** Re-reviewed 2026-05-23 after proxmox dogfood start +**Repo:** `/home/heavygee/coding/hapi` (cloned from `https://github.com/tiann/hapi`, upstream `main` @ `04d3d02`) +**Author context:** Architecture conversation May 2026 - pivot from CursorRemote/CursorVox stack to HAPI as primary remote + multi-agent platform +**Goal:** **Coexist with upstream HAPI** - extend, do not replace. Ship additive modules and a **voice-first modality** (local OpenAI-compatible speech stack) while keeping the tree **PR-able** against `tiann/hapi` at all times. +**License note:** Upstream HAPI is **AGPL-3**. Any modified hub exposed on a network must comply with AGPL source-offer requirements (see [why-hapi.md](../guide/why-hapi.md) and root `LICENSE`). + +--- + +## 1. Executive summary + +We spent significant effort on **CursorRemote** (CDP → IDE composer remote control) and **CursorVox** (voice cockpit + deterministic routing → CursorRemote). The strategic direction has shifted: + +| Old assumption | New assumption | +|----------------|----------------| +| Primary agent surface = Cursor IDE composer | Primary agent surface = **CLI agents** (`agent`, `claude`, `gemini`, etc.) 
| +| Remote = scrape Electron DOM (CursorRemote) | Remote = **wrap CLI sessions** with web/PWA/Telegram | +| Voice = custom FastAPI stack (CursorVox) | Voice = **orchestration layer** on top of remote hub (HAPI today uses ElevenLabs ConvAI) | + +**HAPI** already delivers most of what we built toward: multi-agent remote control, session resume, permissions from phone, PWA, Telegram mini-app, optional Tailscale self-host, voice via ElevenLabs. + +**This plan** is to **extend upstream HAPI in place** with the **differentiated layer we already designed in CursorVox**: + +- Deterministic **voice agent mode state machine** (warm/cold idle, confirm gating, async execution phases) +- **`AGENT_NOTIFY_SUMMARY` contract** for machine-readable completion/status (shared with `agent-notify`) +- A **voice-first modality** backed by **local OpenAI-compatible endpoints** (STT/TTS/optional classifier LLM) - additive to today's ElevenLabs path, not a replacement fork +- **Modality wrapper** on outbound agent messages (voice-originated turns only) + +**Coexistence contract:** Existing HAPI behavior (text UI, Telegram, ElevenLabs ConvAI, default settings) must keep working unchanged when new code is off or env vars unset. Every slice should be merge-shaped: isolated modules, tests, feature flags or config gates, minimal edits to existing call sites. + +**Operator model (full spec):** §14 - *gardening while agents work* - async delegation, hub-owned truth, voice as thin classifier + speaker, responsive only when agents need you. + +CursorRemote and CursorVox become **legacy / reference implementations**, not the primary product path. + +### 2026-05-23 re-review decisions + +After deploying HAPI on proxmox and starting a real Cursor session against `~/coding/jellybot`, the direction is still right, but the first implementation slice should be narrower: + +- **Keep HAPI as primary**. CLR is a good Cursor-only reference, not a parallel production stack. 
+- **Prove state contract on existing ElevenLabs path first**, then add the **voice-first local modality** as a pluggable provider - not a parallel stack or sidecar. +- **Do not Docker the runner yet**. Hub-only Docker is possible later, but the runner needs host agent CLIs, auth, and workspace writes. Current boot path is systemd. +- **Treat Cursor remote mode separately**. HAPI Cursor remote mode uses `agent -p --output-format stream-json --trust`; it does not provide the same per-tool approval story as Claude/Codex. +- **Hook inbound assistant messages in CLI session handling**, not only `MessageService`. `hub/src/socket/handlers/cli/sessionHandlers.ts` is where CLI-originated messages are persisted and SSE updates are emitted. +- **Prefer session metadata/state over a new API endpoint at first**. Store derived voice mode / notify status in existing session update flow; add `GET /voice-mode` only if the web UI really needs it. + +### CursorVox postmortem: do not port the intermediary agent blindly + +The latest CursorVox run against `Workspaces / Project assessment and role clarification` showed the failure pattern clearly: + +1. User asked, in fragments: "What is this project about?" then "its purpose" / "overall purpose". +2. CursorVox knew the bound target, but the dispatcher still kept asking which project/what context instead of using the selected session. +3. When it finally chose to dispatch, it spoke "I'm starting the workspace check now..." before the handoff was proven. +4. CursorRemote then failed delivery after three CDP attempts: text inserted + Enter pressed, but "Sent message was not observed in chat transcript after submit". +5. The user heard both a false start and an internal transport failure. + +Conclusion: the **stateful voice intermediary agent** is the wrong thing to transplant. It adds a second conversation brain with stale memory, fuzzy target reasoning, and premature acknowledgements. 
HAPI should instead use: + +- a thin voice tool layer that sends user intent to the already-selected HAPI session; +- deterministic state transitions derived from real HAPI events (`message queued`, `message consumed`, `agent ready/done`, `permission requested`); +- no "execution started" spoken acknowledgement until the hub has accepted/queued the message; +- `AGENT_NOTIFY_SUMMARY` as the completion/status contract, not dispatcher memory as truth; +- LLM voice routing only for lightweight intent classification/wording, never as the owner of task state. + +--- + +## 2. Problem statement + +### What HAPI solves today + +- Run Claude Code, Codex, Cursor Agent CLI, Gemini, OpenCode locally +- Control from browser/PWA/Telegram while AFK +- Session handoff local ↔ remote, permission approval, terminal access +- Voice: ElevenLabs ConvAI → `messageCodingAgent` / `processPermissionRequest` client tools + +Note: permission behavior is agent-flavor specific. Cursor remote mode currently runs with `--trust`, so voice-triggered Cursor tasks need extra guardrails if we want confirmation before destructive work. + +See upstream: `README.md`, `docs/guide/how-it-works.md`, `docs/guide/voice-assistant.md`, `docs/guide/cursor.md`. 
+ +### What HAPI does not solve (our gap) + +- **Voice completion readback** - handoff via `messageCodingAgent` works; summarizing coding-agent output after `thinking` stops is broken on production upstream (see §16) +- **Deterministic voice routing** - HAPI voice delegates intent to ElevenLabs LLM + two tools; CursorVox used dispatcher + explicit mode transitions +- **Optional `AGENT_NOTIFY_SUMMARY` parsing** - not in upstream; useful only for operators who add that tail convention to their own agent rules (see §6.2, §16) +- **Local speech stack** - HAPI is ElevenLabs-centric today; operator runs Speaches/Chatterbox on proxmox (`local-speech-agent` compose) as future `VOICE_BACKEND=local-openai` after upstream PR #401 lands +- **agent-notify integration** - stop-hook TTS/tmux driven by same contract as voice readback (operator-specific) +- **ElevenLabs conversation logging** - transcripts live on ElevenLabs unless hub archives them (WIP PR B, §16) +- **In-app builder surface** (future) - separate concern; shared `BuilderClient` interface may sit above HAPI REST/SSE later + +### What HAPI voice actually is (ElevenLabs path today) + +HAPI voice is **not** dumb STT + TTS. 
It is **ElevenLabs Conversational AI (ConvAI)**: + +```text +Browser WebRTC ↔ ElevenLabs ConvAI (STT + orchestrator LLM + TTS) + ↓ client tools + messageCodingAgent / processPermissionRequest + ↓ + HAPI hub session queue → coding agent CLI + ↑ contextual updates (voiceHooks) +``` + +- **Orchestrator LLM** (in auto-created agent): `gemini-2.5-flash` per `shared/src/voice.ts` / `buildVoiceAgentConfig()` +- **System prompt + tools:** `VOICE_SYSTEM_PROMPT`, same two client tools as our thin-adapter model +- **Hub role:** mint conversation token, accept queued messages, push session events to ConvAI via web hooks +- **ConvAI conversation logs:** `GET /v1/convai/conversations/{id}` on ElevenLabs (not on hub unless PR B lands) + +This is the same architectural slot CursorVox tried to fill with a stateful Python dispatcher - HAPI already has the ConvAI intermediary; what's missing is **deterministic completion readback** and optional operator notify parsing. + +### What CursorRemote solved (now secondary) + +- IDE composer DOM control via CDP (`--remote-debugging-port=9222`) +- Only relevant when **Cursor IDE is open** with agent panel visible +- Path: `~/coding/CursorRemote/` - see `docs/architecture.md`, `docs/prd.md` + +--- + +## 3. Strategic decision record + +### Why not keep building CursorVox? + +CursorVox proved voice → agent routing against **CursorRemote only**. That is the wrong substrate if the operator lives in **CLI agent mode** (Composer 2.5 Fast, no IDE chrome). HAPI + [cursor-local-remote](https://github.com/Vovch/cursor-local-remote) class tools address CLI remote; HAPI additionally unifies **Claude/Gemini/Codex**. + +### Coexistence with upstream HAPI (primary strategy) + +**We are not building a permanent fork or a second product.** We extend `tiann/hapi` so our work can land as **upstream PRs** (or a small series of PRs) once dogfood passes. 
+ +| Principle | Meaning | +|-----------|---------| +| **Additive modules** | New code lives in `shared/src/*`, `hub/src/voice/*`, optional `web/src/realtime/providers/*` - not scattered rewrites | +| **Default path unchanged** | ElevenLabs + existing web voice session remain the default; local voice-first is opt-in via settings/env | +| **Thin integration seams** | Hook `sessionHandlers.ts`, extend session metadata, wrap outbound sends - avoid replacing `syncEngine`, `messageService`, or auth | +| **PR-sized commits** | Each phase should be reviewable independently: parser + tests, mode engine + tests, modality wrapper, provider interface, local provider impl | +| **Track upstream** | Rebase or merge `tiann/hapi` regularly; resolve conflicts at integration seams, not by duplicating upstream files | +| **No sidecar in production** | CursorVox FastAPI → HAPI API doubles ops and session truth; reference only | + +**Rejected for production:** permanent private fork without upstream path, sidecar voice stack, or changes that break ElevenLabs/text/Telegram when local voice is disabled. + +**Optional extract later:** if upstream wants a smaller first PR, `@hapi/voice-state` or similar package boundary is fine - but integration stays in this repo until accepted upstream. + +### Voice-first modality (local OpenAI-compatible) + +A **modality** is a transport + orchestration path for the same HAPI session store - not a separate agent platform. 
+ +| Modality today | Modality we add | +|----------------|-----------------| +| Text (web/PWA, Telegram) | **Voice-first local** - mic/speaker loop on device or via hub proxy | +| ElevenLabs ConvAI (cloud STT/LLM/TTS, WebRTC) | **Local OpenAI-compatible stack** on tailnet (already operated on proxmox) | + +**Local stack (existing ops, new HAPI adapter):** + +| Role | Service | API shape | +|------|---------|-----------| +| STT | Speaches (`local-speech-agent`) | OpenAI-compatible `/v1/audio/transcriptions` | +| TTS | Chatterbox gateway | OpenAI-compatible `/v1/audio/speech` (plain HTTP; WebSocket realtime not required for v1) | +| Classifier LLM (optional) | Local OpenAI-compatible chat (Speaches-hosted or separate) | `/v1/chat/completions` with same tool schema as `VOICE_TOOLS` | + +**Shared across modalities (not duplicated per provider):** + +- `VOICE_SYSTEM_PROMPT`, `VOICE_TOOLS`, `realtimeClientTools` → hub message path +- `voiceMode` state + `AGENT_NOTIFY` parsing in hub +- `modalityWrapper` on voice-originated sends +- `voiceHooks` context feed (session history, permissions, ready) + +**Provider interface (PR-able shape):** + +```text +VoiceTransportProvider (interface) + ├─ ElevenLabsConvAIProvider # existing; wrap current RealtimeVoiceSession path + └─ LocalOpenAIVoiceProvider # new; Speaches STT + local LLM tools + Chatterbox TTS +``` + +Selection via hub/web config (e.g. `voice.provider: elevenlabs | local_openai`) with env-backed base URLs for local endpoints. **No hard dependency** on local services in default build; CI runs without them. + +**Voice-first UX (later slice):** optional UI entry that opens mic-first session (large talk control, minimal chrome) - still the same HAPI session underneath; can ship after provider abstraction lands. 
+ +### Decommission candidates (after HAPI slice passes) + +| Service | Tailnet name | Action | +|---------|--------------|--------| +| CursorVox Docker `:7861` | `svc:cursorvox` | Deprecate after voice-state on HAPI | +| CursorRemote Docker `:3000` | `svc:cursor-d` | Keep optional for IDE-only workflows | +| Cursor CDP on teemo-ssd `:9222` | `svc:cursor-cdp` | Only needed if CursorRemote kept | + +Scripts: `~/coding/server-setup/scripts/tailscale/harden-*-cursor*.sh` + +--- + +## 4. Target architecture + +```text +┌─────────────────────────────────────────────────────────────────┐ +│ Phone / PWA / Telegram │ +│ ├─ Text UI (existing, unchanged default) │ +│ ├─ Voice: ElevenLabs ConvAI (existing default) │ +│ └─ Voice-first: LocalOpenAIVoiceProvider (NEW, opt-in) │ +└────────────────────────────┬────────────────────────────────────┘ + │ REST / SSE / Socket.IO +┌────────────────────────────▼────────────────────────────────────┐ +│ HAPI Hub (upstream + additive extensions) │ +│ ├─ syncEngine / messageService (existing, untouched core) │ +│ ├─ VoiceModeEngine (NEW) - per-session state machine │ +│ ├─ AgentNotifyParser (NEW) - AGENT_NOTIFY_SUMMARY on msgs │ +│ ├─ ModalityWrapper (NEW) - voice-originated outbound text │ +│ ├─ VoiceTransportProvider registry (NEW) - pluggable voice │ +│ └─ SQLite session metadata extensions (backward compatible) │ +└────────────────────────────┬────────────────────────────────────┘ + │ Socket.IO +┌────────────────────────────▼────────────────────────────────────┐ +│ HAPI CLI → claude | agent | gemini | codex | opencode │ +└─────────────────────────────────────────────────────────────────┘ + +Optional future: agent-notify stop hook reads same JSONL contract +``` + +--- + +## 5. HAPI integration map (start here in this repo) + +Read **`docs/operator/AGENTS.md`** (canonical on this fork; upstream baseline inlined there). 
Key seams for voice-state work: + +| Concern | HAPI path | Notes | +|---------|-----------|-------| +| Voice orchestrator prompt + tools | `shared/src/voice.ts` | `VOICE_SYSTEM_PROMPT`, `VOICE_TOOLS`, `buildVoiceAgentConfig()` - shared by all providers | +| Voice transport (default) | `web/src/realtime/RealtimeVoiceSession.tsx` | ElevenLabs ConvAI WebRTC - **unchanged default** | +| Voice transport (voice-first local) | `shared/src/voiceProvider.ts`, `hub/src/voice/providers/localOpenAi.ts`, `web/src/realtime/providers/` | **NEW opt-in** - OpenAI-compatible STT/TTS/chat | +| ElevenLabs token API | `hub/src/web/routes/voice.ts` | `POST /voice/token` | +| Web voice client tools | `web/src/realtime/realtimeClientTools.ts` | `messageCodingAgent`, `processPermissionRequest` - shared hub bridge for all providers | +| Web voice session registry | `web/src/realtime/RealtimeSession.ts` | Provider-agnostic session id + hooks | +| Inbound assistant/agent messages | `hub/src/socket/handlers/cli/sessionHandlers.ts` | **Primary hook for AGENT_NOTIFY + mode transitions** after CLI message normalization and persistence | +| Outbound user messages | `hub/src/sync/messageService.ts` | Web/Telegram queued send path; useful for voice-origin metadata and wrappers | +| Session lifecycle | `hub/src/sync/syncEngine.ts`, `hub/src/sync/sessionCache.ts` | Activity, ready events | +| Persistence | `hub/src/store/sessions.ts`, `hub/src/store/messages.ts` | Extend metadata | +| Permissions (phone approve) | `hub/src/web/routes/permissions.ts` | Already agent-flavor aware | +| Notifications | `hub/src/notifications/` | May drive proactive voice readback | +| Cursor CLI wrapper | `docs/guide/cursor.md`, `cli/src/cursor/` | Remote mode: `agent -p --output-format stream-json --trust --resume`; no per-tool remote approval | + +### Suggested new modules (names tentative) + +``` +shared/src/voiceMode.ts # ModeState enum, transitions, types +shared/src/agentNotify.ts # Parse/strip AGENT_NOTIFY_SUMMARY 
JSON +shared/src/voiceProvider.ts # VoiceTransportProvider interface + types (PR boundary) +hub/src/voice/modeEngine.ts # Pure transition helper invoked by sessionHandlers +hub/src/voice/modalityWrapper.ts # Outbound prompt wrapping +hub/src/voice/providers/localOpenAi.ts # Speaches STT + OpenAI chat tools + Chatterbox TTS +web/src/realtime/providers/ # Client-side provider wiring; ElevenLabs stays default export +``` + +**PR discipline:** prefer new files + re-exports over editing upstream-heavy files. When editing existing files (`sessionHandlers.ts`, `realtimeClientTools.ts`, `voice.ts`), keep diffs minimal and behind config checks. + +--- + +## 6. Legacy assets to mine (external repos) + +All paths under `~/coding/` unless noted. + +### 6.1 CursorVox - voice agent state + routing (PRIMARY) + +**Repo:** `~/coding/cursorvox/` +**Architecture doc:** `cursorvox/docs/ARCHITECTURE.md` +**Operator guide:** `cursorvox/docs/OPERATOR_GUIDE.md` +**Interface spec:** `cursorvox/docs/plans/2026-04-29-legit-cursorvox-interface-spec.md` + +| File | Port? | What it does | +|------|-------|--------------| +| `src/cursorvox/voice_mode_state.py` | **Yes → TS** | Mode state machine: `idle_warm`, `idle_cold`, `report_refresh`, `align_intent`, `await_confirm`, `executing_async`, `reporting`, `blocked` (full table in §14.2); transitions from agent text + user turns | +| `src/cursorvox/modality.py` | **Yes → TS** | Wraps voice user messages; injects `[CursorVox mode context]` + `AGENT_NOTIFY_SUMMARY` output contract | +| `src/cursorvox/voice_speakable.py` | **Yes → TS** | Strip `AGENT_NOTIFY_SUMMARY` tail before TTS | +| `src/cursorvox/command_router.py` | Partial | Target resolution + focus + `send_message` - **replace** with HAPI session targeting; keep confirm/approve routing ideas | +| `src/cursorvox/safety.py` | Partial | High-risk command gating before mutate | +| `src/cursorvox/dispatch_agent.py` | Cautionary reference | Do **not** port as a stateful intermediary agent. 
Mine only narrow tests/prompts for lightweight classification wording if needed. | +| `src/cursorvox/audio_intent.py` | Reference | Legacy deterministic intents; mostly superseded by dispatcher | +| `src/cursorvox/targets.py` | **No** (IDE) | VoiceTarget from CursorRemote `windowSnapshots` - CLI sessions use HAPI session list instead | +| `src/cursorvox/cursorremote_client.py` | **No** | Socket.io bridge to CursorRemote - obsolete on CLI-first path | +| `src/cursorvox/app.py` | Reference | FastAPI routes, health, voice intent handler wiring | +| `src/cursorvox/proactive.py`, `attention*.py` | Later | Proactive spoken queue from agent state - port after core loop | +| `src/cursorvox/tts/chatterbox_gateway.py` | **Yes** | HTTP TTS to local gateway | +| `src/cursorvox/tts/speaches.py` | **Yes** | STT/TTS HTTP to Speaches | + +**Tests to read before porting:** + +- `cursorvox/tests/test_modality.py` +- `cursorvox/tests/test_voice_speakable.py` +- `cursorvox/tests/test_audio_intent_processor.py` (mode/confirm scenarios) +- `cursorvox/tests/test_targets.py` (skip if not doing IDE targeting) + +**Local speech stack (operational):** + +- CursorVox README documents Speaches `:18001`, Chatterbox gateway `:18008` via `local-speech-agent` compose +- `cursorvox/docker-compose.yml` - env patterns for `CURSORREMOTE_BASE_URL` (will become HAPI hub URL) + +### 6.2 agent-notify / AGENT_NOTIFY_SUMMARY (optional operator convention) + +**Repos:** `~/coding/agent-notify/` (spec), operator `~/coding/AGENTS.md` (optional rule) +**Canonical spec:** `agent-notify/ACTUALSPEC.md` + +**Upstream framing (critical):** `AGENT_NOTIFY_SUMMARY` is **not** a HAPI built-in requirement. Most HAPI users will never emit it. It is an **optional machine-readable tail** some operators add to **their own** agent instructions (`AGENTS.md`, project rules, hooks). 
HAPI should: + +- Parse it **when present** in assistant messages +- Prefer `summary` for voice readback when parsed +- **Never** require agents to emit it +- **Never** add it to upstream HAPI default prompts + +When present, the JSON tail is source of truth for spoken/read status (`action`, `status`, `summary`, etc.) for that operator's stack. + +Planned HAPI behavior (PR C / Phase 1): + +1. Parse from hub message stream (`shared/src/agentNotify.ts`) +2. Drive mode transitions when combined with mode engine (`done` → `idle_warm`, etc.) +3. Feed speakable text to voice output (strip JSON tail) +4. Optionally share parsed object with agent-notify stop hook (operator deploy only) + +### 6.3 CursorRemote - IDE remote (LEGACY REFERENCE) + +**Repo:** `~/coding/CursorRemote/` (this workspace) + +| Asset | Relevance | +|-------|-----------| +| `docs/architecture.md`, `docs/prd.md` | CDP/DOM model - understand what we're **not** building on | +| `src/server/command-executor.ts` | Send message verification pattern | +| `src/server/activity-derive.ts` | Agent idle/heuristic detection (inferior to AGENT_NOTIFY) | +| `src/server/relay.ts` | socket.io command protocol - different from HAPI | +| Tailscale/Docker setup | **Reuse patterns** for HAPI: `Dockerfile`, `docker-compose.yml`, `DEVELOPMENT.md`, server-setup hardening scripts | + +Do **not** wire new voice-state to CursorRemote unless explicitly maintaining IDE lane. + +### 6.4 cursor-local-remote (alternative CLI remote) + +**Upstream:** https://github.com/Vovch/cursor-local-remote (`clr`) + +Lighter-weight Cursor-only CLI remote. 
HAPI supersedes for multi-agent; CLR useful as reference for: + +- `agent -p --output-format stream-json` + session transcript paths under `~/.cursor/projects/` +- Cursor-first session browser UX +- Git panel from phone (diff, commit, push, branch) +- Simple webhook completion notifications (for `ntfy` / Discord / Slack style sinks) + +Do not run CLR beside HAPI as a production stack unless explicitly testing a UX gap; two remotes spawning Cursor sessions will create duplicate control surfaces. + +### 6.5 Infrastructure + +| Path | Use | +|------|-----| +| `~/coding/server-setup/scripts/tailscale/harden-*-cursor*.sh` | Template for `harden-hapi-service.sh` | +| `~/coding/server-setup/docs/runbooks/tailscale-internal-services.md` | Runbook pattern | +| `~/coding/cursorvox/docs/plans/2026-04-29-legit-cursorvox-interface-spec.md` | UX requirements that may inform HAPI web voice UI | + +--- + +## 7. Phased implementation plan + +**Upstream alignment:** Phases map to merge-shaped PRs in **§16**. Prefer landing upstream PR **#401** (pluggable backends) before local OpenAI provider work. 
+ +### Phase 0 - Baseline dogfood (partly complete) + +- [x] Run HAPI on proxmox with hub + runner + Tailscale Serve +- [x] Confirm no third-party relay: hub binds `127.0.0.1:3006`; Tailscale Serve publishes `https://hapi.tail9944ee.ts.net/` +- [x] Run one Cursor session from web/PWA against `~/coding/jellybot` +- [x] Remove `specstory run ...` shell alias interference so HAPI resolves real `agent`, `codex`, and `gemini` binaries +- [x] Read `shared/src/voice.ts` + `web/src/realtime/realtimeClientTools.ts` end-to-end +- [x] ElevenLabs voice dogfood: hello test + subtitle-search feature (see §16.3-16.4, `docs/dogfood/`) +- [x] Document upstream PR strategy and ConvAI architecture insight (§16) +- [x] Decide integration shape: **coexist with upstream**; additive modules; PR series to `tiann/hapi` (not permanent fork or sidecar) +- [ ] Verify real non-Cursor agent sessions from web: Claude, Codex, Gemini +- [ ] Document AGPL compliance for `svc:hapi` on tailnet +- [ ] Help rebase/review upstream **PR #401** (pluggable voice backends) - see §16.7 + +**Exit criteria:** Dogfood evidence written; go/no-go in §10; voice handoff proven (messageCodingAgent → hub 200). 
+ +### Phase 1 / PR C - Optional AGENT_NOTIFY parsing + mode state + +- [ ] **PR A first (upstream):** fix ready-event inline assistant text (§16.6) - can ship before this phase +- [ ] **PR B (upstream):** ElevenLabs transcript archive (§16.6) +- [ ] Port `AGENT_NOTIFY_SUMMARY` parser → `shared/src/agentNotify.ts` (**opt-in convention only**) +- [ ] Port mode state types + transitions from `voice_mode_state.py` → `shared/src/voiceMode.ts` +- [ ] Hook `hub/src/socket/handlers/cli/sessionHandlers.ts`: on assistant message, parse notify JSON when present, update session voice mode +- [ ] `voiceHooks.onReady`: prefer parsed notify `summary` over last-message heuristic when available +- [ ] Add unit tests mirroring `cursorvox/tests/test_modality.py`, `test_voice_speakable.py` +- [ ] Publish state through existing session update/SSE path +- [ ] Keep stored chat messages intact; strip contract tail only at voice/TTS boundaries + +**Exit criteria:** When operator agent emits notify line, voice readback uses `summary`; when absent, PR A last-message behavior still works. + +### Phase 2 - Outbound modality wrapper (PR D) + +- [ ] Port `wrap_voice_user_message()` behavior → `hub/src/voice/modalityWrapper.ts` +- [ ] Add explicit `sentFrom: 'voice'` (or equivalent metadata) from `realtimeClientTools.ts` through the hub send path +- [ ] Apply wrapper only when message origin = voice +- [ ] Confirm gating: `await_confirm` blocks `messageCodingAgent` until user confirms (port logic from `command_router.py`; avoid `dispatch_agent.py` as state owner) +- [ ] Ack policy: spoken "sent" only after hub persists/queues message (see §14.6) + +**Exit criteria:** Voice-originated sends include mode context; agent replies with parseable `AGENT_NOTIFY_SUMMARY`; confirm flow works on destructive request; failed queue never produces execution narrative. 
+ +### Phase 3 - Pluggable backends (#401) + local OpenAI (PR E) + +- [ ] Land or rebase upstream **[PR #401](https://github.com/tiann/hapi/pull/401)** (`VOICE_BACKEND=elevenlabs | gemini-live | qwen-realtime`, `GET /api/voice/backend`) - §16.7 +- [ ] Stack PR A/B/C on `VoiceBackendSession` if #401 merges first +- [ ] Extend ElevenLabs/Gemini prompts with mode-aware instructions (backward compatible) +- [ ] Wire proactive readback on mode transition to `reporting` / notify `status: done` +- [ ] **PR E:** `VOICE_BACKEND=local-openai` - Speaches STT + OpenAI-compatible chat tools + Chatterbox TTS (only after the #401 interface exists; do not build a parallel backend switcher) +- [ ] Optional: voice-first mic-primary UI entry + +**Exit criteria:** Default ElevenLabs unchanged; Gemini Live or local-openai dogfood passes subtitle-search scenario with audible completion summary. + +### Phase 4 - Ops + decommission + +- [x] Tailscale serve `svc:hapi` on proxmox (mirror cursorvox/cursor-d pattern) +- [x] `hapi-runner.service` for remote spawn without terminal babysitter +- [ ] Consider hub-only Docker/Compose after core dogfood; keep runner on host unless a container auth/workspace model proves cleaner +- [ ] Archive CursorVox docker stack; document legacy CursorRemote IDE lane +- [ ] Update `~/coding/AGENTS.md` or homelab runbook with new canonical remote URL + +--- + +## 8. Explicit non-goals (v1) + +- Replacing HAPI's core sync/session architecture +- CDP / Cursor IDE DOM scraping in this integration +- Multi-user tenancy beyond HAPI namespaces +- In-app builder pull-down UI (separate future project; may call HAPI REST) +- Breaking or removing upstream ElevenLabs voice for users who do not opt into local modality +- A permanent private fork that diverges without a PR path to `tiann/hapi` +- Requiring local speech services in default install or CI + +**Not a non-goal:** upstream PRs - dogfood first, then open PR-sized slices once tests and default-path regression pass. + +--- + +## 9. 
Testing strategy + +| Layer | Approach | +|-------|----------| +| `agentNotify` / `voiceMode` | Port existing CursorVox unit tests to Vitest | +| Hub message hooks | Extend `hub/src/socket/handlers/cli/sessionHandlers` tests/patterns; use `messageService` tests for outbound user-message metadata | +| Voice tools | Mock session + simulate `messageCodingAgent` RPC | +| Provider regression | With default config, ElevenLabs path matches upstream behavior (no local services required in CI) | +| Local provider | Optional integration tests behind env flag; mock OpenAI-compatible HTTP | +| E2E | Manual dogfood matrix from `cursorvox/scripts/dogfood_voice_self_loop.py` (adapt for HAPI) | + +--- + +## 10. Open questions (for cold eval agent) + +**Decided (May 2026 dogfood):** + +| Question | Decision | +|----------|----------| +| Upstream workflow | Coexist with `tiann/hapi`; PR series §16; no permanent fork | +| Voice architecture | ElevenLabs ConvAI is already the thin intermediary; fix readback + logging, not rebuild dispatcher | +| AGENT_NOTIFY | **Opt-in** operator convention only; parse when present (PR C) | +| Pluggable backends | **Track PR #401**; local Speaches/Chatterbox = PR E after #401 | +| No upstream issue | File PR A with dogfood evidence; reference #640 and #401 | + +**Still open:** + +1. **AGPL:** Private tailnet only - is source offer to household devices sufficient via git mirror? +2. **CursorRemote:** Full decommission or keep `cursor-d` for IDE composer indefinitely? +3. **agent-notify:** Hub webhook vs continue relying on Cursor stop hook + JSONL scan (operator deploy)? +4. **Docker:** Hub-only Compose after core dogfood? +5. **Cursor permissions:** Is `--trust` acceptable for remote Cursor sessions, or separate voice guardrails? +6. **AGENT_NOTIFY upstream docs:** Generic JSON shape only vs link to external community spec? +7. 
**PR #401 default backend:** Confirm `DEFAULT_VOICE_BACKEND='elevenlabs'` on rebased head (early branch flipped to gemini-live; author claims revert) +8. **PR #401 Composer Enter change:** Split out of voice PR or get explicit tiann OK? (HAPI Bot blocker; author claims separate request) +9. **PR #640 ready trigger:** Confirm SSE-only path before stacking PR A + +--- + +## 11. Local environment (ElevenLabs voice) + +Secrets live in **gitignored** env files (not committed): + +| File | Purpose | +|------|---------| +| `~/coding/hapi/.env` | Repo-level reference; same active key | +| `~/coding/hapi/hub/.env` | **Loaded by Bun** when running `cd hub && bun run dev` | +| `~/coding/hapi/.env.example` | Template without secrets | + +**Active key:** `hg` (heavygee). Alternates `justg` and `gc` are commented in both `.env` files - swap by commenting/uncommenting `ELEVENLABS_API_KEY` (one active only). + +For single-binary / CLI outside `hub/`, export before start: + +```bash +set -a && source ~/coding/hapi/.env && set +a +``` + +Voice requires `ELEVENLABS_API_KEY` on the **hub process** (`hub/src/web/routes/voice.ts`). + +**API key permissions:** Auto-create agent + conversation token require ElevenLabs scopes **`convai_read`** and **`convai_write`**. Keys with TTS-only scopes fail with HTTP 500 *"Failed to create ElevenLabs agent automatically"* or missing permission errors on list/create endpoints. + +Optional after first auto-create: set `ELEVENLABS_AGENT_ID=agent_...` in `hub/.env` to skip create step. 
+ +### Tailscale (proxmox, no third-party relay) + +**URL:** `https://hapi.tail9944ee.ts.net/` + +| Item | Path | +|------|------| +| Harden (VIP + ACL + Serve) | `~/coding/server-setup/scripts/tailscale/harden-hapi-service.sh` | +| Boot units | `sudo ~/coding/server-setup/scripts/tailscale/install-hapi-tailnet-services.sh` | +| Verify | `~/coding/server-setup/scripts/verify-hapi-tailnet.sh` | +| Hub systemd | `server-setup/systemd/hapi-hub.service` | +| Runner systemd | `server-setup/systemd/hapi-runner.service` | +| Serve systemd | `server-setup/systemd/tailscale-serve-hapi.service` | + +Do **not** run `hapi hub --relay`. Hub binds `127.0.0.1:3006`; Tailscale Serve proxies HTTPS only on your tailnet. + +After first hub start: **`CLI_API_TOKEN`** in `~/.hapi/settings.json` — use for phone/web login. + +Current proxmox runner workspace roots: + +- `/home/heavygee/coding` +- `/home/heavygee/coding/hapi` + +No official HAPI Docker image exists for this project. `hapiproject/hapi` on Docker Hub is unrelated HAPI FHIR. If Docker is added, start with hub-only Compose; keep the runner host-native until agent CLI auth and workspace mounts have a tested contract. + +--- + +## 12. 
Quick start for next agent + +```bash +cd ~/coding/hapi +bun install +bun typecheck +bun run test + +# Terminal 1 - hub (loads hub/.env) +bun run dev:hub # or use existing hapi-hub.service on proxmox + +# Terminal 2 - CLI session +npx @twsxtd/hapi cursor # or bun cli after build; verify agent on PATH + +# Production-ish proxmox path +systemctl status hapi-hub.service hapi-runner.service tailscale-serve-hapi.service +hapi runner status + +# Read this plan (§14 operator model, §16 upstream PR handoff + dogfood) +less docs/plans/2026-05-23-voice-agent-state-integration.md + +# Reference repos +ls ~/coding/cursorvox/src/cursorvox/voice_mode_state.py +ls ~/coding/cursorvox/src/cursorvox/modality.py +ls ~/coding/agent-notify/ACTUALSPEC.md +ls ~/coding/CursorRemote/docs/architecture.md +``` + +--- + +## 13. Conversation context (how we got here) + +Condensed arc for the eval agent: + +1. Built **CursorRemote** to remote-control **Cursor IDE composer** via CDP (Tailscale `cursor-d`, Docker on proxmox). +2. Built **CursorVox** as voice cockpit → CursorRemote (`cursorvox` tailnet), local Speaches/Chatterbox, dispatcher-led intents, mode state machine, `AGENT_NOTIFY_SUMMARY`. +3. Identified **inside-out app builder** loop (in-app surface → agent → reload) as separate from voice remote. +4. Recognized **CLI agent** (Composer 2.5 Fast, no IDE) as primary modality - CursorRemote cannot see CLI sessions. +5. Found **HAPI** (and CLR) as existing CLI remote multi-agent solutions. +6. Concluded HAPI subsumes CursorRemote+CursorVox **platform role**; our value is the **voice agent state layer** to graft on. + +--- + +## 14. Operator model: gardening while agents work (full spec) + +This section is the **product and architecture contract** for voice on HAPI. It captures everything worth keeping from the CursorVox attempt, everything we refuse to repeat, and the operator stance: + +> **I am doing the gardening. I want my agents kept busy. 
I want to be responsive when they actually need me - not nagged, not lied to, not asked which bed I'm standing in when I already picked one.** + +That sentence is the north star. Implementation must optimize for **async delegation + selective interruption**, not for **conversational co-piloting**. + +### 14.1 Roles (who owns what) + +| Actor | Job | Must NOT do | +|-------|-----|-------------| +| **Operator (you)** | Pick session, delegate tasks, approve/deny, unblock, occasionally check status | Babysit tool traces, re-explain context the agent already has | +| **Coding agent (CLI)** | Execute work, emit `AGENT_NOTIFY_SUMMARY`, ask when blocked | Assume voice heard a message that never queued | +| **HAPI hub** | Session truth, message queue, permissions, mode state, SSE/events | Guess intent from stale sidecar memory | +| **Voice layer (ElevenLabs + web tools)** | STT/TTS, classify utterance → HAPI tool, speak hub-backed updates | Own task state, target resolution, or "worker started" fiction | +| **Modality wrapper (hub)** | Inject voice-only execution policy into **outbound** user messages | Pollute desktop/non-voice turns | + +CursorVox collapsed the last three rows into a **dispatch orchestrator** (`dispatch_agent.py`) that tried to be a second agent. That is the spaghetti. HAPI splits them again. + +### 14.2 Gardening metaphor → mode states + +Port the **deterministic mode machine** from `voice_mode_state.py`, but store transitions in **HAPI session metadata**, driven by hub events - not dispatcher memory. 
+ +| Mode | Gardener experience | Voice should | Hub/driver events | +|------|---------------------|--------------|-------------------| +| `idle_warm` | Just finished a task; still in flow | Stay quiet unless spoken to; short acks OK | Agent `status: done` notify; recent report within warm window (~30 min) | +| `idle_cold` | Away for hours; context stale | On next task, **recap first** then confirm intent | Time since last report exceeds warm window; or long gap since user turn (~12 h cold recap) | +| `report_refresh` | "What's going on?" while hands busy | Answer from **session truth**: pending permissions, last notify summary, blocked | User check-in phrases ("what needs me", "where did we leave off"); read-only query | +| `align_intent` | New instruction, not yet sent | Mirror outcome briefly; clarify **task details only** | User message classified as work; before hub accepts queue | +| `await_confirm` | About to do something risky | Hold sends; ask yes/no/revise; **block** `messageCodingAgent` until resolved | High-risk intent or agent asked confirm; safety gate armed | +| `executing_async` | **Gardening** - agent should be busy | **Silence.** No "still working?" No filler. | Message queued + consumed; agent working; until notify or permission | +| `reporting` | Agent finished a beat worth hearing | Speak **notify summary** (stripped JSON tail), 1-3 sentences | Assistant message with parseable `AGENT_NOTIFY_SUMMARY` or ready + substantive reply | +| `blocked` | Agent needs you **now** | Clear blocker + concrete ask ("allow bash?", "pick A or B") | Notify `status: blocked`; permission pending; send failed; transport error | + +**Warm vs cold idle** matters for returning gardeners: cold start should trigger recap + confirm, not "what project?" when the HAPI session is already selected. + +### 14.3 What CursorVox proved worth keeping + +These are **non-negotiable ports** (Phase 1-3): + +1. 
**`AGENT_NOTIFY_SUMMARY` contract** (`modality.py`, `agent-notify/ACTUALSPEC.md`) - machine-readable completion; voice reads `summary`, mode reads `status`/`action`. +2. **Mode state machine** (`voice_mode_state.py`) - especially `executing_async` silence and `report_refresh` check-ins. +3. **Modality wrapper on voice sends only** (`wrap_voice_user_message`) - desktop CLI sessions stay normal; phone voice gets execution policy + mode context block. +4. **Speakable stripping** (`voice_speakable.py`) - never TTS the JSON tail or raw tool dumps. +5. **Confirm gating for risky work** (`safety.py`, `command_router.py` confirm paths) - two-step approve where needed; **hub-owned** pending confirm, not dispatcher memory. +6. **Read-only status intents** (Vox: `what_needs_me`, `read_last_response`) - answer from cache/state **without** spawning agent work. +7. **Attention semantics** (`ATTENTION_QUEUE_VOX.md`) - sort by "needs me now" (approval, blocked, action required) vs calm idle tabs. +8. **Auditability** (interface spec §8) - every voice turn should be traceable: heard → classified → hub action → spoken result. +9. **Operator guide flow** - arrive → check state → send work → approve in two steps when high risk. + +### 14.4 What CursorVox proved we must NOT repeat + +Documented in postmortem (2026-05-23 jellybot session) and reinforced by `voice-sessions.jsonl`: + +| Failure | Evidence | HAPI rule | +|---------|----------|-----------| +| Second brain re-asks target | Bound session known; dispatcher still "which project?" | **Selected HAPI session is the target.** Voice never resolves windows/tabs. | +| Premature execution narrative | `"I'm starting the workspace check now..."` before transport proof | Speak **only after hub accepts/queues** message. Tool return `"sent"` is not enough if persistence fails - tighten in Phase 2. 
| +| Transport lies to user | CursorRemote: text inserted, Enter pressed, not in transcript | HAPI path: runner + CLI stream-json; failure surfaces as `blocked`, not fake progress | +| Dispatcher memory drift | `DispatchMemory.short_state/long_state` diverges from IDE | **No rolling dispatcher state.** Session messages + notify JSON are truth. | +| Over-clarification loops | User: "overall purpose" → still aligning | With bound session, clarify **task outcome only**, max N rounds then pass-through (configurable) | +| Chatty async | Filling silence while agent works | `executing_async` = **mandatory silence** in voice prompt + no proactive TTS except permissions/blockers/done | + +`dispatch_agent.py` remains a **cautionary reference** - mine phrasing for lightweight classification if needed, never port as state owner. + +### 14.5 Understanding intent (three layers, not one spaghetti LLM) + +**Layer 0 - Transport interrupts (deterministic, optional port)** + +Vox `intents.py` handles only hard interrupts: "stop talking", "be quiet". No semantic parsing. HAPI may add similar client-side hooks (mute TTS, end voice session) without LLM. + +**Layer 1 - Voice LLM classifier (already in HAPI)** + +`shared/src/voice.ts` + `realtimeClientTools.ts`: + +- `messageCodingAgent(message)` - delegate work to **active session** +- `processPermissionRequest(allow|deny)` - respond to **hub's pending permission** +- Direct answer - meta/voice questions the assistant can answer from injected context + +The LLM **classifies and phrases**. It does not maintain `pending_worker_message`, `short_state`, or choose targets. 
+ +Context for classification arrives via `voiceHooks.ts` → `sendContextualUpdate`: + +- Session focus + history (`formatSessionFull`) +- New agent messages (`onMessages`) +- Permission requests (`onPermissionRequested`) +- Ready/done (`onReady`) + +That is how the voice layer "understands" without owning task memory: **HAPI pushes truth in; LLM maps speech to tools.** + +**Layer 2 - Hub mode gating (deterministic, Phase 2)** + +Even if the LLM calls `messageCodingAgent`, the hub/client may **reject** when: + +- `mode_state === await_confirm` and message is not confirm/revise/cancel +- High-risk mutation without completed confirm flow (port `safety.py` ideas) +- No active session / runner offline → spoken `blocked`, not dispatch + +This is the Vox `command_router` lesson without the CursorRemote target picker. + +**Explicit non-layer:** CursorVox `dispatch_agent.py` JSON (`action: ask_user | dispatch_worker`, rolling memory). **Deleted from architecture.** + +### 14.6 Communicating with the agent on the user's behalf (end-to-end) + +```text +User speech + → STT (ElevenLabs today; Speaches later) + → Voice LLM picks tool OR answers locally + → [Gate] mode + safety check + → realtimeClientTools.messageCodingAgent(msg) + → sessionStore.sendMessage(sessionId, msg) # must include sentFrom: voice (Phase 2) + → hub messageService / sync queue + → [Wrap] modalityWrapper adds mode block + notify contract reminder (voice only) + → runner forwards to agent CLI + → agent works (operator gardens) + → assistant messages → sessionHandlers + → AgentNotifyParser extracts AGENT_NOTIFY_SUMMARY + → modeEngine transitions (e.g. executing_async → reporting → idle_warm) + → SSE + voiceHooks push context + → Voice speaks summary (TTS) when mode allows +``` + +**Acknowledgement policy (fixes Vox false start):** + +1. User finishes speaking. +2. Tool invoked. +3. **Hub persists and queues** user message (exit criterion Phase 2). +4. Only then: brief ack ("Sent." 
/ "Got it.") - optionally **hub-generated**, not LLM improvisation. +5. Transition to `executing_async`. +6. **Silence** until permission, blocker, or notify done. + +**Pass-through wording:** Voice should forward user intent **without rewriting into a different task**. Dispatcher paraphrase caused drift ("workspace check" vs "what is this project about"). Modality wrapper tells the **agent** how to behave; it should not replace the user's words. + +### 14.7 Being responsive to agents (when to interrupt the gardener) + +Interrupt priority (highest first): + +1. **Permission request** - agent blocked on tool approval; speak immediately with allow/deny prompt. +2. **`AGENT_NOTIFY_SUMMARY` with `status: blocked`** - agent needs decision or info. +3. **`AGENT_NOTIFY_SUMMARY` with `status: needs_decision`** - multiple options; speak options briefly. +4. **Send/transport failure** - queue rejected, runner offline; do not claim work started. +5. **`status: done`** - speak summary when operator would want to know work finished (respect DND later). +6. **Agent question in `await_confirm` territory** - agent asked "is that right?"; hold execution. + +Do **not** interrupt for: + +- Routine tool calls mid-flight +- Partial streaming tokens +- Dispatcher "helpful" check-ins +- Low-confidence re-clarification when session + user text are sufficient + +This maps Vox **attention queue** to HAPI: surface sessions with `pending permission`, `blocked notify`, or `failed send` - not every idle session. + +### 14.8 Check-in workflows (gardening-friendly) + +Port Vox check-in detection (`_is_check_in_text` in `voice_mode_state.py`) as shared helpers. These are **read-mostly** - they should query hub state, not spawn agents unless user explicitly adds new work: + +| User says (examples) | Mode transition | Action | +|----------------------|-----------------|--------| +| "What needs me?" 
| `report_refresh` | Summarize pending permissions + blocked sessions + last notify per focused session | +| "Where did we leave off?" | `report_refresh` | Last agent notify summary + recent user/agent exchange | +| "Read last response" | `report_refresh` | Speakable tail of last assistant message (strip contract JSON) | +| "Status?" / "Catch me up" | `report_refresh` | Same as above; cold idle adds recap preamble | + +After recap, if user adds new work in same utterance, transition `align_intent` → queue send. + +Future tools (Phase 3+): `getSessionStatus`, `listAttentionItems` - only if prompt injection via context updates proves insufficient. + +### 14.9 Modality wrapper content (what agents see vs what user hears) + +User hears: natural speech, summaries, permission prompts. + +Agent sees (voice-originated turns only), adapted from `modality.py`: + +- `[user said]` block with verbatim user text +- `[HAPI voice mode context]` with current `mode_state`, optional `pending_intent_digest` +- Execution policy bullets per mode (e.g. `executing_async`: milestone/blocker/done only) +- Reminder to emit `AGENT_NOTIFY_SUMMARY {"version":1,...}` on completion + +Agent does **not** see dispatcher `short_state`/`long_state` rolls. + +Global `AGENTS.md` rule already requires notify line format; HAPI wrapper reinforces it for voice sessions. + +### 14.10 Permission and destructive work (responsive, not reckless) + +Vox pattern: `approve current` → `confirm approve` (two-step). HAPI already has `processPermissionRequest` against hub permission state. + +Extend for Cursor `--trust` remote mode (Phase 2 guardrails): + +- Voice-triggered **mutating** tasks may require `await_confirm` + spoken playback of intent +- Destructive keywords / high-risk tool patterns port selectively from `safety.py` +- Never auto-approve from voice without explicit allow + +Operator gardening implies **trust but verify**: agent stays busy on safe work; risky work waits for a nod. 
+ +### 14.11 Feedback loop: one truth, three consumers + +When an assistant message lands in `sessionHandlers.ts`: + +1. **Store** full message (unchanged) in SQLite +2. **Parse** `AGENT_NOTIFY_SUMMARY` if present → typed object +3. **Update** session voice mode via `modeEngine` +4. **Broadcast** SSE/session update to web/Telegram +5. **Voice** via hooks: contextual update for LLM; optional proactive TTS on `reporting` / done +6. **agent-notify** (optional): same parsed object for stop-hook TTS/tmux - single contract, multiple outputs (ACTUALSPEC §1) + +Speak **notify.summary**, not dispatcher paraphrase. Strip JSON before TTS. + +### 14.12 UX principles (from legit interface spec, adapted for HAPI) + +Keep from CursorVox product contract: + +- Command-first, not generic chatbot that sometimes sends work +- Mobile-first: large talk control, visible session, clear connection health +- Show what was heard and what happened (audit trail in UI - HAPI web can extend session view) +- Errors speakable: no session, runner offline, permission missing, send failed + +Drop: + +- Cursor target cards from `windowSnapshots` +- Pipecat demo as primary surface (HAPI PWA replaces) +- Dependency on CursorRemote bridge health + +### 14.13 Phase mapping to gardening capabilities + +| Phase | Gardener capability unlocked | +|-------|------------------------------| +| **0** (dogfood) | Can open HAPI, pick session, type/send; voice path read end-to-end | +| **1** | Hub knows mode + notify; "what needs me" can be answered from truth; completion visible in API | +| **2** | Voice send is wrapped + gated; no false "starting"; confirm before risky voice tasks | +| **3** | Full loop: speak task → garden → hear done summary; permissions interrupt reliably | +| **4** | Retire CursorVox stack; HAPI is canonical remote | + +### 14.14 Dogfood acceptance matrix (adapt from CursorVox) + +Each scenario must pass on HAPI before CursorVox decommission: + +| # | Scenario | Pass criteria | 
+|---|----------|---------------| +| 1 | Cold return | After simulated 12h idle, "where did we leave off" gives recap from session, not "which project?" | +| 2 | Delegate async | Voice sends harmless task; **no speech** until done notify or permission | +| 3 | Permission interrupt | Agent requests bash; voice prompts; allow/deny works; agent continues | +| 4 | Blocked notify | Agent emits `status: blocked`; voice speaks action field; mode = blocked | +| 5 | Done notify | Agent emits `status: done`; voice speaks summary; mode → idle_warm | +| 6 | Confirm gate | Risky task enters await_confirm; spurious send blocked until confirm | +| 7 | Send failure | Runner offline → blocked spoken; **no** "starting now" | +| 8 | Check-in read-only | "What needs me" does not enqueue new agent message | +| 9 | Multi-session | Switch focus in UI; voice respects new session context | +| 10 | Cursor trust path | Document behavior when `--trust` skips per-tool approval | + +Record evidence in hub logs + `~/.hapi/voice-sessions.jsonl` (via `POST /api/voice/sessions/log` on voice disconnect). Example: `docs/dogfood/2026-05-23-elevenlabs-voice-first-hello.md`. + +### 14.15 Open design choices (gardening tuning) + +1. **Max clarification rounds** in `align_intent` before pass-through (Vox had dispatcher loops; suggest cap at 1-2 when session bound). +2. **Proactive done speech** when voice session inactive but notify arrives (Telegram push vs later voice readback). +3. **Attention API** on HAPI: dedicated endpoint vs derive from session list + permission counts. +4. **Upstream PR order:** state contract (Phase 1) → modality wrapper (Phase 2) → `VoiceTransportProvider` + local OpenAI stack (Phase 3) - each slice reviewable without breaking default ElevenLabs path. + +Default stance: **ship deterministic hub behavior first**; tune LLM phrasing second. + +--- + +## 15. 
Document maintenance + +When phases complete, update: + +- [ ] This file (checkboxes + §10 decisions + §16 PR status) +- [ ] `~/coding/server-setup` runbook if `svc:hapi` deployed +- [ ] `~/coding/cursorvox/README.md` - add deprecation pointer to HAPI voice integration (upstream PR path) +- [ ] `~/coding/CursorRemote/README.md` - clarify IDE-legacy scope +- [ ] `docs/dogfood/*` when new voice sessions are recorded + +**Do not** commit upstream submodule changes without explicit operator request. + +--- + +## 16. Upstream PR strategy, dogfood findings, and new-agent handoff + +**Audience:** New agent taking this forward with a **clean start** (fresh branch, re-implement WIP; do not assume operator's uncommitted diff is deployed). + +**Canonical dogfood artifacts:** + +| File | Content | +|------|---------| +| `docs/dogfood/2026-05-23-elevenlabs-voice-first-hello.md` | Hello test; handoff OK; readback failed | +| `docs/dogfood/2026-05-23-elevenlabs-subtitle-search.md` | Real feature request; handoff OK; summary never delivered | +| `docs/dogfood/*.jsonl` | Exported ElevenLabs conversation JSON (sanitized for repo) | +| `~/.hapi/voice-sessions.jsonl` | Runtime log after PR B (operator machine) | + +### 16.1 Production gaps (May 2026 proxmox dogfood) + +| Symptom | Evidence | +|---------|----------| +| **Handoff works** | `POST .../messages 200` after `messageCodingAgent` | +| **Readback fails** | ConvAI says "finished" but cannot summarize coding-agent output | +| **Ready hook misleads** | `formatReadyEvent` injects *"previous messages ARE the summary"* without embedding text; shows as fake `user` turn in ElevenLabs transcript | +| **Wrong agent label** | Ready text says "Claude Code" on Cursor sessions | +| **ConvAI chattiness** | "Are you still there?" 
during async work despite `VOICE_SYSTEM_PROMPT` silence rules | +| **No hub transcript** | Voice conversation only on ElevenLabs unless manually fetched | +| **No upstream issue** | No matching open issue on `tiann/hapi` | + +### 16.2 How HAPI readback works today (and why it failed) + +| Mechanism | Role | +|-----------|------| +| `voiceHooks.onMessages` | Push new coding-agent messages to ConvAI as **contextual updates** (includes text) | +| `voiceHooks.onReady` | When `session.thinking` false → `sendUserMessage(formatReadyEvent(...))` | +| `VOICE_SYSTEM_PROMPT` | Tells ConvAI to wait after tool send; summarize on updates | + +**Root bug:** `onReady` asserts the summary already exists in context but **does not paste assistant text**. ConvAI hallucinates progress ("summary in the previous message") when context is empty or thin. + +**Planned fixes (PR A):** `extractLastAssistantSpeakable()` + embed in `formatReadyEvent`. **Planned enhancement (PR C):** prefer optional `AGENT_NOTIFY_SUMMARY.summary` when operator agents emit it. 
+ +### 16.3 Dogfood session 1 - hello test + +- **Conv ID:** `conv_1201ksawpq32evna7dcy4ksaw3eh` · 83s · "Message Coding Agent" +- **Hub session:** `9d04335d-2b90-4941-98a7-eb414823f0e0` (jellybot / Cursor) +- **Hub:** token 18:04:11, message POST 18:04:34 +- **Result:** `messageCodingAgent("hello")` succeeded; user asked for summary; ConvAI had nothing substantive to report +- **Lesson:** Even trivial tasks expose ready-hook gap + +### 16.4 Dogfood session 2 - subtitle search feature + +- **Conv ID:** `conv_8501ksaxzm0tfv98198ar2r2t777` · 291s · "Subtitle Search Feature" +- **Hub:** token 18:26:31, message POST 18:27:35 (real jellybot feature request) +- **Flow:** User described subtitle index/search feature → ConvAI confirmed → `messageCodingAgent` → "sent" (tool obeyed) +- **Failures:** + - ConvAI check-ins at 89s, 130s (violates async silence) + - Ready injection at 176s with empty summary claim + - User "yes please" for summary at 228s → never delivered; ended politely useless at 282s +- **Lesson:** Handoff production-ready; **completion reporting is not** + +### 16.5 Upstream landscape + +| Item | Type | Status | Relevance | +|------|------|--------|-----------| +| **[PR #640](https://github.com/tiann/hapi/pull/640)** | PR | OPEN | Codex messages → voice context; ready on completion messages. **Coordinate with PR A.** Does not fix inline ready text or Cursor. | +| **[PR #401](https://github.com/tiann/hapi/pull/401)** | PR | OPEN, conflicts, changes requested | `VOICE_BACKEND`, Gemini Live, Qwen; runtime `GET /api/voice/backend`. **Enable/track - do not reimplement.** | +| **[#462](https://github.com/tiann/hapi/issues/462)** | Issue | OPEN | Composer dictation - **not** voice-assistant flow | + +### 16.5.1 Maintainer review gate - PR #401 (`tiann`, CHANGES_REQUESTED) + +**tiann (2026-04-06):** *"I believe this is a good feature. 
Please fix the comments first."* + +That means resolve **HAPI Bot** inline review threads before re-requesting review - not invent a parallel architecture. tiann has no separate inline comments; the actionable list is the bot findings on the PR head (`aa9802d` at last check). + +**Must hold (maintainer coexistence contract):** + +| Requirement | Why | +|-------------|-----| +| `DEFAULT_VOICE_BACKEND = 'elevenlabs'` | Hubs with only `ELEVENLABS_API_KEY` must not route to Gemini/Qwen | +| ElevenLabs code paths untouched | Zero regression on default install | +| ElevenLabs prompt/language unchanged | Chinese prompt block must **not** leak into ElevenLabs config; use backend-specific prompts | +| WebSocket proxies JWT-gated | `/api/voice/gemini-ws` and `/api/voice/qwen-ws` require hub JWT before upgrade | +| No provider secrets to browser | Token endpoints return proxied `wsUrl` only; DashScope/Gemini keys stay server-side | +| Split upstream vs client WS URLs | `*_UPSTREAM_WS_URL` server-only; browser always gets `/api/voice/*-ws` | +| Voice button gated until registered | `onRegistered` after lazy chunk mounts + `registerVoiceSession()` - not just backend discovery | +| `HAPI_PUBLIC_URL` / request origin for proxy URLs | Remote browsers must not get `ws://localhost:...` | +| Sequential Gemini tool calls | No `Promise.all` on shared permission/session state | +| AudioWorklet graph pulls frames | Worklet connected through zero-gain sink to destination | +| Mobile AudioContext in user gesture | Create/resume playback context at start of click handler | +| Cleanup on failed starts | Close leaked `AudioContext` on throw paths | +| No `skipWaiting`/`clientsClaim` in SW | Avoid lazy-chunk hash mismatch mid-session after deploy | +| Debug-guard voice tool logs | `messageCodingAgent` logs behind `VOICE_CONFIG.ENABLE_DEBUG_LOGGING` | + +**Claimed fixed by author (verify on rebase):** Qwen WS auth, default backend revert, proxy URL split, ElevenLabs language split, voice-button 
readiness, ws URL origin, audio graph, sequential tools, SW revert, debug logging, mic mute on start. + +**Still open per HAPI Bot at last head (must confirm or fix before merge):** + +| Finding | File area | Notes | +|---------|-----------|-------| +| Gemini unmutes user after `turnComplete` | `GeminiLiveVoiceSession.tsx` | Must respect user mute across model speech (barge-in mute ≠ user mute) | +| Failed starts leak `AudioContext` | Gemini + Qwen sessions | `try/catch` + `cleanup()` on all early exits | +| Composer Enter-to-send inverted | `HappyComposer.tsx` | Bot flags as unrelated regression; author claims separate intentional UX change - **needs tiann ruling**, not assumption | +| Gemini setup message dropped under proxy backpressure | `server.ts` / Gemini WS | Queue initial setup if upstream slow | +| Qwen stuck in `connecting` on setup error after `session.created` | `QwenVoiceSession.tsx` | Reject promise + surface error | +| Merge conflicts with current `tiann/main` | whole PR | Rebase required | + +**Our PR A/B/C:** Do not bundle unrelated composer or SW changes. Stack on #401 only after above gate passes. + +### 16.5.2 Maintainer review gate - PR #640 + +**tiann:** No review yet (OPEN, no CHANGES_REQUESTED). + +**HAPI Bot (initial review) - must fix before merge:** + +| Finding | Issue | Required fix | +|---------|-------|--------------| +| Historical ready replay | `SessionChat.tsx` scans `newMessages` including hydrated history | Move ready detection to **live SSE** `message-received` path only; never fire `onReady` for refetched/old `ready` / `task_complete` rows | + +**Overlap with our PR A:** #640 improves Codex message formatting + ready **trigger**; PR A fixes ready **payload** (inline assistant text). Prefer: merge #640 first or one coordinated PR; do not duplicate Codex formatter work in PR A. + +**Does not replace PR A:** #640 does not embed assistant text in `formatReadyEvent` or fix Cursor sessions. 
+ +### 16.6 Upstream PR series (merge-shaped) + +Each PR: **default ElevenLabs behavior unchanged** when env unset. + +#### PR A - Voice completion readback (ship first) + +- `formatReadyEvent(sessionId, lastAssistantText?)` embeds assistant text; agent-neutral wording +- `voiceHooks.onReady` uses `extractLastAssistantSpeakable(messages)` +- Tests: `contextFormatters.test.ts` +- Coordinate with PR #640 + +#### PR B - ElevenLabs conversation archive + +- `hub/src/voice/elevenLabsConversationLog.ts` +- `POST /api/voice/sessions/log` +- Web: store `conversationId` on start, archive on disconnect +- Log path: `{HAPI_HOME}/voice-sessions.jsonl` + +#### PR C - Optional AGENT_NOTIFY parsing + +- `shared/src/agentNotify.ts`; hook `sessionHandlers.ts` +- Voice prefers notify `summary` when present +- **Not** a HAPI default; **not** required for all users +- Upstream docs: "optional convention for custom agent rules" + +#### PR D - Mode state + modality wrapper (later) + +- `voiceMode.ts`, `modalityWrapper.ts`, `await_confirm` gating +- Operator fork or post A-C if maintainer wants scope + +#### PR E - Local OpenAI backend (after #401) + +- `VOICE_BACKEND=local-openai` using #401's switcher +- Speaches STT + local chat tools + Chatterbox TTS +- **Not** a parallel architecture + +### 16.7 PR #401 - enable and extend (do not reinvent) + +PR #401 adds: + +```bash +VOICE_BACKEND=elevenlabs # default, unchanged +VOICE_BACKEND=gemini-live # Google Live API, function calling +VOICE_BACKEND=qwen-realtime +GET /api/voice/backend # runtime discovery, no Vite rebuild +``` + +**Recommendation:** + +1. Rebase `Overbaker:feat/pluggable-voice-backend` onto `tiann/main`; resolve merge conflicts +2. Walk **§16.5.1 checklist** against PR head; fix or verify each HAPI Bot thread; re-request tiann review +3. On Composer Enter change: confirm with tiann whether it stays in #401 or splits to separate PR (bot treats it as blocker) +4. Stack PR A/B/C on `VoiceBackendSession` after #401 merges +5. 
Use **Gemini Live** for early non-ElevenLabs dogfood (free tier, tools) while local stack waits on PR E +6. Verify ElevenLabs remains sole default when env unset (`DEFAULT_VOICE_BACKEND`) + +### 16.8 Local WIP status (operator clone, May 2026) + +**Not committed. Not running on proxmox** (hub since 15:29 without restart; web not rebuilt). + +| Path | Target PR | +|------|-----------| +| `web/src/realtime/hooks/contextFormatters.ts` | A | +| `web/src/realtime/hooks/voiceHooks.ts` | A, C | +| `hub/src/voice/elevenLabsConversationLog.ts` | B | +| `hub/src/web/routes/voice.ts` | B | +| `web/src/realtime/RealtimeSession.ts` | B | +| `web/src/realtime/RealtimeVoiceSession.tsx` | B | +| `web/src/api/client.ts` | B | + +New agent: **re-implement on fresh branch** from current upstream; use above as reference only. + +### 16.9 New-agent checklist + +- [ ] Read §14 (gardening model) and §16 (this section) +- [ ] Read `docs/dogfood/*` +- [ ] `git pull` on `tiann/hapi/main` +- [ ] Review PR #640 and #401 (**§16.5.1-16.5.2 maintainer gates**) +- [ ] Implement PR A + tests; `bun run test` + `bun typecheck:web` +- [ ] Re-dogfood subtitle-search scenario; verify ElevenLabs transcript shows inline assistant text in ready injection +- [ ] Open upstream PR with conv IDs; note AGENT_NOTIFY is opt-in in PR C description +- [ ] PR B, then PR C, then help on #401, then PR E + +### 16.10 Reproduction commands + +```bash +cd ~/coding/hapi/hub && bun run dev +cd ~/coding/hapi/web && bun run dev +# Voice session → delegate task → wait for agent → ask for summary + +curl -H "xi-api-key: $ELEVENLABS_API_KEY" \ + "https://api.elevenlabs.io/v1/convai/conversations?agent_id=$ELEVENLABS_AGENT_ID" +``` + +Suggested Git workflow: + +```bash +cd ~/coding/hapi +git fetch origin +git checkout main && git pull origin main + +# Option 1: PR A only (smallest) +git checkout -b fix/voice-ready-inline-summary + +# Option 2: help upstream #401 first +gh pr checkout 401 --repo tiann/hapi +# resolve conflicts, 
address tiann review, then stack PR A on top +``` + +Open PR with: dogfood conv IDs, **"default behavior unchanged"** regression note, PR #640/#401 relationship, and for PR C explicit **AGENT_NOTIFY is opt-in user agent convention**. + +**Explicit non-goals for upstream series:** stateful voice dispatcher; required AGENT_NOTIFY for all users; breaking ElevenLabs default; CursorVox/CursorRemote coupling; Docker/systemd in same PRs. diff --git a/docs/plans/2026-05-23-web-agent-chat-import-picker.md b/docs/plans/2026-05-23-web-agent-chat-import-picker.md new file mode 100644 index 0000000000..9d8d731dde --- /dev/null +++ b/docs/plans/2026-05-23-web-agent-chat-import-picker.md @@ -0,0 +1,128 @@ +# Plan: Web-based import picker for existing agent chats + +**Status:** Draft +**Related:** [voice integration plan](./2026-05-23-voice-agent-state-integration.md) +**Operator need:** Attach pre-HAPI agent conversations (Cursor, Claude, Codex, etc.) to the HAPI session list without restarting work or running CLI one-liners per repo. + +--- + +## Problem + +HAPI only lists sessions created through the hub (CLI `hapi *`, web spawn, runner RPC). Chats started outside HAPI (`agent`, `claude`, `codex` in a terminal) are invisible until manually wrapped. + +Today the operator workaround is: + +```bash +# CLI per project (local-first; blocks terminal unless remote spawn) +cd ~/coding/myproject && hapi cursor resume + +# Batch remote attach (May 2026 operator script) +~/coding/hapi/localdocs/operator/attach-existing-agent-sessions.sh +``` + +That script uses `POST /api/machines/:id/spawn` with **`resumeSessionId`** (added to `SpawnSessionRequestSchema` May 2026). Web UI does not expose this yet. 
+ +--- + +## Goal + +**Browse workspace → pick agent flavor → pick discovered local chat → attach to HAPI** (remote runner spawn), with: + +- No duplicate HAPI session if same agent chat already attached +- Clear label (project path + chat title/first message + mtime) +- Optional pin/rename in HAPI metadata after attach + +--- + +## Discovery sources (read-only, machine-local via runner RPC) + +| Agent | Discovery | Resume id field | +|-------|-----------|-----------------| +| **Cursor** | `agent ls` (TTY) or parse `~/.cursor/projects/home-heavygee-coding-*/agent-transcripts/` + optional `store.db` titles | Cursor chat UUID | +| **Claude** | Latest `~/.claude/projects/-home-heavygee-coding-<project>/*.jsonl` | Claude session UUID | +| **Codex** | Scan `~/.codex/sessions/**/rollout-*.jsonl` `session_meta.payload.id` filtered by `cwd` | Codex thread id | +| **Gemini / OpenCode / Kimi** | Existing HAPI scanner patterns in `cli/src/*/utils/*Scanner*` | flavor-specific metadata | + +**Runner constraint:** discovery runs on the machine with workspace roots; hub never reads `~/.cursor` directly. + +--- + +## Proposed UX (web) + +1. **Entry:** Session list empty state or `/browse` → **Import existing chat** +2. **Step 1:** Machine (if multiple) + directory picker (existing browse UI) +3. **Step 2:** Agent flavor tabs (cursor | claude | codex | …) +4. **Step 3:** List discovered chats: + - Title heuristic: first user message / sqlite title / "Untitled" + - Subtitle: `mtime`, message count estimate, agent id prefix + - Badge: **Already in HAPI** if hub metadata matches `cursorSessionId` / etc. +5. **Attach:** `POST /api/machines/:id/spawn` with `{ directory, agent, resumeSessionId }` +6. 
**Result:** Navigate to new session; show note that **full transcript backfill is not guaranteed** (agent context intact; HAPI history may be sparse until new traffic) + +--- + +## API / hub work (upstream-shaped) + +### PR F1 - Expose resume on spawn (done locally May 2026) + +- `shared/src/apiTypes.ts`: `resumeSessionId?: string` on `SpawnSessionRequestSchema` +- `hub/src/web/routes/machines.ts`: forward to `engine.spawnSession` +- Tests: spawn route passes resume id to RPC + +### PR F2 - Discover local agent chats (new) + +- `GET /api/machines/:id/agent-chats?directory=<path>&agent=cursor|claude|codex` +- Runner RPC: `list-agent-chats` executes flavor-specific discovery in cwd +- Returns `{ chats: [{ id, title, updatedAt, cwd, alreadyAttachedSessionId? }] }` +- Security: path must be under runner workspace roots (same as spawn) + +### PR F3 - Web import wizard + +- `web/src/components/ImportAgentChat/` wizard +- Uses F1 + F2; no new hub session store tables + +--- + +## Edge cases + +| Case | Handling | +|------|----------| +| **Named chat not literal on disk** (e.g. 
operator says "login2oidc") | Search titles/first-message heuristics; show match confidence; allow manual id paste | +| **`agent ls` needs TTY** | Runner uses transcript mtime fallback; optional `script`/`CI=1` probe | +| **specstory `agent` alias** | Document `command agent` or full cursor-agent path in runner env | +| **Duplicate attach** | Disable Attach if `alreadyAttachedSessionId`; offer Open existing | +| **Wrong cwd** | Warn when discovery cwd ≠ picker directory | + +--- + +## Dogfood attach map (May 2026 operator batch) + +| Project | Agent | Resume id | Notes | +|---------|-------|-----------|-------| +| sparling | cursor | `d2b0370c-3e29-4462-9296-f984f0614aef` | Latest transcript mtime | +| sparling | claude | `12d15516-adb9-49cf-8e7a-9bb18ede3246` | Latest `.jsonl` in claude projects dir | +| meister-export-web | cursor | `c5add90a-2389-48c1-a9d6-10d24195435c` | | +| server-setup | cursor | `3054d570-fe5d-4d0d-8d4e-9f5ac2a45dea` | OIDC/htaccess modernization thread (operator label: login2oidc) | +| ExcuseMe | cursor | `f0f6291f-7ecc-4bf3-9c9d-c09bfb831ff7` | | +| gtxr | codex | `019e4b52-a96d-7283-9098-3a7ff8599a54` | From rollout `session_meta` cwd match | +| local-speech-agent | cursor | `ba02940f-f488-489b-9a2d-c00a0880cfe2` | | +| YourChores | cursor | `9118c502-4253-42a8-8d4d-0ce123c1f519` | | + +Script: `localdocs/operator/attach-existing-agent-sessions.sh` (operator-local; see `docs/operator-local-tooling.md`) + +--- + +## Non-goals + +- Full historical transcript import into HAPI message store (separate backfill project) +- Replacing agent-native session pickers inside Cursor/Claude UIs +- Requiring AGENT_NOTIFY or voice changes + +--- + +## Success criteria + +- Operator attaches 8 chats without CLI one-liners +- All appear in web session list as **remote** runner sessions +- Agent continues prior context (smoke: send "where did we leave off" via web) +- Web picker PR F2+F3 removes need for hardcoded script diff --git 
a/hub/src/web/routes/voice.test.ts b/hub/src/web/routes/voice.test.ts new file mode 100644 index 0000000000..f4a63987bf --- /dev/null +++ b/hub/src/web/routes/voice.test.ts @@ -0,0 +1,135 @@ +import { describe, expect, it, mock } from 'bun:test' +import { Hono } from 'hono' +import { SignJWT } from 'jose' +import type { WebAppEnv } from '../middleware/auth' +import { createAuthMiddleware } from '../middleware/auth' +import { createVoiceRoutes } from './voice' + +const JWT_SECRET = new TextEncoder().encode('test-secret') + +async function authHeaders() { + const token = await new SignJWT({ uid: 1, ns: 'default' }) + .setProtectedHeader({ alg: 'HS256' }) + .setIssuedAt() + .setExpirationTime('1h') + .sign(JWT_SECRET) + return { authorization: `Bearer ${token}` } +} + +function createApp() { + const app = new Hono() + app.use('*', createAuthMiddleware(JWT_SECRET)) + app.route('/api', createVoiceRoutes()) + return app +} + +describe('GET /api/voice/voices', () => { + it('returns 401 without auth', async () => { + const app = createApp() + const res = await app.request('/api/voice/voices') + expect(res.status).toBe(401) + }) + + it('returns empty list when ELEVENLABS_API_KEY is not set', async () => { + const app = createApp() + const headers = await authHeaders() + const prev = process.env.ELEVENLABS_API_KEY + delete process.env.ELEVENLABS_API_KEY + + const res = await app.request('/api/voice/voices', { headers }) + expect(res.status).toBe(200) + expect(await res.json()).toEqual({ voices: [] }) + + if (prev) process.env.ELEVENLABS_API_KEY = prev + }) + + it('maps ElevenLabs voice fields correctly', async () => { + const app = createApp() + const headers = await authHeaders() + const prev = process.env.ELEVENLABS_API_KEY + process.env.ELEVENLABS_API_KEY = 'test-key' + + const fetchMock = mock(() => Promise.resolve(new Response(JSON.stringify({ + voices: [ + { voice_id: 'v1', name: 'Alice', preview_url: 'https://cdn.example/a.mp3', category: 'premade' }, + { voice_id: 
'v2', name: 'MyClone', preview_url: 'https://cdn.example/c.mp3', category: 'cloned' }, + ] + }), { status: 200 }))) + + const originalFetch = global.fetch + // @ts-expect-error test override + global.fetch = fetchMock + + const res = await app.request('/api/voice/voices', { headers }) + expect(res.status).toBe(200) + expect(await res.json()).toEqual({ + voices: [ + { id: 'v1', name: 'Alice', previewUrl: 'https://cdn.example/a.mp3', category: 'premade' }, + { id: 'v2', name: 'MyClone', previewUrl: 'https://cdn.example/c.mp3', category: 'cloned' }, + ] + }) + + global.fetch = originalFetch + if (prev) process.env.ELEVENLABS_API_KEY = prev + else delete process.env.ELEVENLABS_API_KEY + }) +}) + +describe('POST /api/voice/token', () => { + it('creates/selects voice-specific agent when voiceId is provided', async () => { + const app = createApp() + const headers = { + ...(await authHeaders()), + 'content-type': 'application/json' + } + + const prevKey = process.env.ELEVENLABS_API_KEY + const prevAgent = process.env.ELEVENLABS_AGENT_ID + process.env.ELEVENLABS_API_KEY = 'test-key' + delete process.env.ELEVENLABS_AGENT_ID + + const requests: Array<{ url: string; init?: RequestInit }> = [] + const originalFetch = global.fetch + // @ts-expect-error test override + global.fetch = (async (input: RequestInfo | URL, init?: RequestInit) => { + const url = String(input) + requests.push({ url, init }) + + if (url.endsWith('/convai/agents') && init?.method === 'GET') { + return new Response(JSON.stringify({ agents: [] }), { status: 200 }) + } + if (url.endsWith('/convai/agents/create') && init?.method === 'POST') { + return new Response(JSON.stringify({ agent_id: 'agent_voice_alice' }), { status: 200 }) + } + if (url.includes('/convai/conversation/token?agent_id=')) { + return new Response(JSON.stringify({ token: 'tok_alice' }), { status: 200 }) + } + return new Response('not found', { status: 404 }) + }) as typeof fetch + + const res = await app.request('/api/voice/token', { + 
method: 'POST', + headers, + body: JSON.stringify({ voiceId: 'alice-voice-id' }) + }) + + expect(res.status).toBe(200) + expect(await res.json()).toEqual({ + allowed: true, + token: 'tok_alice', + agentId: 'agent_voice_alice' + }) + + const createCall = requests.find(r => r.url.endsWith('/convai/agents/create')) + expect(createCall).toBeTruthy() + const createBody = JSON.parse(String(createCall?.init?.body)) + expect(createBody.name).toContain('[voice:alice-voice-id]') + expect(createBody.conversation_config?.tts?.voice_id).toBe('alice-voice-id') + + global.fetch = originalFetch + if (prevKey) process.env.ELEVENLABS_API_KEY = prevKey + else delete process.env.ELEVENLABS_API_KEY + if (prevAgent) process.env.ELEVENLABS_AGENT_ID = prevAgent + else delete process.env.ELEVENLABS_AGENT_ID + }) +}) diff --git a/hub/src/web/routes/voice.ts b/hub/src/web/routes/voice.ts index 1a55f83639..7aa6093967 100644 --- a/hub/src/web/routes/voice.ts +++ b/hub/src/web/routes/voice.ts @@ -9,7 +9,17 @@ import { const tokenRequestSchema = z.object({ customAgentId: z.string().optional(), - customApiKey: z.string().optional() + customApiKey: z.string().optional(), + voiceId: z.string().optional() +}) + +const telemetryEventSchema = z.object({ + stage: z.string().min(1), + message: z.string().min(1), + sessionId: z.string().optional(), + voiceId: z.string().optional(), + language: z.string().optional(), + details: z.record(z.string(), z.unknown()).optional() }) // Cache for auto-created agent IDs (keyed by API key hash) @@ -20,10 +30,26 @@ interface ElevenLabsAgent { name: string } +function parseVoiceAgentMap(): Record { + const raw = process.env.ELEVENLABS_VOICE_AGENT_MAP + if (!raw) return {} + try { + const parsed = JSON.parse(raw) as unknown + if (!parsed || typeof parsed !== 'object') return {} + return Object.fromEntries( + Object.entries(parsed as Record) + .filter(([k, v]) => typeof k === 'string' && typeof v === 'string') + .map(([k, v]) => [k, v as string]) + ) + } catch { + 
return {} + } +} + /** * Find an existing "Hapi Voice Assistant" agent */ -async function findHapiAgent(apiKey: string): Promise { +async function findHapiAgent(apiKey: string, agentName: string = VOICE_AGENT_NAME): Promise { try { const response = await fetch(`${ELEVENLABS_API_BASE}/convai/agents`, { method: 'GET', @@ -39,7 +65,7 @@ async function findHapiAgent(apiKey: string): Promise { const data = await response.json() as { agents?: ElevenLabsAgent[] } const agents: ElevenLabsAgent[] = data.agents || [] - const hapiAgent = agents.find(agent => agent.name === VOICE_AGENT_NAME) + const hapiAgent = agents.find(agent => agent.name === agentName) return hapiAgent?.agent_id || null } catch { @@ -51,7 +77,17 @@ async function findHapiAgent(apiKey: string): Promise { * Create a new "Hapi Voice Assistant" agent */ async function createHapiAgent(apiKey: string): Promise { + return createNamedHapiAgent(apiKey, VOICE_AGENT_NAME) +} + +async function createNamedHapiAgent(apiKey: string, agentName: string, voiceId?: string): Promise { try { + const config = buildVoiceAgentConfig() + config.name = agentName + if (voiceId) { + config.conversation_config.tts.voice_id = voiceId + } + const response = await fetch(`${ELEVENLABS_API_BASE}/convai/agents/create`, { method: 'POST', headers: { @@ -59,7 +95,7 @@ async function createHapiAgent(apiKey: string): Promise { 'Content-Type': 'application/json', 'Accept': 'application/json' }, - body: JSON.stringify(buildVoiceAgentConfig()) + body: JSON.stringify(config) }) if (!response.ok) { @@ -83,23 +119,37 @@ async function createHapiAgent(apiKey: string): Promise { * Get or create agent ID - finds existing or creates new "Hapi Voice Assistant" agent */ async function getOrCreateAgentId(apiKey: string): Promise { + return getOrCreateAgentIdForVoice(apiKey) +} + +function getVoiceAgentName(voiceId?: string): string { + if (!voiceId || voiceId.trim().length === 0) return VOICE_AGENT_NAME + return `${VOICE_AGENT_NAME} [voice:${voiceId}]` +} + 
+async function getOrCreateAgentIdForVoice(apiKey: string, voiceId?: string): Promise { // Check cache first (simple hash of first/last chars of API key) - const cacheKey = `${apiKey.slice(0, 4)}...${apiKey.slice(-4)}` + const cacheKey = `${apiKey.slice(0, 4)}...${apiKey.slice(-4)}::${voiceId ?? 'default'}` const cached = agentIdCache.get(cacheKey) if (cached) { return cached } + const agentName = getVoiceAgentName(voiceId) + // Try to find existing agent - console.log('[Voice] No agent ID configured, searching for existing agent...') - let agentId = await findHapiAgent(apiKey) + console.log('[Voice] No agent ID configured, searching for existing agent...', { + voiceId, + agentName + }) + let agentId = await findHapiAgent(apiKey, agentName) if (agentId) { console.log('[Voice] Found existing agent:', agentId) } else { // Create new agent console.log('[Voice] No existing agent found, creating new one...') - agentId = await createHapiAgent(apiKey) + agentId = await createNamedHapiAgent(apiKey, agentName, voiceId) if (agentId) { console.log('[Voice] Created new agent:', agentId) } @@ -118,19 +168,24 @@ export function createVoiceRoutes(): Hono { // Get ElevenLabs ConvAI conversation token app.post('/voice/token', async (c) => { + const requestId = crypto.randomUUID() const json = await c.req.json().catch(() => null) const parsed = tokenRequestSchema.safeParse(json ?? {}) if (!parsed.success) { + console.warn('[Voice][Token] Invalid request body', { requestId }) return c.json({ allowed: false, error: 'Invalid request body' }, 400) } - const { customAgentId, customApiKey } = parsed.data + const { customAgentId, customApiKey, voiceId } = parsed.data // Use custom credentials if provided, otherwise fall back to env vars const apiKey = customApiKey || process.env.ELEVENLABS_API_KEY - let agentId = customAgentId || process.env.ELEVENLABS_AGENT_ID + const voiceAgentMap = parseVoiceAgentMap() + const mappedAgentId = voiceId ? 
voiceAgentMap[voiceId] : undefined + let agentId = customAgentId || mappedAgentId || process.env.ELEVENLABS_AGENT_ID if (!apiKey) { + console.warn('[Voice][Token] Missing API key', { requestId }) return c.json({ allowed: false, error: 'ElevenLabs API key not configured' @@ -139,8 +194,9 @@ export function createVoiceRoutes(): Hono { // Auto-create agent if not configured if (!agentId) { - agentId = await getOrCreateAgentId(apiKey) ?? undefined + agentId = await getOrCreateAgentIdForVoice(apiKey, mappedAgentId ? undefined : voiceId) ?? undefined if (!agentId) { + console.error('[Voice][Token] Failed to resolve/create agent ID', { requestId }) return c.json({ allowed: false, error: 'Failed to create ElevenLabs agent automatically' @@ -149,6 +205,15 @@ export function createVoiceRoutes(): Hono { } try { + console.log('[Voice][Token] Requesting ElevenLabs conversation token', { + requestId, + agentId, + voiceId, + hasCustomAgentId: Boolean(customAgentId), + hasMappedAgentId: Boolean(mappedAgentId), + hasCustomApiKey: Boolean(customApiKey) + }) + // Fetch conversation token from ElevenLabs const response = await fetch( `https://api.elevenlabs.io/v1/convai/conversation/token?agent_id=${encodeURIComponent(agentId)}`, @@ -164,7 +229,12 @@ export function createVoiceRoutes(): Hono { if (!response.ok) { const errorData = await response.json().catch(() => ({})) as { detail?: { message?: string }; error?: string } const errorMessage = errorData.detail?.message || errorData.error || `ElevenLabs API error: ${response.status}` - console.error('[Voice] Failed to get token from ElevenLabs:', errorMessage) + console.error('[Voice][Token] Failed to get token from ElevenLabs', { + requestId, + agentId, + status: response.status, + errorMessage + }) return c.json({ allowed: false, error: errorMessage @@ -173,19 +243,29 @@ export function createVoiceRoutes(): Hono { const data = await response.json() as { token?: string } if (!data.token) { + console.error('[Voice][Token] Token response 
missing token field', { + requestId, + agentId + }) return c.json({ allowed: false, error: 'No token in ElevenLabs response' }, 500) } + console.log('[Voice][Token] Token issued successfully', { requestId, agentId }) + return c.json({ allowed: true, token: data.token, agentId }) } catch (error) { - console.error('[Voice] Error fetching token:', error) + console.error('[Voice][Token] Error fetching token', { + requestId, + agentId, + error: error instanceof Error ? error.message : String(error) + }) return c.json({ allowed: false, error: error instanceof Error ? error.message : 'Network error' @@ -193,5 +273,84 @@ export function createVoiceRoutes(): Hono { } }) + // Get available ElevenLabs voices (includes user's voice clones) + app.get('/voice/voices', async (c) => { + const requestId = crypto.randomUUID() + const apiKey = process.env.ELEVENLABS_API_KEY + if (!apiKey) { + console.warn('[Voice][Voices] Missing API key, returning empty voices list', { requestId }) + return c.json({ voices: [] }) + } + + try { + const response = await fetch(`${ELEVENLABS_API_BASE}/voices`, { + headers: { + 'xi-api-key': apiKey, + 'Accept': 'application/json' + } + }) + + if (!response.ok) { + console.error('[Voice][Voices] ElevenLabs voices request failed', { + requestId, + status: response.status + }) + return c.json({ voices: [] }) + } + + const data = await response.json() as { + voices?: Array<{ + voice_id: string + name: string + preview_url: string + category: string + }> + } + + const voices = (data.voices ?? []).map(v => ({ + id: v.voice_id, + name: v.name, + previewUrl: v.preview_url, + category: v.category + })) + + console.log('[Voice][Voices] Voices fetched', { + requestId, + count: voices.length + }) + + return c.json({ voices }) + } catch (error) { + console.error('[Voice][Voices] Unexpected error fetching voices', { + requestId, + error: error instanceof Error ? 
error.message : String(error) + }) + return c.json({ voices: [] }) + } + }) + + app.post('/voice/telemetry', async (c) => { + const requestId = crypto.randomUUID() + const json = await c.req.json().catch(() => null) + const parsed = telemetryEventSchema.safeParse(json ?? {}) + if (!parsed.success) { + console.warn('[Voice][Telemetry] Invalid payload', { requestId }) + return c.json({ ok: false, error: 'Invalid telemetry payload' }, 400) + } + + const { stage, message, sessionId, voiceId, language, details } = parsed.data + console.log('[Voice][Telemetry]', { + requestId, + stage, + message, + sessionId, + voiceId, + language, + details + }) + + return c.json({ ok: true }) + }) + return app } diff --git a/shared/src/voice.ts b/shared/src/voice.ts index 6751f0eba4..2843d84eb4 100644 --- a/shared/src/voice.ts +++ b/shared/src/voice.ts @@ -208,6 +208,9 @@ export interface VoiceAgentConfig { language?: boolean first_message?: boolean } + tts?: { + voice_id?: boolean + } } } } @@ -249,6 +252,9 @@ export function buildVoiceAgentConfig(): VoiceAgentConfig { conversation_config_override: { agent: { language: true + }, + tts: { + voice_id: true } } } diff --git a/web/src/api/client.ts b/web/src/api/client.ts index 52c25619b0..f8bd941abd 100644 --- a/web/src/api/client.ts +++ b/web/src/api/client.ts @@ -525,7 +525,7 @@ export class ApiClient { }) } - async fetchVoiceToken(options?: { customAgentId?: string; customApiKey?: string }): Promise<{ + async fetchVoiceToken(options?: { customAgentId?: string; customApiKey?: string; voiceId?: string }): Promise<{ allowed: boolean token?: string agentId?: string @@ -536,4 +536,22 @@ export class ApiClient { body: JSON.stringify(options || {}) }) } + + async fetchVoices(): Promise<{ voices: Array<{ id: string; name: string; previewUrl: string; category: string }> }> { + return await this.request('/api/voice/voices') + } + + async sendVoiceTelemetry(event: { + stage: string + message: string + sessionId?: string + voiceId?: string + 
language?: string + details?: Record + }): Promise { + await this.request('/api/voice/telemetry', { + method: 'POST', + body: JSON.stringify(event) + }) + } } diff --git a/web/src/api/voice.ts b/web/src/api/voice.ts index 66cee443f1..c5e8f27a4d 100644 --- a/web/src/api/voice.ts +++ b/web/src/api/voice.ts @@ -26,6 +26,7 @@ export interface VoiceTokenResponse { export interface VoiceTokenRequest { customAgentId?: string customApiKey?: string + voiceId?: string } /** @@ -50,6 +51,22 @@ export async function fetchVoiceToken( } } +export interface VoiceInfo { + id: string + name: string + previewUrl: string + category: string +} + +export async function fetchVoices(api: ApiClient): Promise { + try { + const result = await api.fetchVoices() + return result.voices + } catch { + return [] + } +} + export interface ElevenLabsAgent { agent_id: string name: string diff --git a/web/src/lib/locales/en.ts b/web/src/lib/locales/en.ts index 905cda7c2a..3615adcc75 100644 --- a/web/src/lib/locales/en.ts +++ b/web/src/lib/locales/en.ts @@ -403,6 +403,8 @@ export default { 'settings.voice.title': 'Voice Assistant', 'settings.voice.language': 'Voice Language', 'settings.voice.autoDetect': 'Auto-detect', + 'settings.voice.voice': 'Voice', + 'settings.voice.voiceDefault': 'Default', 'settings.about.title': 'About', 'settings.about.website': 'Website', 'settings.about.appVersion': 'App Version', diff --git a/web/src/lib/locales/zh-CN.ts b/web/src/lib/locales/zh-CN.ts index cf5c7d73a8..50ae69eeab 100644 --- a/web/src/lib/locales/zh-CN.ts +++ b/web/src/lib/locales/zh-CN.ts @@ -405,6 +405,8 @@ export default { 'settings.voice.title': '语音助手', 'settings.voice.language': '语音语言', 'settings.voice.autoDetect': '自动检测', + 'settings.voice.voice': '声音', + 'settings.voice.voiceDefault': '默认', 'settings.about.title': '关于', 'settings.about.website': '官方网站', 'settings.about.appVersion': '应用版本', diff --git a/web/src/lib/voice-context.tsx b/web/src/lib/voice-context.tsx index 71424a4362..cebc030beb 100644 
--- a/web/src/lib/voice-context.tsx +++ b/web/src/lib/voice-context.tsx @@ -40,11 +40,12 @@ export function VoiceProvider({ children }: { children: ReactNode }) { setCurrentSessionId(sessionId) const initialContext = voiceHooks.onVoiceStarted(sessionId) - // Read voice language preference from localStorage + // Read voice preferences from localStorage const voiceLang = localStorage.getItem('hapi-voice-lang') const elevenLabsLang = getElevenLabsCodeFromPreference(voiceLang) + const voiceId = localStorage.getItem('hapi-voice-id') ?? undefined - await startRealtimeSession(sessionId, initialContext, elevenLabsLang) + await startRealtimeSession(sessionId, initialContext, elevenLabsLang, voiceId) }, []) const stopVoice = useCallback(async () => { diff --git a/web/src/lib/voices.test.ts b/web/src/lib/voices.test.ts new file mode 100644 index 0000000000..d6d70888e5 --- /dev/null +++ b/web/src/lib/voices.test.ts @@ -0,0 +1,14 @@ +import { describe, it, expect } from 'vitest' +import { getFallbackVoices } from './voices' + +describe('getFallbackVoices', () => { + it('returns localized Chinese aliases for zh-CN fallback list', () => { + const voices = getFallbackVoices('zh-CN') + expect(voices.some(v => /杰西卡|瑞秋|贝拉|乔什|亚当/.test(v.name))).toBe(true) + }) + + it('keeps canonical English names for en fallback list', () => { + const voices = getFallbackVoices('en') + expect(voices.some(v => v.name === 'Jessica')).toBe(true) + }) +}) diff --git a/web/src/lib/voices.ts b/web/src/lib/voices.ts new file mode 100644 index 0000000000..6092cea23b --- /dev/null +++ b/web/src/lib/voices.ts @@ -0,0 +1,65 @@ +import type { Locale } from '@/lib/use-translation' + +export interface Voice { + id: string + name: string + gender: 'female' | 'male' + description: string + aliases?: Partial> +} + +export const VOICES: Voice[] = [ + { + id: 'cgSgspJ2msm6clMCkdW9', + name: 'Jessica', + aliases: { 'zh-CN': '杰西卡' }, + gender: 'female', + description: 'Default — warm, conversational', + }, + { + id: 
'21m00Tcm4TlvDq8ikWAM', + name: 'Rachel', + aliases: { 'zh-CN': '瑞秋' }, + gender: 'female', + description: 'Calm, professional', + }, + { + id: 'EXAVITQu4vr4xnSDxMaL', + name: 'Bella', + aliases: { 'zh-CN': '贝拉' }, + gender: 'female', + description: 'Soft, warm', + }, + { + id: 'TxGEqnHWrfWFTfGW9XjX', + name: 'Josh', + aliases: { 'zh-CN': '乔什' }, + gender: 'male', + description: 'Deep, smooth', + }, + { + id: 'pNInz6obpgDQGcFmaJgB', + name: 'Adam', + aliases: { 'zh-CN': '亚当' }, + gender: 'male', + description: 'Narration, clear', + }, + { id: 'AZnzlk1XvdvUeBnXmlld', name: 'Domi', gender: 'female', description: 'Strong, confident' }, + { id: 'MF3mGyEYCl7XYWbV9V6O', name: 'Elli', gender: 'female', description: 'Young, clear' }, + { id: 'VR6AewLTigWG4xSOukaG', name: 'Arnold', gender: 'male', description: 'Crisp, authoritative' }, + { id: 'ErXwobaYiN019PkySvjV', name: 'Antoni', gender: 'male', description: 'Well-rounded' }, + { id: 'yoZ06aMxZJJ28mfd3POQ', name: 'Sam', gender: 'male', description: 'Raspy, dynamic' }, +] + +export const DEFAULT_VOICE_ID = 'cgSgspJ2msm6clMCkdW9' + +export function getVoiceById(id: string | null): Voice | undefined { + return VOICES.find(v => v.id === id) +} + +export function getFallbackVoices(locale: Locale): Voice[] { + return VOICES.map((voice) => ({ + ...voice, + name: voice.aliases?.[locale] ?? 
voice.name, + })) +} diff --git a/web/src/realtime/RealtimeSession.ts b/web/src/realtime/RealtimeSession.ts index 132304f86e..ee9883d45d 100644 --- a/web/src/realtime/RealtimeSession.ts +++ b/web/src/realtime/RealtimeSession.ts @@ -8,7 +8,8 @@ let currentSessionId: string | null = null export async function startRealtimeSession( sessionId: string, initialContext?: string, - language?: ElevenLabsLanguage + language?: ElevenLabsLanguage, + voiceId?: string ) { if (!voiceSession) { console.warn('[Voice] No voice session registered') @@ -20,7 +21,8 @@ export async function startRealtimeSession( await voiceSession.startSession({ sessionId, initialContext, - language + language, + voiceId }) voiceSessionStarted = true } catch (error) { diff --git a/web/src/realtime/RealtimeVoiceSession.tsx b/web/src/realtime/RealtimeVoiceSession.tsx index fff9b7b44b..a510d1eef5 100644 --- a/web/src/realtime/RealtimeVoiceSession.tsx +++ b/web/src/realtime/RealtimeVoiceSession.tsx @@ -6,6 +6,7 @@ import { fetchVoiceToken } from '@/api/voice' import type { VoiceSession, VoiceSessionConfig, ConversationStatus, StatusCallback } from './types' import type { ApiClient } from '@/api/client' import type { Session } from '@/types/api' +import { DEFAULT_VOICE_ID } from '@/lib/voices' // Debug logging const DEBUG = import.meta.env.DEV @@ -15,6 +16,28 @@ let conversationInstance: ReturnType | null = null // Store reference for status updates let statusCallback: StatusCallback | null = null +let telemetryApi: ApiClient | null = null +let activeVoiceContext: { + sessionId?: string + voiceId?: string + language?: string +} = {} + +async function emitVoiceTelemetry(event: { + stage: string + message: string + sessionId?: string + voiceId?: string + language?: string + details?: Record +}): Promise { + if (!telemetryApi) return + try { + await telemetryApi.sendVoiceTelemetry(event) + } catch { + // Telemetry must not break voice flows + } +} // Global voice session implementation class 
RealtimeVoiceSessionImpl implements VoiceSession { @@ -24,10 +47,41 @@ class RealtimeVoiceSessionImpl implements VoiceSession { this.api = api } + private async sendTelemetry(event: { + stage: string + message: string + sessionId?: string + voiceId?: string + language?: string + details?: Record + }): Promise { + await emitVoiceTelemetry(event) + } + async startSession(config: VoiceSessionConfig): Promise { + activeVoiceContext = { + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + } + await this.sendTelemetry({ + stage: 'start-session', + message: 'Voice start requested', + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + }) + if (!conversationInstance) { const error = new Error('Realtime voice session not initialized') console.warn('[Voice] Realtime voice session not initialized') + await this.sendTelemetry({ + stage: 'init-missing', + message: error.message, + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + }) statusCallback?.('error', 'Voice session not initialized') throw error } @@ -40,6 +94,13 @@ class RealtimeVoiceSessionImpl implements VoiceSession { permissionStream = await navigator.mediaDevices.getUserMedia({ audio: true }) } catch (error) { console.error('[Voice] Failed to get microphone permission:', error) + await this.sendTelemetry({ + stage: 'mic-permission-denied', + message: error instanceof Error ? 
error.message : String(error), + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + }) statusCallback?.('error', 'Microphone permission denied') throw error } finally { @@ -49,43 +110,96 @@ class RealtimeVoiceSessionImpl implements VoiceSession { // Fetch conversation token from server let tokenResponse: Awaited> try { - tokenResponse = await fetchVoiceToken(this.api) + tokenResponse = await fetchVoiceToken(this.api, { + voiceId: config.voiceId + }) } catch (error) { console.error('[Voice] Failed to fetch voice token:', error) + await this.sendTelemetry({ + stage: 'token-fetch-error', + message: error instanceof Error ? error.message : String(error), + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + }) statusCallback?.('error', 'Network error') throw error } if (!tokenResponse.allowed || !tokenResponse.token) { const error = new Error(tokenResponse.error ?? 'Voice not allowed or no token') console.error('[Voice] Voice not allowed or no token:', tokenResponse.error) + await this.sendTelemetry({ + stage: 'token-not-allowed', + message: tokenResponse.error ?? 'Voice not allowed', + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language, + details: { + agentId: tokenResponse.agentId + } + }) statusCallback?.('error', tokenResponse.error ?? 
'Voice not allowed') throw error } + const baseSessionConfig = { + conversationToken: tokenResponse.token, + connectionType: 'webrtc' as const, + dynamicVariables: { + sessionId: config.sessionId, + initialConversationContext: config.initialContext || '' + }, + // Language override — requires override permissions enabled on the agent + // See: https://elevenlabs.io/docs/agents-platform/customization/personalization/overrides + overrides: { + agent: { + language: config.language + } + } + } + + await this.sendTelemetry({ + stage: 'override-decision', + message: 'Skipping runtime voice override; using token-selected agent voice', + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language, + details: { + defaultVoiceId: DEFAULT_VOICE_ID, + selectedVoiceId: config.voiceId + } + }) + // Use conversation token from server (private agent flow) try { - const conversationId = await conversationInstance.startSession({ - conversationToken: tokenResponse.token, - connectionType: 'webrtc', - dynamicVariables: { - sessionId: config.sessionId, - initialConversationContext: config.initialContext || '' - }, - // Language override - requires agent to have platform_settings.overrides enabled - // See: https://elevenlabs.io/docs/agents-platform/customization/personalization/overrides - overrides: { - agent: { - language: config.language - } - } - }) + const conversationId = await conversationInstance.startSession(baseSessionConfig) if (DEBUG) { console.log('[Voice] Started conversation with ID:', conversationId) } + await this.sendTelemetry({ + stage: 'start-success', + message: 'Voice session started successfully', + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + }) } catch (error) { - console.error('[Voice] Failed to start realtime session:', error) - statusCallback?.('error', 'Failed to start voice session') + const errorMessage = error instanceof Error ? 
error.message : String(error) + console.error('[Voice] Failed to start realtime session:', { + error: errorMessage, + sessionId: config.sessionId, + language: config.language, + voiceId: config.voiceId + }) + await this.sendTelemetry({ + stage: 'start-failed', + message: errorMessage, + sessionId: config.sessionId, + voiceId: config.voiceId, + language: config.language + }) + statusCallback?.('error', `Failed to start voice session: ${errorMessage}`) throw error } } @@ -180,12 +294,30 @@ export function RealtimeVoiceSession({ const handleDisconnect = useCallback(() => { if (DEBUG) console.log('[Voice] Realtime session disconnected') resetRealtimeSessionState() + void emitVoiceTelemetry({ + stage: 'disconnect', + message: 'Realtime voice session disconnected', + ...activeVoiceContext + }) onStatusChange?.('disconnected') }, [onStatusChange]) const handleError = useCallback((error: unknown) => { if (DEBUG) console.error('[Voice] Realtime error:', error) - const errorMessage = error instanceof Error ? error.message : 'Connection error' + const errorMessage = error instanceof Error + ? error.message + : (() => { + try { + return JSON.stringify(error) + } catch { + return String(error ?? 
'Connection error') + } + })() + void emitVoiceTelemetry({ + stage: 'runtime-error', + message: errorMessage, + ...activeVoiceContext + }) onStatusChange?.('error', errorMessage) }, [onStatusChange]) @@ -223,6 +355,7 @@ export function RealtimeVoiceSession({ }) useEffect(() => { + telemetryApi = api // Store the conversation instance globally conversationInstance = conversation @@ -239,6 +372,7 @@ export function RealtimeVoiceSession({ return () => { // Clean up on unmount conversationInstance = null + telemetryApi = null } }, [conversation, api]) diff --git a/web/src/realtime/types.ts b/web/src/realtime/types.ts index 47753c5286..32b4b75dae 100644 --- a/web/src/realtime/types.ts +++ b/web/src/realtime/types.ts @@ -4,6 +4,7 @@ export interface VoiceSessionConfig { sessionId: string initialContext?: string language?: ElevenLabsLanguage + voiceId?: string } export interface VoiceSession { diff --git a/web/src/routes/settings/index.test.tsx b/web/src/routes/settings/index.test.tsx index daf059a4a5..cb0cee342c 100644 --- a/web/src/routes/settings/index.test.tsx +++ b/web/src/routes/settings/index.test.tsx @@ -1,5 +1,5 @@ -import { describe, it, expect, vi, beforeEach } from 'vitest' -import { render, screen } from '@testing-library/react' +import { describe, it, expect, vi, beforeEach, afterEach } from 'vitest' +import { render, screen, fireEvent, waitFor, act, cleanup } from '@testing-library/react' import { I18nContext, I18nProvider } from '@/lib/i18n-context' import { en } from '@/lib/locales' import { PROTOCOL_VERSION } from '@hapi/protocol' @@ -95,6 +95,40 @@ vi.mock('@/lib/languages', () => ({ getLanguageDisplayName: (lang: { code: string | null; name: string }) => lang.name, })) +// Use vi.hoisted so these mocks are available when vi.mock factories run +const { mockFetchVoices, mockApi } = vi.hoisted(() => { + const mockFetchVoices = vi.fn(() => Promise.resolve([])) + const mockApi = { + fetchVoices: vi.fn(() => Promise.resolve({ voices: [] })), + } + return { 
mockFetchVoices, mockApi } +}) + +// Mock static voices list +vi.mock('@/lib/voices', () => ({ + VOICES: [{ id: 'voice1', name: 'Jessica', gender: 'female', description: 'Default' }], + DEFAULT_VOICE_ID: 'voice1', + getVoiceById: (id: string | null) => + id === 'voice1' ? { id: 'voice1', name: 'Jessica', gender: 'female', description: 'Default' } : undefined, + getFallbackVoices: () => [{ id: 'voice1', name: 'Jessica', gender: 'female', description: 'Default' }], +})) + +// Mock fetchVoices to return a resolved list by default +vi.mock('@/api/voice', () => ({ + fetchVoices: mockFetchVoices, + fetchVoiceToken: vi.fn(() => Promise.resolve({ allowed: true, token: 'tok' })), +})) + +// Mock useAppContext so the page doesn't throw "AppContext is not available" +vi.mock('@/lib/app-context', () => ({ + useAppContext: () => ({ api: mockApi, token: 'test', baseUrl: '' }), + AppContextProvider: ({ children }: { children: React.ReactNode }) => children, +})) + + +afterEach(() => { + cleanup() +}) function renderWithProviders(ui: React.ReactElement) { return render( @@ -117,9 +151,11 @@ function renderWithSpyT(ui: React.ReactElement) { describe('SettingsPage', () => { beforeEach(() => { vi.clearAllMocks() + // Reset fetchVoices mock to return empty list by default + mockFetchVoices.mockResolvedValue([]) // Mock localStorage const localStorageMock = { - getItem: vi.fn(() => 'en'), + getItem: vi.fn(() => null), setItem: vi.fn(), removeItem: vi.fn(), clear: vi.fn(), @@ -231,4 +267,100 @@ describe('SettingsPage', () => { expect(calledKeys).toContain('settings.chat.userMessageBackground') expect(calledKeys).toContain('settings.chat.surfaceColor.default') }) + + // Voice picker tests + it('renders the Voice section with "Voice" label', () => { + renderWithProviders() + expect(screen.getAllByText('Voice').length).toBeGreaterThanOrEqual(1) + }) + + it('uses correct i18n keys for the voice picker', () => { + const spyT = renderWithSpyT() + const calledKeys = spyT.mock.calls.map((call) 
=> call[0]) + expect(calledKeys).toContain('settings.voice.voice') + expect(calledKeys).toContain('settings.voice.voiceDefault') + }) + + it('voice picker shows "Default" option when opened', () => { + renderWithProviders() + // The current value "Default" is shown in the closed picker button + expect(screen.getAllByText('Default').length).toBeGreaterThanOrEqual(1) + }) + + it('opens voice picker and shows "Default" option in the list', () => { + renderWithProviders() + // Click the voice picker button (aria-label target via the label text) + const voiceButtons = screen.getAllByRole('button', { name: /Default/i }) + // Find the button that has aria-haspopup — that's the voice picker trigger + const pickerButton = voiceButtons.find(btn => btn.getAttribute('aria-haspopup') === 'listbox') + expect(pickerButton).toBeTruthy() + fireEvent.click(pickerButton!) + // The listbox should appear with a "Default" option inside + const listbox = screen.getByRole('listbox', { name: 'Voice' }) + expect(listbox).toBeInTheDocument() + expect(listbox.textContent).toContain('Default') + }) + + it('shows dynamic voices in picker when fetchVoices returns a list', async () => { + mockFetchVoices.mockResolvedValue([ + { id: 'dyn1', name: 'Alice', previewUrl: '', category: 'premade' }, + { id: 'dyn2', name: 'Bob', previewUrl: 'https://example.com/bob.mp3', category: 'premade' }, + ]) + + renderWithProviders() + + const pickerButton = screen.getByRole('button', { name: /Voice\s*Default/i }) + fireEvent.click(pickerButton) + + await waitFor(() => { + expect(screen.getByText('Alice')).toBeInTheDocument() + expect(screen.getByText('Bob')).toBeInTheDocument() + }) + }) + + + it('shows a disabled preview button with tooltip when previewUrl is missing', async () => { + mockFetchVoices.mockResolvedValue([ + { id: 'dyn1', name: 'Alice', previewUrl: '', category: 'premade' }, + ]) + + renderWithProviders() + + const pickerButton = screen.getByRole('button', { name: /Voice\s*Default/i }) + 
fireEvent.click(pickerButton) + + const previewButton = await screen.findByLabelText('Preview voice') + expect(previewButton).toBeDisabled() + expect(previewButton).toHaveAttribute('title', 'Preview unavailable without an ElevenLabs API key') + }) + + it('shows a play button for voices with a previewUrl', async () => { + mockFetchVoices.mockResolvedValue([ + { id: 'dyn1', name: 'Alice', previewUrl: 'https://example.com/alice.mp3', category: 'premade' }, + ]) + + renderWithProviders() + + const pickerButton = screen.getByRole('button', { name: /Voice\s*Default/i }) + fireEvent.click(pickerButton) + + await screen.findByText('Alice') + expect(screen.getByLabelText('Preview voice')).toBeInTheDocument() + expect(screen.getByLabelText('Preview voice')).not.toBeDisabled() + }) + + it('selecting a voice calls localStorage.setItem with the voice id', async () => { + mockFetchVoices.mockResolvedValue([ + { id: 'dyn1', name: 'Alice', previewUrl: '', category: 'premade' }, + ]) + + renderWithProviders() + + const pickerButton = screen.getByRole('button', { name: /Voice\s*Default/i }) + fireEvent.click(pickerButton) + + const alice = await screen.findByText('Alice') + fireEvent.click(alice) + expect(window.localStorage.setItem).toHaveBeenCalledWith('hapi-voice-id', 'dyn1') + }) }) diff --git a/web/src/routes/settings/index.tsx b/web/src/routes/settings/index.tsx index 78dc263d88..83ae3ae45f 100644 --- a/web/src/routes/settings/index.tsx +++ b/web/src/routes/settings/index.tsx @@ -2,6 +2,9 @@ import { useState, useRef, useEffect } from 'react' import { useTranslation, type Locale } from '@/lib/use-translation' import { useAppGoBack } from '@/hooks/useAppGoBack' import { getElevenLabsSupportedLanguages, getLanguageDisplayName, type Language } from '@/lib/languages' +import { VOICES, getFallbackVoices } from '@/lib/voices' +import { useAppContext } from '@/lib/app-context' +import { fetchVoices, type VoiceInfo } from '@/api/voice' import { getFontScaleOptions, useFontScale, type 
FontScale } from '@/hooks/useFontScale' import { getTerminalFontSizeOptions, useTerminalFontSize, type TerminalFontSize } from '@/hooks/useTerminalFontSize' import { getComposerEnterBehaviorOptions, useComposerEnterBehavior, type ComposerEnterBehavior } from '@/hooks/useComposerEnterBehavior' @@ -88,6 +91,36 @@ function ChevronDownIcon(props: { className?: string }) { ) } +function PlayIcon(props: { className?: string }) { + return ( + + + + ) +} + +function StopIcon(props: { className?: string }) { + return ( + + + + ) +} + function MinusIcon(props: { className?: string }) { return ( (null) const appearanceContainerRef = useRef(null) const fontContainerRef = useRef(null) @@ -281,6 +316,7 @@ export default function SettingsPage() { const chatContainerRef = useRef(null) const terminalToolDisplayContainerRef = useRef(null) const voiceContainerRef = useRef(null) + const voicePickerContainerRef = useRef(null) const { fontScale, setFontScale } = useFontScale() const { terminalFontSize, setTerminalFontSize } = useTerminalFontSize() const { sessionPreviewLimit, setSessionPreviewLimit } = useSessionPreviewLimit() @@ -299,6 +335,16 @@ export default function SettingsPage() { return localStorage.getItem('hapi-voice-lang') }) + // Voice ID state - read from localStorage + const [voiceId, setVoiceId] = useState(() => { + return localStorage.getItem('hapi-voice-id') + }) + + // Dynamic voice list fetched from hub (includes user's cloned voices) + const [dynamicVoices, setDynamicVoices] = useState(null) + const [playingVoiceId, setPlayingVoiceId] = useState(null) + const currentAudioRef = useRef(null) + const fontScaleOptions = getFontScaleOptions() const terminalFontSizeOptions = getTerminalFontSizeOptions() const composerEnterBehaviorOptions = getComposerEnterBehaviorOptions() @@ -312,6 +358,16 @@ export default function SettingsPage() { const currentTerminalToolDisplayModeLabel = terminalToolDisplayModeOptions.find((opt) => opt.value === terminalToolDisplayMode)?.labelKey ?? 
'settings.chat.terminalToolDisplay.compact' const currentVoiceLanguage = voiceLanguages.find((lang) => lang.code === voiceLanguage) + // Voice list: dynamic (from ElevenLabs API, includes clones) or static fallback + const fallbackVoices = getFallbackVoices(locale) + const voiceOptions: VoiceInfo[] = dynamicVoices && dynamicVoices.length > 0 + ? dynamicVoices + : fallbackVoices.map(v => ({ id: v.id, name: v.name, previewUrl: '', category: 'premade' })) + + const currentVoiceName = voiceId + ? (voiceOptions.find(v => v.id === voiceId)?.name ?? fallbackVoices.find(v => v.id === voiceId)?.name ?? voiceId) + : null + const handleLocaleChange = (newLocale: Locale) => { setLocale(newLocale) setIsOpen(false) @@ -352,9 +408,48 @@ export default function SettingsPage() { setIsVoiceOpen(false) } + const handleVoiceChange = (id: string | null) => { + setVoiceId(id) + if (id === null) { + localStorage.removeItem('hapi-voice-id') + } else { + localStorage.setItem('hapi-voice-id', id) + } + setIsVoicePickerOpen(false) + } + + // Fetch available voices from hub on mount + useEffect(() => { + fetchVoices(api).then(voices => { + if (voices.length > 0) setDynamicVoices(voices) + }) + }, [api]) + + const handleVoicePreview = (previewUrl: string, voiceId: string, event: React.MouseEvent) => { + event.stopPropagation() + if (!previewUrl) return + + if (playingVoiceId === voiceId) { + currentAudioRef.current?.pause() + currentAudioRef.current = null + setPlayingVoiceId(null) + return + } + + currentAudioRef.current?.pause() + const audio = new Audio(previewUrl) + currentAudioRef.current = audio + setPlayingVoiceId(voiceId) + audio.play().catch(() => setPlayingVoiceId(null)) + audio.addEventListener('ended', () => { + setPlayingVoiceId(null) + currentAudioRef.current = null + }) + } + // Close dropdown when clicking outside useEffect(() => { - if (!isOpen && !isAppearanceOpen && !isFontOpen && !isTerminalFontOpen && !isChatOpen && !isTerminalToolDisplayOpen && !isVoiceOpen) return + if 
(!isOpen && !isAppearanceOpen && !isFontOpen && !isTerminalFontOpen && !isChatOpen && !isTerminalToolDisplayOpen && !isVoiceOpen && !isVoicePickerOpen) return const handleClickOutside = (event: MouseEvent) => { if (isOpen && containerRef.current && !containerRef.current.contains(event.target as Node)) { @@ -378,15 +473,18 @@ export default function SettingsPage() { if (isVoiceOpen && voiceContainerRef.current && !voiceContainerRef.current.contains(event.target as Node)) { setIsVoiceOpen(false) } + if (isVoicePickerOpen && voicePickerContainerRef.current && !voicePickerContainerRef.current.contains(event.target as Node)) { + setIsVoicePickerOpen(false) + } } document.addEventListener('mousedown', handleClickOutside) return () => document.removeEventListener('mousedown', handleClickOutside) - }, [isOpen, isAppearanceOpen, isFontOpen, isTerminalFontOpen, isChatOpen, isTerminalToolDisplayOpen, isVoiceOpen]) + }, [isOpen, isAppearanceOpen, isFontOpen, isTerminalFontOpen, isChatOpen, isTerminalToolDisplayOpen, isVoiceOpen, isVoicePickerOpen]) // Close on escape key useEffect(() => { - if (!isOpen && !isAppearanceOpen && !isFontOpen && !isTerminalFontOpen && !isChatOpen && !isTerminalToolDisplayOpen && !isVoiceOpen) return + if (!isOpen && !isAppearanceOpen && !isFontOpen && !isTerminalFontOpen && !isChatOpen && !isTerminalToolDisplayOpen && !isVoiceOpen && !isVoicePickerOpen) return const handleEscape = (event: KeyboardEvent) => { if (event.key === 'Escape') { @@ -397,12 +495,13 @@ export default function SettingsPage() { setIsChatOpen(false) setIsTerminalToolDisplayOpen(false) setIsVoiceOpen(false) + setIsVoicePickerOpen(false) } } document.addEventListener('keydown', handleEscape) return () => document.removeEventListener('keydown', handleEscape) - }, [isOpen, isAppearanceOpen, isFontOpen, isTerminalFontOpen, isChatOpen, isTerminalToolDisplayOpen, isVoiceOpen]) + }, [isOpen, isAppearanceOpen, isFontOpen, isTerminalFontOpen, isChatOpen, isTerminalToolDisplayOpen, 
isVoiceOpen, isVoicePickerOpen]) return (
@@ -813,6 +912,93 @@ export default function SettingsPage() {
)} + +
+ + + {isVoicePickerOpen && ( +
+
+ +
+ {voiceOptions.map((voice) => { + const isSelected = voiceId === voice.id + const isPlaying = playingVoiceId === voice.id + return ( +
+ + +
+ ) + })} +
+ )} +
{/* About section */}