diff --git a/.changeset/realtime-speech-channel.md b/.changeset/realtime-speech-channel.md new file mode 100644 index 000000000..d3be76d72 --- /dev/null +++ b/.changeset/realtime-speech-channel.md @@ -0,0 +1,5 @@ +--- +"eve": patch +--- + +Add a Vercel realtime speech channel (exported at `eve/channels/vercel/speech`) plus a React voice hook. The channel mints AI Gateway realtime client secrets so the browser can hold the audio socket, while finalized transcripts run as ordinary durable turns through the existing `/eve/v1/session` routes and event stream — no Eve request blocks for a full model turn, and spoken replies are read back on `message.completed`. `eve/react/voice` provides the browser hook (`useEveVoice`); non-React clients use `setupVoice` plus `client.session()`. diff --git a/apps/docs/lib/integrations/data.ts b/apps/docs/lib/integrations/data.ts index ac8e86ac6..7c7cf0656 100644 --- a/apps/docs/lib/integrations/data.ts +++ b/apps/docs/lib/integrations/data.ts @@ -299,6 +299,43 @@ export default eveChannel(); Point your frontend at the session routes eve serves (\`/eve/v1/session\`) and stream responses with the eve web client.`, configure: `The eve channel is the lowest-friction way to talk to your agent, with no third-party provisioning required. Layer in auth and route protection as needed. See the [eve channel docs](/docs/channels/eve) and the [Frontend guide](/docs/guides/frontend/overview).`, }, + "realtime-speech": { + logo: "voice", + docsHref: "/docs/channels/realtime-speech", + keywords: ["voice", "audio", "realtime", "microphone", "ai gateway"], + install: `Install the framework and the AI SDK React realtime peer: + +\`\`\`bash +npm install eve@latest @ai-sdk/react +\`\`\``, + quickStart: `Create \`agent/channels/speech.ts\`: + +\`\`\`ts +// agent/channels/speech.ts +import { vercelSpeechChannel } from "eve/channels/vercel/speech"; +import { localDev, vercelOidc } from "eve/channels/auth"; + +export default vercelSpeechChannel({ + auth: [localDev(), vercelOidc()], +}); +\`\`\` + +Then render a microphone wherever it fits your UI: + +\`\`\`tsx +"use client"; + +import { useEveVoice } from "eve/react/voice"; + +export function ComposerActions() { + const voice = useEveVoice(); + const active = voice.status === "connected" || voice.status === "connecting"; + + return ; +} +\`\`\``, + configure: `Set \`AI_GATEWAY_API_KEY\` so the setup route can mint short-lived AI Gateway realtime client secrets. The browser keeps the realtime audio socket open, while each finalized utterance runs as an ordinary durable turn through the existing \`/eve/v1/session\` routes and event stream. The voice session id is a client-visible correlation id only; principal binding comes from normal session-route auth.`, + }, }; /** diff --git a/apps/docs/lib/integrations/logos.tsx b/apps/docs/lib/integrations/logos.tsx index 69e15f7ee..787ae97c8 100644 --- a/apps/docs/lib/integrations/logos.tsx +++ b/apps/docs/lib/integrations/logos.tsx @@ -21,6 +21,25 @@ export const webLogo = (props: LogoProps) => ( ); +export const voiceLogo = (props: LogoProps) => ( + + + + +); + export const githubLogo = (props: LogoProps) => ( ( export const logos = { eve: eveLogo, web: webLogo, + voice: voiceLogo, github: githubLogo, slack: slackLogo, discord: discordLogo, diff --git a/apps/frameworks/next/agent/channel-auth.ts b/apps/frameworks/next/agent/channel-auth.ts new file mode 100644 index 000000000..4c1a65ba2 --- /dev/null +++ b/apps/frameworks/next/agent/channel-auth.ts @@ -0,0 +1,32 @@ +import { type AuthFn, localDev, vercelOidc } from "eve/channels/auth"; +import { getAuthJsSession } from "@/lib/auth"; + +function authjsSession(): AuthFn { + return async (request) => { + const session = await getAuthJsSession(request); + if (!session) return null; + + const attributes: Record = { + providerId: session.providerId, + }; + if (session.profile.email) { + attributes.email = session.profile.email; + } + if (session.profile.name) { + attributes.name = session.profile.name; + } + if (session.profile.image) { + attributes.image = session.profile.image; + } + return { + attributes, + authenticator: "authjs", + issuer: session.issuer, + principalId: session.profile.sub, + principalType: "user", + subject: session.profile.sub, + }; + }; +} + +export const agentChannelAuth = [authjsSession(), localDev(), vercelOidc()] as const; diff --git a/apps/frameworks/next/agent/channels/eve.ts b/apps/frameworks/next/agent/channels/eve.ts index 4a3b240b5..3d72f0b55 100644 --- a/apps/frameworks/next/agent/channels/eve.ts +++ b/apps/frameworks/next/agent/channels/eve.ts @@ -1,35 +1,6 @@ -import { type AuthFn, localDev, vercelOidc } from "eve/channels/auth"; import { eveChannel } from "eve/channels/eve"; -import { getAuthJsSession } from "@/lib/auth"; - -function authjsSession(): AuthFn { - return async (request) => { - const session = await getAuthJsSession(request); - if (!session) return null; - - const attributes: Record = { - providerId: session.providerId, - }; - if (session.profile.email) { - attributes.email = session.profile.email; - } - if (session.profile.name) { - attributes.name = session.profile.name; - } - if (session.profile.image) { - attributes.image = session.profile.image; - } - return { - attributes, - authenticator: "authjs", - issuer: session.issuer, - principalId: session.profile.sub, - principalType: "user", - subject: session.profile.sub, - }; - }; -} +import { agentChannelAuth } from "../channel-auth"; export default eveChannel({ - auth: [authjsSession(), localDev(), vercelOidc()], + auth: agentChannelAuth, }); diff --git a/apps/frameworks/next/agent/channels/realtime-speech.ts b/apps/frameworks/next/agent/channels/realtime-speech.ts new file mode 100644 index 000000000..05cc1bc97 --- /dev/null +++ b/apps/frameworks/next/agent/channels/realtime-speech.ts @@ -0,0 +1,45 @@ +import { createGateway } from "@ai-sdk/gateway"; +import { + vercelSpeechChannel, + type VercelSpeechChannelInput, + type VercelSpeechGetTokenInput, +} from "eve/channels/vercel/speech"; +import { agentChannelAuth } from "../channel-auth"; + +const gatewayBaseUrl = + process.env.AI_GATEWAY_BASE_URL?.trim() || process.env.AI_GATEWAY_BASEURL?.trim(); +const gatewayBypass = + process.env.VERCEL_AUTOMATION_BYPASS_SECRET?.trim() || process.env.VERCEL_DPBP?.trim(); + +const gateway = + gatewayBaseUrl !== undefined && gatewayBaseUrl.length > 0 + ? createGateway({ + baseURL: gatewayBaseUrl, + ...(gatewayBypass !== undefined && gatewayBypass.length > 0 + ? { headers: { "x-vercel-protection-bypass": gatewayBypass } } + : {}), + }) + : undefined; + +function withGatewayBypass(url: string): string { + if (gatewayBypass === undefined || gatewayBypass.length === 0) return url; + const parsed = new URL(url); + parsed.searchParams.set("x-vercel-protection-bypass", gatewayBypass); + return parsed.toString(); +} + +export default vercelSpeechChannel({ + auth: agentChannelAuth, + control: + process.env.EVE_REALTIME_CONTROL === "1" || process.env.NEXT_PUBLIC_EVE_VOICE_CONTROL === "1", + expiresAfterSeconds: 300, + ...(gateway === undefined + ? {} + : { + async getToken(input: VercelSpeechGetTokenInput) { + const token = await gateway.experimental_realtime.getToken(input); + return { ...token, url: withGatewayBypass(token.url) }; + }, + }), + model: process.env.EVE_REALTIME_MODEL?.trim() || "openai/gpt-realtime-2", +} satisfies VercelSpeechChannelInput); diff --git a/apps/frameworks/next/app/_chat/App.tsx b/apps/frameworks/next/app/_chat/App.tsx index fee46a885..5f2f530e3 100644 --- a/apps/frameworks/next/app/_chat/App.tsx +++ b/apps/frameworks/next/app/_chat/App.tsx @@ -1,12 +1,26 @@ "use client"; import { useEveAgent } from "eve/react"; +import { useEveVoice, type EveVoiceMessage } from "eve/react/voice"; import { type FormEvent, type JSX, useEffect, useMemo, useRef, useState } from "react"; -import { traceReducer } from "./trace-reducer"; +import { traceReducer, type TraceProjection } from "./trace-reducer"; import { resolveTurnFailureMessage, shouldRenderAssistantTurn } from "./turn-content"; import type { TraceTurn } from "./types"; +// Demo session tracking sets are unbounded otherwise; cap them so a long +// voice session does not accumulate turn ids and spoken-message keys forever. +const MAX_TRACKED_VOICE_TURNS = 256; + +function rememberBounded(seen: Set, value: string, max: number): void { + seen.add(value); + while (seen.size > max) { + const oldest = seen.values().next().value; + if (oldest === undefined) break; + seen.delete(oldest); + } +} + function ConversationSection(props: { readonly isSending: boolean; readonly turns: readonly TraceTurn[]; @@ -53,24 +67,115 @@ function ConversationSection(props: { ); } +// In Gateway-control mode the durable turn runs server-side (its own `voice:` +// continuation), so the typed-chat feed (`agent.data.turns`) never sees it. The +// hook surfaces the live transcript (user finalized speech + the agent's +// streamed spoken words) as `voice.messages`; this feed renders it, with a +// Thinking… row in the gap while Eve runs the turn. +function VoiceConversationSection(props: { + readonly messages: readonly EveVoiceMessage[]; + readonly thinking: boolean; +}) { + return ( + + ); +} + export function App() { const [composerInput, setComposerInput] = useState(""); const [composerError, setComposerError] = useState(undefined); + const [voiceCaption, setVoiceCaption] = useState(undefined); + const controlMode = process.env.NEXT_PUBLIC_EVE_VOICE_CONTROL === "1"; const conversationStageRef = useRef(null); + const agentRef = useRef> | undefined>(undefined); + const pendingVoiceMessagesRef = useRef([]); const reducer = useMemo(() => traceReducer(), []); + const spokenVoiceMessageKeysRef = useRef(new Set()); + const voiceRef = useRef, "speak"> | undefined>(undefined); + const voiceTurnIdsRef = useRef(new Set()); const agent = useEveAgent({ + onEvent(event) { + if (event.type === "message.received") { + // Correlate by matching the transcript anywhere in the pending queue so + // interleaved typed messages cannot shift the wrong entry off the head. + const pendingIndex = pendingVoiceMessagesRef.current.indexOf(event.data.message); + if (pendingIndex !== -1) { + pendingVoiceMessagesRef.current.splice(pendingIndex, 1); + rememberBounded(voiceTurnIdsRef.current, event.data.turnId, MAX_TRACKED_VOICE_TURNS); + return; + } + } + + if ( + event.type === "message.completed" && + event.data.finishReason !== "tool-calls" && + event.data.message !== null && + voiceTurnIdsRef.current.has(event.data.turnId) + ) { + const key = `${event.data.turnId}:${event.data.stepIndex}:${event.data.message}`; + if (spokenVoiceMessageKeysRef.current.has(key)) return; + + rememberBounded(spokenVoiceMessageKeysRef.current, key, MAX_TRACKED_VOICE_TURNS); + voiceRef.current?.speak(event.data.message); + setVoiceCaption(`Reply ready: ${event.data.message}`); + } + }, reducer, }); + agentRef.current = agent; + const voice = useEveVoice({ + // Opt into Gateway-owned control mode (A-lite): the Gateway drives turns + // over its server-side control socket, so the browser only streams audio + // and `onTranscript` below is not used. Defaults off (client-driven path). + controlMode, + onEvent(event) { + // The live transcript now comes from `voice.messages` / `voice.isThinking` + // (rendered by VoiceConversationSection). Only the client-driven path + // keeps the lightweight caption. + if (!controlMode && event.type === "input-transcription-completed") { + setVoiceCaption(`Heard: ${event.transcript}`); + } + }, + async onTranscript({ transcript }) { + setVoiceCaption(`Heard: ${transcript}`); + pendingVoiceMessagesRef.current.push(transcript); + try { + await agentRef.current?.send({ message: transcript }); + } catch (error) { + const index = pendingVoiceMessagesRef.current.indexOf(transcript); + if (index !== -1) pendingVoiceMessagesRef.current.splice(index, 1); + throw error; + } + }, + }); + voiceRef.current = voice; const turns = agent.data.turns; const isComposeInProgress = agent.status === "submitted" || agent.status === "streaming"; const hasComposerText = composerInput.trim().length > 0; - const hasConversation = turns.length > 0 || isComposeInProgress; + const hasVoiceConversation = controlMode && (voice.messages.length > 0 || voice.isThinking); + const hasConversation = turns.length > 0 || isComposeInProgress || hasVoiceConversation; const conversationActivityKey = [ agent.session.sessionId ?? "new-thread", String(agent.session.streamIndex), String(agent.events.length), agent.status, + String(voice.messages.length), + String(voice.messages.at(-1)?.text.length ?? 0), + String(voice.isThinking), ].join(":"); useEffect(() => { @@ -136,7 +241,27 @@ export function App() { value={composerInput} />
+ {voiceCaption !== undefined ?

{voiceCaption}

: }
+
); } + +function VoiceGlyph(props: { readonly activity: ReturnType["activity"] }) { + if (props.activity === "assistant-speaking") { + return ( + + ); + } + + if (props.activity === "user-speaking") { + return ( + + ); + } + + return ( + + ); +} + +function voiceButtonLabel(activity: ReturnType["activity"]): string { + switch (activity) { + case "assistant-speaking": + return "Stop voice; assistant is speaking"; + case "connecting": + return "Connecting voice"; + case "error": + return "Voice unavailable"; + case "listening": + return "Stop voice; listening"; + case "user-speaking": + return "Stop voice; speech detected"; + case "ready": + return "Start voice"; + } + return "Start voice"; +} diff --git a/apps/frameworks/next/app/_chat/styles.css b/apps/frameworks/next/app/_chat/styles.css index 4b34f6203..6b0ec2d96 100644 --- a/apps/frameworks/next/app/_chat/styles.css +++ b/apps/frameworks/next/app/_chat/styles.css @@ -296,6 +296,19 @@ pre { margin-left: auto; } +.voice-caption { + color: var(--fg-muted); + flex: 1; + font-size: 12px; + line-height: 1.35; + margin: 0; + min-width: 0; + overflow: hidden; + text-align: left; + text-overflow: ellipsis; + white-space: nowrap; +} + .send-button { background: none; border: 1px solid var(--border); @@ -307,6 +320,88 @@ pre { width: 32px; } +.voice-toggle-button { + align-items: center; + background: none; + border: 1px solid var(--border); + border-radius: 50%; + color: var(--fg-muted); + cursor: pointer; + display: inline-flex; + height: 32px; + justify-content: center; + padding: 0; + position: relative; + transition: + background 160ms ease, + border-color 160ms ease, + box-shadow 160ms ease, + color 160ms ease, + opacity 160ms ease; + width: 32px; +} + +.voice-toggle-button[data-voice-state="connecting"], +.voice-toggle-button[data-voice-state="listening"], +.voice-toggle-button[data-voice-state="user-speaking"], +.voice-toggle-button[data-voice-state="assistant-speaking"] { + background: var(--accent); + border-color: var(--accent); + color: var(--fg-on-inverted); +} + +.voice-toggle-button[data-voice-state="listening"] { + box-shadow: 0 0 0 3px color-mix(in srgb, var(--accent) 10%, transparent); +} + +.voice-toggle-button[data-voice-state="user-speaking"] { + box-shadow: + 0 0 0 3px color-mix(in srgb, var(--accent) 14%, transparent), + 0 0 0 9px color-mix(in srgb, var(--accent) 8%, transparent); +} + +.voice-toggle-button[data-voice-state="user-speaking"]::after { + animation: voice-pulse 1.1s ease-out infinite; + border: 1px solid color-mix(in srgb, var(--accent) 34%, transparent); + border-radius: inherit; + content: ""; + inset: -6px; + pointer-events: none; + position: absolute; +} + +.voice-toggle-button[data-voice-state="assistant-speaking"] { + box-shadow: 0 0 0 3px color-mix(in srgb, var(--success) 18%, transparent); +} + +.voice-toggle-button[data-voice-state="error"] { + border-color: color-mix(in srgb, var(--danger) 42%, var(--border)); + color: var(--danger); +} + +.voice-toggle-button:hover:enabled { + color: var(--fg); +} + +.voice-toggle-button[data-voice-state="connecting"]:hover:enabled, +.voice-toggle-button[data-voice-state="listening"]:hover:enabled, +.voice-toggle-button[data-voice-state="user-speaking"]:hover:enabled, +.voice-toggle-button[data-voice-state="assistant-speaking"]:hover:enabled { + color: var(--fg-on-inverted); + opacity: 0.85; +} + +@keyframes voice-pulse { + from { + opacity: 0.72; + transform: scale(0.9); + } + to { + opacity: 0; + transform: scale(1.22); + } +} + .send-button.ready { background: var(--accent); border-color: var(--accent); diff --git a/apps/frameworks/next/package.json b/apps/frameworks/next/package.json index a7e57adc0..47da60295 100644 --- a/apps/frameworks/next/package.json +++ b/apps/frameworks/next/package.json @@ -12,6 +12,8 @@ "typecheck": "tsc" }, "dependencies": { + "@ai-sdk/gateway": "catalog:", + "@ai-sdk/react": "catalog:", "@auth/core": "0.41.2", "@vercel/connect": "catalog:", "@vercel/oidc": "3.4.1", diff --git a/docs/channels/meta.json b/docs/channels/meta.json index 32cb56fa4..83924619d 100644 --- a/docs/channels/meta.json +++ b/docs/channels/meta.json @@ -3,6 +3,7 @@ "pages": [ "overview", "eve", + "realtime-speech", "slack", "discord", "teams", diff --git a/docs/channels/realtime-speech.mdx b/docs/channels/realtime-speech.mdx new file mode 100644 index 000000000..1ced36671 --- /dev/null +++ b/docs/channels/realtime-speech.mdx @@ -0,0 +1,152 @@ +--- +title: "Realtime Speech (Vercel AI Gateway)" +description: "Add a browser microphone backed by Vercel AI Gateway realtime audio; finalized utterances run as normal durable Eve turns." +--- + +Realtime Speech adds a microphone surface for agents without turning Eve into a long-running audio runtime. + +This channel is specific to **Vercel AI Gateway** realtime audio and is exported from `eve/channels/vercel/speech`. It is not a provider-agnostic speech API. The shipped topology is: + +- **Audio:** browser ↔ AI Gateway realtime WebSocket (Eve is never in the audio path). +- **Turns:** browser/app ↔ Eve `/eve/v1/session` (+ `/stream`) as ordinary durable turns. + +Eve serves a setup route that authenticates the caller and mints a short-lived Gateway client secret for the browser audio socket; each finalized utterance then becomes one normal durable agent turn through the existing session API. + +`useEveVoice` is audio-first by default: it requests audio output from the realtime model and uses transcription events for visible text. Override `sessionConfig.outputModalities` only if you intentionally want a text-only or provider-specific realtime mode. + +## Install + +The server channel ships with Eve. The React microphone helper uses the AI SDK realtime hook, so install the optional peer in apps that render voice UI: + +```bash +npm install eve@latest @ai-sdk/react +``` + +## Enable the channel + +Create `agent/channels/speech.ts`: + +```ts +import { vercelSpeechChannel } from "eve/channels/vercel/speech"; +import { localDev, vercelOidc } from "eve/channels/auth"; + +export default vercelSpeechChannel({ + auth: [localDev(), vercelOidc()], +}); +``` + +Provide `AI_GATEWAY_API_KEY` in the server environment. The setup route uses it to mint a single-use `vcst_` realtime client secret, so the browser never receives a long-lived Gateway credential. + +## Add a microphone + +Use the React hook to render a microphone control that fits your UI: + +```tsx +"use client"; + +import { useEveVoice } from "eve/react/voice"; + +export function ComposerActions() { + const voice = useEveVoice(); + const active = voice.status === "connected" || voice.status === "connecting"; + + return ( + + ); +} +``` + +The React helper is intentionally limited to React because it wraps the AI SDK realtime React hook. For non-React clients, mint the realtime audio token with `setupVoice` and run turns through a normal durable session: + +```ts +import { Client, setupVoice } from "eve/client"; + +const client = new Client({ host: "https://my-agent.example.com" }); + +// Mint the short-lived AI Gateway realtime token to open the audio socket. +const { token, url, voiceSessionId } = await setupVoice(client); + +// Finalized transcripts are ordinary durable turns. +const session = client.session(); +const reply = await session.send("What's the weather?").result(); +console.log(reply.message); +``` + +## How it works + +- `POST /eve/v1/realtime-speech/setup` authenticates the request and returns a Gateway realtime token, an empty realtime tool list, and `voiceSessionId`. +- The browser opens the AI Gateway realtime WebSocket directly with the short-lived token. +- The client listens for finalized realtime transcription events and sends each transcript to Eve as a normal durable turn through `POST /eve/v1/session` (and `POST /eve/v1/session/:sessionId` for follow-ups), which return immediately. +- The client consumes the session event stream (`GET /eve/v1/session/:sessionId/stream`) and, on a non-tool-call `message.completed`, sends that text back to the realtime session for audio playback. +- The durable continuation token returned by the session route keeps the same Eve conversation across utterances, advancing the stream cursor each turn. + +No Eve HTTP request blocks for a full model turn: the setup POST and each session turn POST return right away, and replies arrive over the event stream. The speech transport can stay open for many utterances. Eve still sees discrete durable turns and parks between them, so history, tools, compaction, auth, and instrumentation behave the same as other channels. + +Eve is the durable source of truth for the conversation: transcripts enter Eve as user turns, and spoken replies are readback of Eve output. Realtime provider suppression is still client/provider mediated, so do not treat the realtime model as a security boundary for policy decisions. + +## Options + +```ts +export default vercelSpeechChannel({ + auth: [localDev(), vercelOidc()], + model: "openai/gpt-realtime-2", + expiresAfterSeconds: 300, +}); +``` + +`useEveVoice` defaults to the `/eve/v1/realtime-speech/setup` route and same-origin `/eve/v1/session` routes. Override `setupUrl` only if you changed the channel `basePath`, and pass `host`, `auth`, `headers`, `client`, or `session` to run turns against a custom origin or a session you already manage (for example one shared with `useEveAgent`). + +The default client session config uses `outputModalities: ["audio"]`, `inputAudioTranscription: {}`, and `outputAudioTranscription: {}`. This keeps the speech UX compatible with realtime providers that reject mixed `audio` + `text` output modalities while still letting the UI observe transcripts. + +`sessionConfig` is merged over the defaults, so passing `instructions` replaces the built-in speech-adapter prompt that drives reply playback (it tells the model to speak only the text after the `EVE_SPEAK:` marker). If you override `instructions`, keep that marker behavior or Eve's replies will not be spoken. + +## Gateway-owned control mode (A-lite, experimental) + +In the default mode above the client (or app) owns turn timing. The experimental **Gateway-owned control mode** instead lets AI Gateway drive turns: per live session it dials an Eve `WS()` control route with a signed, short-lived control token; Eve verifies it, owns turn coordination (settle-debounce, backchannel/duplicate suppression, barge-in), runs durable turns, and streams reply text back as semantic `response.delta` / `response.done` packets that Gateway injects into the provider's TTS. Audio still never flows through Eve, and the browser is frame-filtered by Gateway so only Eve can trigger speech. + +This requires the matching AI Gateway control feature to be enabled for your team. Enable it on the channel: + +```ts +export default vercelSpeechChannel({ + auth: [localDev(), vercelOidc()], + control: true, +}); +``` + +and put the browser hook in control mode so it only streams audio (no client-driven turns): + +```tsx +const voice = useEveVoice({ controlMode: true }); +``` + +`control: true` makes `/setup` mint a token carrying an Eve control URL + signed control token, and serves the `{basePath}/ws` route Gateway dials. Put the browser hook in `controlMode` only when the channel is also configured with `control`; otherwise the browser and Gateway will disagree about who owns Eve turns. + +Configuration: + +- **`EVE_REALTIME_CONTROL_URL`** — full `wss://` (or `ws://localhost`) control URL. For local dev, expose Eve via a tunnel (ngrok/cloudflared) or a preview deployment and point this at `wss:///eve/v1/realtime-speech/ws`. Otherwise Eve derives it from the deployment host (`VERCEL_BRANCH_URL` / `VERCEL_URL`). +- **`EVE_REALTIME_CONTROL_SECRET`** — HMAC secret for control tokens. Required by default. For local/preview experiments only, set `EVE_REALTIME_CONTROL_ALLOW_GATEWAY_KEY_FALLBACK=1` or pass `allowGatewayKeyFallback: true` to derive it from `AI_GATEWAY_API_KEY`. +- **`VERCEL_AUTOMATION_BYPASS_SECRET`** (or `VERCEL_DPBP`) — when set, appended to Vercel deployment control URLs as `x-vercel-protection-bypass` so Gateway can dial a protection-enabled preview. Temporary, for protected-preview testing. + +Control mode keeps a voice-session continuation token and stream cursor keyed by the authenticated principal plus `voiceSessionId`. The channel uses an in-memory store by default, which helps same-instance reconnects but is not durable across serverless cold starts. For production recovery, pass a durable store: + +```ts +export default vercelSpeechChannel({ + auth: [localDev(), vercelOidc()], + control: { + stateStore: { + async get(key) { + return await loadVoiceState(key); + }, + async set(key, state) { + await saveVoiceState(key, state); + }, + }, + }, +}); +``` + +If you want recovery across page reloads, persist and reuse the same `voiceSessionId` in `useEveVoice({ voiceSessionId, controlMode: true })` or `setupVoice(..., { voiceSessionId })`. + +Eve degrades gracefully: it uses final transcripts (partials are optional), only promises barge-in when Gateway sends interruption signals, and still runs durable turns even if the provider cannot speak. The default durable-session path remains available and is the recommended fallback. diff --git a/packages/eve-catalog/src/index.ts b/packages/eve-catalog/src/index.ts index 07682f226..8fb86cad2 100644 --- a/packages/eve-catalog/src/index.ts +++ b/packages/eve-catalog/src/index.ts @@ -143,6 +143,13 @@ export const INTEGRATIONS: readonly IntegrationEntry[] = [ tagline: "Embed a first-party web chat UI backed by your agent.", surfaces: { scaffoldable: true, gallery: true }, }, + { + slug: "realtime-speech", + name: "Realtime Speech", + kind: "channel", + tagline: "Add a microphone button backed by AI Gateway realtime audio and durable Eve turns.", + surfaces: { scaffoldable: false, gallery: true }, + }, { slug: "linear", name: "Linear", diff --git a/packages/eve/package.json b/packages/eve/package.json index e1c1c1cee..f4955f1ff 100644 --- a/packages/eve/package.json +++ b/packages/eve/package.json @@ -66,11 +66,21 @@ "import": "./dist/src/client/index.js", "default": "./dist/src/client/index.js" }, + "./client/voice": { + "types": "./dist/src/client/voice.d.ts", + "import": "./dist/src/client/voice.js", + "default": "./dist/src/client/voice.js" + }, "./react": { "types": "./dist/src/react/index.d.ts", "import": "./dist/src/react/index.js", "default": "./dist/src/react/index.js" }, + "./react/voice": { + "types": "./dist/src/react/voice.d.ts", + "import": "./dist/src/react/voice.js", + "default": "./dist/src/react/voice.js" + }, "./vue": { "types": "./dist/src/vue/index.d.ts", "import": "./dist/src/vue/index.js", @@ -206,6 +216,11 @@ "import": "./dist/src/public/channels/eve.js", "default": "./dist/src/public/channels/eve.js" }, + "./channels/vercel/speech": { + "types": "./dist/src/public/channels/vercel/speech.d.ts", + "import": "./dist/src/public/channels/vercel/speech.js", + "default": "./dist/src/public/channels/vercel/speech.js" + }, "./channels/auth": { "types": "./dist/src/public/channels/auth.d.ts", "import": "./dist/src/public/channels/auth.js", @@ -288,11 +303,13 @@ }, "devDependencies": { "@ai-sdk/anthropic": "catalog:", + "@ai-sdk/gateway": "catalog:", "@ai-sdk/google": "catalog:", "@ai-sdk/mcp": "catalog:", "@ai-sdk/openai": "catalog:", "@ai-sdk/otel": "catalog:", "@ai-sdk/provider": "catalog:", + "@ai-sdk/react": "catalog:", "@chat-adapter/slack": "4.29.0", "@chat-adapter/state-memory": "4.29.0", "@clack/core": "1.3.1", @@ -336,6 +353,8 @@ "zod-validation-error": "5.0.0" }, "peerDependencies": { + "@ai-sdk/gateway": "catalog:", + "@ai-sdk/react": "catalog:", "@opentelemetry/api": "^1.0.0", "@sveltejs/kit": "^2.0.0", "ai": "catalog:", @@ -350,6 +369,12 @@ "vue": "^3.5.0" }, "peerDependenciesMeta": { + "@ai-sdk/gateway": { + "optional": true + }, + "@ai-sdk/react": { + "optional": true + }, "@opentelemetry/api": { "optional": true }, diff --git a/packages/eve/scripts/build-rolldown.mjs b/packages/eve/scripts/build-rolldown.mjs index 30f10e597..a04bc9ac1 100644 --- a/packages/eve/scripts/build-rolldown.mjs +++ b/packages/eve/scripts/build-rolldown.mjs @@ -125,8 +125,9 @@ const EXCLUDED_DIRECTORIES = new Set([join("internal", "testing")]); * Packages externalized at bundle time so rolldown never inlines them * into eve's dist tree. Three categories: * - * - Peer dependencies (`ai`, `next`, `react`, `@opentelemetry/api`, - * `braintrust`) — consumers provide the install. + * - Peer dependencies (`ai`, `next`, `react`, `@ai-sdk/gateway`, + * `@ai-sdk/react`, `@opentelemetry/api`, `braintrust`) — consumers + * provide the install. * - Runtime dependencies (`nitro`) — resolved at * runtime against the eve installation. * - Optional peer dependency (`just-bash`) — the opt-in local sandbox @@ -138,6 +139,8 @@ const EXCLUDED_DIRECTORIES = new Set([join("internal", "testing")]); * the package `imports` map. */ const EXTERNAL_PACKAGES = new Set([ + "@ai-sdk/gateway", + "@ai-sdk/react", "@nuxt/kit", "@opentelemetry/api", "@sveltejs/kit", diff --git a/packages/eve/src/client/index.ts b/packages/eve/src/client/index.ts index 24744f10d..0c7c1ff63 100644 --- a/packages/eve/src/client/index.ts +++ b/packages/eve/src/client/index.ts @@ -9,6 +9,7 @@ export { defaultMessageReducer } from "#client/message-reducer.js"; export { createDataUrlFilePart, createTextWithFileContent } from "#client/file-parts.js"; export { MessageResponse } from "#client/message-response.js"; export { ClientSession } from "#client/session.js"; +export { EVE_VOICE_SETUP_ROUTE_PATH, setupVoice, voiceSetupUrl } from "#client/voice.js"; // --------------------------------------------------------------------------- // Client types @@ -52,6 +53,7 @@ export type { StreamOptions, TokenValue, } from "#client/types.js"; +export type { EveVoiceSetupResult, SetupVoiceOptions, VoiceTokenClient } from "#client/voice.js"; export type { EveAgentReducer, diff --git a/packages/eve/src/client/url.test.ts b/packages/eve/src/client/url.test.ts index b0df8c1c0..abb44984d 100644 --- a/packages/eve/src/client/url.test.ts +++ b/packages/eve/src/client/url.test.ts @@ -24,4 +24,19 @@ describe("createClientUrl", () => { "/api/eve/v1/session/123/stream?startIndex=4", ); }); + + it("preserves a query string embedded in the route path for absolute hosts", () => { + expect( + createClientUrl( + "https://agent.example.com", + "/eve/v1/realtime-speech/setup?voiceSessionId=v1", + ), + ).toBe("https://agent.example.com/eve/v1/realtime-speech/setup?voiceSessionId=v1"); + }); + + it("preserves a query string embedded in the route path for same-origin prefixes", () => { + expect(createClientUrl("", "/eve/v1/realtime-speech/setup?voiceSessionId=v1")).toBe( + "/eve/v1/realtime-speech/setup?voiceSessionId=v1", + ); + }); }); diff --git a/packages/eve/src/client/url.ts b/packages/eve/src/client/url.ts index 4f49be6cd..ce8db4c07 100644 --- a/packages/eve/src/client/url.ts +++ b/packages/eve/src/client/url.ts @@ -10,8 +10,17 @@ export function createClientUrl( routePath: string, searchParams?: Readonly>, ): string { - const normalizedRoute = routePath.startsWith("/") ? routePath : `/${routePath}`; - const search = formatSearch(searchParams); + const queryIndex = routePath.indexOf("?"); + const pathPart = queryIndex === -1 ? routePath : routePath.slice(0, queryIndex); + const embeddedSearch = queryIndex === -1 ? "" : routePath.slice(queryIndex); + const normalizedRoute = pathPart.startsWith("/") ? pathPart : `/${pathPart}`; + // Explicit searchParams win; otherwise preserve a query string already on the + // route path (e.g. the realtime-speech setup URL's `?voiceSessionId=`), which + // the URL pathname setter would otherwise percent-encode for absolute hosts. + const search = + searchParams && Object.keys(searchParams).length > 0 + ? formatSearch(searchParams) + : embeddedSearch; if (isAbsoluteUrl(host)) { const url = new URL(host); diff --git a/packages/eve/src/client/voice.test.ts b/packages/eve/src/client/voice.test.ts new file mode 100644 index 000000000..736290690 --- /dev/null +++ b/packages/eve/src/client/voice.test.ts @@ -0,0 +1,72 @@ +import { describe, expect, it, vi } from "vitest"; + +import { Client } from "#client/client.js"; +import { setupVoice, voiceSetupUrl } from "#client/voice.js"; + +describe("voiceSetupUrl", () => { + it("appends the voice session id to a relative setup route", () => { + expect(voiceSetupUrl("/eve/v1/realtime-speech/setup", "voice-1")).toBe( + "/eve/v1/realtime-speech/setup?voiceSessionId=voice-1", + ); + }); + + it("appends the voice session id to an absolute setup route", () => { + expect(voiceSetupUrl("https://eve.example.com/eve/v1/realtime-speech/setup", "voice-1")).toBe( + "https://eve.example.com/eve/v1/realtime-speech/setup?voiceSessionId=voice-1", + ); + }); +}); + +describe("setupVoice", () => { + it("mints a realtime token through the setup route with the voice session id", async () => { + const fetch = vi.fn(async () => + Response.json({ + expiresAt: 1_700_000_060, + token: "vcst_test", + url: "wss://gateway.example/realtime-model", + voiceSessionId: "voice-1", + }), + ); + + const result = await setupVoice({ fetch }, { voiceSessionId: "voice-1" }); + + expect(fetch).toHaveBeenCalledWith( + "/eve/v1/realtime-speech/setup?voiceSessionId=voice-1", + expect.objectContaining({ method: "POST" }), + ); + expect(result).toEqual({ + expiresAt: 1_700_000_060, + token: "vcst_test", + url: "wss://gateway.example/realtime-model", + voiceSessionId: "voice-1", + }); + }); + + it("throws when the setup response is malformed", async () => { + const fetch = vi.fn(async () => Response.json({ token: "vcst_test" })); + await expect(setupVoice({ fetch }, { voiceSessionId: "voice-1" })).rejects.toThrow(/malformed/); + }); + + it("works against an authenticated Eve client and a remote host", async () => { + const fetch = vi.fn(async () => + Response.json({ + token: "vcst_client", + url: "wss://gateway.example/realtime-model", + voiceSessionId: "voice-client", + }), + ); + vi.stubGlobal("fetch", fetch); + + const client = new Client({ auth: { bearer: "test-token" }, host: "https://eve.example.com" }); + await setupVoice(client, { voiceSessionId: "voice-client" }); + + expect(fetch).toHaveBeenCalledWith( + "https://eve.example.com/eve/v1/realtime-speech/setup?voiceSessionId=voice-client", + expect.objectContaining({ method: "POST" }), + ); + const headers = (fetch as ReturnType).mock.calls[0]![1].headers as Headers; + expect(headers.get("authorization")).toBe("Bearer test-token"); + + vi.unstubAllGlobals(); + }); +}); diff --git a/packages/eve/src/client/voice.ts b/packages/eve/src/client/voice.ts new file mode 100644 index 000000000..e635fa718 --- /dev/null +++ b/packages/eve/src/client/voice.ts @@ -0,0 +1,93 @@ +export const EVE_VOICE_SETUP_ROUTE_PATH = "/eve/v1/realtime-speech/setup"; + +export interface EveVoiceSetupResult { + readonly control?: boolean; + readonly expiresAt?: number; + readonly token: string; + readonly url: string; + readonly voiceSessionId: string; +} + +/** + * Minimal authenticated-fetch surface needed to mint a realtime voice token. + * + * {@link import("#client/client.js").Client} satisfies this, but the helper + * stays decoupled so it can run against any same-auth transport. + */ +export interface VoiceTokenClient { + fetch(path: string, init?: RequestInit): Promise; +} + +export interface SetupVoiceOptions { + /** Override the setup route when the channel uses a custom `basePath`. */ + readonly setupUrl?: string; + /** Reuse an existing voice session id instead of letting the server mint one. */ + readonly voiceSessionId?: string; +} + +/** + * Appends the voice session id to a realtime-speech setup URL. + * + * Works for both relative same-origin routes (`/eve/v1/realtime-speech/setup`) + * and absolute origins. The realtime audio socket and Gateway usage attribution + * are keyed by this id; durable Eve turns are bound to the authenticated + * principal by normal session-route auth, not by this value. + */ +export function voiceSetupUrl(baseUrl: string, voiceSessionId: string): string { + const absolute = /^https?:\/\//u.test(baseUrl); + const parsed = new URL(baseUrl, "https://eve.local"); + parsed.searchParams.set("voiceSessionId", voiceSessionId); + if (absolute) return parsed.toString(); + return `${parsed.pathname}${parsed.search}${parsed.hash}`; +} + +/** + * Mints a short-lived AI Gateway realtime token for a voice client. + * + * This is the one genuinely voice-specific concern that the durable session + * API does not cover: opening the browser/audio socket to AI Gateway. Run + * normal turns with {@link import("#client/session.js").ClientSession} via + * `client.session().send(...)`; use this only to obtain the realtime audio + * token and `voiceSessionId`. + */ +export async function setupVoice( + client: VoiceTokenClient, + options: SetupVoiceOptions = {}, +): Promise { + const voiceSessionId = options.voiceSessionId ?? crypto.randomUUID(); + const url = voiceSetupUrl(options.setupUrl ?? EVE_VOICE_SETUP_ROUTE_PATH, voiceSessionId); + + const response = await client.fetch(url, { + headers: { "content-type": "application/json" }, + method: "POST", + }); + const data = (await response.json().catch(() => ({}))) as Partial & { + readonly error?: unknown; + }; + if (!response.ok) { + throw new Error(typeof data.error === "string" ? data.error : "Eve voice setup failed."); + } + if (typeof data.token !== "string" || typeof data.url !== "string") { + throw new Error("Eve voice setup response was malformed."); + } + + const resolvedVoiceSessionId = + typeof data.voiceSessionId === "string" && data.voiceSessionId.length > 0 + ? data.voiceSessionId + : voiceSessionId; + + const result: { + control?: boolean; + expiresAt?: number; + token: string; + url: string; + voiceSessionId: string; + } = { + token: data.token, + url: data.url, + voiceSessionId: resolvedVoiceSessionId, + }; + if (typeof data.control === "boolean") result.control = data.control; + if (typeof data.expiresAt === "number") result.expiresAt = data.expiresAt; + return result; +} diff --git a/packages/eve/src/public/channels/index.ts b/packages/eve/src/public/channels/index.ts index e3a1ded98..cbbbf5b38 100644 --- a/packages/eve/src/public/channels/index.ts +++ b/packages/eve/src/public/channels/index.ts @@ -32,6 +32,14 @@ export { createWebSocketUpgradeServer, type WebSocketUpgradeServerBridge, } from "#channel/websocket-upgrade-server.js"; +export { + vercelSpeechChannel, + type VercelRealtimeClientSecret, + type VercelRealtimeControlConfig, + type VercelSpeechChannelInput, + type VercelSpeechControlInput, + type VercelSpeechSetupResponse, +} from "#public/channels/vercel/speech.js"; import { getChannelInstrumentationKind } from "#channel/compiled-channel.js"; import type { Channel } from "#public/definitions/defineChannel.js"; diff --git a/packages/eve/src/public/channels/vercel/control-token.test.ts b/packages/eve/src/public/channels/vercel/control-token.test.ts new file mode 100644 index 000000000..a2815315f --- /dev/null +++ b/packages/eve/src/public/channels/vercel/control-token.test.ts @@ -0,0 +1,107 @@ +import { afterEach, describe, expect, it } from "vitest"; + +import type { SessionAuthContext } from "#channel/types.js"; +import { + createControlToken, + resolveControlSecret, + verifyControlToken, +} from "#public/channels/vercel/control-token.js"; + +const auth: SessionAuthContext = { + attributes: { team: "acme" }, + authenticator: "test", + issuer: "test-idp", + principalId: "user-1", + principalType: "user", + subject: "user-1", +}; + +const secret = "test-control-secret"; + +afterEach(() => { + delete process.env.EVE_REALTIME_CONTROL_SECRET; + delete process.env.AI_GATEWAY_API_KEY; +}); + +describe("control token", () => { + it("round-trips the principal and voice session id", async () => { + const token = await createControlToken({ + auth, + voiceSessionId: "voice-1", + ttlSeconds: 600, + secret, + }); + const result = await verifyControlToken(token, { secret }); + + expect(result).toEqual({ ok: true, auth, voiceSessionId: "voice-1" }); + }); + + it("rejects a tampered token", async () => { + const token = await createControlToken({ + auth, + voiceSessionId: "voice-1", + ttlSeconds: 600, + secret, + }); + const [prefix, body, sig] = token.split("."); + const tampered = `${prefix}.${body}x.${sig}`; + + expect(await verifyControlToken(tampered, { secret })).toMatchObject({ ok: false }); + }); + + it("rejects a token signed with a different secret", async () => { + const token = await createControlToken({ + auth, + voiceSessionId: "voice-1", + ttlSeconds: 600, + secret, + }); + expect(await verifyControlToken(token, { secret: "other" })).toEqual({ + ok: false, + reason: "bad_signature", + }); + }); + + it("rejects an expired token", async () => { + const token = await createControlToken({ + auth, + voiceSessionId: "voice-1", + ttlSeconds: 1, + secret, + now: 1_000_000, + }); + expect(await verifyControlToken(token, { secret, now: 1_000_000 + 5_000 })).toEqual({ + ok: false, + reason: "expired", + }); + }); + + it("rejects a missing token", async () => { + expect(await verifyControlToken(undefined, { secret })).toEqual({ + ok: false, + reason: "missing_token", + }); + }); + + it("derives a fallback secret from AI_GATEWAY_API_KEY", () => { + process.env.AI_GATEWAY_API_KEY = "gw-key-123"; + const resolved = resolveControlSecret(undefined, { allowGatewayKeyFallback: true }); + expect(resolved).toBe("eve-realtime-control:gw-key-123"); + expect(resolved).not.toBe("gw-key-123"); + }); + + it("does not derive from AI_GATEWAY_API_KEY unless explicitly allowed", () => { + process.env.AI_GATEWAY_API_KEY = "gw-key-123"; + expect(() => resolveControlSecret()).toThrow(/signing secret/); + }); + + it("prefers EVE_REALTIME_CONTROL_SECRET over the gateway key", () => { + process.env.AI_GATEWAY_API_KEY = "gw-key-123"; + process.env.EVE_REALTIME_CONTROL_SECRET = "dedicated"; + expect(resolveControlSecret()).toBe("dedicated"); + }); + + it("throws when no secret is available", () => { + expect(() => resolveControlSecret()).toThrow(/signing secret/); + }); +}); diff --git a/packages/eve/src/public/channels/vercel/control-token.ts b/packages/eve/src/public/channels/vercel/control-token.ts new file mode 100644 index 000000000..46a49cbdb --- /dev/null +++ b/packages/eve/src/public/channels/vercel/control-token.ts @@ -0,0 +1,189 @@ +import type { SessionAuthContext } from "#channel/types.js"; + +/** + * Stateless, HMAC-signed control token for the Gateway-owned realtime voice + * control socket. + * + * Eve mints this at `/setup` (carrying the authenticated principal and the + * `voiceSessionId`) and hands it to AI Gateway as the realtime `control.token`. + * Gateway later dials Eve's `WS()` control route and presents it as + * `Authorization: Bearer `. Because the mint and the WS upgrade are + * different (serverless) invocations with no shared per-session store, the token + * is self-verifying: the upgrade recomputes the HMAC and checks expiry/audience + * rather than looking the secret up. The signature is the unforgeable capability + * that authorizes Gateway to drive durable turns as the bound principal. + */ + +const TOKEN_PREFIX = "evc1"; +const AUDIENCE = "eve-voice-control"; + +interface ControlTokenPayload { + readonly aud: typeof AUDIENCE; + /** Expiry, epoch seconds. */ + readonly exp: number; + /** Issued-at, epoch seconds. */ + readonly iat: number; + /** Authenticated principal the durable turns run as. */ + readonly auth: SessionAuthContext; + /** Client-visible voice session correlation id. */ + readonly vsid: string; +} + +export interface CreateControlTokenInput { + readonly auth: SessionAuthContext; + readonly voiceSessionId: string; + readonly ttlSeconds: number; + readonly secret: string; + readonly now?: number; +} + +export interface ResolveControlSecretOptions { + /** + * Allows preview/local setups to derive the signing secret from + * `AI_GATEWAY_API_KEY`. Disabled by default so the Gateway credential is not + * also the Eve control-plane signing root in production. + */ + readonly allowGatewayKeyFallback?: boolean; +} + +export type VerifyControlTokenResult = + | { readonly ok: true; readonly auth: SessionAuthContext; readonly voiceSessionId: string } + | { readonly ok: false; readonly reason: string }; + +/** + * Resolves the HMAC signing secret. Prefers an explicit value, then + * `EVE_REALTIME_CONTROL_SECRET`. Preview/dev can opt into a domain-separated + * derivation from `AI_GATEWAY_API_KEY`, but production should use a dedicated + * control-plane secret. + */ +export function resolveControlSecret( + explicit?: string, + options: ResolveControlSecretOptions = {}, +): string { + const allowGatewayKeyFallback = + options.allowGatewayKeyFallback === true || + process.env.EVE_REALTIME_CONTROL_ALLOW_GATEWAY_KEY_FALLBACK === "1"; + const candidate = + readNonEmpty(explicit) ?? + readNonEmpty(process.env.EVE_REALTIME_CONTROL_SECRET) ?? + (allowGatewayKeyFallback + ? deriveFallbackSecret(readNonEmpty(process.env.AI_GATEWAY_API_KEY)) + : undefined); + if (candidate === undefined) { + throw new Error( + "Eve realtime voice control requires a signing secret. Set EVE_REALTIME_CONTROL_SECRET.", + ); + } + return candidate; +} + +/** Signs a control token binding the principal + voice session id, with expiry. */ +export async function createControlToken(input: CreateControlTokenInput): Promise { + const iat = Math.floor((input.now ?? Date.now()) / 1000); + const payload: ControlTokenPayload = { + aud: AUDIENCE, + exp: iat + Math.max(1, Math.floor(input.ttlSeconds)), + iat, + auth: input.auth, + vsid: input.voiceSessionId, + }; + const body = base64UrlEncode(new TextEncoder().encode(JSON.stringify(payload))); + const signature = await sign(`${TOKEN_PREFIX}.${body}`, input.secret); + return `${TOKEN_PREFIX}.${body}.${signature}`; +} + +/** Verifies signature, audience, and expiry; returns the bound principal. */ +export async function verifyControlToken( + token: string | undefined | null, + input: { readonly secret: string; readonly now?: number }, +): Promise { + if (typeof token !== "string" || token.length === 0) { + return { ok: false, reason: "missing_token" }; + } + const parts = token.split("."); + if (parts.length !== 3 || parts[0] !== TOKEN_PREFIX) { + return { ok: false, reason: "malformed_token" }; + } + const [, body, signature] = parts; + const expected = await sign(`${TOKEN_PREFIX}.${body}`, input.secret); + if (!timingSafeEqual(signature ?? "", expected)) { + return { ok: false, reason: "bad_signature" }; + } + + let payload: ControlTokenPayload; + try { + payload = JSON.parse( + new TextDecoder().decode(base64UrlDecode(body ?? "")), + ) as ControlTokenPayload; + } catch { + return { ok: false, reason: "malformed_payload" }; + } + if (payload.aud !== AUDIENCE) return { ok: false, reason: "bad_audience" }; + + const now = Math.floor((input.now ?? Date.now()) / 1000); + if (typeof payload.exp !== "number" || payload.exp < now) { + return { ok: false, reason: "expired" }; + } + if (!isSessionAuthContext(payload.auth) || typeof payload.vsid !== "string") { + return { ok: false, reason: "malformed_payload" }; + } + + return { ok: true, auth: payload.auth, voiceSessionId: payload.vsid }; +} + +async function sign(message: string, secret: string): Promise { + const key = await crypto.subtle.importKey( + "raw", + new TextEncoder().encode(secret), + { name: "HMAC", hash: "SHA-256" }, + false, + ["sign"], + ); + const digest = await crypto.subtle.sign("HMAC", key, new TextEncoder().encode(message)); + return base64UrlEncode(new Uint8Array(digest)); +} + +function deriveFallbackSecret(apiKey: string | undefined): string | undefined { + if (apiKey === undefined) return undefined; + // Domain-separate so the control-token secret is never byte-identical to the + // Gateway credential, even though it is derived from it. + return `eve-realtime-control:${apiKey}`; +} + +function isSessionAuthContext(value: unknown): value is SessionAuthContext { + return ( + value !== null && + typeof value === "object" && + typeof (value as SessionAuthContext).principalId === "string" && + typeof (value as SessionAuthContext).principalType === "string" && + typeof (value as SessionAuthContext).authenticator === "string" + ); +} + +function readNonEmpty(value: string | undefined): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function timingSafeEqual(a: string, b: string): boolean { + if (a.length !== b.length) return false; + let mismatch = 0; + for (let i = 0; i < a.length; i += 1) { + mismatch |= a.charCodeAt(i) ^ b.charCodeAt(i); + } + return mismatch === 0; +} + +function base64UrlEncode(bytes: Uint8Array): string { + let binary = ""; + for (const byte of bytes) binary += String.fromCharCode(byte); + return btoa(binary).replace(/\+/g, "-").replace(/\//g, "_").replace(/=+$/u, ""); +} + +function base64UrlDecode(value: string): Uint8Array { + const base64 = value.replace(/-/g, "+").replace(/_/g, "/"); + const padded = base64 + "=".repeat((4 - (base64.length % 4)) % 4); + const binary = atob(padded); + return Uint8Array.from(binary, (char) => char.charCodeAt(0)); +} diff --git a/packages/eve/src/public/channels/vercel/control-url.test.ts b/packages/eve/src/public/channels/vercel/control-url.test.ts new file mode 100644 index 000000000..add8c0926 --- /dev/null +++ b/packages/eve/src/public/channels/vercel/control-url.test.ts @@ -0,0 +1,85 @@ +import { afterEach, beforeEach, describe, expect, it } from "vitest"; + +import { resolveControlUrl } from "#public/channels/vercel/control-url.js"; + +const wsPath = "/eve/v1/realtime-speech/ws"; + +function clearControlEnv() { + delete process.env.EVE_REALTIME_CONTROL_URL; + delete process.env.VERCEL_BRANCH_URL; + delete process.env.VERCEL_URL; + delete process.env.VERCEL_PROJECT_PRODUCTION_URL; + delete process.env.VERCEL_AUTOMATION_BYPASS_SECRET; + delete process.env.VERCEL_DPBP; +} + +// The dev shell may export VERCEL_DPBP; clear it so "no bypass" cases are +// deterministic regardless of ambient env. +beforeEach(clearControlEnv); +afterEach(clearControlEnv); + +describe("resolveControlUrl", () => { + it("honors an explicit override URL", () => { + const url = resolveControlUrl({ + wsPath, + explicitUrl: "wss://tunnel.ngrok.app/eve/v1/realtime-speech/ws", + }); + expect(url).toBe("wss://tunnel.ngrok.app/eve/v1/realtime-speech/ws"); + }); + + it("derives a wss URL from the Vercel deployment host", () => { + process.env.VERCEL_BRANCH_URL = "eve-preview.vercel.app"; + const url = resolveControlUrl({ wsPath }); + expect(url).toBe("wss://eve-preview.vercel.app/eve/v1/realtime-speech/ws"); + }); + + it("appends the deploy-protection bypass secret as a query param", () => { + process.env.VERCEL_URL = "eve-preview.vercel.app"; + process.env.VERCEL_AUTOMATION_BYPASS_SECRET = "bypass123"; + const url = new URL(resolveControlUrl({ wsPath })); + expect(url.searchParams.get("x-vercel-protection-bypass")).toBe("bypass123"); + }); + + it("does not append deploy-protection bypass secrets to non-Vercel hosts", () => { + process.env.VERCEL_DPBP = "ambient-bypass"; + const url = new URL( + resolveControlUrl({ + wsPath, + explicitUrl: "wss://tunnel.ngrok.app/eve/v1/realtime-speech/ws", + }), + ); + expect(url.searchParams.get("x-vercel-protection-bypass")).toBe(null); + }); + + it("uses an explicit bypass secret for Vercel hosts", () => { + const url = new URL( + resolveControlUrl({ + wsPath, + explicitUrl: "wss://eve-preview.vercel.app/eve/v1/realtime-speech/ws", + bypassSecret: "explicit-bypass", + }), + ); + expect(url.searchParams.get("x-vercel-protection-bypass")).toBe("explicit-bypass"); + }); + + it("throws when no control URL or deployment host is configured", () => { + expect(() => resolveControlUrl({ wsPath })).toThrow(/EVE_REALTIME_CONTROL_URL/); + }); + + it("rejects public ws:// control URLs", () => { + expect(() => + resolveControlUrl({ + wsPath, + explicitUrl: "ws://app.example.com/eve/v1/realtime-speech/ws", + }), + ).toThrow(/wss:\/\//); + }); + + it("uses ws:// for localhost overrides", () => { + const url = resolveControlUrl({ + wsPath, + explicitUrl: "ws://localhost:3000/eve/v1/realtime-speech/ws", + }); + expect(url).toBe("ws://localhost:3000/eve/v1/realtime-speech/ws"); + }); +}); diff --git a/packages/eve/src/public/channels/vercel/control-url.ts b/packages/eve/src/public/channels/vercel/control-url.ts new file mode 100644 index 000000000..b59f2acf5 --- /dev/null +++ b/packages/eve/src/public/channels/vercel/control-url.ts @@ -0,0 +1,96 @@ +/** + * Builds the `control.url` Eve mints into the Gateway realtime token: the public + * `wss://` URL of Eve's own WebSocket control route that AI Gateway dials back. + * + * Resolution order: + * 1. `EVE_REALTIME_CONTROL_URL` (or the explicit override) — a full `wss://` + * (or `ws://localhost`) URL, used for tunneled local dev (ngrok/preview). + * 2. The deployment host from `VERCEL_BRANCH_URL` / `VERCEL_URL` / + * `VERCEL_PROJECT_PRODUCTION_URL`. + * + * AI Gateway dials this URL with only an `Authorization` header and does not + * follow redirects, so a Vercel Deployment Protection bypass cannot ride a + * header — it is appended to the URL as the `x-vercel-protection-bypass` query + * param for Vercel deployment hosts only (read from + * `VERCEL_AUTOMATION_BYPASS_SECRET`, falling back to `VERCEL_DPBP`). This is a + * temporary measure for protected preview testing. + */ +export interface ResolveControlUrlInput { + /** The `/ws` route path, e.g. `/eve/v1/realtime-speech/ws`. */ + readonly wsPath: string; + /** Explicit full WS URL override (defaults to `EVE_REALTIME_CONTROL_URL`). */ + readonly explicitUrl?: string; + /** Explicit deploy-protection bypass secret override. */ + readonly bypassSecret?: string; +} + +export function resolveControlUrl(input: ResolveControlUrlInput): string { + const base = resolveBaseUrl(input); + const bypass = readNonEmpty(input.bypassSecret) ?? readDeployBypassSecret(); + if (bypass !== undefined && isVercelHost(base.hostname)) { + base.searchParams.set("x-vercel-protection-bypass", bypass); + } + return base.toString(); +} + +function resolveBaseUrl(input: ResolveControlUrlInput): URL { + const explicit = + readNonEmpty(input.explicitUrl) ?? readNonEmpty(process.env.EVE_REALTIME_CONTROL_URL); + if (explicit !== undefined) { + // The override carries the full URL including path; honor it verbatim. + return validateControlUrl(new URL(explicit)); + } + + const host = resolveDeploymentHost(); + if (host === undefined) { + throw new Error( + "Eve realtime voice control could not resolve a public host. Set EVE_REALTIME_CONTROL_URL.", + ); + } + const scheme = isLocalHost(host) ? "ws" : "wss"; + return validateControlUrl(new URL(`${scheme}://${host}${input.wsPath}`)); +} + +function resolveDeploymentHost(): string | undefined { + const fromEnv = + readNonEmpty(process.env.VERCEL_BRANCH_URL) ?? + readNonEmpty(process.env.VERCEL_URL) ?? + readNonEmpty(process.env.VERCEL_PROJECT_PRODUCTION_URL); + if (fromEnv !== undefined) return stripScheme(fromEnv); + + return undefined; +} + +function readDeployBypassSecret(): string | undefined { + return ( + readNonEmpty(process.env.VERCEL_AUTOMATION_BYPASS_SECRET) ?? + readNonEmpty(process.env.VERCEL_DPBP) + ); +} + +function isLocalHost(host: string): boolean { + const hostname = host.split(":")[0] ?? host; + return hostname === "localhost" || hostname.endsWith(".localhost") || hostname === "127.0.0.1"; +} + +function isVercelHost(hostname: string): boolean { + return hostname.endsWith(".vercel.app") || hostname.endsWith(".vercel.sh"); +} + +function validateControlUrl(url: URL): URL { + if (url.protocol === "wss:") return url; + if (url.protocol === "ws:" && isLocalHost(url.host)) return url; + throw new Error( + "Eve realtime voice control URL must use wss://, or ws://localhost for local dev.", + ); +} + +function stripScheme(value: string): string { + return value.replace(/^[a-z]+:\/\//iu, ""); +} + +function readNonEmpty(value: string | undefined | null): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} diff --git a/packages/eve/src/public/channels/vercel/speech.test.ts b/packages/eve/src/public/channels/vercel/speech.test.ts new file mode 100644 index 000000000..cc161e7b0 --- /dev/null +++ b/packages/eve/src/public/channels/vercel/speech.test.ts @@ -0,0 +1,209 @@ +import { describe, expect, it, vi } from "vitest"; + +import { isHttpRouteDefinition, isWebSocketRouteDefinition } from "#channel/routes.js"; +import type { Channel } from "#public/definitions/defineChannel.js"; +import { none } from "#public/channels/auth.js"; +import { createControlToken, verifyControlToken } from "#public/channels/vercel/control-token.js"; +import { vercelSpeechChannel } from "#public/channels/vercel/speech.js"; + +async function callRoute( + channel: Channel, + method: string, + path: string, + request: Request, +): Promise { + const route = channel.routes.find( + (candidate) => candidate.method === method && candidate.path === path, + ); + if (route === undefined || !isHttpRouteDefinition(route)) { + throw new Error(`Missing HTTP route ${method} ${path}`); + } + return route.handler(request, { + getSession: vi.fn() as any, + params: {}, + receive: vi.fn() as any, + requestIp: null, + send: vi.fn() as any, + waitUntil: vi.fn(), + }); +} + +describe("vercelSpeechChannel", () => { + it("mints a Gateway realtime token and returns the voice session id", async () => { + const getToken = vi.fn(async () => ({ + expiresAt: 1_700_000_060, + token: "vcst_test", + url: "wss://gateway.example/realtime-model?ai-model-id=openai%2Fgpt-realtime-2", + })); + const channel = vercelSpeechChannel({ + auth: none(), + basePath: "/voice", + createVoiceSessionId: () => "voice-session-1", + expiresAfterSeconds: 120, + getToken, + }); + + const response = await callRoute( + channel, + "POST", + "/voice/setup", + new Request("http://localhost/voice/setup"), + ); + const body = (await response.json()) as Record; + + expect(response.status).toBe(200); + expect(response.headers.get("cache-control")).toBe("no-store"); + expect(getToken).toHaveBeenCalledWith({ + expiresAfterSeconds: 120, + model: "openai/gpt-realtime-2", + }); + expect(body).toMatchObject({ + expiresAt: 1_700_000_060, + tools: [], + token: "vcst_test", + url: "wss://gateway.example/realtime-model?ai-model-id=openai%2Fgpt-realtime-2", + voiceSessionId: "voice-session-1", + }); + }); + + it("reuses a client-supplied voice session id", async () => { + const channel = vercelSpeechChannel({ + auth: none(), + basePath: "/voice", + getToken: async () => ({ token: "vcst_unused", url: "wss://gateway.example" }), + }); + + const response = await callRoute( + channel, + "POST", + "/voice/setup", + new Request("http://localhost/voice/setup?voiceSessionId=existing-session"), + ); + const body = (await response.json()) as Record; + + expect(body.voiceSessionId).toBe("existing-session"); + }); + + it("exposes only setup and health routes (no blocking /turn route)", () => { + const channel = vercelSpeechChannel({ + auth: none(), + basePath: "/voice", + getToken: async () => ({ token: "vcst_unused", url: "wss://gateway.example" }), + }); + + expect(channel.routes.map((route) => `${route.method} ${route.path}`).sort()).toEqual([ + "GET /voice/health", + "POST /voice/setup", + ]); + expect(channel.routes.some((route) => route.path.endsWith("/turn"))).toBe(false); + }); + + it("serves a health route", async () => { + const channel = vercelSpeechChannel({ + auth: none(), + basePath: "/voice", + model: "openai/gpt-realtime-2", + getToken: async () => ({ token: "vcst_unused", url: "wss://gateway.example" }), + }); + + const response = await callRoute( + channel, + "GET", + "/voice/health", + new Request("http://localhost/voice/health"), + ); + const body = (await response.json()) as Record; + + expect(body).toEqual({ + ok: true, + channel: "realtime-speech", + control: false, + model: "openai/gpt-realtime-2", + }); + }); + + it("mints control config and serves the control route in gateway-control mode", async () => { + process.env.EVE_REALTIME_CONTROL_SECRET = "ws-test-secret"; + const bypass = process.env.VERCEL_DPBP; + const bypass2 = process.env.VERCEL_AUTOMATION_BYPASS_SECRET; + delete process.env.VERCEL_DPBP; + delete process.env.VERCEL_AUTOMATION_BYPASS_SECRET; + try { + const captured: Array> = []; + const getToken = vi.fn(async (options: Record) => { + captured.push(options); + return { token: "vcst_x", url: "wss://gateway.example" }; + }); + const channel = vercelSpeechChannel({ + auth: () => ({ + attributes: {}, + authenticator: "test", + principalId: "u1", + principalType: "user", + }), + basePath: "/voice", + control: { controlUrl: "wss://eve.example/voice/ws" }, + getToken, + }); + + await callRoute(channel, "POST", "/voice/setup", new Request("http://localhost/voice/setup")); + + const control = captured[0]!.control as { mode: string; token: string; url: string }; + expect(control.mode).toBe("eve"); + expect(control.url).toBe("wss://eve.example/voice/ws"); + const verified = await verifyControlToken(control.token, { secret: "ws-test-secret" }); + expect(verified).toMatchObject({ ok: true, voiceSessionId: expect.any(String) }); + + expect( + channel.routes.some((route) => route.method === "WEBSOCKET" && route.path === "/voice/ws"), + ).toBe(true); + } finally { + delete process.env.EVE_REALTIME_CONTROL_SECRET; + if (bypass !== undefined) process.env.VERCEL_DPBP = bypass; + if (bypass2 !== undefined) process.env.VERCEL_AUTOMATION_BYPASS_SECRET = bypass2; + } + }); + + it("rejects an unauthenticated control upgrade and accepts a valid token", async () => { + process.env.EVE_REALTIME_CONTROL_SECRET = "ws-test-secret"; + try { + const channel = vercelSpeechChannel({ + auth: none(), + basePath: "/voice", + control: true, + getToken: async () => ({ token: "vcst_x", url: "wss://gateway.example" }), + }); + const route = channel.routes.find( + (candidate) => candidate.method === "WEBSOCKET" && candidate.path === "/voice/ws", + ); + if (route === undefined || !isWebSocketRouteDefinition(route)) { + throw new Error("Missing control WS route"); + } + const hooks = await route.handler(new Request("http://localhost/voice/ws"), { + getSession: vi.fn() as any, + params: {}, + receive: vi.fn() as any, + requestIp: null, + send: vi.fn() as any, + waitUntil: vi.fn(), + }); + + const rejected = await hooks.upgrade!(new Request("http://localhost/voice/ws")); + expect(rejected).toBeInstanceOf(Response); + expect((rejected as Response).status).toBe(401); + + const token = await createControlToken({ + auth: { attributes: {}, authenticator: "t", principalId: "u", principalType: "user" }, + voiceSessionId: "v1", + ttlSeconds: 60, + secret: "ws-test-secret", + }); + const accepted = await hooks.upgrade!( + new Request("http://localhost/voice/ws", { headers: { authorization: `Bearer ${token}` } }), + ); + expect(accepted).not.toBeInstanceOf(Response); + } finally { + delete process.env.EVE_REALTIME_CONTROL_SECRET; + } + }); +}); diff --git a/packages/eve/src/public/channels/vercel/speech.ts b/packages/eve/src/public/channels/vercel/speech.ts new file mode 100644 index 000000000..5fe04d09f --- /dev/null +++ b/packages/eve/src/public/channels/vercel/speech.ts @@ -0,0 +1,323 @@ +import { gateway } from "ai"; + +import type { AuthFn } from "#public/channels/auth.js"; +import { routeAuth } from "#public/channels/auth.js"; +import type { SessionAuthContext } from "#channel/types.js"; +import { + defineChannel, + GET, + POST, + WS, + type Channel, + type RouteDefinition, +} from "#public/definitions/defineChannel.js"; +import { + createControlToken, + resolveControlSecret, + verifyControlToken, +} from "#public/channels/vercel/control-token.js"; +import { resolveControlUrl } from "#public/channels/vercel/control-url.js"; +import { + EVE_VOICE_CONTROL_PROTOCOL, + parseControlPacket, +} from "#public/channels/vercel/voice-control-protocol.js"; +import { + createInMemoryVoiceControlStateStore, + VoiceTurnCoordinator, + type VoiceControlStateStore, + type VoiceTurnCoordinatorOptions, +} from "#public/channels/vercel/voice-turn-coordinator.js"; + +const DEFAULT_BASE_PATH = "/eve/v1/realtime-speech"; +const DEFAULT_MODEL = "openai/gpt-realtime-2"; +const DEFAULT_CONTROL_TOKEN_TTL_SECONDS = 600; + +/** + * Gateway-owned control plane ("A-lite") configuration. When set, the setup + * route mints a `vcst_` token carrying an Eve control socket config, and the + * channel serves the `WS()` control route AI Gateway dials back. Pass `true` + * for defaults. + */ +export interface VercelSpeechControlInput { + /** HMAC secret for control tokens. Defaults to `EVE_REALTIME_CONTROL_SECRET`. */ + readonly secret?: string; + /** Allow deriving the control-token signing secret from `AI_GATEWAY_API_KEY`. Local/preview only. */ + readonly allowGatewayKeyFallback?: boolean; + /** Full `wss://` control URL override. Defaults to `EVE_REALTIME_CONTROL_URL` / deployment host. */ + readonly controlUrl?: string; + /** Vercel deploy-protection bypass secret override (for protected previews). */ + readonly bypassSecret?: string; + /** Control-token TTL (seconds). Default 600. */ + readonly tokenTtlSeconds?: number; + /** Durable context strings contributed on each control-driven turn. */ + readonly context?: readonly string[]; + /** Durable state for continuation/cursor recovery across control WS reconnects. */ + readonly stateStore?: VoiceControlStateStore; + /** Turn settle/debounce window (ms). */ + readonly settleMs?: number; +} + +/** + * Eve-owned mirror of the AI Gateway realtime client-secret result (`token`, + * `url`, `expiresAt`). Declared locally so eve's public channel surface does not + * re-export the AI SDK's experimental realtime types, which can change freely. + */ +export interface VercelRealtimeClientSecret { + readonly token: string; + readonly url: string; + readonly expiresAt?: number; +} + +/** + * Gateway realtime `control` config sealed into the minted token. Structurally + * mirrors `@ai-sdk/gateway`'s `GatewayRealtimeControlConfig`; defined locally so + * eve does not depend on the gateway type re-export. + */ +export interface VercelRealtimeControlConfig { + readonly mode: "eve"; + readonly token: string; + readonly url: string; +} + +export interface VercelSpeechGetTokenInput { + readonly expiresAfterSeconds?: number; + readonly model: string; + readonly control?: VercelRealtimeControlConfig; +} + +export interface VercelSpeechChannelInput { + /** Route auth used by the setup route. */ + readonly auth: AuthFn | readonly AuthFn[]; + /** AI Gateway realtime model id. */ + readonly model?: string; + /** Base path for the setup, health, and control routes. */ + readonly basePath?: string; + /** Client-secret TTL forwarded to AI Gateway. */ + readonly expiresAfterSeconds?: number; + /** + * Enable the Gateway-owned control plane (A-lite). When set, `/setup` mints a + * token with control config and the `{basePath}/ws` control route is served. + */ + readonly control?: VercelSpeechControlInput | boolean; + /** Test/advanced injection point for token minting. Defaults to AI Gateway. */ + readonly getToken?: (input: VercelSpeechGetTokenInput) => Promise; + /** Test/advanced injection point for creating long-lived voice session ids. */ + readonly createVoiceSessionId?: () => string; +} + +export interface VercelSpeechSetupResponse extends VercelRealtimeClientSecret { + /** Whether this token carries Gateway-owned Eve control config. */ + readonly control: boolean; + /** No model-visible tools are exposed to the realtime speech adapter. */ + readonly tools: readonly []; + readonly voiceSessionId: string; +} + +/** + * Builds an Eve channel for long-lived realtime speech sessions backed by Vercel + * AI Gateway realtime audio. + * + * Default (client-driven) mode: the browser keeps an AI Gateway realtime socket + * open using the setup route's short-lived `vcst_` token, and finalized + * transcripts run as ordinary durable turns through `/eve/v1/session`. + * + * Gateway-control mode (A-lite, opt in via `control`): `/setup` additionally + * mints control config into the token so AI Gateway dials Eve's `{basePath}/ws` + * route per session; Eve then owns turn coordination and streams reply text back + * for Gateway to inject into provider TTS. Either way the realtime model is only + * the ears and mouth and Eve stays the durable assistant of record. + */ +export function vercelSpeechChannel(input: VercelSpeechChannelInput): Channel { + const basePath = normalizeBasePath(input.basePath ?? DEFAULT_BASE_PATH); + const model = input.model ?? DEFAULT_MODEL; + const getToken = + input.getToken ?? + ((options: VercelSpeechGetTokenInput) => gateway.experimental_realtime.getToken(options)); + const createVoiceSessionId = input.createVoiceSessionId ?? (() => crypto.randomUUID()); + const controlOptions = normalizeControlInput(input.control); + const wsPath = `${basePath}/ws`; + + const routes: RouteDefinition[] = [ + POST(`${basePath}/setup`, async (req) => { + const authResult = await routeAuth(req, input.auth); + if (authResult instanceof Response) return authResult; + + const url = new URL(req.url); + const voiceSessionId = + readOptionalString(url.searchParams.get("voiceSessionId")) ?? createVoiceSessionId(); + + let control: VercelRealtimeControlConfig | undefined; + if (controlOptions !== undefined) { + const secret = resolveControlSecret(controlOptions.secret, { + allowGatewayKeyFallback: controlOptions.allowGatewayKeyFallback, + }); + const token = await createControlToken({ + auth: authResult, + voiceSessionId, + ttlSeconds: controlOptions.tokenTtlSeconds ?? DEFAULT_CONTROL_TOKEN_TTL_SECONDS, + secret, + }); + const controlUrlInput: { + wsPath: string; + explicitUrl?: string; + bypassSecret?: string; + } = { wsPath }; + if (controlOptions.controlUrl !== undefined) { + controlUrlInput.explicitUrl = controlOptions.controlUrl; + } + if (controlOptions.bypassSecret !== undefined) { + controlUrlInput.bypassSecret = controlOptions.bypassSecret; + } + control = { mode: "eve", token, url: resolveControlUrl(controlUrlInput) }; + } + + const getTokenInput: { + model: string; + expiresAfterSeconds?: number; + control?: VercelRealtimeControlConfig; + } = { model }; + if (input.expiresAfterSeconds !== undefined) { + getTokenInput.expiresAfterSeconds = input.expiresAfterSeconds; + } + if (control !== undefined) getTokenInput.control = control; + const token = await getToken(getTokenInput); + + return jsonNoStore({ + ...token, + control: control !== undefined, + tools: [], + voiceSessionId, + } satisfies VercelSpeechSetupResponse); + }), + + GET(`${basePath}/health`, async () => + jsonNoStore({ + ok: true, + channel: "realtime-speech", + control: controlOptions !== undefined, + model, + }), + ), + ]; + + if (controlOptions !== undefined) { + routes.push(createControlRoute({ wsPath, controlOptions })); + } + + return defineChannel({ + kindHint: "realtime-speech", + routes, + }); +} + +function createControlRoute(input: { + readonly wsPath: string; + readonly controlOptions: VercelSpeechControlInput; +}): RouteDefinition { + // Per-connection coordinators, keyed by peer id. eve invokes the WS route + // handler per hook (upgrade/open/message run in separate closures), so + // connection state cannot live in the handler closure — it is keyed here on + // the stable `peer.id`, and the principal is recovered from `peer.request`. + const connections = new Map(); + const stateStore = input.controlOptions.stateStore ?? createInMemoryVoiceControlStateStore(); + + async function verifyPeer( + request: Request, + ): Promise<{ auth: SessionAuthContext; voiceSessionId: string } | undefined> { + let secret: string; + try { + secret = resolveControlSecret(input.controlOptions.secret, { + allowGatewayKeyFallback: input.controlOptions.allowGatewayKeyFallback, + }); + } catch { + return undefined; + } + const result = await verifyControlToken(readBearerToken(request.headers.get("authorization")), { + secret, + }); + return result.ok ? { auth: result.auth, voiceSessionId: result.voiceSessionId } : undefined; + } + + return WS(input.wsPath, (_req, args) => ({ + async upgrade(request) { + // Reject bad tokens at the handshake for a clean 401. + const verified = await verifyPeer(request); + if (verified === undefined) return new Response("Unauthorized", { status: 401 }); + return { headers: { "sec-websocket-protocol": EVE_VOICE_CONTROL_PROTOCOL } }; + }, + async open(peer) { + const verified = await verifyPeer(peer.request); + if (verified === undefined) { + peer.close(1011, "unverified"); + return; + } + const coordinatorOptions: { + -readonly [K in keyof VoiceTurnCoordinatorOptions]: VoiceTurnCoordinatorOptions[K]; + } = { + auth: verified.auth, + voiceSessionId: verified.voiceSessionId, + send: args.send, + sendRaw: (packet) => peer.send(packet), + stateStore, + closeSocket: (code, reason) => peer.close(code, reason), + }; + if (input.controlOptions.context !== undefined) { + coordinatorOptions.context = input.controlOptions.context; + } + if (input.controlOptions.settleMs !== undefined) { + coordinatorOptions.settleMs = input.controlOptions.settleMs; + } + const coordinator = new VoiceTurnCoordinator(coordinatorOptions); + connections.set(peer.id, coordinator); + coordinator.start(); + }, + message(peer, message) { + const event = parseControlPacket(message.text()); + if (event !== null) connections.get(peer.id)?.handle(event); + }, + close(peer) { + connections.get(peer.id)?.dispose(); + connections.delete(peer.id); + }, + error(peer) { + connections.get(peer.id)?.dispose(); + connections.delete(peer.id); + }, + })); +} + +function normalizeControlInput( + control: VercelSpeechChannelInput["control"], +): VercelSpeechControlInput | undefined { + if (control === undefined || control === false) return undefined; + if (control === true) return {}; + return control; +} + +function readBearerToken(header: string | null): string | undefined { + if (header === null) return undefined; + const match = /^Bearer\s+(.+)$/iu.exec(header.trim()); + return match?.[1]; +} + +function readOptionalString(value: unknown): string | undefined { + if (typeof value !== "string") return undefined; + const trimmed = value.trim(); + return trimmed.length > 0 ? trimmed : undefined; +} + +function normalizeBasePath(path: string): string { + const trimmed = path.trim().replace(/\/+$/u, ""); + if (!trimmed.startsWith("/") || trimmed.length === 0) { + throw new Error("vercelSpeechChannel basePath must start with `/`."); + } + return trimmed; +} + +function jsonNoStore(body: unknown): Response { + return Response.json(body, { + headers: { + "cache-control": "no-store", + }, + }); +} diff --git a/packages/eve/src/public/channels/vercel/voice-control-protocol.ts b/packages/eve/src/public/channels/vercel/voice-control-protocol.ts new file mode 100644 index 000000000..9d7fd6535 --- /dev/null +++ b/packages/eve/src/public/channels/vercel/voice-control-protocol.ts @@ -0,0 +1,17 @@ +/** + * Eve's vocabulary for the AI Gateway speech-engine control protocol. The wire + * contract (envelope, events, capabilities, codec) is defined once in + * `@ai-sdk/gateway` and shared with the Gateway so the two can't drift; this + * module just re-exports it under Eve-local names. Eve is the controller: it + * receives engine→controller events and sends controller→engine events. + */ +export { + DEFAULT_SPEECH_ENGINE_CAPABILITIES as DEFAULT_CONTROL_CAPABILITIES, + encodeSpeechEngineEvent as encodeControlPacket, + GATEWAY_SPEECH_ENGINE_SUBPROTOCOL as EVE_VOICE_CONTROL_PROTOCOL, + parseSpeechEngineServerEvent as parseControlPacket, + type SpeechEngineCapabilities as RealtimeControlCapabilities, + type SpeechEngineClientEvent as EveToGatewayEvent, + type SpeechEngineDescriptor as RealtimeControlEngine, + type SpeechEngineServerEvent as GatewayToEveEvent, +} from "@ai-sdk/gateway"; diff --git a/packages/eve/src/public/channels/vercel/voice-turn-coordinator.test.ts b/packages/eve/src/public/channels/vercel/voice-turn-coordinator.test.ts new file mode 100644 index 000000000..6bdfd398a --- /dev/null +++ b/packages/eve/src/public/channels/vercel/voice-turn-coordinator.test.ts @@ -0,0 +1,287 @@ +import { describe, expect, it, vi } from "vitest"; + +import type { SendFn } from "#channel/routes.js"; +import type { SessionAuthContext } from "#channel/types.js"; +import { + DEFAULT_CONTROL_CAPABILITIES, + type GatewayToEveEvent, + type RealtimeControlCapabilities, +} from "#public/channels/vercel/voice-control-protocol.js"; +import { + createInMemoryVoiceControlStateStore, + VoiceTurnCoordinator, + type VoiceControlStateStore, +} from "#public/channels/vercel/voice-turn-coordinator.js"; + +function sessionOpened( + overrides: Partial, +): Extract { + return { + type: "session.opened", + data: { + sessionId: "s1", + engine: { + provider: "openai", + model: "openai/gpt-realtime-2", + protocol: "ai-sdk", + capabilities: { ...DEFAULT_CONTROL_CAPABILITIES, ...overrides }, + }, + }, + }; +} + +const auth: SessionAuthContext = { + attributes: {}, + authenticator: "test", + principalId: "user-1", + principalType: "user", +}; + +function closedStream(events: readonly unknown[]): ReadableStream { + return new ReadableStream({ + start(controller) { + for (const event of events) controller.enqueue(event); + controller.close(); + }, + }); +} + +function sendReturning(events: readonly unknown[], id = "session-1") { + const impl: SendFn = async () => ({ + id, + continuationToken: "voice:ct", + getEventStream: async () => closedStream(events), + }); + return vi.fn(impl); +} + +function harness( + send: SendFn, + settleMs = 5, + options: { stateStore?: VoiceControlStateStore } = {}, +) { + const packets: Array<{ type: string; data: Record }> = []; + const coordinator = new VoiceTurnCoordinator({ + auth, + voiceSessionId: "voice-1", + send, + sendRaw: (packet) => packets.push(JSON.parse(packet)), + closeSocket: () => undefined, + ...options, + settleMs, + }); + return { coordinator, packets, types: () => packets.map((p) => p.type) }; +} + +const reply = (message: string, stepIndex = 0, finishReason = "stop") => ({ + type: "message.completed", + data: { finishReason, message, sequence: 1, stepIndex, turnId: "t1" }, +}); +const waiting = () => ({ type: "session.waiting", data: { wait: "next-user-message" } }); +const failed = () => ({ type: "session.failed", data: { code: "boom", message: "boom" } }); + +describe("VoiceTurnCoordinator", () => { + it("emits session.ready on start", async () => { + const { coordinator, types } = harness(sendReturning([])); + coordinator.start(); + await vi.waitFor(() => expect(types()).toEqual(["session.ready"])); + }); + + it("runs a durable turn and streams response.delta + response.done", async () => { + const send = sendReturning([reply("Hello there"), waiting()]); + const { coordinator, packets } = harness(send); + coordinator.start(); + + coordinator.handle({ type: "input.transcript.final", data: { text: "hi", itemId: "i1" } }); + + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.done")).toBe(true)); + expect(send).toHaveBeenCalledWith( + expect.objectContaining({ message: "hi" }), + expect.objectContaining({ auth, mode: "conversation" }), + ); + const delta = packets.find((p) => p.type === "response.delta"); + expect(delta?.data).toEqual({ text: "Hello there", turnId: "turn_1" }); + // turn.started / response.delta / response.done share the turn id so the + // Gateway can correlate frames and drop a superseded turn's frames by id. + expect(packets.find((p) => p.type === "turn.started")?.data.turnId).toBe("turn_1"); + expect(packets.find((p) => p.type === "response.done")?.data.turnId).toBe("turn_1"); + }); + + it("does not speak intermediate tool-call text", async () => { + const send = sendReturning([ + reply("Let me check that", 0, "tool-calls"), + reply("The weather is mild", 1, "stop"), + waiting(), + ]); + const { coordinator, packets } = harness(send); + coordinator.start(); + coordinator.handle({ + type: "input.transcript.final", + data: { text: "weather?", itemId: "i1" }, + }); + + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.done")).toBe(true)); + const deltas = packets.filter((p) => p.type === "response.delta").map((p) => p.data.text); + expect(deltas).toEqual(["The weather is mild"]); + }); + + it("ignores backchannel acknowledgements", async () => { + const send = sendReturning([reply("ok"), waiting()]); + const { coordinator } = harness(send); + coordinator.start(); + coordinator.handle({ type: "input.transcript.final", data: { text: "mm-hmm", itemId: "i1" } }); + + await new Promise((resolve) => setTimeout(resolve, 30)); + expect(send).not.toHaveBeenCalled(); + }); + + it("de-duplicates a repeated transcript itemId", async () => { + const send = sendReturning([reply("Hi"), waiting()]); + const { coordinator, packets } = harness(send); + coordinator.start(); + coordinator.handle({ type: "input.transcript.final", data: { text: "hi", itemId: "dup" } }); + coordinator.handle({ type: "input.transcript.final", data: { text: "hi", itemId: "dup" } }); + + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.done")).toBe(true)); + expect(send).toHaveBeenCalledTimes(1); + }); + + it("de-duplicates immediate repeated transcript text when itemId is missing", async () => { + const send = sendReturning([reply("Hi"), waiting()]); + const { coordinator, packets } = harness(send); + coordinator.start(); + coordinator.handle({ type: "input.transcript.final", data: { text: "hi" } }); + coordinator.handle({ type: "input.transcript.final", data: { text: "hi" } }); + + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.done")).toBe(true)); + expect(send).toHaveBeenCalledTimes(1); + }); + + it("persists continuation and stream cursor across control socket reconnects", async () => { + const stateStore = createInMemoryVoiceControlStateStore(); + const streamStarts: Array = []; + const send = vi.fn(async () => ({ + id: "session-1", + continuationToken: "voice:stable", + getEventStream: async (options) => { + streamStarts.push(options?.startIndex); + return closedStream([reply("Hi"), waiting()]); + }, + })); + + const first = harness(send, 5, { stateStore }); + first.coordinator.start(); + first.coordinator.handle({ type: "input.transcript.final", data: { text: "first" } }); + await vi.waitFor(() => expect(first.types()).toContain("response.done")); + + const second = harness(send, 5, { stateStore }); + second.coordinator.start(); + second.coordinator.handle({ type: "input.transcript.final", data: { text: "second" } }); + await vi.waitFor(() => expect(second.types()).toContain("response.done")); + + expect(streamStarts).toEqual([0, 2]); + expect(send.mock.calls[0]?.[1].continuationToken).toBe( + send.mock.calls[1]?.[1].continuationToken, + ); + }); + + it("emits error instead of response.done when the durable turn fails", async () => { + const send = sendReturning([failed()]); + const { coordinator, types } = harness(send); + coordinator.start(); + coordinator.handle({ type: "input.transcript.final", data: { text: "hi", itemId: "i1" } }); + + await vi.waitFor(() => expect(types()).toContain("error")); + expect(types()).not.toContain("response.done"); + }); + + it("clears pending transcript text on barge-in before the turn settles", async () => { + const send = sendReturning([reply("Hi"), waiting()]); + const { coordinator, packets } = harness(send); + coordinator.start(); + coordinator.handle({ type: "input.transcript.final", data: { text: "first", itemId: "i1" } }); + coordinator.handle({ type: "input.interrupted", data: {} }); + coordinator.handle({ type: "input.transcript.final", data: { text: "second", itemId: "i2" } }); + + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.done")).toBe(true)); + expect(send).toHaveBeenCalledWith( + expect.objectContaining({ message: "second" }), + expect.anything(), + ); + expect(send).toHaveBeenCalledTimes(1); + }); + + it("cancels an in-flight response on barge-in", async () => { + let controller: ReadableStreamDefaultController | undefined; + const impl: SendFn = async () => ({ + id: "session-1", + continuationToken: "voice:ct", + getEventStream: async () => + new ReadableStream({ + start(c) { + controller = c; + }, + }), + }); + const send = vi.fn(impl); + const { coordinator, packets, types } = harness(send); + coordinator.start(); + coordinator.handle({ + type: "input.transcript.final", + data: { text: "tell me a story", itemId: "i1" }, + }); + + // Wait for the turn to start and emit a delta (response in flight). + await vi.waitFor(() => expect(controller).toBeDefined()); + controller!.enqueue(reply("Once upon a")); + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.delta")).toBe(true)); + + coordinator.handle({ type: "input.interrupted", data: {} }); + + expect(types()).toContain("response.cancel"); + expect(types()).not.toContain("response.done"); + }); + + it("runs the turn but skips the spoken readout when output.audio is false", async () => { + const send = sendReturning([reply("Hello there"), waiting()]); + const { coordinator, packets } = harness(send); + coordinator.start(); + coordinator.handle(sessionOpened({ "output.audio": false })); + coordinator.handle({ type: "input.transcript.final", data: { text: "hi", itemId: "i1" } }); + + await vi.waitFor(() => expect(send).toHaveBeenCalled()); + await new Promise((resolve) => setTimeout(resolve, 20)); + expect(packets.some((p) => p.type === "response.delta")).toBe(false); + expect(packets.some((p) => p.type === "response.done")).toBe(true); + }); + + it("does not emit response.cancel on barge-in when output.cancel is false", async () => { + let controller: ReadableStreamDefaultController | undefined; + const impl: SendFn = async () => ({ + id: "session-1", + continuationToken: "voice:ct", + getEventStream: async () => + new ReadableStream({ + start(c) { + controller = c; + }, + }), + }); + const send = vi.fn(impl); + const { coordinator, packets, types } = harness(send); + coordinator.start(); + coordinator.handle(sessionOpened({ "output.cancel": false })); + coordinator.handle({ + type: "input.transcript.final", + data: { text: "tell me a story", itemId: "i1" }, + }); + + await vi.waitFor(() => expect(controller).toBeDefined()); + controller!.enqueue(reply("Once upon a")); + await vi.waitFor(() => expect(packets.some((p) => p.type === "response.delta")).toBe(true)); + + coordinator.handle({ type: "input.interrupted", data: {} }); + + expect(types()).not.toContain("response.cancel"); + }); +}); diff --git a/packages/eve/src/public/channels/vercel/voice-turn-coordinator.ts b/packages/eve/src/public/channels/vercel/voice-turn-coordinator.ts new file mode 100644 index 000000000..f86ce1b06 --- /dev/null +++ b/packages/eve/src/public/channels/vercel/voice-turn-coordinator.ts @@ -0,0 +1,393 @@ +import type { SendFn } from "#channel/routes.js"; +import type { Session } from "#channel/session.js"; +import type { SessionAuthContext } from "#channel/types.js"; +import { isCurrentTurnBoundaryEvent, isTurnFailureEvent } from "#protocol/message.js"; +import { + DEFAULT_CONTROL_CAPABILITIES, + encodeControlPacket, + type EveToGatewayEvent, + type GatewayToEveEvent, + type RealtimeControlCapabilities, +} from "#public/channels/vercel/voice-control-protocol.js"; + +/** Settle delay before a finalized transcript starts a turn; coalesces rapid finals. */ +const DEFAULT_SETTLE_MS = 220; +/** Bounds the dedupe set so a long session does not grow it without limit. */ +const MAX_TRACKED_ITEMS = 256; +/** Suppresses immediate duplicate transcript finals when Gateway omits item ids. */ +const TEXT_DEDUPE_MS = 10_000; + +/** Short acknowledgements that should not trigger a durable Eve turn. */ +const BACKCHANNELS = new Set([ + "ok", + "okay", + "yeah", + "yep", + "yup", + "uh huh", + "uh-huh", + "mhm", + "mm", + "mm-hmm", + "mmhmm", + "right", + "sure", + "got it", + "cool", + "nice", + "hmm", + "huh", + "okay cool", +]); + +export interface VoiceTurnCoordinatorOptions { + readonly auth: SessionAuthContext; + readonly voiceSessionId: string; + /** Channel `send`, used to run durable Eve turns. */ + readonly send: SendFn; + /** Sends a wire packet string to AI Gateway (the open WS peer). */ + readonly sendRaw: (packet: string) => void; + /** Closes the control socket with a code/reason (fail-closed). */ + readonly closeSocket: (code: number, reason: string) => void; + /** Optional durable context strings contributed on each turn. */ + readonly context?: readonly string[]; + /** Stores durable voice continuation/cursor state across control WS reconnects. */ + readonly stateStore?: VoiceControlStateStore; + /** Settle delay override (ms). */ + readonly settleMs?: number; +} + +export interface VoiceControlSessionState { + readonly continuationToken: string; + readonly sessionId?: string; + readonly streamIndex: number; +} + +export interface VoiceControlStateStore { + get( + key: string, + ): Promise | VoiceControlSessionState | undefined; + set(key: string, state: VoiceControlSessionState): Promise | void; +} + +export function createInMemoryVoiceControlStateStore(): VoiceControlStateStore { + const states = new Map(); + return { + get(key) { + return states.get(key); + }, + set(key, state) { + states.set(key, { ...state }); + }, + }; +} + +/** + * Drives durable Eve turns from the Gateway-owned realtime voice control socket. + * + * It receives finalized transcripts (and lifecycle/barge-in signals) from AI + * Gateway, debounces and de-duplicates them, runs one durable Eve turn per + * settled utterance via the channel `send`, and streams non-tool-call reply text + * back as `response.delta` / `response.done`. A user barge-in aborts the + * in-flight turn's relay and emits `response.cancel`. + * + * It degrades gracefully against the per-session capability hints the Gateway + * advertises in `session.opened.data.engine`: with `output.audio: false` it + * still runs the durable turn but skips the spoken readout; with + * `output.cancel: false` it aborts the local relay on barge-in without emitting + * `response.cancel`; it only consumes final transcripts; and it only reacts to + * an actual `input.interrupted` / `input.speech.started`, so it never promises + * barge-in the provider cannot honor. + */ +export class VoiceTurnCoordinator { + readonly #options: VoiceTurnCoordinatorOptions; + readonly #settleMs: number; + readonly #stateKey: string; + readonly #stateReady: Promise; + readonly #processedItemIds = new Set(); + readonly #processedTextFingerprints = new Map(); + + #seq = 0; + #turnSeq = 0; + #disposed = false; + #continuationToken: string; + #lastSessionId: string | undefined; + #streamIndex = 0; + + #pendingText = ""; + #settleTimer: ReturnType | undefined; + #queue: Promise = Promise.resolve(); + #activeTurn: + | { readonly abort: AbortController; cancelStream: () => void; readonly turnId: string } + | undefined; + #responseInFlight = false; + #capabilities: RealtimeControlCapabilities = DEFAULT_CONTROL_CAPABILITIES; + + constructor(options: VoiceTurnCoordinatorOptions) { + this.#options = options; + this.#settleMs = options.settleMs ?? DEFAULT_SETTLE_MS; + this.#stateKey = createStateKey(options.auth, options.voiceSessionId); + this.#continuationToken = createStableContinuationToken(options.auth, options.voiceSessionId); + this.#stateReady = this.#hydrateState(); + } + + /** Signals readiness so Gateway clears its ready timeout. */ + start(): void { + void this.#stateReady.then( + () => this.#emit({ type: "session.ready" }), + () => this.#fail("state_load_failed"), + ); + } + + /** Routes one inbound Gateway→Eve control event. */ + handle(event: GatewayToEveEvent): void { + if (this.#disposed) return; + switch (event.type) { + case "session.opened": + if (event.data.engine !== undefined) this.#capabilities = event.data.engine.capabilities; + return; + case "input.transcript.final": + this.#onTranscriptFinal(event.data.text, event.data.itemId); + return; + case "input.speech.started": + case "input.interrupted": + this.#bargeIn(); + return; + case "session.closed": + this.dispose(); + return; + case "error": + this.dispose(); + return; + // input.speech.stopped / session.stats — no action. + default: + return; + } + } + + /** Tears down timers and aborts any in-flight turn. */ + dispose(): void { + if (this.#disposed) return; + this.#disposed = true; + this.#clearSettle(); + this.#activeTurn?.abort.abort(); + this.#activeTurn?.cancelStream(); + this.#activeTurn = undefined; + } + + #onTranscriptFinal(rawText: string, itemId?: string): void { + const text = rawText.trim(); + if (text.length === 0) return; + if (itemId !== undefined) { + if (this.#processedItemIds.has(itemId)) return; + this.#processedItemIds.add(itemId); + if (this.#processedItemIds.size > MAX_TRACKED_ITEMS) { + const oldest = this.#processedItemIds.values().next().value; + if (oldest !== undefined) this.#processedItemIds.delete(oldest); + } + } + if (isBackchannel(text)) return; + if (itemId === undefined && this.#isDuplicateText(text)) return; + + this.#pendingText = this.#pendingText.length > 0 ? `${this.#pendingText} ${text}` : text; + this.#clearSettle(); + this.#settleTimer = setTimeout(() => this.#flushPending(), this.#settleMs); + } + + #flushPending(): void { + this.#settleTimer = undefined; + const message = this.#pendingText; + this.#pendingText = ""; + if (message.length === 0 || this.#disposed) return; + this.#queue = this.#queue.catch(() => undefined).then(() => this.#runTurn(message)); + } + + #bargeIn(): void { + this.#clearSettle(); + this.#pendingText = ""; + const cancelledTurnId = this.#activeTurn?.turnId; + const hadResponse = this.#responseInFlight || this.#activeTurn !== undefined; + this.#activeTurn?.abort.abort(); + if (hadResponse) { + // Skip the cancel frame when the engine can't act on it; the local relay + // is already aborted above either way. The durable stream is still drained + // to its boundary so the next turn cannot race the same continuation. + if (this.#capabilities["output.cancel"] && cancelledTurnId !== undefined) { + this.#emit({ type: "response.cancel", data: { turnId: cancelledTurnId } }); + } + this.#responseInFlight = false; + } + } + + async #runTurn(message: string): Promise { + if (this.#disposed) return; + await this.#stateReady; + const abort = new AbortController(); + const turnId = `turn_${(this.#turnSeq += 1)}`; + const turn = { abort, cancelStream: () => undefined as void, turnId }; + this.#activeTurn = turn; + let session: Session | undefined; + let consumed = 0; + let failed = false; + + try { + this.#emit({ type: "turn.started", data: { turnId } }); + const payload: { message: string; context?: readonly string[] } = { message }; + if (this.#options.context !== undefined) payload.context = this.#options.context; + session = await this.#options.send(payload, { + auth: this.#options.auth, + continuationToken: this.#continuationToken, + mode: "conversation", + }); + + const startIndex = this.#lastSessionId === session.id ? this.#streamIndex : 0; + const stream = await session.getEventStream({ startIndex }); + const reader = stream.getReader(); + turn.cancelStream = () => { + void reader.cancel().catch(() => undefined); + }; + + const partials = new Map(); + try { + while (!this.#disposed) { + const { done, value } = await reader.read(); + if (done) break; + consumed += 1; + const event = value; + if (isTurnFailureEvent(event)) { + failed = true; + break; + } + if (event.type === "message.appended") { + partials.set( + event.data.stepIndex, + (partials.get(event.data.stepIndex) ?? "") + event.data.messageDelta, + ); + } else if (event.type === "message.completed") { + if (event.data.finishReason === "tool-calls") { + partials.delete(event.data.stepIndex); + continue; + } + const text = (partials.get(event.data.stepIndex) || event.data.message || "").trim(); + partials.delete(event.data.stepIndex); + // The durable turn always runs; only stream the spoken readout when + // the engine can actually speak it (`output.audio`). + if (text.length > 0 && !abort.signal.aborted && this.#capabilities["output.audio"]) { + this.#emit({ type: "response.delta", data: { text, turnId } }); + this.#responseInFlight = true; + } + } else if (isCurrentTurnBoundaryEvent(event)) { + break; + } + } + } finally { + try { + await reader.cancel(); + } catch { + // Best effort. + } + } + + await this.#persistState(session, startIndex + consumed); + + if (failed) { + this.#responseInFlight = false; + if (!abort.signal.aborted && !this.#disposed) { + this.#emit({ type: "error", data: { message: "turn_failed" } }); + } + return; + } + + if (!abort.signal.aborted && !this.#disposed) { + this.#emit({ type: "response.done", data: { turnId } }); + this.#responseInFlight = false; + } + } catch { + this.#responseInFlight = false; + if (!abort.signal.aborted && !this.#disposed) { + this.#emit({ type: "error", data: { message: "turn_failed" } }); + } + } finally { + if (this.#activeTurn === turn) this.#activeTurn = undefined; + } + } + + async #hydrateState(): Promise { + const state = await this.#options.stateStore?.get(this.#stateKey); + if (state === undefined) return; + if (state.continuationToken.length > 0) this.#continuationToken = state.continuationToken; + this.#lastSessionId = state.sessionId; + this.#streamIndex = Math.max(0, Math.floor(state.streamIndex)); + } + + async #persistState(session: Session, streamIndex: number): Promise { + this.#lastSessionId = session.id; + this.#streamIndex = streamIndex; + await this.#options.stateStore?.set(this.#stateKey, { + continuationToken: this.#continuationToken, + sessionId: this.#lastSessionId, + streamIndex: this.#streamIndex, + }); + } + + #isDuplicateText(text: string): boolean { + const now = Date.now(); + for (const [fingerprint, seenAt] of this.#processedTextFingerprints) { + if (now - seenAt > TEXT_DEDUPE_MS) this.#processedTextFingerprints.delete(fingerprint); + } + const fingerprint = textFingerprint(text); + if (fingerprint.length === 0) return false; + if (this.#processedTextFingerprints.has(fingerprint)) return true; + this.#processedTextFingerprints.set(fingerprint, now); + if (this.#processedTextFingerprints.size > MAX_TRACKED_ITEMS) { + const oldest = this.#processedTextFingerprints.keys().next().value; + if (oldest !== undefined) this.#processedTextFingerprints.delete(oldest); + } + return false; + } + + #fail(message: string): void { + if (!this.#disposed) this.#emit({ type: "error", data: { message } }); + this.#options.closeSocket(1011, message); + this.dispose(); + } + + #emit(event: EveToGatewayEvent): void { + if (this.#disposed && event.type !== "error") return; + this.#seq += 1; + this.#options.sendRaw(encodeControlPacket(this.#seq, event)); + } + + #clearSettle(): void { + if (this.#settleTimer !== undefined) { + clearTimeout(this.#settleTimer); + this.#settleTimer = undefined; + } + } +} + +function isBackchannel(text: string): boolean { + const normalized = text + .toLowerCase() + .replace(/[.!?,]+$/u, "") + .trim(); + return BACKCHANNELS.has(normalized); +} + +function createStateKey(auth: SessionAuthContext, voiceSessionId: string): string { + return ["voice-control", auth.authenticator, auth.principalType, auth.principalId, voiceSessionId] + .map(encodeStateKeyPart) + .join(":"); +} + +function createStableContinuationToken(auth: SessionAuthContext, voiceSessionId: string): string { + return createStateKey(auth, voiceSessionId); +} + +function encodeStateKeyPart(value: string): string { + return encodeURIComponent(value); +} + +function textFingerprint(text: string): string { + return text.toLowerCase().replace(/\s+/gu, " ").trim(); +} diff --git a/packages/eve/src/react/voice.test.ts b/packages/eve/src/react/voice.test.ts new file mode 100644 index 000000000..fdcd739c9 --- /dev/null +++ b/packages/eve/src/react/voice.test.ts @@ -0,0 +1,526 @@ +import { createElement } from "react"; +import { act, create } from "react-test-renderer"; +import { afterEach, describe, expect, it, vi } from "vitest"; + +(globalThis as { IS_REACT_ACT_ENVIRONMENT?: boolean }).IS_REACT_ACT_ENVIRONMENT = true; + +const realtimeOptions: any[] = []; + +const realtimeState = { + cancelResponse: vi.fn(), + connect: vi.fn(async () => undefined), + disconnect: vi.fn(), + events: [], + isCapturing: false, + isPlaying: false, + messages: [], + requestResponse: vi.fn(), + sendEvent: vi.fn(), + startAudioCapture: vi.fn(), + status: "disconnected", + stopAudioCapture: vi.fn(), + stopPlayback: vi.fn(), +}; + +vi.mock("@ai-sdk/react", () => ({ + experimental_useRealtime: (options: unknown) => { + realtimeOptions.push(options); + return realtimeState; + }, +})); + +vi.mock("ai", () => ({ + __esModule: true, +})); + +afterEach(() => { + realtimeOptions.length = 0; + vi.clearAllMocks(); + vi.unstubAllGlobals(); +}); + +const SESSION_ID_HEADER = "x-eve-session-id"; + +function completedMessageEvent(message: string) { + return { + type: "message.completed", + data: { finishReason: "stop", message, sequence: 1, stepIndex: 0, turnId: "turn-1" }, + }; +} + +function ndjsonResponse(events: readonly unknown[]): Response { + const body = events.map((event) => JSON.stringify(event)).join("\n") + "\n"; + return new Response(body, { + status: 200, + headers: { "content-type": "application/x-ndjson" }, + }); +} + +/** + * Mocks the durable session API the voice hook now drives: a create/continue + * POST that acknowledges immediately, followed by an NDJSON event stream. + * Each entry in `turns` supplies the events for one turn, in order. + */ +function sessionFetchMock(turns: ReadonlyArray<{ sessionId: string; events: readonly unknown[] }>) { + let streamedTurns = 0; + return vi.fn(async (input: string | URL, init?: RequestInit) => { + const url = typeof input === "string" ? input : input.toString(); + const method = (init?.method ?? "GET").toUpperCase(); + const turn = turns[Math.min(streamedTurns, turns.length - 1)]!; + + if (method === "POST" && /\/eve\/v1\/session(\/[^/]+)?$/.test(url)) { + return Response.json( + { ok: true, sessionId: turn.sessionId, continuationToken: "eve:token" }, + { status: 202, headers: { [SESSION_ID_HEADER]: turn.sessionId } }, + ); + } + if (method === "GET" && /\/stream(\?|$)/.test(url)) { + const response = ndjsonResponse(turn.events); + streamedTurns += 1; + return response; + } + throw new Error(`Unexpected fetch: ${method} ${url}`); + }); +} + +function postCalls(fetch: ReturnType): unknown[][] { + return fetch.mock.calls.filter( + (call) => ((call[1] as RequestInit | undefined)?.method ?? "GET").toUpperCase() === "POST", + ); +} + +function streamCalls(fetch: ReturnType): unknown[][] { + return fetch.mock.calls.filter((call) => /\/stream(\?|$)/.test(String(call[0]))); +} + +describe("useEveVoice", () => { + it("configures realtime with a stable voice session setup URL", async () => { + const { useEveVoice } = await import("#react/voice.js"); + + function TestComponent() { + useEveVoice({ voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + expect(realtimeOptions).toHaveLength(1); + expect(realtimeOptions[0].api.token).toBe( + "/eve/v1/realtime-speech/setup?voiceSessionId=voice-1", + ); + expect(realtimeOptions[0].model).toMatchObject({ + modelId: "openai/gpt-realtime-2", + provider: "gateway.realtime", + specificationVersion: "v4", + }); + expect(realtimeOptions[0].sessionConfig.outputModalities).toEqual(["audio"]); + expect( + realtimeOptions[0].model.getWebSocketConfig({ token: "vcst_test", url: "wss://gateway" }), + ).toEqual({ + protocols: ["ai-gateway-realtime.v1", "ai-gateway-auth.vcst_test"], + url: "wss://gateway", + }); + }); + + it("bridges finalized transcription into durable session turns and speaks the reply", async () => { + const fetch = sessionFetchMock([ + { sessionId: "session-1", events: [completedMessageEvent("Agent reply"), waiting()] }, + { sessionId: "session-1", events: [completedMessageEvent("Second reply"), waiting()] }, + ]); + vi.stubGlobal("fetch", fetch); + + const { useEveVoice } = await import("#react/voice.js"); + const onReply = vi.fn(); + + function TestComponent() { + useEveVoice({ context: ["voice context"], onReply, voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + realtimeOptions[0].onEvent({ + itemId: "item-1", + raw: {}, + transcript: "Hello over speech", + type: "input-transcription-completed", + }); + + await vi.waitFor(() => expect(onReply).toHaveBeenCalled()); + + // First turn creates the session and consumes its event stream. + const firstPost = postCalls(fetch)[0]!; + expect(String(firstPost[0])).toBe("/eve/v1/session"); + expect(JSON.parse((firstPost[1] as RequestInit).body as string)).toEqual({ + message: "Hello over speech", + clientContext: ["voice context"], + }); + + expect(onReply).toHaveBeenCalledWith({ + message: "Hello over speech", + sessionId: "session-1", + streamIndex: 2, + text: "Agent reply", + }); + expect(realtimeState.sendEvent).toHaveBeenCalledWith({ + type: "conversation-item-create", + item: { type: "text-message", role: "user", text: "EVE_SPEAK:\nAgent reply" }, + }); + expect(realtimeState.requestResponse).toHaveBeenCalledWith({ modalities: ["audio"] }); + + // Second turn continues the same session and resumes the stream cursor. + realtimeOptions[0].onEvent({ + itemId: "item-2", + raw: {}, + transcript: "Second message", + type: "input-transcription-completed", + }); + + await vi.waitFor(() => expect(postCalls(fetch)).toHaveLength(2)); + + const secondPost = postCalls(fetch)[1]!; + expect(String(secondPost[0])).toBe("/eve/v1/session/session-1"); + const secondStream = streamCalls(fetch).at(-1)!; + expect(String(secondStream[0])).toContain("startIndex=2"); + }); + + it("speaks the configured fallback when a turn fails without producing text", async () => { + const fetch = sessionFetchMock([{ sessionId: "session-1", events: [sessionFailed()] }]); + vi.stubGlobal("fetch", fetch); + + const { useEveVoice } = await import("#react/voice.js"); + const onReply = vi.fn(); + + function TestComponent() { + useEveVoice({ + fallbackReply: "Sorry, please try again.", + onReply, + voiceSessionId: "voice-1", + }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + realtimeOptions[0].onEvent({ + itemId: "item-1", + raw: {}, + transcript: "Hello", + type: "input-transcription-completed", + }); + + await vi.waitFor(() => + expect(realtimeState.sendEvent).toHaveBeenCalledWith({ + type: "conversation-item-create", + item: { type: "text-message", role: "user", text: "EVE_SPEAK:\nSorry, please try again." }, + }), + ); + expect(onReply).not.toHaveBeenCalled(); + }); + + it("ignores unsolicited model responses", async () => { + const { useEveVoice } = await import("#react/voice.js"); + + function TestComponent() { + useEveVoice({ voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + realtimeOptions[0].onEvent({ raw: {}, responseId: "response-1", type: "response-created" }); + + expect(realtimeState.cancelResponse).not.toHaveBeenCalled(); + expect(realtimeState.requestResponse).not.toHaveBeenCalled(); + }); + + it("does not suppress finalized user transcripts after an adapter response starts", async () => { + const fetch = sessionFetchMock([ + { sessionId: "session-1", events: [completedMessageEvent("Agent reply"), waiting()] }, + ]); + vi.stubGlobal("fetch", fetch); + const { useEveVoice } = await import("#react/voice.js"); + const onReply = vi.fn(); + + function TestComponent() { + useEveVoice({ onReply, voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + // Some realtime adapters create a placeholder response before the final + // user transcript arrives. That must not drop the user's durable Eve turn. + realtimeOptions[0].onEvent({ raw: {}, responseId: "auto-1", type: "response-created" }); + realtimeOptions[0].onEvent({ + itemId: "item-1", + raw: {}, + transcript: "Hello from the user", + type: "input-transcription-completed", + }); + + await vi.waitFor(() => expect(onReply).toHaveBeenCalled()); + expect(postCalls(fetch)).toHaveLength(1); + }); + + it("passes each transcript's own itemId to onTranscript", async () => { + const { useEveVoice } = await import("#react/voice.js"); + const seen: Array<{ itemId: string; transcript: string }> = []; + + function TestComponent() { + useEveVoice({ + voiceSessionId: "voice-1", + onTranscript: ({ itemId, transcript }) => { + seen.push({ itemId, transcript }); + }, + }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + // Both finalize before the serialized turn queue drains; each turn must + // report the itemId captured at enqueue time, not the latest one. + realtimeOptions[0].onEvent({ + itemId: "item-1", + raw: {}, + transcript: "first", + type: "input-transcription-completed", + }); + realtimeOptions[0].onEvent({ + itemId: "item-2", + raw: {}, + transcript: "second", + type: "input-transcription-completed", + }); + + await vi.waitFor(() => expect(seen).toHaveLength(2)); + expect(seen).toEqual([ + { itemId: "item-1", transcript: "first" }, + { itemId: "item-2", transcript: "second" }, + ]); + }); + + it("suppresses transcriptions that arrive while the Eve reply is speaking", async () => { + const fetch = sessionFetchMock([ + { sessionId: "session-1", events: [completedMessageEvent("Agent reply"), waiting()] }, + ]); + vi.stubGlobal("fetch", fetch); + const { useEveVoice } = await import("#react/voice.js"); + + function TestComponent() { + useEveVoice({ voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + realtimeOptions[0].onEvent({ + itemId: "item-1", + raw: {}, + transcript: "First utterance", + type: "input-transcription-completed", + }); + await vi.waitFor(() => expect(postCalls(fetch)).toHaveLength(1)); + + realtimeOptions[0].onEvent({ raw: {}, responseId: "response-1", type: "response-created" }); + realtimeOptions[0].onEvent({ + itemId: "item-2", + raw: {}, + transcript: "Agent reply", + type: "input-transcription-completed", + }); + + expect(postCalls(fetch)).toHaveLength(1); + }); + + it("does not run client turns in gateway-control mode", async () => { + const fetch = vi.fn(); + vi.stubGlobal("fetch", fetch); + const { useEveVoice } = await import("#react/voice.js"); + + function TestComponent() { + useEveVoice({ voiceSessionId: "voice-1", controlMode: true }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + // Gateway drives turns over its control socket; the browser only streams + // audio, so a finalized transcript must not start a client-side turn. + realtimeOptions[0].onEvent({ + itemId: "item-1", + raw: {}, + transcript: "hello", + type: "input-transcription-completed", + }); + + expect(fetch).not.toHaveBeenCalled(); + }); + + it("caps the live voice transcript and keeps the most recent messages", async () => { + const { useEveVoice } = await import("#react/voice.js"); + + let latest: ReturnType | undefined; + function TestComponent() { + latest = useEveVoice({ voiceSessionId: "voice-1", controlMode: true }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + await act(async () => { + for (let i = 0; i < 300; i++) { + realtimeOptions.at(-1).onEvent({ + itemId: `item-${i}`, + raw: {}, + transcript: `reply ${i}`, + type: "input-transcription-completed", + }); + } + }); + + await vi.waitFor(() => expect(latest?.messages).toHaveLength(256)); + expect(latest?.messages.at(-1)?.text).toBe("reply 299"); + expect(latest?.messages[0]?.text).toBe("reply 44"); + }); + + it("ignores empty transcription completions", async () => { + const fetch = vi.fn(); + vi.stubGlobal("fetch", fetch); + const { useEveVoice } = await import("#react/voice.js"); + + function TestComponent() { + useEveVoice({ voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + realtimeOptions[0].onEvent({ + itemId: "empty-item", + raw: {}, + transcript: " ", + type: "input-transcription-completed", + }); + + expect(fetch).not.toHaveBeenCalled(); + }); + + it("releases the microphone and skips capture when the realtime connection fails", async () => { + const stop = vi.fn(); + const getUserMedia = vi.fn(async () => ({ getTracks: () => [{ stop }] })); + vi.stubGlobal("navigator", { mediaDevices: { getUserMedia } }); + realtimeState.connect.mockImplementationOnce(async () => { + realtimeOptions[0].onError(new Error("realtime offline")); + }); + + const { useEveVoice } = await import("#react/voice.js"); + const onError = vi.fn(); + let voice: ReturnType | undefined; + function TestComponent() { + voice = useEveVoice({ onError, voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + await act(async () => { + await voice!.start(); + }); + + expect(getUserMedia).toHaveBeenCalledTimes(1); + expect(stop).toHaveBeenCalledTimes(1); + expect(realtimeState.startAudioCapture).not.toHaveBeenCalled(); + expect(onError).toHaveBeenCalledTimes(1); + expect(onError).toHaveBeenCalledWith(expect.objectContaining({ message: "realtime offline" })); + }); + + it("ignores re-entrant start() calls while a connection is in flight", async () => { + const stop = vi.fn(); + const getUserMedia = vi.fn(async () => ({ getTracks: () => [{ stop }] })); + vi.stubGlobal("navigator", { mediaDevices: { getUserMedia } }); + + const { useEveVoice } = await import("#react/voice.js"); + let voice: ReturnType | undefined; + function TestComponent() { + voice = useEveVoice({ voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + // The second call is synchronous, before the first start() resolves its + // microphone request, so the re-entrancy guard must short-circuit it. + await act(async () => { + await Promise.all([voice!.start(), voice!.start()]); + }); + + expect(getUserMedia).toHaveBeenCalledTimes(1); + expect(realtimeState.connect).toHaveBeenCalledTimes(1); + expect(realtimeState.startAudioCapture).toHaveBeenCalledTimes(1); + }); + + it("releases the microphone when a realtime error surfaces after connecting", async () => { + const stop = vi.fn(); + const getUserMedia = vi.fn(async () => ({ getTracks: () => [{ stop }] })); + vi.stubGlobal("navigator", { mediaDevices: { getUserMedia } }); + + const { useEveVoice } = await import("#react/voice.js"); + let voice: ReturnType | undefined; + function TestComponent() { + voice = useEveVoice({ voiceSessionId: "voice-1" }); + return null; + } + + act(() => { + create(createElement(TestComponent)); + }); + + await act(async () => { + await voice!.start(); + }); + + expect(realtimeState.startAudioCapture).toHaveBeenCalledTimes(1); + expect(stop).not.toHaveBeenCalled(); + + act(() => { + realtimeOptions[0].onError(new Error("socket dropped")); + }); + + expect(realtimeState.stopAudioCapture).toHaveBeenCalled(); + expect(stop).toHaveBeenCalledTimes(1); + }); +}); + +function waiting() { + return { type: "session.waiting", data: { wait: "next-user-message" } }; +} + +function sessionFailed() { + return { type: "session.failed", data: { reason: "boom" } }; +} diff --git a/packages/eve/src/react/voice.ts b/packages/eve/src/react/voice.ts new file mode 100644 index 000000000..dfcf0046a --- /dev/null +++ b/packages/eve/src/react/voice.ts @@ -0,0 +1,792 @@ +"use client"; + +import { experimental_useRealtime } from "@ai-sdk/react"; +import { Client } from "#client/client.js"; +import type { ClientSession } from "#client/session.js"; +import type { ClientAuth, HeadersValue } from "#client/types.js"; +import { EVE_VOICE_SETUP_ROUTE_PATH, voiceSetupUrl } from "#client/voice.js"; +import type { + Experimental_RealtimeClientEvent, + Experimental_RealtimeModel, + Experimental_RealtimeServerEvent, + Experimental_RealtimeSessionConfig, +} from "ai"; +import { useCallback, useEffect, useMemo, useRef, useState } from "react"; + +const DEFAULT_MODEL = "openai/gpt-realtime-2"; +const GATEWAY_REALTIME_SUBPROTOCOL = "ai-gateway-realtime.v1"; +const GATEWAY_AUTH_SUBPROTOCOL_PREFIX = "ai-gateway-auth."; +const EVE_SPEAK_PREFIX = "EVE_SPEAK:"; +const ECHO_SUPPRESSION_MS = 900; +// Bounds the deduplication set so a long-lived session does not grow it without +// limit; finalized transcription item ids are only revisited within a few turns. +const MAX_TRACKED_INPUT_ITEMS = 256; +// Bounds the rendered voice transcript so a long-lived session does not grow the +// messages array without limit; mirrors MAX_TRACKED_INPUT_ITEMS. +const MAX_VOICE_MESSAGES = 256; + +type StoppableMediaStream = { + getTracks(): readonly { stop(): void }[]; +}; + +export interface UseEveVoiceOptions { + readonly context?: string | readonly string[]; + readonly model?: string; + readonly sessionConfig?: EveVoiceConfig; + readonly setupUrl?: string; + readonly voiceSessionId?: string; + /** + * Gateway-owned control mode (A-lite). When true, AI Gateway drives durable + * turns over its server-side control socket, so the browser only streams + * audio: the hook does not run client-side `/eve/v1/session` turns, call + * `onTranscript`, or speak replies (Gateway injects TTS). Use this when the + * channel is configured with `control`. + */ + readonly controlMode?: boolean; + /** + * Spoken when a turn fails before producing any assistant text. Off by + * default: a failed turn surfaces through `onError`/`status` instead of + * speaking a canned line, and a fallback is never spoken when the turn + * already produced a usable reply. + */ + readonly fallbackReply?: string; + /** + * Run voice turns against an existing session/client instead of an internal + * one. Pass `session` to share a `ClientSession`, `client` to reuse an + * authenticated `Client`, or `host`/`auth`/`headers` to configure the + * internal client. Read once when the hook first renders; remount to change. + */ + readonly client?: Client; + readonly session?: ClientSession; + readonly host?: string; + readonly auth?: ClientAuth; + readonly headers?: HeadersValue; + readonly onError?: (error: Error) => void; + readonly onEvent?: (event: EveVoiceEvent) => void; + readonly onTranscript?: (input: { + readonly itemId: string; + readonly transcript: string; + readonly voiceSessionId: string; + }) => Promise | string | void; + readonly onReply?: (reply: { + readonly message: string; + readonly sessionId: string; + readonly streamIndex: number; + readonly text: string; + }) => void; +} + +export type EveVoiceActivity = + | "ready" + | "connecting" + | "listening" + | "user-speaking" + | "assistant-speaking" + | "error"; + +export type EveVoiceStatus = "disconnected" | "connecting" | "connected" | "error"; + +/** + * One rendered turn in the live voice transcript. Works in both modes: in + * Gateway-control mode it is built from the realtime transcript events the + * browser receives (user `input-transcription-completed`, assistant + * `audio-transcript-delta`); in client-driven mode it mirrors the durable turn + * the hook runs. Lets apps render the conversation without reaching into raw + * provider event shapes. + */ +export interface EveVoiceMessage { + readonly id: string; + readonly role: "user" | "assistant"; + readonly text: string; +} + +// Keeps the most recent messages; new entries are appended, so the oldest are +// evicted from the front and a streaming (newest) assistant message is never +// dropped mid-stream. +function capVoiceMessages(messages: readonly EveVoiceMessage[]): readonly EveVoiceMessage[] { + return messages.length > MAX_VOICE_MESSAGES + ? messages.slice(messages.length - MAX_VOICE_MESSAGES) + : messages; +} + +function upsertUserVoiceMessage( + messages: readonly EveVoiceMessage[], + id: string, + text: string, +): readonly EveVoiceMessage[] { + if (text.length === 0 || messages.some((message) => message.id === id)) return messages; + return capVoiceMessages([...messages, { id, role: "user", text }]); +} + +function appendAssistantVoiceDelta( + messages: readonly EveVoiceMessage[], + id: string, + delta: string, +): readonly EveVoiceMessage[] { + const index = messages.findIndex((message) => message.id === id); + const existing = index === -1 ? undefined : messages[index]; + if (existing === undefined) { + return capVoiceMessages([...messages, { id, role: "assistant", text: delta }]); + } + const next = messages.slice(); + next[index] = { id, role: "assistant", text: existing.text + delta }; + return next; +} + +function setAssistantVoiceText( + messages: readonly EveVoiceMessage[], + id: string, + text: string, +): readonly EveVoiceMessage[] { + const index = messages.findIndex((message) => message.id === id); + if (index === -1) return capVoiceMessages([...messages, { id, role: "assistant", text }]); + const next = messages.slice(); + next[index] = { id, role: "assistant", text }; + return next; +} + +export interface EveVoiceConfig { + readonly instructions?: string; + readonly inputAudioTranscription?: { + readonly language?: string; + readonly model?: string; + readonly prompt?: string; + }; + readonly outputAudioTranscription?: { + readonly language?: string; + readonly model?: string; + readonly prompt?: string; + }; + readonly outputAudioFormat?: { + readonly rate?: number; + readonly type: string; + }; + readonly outputModalities?: ("audio" | "text")[]; + readonly providerOptions?: Record; + readonly turnDetection?: { + readonly prefixPaddingMs?: number; + readonly silenceDurationMs?: number; + readonly threshold?: number; + readonly type: "disabled" | "semantic-vad" | "server-vad"; + } | null; + readonly voice?: string; +} + +export type EveVoiceEvent = + | { readonly raw: unknown; readonly sessionId?: string; readonly type: "session-created" } + | { readonly raw: unknown; readonly type: "session-updated" } + | { readonly itemId?: string; readonly raw: unknown; readonly type: "speech-started" } + | { readonly itemId?: string; readonly raw: unknown; readonly type: "speech-stopped" } + | { + readonly itemId?: string; + readonly previousItemId?: string; + readonly raw: unknown; + readonly type: "audio-committed"; + } + | { + readonly item: unknown; + readonly itemId: string; + readonly raw: unknown; + readonly type: "conversation-item-added"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly transcript: string; + readonly type: "input-transcription-completed"; + } + | { readonly raw: unknown; readonly responseId: string; readonly type: "response-created" } + | { + readonly raw: unknown; + readonly responseId: string; + readonly status: string; + readonly type: "response-done"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "output-item-added"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "output-item-done"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "content-part-added"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "content-part-done"; + } + | { + readonly delta: string; + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "audio-delta"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "audio-done"; + } + | { + readonly delta: string; + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "audio-transcript-delta"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly transcript?: string; + readonly type: "audio-transcript-done"; + } + | { + readonly delta: string; + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "text-delta"; + } + | { + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly text?: string; + readonly type: "text-done"; + } + | { + readonly callId: string; + readonly delta: string; + readonly itemId: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "function-call-arguments-delta"; + } + | { + readonly arguments: string; + readonly callId: string; + readonly itemId: string; + readonly name: string; + readonly raw: unknown; + readonly responseId: string; + readonly type: "function-call-arguments-done"; + } + | { + readonly code?: string; + readonly message: string; + readonly raw: unknown; + readonly type: "error"; + } + | { readonly raw: unknown; readonly rawType: string; readonly type: "custom" }; + +export interface UseEveVoiceResult { + readonly error: Error | undefined; + readonly activity: EveVoiceActivity; + readonly isCapturing: boolean; + readonly isPlaying: boolean; + readonly isUserSpeaking: boolean; + readonly lastReply: string | undefined; + /** Live voice transcript for both sides; render this instead of raw events. */ + readonly messages: readonly EveVoiceMessage[]; + /** True between a finalized user transcript and the assistant's first words. */ + readonly isThinking: boolean; + readonly sessionId: string | undefined; + readonly speak: (text: string) => void; + readonly status: EveVoiceStatus; + readonly stopPlayback: () => void; + readonly streamIndex: number; + readonly voiceSessionId: string; + start(): Promise; + stop(): void; +} + +export function useEveVoice(options: UseEveVoiceOptions = {}): UseEveVoiceResult { + const voiceSessionIdRef = useRef(options.voiceSessionId ?? crypto.randomUUID()); + const voiceSessionId = voiceSessionIdRef.current; + const sessionRef = useRef(undefined); + if (sessionRef.current === undefined) { + sessionRef.current = resolveVoiceSession(options); + } + const session = sessionRef.current; + const [error, setError] = useState(undefined); + const [isUserSpeaking, setIsUserSpeaking] = useState(false); + const [lastReply, setLastReply] = useState(undefined); + const [messages, setMessages] = useState([]); + const [isThinking, setIsThinking] = useState(false); + const [sessionId, setSessionId] = useState(session.state.sessionId); + const [streamIndex, setStreamIndex] = useState(session.state.streamIndex); + const ignoreInputUntilRef = useRef(0); + const processedInputItemsRef = useRef(new Set()); + const requestResponseRef = useRef<((options?: { modalities?: string[] }) => void) | undefined>( + undefined, + ); + const responseInFlightRef = useRef(false); + const mediaStreamRef = useRef(null); + const lastErrorRef = useRef(undefined); + const startingRef = useRef(false); + const stopAudioCaptureRef = useRef<(() => void) | undefined>(undefined); + + const model = useMemo(() => resolveRealtimeModel(options.model), [options.model]); + const setupUrl = useMemo( + () => voiceSetupUrl(options.setupUrl ?? EVE_VOICE_SETUP_ROUTE_PATH, voiceSessionId), + [options.setupUrl, voiceSessionId], + ); + const sessionConfig = useMemo( + () => + buildSessionConfig({ + sessionConfig: options.sessionConfig, + voiceSessionId, + }), + [options.sessionConfig, voiceSessionId], + ); + + const handleError = useCallback( + (nextError: Error) => { + lastErrorRef.current = nextError; + setError(nextError); + setIsUserSpeaking(false); + options.onError?.(nextError); + }, + [options.onError], + ); + + const handleRealtimeError = useCallback( + (nextError: Error) => { + // A realtime/transport failure leaves the session unusable — including one + // that surfaces after a successful connect — so release the microphone. + // Per-turn failures go through handleError directly and keep the mic open. + stopAudioCaptureRef.current?.(); + mediaStreamRef.current?.getTracks().forEach((track) => track.stop()); + mediaStreamRef.current = null; + handleError(nextError); + }, + [handleError], + ); + + const speakEveReply = useCallback((text: string) => { + const trimmed = text.trim(); + if (trimmed.length === 0) return; + + sendEventRef.current?.({ + type: "conversation-item-create", + item: { + type: "text-message", + role: "user", + text: `${EVE_SPEAK_PREFIX}\n${trimmed}`, + }, + }); + requestResponseRef.current?.({ modalities: ["audio"] }); + }, []); + + const speakFallbackReply = useCallback(() => { + const fallback = options.fallbackReply; + if (fallback === undefined || fallback.trim().length === 0) return; + setLastReply(fallback); + speakEveReply(fallback); + }, [options.fallbackReply, speakEveReply]); + + const runEveTurn = useCallback( + async (message: string, itemId: string) => { + setMessages((prev) => upsertUserVoiceMessage(prev, `user:${itemId}`, message.trim())); + setIsThinking(true); + if (options.onTranscript !== undefined) { + const reply = await options.onTranscript({ + itemId, + transcript: message, + voiceSessionId, + }); + setIsThinking(false); + if (typeof reply === "string" && reply.trim().length > 0) { + setLastReply(reply); + setMessages((prev) => setAssistantVoiceText(prev, `assistant:${itemId}`, reply.trim())); + speakEveReply(reply); + } + return; + } + + let replyText: string | undefined; + let spokeReply = false; + try { + const turn: { message: string; clientContext?: string | readonly string[] } = { message }; + if (options.context !== undefined) turn.clientContext = options.context; + const response = await session.send(turn); + + let failed = false; + for await (const event of response) { + if ( + event.type === "message.completed" && + event.data.finishReason !== "tool-calls" && + typeof event.data.message === "string" && + event.data.message.length > 0 + ) { + replyText = event.data.message; + } else if (event.type === "session.failed") { + failed = true; + } + } + + setSessionId(session.state.sessionId); + setStreamIndex(session.state.streamIndex); + + if (replyText !== undefined && replyText.trim().length > 0) { + const finalReply = replyText.trim(); + setLastReply(replyText); + setMessages((prev) => setAssistantVoiceText(prev, `assistant:${itemId}`, finalReply)); + setIsThinking(false); + options.onReply?.({ + message, + sessionId: response.sessionId, + streamIndex: session.state.streamIndex, + text: replyText, + }); + speakEveReply(replyText); + spokeReply = true; + return; + } + + // A terminal failure that produced no assistant text would be dead + // air, so optionally speak the configured fallback. + setIsThinking(false); + if (failed) speakFallbackReply(); + } catch (cause) { + setIsThinking(false); + // Only fall back when this turn produced no usable reply, so a late + // transport error cannot overwrite or double-speak a real answer. + if (!spokeReply && (replyText === undefined || replyText.trim().length === 0)) { + speakFallbackReply(); + } + throw cause; + } + }, + [ + options.context, + options.onReply, + options.onTranscript, + session, + speakEveReply, + speakFallbackReply, + voiceSessionId, + ], + ); + + const turnQueueRef = useRef(Promise.resolve()); + const enqueueEveTurn = useCallback( + (message: string, itemId: string) => { + turnQueueRef.current = turnQueueRef.current + .catch(() => undefined) + .then(() => runEveTurn(message, itemId)) + .catch((cause) => { + const nextError = cause instanceof Error ? cause : new Error(String(cause)); + handleError(nextError); + }); + }, + [handleError, runEveTurn], + ); + + const handleEvent = useCallback( + (event: Experimental_RealtimeServerEvent) => { + switch (event.type) { + case "response-created": + // Suppress user transcripts for the lifetime of ANY model response, + // not only ones solicited via speak(). A server-VAD auto-response + // would otherwise play with no in-flight flag set, and its own audio + // could be transcribed back and enqueued as a spurious user turn. + responseInFlightRef.current = true; + break; + case "response-done": + case "error": + responseInFlightRef.current = false; + ignoreInputUntilRef.current = Date.now() + ECHO_SUPPRESSION_MS; + setIsThinking(false); + break; + case "speech-started": + setIsUserSpeaking(true); + break; + case "speech-stopped": + case "audio-committed": + setIsUserSpeaking(false); + break; + case "audio-transcript-delta": + // Gateway-control mode: the assistant's spoken words stream back as + // transcript deltas; accumulate them into the live feed. + if (options.controlMode) { + setIsThinking(false); + setMessages((prev) => + appendAssistantVoiceDelta(prev, `assistant:${event.responseId}`, event.delta), + ); + } + break; + case "audio-transcript-done": + if (options.controlMode) { + setIsThinking(false); + const finalText = event.transcript?.trim(); + if (finalText !== undefined && finalText.length > 0) { + setMessages((prev) => + setAssistantVoiceText(prev, `assistant:${event.responseId}`, finalText), + ); + } + } + break; + case "input-transcription-completed": + setIsUserSpeaking(false); + // In Gateway-control mode the server drives turns over its control + // socket; the browser only streams audio and never runs client turns. + // It still surfaces the finalized transcript, which we render and use + // to flip into the Thinking… state until the assistant speaks. + if (options.controlMode) { + { + const transcript = event.transcript.trim(); + if (transcript.length > 0) { + setMessages((prev) => + upsertUserVoiceMessage(prev, `user:${event.itemId}`, transcript), + ); + setIsThinking(true); + } + } + break; + } + if (processedInputItemsRef.current.has(event.itemId)) { + break; + } + processedInputItemsRef.current.add(event.itemId); + if (processedInputItemsRef.current.size > MAX_TRACKED_INPUT_ITEMS) { + const oldest = processedInputItemsRef.current.values().next().value; + if (oldest !== undefined) processedInputItemsRef.current.delete(oldest); + } + const transcript = event.transcript.trim(); + if (transcript.length === 0) { + break; + } + if (Date.now() < ignoreInputUntilRef.current) { + break; + } + enqueueEveTurn(transcript, event.itemId); + break; + } + options.onEvent?.(event as EveVoiceEvent); + }, + [enqueueEveTurn, options.controlMode, options.onEvent], + ); + + const sendEventRef = useRef<((event: Experimental_RealtimeClientEvent) => void) | undefined>( + undefined, + ); + const realtime = experimental_useRealtime({ + api: { token: setupUrl }, + model, + onError: handleRealtimeError, + onEvent: handleEvent, + sessionConfig, + }); + requestResponseRef.current = realtime.requestResponse; + sendEventRef.current = realtime.sendEvent; + stopAudioCaptureRef.current = realtime.stopAudioCapture; + + const stop = useCallback(() => { + realtime.stopAudioCapture(); + realtime.stopPlayback(); + realtime.disconnect(); + ignoreInputUntilRef.current = 0; + processedInputItemsRef.current.clear(); + responseInFlightRef.current = false; + setIsUserSpeaking(false); + mediaStreamRef.current?.getTracks().forEach((track) => track.stop()); + mediaStreamRef.current = null; + }, [realtime]); + const stopRef = useRef(stop); + stopRef.current = stop; + + const start = useCallback(async () => { + // Ignore re-entrant starts: a second in-flight or already-live session + // would acquire another microphone stream and orphan the previous one. + if ( + startingRef.current || + realtime.status === "connecting" || + realtime.status === "connected" + ) { + return; + } + startingRef.current = true; + setError(undefined); + lastErrorRef.current = undefined; + try { + const mediaStream = await getMicrophoneStream(); + mediaStreamRef.current = mediaStream; + await realtime.connect(); + // The AI SDK's connect() resolves even when the realtime session fails to + // open: it routes the failure through onError instead of rejecting. Treat + // a captured error as a thrown connection failure so the microphone is + // released and audio capture never starts against a dead session. + if (lastErrorRef.current !== undefined) { + throw lastErrorRef.current; + } + realtime.startAudioCapture(mediaStream as Parameters[0]); + } catch (cause) { + mediaStreamRef.current?.getTracks().forEach((track) => track.stop()); + mediaStreamRef.current = null; + const nextError = cause instanceof Error ? cause : new Error(String(cause)); + // Avoid double-reporting when onError already surfaced this error. + if (nextError !== lastErrorRef.current) { + handleError(nextError); + } + } finally { + startingRef.current = false; + } + }, [handleError, realtime]); + + useEffect(() => () => stopRef.current(), []); + + return { + activity: resolveActivity({ + isPlaying: realtime.isPlaying, + isUserSpeaking, + status: realtime.status, + }), + error, + isCapturing: realtime.isCapturing, + isPlaying: realtime.isPlaying, + isThinking, + isUserSpeaking, + lastReply, + messages, + sessionId, + speak: speakEveReply, + start, + status: realtime.status, + stop, + stopPlayback: realtime.stopPlayback, + streamIndex, + voiceSessionId, + }; +} + +function resolveActivity(input: { + readonly isPlaying: boolean; + readonly isUserSpeaking: boolean; + readonly status: EveVoiceStatus; +}): EveVoiceActivity { + if (input.status === "error") return "error"; + if (input.status === "connecting") return "connecting"; + if (input.status !== "connected") return "ready"; + if (input.isUserSpeaking) return "user-speaking"; + if (input.isPlaying) return "assistant-speaking"; + return "listening"; +} + +function resolveVoiceSession(options: UseEveVoiceOptions): ClientSession { + if (options.session !== undefined) return options.session; + if (options.client !== undefined) return options.client.session(); + const clientOptions: { host: string; auth?: ClientAuth; headers?: HeadersValue } = { + host: options.host ?? "", + }; + if (options.auth !== undefined) clientOptions.auth = options.auth; + if (options.headers !== undefined) clientOptions.headers = options.headers; + const client = new Client(clientOptions); + return client.session(); +} + +function buildSessionConfig(input: { + readonly sessionConfig: EveVoiceConfig | undefined; + readonly voiceSessionId: string; +}): Partial { + const baseGatewayOptions = { + tags: ["eve", "realtime-speech"], + user: input.voiceSessionId, + }; + const providerOptions = input.sessionConfig?.providerOptions; + const gatewayOptions = asRecord(providerOptions?.gateway); + + return { + instructions: [ + "You are a speech transport adapter for an Eve agent, not the assistant.", + "Do not answer user speech directly and do not mention tools, waiting, or checking.", + `Only speak when you receive a user message beginning with ${EVE_SPEAK_PREFIX}`, + "When you receive that marker, read only the text after it exactly.", + ].join(" "), + inputAudioTranscription: {}, + outputAudioTranscription: {}, + outputModalities: ["audio"], + turnDetection: { type: "server-vad" }, + voice: "alloy", + ...input.sessionConfig, + providerOptions: { + ...providerOptions, + gateway: { + ...baseGatewayOptions, + ...gatewayOptions, + }, + }, + }; +} + +function asRecord(value: unknown): Record | undefined { + return value !== null && typeof value === "object" && !Array.isArray(value) + ? (value as Record) + : undefined; +} + +function resolveRealtimeModel(model: string | Experimental_RealtimeModel | undefined) { + if (typeof model === "object" && model !== null) return model; + return createGatewayRealtimeModel(model ?? DEFAULT_MODEL); +} + +function createGatewayRealtimeModel(modelId: string): Experimental_RealtimeModel { + return { + specificationVersion: "v4", + provider: "gateway.realtime", + modelId, + doCreateClientSecret() { + throw new Error( + "Eve voice mints Gateway realtime client secrets through the setup route, not in the browser.", + ); + }, + getWebSocketConfig(options) { + return { + url: options.url, + protocols: [ + GATEWAY_REALTIME_SUBPROTOCOL, + `${GATEWAY_AUTH_SUBPROTOCOL_PREFIX}${options.token}`, + ], + }; + }, + parseServerEvent(raw: unknown): Experimental_RealtimeServerEvent { + return raw as Experimental_RealtimeServerEvent; + }, + serializeClientEvent(event: Experimental_RealtimeClientEvent): unknown { + return event; + }, + buildSessionConfig(config: Experimental_RealtimeSessionConfig): unknown { + return config; + }, + }; +} + +async function getMicrophoneStream(): Promise { + const mediaDevices = ( + globalThis as { + readonly navigator?: { + readonly mediaDevices?: { + getUserMedia(input: { readonly audio: true }): Promise; + }; + }; + } + ).navigator?.mediaDevices; + + if (mediaDevices === undefined) { + throw new Error("Microphone capture is not available in this environment."); + } + return mediaDevices.getUserMedia({ audio: true }); +} diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5fa5cef66..9e9079d09 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -24,6 +24,9 @@ catalogs: '@ai-sdk/provider': specifier: 4.0.0-beta.19 version: 4.0.0-beta.19 + '@ai-sdk/react': + specifier: 4.0.0-beta.182 + version: 4.0.0-beta.182 '@types/node': specifier: 25.9.1 version: 25.9.1 @@ -67,6 +70,9 @@ catalogs: specifier: 4.4.3 version: 4.4.3 +overrides: + '@ai-sdk/gateway': file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz + importers: .: @@ -289,6 +295,12 @@ importers: apps/frameworks/next: dependencies: + '@ai-sdk/gateway': + specifier: file:../../../vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz + version: file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz(zod@4.4.3) + '@ai-sdk/react': + specifier: 'catalog:' + version: 4.0.0-beta.182(react@19.2.6)(zod@4.4.3) '@auth/core': specifier: 0.41.2 version: 0.41.2 @@ -735,6 +747,9 @@ importers: packages/eve: dependencies: + '@ai-sdk/gateway': + specifier: file:../../vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz + version: file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz(zod@4.4.3) '@opentelemetry/api': specifier: ^1.0.0 version: 1.9.1 @@ -766,6 +781,9 @@ importers: '@ai-sdk/provider': specifier: 'catalog:' version: 4.0.0-beta.19 + '@ai-sdk/react': + specifier: 'catalog:' + version: 4.0.0-beta.182(react@19.2.6)(zod@4.4.3) '@chat-adapter/slack': specifier: 4.29.0 version: 4.29.0(ai@7.0.0-beta.178(zod@4.4.3))(zod@4.4.3) @@ -904,14 +922,9 @@ packages: peerDependencies: zod: ^3.25.76 || ^4.1.8 - '@ai-sdk/gateway@3.0.120': - resolution: {integrity: sha512-MYKAeD2q7/sa1ZdqtL2tw0Me0B8Tok6Q/fhkJDhJl39dG8u+VBlWO9yk9lcdm784bM418o1EKObo4aOxs6+18Q==} - engines: {node: '>=18'} - peerDependencies: - zod: ^3.25.76 || ^4.1.8 - - '@ai-sdk/gateway@4.0.0-beta.109': - resolution: {integrity: sha512-W/1kLlPb6Bgbhwep3CA3R6do0HD7SXV5gyuz2XBLY1YABqgxYkw+IhEcjOYlmn9v+Tifjqy5yJqmWdSHMJhyPQ==} + '@ai-sdk/gateway@file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz': + resolution: {integrity: sha512-Twty+hibORBK2YBEnmV1MEkI1ukSXn0Alb4pLIweV/JDFhj6oF1AaLdrWwwbRwaJANwybLd9Y7AiibHQhvO5UA==, tarball: file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz} + version: 4.0.0-beta.110 engines: {node: '>=22'} peerDependencies: zod: ^3.25.76 || ^4.1.8 @@ -964,6 +977,12 @@ packages: peerDependencies: react: ^18 || ~19.0.1 || ~19.1.2 || ^19.2.1 + '@ai-sdk/react@4.0.0-beta.182': + resolution: {integrity: sha512-o9YwQ1QELhsuXcdg2ZfJ1GyHrtsXBSjMtlxdGnOW36Jkqiv0DT1yARI2PQKX0Ge0vsCNGIbPoJzNrkDzRTcG+g==} + engines: {node: '>=22'} + peerDependencies: + react: ^18 || ~19.0.1 || ~19.1.2 || ^19.2.1 + '@alloc/quick-lru@5.2.0': resolution: {integrity: sha512-UrcABB+4bUrFABwbluTIBErXwvbsU/V7TZWfmbgJfbkwiBuziS9gxdODUyuiecfdGQ85jglMW6juS3+z5TsKLw==} engines: {node: '>=10'} @@ -11724,6 +11743,7 @@ packages: stream-to-promise@2.2.0: resolution: {integrity: sha512-HAGUASw8NT0k8JvIVutB2Y/9iBk7gpgEyAudXwNJmZERdMITGdajOa4VJfD/kNiA3TppQpTP4J+CtcHwdzKBAw==} + deprecated: Deprecated. Use node:stream/promises and node:stream/consumers instead. streamdown@2.5.0: resolution: {integrity: sha512-/tTnURfIOxZK/pqJAxsfCvETG/XCJHoWnk3jq9xLcuz6CSpnjjuxSRBTTL4PKGhxiZQf0lqPxGhImdpwcZ2XwA==} @@ -12962,14 +12982,7 @@ snapshots: '@ai-sdk/provider-utils': 5.0.0-beta.49(zod@4.4.3) zod: 4.4.3 - '@ai-sdk/gateway@3.0.120(zod@4.4.3)': - dependencies: - '@ai-sdk/provider': 3.0.10 - '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) - '@vercel/oidc': 3.2.0 - zod: 4.4.3 - - '@ai-sdk/gateway@4.0.0-beta.109(zod@4.4.3)': + '@ai-sdk/gateway@file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz(zod@4.4.3)': dependencies: '@ai-sdk/provider': 4.0.0-beta.19 '@ai-sdk/provider-utils': 5.0.0-beta.49(zod@4.4.3) @@ -13036,6 +13049,18 @@ snapshots: transitivePeerDependencies: - zod + '@ai-sdk/react@4.0.0-beta.182(react@19.2.6)(zod@4.4.3)': + dependencies: + '@ai-sdk/mcp': 2.0.0-beta.66(zod@4.4.3) + '@ai-sdk/provider': 4.0.0-beta.19 + '@ai-sdk/provider-utils': 5.0.0-beta.49(zod@4.4.3) + ai: 7.0.0-beta.178(zod@4.4.3) + react: 19.2.6 + swr: 2.4.1(react@19.2.6) + throttleit: 2.1.0 + transitivePeerDependencies: + - zod + '@alloc/quick-lru@5.2.0': {} '@antfu/install-pkg@1.1.0': @@ -19235,7 +19260,7 @@ snapshots: ai@6.0.191(zod@4.4.3): dependencies: - '@ai-sdk/gateway': 3.0.120(zod@4.4.3) + '@ai-sdk/gateway': file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz(zod@4.4.3) '@ai-sdk/provider': 3.0.10 '@ai-sdk/provider-utils': 4.0.27(zod@4.4.3) '@opentelemetry/api': 1.9.1 @@ -19243,7 +19268,7 @@ snapshots: ai@7.0.0-beta.178(zod@4.4.3): dependencies: - '@ai-sdk/gateway': 4.0.0-beta.109(zod@4.4.3) + '@ai-sdk/gateway': file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz(zod@4.4.3) '@ai-sdk/provider': 4.0.0-beta.19 '@ai-sdk/provider-utils': 5.0.0-beta.49(zod@4.4.3) zod: 4.4.3 diff --git a/pnpm-workspace.yaml b/pnpm-workspace.yaml index a2b11d54d..936496ff2 100644 --- a/pnpm-workspace.yaml +++ b/pnpm-workspace.yaml @@ -4,6 +4,13 @@ packages: - e2e/fixtures/* - packages/* +# Temporary override: a locally-built @ai-sdk/gateway that adds realtime voice +# `control` mint support plus the shared speech-engine control protocol + codec +# + GatewaySpeechEngineSession helper. Vendored as a tarball so this branch is +# self-contained; drop it once the change lands upstream in @ai-sdk/gateway. +overrides: + "@ai-sdk/gateway": "file:vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz" + allowBuilds: "@mongodb-js/zstd": true "@nestjs/core": true @@ -19,6 +26,7 @@ allowBuilds: catalog: "@ai-sdk/anthropic": "4.0.0-beta.67" + "@ai-sdk/gateway": "4.0.0-beta.110" "@ai-sdk/google": "4.0.0-beta.82" "@ai-sdk/mcp": "2.0.0-beta.66" "@ai-sdk/openai": "4.0.0-beta.74" diff --git a/vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz b/vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz new file mode 100644 index 000000000..fc112911b Binary files /dev/null and b/vendor/ai-sdk-gateway-4.0.0-beta.110-speech-engine.tgz differ