From f121f31d88d2b005c5d30df3684ba6bc1a0a5b06 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:03 -0400 Subject: [PATCH 01/35] =?UTF-8?q?refactor(desktop):=20hub=20always-on=20?= =?UTF-8?q?=E2=80=94=20drop=20enable=20toggle,=20derive=20provider=20from?= =?UTF-8?q?=20Voice=20Model?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RealtimeHubSettings no longer stores an enable flag or its own provider; the hub is the default voice path and its provider follows the existing Advanced 'Voice Model' picker (RealtimeOmniSettings). Drops the now-unused subtitle and CaseIterable conformance. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBarState.swift | 46 ---- .../FloatingControlBarView.swift | 79 ++++--- .../RealtimeHubSettings.swift | 63 ++---- .../VoiceActivityIndicator.swift | 199 ------------------ 4 files changed, 54 insertions(+), 333 deletions(-) delete mode 100644 desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift index f39cf6618f3..8608cb771ec 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift @@ -51,21 +51,6 @@ struct FloatingBarNotification: Identifiable, Equatable { } } -/// The high-level voice activity the floating bar is reflecting right now. Derived -/// from the lower-level PTT/hub flags so the status indicator has a single, ordered -/// source of truth (each state has exactly one visual treatment). -enum VoiceActivity: Equatable { - /// Nothing happening — the bar rests as a calm, barely-breathing sliver. - case idle - /// User is holding push-to-talk; we're capturing their voice (red, "you"). 
- case listening - /// Turn committed, waiting on the model's reply — the model may answer late, - /// so this MUST read as "working, wait" rather than "done" (cool autonomous swirl). - case thinking - /// The model is speaking its reply (warm, audio-reactive waveform — "it"). - case speaking -} - /// Observable object holding the state for the floating control bar. @MainActor class FloatingControlBarState: NSObject, ObservableObject { @@ -108,35 +93,6 @@ class FloatingControlBarState: NSObject, ObservableObject { @Published var isVoiceListening: Bool = false @Published var isVoiceLocked: Bool = false @Published var voiceTranscript: String = "" - /// True after a voice turn is committed and we're waiting on the model's reply - /// (vs. still recording) — drives the "Thinking…/Responding…" indicator so the user - /// knows to wait rather than re-pressing (which would interrupt a slow reply). - @Published var isVoiceThinking: Bool = false - /// True while the model is actually speaking its reply (native audio playing or the - /// AVSpeech fallback talking). Distinct from `isVoiceThinking` so the indicator can - /// show a clearly different "it's talking" treatment vs. "it's working". - @Published var isVoiceSpeaking: Bool = false - /// Smoothed 0…1 output amplitude of the model's spoken reply, sampled from the - /// playback engine. Drives the speaking waveform so it reacts to the actual voice - /// (premium feel) rather than animating blindly. 0 when not speaking. - @Published var voiceLevel: CGFloat = 0 - - /// Single ordered source of truth for the status indicator. Listening wins (the user - /// is actively talking), then speaking, then thinking, else idle — by construction the - /// hub sets these mutually exclusively, the ordering just makes barge-in race-safe. 
- var voiceActivity: VoiceActivity { - if isVoiceListening { return .listening } - if isVoiceSpeaking { return .speaking } - if isVoiceThinking { return .thinking } - return .idle - } - - /// Whether any voice turn is in flight — keeps the bar expanded across the whole - /// listening → thinking → speaking arc so the indicator stays visible (one expand, - /// one collapse per turn — no resize churn mid-turn). - var isVoiceActive: Bool { - isVoiceListening || isVoiceThinking || isVoiceSpeaking - } // Voice follow-up state (PTT while AI conversation is active) @Published var isVoiceFollowUp: Bool = false @@ -180,8 +136,6 @@ class FloatingControlBarState: NSObject, ObservableObject { isVoiceFollowUp = false voiceFollowUpTranscript = "" currentQueryFromVoice = false - isVoiceSpeaking = false - voiceLevel = 0 lastConversationActivityAt = nil } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift index 9188f5d11b3..763fd6f3494 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift @@ -33,11 +33,9 @@ struct FloatingControlBarView: View { .animation(.spring(response: 0.35, dampingFraction: 0.82), value: state.currentNotification?.id) } - /// Whether the bar chrome should stretch to fill the window width. Stays full-width - /// for the whole voice turn (listening → thinking → speaking) so the status indicator - /// has room and the bar resizes exactly once per turn. 
+ /// Whether the bar chrome should stretch to fill the window width private var barNeedsFullWidth: Bool { - isHovering || state.showingAIConversation || state.isVoiceActive + isHovering || state.showingAIConversation || state.isVoiceListening } private var barChrome: some View { @@ -85,7 +83,7 @@ struct FloatingControlBarView: View { } } .overlay(alignment: .topTrailing) { - if isHovering && !state.isVoiceActive { + if isHovering && !state.isVoiceListening { Button { openFloatingBarSettings() } label: { @@ -281,8 +279,8 @@ struct FloatingControlBarView: View { private var controlBarView: some View { Group { - if state.isVoiceActive && !state.isVoiceFollowUp { - voiceActiveView + if state.isVoiceListening && !state.isVoiceFollowUp { + voiceListeningView .padding(.horizontal, 6) .padding(.vertical, 3) .frame(height: 50) @@ -308,11 +306,11 @@ struct FloatingControlBarView: View { } } - /// Minimal resting indicator shown when not hovering and no voice turn is active — - /// a calm, slowly breathing sliver. (Active turns render `voiceActiveView` instead.) + /// Minimal thin bar shown when not hovering private var compactCircleView: some View { - VoiceActivityIndicator(activity: state.voiceActivity, level: state.voiceLevel) - .frame(width: 28, height: 14) + RoundedRectangle(cornerRadius: 3) + .fill(Color.white.opacity(0.5)) + .frame(width: 28, height: 6) } private func compactToggle(_ title: String, isOn: Binding) -> some View { @@ -360,15 +358,20 @@ struct FloatingControlBarView: View { } } - /// Unified expanded voice view for the whole turn. The status indicator carries the - /// state (listening / thinking / speaking) visually; the text is just the helpful - /// detail (transcript, "Release to send", "Thinking…"). One element, no jarring swaps. 
- private var voiceActiveView: some View { + private var voiceListeningView: some View { HStack(spacing: 8) { - VoiceActivityIndicator(activity: state.voiceActivity, level: state.voiceLevel) - .frame(width: 34, height: 18) + // Pulsing mic icon + Circle() + .fill(Color.red) + .frame(width: 10, height: 10) + .scaleEffect(state.isVoiceListening ? 1.2 : 1.0) + .animation(.easeInOut(duration: 0.6).repeatForever(autoreverses: true), value: state.isVoiceListening) + + Image(systemName: "mic.fill") + .scaledFont(size: 14, weight: .semibold) + .foregroundColor(.white) - if state.isVoiceLocked && state.isVoiceListening { + if state.isVoiceLocked { Text("LOCKED") .scaledFont(size: 10, weight: .bold) .foregroundColor(.orange) @@ -378,31 +381,21 @@ struct FloatingControlBarView: View { .cornerRadius(4) } - // Dim only the "Release to send" hint; live transcript / status reads brighter. - let isHint = state.voiceActivity == .listening && state.voiceTranscript.isEmpty - Text(voiceStatusText) - .scaledFont(size: 13) - .foregroundColor(.white.opacity(isHint ? 0.5 : 0.85)) - .lineLimit(1) - .truncationMode(.head) - } - } - - /// The detail text beside the indicator for the current voice state. The indicator - /// itself carries the state visually; this is just the helpful detail. - private var voiceStatusText: String { - switch state.voiceActivity { - case .listening: - if !state.voiceTranscript.isEmpty { return state.voiceTranscript } - return state.isVoiceLocked - ? "Tap \(shortcutSettings.pttShortcut.displayLabel) to send" - : "Release \(shortcutSettings.pttShortcut.displayLabel) to send" - case .thinking: - return "Thinking…" - case .speaking: - return "Speaking…" - case .idle: - return "" + if !state.voiceTranscript.isEmpty { + Text(state.voiceTranscript) + .scaledFont(size: 13) + .foregroundColor(.white.opacity(0.8)) + .lineLimit(1) + .truncationMode(.head) + } else { + Text( + state.isVoiceLocked + ? 
"Tap \(shortcutSettings.pttShortcut.displayLabel) to send" + : "Release \(shortcutSettings.pttShortcut.displayLabel) to send" + ) + .scaledFont(size: 13) + .foregroundColor(.white.opacity(0.5)) + } } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift index 4f4b5952e44..e06f27d0ec0 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift @@ -1,18 +1,20 @@ import Foundation -// MARK: - Realtime Hub (Phase 1) +// MARK: - Realtime Hub // // "Realtime-as-hub": instead of the cascade (STT → router → Claude → TTS), one // realtime model is the single hub. It does in-session STT, reasoning, routing // (as tool choice), and speaks the answer. Its tools call the EXISTING backend // endpoints / app code — no new backend routes. // -// Phase 1 is CLIENT-DIRECT + dev/test only: the realtime WS connects straight to -// the provider with the user's own BYOK key (see APIKeyService). It is gated so -// it never runs for managed (non-BYOK) users. Phase 2 will replace the BYOK key -// with a server-minted ephemeral token to make it shippable. +// The hub is the default voice path — there is no opt-in toggle. Every PTT turn +// routes through it whenever it can connect: BYOK users connect client-direct with +// their own key (see APIKeyService); managed users connect with a server-minted +// ephemeral token. When neither is available (no key, mint fails / not entitled) the +// turn falls back to the legacy STT cascade. The provider follows the user's "Voice +// Model" choice in Advanced settings (RealtimeOmniSettings) — no separate picker. 
-enum RealtimeHubProvider: String, CaseIterable, Sendable { +enum RealtimeHubProvider: String, Sendable { case openai case gemini @@ -23,13 +25,6 @@ enum RealtimeHubProvider: String, CaseIterable, Sendable { } } - var subtitle: String { - switch self { - case .openai: return "gpt-realtime-2 · native spoken audio" - case .gemini: return "gemini native-audio Live · spoken audio + tools" - } - } - /// Concrete model identifier sent to the provider. var modelID: String { switch self { @@ -58,46 +53,24 @@ enum RealtimeHubProvider: String, CaseIterable, Sendable { final class RealtimeHubSettings { static let shared = RealtimeHubSettings() - private let enabledKey = "realtimeHubEnabled" - private let providerKey = "realtimeHubProvider" - - private init() { - UserDefaults.standard.register(defaults: [ - enabledKey: false, - providerKey: RealtimeHubProvider.openai.rawValue, - ]) - } - - /// Master switch. When off, the floating bar uses the legacy STT → router → - /// Claude → TTS cascade. Ships behind this flag. - var isEnabled: Bool { - get { UserDefaults.standard.bool(forKey: enabledKey) } - set { - UserDefaults.standard.set(newValue, forKey: enabledKey) - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) - } - } + private init() {} + /// The hub provider follows the user's "Voice Model" choice in Advanced settings — + /// there is no separate hub picker. The two map 1:1 (same underlying models), and + /// `.auto` is already resolved to a concrete provider by `effectiveProvider`. var provider: RealtimeHubProvider { - get { - let raw = UserDefaults.standard.string(forKey: providerKey) - return raw.flatMap(RealtimeHubProvider.init(rawValue:)) ?? 
.openai - } - set { - UserDefaults.standard.set(newValue.rawValue, forKey: providerKey) - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) + switch RealtimeOmniSettings.shared.effectiveProvider { + case .gptRealtime2: return .openai + case .geminiFlashLive, .auto: return .gemini } } - /// The hub may only run client-direct when the user has supplied the selected - /// provider's own key (BYOK / dev key). This is the managed-user gate: managed - /// users have no BYOK key, so the hub stays off and the cascade is used. + /// True when the hub can connect client-direct with the user's own provider key + /// (BYOK / dev key). Managed users without a key connect via a minted ephemeral + /// token instead (see RealtimeHubController.ensureWarm); both reach the hub. var canConnect: Bool { APIKeyService.byokKey(provider.byokProvider) != nil } - - /// True when the hub should drive this PTT turn (enabled + a usable key). - var isActive: Bool { isEnabled && canConnect } } extension Notification.Name { diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift deleted file mode 100644 index 46f0eb99883..00000000000 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift +++ /dev/null @@ -1,199 +0,0 @@ -import SwiftUI - -/// The floating bar's single status element. 
One coherent shape that changes its -/// motion law, color, and energy per state — never a hard icon swap — so the user -/// always knows, at a glance and without labels, whether the assistant is: -/// -/// • idle — a calm, barely-breathing sliver (nearly still, muted) -/// • listening — a red waveform reacting to "you" (red is reserved for recording) -/// • thinking — a cool blue→violet gradient sweeping on its own fixed clock; the -/// self-driven motion (no audio) reads as "working, wait" — critical -/// so a late reply never looks like "done / idle" -/// • speaking — a green waveform driven by the model's actual output amplitude -/// ("it's talking", clearly distinct from the red "you" waveform) -/// -/// Performance: idle (the long-lived resting state) uses a single Core Animation -/// property animation — no per-frame redraw. The active states use one -/// `TimelineView(.animation)` + `Canvas` (a single GPU-friendly draw pass, no -/// view-graph diffing per frame), and only run for the few seconds a turn is live. -/// No blur/shadow/material (those force offscreen passes) — glow is faked with -/// translucent gradient fills. -struct VoiceActivityIndicator: View { - let activity: VoiceActivity - /// Smoothed 0…1 amplitude of the model's spoken reply (drives the speaking waveform). - var level: CGFloat = 0 - - var body: some View { - ZStack { - switch activity { - case .idle: - IdleBreath() - case .listening: - WaveformBars(palette: .listening, level: 0, reactive: false) - case .thinking: - ThinkingSweep() - case .speaking: - WaveformBars(palette: .speaking, level: level, reactive: true) - } - } - // Cross-fade + gentle scale between states so energy "ramps" rather than snaps. - .transition(.opacity.combined(with: .scale(scale: 0.7))) - .animation(.spring(response: 0.4, dampingFraction: 0.86), value: activity) - } -} - -// MARK: - Idle - -/// A short muted capsule that breathes very slowly. Intentionally low-energy so the -/// resting bar never pulls the eye. 
Pure Core Animation — no redraw loop. -private struct IdleBreath: View { - @State private var breathing = false - - var body: some View { - Capsule() - .fill(Color.white.opacity(breathing ? 0.55 : 0.26)) - .frame(width: 26, height: 5) - .scaleEffect(x: 1, y: breathing ? 1.0 : 0.7, anchor: .center) - .onAppear { - withAnimation(.easeInOut(duration: 2.8).repeatForever(autoreverses: true)) { - breathing = true - } - } - } -} - -// MARK: - Thinking - -/// A cool blue→violet gradient that pans continuously across a capsule at a fixed, -/// self-driven rate. The autonomous (non-audio) motion is the cue that the model is -/// working — so a slow reply reads as "wait", never as "done". -private struct ThinkingSweep: View { - // Hoisted: the colors are state-constant, so only the gradient positions change - // per frame — no point rebuilding these Gradient values 60–120×/s. - private static let sweepGradient = Gradient(colors: [ - Color(red: 0.70, green: 0.49, blue: 1.0), // violet - Color(red: 0.43, green: 0.55, blue: 1.0), // blue - Color(red: 0.70, green: 0.49, blue: 1.0), - Color(red: 0.43, green: 0.55, blue: 1.0), - Color(red: 0.70, green: 0.49, blue: 1.0), - ]) - private static let glowGradient = Gradient(colors: [ - Color.white.opacity(0.45), Color.white.opacity(0), - ]) - - var body: some View { - TimelineView(.animation) { timeline in - Canvas { context, size in - let t = timeline.date.timeIntervalSinceReferenceDate - let rect = CGRect(origin: .zero, size: size) - let capsule = Capsule().path(in: rect) - - // Dim base track so the capsule reads even at the low point of the sweep. - context.fill(capsule, with: .color(.white.opacity(0.10))) - - context.drawLayer { layer in - layer.clip(to: capsule) - - // Pan a symmetric violet→blue→violet gradient horizontally. Symmetric - // stops + a span twice the width mean the loop has no visible seam. 
- let period = 2.2 // seconds per full pan - let phase = (t.truncatingRemainder(dividingBy: period)) / period - let span = size.width * 2 - let shift = CGFloat(phase) * span - layer.fill( - Rectangle().path(in: rect), - with: .linearGradient( - Self.sweepGradient, - startPoint: CGPoint(x: -span + shift, y: 0), - endPoint: CGPoint(x: shift, y: 0))) - - // Soft moving highlight (faked glow) gliding with an eased ping-pong - // so it slows at the ends instead of snapping back. - let eased = 0.5 - 0.5 * cos(phase * 2 * .pi) - let cx = size.width * CGFloat(eased) - let glowR = max(size.height, size.width * 0.32) - layer.fill( - Rectangle().path(in: rect), - with: .radialGradient( - Self.glowGradient, - center: CGPoint(x: cx, y: size.height / 2), - startRadius: 0, endRadius: glowR)) - } - } - } - .frame(width: 34, height: 8) - } -} - -// MARK: - Waveform (listening + speaking) - -/// Color treatment for a waveform state — a precomputed top→bottom gradient (constant -/// per state, so it's built once here, not per-bar per-frame inside the Canvas). -private struct WaveformPalette { - let gradient: Gradient - - /// Red — reserved exclusively for recording the user ("you"). - static let listening = WaveformPalette(gradient: Gradient(colors: [ - Color(red: 1.0, green: 0.42, blue: 0.42), - Color(red: 1.0, green: 0.18, blue: 0.33), - ])) - - /// Green/mint — the assistant speaking ("it"); clearly not the red "you" or blue "thinking". - static let speaking = WaveformPalette(gradient: Gradient(colors: [ - Color(red: 0.46, green: 0.93, blue: 0.74), - Color(red: 0.20, green: 0.83, blue: 0.60), - ])) -} - -/// A small centered equalizer. `reactive` bars track the live `level` (speaking); -/// non-reactive bars animate on a lively synthetic clock (listening). A per-bar phase -/// + center weighting gives an organic "voice blob" rather than a marching pattern. 
-private struct WaveformBars: View { - let palette: WaveformPalette - var level: CGFloat - var reactive: Bool - - private let barCount = 5 - - var body: some View { - TimelineView(.animation) { timeline in - Canvas { context, size in - let t = timeline.date.timeIntervalSinceReferenceDate - // Equal bars and gaps: n bars, n-1 gaps, all one unit wide. - let unit = size.width / CGFloat(barCount * 2 - 1) - let radius = unit / 2 - let minH = size.height * 0.28 - - for i in 0.. Date: Wed, 17 Jun 2026 17:59:03 -0400 Subject: [PATCH 02/35] refactor(desktop): remove hub enable gates; unwire pill flags; single BYOK predicate isActive/ensureWarm/reconnect no longer gate on the removed isEnabled toggle. isActive now consults RealtimeHubSettings.canConnect (single source). Removes the floating-bar pill flag writes (isVoiceThinking/Speaking/voiceLevel) and the orphaned AVSpeech delegate; native-audio playback is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../RealtimeHubController.swift | 106 ++++-------------- 1 file changed, 19 insertions(+), 87 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index cf8e7356e4c..0fe8156e067 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -59,23 +59,19 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { private override init() { super.init() - // Clear "speaking" when the AVSpeech fallback finishes (native audio uses the - // player's drain callback instead). - speech.delegate = self } /// In-flight ephemeral mint guard (managed users). private var minting = false /// True when the hub should drive this PTT turn. Read by PushToTalkManager at PTT - /// start. 
BYOK users are ready immediately (own key); managed users are ready only - /// once a warm session exists (token minted + connecting) — otherwise PTT falls - /// back to the legacy cascade for that turn. + /// start. The hub is the default voice path (no opt-in toggle): BYOK users are ready + /// immediately (own key); managed users are ready only once a warm session exists + /// (token minted + connecting) — otherwise PTT falls back to the legacy cascade for + /// that turn. var isActive: Bool { - guard RealtimeHubSettings.shared.isEnabled else { return false } - let provider = RealtimeHubSettings.shared.provider - if APIKeyService.byokKey(provider.byokProvider) != nil { return true } - return session != nil && sessionProvider == provider + if RealtimeHubSettings.shared.canConnect { return true } + return session != nil && sessionProvider == RealtimeHubSettings.shared.provider } func setup(barState: FloatingControlBarState) { @@ -93,9 +89,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } @objc private func settingsChanged() { - // Only reconnect if enabled and the provider actually changed — avoids - // redundant teardown/recreate races on unrelated notifications. - if !RealtimeHubSettings.shared.isEnabled { teardownSession(); return } + // Only reconnect if the provider actually changed — avoids redundant + // teardown/recreate races on unrelated notifications. if session != nil, sessionProvider == RealtimeHubSettings.shared.provider { return } teardownSession() ensureWarm() @@ -103,11 +98,10 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { // MARK: - Warm session lifecycle (kept open between turns) - /// Open the WS now if it isn't already (no-op if disabled or already warm). - /// BYOK → connect client-direct with the user's key (Phase 1). Otherwise, if - /// signed in → mint a server-side ephemeral token (Phase 2) and connect with it. + /// Open the WS now if it isn't already (no-op if already warm). 
BYOK → connect + /// client-direct with the user's key. Otherwise, if signed in → mint a server-side + /// ephemeral token and connect with it. func ensureWarm() { - guard RealtimeHubSettings.shared.isEnabled else { return } let provider = RealtimeHubSettings.shared.provider if session != nil, sessionProvider == provider { return } if session != nil { teardownSession() } @@ -117,7 +111,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } else if AuthService.shared.isSignedIn { mintAndConnect(provider: provider) } else { - log("RealtimeHub: enabled but no BYOK key and not signed in — hub unavailable (cascade).") + log("RealtimeHub: no BYOK key and not signed in — hub unavailable (cascade).") } } @@ -137,9 +131,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { log("⚠️ RealtimeHub: ephemeral mint failed / not entitled — staying on cascade") return } - // Provider/enable may have changed while minting; only connect if still wanted. - guard RealtimeHubSettings.shared.isEnabled, - RealtimeHubSettings.shared.provider == provider, self.session == nil + // Provider may have changed while minting; only connect if still wanted. + guard RealtimeHubSettings.shared.provider == provider, self.session == nil else { return } self.startSession(provider: provider, auth: .ephemeral(token)) } @@ -152,24 +145,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { // Both providers stream native spoken audio (24k PCM) → StreamingPCMPlayer; // AVSpeech is only a no-audio fallback. if pcmPlayer == nil { - let p = StreamingPCMPlayer(sampleRate: 24000) - // Feed the live output amplitude to the speaking waveform — but only while we're - // actually in the speaking state, so publishing `voiceLevel` never re-renders the - // bar outside that window. 
- p.onLevel = { [weak self] level in - guard let self, self.barState?.isVoiceSpeaking == true else { return } - self.barState?.voiceLevel = CGFloat(level) - } - // The reply isn't truly over until the buffered audio finishes draining — only - // then do we drop "speaking" and let the bar collapse back to idle. - p.onPlayingChanged = { [weak self] playing in - guard let self, let barState = self.barState else { return } - if !playing { - barState.isVoiceSpeaking = false - barState.voiceLevel = 0 - } - } - pcmPlayer = p + pcmPlayer = StreamingPCMPlayer(sampleRate: 24000) } s.start() log( @@ -202,9 +178,6 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { audioReceivedThisTurn = false turnRecorded = false lastTurnAt = Date() - barState?.isVoiceThinking = false // new turn → we're recording again, not waiting - barState?.isVoiceSpeaking = false // any prior reply is being cut off below - barState?.voiceLevel = 0 pcmPlayer?.stop() // stop any prior reply locally if speech.isSpeaking { speech.stopSpeaking(at: .immediate) } if bargeIn { @@ -246,11 +219,6 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// PTT-up: end the turn; the model now responds (and may call tools). func commitTurn() { responding = true - // Show a distinct "waiting on the model" state (not the red recording dot, which - // reads as "still listening") so the user knows to wait rather than re-press. Setting - // this keeps the bar's `isVoiceActive` true across the PTT-up → thinking handoff, so - // the window stays expanded (the window observes the flags and resizes itself). - barState?.isVoiceThinking = true // (The screen frame is sent at turn START — see beginTurn — so it has time to // upload/decode before the model answers. Nothing to attach here.) 
session?.commitInputTurn() @@ -297,11 +265,6 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } func hubDidReceiveAudio(_ pcm24k: Data) { - if !audioReceivedThisTurn { - // First audio of the turn: it's no longer thinking, it's speaking. - barState?.isVoiceThinking = false - barState?.isVoiceSpeaking = true - } audioReceivedThisTurn = true pcmPlayer?.enqueue(pcm24k) // native spoken audio (OpenAI + Gemini) } @@ -412,41 +375,34 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { // land here. responding = false logError("RealtimeHub: session error — \(message)") - // The reply is dead — stop any buffered audio and drop the speaking state before - // collapsing (the drain callback won't fire for a torn-down engine). + // The reply is dead — stop any buffered audio before collapsing. pcmPlayer?.stop() if speech.isSpeaking { speech.stopSpeaking(at: .immediate) } - barState?.isVoiceSpeaking = false - barState?.voiceLevel = 0 exitVoiceUI() let aliveFor = lastWarmAt.map { Date().timeIntervalSince($0) } ?? 0 teardownSession() // Re-warm so the NEXT PTT uses the hub, not the STT cascade. Gemini idle-closes // the socket (~2.5 min, close 1008) even before the first turn; managed users have // no BYOK key, so once `session` is nil `isActive` is false and PTT silently falls - // back to omni STT. So gate on isEnabled (NOT isActive, which needs a live session). + // back to omni STT. So always try to re-warm (the hub is the default voice path). // A socket that survived past the idle window was a normal idle-close → reset the // strike budget and keep re-warming forever; one that died fast is likely a config/ // auth failure → let the strikes cap stop the churn. 
if aliveFor > 60 { hubReconnectStrikes = 0 } - guard RealtimeHubSettings.shared.isEnabled, !reconnectPending, hubReconnectStrikes < 5 else { return } + guard !reconnectPending, hubReconnectStrikes < 5 else { return } hubReconnectStrikes += 1 reconnectPending = true DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { [weak self] in guard let self else { return } self.reconnectPending = false - if RealtimeHubSettings.shared.isEnabled, self.session == nil { self.ensureWarm() } + if self.session == nil { self.ensureWarm() } } } /// Return the floating bar from its PTT voice state to compact after a hub turn. - /// Leaves `isVoiceSpeaking` alone — the turn can finish generating while the buffered - /// reply is still playing; the player's drain callback drops speaking when it ends. The - /// window observes these flags and collapses itself once `isVoiceActive` goes false. private func exitVoiceUI() { guard let barState else { return } barState.voiceTranscript = "" - barState.isVoiceThinking = false barState.isVoiceListening = false barState.isVoiceLocked = false barState.isVoiceFollowUp = false @@ -523,17 +479,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { utterance.voice = AVSpeechSynthesisVoice(language: AVSpeechSynthesisVoice.currentLanguageCode()) ?? AVSpeechSynthesisVoice(language: "en-US") - barState?.isVoiceThinking = false - barState?.isVoiceSpeaking = true speech.speak(utterance) } - /// Drop the speaking state once the AVSpeech fallback stops talking. - private func finishedSpeaking() { - barState?.isVoiceSpeaking = false - barState?.voiceLevel = 0 - } - /// Local synthetic mouse click (point_click tool). 
@discardableResult static func click(at point: CGPoint) -> Bool { @@ -549,19 +497,3 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { return true } } - -// MARK: - AVSpeech fallback completion - -extension RealtimeHubController: AVSpeechSynthesizerDelegate { - nonisolated func speechSynthesizer( - _ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance - ) { - Task { @MainActor [weak self] in self?.finishedSpeaking() } - } - - nonisolated func speechSynthesizer( - _ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance - ) { - Task { @MainActor [weak self] in self?.finishedSpeaking() } - } -} From ce7c34e809e9ad27edf4d2720a0ab3f67d12fd8d Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:03 -0400 Subject: [PATCH 03/35] refactor(desktop): remove floating-bar Realtime Hub toggle + provider picker The hub is always-on and follows the existing Voice Model picker, so the duplicate enable+provider cards are removed. Default Voice Model set to OpenAI. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../MainWindow/Pages/SettingsPage.swift | 76 +------------------ 1 file changed, 1 insertion(+), 75 deletions(-) diff --git a/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift b/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift index fdb8bc462e7..488b1ab852a 100644 --- a/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift +++ b/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift @@ -271,12 +271,7 @@ struct SettingsContentView: View { // AI Chat settings @AppStorage("chatBridgeMode") private var chatBridgeMode: String = "piMono" - @AppStorage("realtimeOmniProvider") private var realtimeOmniProvider: String = RealtimeOmniProvider.auto.rawValue - // Realtime-as-hub (Phase 1, dev/BYOK only): the realtime model is the single - // tool-dispatching voice hub. Provider toggle persisted here; RealtimeHubSession - // reads it at connect. 
- @AppStorage("realtimeHubEnabled") private var realtimeHubEnabled = false - @AppStorage("realtimeHubProvider") private var realtimeHubProvider: String = RealtimeHubProvider.openai.rawValue + @AppStorage("realtimeOmniProvider") private var realtimeOmniProvider: String = RealtimeOmniProvider.gptRealtime2.rawValue @AppStorage("askModeEnabled") private var askModeEnabled = false @AppStorage("claudeMdEnabled") private var claudeMdEnabled = true @AppStorage("projectClaudeMdEnabled") private var projectClaudeMdEnabled = true @@ -2534,75 +2529,6 @@ struct SettingsContentView: View { voiceSpeedSlider(settingId: "floatingbar.voicespeed") .opacity(shortcutSettings.hasAnyFloatingBarVoiceAnswersEnabled ? 1 : 0.55) .disabled(!shortcutSettings.hasAnyFloatingBarVoiceAnswersEnabled) - - realtimeHubCard - realtimeHubProviderCard - .opacity(realtimeHubEnabled ? 1 : 0.55) - .disabled(!realtimeHubEnabled) - } - } - - // MARK: Realtime-as-hub (Phase 1, dev/BYOK only) - - /// The realtime model becomes the single voice hub: in-session STT + reasoning - /// + tool-choice routing + spoken reply, bypassing the STT→Haiku router→Claude - /// cascade. Client-direct using the user's own BYOK key (dev/test only). - private var realtimeHubCard: some View { - settingsCard(settingId: "floatingbar.realtimehub") { - HStack(spacing: 16) { - VStack(alignment: .leading, spacing: 4) { - Text("Realtime Voice Hub (experimental)") - .scaledFont(size: 16, weight: .semibold) - .foregroundColor(OmiColors.textPrimary) - Text( - "Let the realtime model run the whole voice turn — listen, decide, and speak — " - + "instead of the slower transcribe→route→answer pipeline. Uses your own provider key." 
- ) - .scaledFont(size: 13) - .foregroundColor(OmiColors.textSecondary) - } - Spacer() - Toggle("", isOn: $realtimeHubEnabled) - .toggleStyle(.switch) - .tint(OmiColors.purplePrimary) - .onChange(of: realtimeHubEnabled) { _ in - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) - } - } - } - } - - private var realtimeHubProviderCard: some View { - let provider = RealtimeHubProvider(rawValue: realtimeHubProvider) ?? .openai - let hasKey = APIKeyService.byokKey(provider.byokProvider) != nil - return settingsCard(settingId: "floatingbar.realtimehub.provider") { - HStack(spacing: 16) { - VStack(alignment: .leading, spacing: 4) { - Text("Hub Provider") - .scaledFont(size: 16, weight: .semibold) - .foregroundColor(OmiColors.textPrimary) - Text( - hasKey - ? provider.subtitle - : "⚠️ No \(provider.byokProvider.displayName) key set — add one in Developer settings to use this provider." - ) - .scaledFont(size: 13) - .foregroundColor(hasKey ? OmiColors.textSecondary : OmiColors.purplePrimary) - } - Spacer() - Picker("", selection: $realtimeHubProvider) { - ForEach(RealtimeHubProvider.allCases, id: \.rawValue) { p in - Text(p.displayName).tag(p.rawValue) - } - } - .pickerStyle(.menu) - .labelsHidden() - .frame(width: 180) - .tint(OmiColors.purplePrimary) - .onChange(of: realtimeHubProvider) { _ in - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) - } - } } } From 884fd41f599bfa8c1c9760b475f903ced252c661 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:03 -0400 Subject: [PATCH 04/35] feat(desktop): default Voice Model to OpenAI (GPT Realtime 2) Users can still switch to Gemini or Auto in Advanced settings; this default also drives the realtime hub provider. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift b/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift index 393feeaa853..5edc05c7094 100644 --- a/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift +++ b/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift @@ -59,7 +59,9 @@ final class RealtimeOmniSettings { private init() { UserDefaults.standard.register(defaults: [ - providerKey: RealtimeOmniProvider.auto.rawValue, + // Default to OpenAI (GPT Realtime 2); the user can switch to Gemini or Auto + // in Advanced → Voice Model. This default also drives the realtime hub provider. + providerKey: RealtimeOmniProvider.gptRealtime2.rawValue, enabledKey: false, ]) } From f858824888acdf53ce8a891157d84f303a52e988 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:52 -0400 Subject: [PATCH 05/35] revert(desktop): restore imperative PTT resize; drop reactive voice-activity observer Keeps the QueryTracer wiring, router-skip/screenshot heuristics, and recordVoiceTurn. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBarWindow.swift | 76 ++++--------------- 1 file changed, 13 insertions(+), 63 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift index 7dc37d7d622..d97aa6cb7e0 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift @@ -41,8 +41,6 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { private var suppressHoverResize = false private var inputHeightCancellable: AnyCancellable? 
private var responseHeightCancellable: AnyCancellable? - private var voiceActivityCancellable: AnyCancellable? - private var collapseWorkItem: DispatchWorkItem? private var resizeWorkItem: DispatchWorkItem? /// Saved center point from before chat opened, used to restore position on close. private var preChatCenter: NSPoint? @@ -87,7 +85,6 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { self.maxSize = FloatingControlBarWindow.maxBarSize setupViews() - setupVoiceActivityObserver() if ShortcutSettings.shared.draggableBarEnabled, let savedPosition = UserDefaults.standard.string(forKey: FloatingControlBarWindow.positionKey) { @@ -522,54 +519,6 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { inputHeightCancellable = nil } - /// Single owner of the voice-turn expand/collapse. The bar is wide whenever a voice - /// turn is active (`isVoiceActive` = listening || thinking || speaking) and collapses - /// to the resting sliver when it ends — derived reactively from the published flags - /// instead of imperative resize calls scattered across the PTT/hub code (which had to - /// coordinate via a `skipResize` flag). - private func setupVoiceActivityObserver() { - voiceActivityCancellable = state.$isVoiceListening - .combineLatest(state.$isVoiceThinking, state.$isVoiceSpeaking) - .map { $0 || $1 || $2 } - .removeDuplicates() - .receive(on: DispatchQueue.main) - .sink { [weak self] active in - self?.onVoiceActiveChanged(active) - } - } - - /// Expand immediately so the window is already wide when the indicator + text render - /// (a delayed expand flashes the content cramped in the sliver first). Defer the - /// collapse a beat so the transient listening→thinking dip on PTT-up — `isVoiceActive` - /// momentarily clears before commitTurn sets thinking — doesn't blink the bar shut. 
- private func onVoiceActiveChanged(_ active: Bool) { - collapseWorkItem?.cancel() - collapseWorkItem = nil - if active { - applyVoiceExpansion(true) - } else { - let work = DispatchWorkItem { [weak self] in self?.applyVoiceExpansion(false) } - collapseWorkItem = work - DispatchQueue.main.asyncAfter(deadline: .now() + 0.12, execute: work) - } - } - - private func applyVoiceExpansion(_ active: Bool) { - // Onboarding shows no separate bar; follow-up and the AI conversation own their - // own layout, so the voice indicator never drives the window size in those modes. - guard UserDefaults.standard.bool(forKey: "hasCompletedOnboarding"), - !state.isVoiceFollowUp else { return } - if active { - guard !state.showingAIConversation else { return } - resizeForPTTState(expanded: true, animated: false) // snap — content is ready now - } else { - // Collapse only when nothing else needs the window expanded. - guard !state.showingAIConversation, !state.showingAIResponse, - state.currentNotification == nil, !state.isHoveringBar else { return } - resizeForPTTState(expanded: false, animated: true) - } - } - func updateAIResponse(type: String, text: String) { guard state.showingAIConversation else { return } @@ -670,7 +619,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { /// Resize for hover expand/collapse — anchored from center so the circle grows outward. 
func resizeForHover(expanded: Bool) { - guard !state.showingAIConversation, !state.isVoiceActive, !state.isShowingNotification, !suppressHoverResize else { return } + guard !state.showingAIConversation, !state.isVoiceListening, !state.isShowingNotification, !suppressHoverResize else { return } resizeWorkItem?.cancel() resizeWorkItem = nil @@ -679,7 +628,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { let doResize: () -> Void = { [weak self] in guard let self = self else { return } guard !self.state.showingAIConversation, - !self.state.isVoiceActive, + !self.state.isVoiceListening, !self.state.isShowingNotification, !self.suppressHoverResize else { return } @@ -709,16 +658,12 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { } } - /// Resize window for PTT state (expanded when listening, compact circle when idle). - /// Expand snaps (animated:false) so the indicator + text never flash cramped while the - /// window grows; collapse animates for a smooth shrink back to the resting sliver. - func resizeForPTTState(expanded: Bool, animated: Bool = true) { + /// Resize window for PTT state (expanded when listening, compact circle when idle) + func resizeForPTTState(expanded: Bool) { let size = expanded ? NSSize(width: FloatingControlBarWindow.expandedWidth, height: FloatingControlBarWindow.expandedBarSize.height) : FloatingControlBarWindow.minBarSize - // Idempotent: skip when already at the target size (avoids a no-op resize). 
- if abs(frame.width - size.width) < 1, abs(frame.height - size.height) < 1 { return } - resizeAnchored(to: size, makeResizable: false, animated: animated) + resizeAnchored(to: size, makeResizable: false, animated: true) } func showNotification(_ notification: FloatingBarNotification, animated: Bool = true) { @@ -737,7 +682,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { state.currentNotification = nil let targetSize: NSSize - if state.isVoiceActive { + if state.isVoiceListening { targetSize = NSSize(width: Self.expandedWidth, height: Self.expandedBarSize.height) } else { targetSize = state.isHoveringBar ? Self.expandedBarSize : Self.minBarSize @@ -748,7 +693,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { /// Restore the compact pill size when we temporarily surface the bar outside /// of an active hover, notification, voice session, or AI conversation. func normalizeForTemporaryShow() { - guard !state.showingAIConversation, !state.isVoiceActive, state.currentNotification == nil else { return } + guard !state.showingAIConversation, !state.isVoiceListening, state.currentNotification == nil else { return } resizeAnchored(to: Self.minBarSize, makeResizable: false, animated: false, anchorTop: true) } @@ -892,7 +837,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { minimumWidth = FloatingControlBarWindow.expandedWidth } else if state.currentNotification != nil { minimumWidth = FloatingControlBarWindow.notificationWidth - } else if state.isVoiceActive { + } else if state.isVoiceListening { minimumWidth = FloatingControlBarWindow.expandedWidth } else if state.isHoveringBar { minimumWidth = FloatingControlBarWindow.expandedBarSize.width @@ -1737,6 +1682,11 @@ class FloatingControlBarManager { return window?.state } + /// Resize the floating bar for PTT state changes. 
+ func resizeForPTT(expanded: Bool) { + window?.resizeForPTTState(expanded: expanded) + } + // MARK: - AI Query private func prepareVisibleQueryState(_ message: String, in barWindow: FloatingControlBarWindow, fromVoice: Bool) { From bb26c29b9d911718699d04f2b149a41947683c7d Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:52 -0400 Subject: [PATCH 06/35] revert(desktop): restore updateBarState(skipResize:) imperative resize Co-Authored-By: Claude Opus 4.8 (1M context) --- .../PushToTalkManager.swift | 34 +++++++++++-------- 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift index 3eca7fcdd98..8daa4225417 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift @@ -523,10 +523,6 @@ class PushToTalkManager: ObservableObject { state = .finalizing finalizeWorkItem?.cancel() finalizeWorkItem = nil - // Flags only — the window keeps the bar expanded into "thinking" because commitTurn - // sets isVoiceThinking before the reactive resize observer settles (so isVoiceActive - // never dips), which is why there's no flicker and no skip-resize coordination here. - updateBarState() // Stop mic immediately — no more audio capture audioCaptureService?.stopCapture() @@ -563,10 +559,10 @@ class PushToTalkManager: ObservableObject { // Real speech — instant local ack + commit. The hub speaks the reply and // dispatches tools itself; no transcript/router/LLM hop here. if ShortcutSettings.shared.pttSoundsEnabled { ackSound?.play() } - barState?.voiceTranscript = "…" RealtimeHubController.shared.commitTurn() - // Leave the bar showing "…"; the hub controller exits the voice UI on turn - // completion (so we skip the clearing updateBarState()). 
+ // Collapse the bar on release — the hub speaks its reply as audio (no inline + // status UI), the same as the legacy voice path. + updateBarState() AnalyticsManager.shared.floatingBarPTTEnded( mode: finalizedMode, hadTranscript: true, transcriptLength: 0) log("PushToTalkManager: hub turn committed (instant ack)") @@ -729,14 +725,14 @@ class PushToTalkManager: ObservableObject { isCurrentSessionFollowUp = false - // Reset state. The reactive resize observer won't collapse the bar when a query is in - // flight or a conversation is open — it guards on showingAIConversation/showingAIResponse, - // which openAIInputWithQuery sets (to the correct response size) right after this. + // Reset state — skip PTT collapse resize when we have a query, + // because openAIInputWithQuery will resize to the correct size. + // Also skip resize when in follow-up mode (panel is already at response size). state = .idle transcriptSegments = [] lastInterimText = "" currentContextSnapshot = nil - updateBarState() + updateBarState(skipResize: hasQuery || wasFollowUp) guard hasQuery else { log("PushToTalkManager: no transcript to send") @@ -1037,8 +1033,9 @@ class PushToTalkManager: ObservableObject { // MARK: - Bar State Sync - private func updateBarState() { + private func updateBarState(skipResize: Bool = false) { guard let barState = barState else { return } + let wasListening = barState.isVoiceListening let isShowingVoiceUI = (state == .listening || state == .lockedListening) barState.isVoiceListening = isShowingVoiceUI barState.isVoiceLocked = (state == .lockedListening) @@ -1047,9 +1044,16 @@ class PushToTalkManager: ObservableObject { barState.voiceTranscript = "" barState.voiceFollowUpTranscript = "" } - // The bar's expand/collapse is derived reactively from these flags by the window - // (FloatingControlBarWindow.setupVoiceActivityObserver) — one resize per turn, no - // imperative calls or skip-flags to keep in sync here. 
+ + // Skip resize when in follow-up mode, expanded AI conversation, or during onboarding + // (during onboarding the floating bar shouldn't appear as a separate window) + let isOnboarding = !UserDefaults.standard.bool(forKey: "hasCompletedOnboarding") + guard !skipResize && !barState.isVoiceFollowUp && !barState.showingAIConversation && !isOnboarding else { return } + if barState.isVoiceListening && !wasListening { + FloatingControlBarManager.shared.resizeForPTT(expanded: true) + } else if !barState.isVoiceListening && wasListening { + FloatingControlBarManager.shared.resizeForPTT(expanded: false) + } } } From 155c171ac74e58009791ea0d851976d8fa4245a5 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:52 -0400 Subject: [PATCH 07/35] refactor(desktop): drop orphaned audio level-tap from StreamingPCMPlayer The mixer RMS tap + pending-buffer tracking only fed the reverted speaking waveform; playback (enqueue/stop/engine-restart/config-rebuild) is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../StreamingPCMPlayer.swift | 85 +------------------ 1 file changed, 1 insertion(+), 84 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift index 28f4ab4d60e..0836dea0aa2 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift @@ -14,27 +14,6 @@ final class StreamingPCMPlayer { private let format: AVAudioFormat private var configObserver: NSObjectProtocol? - /// Smoothed 0…1 output amplitude, delivered on the main thread (~40×/s) while the - /// engine runs. Driven by a tap on the mixer so it tracks what's *actually audible*, - /// not what's been buffered ahead. Used to make the speaking waveform audio-reactive. - var onLevel: ((Float) -> Void)? 
- /// Fires on the main thread when playback starts (false→true) and when the queue - /// fully drains (true→false). Lets the caller mark "speaking" precisely — including - /// the silent tail after the last chunk arrives but before it finishes playing. - var onPlayingChanged: ((Bool) -> Void)? - - /// Outstanding scheduled buffers (incremented on enqueue, decremented when each - /// finishes). Guarded by `bufferLock` because completion handlers run off-main. - private var pendingBuffers = 0 - private let bufferLock = NSLock() - private var isPlayingState = false - // Exponential moving average of the output RMS (smoothed so the waveform never jitters). - private var smoothedLevel: Float = 0 - // Last value handed to `onLevel`, so we skip main-thread hops while the level is flat - // (e.g. the silent tail of a reply) instead of publishing the same number ~40×/s. - private var lastDispatchedLevel: Float = -1 - private var levelTapInstalled = false - init(sampleRate: Double = 24000) { // Float32 mono at the source rate; the mixer resamples to the device rate. format = AVAudioFormat( @@ -55,8 +34,6 @@ final class StreamingPCMPlayer { log("StreamingPCMPlayer: audio config changed — rebuilding engine") self.player.stop() self.engine.stop() - // The rebuilt graph loses the old tap; let ensureRunning() reinstall it. - self.removeLevelTap() self.engine.disconnectNodeOutput(self.player) self.engine.connect(self.player, to: self.engine.mainMixerNode, format: self.format) self.ensureRunning() @@ -69,42 +46,6 @@ final class StreamingPCMPlayer { } } - /// Tap the mixer output once the engine is live so `onLevel` reflects the audio the - /// user actually hears. Cheap: one RMS pass per ~1024-frame buffer, EMA-smoothed. 
- private func installLevelTapIfNeeded() { - guard !levelTapInstalled, engine.isRunning else { return } - levelTapInstalled = true - engine.mainMixerNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { - [weak self] buffer, _ in - guard let self, self.onLevel != nil, let data = buffer.floatChannelData else { return } - let frames = Int(buffer.frameLength) - guard frames > 0 else { return } - let samples = data[0] - var sumSquares: Float = 0 - for i in 0.. self.smoothedLevel ? 0.35 : 0.12 - self.smoothedLevel += (target - self.smoothedLevel) * alpha - let out = self.smoothedLevel - // Only hop to main when the level actually moved — flat/silent stretches stay quiet. - guard abs(out - self.lastDispatchedLevel) > 0.01 else { return } - self.lastDispatchedLevel = out - DispatchQueue.main.async { self.onLevel?(out) } - } - } - - /// Detach the level tap (call when playback stops; reinstalled on the next play). - private func removeLevelTap() { - guard levelTapInstalled else { return } - engine.mainMixerNode.removeTap(onBus: 0) - levelTapInstalled = false - smoothedLevel = 0 - lastDispatchedLevel = -1 - } - /// Ensure the engine + player are actually running before scheduling. Checking /// the real `isRunning`/`isPlaying` state (not a one-shot flag) is what makes /// playback survive past the first turn: AVAudioEngine auto-suspends when idle @@ -125,19 +66,6 @@ final class StreamingPCMPlayer { if !player.isPlaying { player.play() } - installLevelTapIfNeeded() - } - - /// Adjust the outstanding-buffer count and emit `onPlayingChanged` on the edges. 
- private func adjustPending(by delta: Int) { - bufferLock.lock() - pendingBuffers = max(0, pendingBuffers + delta) - let nowPlaying = pendingBuffers > 0 - let changed = nowPlaying != isPlayingState - if changed { isPlayingState = nowPlaying } - bufferLock.unlock() - guard changed else { return } - DispatchQueue.main.async { [weak self] in self?.onPlayingChanged?(nowPlaying) } } /// `data` = little-endian Int16 PCM, mono, at the configured sample rate. @@ -156,22 +84,11 @@ final class StreamingPCMPlayer { channel[i] = max(-1.0, min(1.0, Float(src[i]) / 32768.0)) } } - adjustPending(by: 1) - player.scheduleBuffer(buffer, completionHandler: { [weak self] in self?.adjustPending(by: -1) }) + player.scheduleBuffer(buffer) } func stop() { - removeLevelTap() // no playback → no reason to keep tapping (reinstalled on next play) player.stop() engine.stop() - bufferLock.lock() - pendingBuffers = 0 - let wasPlaying = isPlayingState - isPlayingState = false - bufferLock.unlock() - smoothedLevel = 0 - if wasPlaying { - DispatchQueue.main.async { [weak self] in self?.onPlayingChanged?(false) } - } } } From 4b047bc3545502f918f94ef66f522cb9ecfa3095 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:52 -0400 Subject: [PATCH 08/35] docs(desktop): update RealtimeHubSession header (BYOK + managed, not dev-only) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBar/RealtimeHubSession.swift | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift index 7dc991138c0..2e009f35140 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift @@ -1,12 +1,12 @@ import Foundation import Network -// MARK: - Realtime Hub Session (Phase 1, CLIENT-DIRECT) +// MARK: - Realtime Hub Session // -// 
One persistent WebSocket to a realtime provider, opened with the user's own -// BYOK key (dev/test only — gated by RealtimeHubSettings.canConnect). The model -// is the hub: it does in-session STT + reasoning + routing (via tool calls) and -// speaks the answer. +// One persistent WebSocket to a realtime provider, opened either with the user's +// own BYOK key (client-direct, gated by RealtimeHubSettings.canConnect) or with a +// server-minted ephemeral token (managed users). The model is the hub: it does +// in-session STT + reasoning + routing (via tool calls) and speaks the answer. // // Two providers, normalized to ONE internal stream surface // (RealtimeHubSessionDelegate): From 26c445b7c23b798d4f35c645116fe5e514de84c7 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 17:59:52 -0400 Subject: [PATCH 09/35] =?UTF-8?q?chore(desktop):=20changelog=20=E2=80=94?= =?UTF-8?q?=20hub=20as=20default=20voice=20path;=20drop=20reverted=20pill?= =?UTF-8?q?=20entry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.8 (1M context) --- desktop/macos/CHANGELOG.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/desktop/macos/CHANGELOG.json b/desktop/macos/CHANGELOG.json index 2c0d04f0735..278ff53fa00 100644 --- a/desktop/macos/CHANGELOG.json +++ b/desktop/macos/CHANGELOG.json @@ -1,8 +1,7 @@ { "unreleased": [ "Faster, cheaper assistant responses via Anthropic prompt caching of the system+tools prefix and conversation history", - "Added an experimental Realtime Voice Hub (Settings \u2192 Floating Bar): the realtime model handles your whole voice turn \u2014 listening, deciding, and speaking \u2014 for noticeably faster replies", - "Redesigned the floating bar voice indicator with smooth, distinct idle, listening, thinking, and speaking states so you always know whether the assistant is working or done", + "Faster voice replies (experimental): the realtime model now handles your 
whole voice turn \u2014 listening, deciding, and speaking \u2014 instead of the slower transcribe\u2192route\u2192answer pipeline", "Voice (push-to-talk) conversations now appear in your chat history", "Fixed older chat messages failing to load in long chats" ], From 6666c64fe737f6307799f870e688ace1b6d7a2b7 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 18:42:27 -0400 Subject: [PATCH 10/35] fix(desktop): re-warm hub on Voice Model change; collapse bar on mid-turn hub exit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Observe .realtimeOmniSettingsDidChange (the live 'voice model changed' signal) instead of the dead .realtimeHubSettingsDidChange, so switching the Voice Model re-warms the hub on the newly selected provider for BYOK users. - exitVoiceUI() now collapses the bar when it clears isVoiceListening (mid-turn error / silent-tap cancel) — the transition-based updateBarState() would otherwise see no change and leave the bar expanded. Guarded against conversation/response/ notification/hover/onboarding. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../RealtimeHubController.swift | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index 0fe8156e067..5c28412c4fa 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -76,14 +76,16 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { func setup(barState: FloatingControlBarState) { self.barState = barState - // Register the observer exactly once — duplicate registrations (re-entrant - // setup) fired settingsChanged N times, each tearing down + recreating the - // socket, which orphaned a connecting session (Gemini 1001/1008 closes). + // The hub provider follows the "Voice Model" picker, so re-warm when it changes — + // observe the live settings notification (posted by the picker, RealtimeOmniSettings + // setters, and AutoModelSelector). Register exactly once — duplicate registrations + // (re-entrant setup) fired settingsChanged N times, each tearing down + recreating + // the socket, which orphaned a connecting session (Gemini 1001/1008 closes). NotificationCenter.default.removeObserver( - self, name: .realtimeHubSettingsDidChange, object: nil) + self, name: .realtimeOmniSettingsDidChange, object: nil) NotificationCenter.default.addObserver( self, selector: #selector(settingsChanged), - name: .realtimeHubSettingsDidChange, object: nil) + name: .realtimeOmniSettingsDidChange, object: nil) // Expose the headless E2E action (omi-ctl action hub_test_turn pcm=… provider=…). 
RealtimeHubTestHarness.registerAutomationAction() } @@ -402,10 +404,22 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// Return the floating bar from its PTT voice state to compact after a hub turn. private func exitVoiceUI() { guard let barState else { return } + // Capture before clearing: a mid-turn error or silent-tap cancel clears the + // listening flag here, so PushToTalkManager.updateBarState() (which resizes only + // on a wasListening→false transition) would see no change and leave the bar wide. + let wasExpandedForVoice = barState.isVoiceListening barState.voiceTranscript = "" barState.isVoiceListening = false barState.isVoiceLocked = false barState.isVoiceFollowUp = false + // Collapse the bar ourselves in that case — guarded so we never shrink the bar out + // from under an open conversation, response, notification, hover, or onboarding. + guard wasExpandedForVoice, + !barState.showingAIConversation, !barState.showingAIResponse, + barState.currentNotification == nil, !barState.isHoveringBar, + UserDefaults.standard.bool(forKey: "hasCompletedOnboarding") + else { return } + FloatingControlBarManager.shared.resizeForPTT(expanded: false) } // MARK: - Tools From ff7bcc0e4191540f19c2795682dfb13dd1b5b750 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 18:42:27 -0400 Subject: [PATCH 11/35] fix(desktop): remove dead .realtimeHubSettingsDidChange notification Nothing posts it after the hub toggle was removed; the controller now listens to .realtimeOmniSettingsDidChange instead. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Sources/FloatingControlBar/RealtimeHubSettings.swift | 4 ---- 1 file changed, 4 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift index e06f27d0ec0..3ae0e0ec18e 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift @@ -72,7 +72,3 @@ final class RealtimeHubSettings { APIKeyService.byokKey(provider.byokProvider) != nil } } - -extension Notification.Name { - static let realtimeHubSettingsDidChange = Notification.Name("realtimeHubSettingsDidChange") -} From 1793120ba1b4607aa3c573ac4ef24c9751dbd688 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 18:42:28 -0400 Subject: [PATCH 12/35] fix(desktop): Voice Model picker posts .realtimeOmniSettingsDidChange MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The picker writes @AppStorage directly (bypassing the setter), so post the change ourselves — this re-warms the realtime hub on the newly selected provider. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift b/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift index 488b1ab852a..79fa7458530 100644 --- a/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift +++ b/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift @@ -3420,6 +3420,10 @@ struct SettingsContentView: View { if newValue == RealtimeOmniProvider.auto.rawValue { AutoModelSelector.shared.refreshIfStale() } + // The picker writes @AppStorage directly (bypassing the RealtimeOmniSettings + // setter), so post the change ourselves — this is what re-warms the realtime + // hub on the newly selected provider (and is a no-op for unchanged providers). + NotificationCenter.default.post(name: .realtimeOmniSettingsDidChange, object: nil) } } From 713f5fff64b0bbc2a1ba9169257bd7a3924e9ee9 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 18:57:21 -0400 Subject: [PATCH 13/35] fix(desktop): hub takes a turn only when actually connected (graceful cascade fallback) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit isActive returned true whenever a BYOK key existed, so a stale/revoked OpenAI or Gemini key kept entering hub mode on every PTT and lost the turn to a failing realtime session (while managed-token failures already fell back). Now isActive requires a live, authenticated session for the selected provider (new hubConnected flag, set on the post-auth connect, cleared on teardown). A key/token that can't connect never routes a turn to the hub — PTT transparently uses the legacy cascade, so a broken hub never costs the user a turn. The hub re-warms in the background and resumes taking turns the moment it connects (also covers provider switches and idle-close reconnect windows). 
Reconnect churn is still capped via a named maxReconnectStrikes constant. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../RealtimeHubController.swift | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index 5c28412c4fa..776cbdee28f 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -46,6 +46,15 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// Consecutive failed (re)connects with no surviving session — caps churn on a hard /// failure. Reset when a socket survives past the idle window or a turn completes. private var hubReconnectStrikes = 0 + /// After this many consecutive fast failures (e.g. a stale/revoked key failing auth), + /// the hub stops re-warming so it doesn't hammer a dead endpoint. + private static let maxReconnectStrikes = 5 + /// True only while a session is connected + authenticated for `sessionProvider`. This is + /// what gates `isActive`: a PTT turn enters hub mode only when the hub is genuinely + /// connected right now; otherwise it transparently uses the legacy cascade. Set in + /// hubDidConnect (fires post-auth, on "ready") and cleared on teardown/error, so a + /// stale/revoked key — which never connects — never costs the user a turn. + private var hubConnected = false /// True between commit and turn-done — used to detect barge-in (a new PTT while /// the previous reply is still in flight). private var responding = false @@ -65,13 +74,14 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { private var minting = false /// True when the hub should drive this PTT turn. Read by PushToTalkManager at PTT - /// start. 
The hub is the default voice path (no opt-in toggle): BYOK users are ready - /// immediately (own key); managed users are ready only once a warm session exists - /// (token minted + connecting) — otherwise PTT falls back to the legacy cascade for - /// that turn. + /// start. The hub is the default voice path (no opt-in toggle). var isActive: Bool { - if RealtimeHubSettings.shared.canConnect { return true } - return session != nil && sessionProvider == RealtimeHubSettings.shared.provider + // Drive a turn only when the hub is actually CONNECTED + authenticated for the + // currently-selected provider. A turn never enters hub mode on a key/token that can't + // connect (stale/revoked key, failed mint, mid-reconnect, or a just-switched provider): + // PTT transparently uses the legacy cascade instead, so a broken hub never costs the + // user a turn. The hub re-warms in the background and flips this true once it connects. + hubConnected && sessionProvider == RealtimeHubSettings.shared.provider } func setup(barState: FloatingControlBarState) { @@ -162,6 +172,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { session?.stop() session = nil sessionProvider = nil + hubConnected = false // no live session → PTT falls back to the cascade until re-warm } // MARK: - PTT integration @@ -243,6 +254,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { func hubDidConnect() { lastWarmAt = Date() + hubConnected = true // authenticated + ready — PTT may now route turns to the hub log("RealtimeHub: connected (\(sessionProvider?.displayName ?? "?"))") } @@ -391,7 +403,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { // strike budget and keep re-warming forever; one that died fast is likely a config/ // auth failure → let the strikes cap stop the churn. 
if aliveFor > 60 { hubReconnectStrikes = 0 } - guard !reconnectPending, hubReconnectStrikes < 5 else { return } + guard !reconnectPending, hubReconnectStrikes < Self.maxReconnectStrikes else { return } hubReconnectStrikes += 1 reconnectPending = true DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { [weak self] in From 1b67d9c5db4f8068682cc9e068d20b115820a577 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 23:29:20 -0400 Subject: [PATCH 14/35] feat(desktop): give the realtime voice agent personal-data tools Add get_memories, search_memories, search_conversations, get_conversations, create_action_item and update_action_item to the floating-bar voice agent so it answers "who am I / what do you know about me" and "most recent conversation" instead of refusing. Reads are synchronous and spoken; the system prompt no longer claims it can't see personal data and routes recency vs semantic vs spawn_agent. Tool dicts are static `let` (built once, not per reconnect). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBar/RealtimeHubTools.swift | 179 +++++++++++++++--- 1 file changed, 148 insertions(+), 31 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index dc526d867cf..d3ba37d796d 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -3,9 +3,12 @@ import Foundation // MARK: - Realtime Hub tool surface // // The realtime model IS the router: instead of a separate Haiku classify() call, -// the model decides what to do by choosing a tool. The same four tools are +// the model decides what to do by choosing a tool. The same tool surface is // declared to both providers (OpenAI Realtime `tools`, Gemini `functionDeclarations`); // `RealtimeHubController` executes them by calling EXISTING app code / endpoints. 
+// Reads (get_tasks, get_memories, search_memories, search_conversations) and simple +// writes (create_action_item, update_action_item) run synchronously and speak their +// result; multi-step / other-app work still goes to spawn_agent. enum HubTool: String { /// Escalate a hard / knowledge-heavy question to the smarter Claude model via @@ -17,6 +20,21 @@ enum HubTool: String { /// Read the user's tasks locally (TasksStore) and return them inline to speak — a /// fast synchronous READ, NOT a background agent. case getTasks = "get_tasks" + /// Read what Omi knows about the user (memories / facts) and return it inline to speak. + /// Fast synchronous READ — the answer to "who am I" / "what do you know about me". + case getMemories = "get_memories" + /// Semantically search the user's memories / facts for something specific. Fast READ. + case searchMemories = "search_memories" + /// Semantically search the user's past conversations (titles + summaries, no transcripts). + /// Fast synchronous READ. + case searchConversations = "search_conversations" + /// List the user's MOST RECENT conversations, newest first (titles + summaries, no + /// transcripts). Fast READ — the answer to "most recent / latest / last conversation". + case getConversations = "get_conversations" + /// Create a new task / to-do / reminder for the user. Fast synchronous WRITE. + case createActionItem = "create_action_item" + /// Update an existing task (mark done, change text/due). Needs the task id from get_tasks. + case updateActionItem = "update_action_item" /// Capture the user's screen so the model can see what they're looking at. case screenshot = "screenshot" /// Click at on-screen coordinates (local). @@ -33,34 +51,51 @@ enum RealtimeHubTools { give the full answer yourself — don't shorten it and don't offload it. \ Always reply in English. - IMPORTANT: You have NO direct access to the user's personal data or their apps. 
\ - You cannot see their tasks, to-dos, calendar, notes, emails, messages, past \ - conversations, memories, files, or reminders on your own. The spawn_agent tool \ - CAN — it hands the request to a background agent that has all of those tools and \ - can act in the user's apps and browser. + IMPORTANT: You CAN read the user's Omi data directly with fast tools — their tasks \ + (get_tasks), what Omi knows about them / their memories & facts (get_memories, \ + search_memories), and their past conversations (search_conversations) — and you can \ + make simple task changes (create_action_item, update_action_item). For anything in \ + their OTHER apps (calendar, notes, emails, messages, files, reminders, browser) or any \ + multi-step "do X for me" work, use spawn_agent — it hands the request to a background \ + agent that has those tools and can act in the user's apps. Using tools: the moment a request needs a tool, briefly acknowledge it OUT LOUD in your \ own natural, varied words (keep it short, and don't include any answer or data you don't \ - have yet), then immediately call the tool. For a data tool (get_tasks, ask_higher_model), \ - speak its result after it returns. NEVER put an answer — real or guessed — in that \ - acknowledgment, NEVER skip the tool call, and never read tool JSON aloud. You cannot see \ - tasks, data, or the screen without calling a tool. + have yet), then immediately call the tool. For a READ tool (get_tasks, get_memories, \ + search_memories, search_conversations, ask_higher_model), speak its result after it \ + returns. NEVER put an answer — real or guessed — in that acknowledgment, NEVER skip the \ + tool call, and never read tool JSON or ids aloud. You cannot see the user's data or \ + screen without calling a tool. 
Decide what to do with each request: - The user's TASKS / to-dos / what's due — a READ ("what are my tasks", "what's due \ today", "what's on my list", "do I have anything today"): you MUST call get_tasks and \ - speak ONLY what it returns. You CANNOT see their tasks any other way — never guess, \ - summarize from memory, or make up tasks. Always call get_tasks; do NOT use an agent. - - DOING something for the user, or their OTHER personal data (calendar, notes, emails, \ - messages, conversations, memories, files, reminders) — create/send/open/edit/search/ \ - schedule/automate/"do X for me"/any multi-step work: you CANNOT do these yourself. You \ - MUST actually EMIT the spawn_agent function call (with a clear, self-contained `brief`). \ - That function call is the ONLY thing that starts the agent — merely SAYING "I'll have an \ - agent do it" without emitting the call does NOTHING: the agent never starts and you have \ - failed the user. So always emit the spawn_agent call. You may add one short natural \ - sentence as you call it, but never instead of it. Do NOT ask clarifying questions before \ - spawning — spawn with what you have. Do NOT wait for it, narrate its steps, refuse, or \ - claim you can't. + speak ONLY what it returns. Never guess, summarize from memory, or make up tasks. + - WHO the user is / what you know about them / their memories or facts ("who am I", \ + "what do you know about me", "what are my preferences"): you MUST call get_memories (no \ + query) and speak what it returns. For a SPECIFIC fact ("what's my dog's name", "where do \ + I work"), call search_memories with a focused query. NEVER answer "I don't know" or guess \ + — always call the tool first; this data is the whole point. 
+ - The user's MOST RECENT / latest / last conversation ("what was my most recent \ + conversation", "what did we just talk about", "my recent conversations"): call \ + get_conversations (newest first) — NOT search_conversations, which is semantic and does \ + NOT sort by time. Speak the latest one. + - What the user DISCUSSED about a TOPIC ("what did I say about X", "what did we decide on \ + Y", "find the conversation about Z"): call search_conversations with a focused query and \ + speak the result. + - ADD a task / to-do / reminder ("remind me to…", "add … to my list", "I need to…"): \ + call create_action_item with a clear `description` (and `due_at` if a time was given), \ + then confirm out loud. CHANGE an existing task (mark done, edit, reschedule): first \ + call get_tasks to get the matching task's id, then call update_action_item with that id. + - DOING something for the user in their OTHER apps (calendar, notes, emails, messages, \ + files, browser) or any multi-step work — create/send/open/edit/search/schedule/automate/ \ + "do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ + function call (with a clear, self-contained `brief`). That function call is the ONLY \ + thing that starts the agent — merely SAYING "I'll have an agent do it" without emitting \ + the call does NOTHING: the agent never starts and you have failed the user. So always \ + emit the spawn_agent call. You may add one short natural sentence as you call it, but \ + never instead of it. Do NOT ask clarifying questions before spawning — spawn with what \ + you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't. - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \ and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \ You are fully capable; do it directly, even when the ask is long or open-ended. 
Do \ @@ -75,9 +110,9 @@ enum RealtimeHubTools { Keep latency low: prefer answering directly when you can. """ - /// OpenAI Realtime GA `session.tools` entries. - static var openAITools: [[String: Any]] { - [ + /// OpenAI Realtime GA `session.tools` entries. Static `let` — built once, not rebuilt on + /// every session (re)connect that reads it. + static let openAITools: [[String: Any]] = [ [ "type": "function", "name": HubTool.askHigherModel.rawValue, @@ -103,6 +138,90 @@ enum RealtimeHubTools { + "my list'. Do NOT use spawn_agent for reading tasks.", "parameters": ["type": "object", "properties": [:]], ], + [ + "type": "function", + "name": HubTool.getMemories.rawValue, + "description": + "Read what Omi knows about the user — their memories and facts (preferences, " + + "background, people, habits). Fast synchronous read with NO query. Use this for " + + "'who am I', 'what do you know about me', 'what are my preferences'. Speak what it returns.", + "parameters": ["type": "object", "properties": [:]], + ], + [ + "type": "function", + "name": HubTool.searchMemories.rawValue, + "description": + "Search the user's memories / facts for a SPECIFIC thing ('what's my dog's name', " + + "'where do I work', 'what's my partner's name'). Fast synchronous read. Speak the result.", + "parameters": [ + "type": "object", + "properties": [ + "query": ["type": "string", "description": "What to look up about the user."] + ], + "required": ["query"], + ], + ], + [ + "type": "function", + "name": HubTool.searchConversations.rawValue, + "description": + "Search the user's past conversations for what they discussed ('what did I say about X', " + + "'what did we decide', 'summarize my last meeting'). Returns titles + summaries only " + + "(no full transcripts). Fast synchronous read. 
Speak the result.", + "parameters": [ + "type": "object", + "properties": [ + "query": ["type": "string", "description": "What topic / conversation to find."] + ], + "required": ["query"], + ], + ], + [ + "type": "function", + "name": HubTool.getConversations.rawValue, + "description": + "List the user's MOST RECENT conversations, newest first (titles + summaries, no full " + + "transcripts). Use this — NOT search_conversations — for 'what was my most recent / " + + "latest / last conversation', 'what did we just talk about', or 'my recent conversations'. " + + "search_conversations is semantic and does NOT order by time, so it's wrong for 'recent'. " + + "Fast synchronous read. Speak the result.", + "parameters": ["type": "object", "properties": [:]], + ], + [ + "type": "function", + "name": HubTool.createActionItem.rawValue, + "description": + "Create a new task / to-do / reminder for the user ('remind me to…', 'add … to my " + + "list', 'I need to…'). Fast synchronous write. Confirm out loud after it returns.", + "parameters": [ + "type": "object", + "properties": [ + "description": ["type": "string", "description": "The task text."], + "due_at": [ + "type": "string", + "description": "Optional ISO-8601 due date/time, only if the user gave one.", + ], + ], + "required": ["description"], + ], + ], + [ + "type": "function", + "name": HubTool.updateActionItem.rawValue, + "description": + "Update an existing task: mark it done, edit its text, or reschedule it. You MUST first " + + "call get_tasks to get the matching task's id, then pass that id here. 
Fast synchronous write.", + "parameters": [ + "type": "object", + "properties": [ + "id": ["type": "string", "description": "The task id from get_tasks."], + "completed": ["type": "boolean", "description": "Set true to mark the task done."], + "description": ["type": "string", "description": "New task text, if changing it."], + "due_at": ["type": "string", "description": "New ISO-8601 due date/time, if rescheduling."], + ], + "required": ["id"], + ], + ], [ "type": "function", "name": HubTool.spawnAgent.rawValue, @@ -140,12 +259,11 @@ enum RealtimeHubTools { "required": ["x", "y"], ], ], - ] - } + ] - /// Gemini Live `setup.tools[0].functionDeclarations` entries (same surface). - static var geminiFunctionDeclarations: [[String: Any]] { - openAITools.map { tool in + /// Gemini Live `setup.tools[0].functionDeclarations` entries (same surface). Derived once + /// from `openAITools`. + static let geminiFunctionDeclarations: [[String: Any]] = openAITools.map { tool in // Gemini wants {name, description, parameters} without the OpenAI "type" wrapper. var decl: [String: Any] = [ "name": tool["name"] as? String ?? "", @@ -159,7 +277,6 @@ enum RealtimeHubTools { } return decl } - } /// Recursively uppercase every `type` value in a JSON-schema dict so it matches Gemini's /// Schema enum (object → OBJECT, string → STRING, …). From 9c419b1beea71267b41e77e941b2269522dbd266 Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 23:29:27 -0400 Subject: [PATCH 15/35] feat(desktop): execute the voice-agent data tools via APIClient MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Wire the new HubTool cases in hubDidRequestTool to the existing APIClient.tool* endpoints (capped for voice: top 3/5, no transcripts). Reads/writes share one runToolAndSpeak helper (Task / do-catch / empty-fallback / log / sendToolResult) and a small arg() accessor; get_tasks output carries [id:…] so update_action_item can target a task. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../RealtimeHubController.swift | 103 +++++++++++++++++- 1 file changed, 100 insertions(+), 3 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index 776cbdee28f..894159abf50 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -295,6 +295,25 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } } + /// Run an async tool `body`, then speak its result: on throw → `errorText`, on an + /// empty/whitespace result → `emptyText`. Shared by the data read/write tool cases so the + /// Task / do-catch / blank-check / log / sendToolResult tail lives in exactly one place. + private func runToolAndSpeak( + callId: String, name: String, detail: String = "", + emptyText: String, errorText: String, + _ body: @escaping () async throws -> String + ) { + Task { [weak self] in + guard let self else { return } + var out: String + do { out = try await body() } catch { out = errorText } + if out.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { out = emptyText } + let suffix = detail.isEmpty ? "" : " \(detail)" + log("RealtimeHub[\(self.providerTag)]: tool \(name)\(suffix) → \(out.prefix(60))") + self.session?.sendToolResult(callId: callId, name: name, output: out) + } + } + func hubDidRequestTool(name: String, callId: String, argumentsJSON: String) { let arguments = (try? JSONSerialization.jsonObject(with: Data(argumentsJSON.utf8)) as? [String: Any]) ?? [:] @@ -303,9 +322,10 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { session?.sendToolResult(callId: callId, name: name, output: "Unknown tool.") return } + func arg(_ key: String) -> String { (arguments[key] as? String) ?? 
turnTranscript } switch tool { case .askHigherModel: - let query = (arguments["query"] as? String) ?? turnTranscript + let query = arg("query") log("RealtimeHub[\(providerTag)]: tool ask_higher_model → POST /v2/chat/completions (claude-sonnet-4-6) query=\"\(query.prefix(80))\"") Task { [weak self] in guard let self else { return } @@ -320,8 +340,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { await TasksStore.shared.loadDashboardTasks() let overdue = TasksStore.shared.overdueTasks let today = TasksStore.shared.todaysTasks + // Include the task id (for update_action_item) — the model is told never to speak ids. func list(_ items: [TaskActionItem]) -> String { - items.prefix(15).map { "- \($0.description)" }.joined(separator: "\n") + items.prefix(15).map { "- \($0.description) [id:\($0.id)]" }.joined(separator: "\n") } var out = "" if !overdue.isEmpty { out += "Overdue (\(overdue.count)):\n\(list(overdue))\n" } @@ -330,8 +351,84 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { log("RealtimeHub[\(self.providerTag)]: tool get_tasks → \(overdue.count) overdue, \(today.count) today") self.session?.sendToolResult(callId: callId, name: name, output: out) } + case .getMemories: + // Fast READ — "who am I" / "what do you know about me". Backend memories+facts. + runToolAndSpeak( + callId: callId, name: name, + emptyText: "I don't have any memories saved about you yet.", + errorText: "Could not read your memories right now." + ) { try await APIClient.shared.toolGetMemories(limit: 15).resultText } + case .searchMemories: + let query = arg("query") + runToolAndSpeak( + callId: callId, name: name, detail: "q=\"\(query.prefix(60))\"", + emptyText: "I couldn't find anything about that.", + errorText: "Could not search your memories right now." 
+ ) { try await APIClient.shared.toolSearchMemories(query: query, limit: 5).resultText } + case .searchConversations: + // Capped for voice: top 5, summaries only (no full transcripts). + let query = arg("query") + runToolAndSpeak( + callId: callId, name: name, detail: "q=\"\(query.prefix(60))\"", + emptyText: "I couldn't find a conversation about that.", + errorText: "Could not search your conversations right now." + ) { + try await APIClient.shared.toolSearchConversations( + query: query, limit: 5, includeTranscript: false + ).resultText + } + case .getConversations: + // Fast READ — most recent conversations, newest first (backend orders created_at DESC). + // Capped for voice: top 3, summaries only. This is the recency path; search_conversations + // is semantic and must NOT be used for "most recent". + runToolAndSpeak( + callId: callId, name: name, + emptyText: "I don't see any recent conversations.", + errorText: "Could not read your recent conversations right now." + ) { + try await APIClient.shared.toolGetConversations( + limit: 3, includeTranscript: false + ).resultText + } + case .createActionItem: + let description = (arguments["description"] as? String)? + .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + let dueAt = arguments["due_at"] as? String + guard !description.isEmpty else { + session?.sendToolResult( + callId: callId, name: name, output: "No task description was given.") + return + } + runToolAndSpeak( + callId: callId, name: name, detail: "\"\(description.prefix(60))\"", + emptyText: "Task created.", + errorText: "Could not create the task right now." + ) { + try await APIClient.shared.toolCreateActionItem( + description: description, dueAt: dueAt + ).resultText + } + case .updateActionItem: + guard let id = (arguments["id"] as? String), !id.isEmpty else { + session?.sendToolResult( + callId: callId, name: name, + output: "Missing the task id — call get_tasks first to find it.") + return + } + let completed = arguments["completed"] as? 
Bool + let newDescription = arguments["description"] as? String + let dueAt = arguments["due_at"] as? String + runToolAndSpeak( + callId: callId, name: name, detail: "id=\(id.prefix(8))", + emptyText: "Task updated.", + errorText: "Could not update the task right now." + ) { + try await APIClient.shared.toolUpdateActionItem( + id: id, completed: completed, description: newDescription, dueAt: dueAt + ).resultText + } case .spawnAgent: - let brief = (arguments["brief"] as? String) ?? turnTranscript + let brief = arg("brief") let model = ShortcutSettings.shared.selectedModel.isEmpty ? "claude-sonnet-4-6" : ShortcutSettings.shared.selectedModel // Non-blocking: spawn renders its own pill ("text bubble") and runs on its From 8e98a2b1b10f34cc01985daf1137c62f80ca021d Mon Sep 17 00:00:00 2001 From: vendz Date: Wed, 17 Jun 2026 23:29:35 -0400 Subject: [PATCH 16/35] test(desktop): stub the new voice-agent data tools in the hub harness Keep the harness tool-result switch exhaustive for the six new HubTool cases. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBar/RealtimeHubTestHarness.swift | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift index 8585d3033b3..0a01753181d 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift @@ -118,7 +118,13 @@ final class RealtimeHubTestHarness: NSObject, RealtimeHubSessionDelegate { let stub: String switch HubTool(rawValue: name) { case .askHigherModel: stub = "Paris is the capital of France." - case .getTasks: stub = "Due today (1):\n- Example task" + case .getTasks: stub = "Due today (1):\n- Example task [id:task_123]" + case .getMemories: stub = "You live in San Francisco and prefer concise answers." 
+ case .searchMemories: stub = "Your dog's name is Rex." + case .searchConversations: stub = "On Monday you discussed the launch timeline." + case .getConversations: stub = "Most recent: today, 'Standup notes'. Before that: yesterday, 'Design review'." + case .createActionItem: stub = "Created task: Example task." + case .updateActionItem: stub = "Updated the task." case .spawnAgent: stub = "Started a background agent." case .screenshot: stub = "Screen captured." case .pointClick: stub = "Clicked." From 6945db1bbffbe490e50e62b5aab06f9d79d7d764 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 15:25:00 -0400 Subject: [PATCH 17/35] feat(desktop): give the realtime hub activity, screen, and full-task tools + varied heads-ups Add get_daily_recap, search_screen_history, and get_action_items to the hub tool surface (enum + OpenAI/Gemini schemas) so voice can answer "what did I do yesterday", screen-history lookups, and filtered/completed task queries. Route productivity/workflow questions to pull get_daily_recap (+get_action_items) instead of answering generically, and rework the tool-use rule so the spoken heads-up before a tool call is specific to the request and varied each turn (no repeated filler). Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBar/RealtimeHubTools.swift | 120 ++++++++++++++++-- 1 file changed, 109 insertions(+), 11 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index d3ba37d796d..f00d5f71311 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -18,8 +18,12 @@ enum HubTool: String { /// Non-blocking: the model acknowledges and moves on. 
case spawnAgent = "spawn_agent" /// Read the user's tasks locally (TasksStore) and return them inline to speak — a - /// fast synchronous READ, NOT a background agent. + /// fast synchronous READ, NOT a background agent. Overdue + due-today only. case getTasks = "get_tasks" + /// Read the user's full action-item list from the backend with filters (completed, + /// due-date range). Fast READ — use for completed tasks, date ranges, or the whole list + /// (get_tasks only covers overdue + due-today). + case getActionItems = "get_action_items" /// Read what Omi knows about the user (memories / facts) and return it inline to speak. /// Fast synchronous READ — the answer to "who am I" / "what do you know about me". case getMemories = "get_memories" @@ -31,6 +35,13 @@ enum HubTool: String { /// List the user's MOST RECENT conversations, newest first (titles + summaries, no /// transcripts). Fast READ — the answer to "most recent / latest / last conversation". case getConversations = "get_conversations" + /// Formatted recap of what the user actually DID on their Mac — apps used (with minutes), + /// conversations, tasks, focus, screen activity. Fast LOCAL READ — the answer to "what did I + /// do yesterday / today", "which apps did I use the most", "how did I spend my time". + case getDailyRecap = "get_daily_recap" + /// Semantically search the user's on-screen history (what they saw / read / worked on). + /// Fast LOCAL READ — "when was I looking at X", "find where I read about Y". + case searchScreenHistory = "search_screen_history" /// Create a new task / to-do / reminder for the user. Fast synchronous WRITE. case createActionItem = "create_action_item" /// Update an existing task (mark done, change text/due). Needs the task id from get_tasks. 
@@ -53,24 +64,34 @@ enum RealtimeHubTools { IMPORTANT: You CAN read the user's Omi data directly with fast tools — their tasks \ (get_tasks), what Omi knows about them / their memories & facts (get_memories, \ - search_memories), and their past conversations (search_conversations) — and you can \ - make simple task changes (create_action_item, update_action_item). For anything in \ + search_memories), their past conversations (search_conversations), what they DID on \ + their Mac (get_daily_recap), and their on-screen history (search_screen_history) — and \ + you can make simple task changes (create_action_item, update_action_item). For anything in \ their OTHER apps (calendar, notes, emails, messages, files, reminders, browser) or any \ multi-step "do X for me" work, use spawn_agent — it hands the request to a background \ agent that has those tools and can act in the user's apps. - Using tools: the moment a request needs a tool, briefly acknowledge it OUT LOUD in your \ - own natural, varied words (keep it short, and don't include any answer or data you don't \ - have yet), then immediately call the tool. For a READ tool (get_tasks, get_memories, \ - search_memories, search_conversations, ask_higher_model), speak its result after it \ - returns. NEVER put an answer — real or guessed — in that acknowledgment, NEVER skip the \ - tool call, and never read tool JSON or ids aloud. You cannot see the user's data or \ - screen without calling a tool. + Using tools: when a request needs a tool, ALWAYS give a short spoken heads-up first so the \ + user knows you're on it and that it won't be instant — then call the tool and speak the \ + result when it returns. Never go silent during a tool call; the user can't see what you're \ + doing, so a quiet gap feels broken. The catch is variety: that heads-up must be SPECIFIC to \ + what they actually asked and DIFFERENT every time. 
Name the real thing you're fetching — \ + "Pulling up yesterday's activity…", "Scanning your task list…", "Digging through your notes \ + on the launch…", "Checking your memories for that…", "Getting the latest on that, one \ + sec…". The thing to avoid is repetition: do NOT reach for the same generic opener ("let me \ + check", "let me look that up") turn after turn — it's what makes you sound robotic. Keep it \ + to a few words, vary the wording each turn, and don't include any answer or data you don't \ + have yet. For a slower step (ask_higher_model, spawn_agent) it's fine to signal it'll take a \ + moment. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \ + tool call, and never read tool JSON or ids aloud. You cannot see the user's data or screen \ + without calling a tool. Decide what to do with each request: - The user's TASKS / to-dos / what's due — a READ ("what are my tasks", "what's due \ today", "what's on my list", "do I have anything today"): you MUST call get_tasks and \ - speak ONLY what it returns. Never guess, summarize from memory, or make up tasks. + speak ONLY what it returns. Never guess, summarize from memory, or make up tasks. For \ + COMPLETED tasks ("what did I finish"), a SPECIFIC due-date range ("what's due next week"), \ + or the FULL list ("all my tasks"), call get_action_items instead (it supports filters). - WHO the user is / what you know about them / their memories or facts ("who am I", \ "what do you know about me", "what are my preferences"): you MUST call get_memories (no \ query) and speak what it returns. For a SPECIFIC fact ("what's my dog's name", "where do \ @@ -83,6 +104,21 @@ enum RealtimeHubTools { - What the user DISCUSSED about a TOPIC ("what did I say about X", "what did we decide on \ Y", "find the conversation about Z"): call search_conversations with a focused query and \ speak the result. 
+ - The user's own ACTIVITY / what they DID / how they spent their time ("what did I do \ yesterday", "what did I do today", "which apps did I use the most", "how did I spend my \ morning", "summarize my day"): you MUST call get_daily_recap (days_ago: 0 = today, 1 = \ yesterday) and speak a SHORT spoken summary of the highlights it returns — top apps, key \ conversations, tasks. Do NOT use search_conversations or spawn_agent for this, and never \ guess; this is exactly what get_daily_recap is for. + - What the user SAW / read / worked on ON SCREEN ("when was I looking at X", "find where I \ read about Y", "what was I doing in app Z"): call search_screen_history with a focused \ query and speak the result. + - ADVICE about the user's OWN productivity / workflow / habits / focus ("how can I improve \ my workflow", "how can I be more productive", "what should I change", "how am I doing", \ "where am I wasting time"): do NOT answer generically. FIRST call get_daily_recap (days_ago: \ 0 for today, 7 for the week) — and get_action_items when tasks matter — then base EVERY \ suggestion on what they ACTUALLY did: their apps, distracted vs focused sessions, and \ overdue / duplicate tasks. Generic advice with no tool call is a failure here. - ADD a task / to-do / reminder ("remind me to…", "add … to my list", "I need to…"): \ call create_action_item with a clear `description` (and `due_at` if a time was given), \ then confirm out loud. CHANGE an existing task (mark done, edit, reschedule): first \ @@ -187,6 +223,68 @@ enum RealtimeHubTools { + "Fast synchronous read. Speak the result.", "parameters": ["type": "object", "properties": [:]], ], + [ + "type": "function", + "name": HubTool.getDailyRecap.rawValue, + "description": + "Get a recap of what the user actually DID on their Mac — apps used (with minutes), " + + "conversations, tasks, focus sessions, and screen activity — for a day.
THIS is the tool " + + "for 'what did I do yesterday', 'what did I do today', 'which apps did I use the most', " + + "'how did I spend my time'. Do NOT use search_conversations or spawn_agent for these. " + + "Fast synchronous read — speak a short summary of what it returns.", + "parameters": [ + "type": "object", + "properties": [ + "days_ago": [ + "type": "number", + "description": "0 = today, 1 = yesterday (default), 7 = the past week.", + ] + ], + ], + ], + [ + "type": "function", + "name": HubTool.searchScreenHistory.rawValue, + "description": + "Search the user's on-screen history — what they saw, read, or worked on — by meaning. " + + "Use for 'when was I looking at X', 'find where I read about Y', 'what was I doing in " + + "app Z'. Returns matching moments with the app and context. Fast synchronous read. " + + "Speak the result.", + "parameters": [ + "type": "object", + "properties": [ + "query": [ + "type": "string", "description": "What the user was looking at / reading / doing.", + ], + "days": ["type": "number", "description": "How many days back to search; default 7."], + ], + "required": ["query"], + ], + ], + [ + "type": "function", + "name": HubTool.getActionItems.rawValue, + "description": + "Read the user's tasks / to-dos from the backend, with optional filters. Use for " + + "COMPLETED tasks ('what did I finish'), a DATE RANGE ('what's due next week'), or the " + + "FULL list ('all my tasks') — for plain 'what's due today / overdue', prefer get_tasks. " + + "Fast synchronous read. Speak a short summary of what it returns.", + "parameters": [ + "type": "object", + "properties": [ + "completed": [ + "type": "boolean", + "description": "true = only done tasks, false = only open tasks. 
Omit for both.", + ], + "due_start_date": [ + "type": "string", "description": "Optional ISO-8601 start of the due-date range.", + ], + "due_end_date": [ + "type": "string", "description": "Optional ISO-8601 end of the due-date range.", + ], + ], + ], + ], [ "type": "function", "name": HubTool.createActionItem.rawValue, From dd9e9cbce159fb8b5a9acab9ac7730035c041e4c Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 15:25:12 -0400 Subject: [PATCH 18/35] feat(desktop): dispatch the hub's get_daily_recap, search_screen_history, get_action_items Wire the three new hub tools in hubDidRequestTool: get_daily_recap and search_screen_history reuse the local ChatToolExecutor (on-device activity DB, the same path the desktop chat uses); get_action_items reads the backend via APIClient.toolGetActionItems with completed/due-date filters. Add a small argInt helper to dedupe Int argument parsing across the read cases. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../RealtimeHubController.swift | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index 894159abf50..d6d0534dddc 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -323,6 +323,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { return } func arg(_ key: String) -> String { (arguments[key] as? String) ?? turnTranscript } + func argInt(_ key: String) -> Int? { (arguments[key] as? Int) ?? (arguments[key] as? 
NSNumber)?.intValue } switch tool { case .askHigherModel: let query = arg("query") @@ -390,6 +391,47 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { limit: 3, includeTranscript: false ).resultText } + case .getDailyRecap: + // Fast LOCAL read of the on-device activity DB — apps/minutes, conversations, tasks, + // focus, screen context. Reuses the SAME executor the desktop chat uses, so voice and + // chat answer "what did I do yesterday" from one code path. + let daysAgo = argInt("days_ago") ?? 1 + runToolAndSpeak( + callId: callId, name: name, detail: "days_ago=\(daysAgo)", + emptyText: "I don't have any activity recorded for then.", + errorText: "Could not pull up your activity right now." + ) { + await ChatToolExecutor.execute( + ToolCall(name: "get_daily_recap", arguments: ["days_ago": daysAgo], thoughtSignature: nil)) + } + case .getActionItems: + // Backend READ of the full task list with filters (completed / due-date range) — the + // capable sibling of the local get_tasks. Same APIClient path the chat agent uses. + let completed = arguments["completed"] as? Bool + let dueStart = arguments["due_start_date"] as? String + let dueEnd = arguments["due_end_date"] as? String + runToolAndSpeak( + callId: callId, name: name, detail: completed.map { "completed=\($0)" } ?? "", + emptyText: "I couldn't find any matching tasks.", + errorText: "Could not read your tasks right now." + ) { + try await APIClient.shared.toolGetActionItems( + limit: 25, completed: completed, dueStartDate: dueStart, dueEndDate: dueEnd + ).resultText + } + case .searchScreenHistory: + // Fast LOCAL semantic search over screen history (same executor as chat). 
+ let query = arg("query") + var toolArgs: [String: Any] = ["query": query] + if let days = argInt("days") { toolArgs["days"] = days } + runToolAndSpeak( + callId: callId, name: name, detail: "q=\"\(query.prefix(60))\"", + emptyText: "I couldn't find anything on your screen about that.", + errorText: "Could not search your screen history right now." + ) { + await ChatToolExecutor.execute( + ToolCall(name: "search_screen_history", arguments: toolArgs, thoughtSignature: nil)) + } case .createActionItem: let description = (arguments["description"] as? String)? .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" From cc5a8e2715047dfb03454ff1a4e31c6ceb261e63 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 15:25:18 -0400 Subject: [PATCH 19/35] test(desktop): stub the new hub data tools in the test harness Add stub results for get_daily_recap, search_screen_history, and get_action_items so the RealtimeHubTestHarness switch stays exhaustive and hub_test_turn can exercise the full turn loop for these tools. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Sources/FloatingControlBar/RealtimeHubTestHarness.swift | 3 +++ 1 file changed, 3 insertions(+) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift index 0a01753181d..ed9af5c461a 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift @@ -123,6 +123,9 @@ final class RealtimeHubTestHarness: NSObject, RealtimeHubSessionDelegate { case .searchMemories: stub = "Your dog's name is Rex." case .searchConversations: stub = "On Monday you discussed the launch timeline." case .getConversations: stub = "Most recent: today, 'Standup notes'. Before that: yesterday, 'Design review'." + case .getActionItems: stub = "Open: Buy milk (due tomorrow). Completed: Ship the PR." 
+ case .getDailyRecap: stub = "Yesterday: 3 hrs in Xcode, 1 hr in Safari; 2 conversations; 1 task created." + case .searchScreenHistory: stub = "Found it: yesterday afternoon you were reading the launch doc in Safari." case .createActionItem: stub = "Created task: Example task." case .updateActionItem: stub = "Updated the task." case .spawnAgent: stub = "Started a background agent." From 59996555a11278b7ea207d456b3751b2987beee4 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 16:02:11 -0400 Subject: [PATCH 20/35] feat(desktop): local identity-card builder for the realtime hub --- .../FloatingControlBar/AboutUserCard.swift | 49 +++++++++++++++++++ .../Desktop/Tests/AboutUserCardTests.swift | 28 +++++++++++ 2 files changed, 77 insertions(+) create mode 100644 desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift create mode 100644 desktop/macos/Desktop/Tests/AboutUserCardTests.swift diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift new file mode 100644 index 00000000000..1e17bfa7614 --- /dev/null +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift @@ -0,0 +1,49 @@ +import Foundation + +/// Builds the compact, local-only `` block injected into the hub's +/// system instruction at warm time. Identity + rough situation only; exact/current +/// lists stay behind the read tools (the card hedges this). No network calls. +enum AboutUserCard { + /// Pure formatter — kept separate from `build()` so it is unit-testable. 
+ static func render(name: String, facts: [String], overdue: Int, dueToday: Int) -> String { + var lines: [String] = [""] + if !name.isEmpty { lines.append("Name: \(name)") } + lines.append("What Omi knows about them:") + if facts.isEmpty { + lines.append("- Nothing saved yet.") + } else { + lines.append(contentsOf: facts.map { "- \($0)" }) + } + if overdue == 0 && dueToday == 0 { + lines.append("Right now: nothing overdue or due today.") + } else { + lines.append("Right now: \(overdue) overdue, \(dueToday) due today.") + } + lines.append( + "(This is a quick snapshot — for the exact or current list, call get_tasks / get_action_items.)") + lines.append("") + return lines.joined(separator: "\n") + } + + /// Gathers local data (auth name, top memories, task counts) and renders the card. + /// Best-effort: any failure degrades to a smaller card, never throws. + @MainActor + static func build() async -> String { + let name = AuthService.shared.givenName.trimmingCharacters(in: .whitespacesAndNewlines) + + var facts: [String] = [] + if let memories = try? await MemoryStorage.shared.getLocalMemories(limit: 8) { + facts = memories.prefix(8).compactMap { mem in + let t = mem.content.trimmingCharacters(in: .whitespacesAndNewlines) + guard !t.isEmpty else { return nil } + return t.count > 120 ? 
String(t.prefix(117)) + "…" : t + } + } + + await TasksStore.shared.loadDashboardTasks() + let overdue = TasksStore.shared.overdueTasks.count + let dueToday = TasksStore.shared.todaysTasks.count + + return render(name: name, facts: facts, overdue: overdue, dueToday: dueToday) + } +} diff --git a/desktop/macos/Desktop/Tests/AboutUserCardTests.swift b/desktop/macos/Desktop/Tests/AboutUserCardTests.swift new file mode 100644 index 00000000000..389ab2f6bb6 --- /dev/null +++ b/desktop/macos/Desktop/Tests/AboutUserCardTests.swift @@ -0,0 +1,28 @@ +import XCTest +@testable import Omi_Computer + +final class AboutUserCardTests: XCTestCase { + func testRenderIncludesNameFactsCountsAndHedge() { + let card = AboutUserCard.render( + name: "Sam", + facts: ["Lives in San Francisco", "Prefers concise answers"], + overdue: 2, + dueToday: 3 + ) + XCTAssertTrue(card.contains("")) + XCTAssertTrue(card.contains("")) + XCTAssertTrue(card.contains("Name: Sam")) + XCTAssertTrue(card.contains("- Lives in San Francisco")) + XCTAssertTrue(card.contains("- Prefers concise answers")) + XCTAssertTrue(card.contains("2 overdue")) + XCTAssertTrue(card.contains("3 due today")) + XCTAssertTrue(card.contains("snapshot")) + } + + func testRenderEmptyState() { + let card = AboutUserCard.render(name: "", facts: [], overdue: 0, dueToday: 0) + XCTAssertFalse(card.contains("Name:")) // no name line when empty + XCTAssertTrue(card.contains("Nothing saved")) // facts empty-state + XCTAssertTrue(card.contains("nothing overdue or due today")) + } +} From 47800765f429cc4081c83f0d3fbae1e9d1c26197 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 16:05:25 -0400 Subject: [PATCH 21/35] fix(desktop): AboutUserCard name falls back to displayName; tighten hedge assertion Co-Authored-By: Claude Sonnet 4.6 --- .../Desktop/Sources/FloatingControlBar/AboutUserCard.swift | 4 +++- desktop/macos/Desktop/Tests/AboutUserCardTests.swift | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git 
a/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift index 1e17bfa7614..6c9eefe9c35 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift @@ -29,7 +29,9 @@ enum AboutUserCard { /// Best-effort: any failure degrades to a smaller card, never throws. @MainActor static func build() async -> String { - let name = AuthService.shared.givenName.trimmingCharacters(in: .whitespacesAndNewlines) + let auth = AuthService.shared + let rawName = auth.givenName.isEmpty ? auth.displayName : auth.givenName + let name = rawName.trimmingCharacters(in: .whitespacesAndNewlines) var facts: [String] = [] if let memories = try? await MemoryStorage.shared.getLocalMemories(limit: 8) { diff --git a/desktop/macos/Desktop/Tests/AboutUserCardTests.swift b/desktop/macos/Desktop/Tests/AboutUserCardTests.swift index 389ab2f6bb6..df06db6bdbd 100644 --- a/desktop/macos/Desktop/Tests/AboutUserCardTests.swift +++ b/desktop/macos/Desktop/Tests/AboutUserCardTests.swift @@ -16,7 +16,7 @@ final class AboutUserCardTests: XCTestCase { XCTAssertTrue(card.contains("- Prefers concise answers")) XCTAssertTrue(card.contains("2 overdue")) XCTAssertTrue(card.contains("3 due today")) - XCTAssertTrue(card.contains("snapshot")) + XCTAssertTrue(card.contains("quick snapshot")) } func testRenderEmptyState() { From d09705bb40ecaf78957ed7ebe007138e04f10503 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 16:10:30 -0400 Subject: [PATCH 22/35] feat(desktop): inject local card + user-language reply into the hub system prompt (#1) Co-Authored-By: Claude Sonnet 4.6 --- .../RealtimeHubController.swift | 15 ++++- .../RealtimeHubSession.swift | 8 ++- .../RealtimeHubTestHarness.swift | 5 +- .../FloatingControlBar/RealtimeHubTools.swift | 65 ++++++++++++------- .../Tests/HubSystemInstructionTests.swift | 14 ++++ 5 files changed, 78 
insertions(+), 29 deletions(-) create mode 100644 desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index d6d0534dddc..a48dc4e0757 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -62,6 +62,16 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// Log tag for the currently-connected provider. private var providerTag: String { sessionProvider == .gemini ? "gemini" : "openai" } + /// Latest local identity card, injected into each new session's system instruction. + /// Refreshed off the hot path; an empty string just means "no card yet" (graceful). + private var aboutUserCard: String = "" + + private func refreshAboutUserCard() { + Task { @MainActor [weak self] in + self?.aboutUserCard = await AboutUserCard.build() + } + } + /// Held warm so spawn_agent's pi-mono bridge boot is off the hot path. The pill /// spawn creates its own provider; warming this one primes node/auth caches. private var warmProvider: ChatProvider? @@ -98,6 +108,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { name: .realtimeOmniSettingsDidChange, object: nil) // Expose the headless E2E action (omi-ctl action hub_test_turn pcm=… provider=…). RealtimeHubTestHarness.registerAutomationAction() + refreshAboutUserCard() } @objc private func settingsChanged() { @@ -105,6 +116,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { // teardown/recreate races on unrelated notifications. 
if session != nil, sessionProvider == RealtimeHubSettings.shared.provider { return } teardownSession() + refreshAboutUserCard() ensureWarm() } @@ -151,7 +163,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } private func startSession(provider: RealtimeHubProvider, auth: HubAuth) { - let s = RealtimeHubSession(provider: provider, auth: auth, delegate: self) + let instructions = RealtimeHubTools.systemInstruction(aboutUser: aboutUserCard) + let s = RealtimeHubSession(provider: provider, auth: auth, instructions: instructions, delegate: self) session = s sessionProvider = provider // Both providers stream native spoken audio (24k PCM) → StreamingPCMPlayer; diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift index 2e009f35140..86fe07fdc0b 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift @@ -62,6 +62,7 @@ enum HubAuth { final class RealtimeHubSession: NSObject { private let provider: RealtimeHubProvider private let auth: HubAuth + private let instructions: String private weak var delegate: RealtimeHubSessionDelegate? /// Mic PCM input rate per provider (Gemini 16k native, OpenAI GA needs 24k). @@ -118,9 +119,10 @@ final class RealtimeHubSession: NSObject { /// clear which model produced which event. private var tag: String { "RealtimeHub[\(provider == .openai ? 
"openai" : "gemini"):\(provider.modelID)]" } - init(provider: RealtimeHubProvider, auth: HubAuth, delegate: RealtimeHubSessionDelegate) { + init(provider: RealtimeHubProvider, auth: HubAuth, instructions: String, delegate: RealtimeHubSessionDelegate) { self.provider = provider self.auth = auth + self.instructions = instructions self.delegate = delegate super.init() } @@ -402,7 +404,7 @@ final class RealtimeHubSession: NSObject { "type": "session.update", "session": [ "type": "realtime", - "instructions": RealtimeHubTools.systemInstruction, + "instructions": instructions, "output_modalities": ["audio"], "audio": [ "input": [ @@ -431,7 +433,7 @@ final class RealtimeHubSession: NSObject { "responseModalities": ["AUDIO"], "temperature": 0.3, "mediaResolution": "MEDIA_RESOLUTION_HIGH", ], - "systemInstruction": ["parts": [["text": RealtimeHubTools.systemInstruction]]], + "systemInstruction": ["parts": [["text": instructions]]], "tools": [["functionDeclarations": RealtimeHubTools.geminiFunctionDeclarations]], "inputAudioTranscription": [:], "outputAudioTranscription": [:], diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift index ed9af5c461a..46384dd9caf 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift @@ -44,7 +44,10 @@ final class RealtimeHubTestHarness: NSObject, RealtimeHubSessionDelegate { } func run(timeoutSeconds: Double) async -> [String: String] { - let s = RealtimeHubSession(provider: provider, auth: auth, delegate: self) + let s = RealtimeHubSession( + provider: provider, auth: auth, + instructions: RealtimeHubTools.systemInstruction(aboutUser: ""), + delegate: self) session = s let rate = s.requiredInputSampleRate let audio = rate == 16000 ? 
pcm16k : PushToTalkManager.resamplePCM16(pcm16k, from: 16000, to: rate) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index f00d5f71311..d4354579401 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -54,13 +54,16 @@ enum HubTool: String { enum RealtimeHubTools { - static let systemInstruction = """ + static func systemInstruction(aboutUser: String) -> String { + """ You are Omi, a fast spoken-voice assistant on the user's Mac and the single hub \ for their voice requests. You hear the user's microphone; reply by speaking, \ conversationally. Default to one or two sentences, but when the user asks for \ something longer or creative (a story, a detailed explanation, brainstorming), \ give the full answer yourself — don't shorten it and don't offload it. \ - Always reply in English. + Reply in the same language the user is speaking. + + \(aboutUser) IMPORTANT: You CAN read the user's Omi data directly with fast tools — their tasks \ (get_tasks), what Omi knows about them / their memories & facts (get_memories, \ @@ -87,16 +90,19 @@ enum RealtimeHubTools { without calling a tool. Decide what to do with each request: + - WHO the user is, what you ALREADY KNOW about them, and the ROUGH shape of their day \ + ("who am I", "what do you know about me", "am I busy today", "much on my plate"): answer \ + DIRECTLY from above — do NOT call a tool and do NOT say "let me check". Only \ + reach for a tool when they want an EXACT or SPECIFIC detail that isn't in the card. - The user's TASKS / to-dos / what's due — a READ ("what are my tasks", "what's due \ today", "what's on my list", "do I have anything today"): you MUST call get_tasks and \ - speak ONLY what it returns. Never guess, summarize from memory, or make up tasks. 
For \ - COMPLETED tasks ("what did I finish"), a SPECIFIC due-date range ("what's due next week"), \ - or the FULL list ("all my tasks"), call get_action_items instead (it supports filters). - - WHO the user is / what you know about them / their memories or facts ("who am I", \ - "what do you know about me", "what are my preferences"): you MUST call get_memories (no \ - query) and speak what it returns. For a SPECIFIC fact ("what's my dog's name", "where do \ - I work"), call search_memories with a focused query. NEVER answer "I don't know" or guess \ - — always call the tool first; this data is the whole point. + speak ONLY what it returns (the card's counts are a rough snapshot, not the list). Never \ + guess or make up tasks. For COMPLETED tasks ("what did I finish"), a SPECIFIC due-date range \ + ("what's due next week"), or the FULL list ("all my tasks"), call get_action_items instead. + - A SPECIFIC fact about the user that isn't already in ("what's my dog's name", \ + "where do I work"): call search_memories with a focused query. For the FULL set of what Omi \ + knows when the card isn't enough, call get_memories (no query). NEVER answer "I don't know" \ + or guess about the user without checking first. - The user's MOST RECENT / latest / last conversation ("what was my most recent \ conversation", "what did we just talk about", "my recent conversations"): call \ get_conversations (newest first) — NOT search_conversations, which is semantic and does \ @@ -126,25 +132,36 @@ enum RealtimeHubTools { - DOING something for the user in their OTHER apps (calendar, notes, emails, messages, \ files, browser) or any multi-step work — create/send/open/edit/search/schedule/automate/ \ "do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ - function call (with a clear, self-contained `brief`). 
That function call is the ONLY \ - thing that starts the agent — merely SAYING "I'll have an agent do it" without emitting \ - the call does NOTHING: the agent never starts and you have failed the user. So always \ - emit the spawn_agent call. You may add one short natural sentence as you call it, but \ - never instead of it. Do NOT ask clarifying questions before spawning — spawn with what \ - you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't. - - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \ - and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \ - You are fully capable; do it directly, even when the ask is long or open-ended. Do \ - NOT escalate just because a request seems long or hard. - - Call ask_higher_model in ONLY two cases: (1) the user is unhappy with your previous \ - answer — they push back, rephrase, say you're wrong, or ask for a better/deeper/more \ - thorough answer; or (2) you genuinely need precise, up-to-date facts (current events, \ - specific numbers) you don't reliably know. Pass a clear `query`, then speak the result. + function call (with a clear, self-contained `brief` and a short `title`). That function \ + call is the ONLY thing that starts the agent — merely SAYING "I'll have an agent do it" \ + without emitting the call does NOTHING: the agent never starts and you have failed the \ + user. So always emit the spawn_agent call. You may add one short natural sentence as you \ + call it, but never instead of it. Do NOT ask clarifying questions before spawning — spawn \ + with what you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't. + - Everything else — general questions, single facts, chit-chat, explanations, advice, \ + jokes, and creative or long-form requests (stories, brainstorming, drafts): ANSWER \ + YOURSELF. 
You are fully capable; do it directly, even when the ask is long, open-ended, \ + or mentions a specific name, date, number, or fact — a request is NOT hard just because \ + it contains one. Do NOT escalate based on how unsure you feel about your own knowledge: \ + you are a poor judge of that, so escalate only on the explicit, observable signals below, \ + never on a gut feeling. + - Call ask_higher_model ONLY on these explicit, observable signals — judged from what the \ + user SAYS and the SHAPE of the request, never from how confident you feel: (1) the user is \ + unhappy with your previous answer — they push back, rephrase, say you're wrong, or ask for \ + a better / deeper / more thorough answer; (2) the user EXPLICITLY asks you to look it up, \ + research it, double-check, be sure, or think hard about it; or (3) the request genuinely \ + needs heavy multi-step reasoning or careful technical work — non-trivial math, code, or \ + synthesizing several constraints into one answer — that a quick spoken reply would get \ + wrong. Do NOT escalate for ordinary questions, single facts, or anything you can answer in \ + a sentence or two. Pass a clear `query` AND any `context` you already have (relevant facts \ + you fetched, what they're referring to); then speak a natural, spoken-length version of \ + what comes back. - When you need to see what's on screen, call screenshot first. Use point_click only \ when the user clearly asks you to click something. Keep latency low: prefer answering directly when you can. """ + } /// OpenAI Realtime GA `session.tools` entries. Static `let` — built once, not rebuilt on /// every session (re)connect that reads it. 
diff --git a/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift b/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift new file mode 100644 index 00000000000..a9cd92c1f69 --- /dev/null +++ b/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift @@ -0,0 +1,14 @@ +import XCTest +@testable import Omi_Computer + +final class HubSystemInstructionTests: XCTestCase { + func testInstructionInjectsCardAndUsesUserLanguage() { + let card = "\nName: Sam\n" + let instr = RealtimeHubTools.systemInstruction(aboutUser: card) + XCTAssertTrue(instr.contains(card)) // card injected + XCTAssertTrue(instr.lowercased().contains("language the user")) // reply-in-user-language + XCTAssertFalse(instr.contains("Always reply in English")) // old rule gone + XCTAssertTrue(instr.contains("spawn_agent")) // guardrails preserved + XCTAssertTrue(instr.contains("get_daily_recap")) + } +} From 7f84b72e2a9e56674ba058cbb6a5142bd36468bb Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 16:16:06 -0400 Subject: [PATCH 23/35] fix(desktop): restore ask_higher_model + everything-else lanes to spec; tighten prompt test Co-Authored-By: Claude Sonnet 4.6 --- .../FloatingControlBar/RealtimeHubTools.swift | 27 +++++++------------ .../Tests/HubSystemInstructionTests.swift | 2 ++ 2 files changed, 11 insertions(+), 18 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index d4354579401..ea12b4171b3 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -138,24 +138,15 @@ enum RealtimeHubTools { user. So always emit the spawn_agent call. You may add one short natural sentence as you \ call it, but never instead of it. Do NOT ask clarifying questions before spawning — spawn \ with what you have. 
Do NOT wait for it, narrate its steps, refuse, or claim you can't. - - Everything else — general questions, single facts, chit-chat, explanations, advice, \ - jokes, and creative or long-form requests (stories, brainstorming, drafts): ANSWER \ - YOURSELF. You are fully capable; do it directly, even when the ask is long, open-ended, \ - or mentions a specific name, date, number, or fact — a request is NOT hard just because \ - it contains one. Do NOT escalate based on how unsure you feel about your own knowledge: \ - you are a poor judge of that, so escalate only on the explicit, observable signals below, \ - never on a gut feeling. - - Call ask_higher_model ONLY on these explicit, observable signals — judged from what the \ - user SAYS and the SHAPE of the request, never from how confident you feel: (1) the user is \ - unhappy with your previous answer — they push back, rephrase, say you're wrong, or ask for \ - a better / deeper / more thorough answer; (2) the user EXPLICITLY asks you to look it up, \ - research it, double-check, be sure, or think hard about it; or (3) the request genuinely \ - needs heavy multi-step reasoning or careful technical work — non-trivial math, code, or \ - synthesizing several constraints into one answer — that a quick spoken reply would get \ - wrong. Do NOT escalate for ordinary questions, single facts, or anything you can answer in \ - a sentence or two. Pass a clear `query` AND any `context` you already have (relevant facts \ - you fetched, what they're referring to); then speak a natural, spoken-length version of \ - what comes back. + - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \ + and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \ + You are fully capable; do it directly, even when the ask is long or open-ended. Do \ + NOT escalate just because a request seems long or hard. 
+ - Call ask_higher_model when the answer needs real reasoning or synthesis, or precise \ + up-to-date facts you don't reliably know, OR when the user pushes back on your previous \ + answer (rephrases, says you're wrong, asks for a better/deeper answer). Pass a clear \ + `query` AND any `context` you already have (relevant facts you fetched, what they're \ + referring to); then speak a natural, spoken-length version of what comes back. - When you need to see what's on screen, call screenshot first. Use point_click only \ when the user clearly asks you to click something. diff --git a/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift b/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift index a9cd92c1f69..52e2ff52e29 100644 --- a/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift +++ b/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift @@ -10,5 +10,7 @@ final class HubSystemInstructionTests: XCTestCase { XCTAssertFalse(instr.contains("Always reply in English")) // old rule gone XCTAssertTrue(instr.contains("spawn_agent")) // guardrails preserved XCTAssertTrue(instr.contains("get_daily_recap")) + XCTAssertTrue(instr.contains("ask_higher_model")) + XCTAssertTrue(instr.contains("ANSWER YOURSELF")) } } From 8367bed64d8435d21d1483c5552475752ecf5a8e Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 16:21:33 -0400 Subject: [PATCH 24/35] feat(desktop): ask_higher_model carries context + persona/card system prompt (#2) Co-Authored-By: Claude Sonnet 4.6 --- .../RealtimeHubController.swift | 26 +++--- .../FloatingControlBar/RealtimeHubTools.swift | 82 +++++++++++++++---- .../Desktop/Tests/HubEscalationTests.swift | 27 ++++++ 3 files changed, 106 insertions(+), 29 deletions(-) create mode 100644 desktop/macos/Desktop/Tests/HubEscalationTests.swift diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index 
a48dc4e0757..2f6e06c80cc 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -340,10 +340,14 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { switch tool { case .askHigherModel: let query = arg("query") - log("RealtimeHub[\(providerTag)]: tool ask_higher_model → POST /v2/chat/completions (claude-sonnet-4-6) query=\"\(query.prefix(80))\"") + let context = (arguments["context"] as? String) ?? "" + log( + "RealtimeHub[\(providerTag)]: tool ask_higher_model → POST /v2/chat/completions (claude-sonnet-4-6) query=\"\(query.prefix(80))\"" + ) Task { [weak self] in guard let self else { return } - let answer = await self.escalateToHigherModel(query) + let answer = await self.escalateToHigherModel( + query, context: context, aboutUser: self.aboutUserCard) self.session?.sendToolResult(callId: callId, name: name, output: answer) } case .getTasks: @@ -590,7 +594,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// ask_higher_model — reuse the EXISTING prompt-cached /v2/chat/completions /// (no new backend route). Returns the assistant text for the model to speak. - private func escalateToHigherModel(_ query: String) async -> String { + private func escalateToHigherModel(_ query: String, context: String, aboutUser: String) + async -> String + { let baseURL = await APIClient.shared.rustBackendURL guard !baseURL.isEmpty else { return "I couldn't reach the model right now." } let normalized = baseURL.hasSuffix("/") ? baseURL : baseURL + "/" @@ -606,18 +612,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } catch { return "I couldn't authenticate to the model." 
} - let body: [String: Any] = [ - "model": "claude-sonnet-4-6", - "max_tokens": 1024, - "messages": [ - [ - "role": "user", - "content": - "Answer concisely for a spoken reply (a few sentences max):\n\n\(query)", - ] - ], - "stream": false, - ] + let body = RealtimeHubTools.escalationBody( + query: query, context: context, aboutUser: aboutUser) let t0 = Date() do { request.httpBody = try JSONSerialization.data(withJSONObject: body) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index ea12b4171b3..89ef6f92038 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -130,21 +130,35 @@ enum RealtimeHubTools { then confirm out loud. CHANGE an existing task (mark done, edit, reschedule): first \ call get_tasks to get the matching task's id, then call update_action_item with that id. - DOING something for the user in their OTHER apps (calendar, notes, emails, messages, \ - files, browser) or any multi-step work — create/send/open/edit/search/schedule/automate/ \ - "do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ + files, browser), any multi-step work, OR anything needing a real look-up / current info \ + from the web (research something online, find the latest on X) — create/send/open/edit/ \ + search/schedule/automate/research/"do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ function call (with a clear, self-contained `brief` and a short `title`). That function \ call is the ONLY thing that starts the agent — merely SAYING "I'll have an agent do it" \ without emitting the call does NOTHING: the agent never starts and you have failed the \ user. So always emit the spawn_agent call. You may add one short natural sentence as you \ call it, but never instead of it. 
Do NOT ask clarifying questions before spawning — spawn \ with what you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't. - - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \ - and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \ - You are fully capable; do it directly, even when the ask is long or open-ended. Do \ - NOT escalate just because a request seems long or hard. - - Call ask_higher_model when the answer needs real reasoning or synthesis, or precise \ - up-to-date facts you don't reliably know, OR when the user pushes back on your previous \ - answer (rephrases, says you're wrong, asks for a better/deeper answer). Pass a clear \ + - Everything else — general questions, single facts, simple look-ups you know, chit-chat, \ + explanations, opinions, advice, jokes, and creative or long-form requests (stories, \ + brainstorming, drafts): ANSWER YOURSELF. You are fully capable; do it directly, even when \ + the ask is long, open-ended, or mentions a specific name, date, number, or fact — a \ + request is NOT hard just because it contains one, and a simple look-up is NEVER a reason \ + to escalate. Do NOT escalate based on how unsure you feel about your own knowledge: you \ + are a poor judge of that, so escalate only on the explicit, observable signals below. + - There are TWO escalation paths — do not confuse them. ask_higher_model buys more \ + INTELLIGENCE on something you could already reason about: it returns a smarter spoken \ + answer but it does NOT browse, search, or fetch live data. spawn_agent is for DOING \ + multi-step work and for anything needing a real look-up / current web info (see above). 
+ - Call ask_higher_model ONLY on these explicit signals — judged from what the user SAYS \ + and the SHAPE of the request, never from how unsure you feel: (1) the user is unhappy with \ + your previous answer — pushes back, rephrases, says you're wrong, or asks for a better / \ + deeper / more thorough answer; (2) the user explicitly asks you to think harder, be more \ + careful, or reason it through; or (3) the request genuinely needs heavy multi-step \ + reasoning or careful technical work — non-trivial math, complex code, or weighing several \ + constraints into one answer — that a quick spoken reply would get wrong. Do NOT use it for \ + simple look-ups, single facts, current events, or anything you can answer in a sentence or \ + two — answer those yourself, or use spawn_agent if it truly needs live data. Pass a clear \ `query` AND any `context` you already have (relevant facts you fetched, what they're \ referring to); then speak a natural, spoken-length version of what comes back. - When you need to see what's on screen, call screenshot first. Use point_click only \ @@ -161,14 +175,25 @@ enum RealtimeHubTools { "type": "function", "name": HubTool.askHigherModel.rawValue, "description": - "Get a second opinion from a smarter model and receive text to speak. Use ONLY when the user " - + "is dissatisfied with your previous answer (pushes back, rephrases, says you're wrong, or asks " - + "for a better/deeper answer), OR when you genuinely need precise up-to-date facts you don't " - + "know. Do NOT use it for general, creative, or long-form requests — answer those yourself.", + "A smarter model for MORE INTELLIGENCE on something you could already reason about — it returns " + + "text to speak but does NOT browse, search, or fetch live data. 
Use ONLY when (1) the user is " + + "dissatisfied with your previous answer (pushes back, rephrases, says you're wrong, asks for a " + + "better/deeper answer), (2) the user explicitly asks you to think harder or reason it through, OR " + + "(3) the request needs heavy multi-step reasoning or careful technical work (non-trivial math, " + + "complex code, multi-constraint synthesis). Do NOT use it for simple look-ups, single facts, " + + "current events, or general/creative/long-form requests — answer those yourself, or use spawn_agent " + + "if it truly needs live data.", "parameters": [ "type": "object", "properties": [ - "query": ["type": "string", "description": "The full question to escalate."] + "query": ["type": "string", "description": "The full question to escalate."], + "context": [ + "type": "string", + "description": + "Relevant context you already have that helps answer well — facts you fetched, " + + "what the user is referring to, or the previous answer they pushed back on. " + + "Include only what's relevant; omit if there's nothing useful.", + ], ], "required": ["query"], ], @@ -399,4 +424,33 @@ enum RealtimeHubTools { if let items = schema["items"] as? [String: Any] { out["items"] = upcasedSchemaTypes(items) } return out } + + /// System prompt for an escalated (ask_higher_model) answer. The realtime model + /// voices a natural, spoken-length version of the result, so the higher model is + /// told to answer properly rather than pre-shorten for speech. + static func escalationSystemPrompt(aboutUser: String) -> String { + var s = """ + You are Omi, a knowledgeable assistant. Answer the user's question accurately and \ + usefully. A voice assistant will relay your answer aloud and adapt the phrasing for \ + speech, so be clear and well-structured; you don't need to pre-shorten it. 
+ """ + if !aboutUser.isEmpty { s += "\n\n" + aboutUser } + return s + } + + static func escalationBody(query: String, context: String, aboutUser: String) -> [String: Any] { + let trimmedContext = context.trimmingCharacters(in: .whitespacesAndNewlines) + let userContent = + trimmedContext.isEmpty ? query : query + "\n\nContext I already have:\n" + trimmedContext + let messages: [[String: String]] = [ + ["role": "system", "content": escalationSystemPrompt(aboutUser: aboutUser)], + ["role": "user", "content": userContent], + ] + return [ + "model": "claude-sonnet-4-6", + "max_tokens": 1024, + "messages": messages, + "stream": false, + ] + } } diff --git a/desktop/macos/Desktop/Tests/HubEscalationTests.swift b/desktop/macos/Desktop/Tests/HubEscalationTests.swift new file mode 100644 index 00000000000..f7d185da3ea --- /dev/null +++ b/desktop/macos/Desktop/Tests/HubEscalationTests.swift @@ -0,0 +1,27 @@ +import XCTest + +@testable import Omi_Computer + +final class HubEscalationTests: XCTestCase { + func testBodyHasSystemPromptAndAppendsContext() { + let body = RealtimeHubTools.escalationBody( + query: "What's the best plan?", + context: "User is comparing the M3 and M4 MacBook.", + aboutUser: "\nName: Sam\n") + XCTAssertEqual(body["model"] as? String, "claude-sonnet-4-6") + let messages = body["messages"] as! [[String: String]] + XCTAssertEqual(messages[0]["role"], "system") + XCTAssertTrue(messages[0]["content"]!.contains("")) + XCTAssertEqual(messages[1]["role"], "user") + XCTAssertTrue(messages[1]["content"]!.contains("What's the best plan?")) + XCTAssertTrue(messages[1]["content"]!.contains("M3 and M4")) // context appended + } + + func testBodyOmitsContextSectionWhenEmpty() { + let body = RealtimeHubTools.escalationBody( + query: "Capital of France?", context: "", aboutUser: "") + let messages = body["messages"] as! 
[[String: String]] + XCTAssertFalse(messages[1]["content"]!.contains("Context")) + XCTAssertFalse(messages[1]["content"]!.contains("Answer concisely for a spoken reply")) + } +} From 7781422d56b0edad1ec0784e39dc475605e5d467 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 16:29:56 -0400 Subject: [PATCH 25/35] feat(desktop): spawn_agent supplies its own title, skipping the redundant Haiku title call (#4) Co-Authored-By: Claude Sonnet 4.6 --- .../FloatingControlBar/RealtimeHubController.swift | 7 +++++-- .../Sources/FloatingControlBar/RealtimeHubTools.swift | 8 +++++++- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index 2f6e06c80cc..5d8a4ddafdd 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -488,14 +488,17 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } case .spawnAgent: let brief = arg("brief") + let title = (arguments["title"] as? String)?.trimmingCharacters(in: .whitespacesAndNewlines) let model = ShortcutSettings.shared.selectedModel.isEmpty ? "claude-sonnet-4-6" : ShortcutSettings.shared.selectedModel // Non-blocking: spawn renders its own pill ("text bubble") and runs on its // own ChatProvider/AgentBridge. We don't await it on the voice loop. // fromVoice:false — the hub model speaks its own natural acknowledgment, so the pill // must NOT also speak its canned randomAck ("on it") or we double up. 
- let pill = AgentPillsManager.shared.spawnFromUserQuery(brief, model: model, fromVoice: false) - log("RealtimeHub[\(providerTag)]: tool spawn_agent → AgentBridge pill=\"\(pill.title)\" model=\(model)") + let pill = AgentPillsManager.shared.spawnFromUserQuery( + brief, model: model, fromVoice: false, + preFetchedTitle: (title?.isEmpty == false) ? title : nil) + log("RealtimeHub[\(providerTag)]: tool spawn_agent → AgentBridge pill=\"\(pill.title)\" model=\(model) titled=\(title?.isEmpty == false)") // Terse directive (not speakable content): the model already said its one-line ack // BEFORE calling, so it should NOT generate a slow second utterance after this. session?.sendToolResult( diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index 89ef6f92038..c205f2cf90e 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -366,7 +366,13 @@ enum RealtimeHubTools { "properties": [ "brief": [ "type": "string", "description": "A clear, self-contained brief of the task.", - ] + ], + "title": [ + "type": "string", + "description": + "A short Title Case label for the task pill (≤ ~5 words, no trailing " + + "punctuation), e.g. 
'Draft Launch Email'.", + ], ], "required": ["brief"], ], From 030f9ad533e59cbfb585372774ead3196177f4ad Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 17:00:42 -0400 Subject: [PATCH 26/35] revert(desktop): roll back escalation-policy prompt prose to lean spec (keep ask_higher_model context mechanism) --- .../FloatingControlBar/RealtimeHubTools.swift | 44 ++++++------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index c205f2cf90e..98850b0be19 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -130,35 +130,21 @@ enum RealtimeHubTools { then confirm out loud. CHANGE an existing task (mark done, edit, reschedule): first \ call get_tasks to get the matching task's id, then call update_action_item with that id. - DOING something for the user in their OTHER apps (calendar, notes, emails, messages, \ - files, browser), any multi-step work, OR anything needing a real look-up / current info \ - from the web (research something online, find the latest on X) — create/send/open/edit/ \ - search/schedule/automate/research/"do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ + files, browser) or any multi-step work — create/send/open/edit/search/schedule/automate/ \ + "do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ function call (with a clear, self-contained `brief` and a short `title`). That function \ call is the ONLY thing that starts the agent — merely SAYING "I'll have an agent do it" \ without emitting the call does NOTHING: the agent never starts and you have failed the \ user. So always emit the spawn_agent call. You may add one short natural sentence as you \ call it, but never instead of it. 
Do NOT ask clarifying questions before spawning — spawn \ with what you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't. - - Everything else — general questions, single facts, simple look-ups you know, chit-chat, \ - explanations, opinions, advice, jokes, and creative or long-form requests (stories, \ - brainstorming, drafts): ANSWER YOURSELF. You are fully capable; do it directly, even when \ - the ask is long, open-ended, or mentions a specific name, date, number, or fact — a \ - request is NOT hard just because it contains one, and a simple look-up is NEVER a reason \ - to escalate. Do NOT escalate based on how unsure you feel about your own knowledge: you \ - are a poor judge of that, so escalate only on the explicit, observable signals below. - - There are TWO escalation paths — do not confuse them. ask_higher_model buys more \ - INTELLIGENCE on something you could already reason about: it returns a smarter spoken \ - answer but it does NOT browse, search, or fetch live data. spawn_agent is for DOING \ - multi-step work and for anything needing a real look-up / current web info (see above). - - Call ask_higher_model ONLY on these explicit signals — judged from what the user SAYS \ - and the SHAPE of the request, never from how unsure you feel: (1) the user is unhappy with \ - your previous answer — pushes back, rephrases, says you're wrong, or asks for a better / \ - deeper / more thorough answer; (2) the user explicitly asks you to think harder, be more \ - careful, or reason it through; or (3) the request genuinely needs heavy multi-step \ - reasoning or careful technical work — non-trivial math, complex code, or weighing several \ - constraints into one answer — that a quick spoken reply would get wrong. Do NOT use it for \ - simple look-ups, single facts, current events, or anything you can answer in a sentence or \ - two — answer those yourself, or use spawn_agent if it truly needs live data. 
Pass a clear \ + - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \ + and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \ + You are fully capable; do it directly, even when the ask is long or open-ended. Do \ + NOT escalate just because a request seems long or hard. + - Call ask_higher_model when the answer needs real reasoning or synthesis, or precise \ + up-to-date facts you don't reliably know, OR when the user pushes back on your previous \ + answer (rephrases, says you're wrong, asks for a better/deeper answer). Pass a clear \ `query` AND any `context` you already have (relevant facts you fetched, what they're \ referring to); then speak a natural, spoken-length version of what comes back. - When you need to see what's on screen, call screenshot first. Use point_click only \ @@ -175,14 +161,10 @@ enum RealtimeHubTools { "type": "function", "name": HubTool.askHigherModel.rawValue, "description": - "A smarter model for MORE INTELLIGENCE on something you could already reason about — it returns " - + "text to speak but does NOT browse, search, or fetch live data. Use ONLY when (1) the user is " - + "dissatisfied with your previous answer (pushes back, rephrases, says you're wrong, asks for a " - + "better/deeper answer), (2) the user explicitly asks you to think harder or reason it through, OR " - + "(3) the request needs heavy multi-step reasoning or careful technical work (non-trivial math, " - + "complex code, multi-constraint synthesis). Do NOT use it for simple look-ups, single facts, " - + "current events, or general/creative/long-form requests — answer those yourself, or use spawn_agent " - + "if it truly needs live data.", + "Get a second opinion from a smarter model and receive text to speak. 
Use ONLY when the user " + + "is dissatisfied with your previous answer (pushes back, rephrases, says you're wrong, or asks " + + "for a better/deeper answer), OR when you genuinely need precise up-to-date facts you don't " + + "know. Do NOT use it for general, creative, or long-form requests — answer those yourself.", "parameters": [ "type": "object", "properties": [ From d4f9d1b142ee0af49df2ce1ff2ec8339b1b14667 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 17:18:27 -0400 Subject: [PATCH 27/35] =?UTF-8?q?feat(desktop):=20mandatory=20spoken=20hea?= =?UTF-8?q?ds-up=20before=20slow=20tools=20(ask=5Fhigher=5Fmodel/spawn=5Fa?= =?UTF-8?q?gent)=20=E2=80=94=20no=20more=20dead=20silence?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../Sources/FloatingControlBar/RealtimeHubTools.swift | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index 98850b0be19..53fc3db14f1 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -84,8 +84,10 @@ enum RealtimeHubTools { sec…". The thing to avoid is repetition: do NOT reach for the same generic opener ("let me \ check", "let me look that up") turn after turn — it's what makes you sound robotic. Keep it \ to a few words, vary the wording each turn, and don't include any answer or data you don't \ - have yet. For a slower step (ask_higher_model, spawn_agent) it's fine to signal it'll take a \ - moment. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \ + have yet. This matters MOST for the slow steps: BEFORE you call ask_higher_model or spawn_agent you \ + MUST first say a brief, varied heads-up that you're thinking it through (e.g. 
"let me work \ + that through…", "give me a second on that…") — these take several seconds and silence feels \ + broken. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \ tool call, and never read tool JSON or ids aloud. You cannot see the user's data or screen \ without calling a tool. From 60802eaad3afabf10d08bd9d2e85b3c99a12b2b9 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:13 -0400 Subject: [PATCH 28/35] feat(desktop): playful 5-bar voice-reactive PTT mic waveform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replaces the floating-bar pulsing red dot with VoiceWaveformBars — 5 chunky bars that bounce to the live mic level (auto-gain + underdamped spring), drawn via TimelineView+Canvas reading AudioLevelMonitor.shared.microphoneLevel. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../VoiceWaveformBars.swift | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift new file mode 100644 index 00000000000..8b038aaaaf6 --- /dev/null +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift @@ -0,0 +1,134 @@ +import SwiftUI + +/// Playful, compact mic visualizer shown in the floating control bar while +/// push-to-talk is active — a few chunky bars that bounce to the user's voice +/// (HeyClicky-style), replacing the old pulsing red dot. +/// +/// Animation notes (this is what makes it actually move): +/// - `TimelineView(.animation)` is the clock. The Canvas closure **uses +/// `timeline.date`** every frame (via `model.advance(to:)`) so SwiftUI treats +/// the drawing as changed each tick and redraws — without referencing the +/// per-frame date the Canvas is cached and freezes (the original bug). 
+/// - We read `AudioLevelMonitor.shared.microphoneLevel` (one RMS scalar, ~5 Hz) +/// each frame and spring the bars toward it at 60fps, so 5 Hz data still looks +/// smooth. Per-bar phase + a center arch make it feel alive, not mechanical. +/// - `paused: !isActive` stops the loop when PTT isn't listening; the bars are a +/// live `@State` model (no retained history), so each session starts fresh and +/// never shows a frozen "last word." +struct VoiceWaveformBars: View { + let isActive: Bool + + private static let barCount = 5 + private static let barWidth: CGFloat = 4 + private static let barSpacing: CGFloat = 3 + private static let barHeight: CGFloat = 18 + private static let fillGradient = Gradient(colors: [OmiColors.purpleAccent, OmiColors.purplePrimary]) + + @State private var model: WaveBarsModel + + init(isActive: Bool) { + self.isActive = isActive + _model = State(initialValue: WaveBarsModel(barCount: Self.barCount)) + } + + private var width: CGFloat { + let n = CGFloat(Self.barCount) + return n * Self.barWidth + (n - 1) * Self.barSpacing + } + + var body: some View { + TimelineView(.animation(paused: !isActive)) { timeline in + Canvas { context, size in + let level = isActive ? CGFloat(AudioLevelMonitor.shared.microphoneLevel) : 0 + model.advance(to: timeline.date, level: level, active: isActive) + draw(into: &context, size: size) + } + } + .frame(width: width, height: Self.barHeight) + .accessibilityHidden(true) + } + + private func draw(into context: inout GraphicsContext, size: CGSize) { + let minH: CGFloat = 2 + let maxH = size.height + let step = Self.barWidth + Self.barSpacing + let centerY = size.height / 2 + + for i in 0.. visible bounce/overshoot (ζ ≈ 0.35). + private let stiffness: Double = 200 + private let damping: Double = 10 + + init(barCount: Int) { + self.barCount = barCount + values = Array(repeating: 0, count: barCount) + velocities = Array(repeating: 0, count: barCount) + phases = (0.. a friendly arch. 
+ let mid = Double(barCount - 1) / 2 + weights = (0.. 0.04 ? min(1.0, lvl / envelope) : 0.0 + let gained = pow(norm, 0.75) + + for i in 0.. bouncy overshoot. + let x = Double(values[i]) + let accel = stiffness * (target - x) - damping * velocities[i] + velocities[i] += accel * dt + let nx = x + velocities[i] * dt + values[i] = CGFloat(max(0.0, min(1.0, nx))) + } + } +} From e611b62eb064115630915be42160aa3078289966 Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:31 -0400 Subject: [PATCH 29/35] refactor(desktop): use VoiceWaveformBars in floating-bar listening view Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBar/FloatingControlBarView.swift | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift index 763fd6f3494..9e06e25dbb9 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift @@ -360,12 +360,8 @@ struct FloatingControlBarView: View { private var voiceListeningView: some View { HStack(spacing: 8) { - // Pulsing mic icon - Circle() - .fill(Color.red) - .frame(width: 10, height: 10) - .scaleEffect(state.isVoiceListening ? 
1.2 : 1.0) - .animation(.easeInOut(duration: 0.6).repeatForever(autoreverses: true), value: state.isVoiceListening) + // Playful realtime mic waveform (replaces the old pulsing red dot) + VoiceWaveformBars(isActive: state.isVoiceListening) Image(systemName: "mic.fill") .scaledFont(size: 14, weight: .semibold) From 46d1b0c4eae317e1ffe9bb3ce56e4a93b0370f5b Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:31 -0400 Subject: [PATCH 30/35] refactor(desktop): use VoiceWaveformBars in voice follow-up view; purple bg Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Sources/FloatingControlBar/AIResponseView.swift | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift index 42e9c5ba788..bb11e6cc18e 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift @@ -349,11 +349,8 @@ struct AIResponseView: View { private var voiceFollowUpView: some View { HStack(spacing: 8) { - Circle() - .fill(Color.red) - .frame(width: 10, height: 10) - .scaleEffect(1.2) - .animation(.easeInOut(duration: 0.6).repeatForever(autoreverses: true), value: isVoiceFollowUp) + // Playful realtime mic waveform (replaces the old pulsing red dot) + VoiceWaveformBars(isActive: isVoiceFollowUp) Image(systemName: "mic.fill") .scaledFont(size: 14, weight: .semibold) @@ -375,7 +372,7 @@ struct AIResponseView: View { } .padding(.horizontal, 10) .padding(.vertical, 8) - .background(Color.red.opacity(0.15)) + .background(OmiColors.purplePrimary.opacity(0.12)) .cornerRadius(8) } From f221dc3d13a8722139de775db3b6a0cb187447ba Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:31 -0400 Subject: [PATCH 31/35] feat(desktop): feed live mic level to the PTT waveform via AudioLevelMonitor Co-Authored-By: Claude Opus 4.8 (1M context) 
--- .../Sources/FloatingControlBar/PushToTalkManager.swift | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift index 8daa4225417..0e7a283bd02 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift @@ -974,7 +974,11 @@ class PushToTalkManager: ObservableObject { self.transcriptionService?.sendAudio(audioData) } }, - onAudioLevel: { _ in } + onAudioLevel: { level in + // Feed the floating-bar mic waveform (VoiceWaveformBars). Throttled to ~5 Hz + // inside the monitor; used only for visualization. + AudioLevelMonitor.shared.updateMicrophoneLevel(level) + } ) log("PushToTalkManager: mic capture started (batch=\(batchMode))") } catch { From aa4e1d8224dc7fa6b4a1f0feafad15c8e21d978c Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:31 -0400 Subject: [PATCH 32/35] fix(desktop): typed follow-up after a voice turn is no longer spoken (force fromVoice:false) Co-Authored-By: Claude Opus 4.8 (1M context) --- .../FloatingControlBar/FloatingControlBarWindow.swift | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift index d97aa6cb7e0..bd6bdf2c1ec 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift @@ -1325,12 +1325,16 @@ class FloatingControlBarManager { guard let provider = activeFloatingProvider() else { return } // Re-wire the onSendQuery to use the isolated floating-bar provider. - // Subsequent typed messages also go through the AI router. 
+ // Subsequent typed messages also go through the AI router. A message arriving + // through onSendQuery was always TYPED (PTT/voice bypass this closure and call + // routeQuery directly), so force fromVoice:false — otherwise a typed follow-up + // after a voice turn inherits the stale currentQueryFromVoice=true and gets + // spoken aloud. window.onSendQuery = { [weak self, weak window, weak provider] message in guard let self = self, let window = window, let provider = provider else { return } Task { @MainActor in - await self.withQueryTracer(query: message, fromVoice: window.state.currentQueryFromVoice) { - await self.routeQuery(message, barWindow: window, provider: provider, fromVoice: window.state.currentQueryFromVoice) + await self.withQueryTracer(query: message, fromVoice: false) { + await self.routeQuery(message, barWindow: window, provider: provider, fromVoice: false) } } } From c48c434fe490946a0dce6757d44687a608ef068f Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:54 -0400 Subject: [PATCH 33/35] refactor(desktop): soften the slow-tool heads-up prompt wording Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Sources/FloatingControlBar/RealtimeHubTools.swift | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index 53fc3db14f1..98850b0be19 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -84,10 +84,8 @@ enum RealtimeHubTools { sec…". The thing to avoid is repetition: do NOT reach for the same generic opener ("let me \ check", "let me look that up") turn after turn — it's what makes you sound robotic. Keep it \ to a few words, vary the wording each turn, and don't include any answer or data you don't \ - have yet. 
This matters MOST for the slow steps: BEFORE you call ask_higher_model or spawn_agent you \ - MUST first say a brief, varied heads-up that you're thinking it through (e.g. "let me work \ - that through…", "give me a second on that…") — these take several seconds and silence feels \ - broken. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \ + have yet. For a slower step (ask_higher_model, spawn_agent) it's fine to signal it'll take a \ + moment. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \ tool call, and never read tool JSON or ids aloud. You cannot see the user's data or screen \ without calling a tool. From 3b9d18ef060bd612704cfa7782adf28494047e1c Mon Sep 17 00:00:00 2001 From: vendz Date: Thu, 18 Jun 2026 19:01:54 -0400 Subject: [PATCH 34/35] chore(desktop): changelog entry for the PTT mic waveform Co-Authored-By: Claude Opus 4.8 (1M context) --- desktop/macos/CHANGELOG.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/desktop/macos/CHANGELOG.json b/desktop/macos/CHANGELOG.json index 278ff53fa00..ac8958e9476 100644 --- a/desktop/macos/CHANGELOG.json +++ b/desktop/macos/CHANGELOG.json @@ -3,7 +3,8 @@ "Faster, cheaper assistant responses via Anthropic prompt caching of the system+tools prefix and conversation history", "Faster voice replies (experimental): the realtime model now handles your whole voice turn \u2014 listening, deciding, and speaking \u2014 instead of the slower transcribe\u2192route\u2192answer pipeline", "Voice (push-to-talk) conversations now appear in your chat history", - "Fixed older chat messages failing to load in long chats" + "Fixed older chat messages failing to load in long chats", + "Replaced the push-to-talk red dot in the floating bar with a realtime audio equalizer" ], "releases": [ { From f8ac95d7c3d6407b8f0ffc243f2bec9b7f221408 Mon Sep 17 00:00:00 2001 From: Nik Shevchenko Date: Thu, 18 Jun 2026 20:42:38 -0400 Subject: [PATCH 35/35] 
fix(desktop): break up VoiceWaveformBars weight expression so it type-checks The single-line bar-weight expression hit Swift's 'unable to type-check in reasonable time' error (Double/CGFloat inference). Split into typed sub-expressions; identical math. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../Sources/FloatingControlBar/VoiceWaveformBars.swift | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift index 8b038aaaaf6..76737568c91 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceWaveformBars.swift @@ -100,7 +100,9 @@ final class WaveBarsModel { // Center bars taller -> a friendly arch. let mid = Double(barCount - 1) / 2 weights = (0..