diff --git a/app/lib/services/devices/plaud_connection.dart b/app/lib/services/devices/plaud_connection.dart index 92a33ffeaa6..b13978cab19 100644 --- a/app/lib/services/devices/plaud_connection.dart +++ b/app/lib/services/devices/plaud_connection.dart @@ -288,8 +288,7 @@ class PlaudDeviceConnection extends DeviceConnection { @override Future performGetImageListener({ required void Function(OrientedImage orientedImage) onImageReceived, - }) async => - null; + }) async => null; @override Future>?> performGetAccelListener({void Function(int)? onAccelChange}) async => null; @@ -321,15 +320,15 @@ class PlaudDeviceConnection extends DeviceConnection { List _toBytes32(int v) => [v & 0xFF, (v >> 8) & 0xFF, (v >> 16) & 0xFF, (v >> 24) & 0xFF]; List _toBytes64(int v) => [ - v & 0xFF, - (v >> 8) & 0xFF, - (v >> 16) & 0xFF, - (v >> 24) & 0xFF, - (v >> 32) & 0xFF, - (v >> 40) & 0xFF, - (v >> 48) & 0xFF, - (v >> 56) & 0xFF, - ]; + v & 0xFF, + (v >> 8) & 0xFF, + (v >> 16) & 0xFF, + (v >> 24) & 0xFF, + (v >> 32) & 0xFF, + (v >> 40) & 0xFF, + (v >> 48) & 0xFF, + (v >> 56) & 0xFF, + ]; int _toInt32(List b) => b[0] | (b[1] << 8) | (b[2] << 16) | (b[3] << 24); } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift index 42e9c5ba788..bb11e6cc18e 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/AIResponseView.swift @@ -349,11 +349,8 @@ struct AIResponseView: View { private var voiceFollowUpView: some View { HStack(spacing: 8) { - Circle() - .fill(Color.red) - .frame(width: 10, height: 10) - .scaleEffect(1.2) - .animation(.easeInOut(duration: 0.6).repeatForever(autoreverses: true), value: isVoiceFollowUp) + // Playful realtime mic waveform (replaces the old pulsing red dot) + VoiceWaveformBars(isActive: isVoiceFollowUp) Image(systemName: "mic.fill") .scaledFont(size: 14, weight: .semibold) @@ -375,7 
+372,7 @@ struct AIResponseView: View { } .padding(.horizontal, 10) .padding(.vertical, 8) - .background(Color.red.opacity(0.15)) + .background(OmiColors.purplePrimary.opacity(0.12)) .cornerRadius(8) } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift new file mode 100644 index 00000000000..6c9eefe9c35 --- /dev/null +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/AboutUserCard.swift @@ -0,0 +1,51 @@ +import Foundation + +/// Builds the compact, local-only `` block injected into the hub's +/// system instruction at warm time. Identity + rough situation only; exact/current +/// lists stay behind the read tools (the card hedges this). No network calls. +enum AboutUserCard { + /// Pure formatter — kept separate from `build()` so it is unit-testable. + static func render(name: String, facts: [String], overdue: Int, dueToday: Int) -> String { + var lines: [String] = [""] + if !name.isEmpty { lines.append("Name: \(name)") } + lines.append("What Omi knows about them:") + if facts.isEmpty { + lines.append("- Nothing saved yet.") + } else { + lines.append(contentsOf: facts.map { "- \($0)" }) + } + if overdue == 0 && dueToday == 0 { + lines.append("Right now: nothing overdue or due today.") + } else { + lines.append("Right now: \(overdue) overdue, \(dueToday) due today.") + } + lines.append( + "(This is a quick snapshot — for the exact or current list, call get_tasks / get_action_items.)") + lines.append("") + return lines.joined(separator: "\n") + } + + /// Gathers local data (auth name, top memories, task counts) and renders the card. + /// Best-effort: any failure degrades to a smaller card, never throws. + @MainActor + static func build() async -> String { + let auth = AuthService.shared + let rawName = auth.givenName.isEmpty ? 
auth.displayName : auth.givenName + let name = rawName.trimmingCharacters(in: .whitespacesAndNewlines) + + var facts: [String] = [] + if let memories = try? await MemoryStorage.shared.getLocalMemories(limit: 8) { + facts = memories.prefix(8).compactMap { mem in + let t = mem.content.trimmingCharacters(in: .whitespacesAndNewlines) + guard !t.isEmpty else { return nil } + return t.count > 120 ? String(t.prefix(117)) + "…" : t + } + } + + await TasksStore.shared.loadDashboardTasks() + let overdue = TasksStore.shared.overdueTasks.count + let dueToday = TasksStore.shared.todaysTasks.count + + return render(name: name, facts: facts, overdue: overdue, dueToday: dueToday) + } +} diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift index f39cf6618f3..8608cb771ec 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarState.swift @@ -51,21 +51,6 @@ struct FloatingBarNotification: Identifiable, Equatable { } } -/// The high-level voice activity the floating bar is reflecting right now. Derived -/// from the lower-level PTT/hub flags so the status indicator has a single, ordered -/// source of truth (each state has exactly one visual treatment). -enum VoiceActivity: Equatable { - /// Nothing happening — the bar rests as a calm, barely-breathing sliver. - case idle - /// User is holding push-to-talk; we're capturing their voice (red, "you"). - case listening - /// Turn committed, waiting on the model's reply — the model may answer late, - /// so this MUST read as "working, wait" rather than "done" (cool autonomous swirl). - case thinking - /// The model is speaking its reply (warm, audio-reactive waveform — "it"). - case speaking -} - /// Observable object holding the state for the floating control bar. 
@MainActor class FloatingControlBarState: NSObject, ObservableObject { @@ -108,35 +93,6 @@ class FloatingControlBarState: NSObject, ObservableObject { @Published var isVoiceListening: Bool = false @Published var isVoiceLocked: Bool = false @Published var voiceTranscript: String = "" - /// True after a voice turn is committed and we're waiting on the model's reply - /// (vs. still recording) — drives the "Thinking…/Responding…" indicator so the user - /// knows to wait rather than re-pressing (which would interrupt a slow reply). - @Published var isVoiceThinking: Bool = false - /// True while the model is actually speaking its reply (native audio playing or the - /// AVSpeech fallback talking). Distinct from `isVoiceThinking` so the indicator can - /// show a clearly different "it's talking" treatment vs. "it's working". - @Published var isVoiceSpeaking: Bool = false - /// Smoothed 0…1 output amplitude of the model's spoken reply, sampled from the - /// playback engine. Drives the speaking waveform so it reacts to the actual voice - /// (premium feel) rather than animating blindly. 0 when not speaking. - @Published var voiceLevel: CGFloat = 0 - - /// Single ordered source of truth for the status indicator. Listening wins (the user - /// is actively talking), then speaking, then thinking, else idle — by construction the - /// hub sets these mutually exclusively, the ordering just makes barge-in race-safe. - var voiceActivity: VoiceActivity { - if isVoiceListening { return .listening } - if isVoiceSpeaking { return .speaking } - if isVoiceThinking { return .thinking } - return .idle - } - - /// Whether any voice turn is in flight — keeps the bar expanded across the whole - /// listening → thinking → speaking arc so the indicator stays visible (one expand, - /// one collapse per turn — no resize churn mid-turn). 
- var isVoiceActive: Bool { - isVoiceListening || isVoiceThinking || isVoiceSpeaking - } // Voice follow-up state (PTT while AI conversation is active) @Published var isVoiceFollowUp: Bool = false @@ -180,8 +136,6 @@ class FloatingControlBarState: NSObject, ObservableObject { isVoiceFollowUp = false voiceFollowUpTranscript = "" currentQueryFromVoice = false - isVoiceSpeaking = false - voiceLevel = 0 lastConversationActivityAt = nil } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift index 9188f5d11b3..9e06e25dbb9 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarView.swift @@ -33,11 +33,9 @@ struct FloatingControlBarView: View { .animation(.spring(response: 0.35, dampingFraction: 0.82), value: state.currentNotification?.id) } - /// Whether the bar chrome should stretch to fill the window width. Stays full-width - /// for the whole voice turn (listening → thinking → speaking) so the status indicator - /// has room and the bar resizes exactly once per turn. 
+ /// Whether the bar chrome should stretch to fill the window width private var barNeedsFullWidth: Bool { - isHovering || state.showingAIConversation || state.isVoiceActive + isHovering || state.showingAIConversation || state.isVoiceListening } private var barChrome: some View { @@ -85,7 +83,7 @@ struct FloatingControlBarView: View { } } .overlay(alignment: .topTrailing) { - if isHovering && !state.isVoiceActive { + if isHovering && !state.isVoiceListening { Button { openFloatingBarSettings() } label: { @@ -281,8 +279,8 @@ struct FloatingControlBarView: View { private var controlBarView: some View { Group { - if state.isVoiceActive && !state.isVoiceFollowUp { - voiceActiveView + if state.isVoiceListening && !state.isVoiceFollowUp { + voiceListeningView .padding(.horizontal, 6) .padding(.vertical, 3) .frame(height: 50) @@ -308,11 +306,11 @@ struct FloatingControlBarView: View { } } - /// Minimal resting indicator shown when not hovering and no voice turn is active — - /// a calm, slowly breathing sliver. (Active turns render `voiceActiveView` instead.) + /// Minimal thin bar shown when not hovering private var compactCircleView: some View { - VoiceActivityIndicator(activity: state.voiceActivity, level: state.voiceLevel) - .frame(width: 28, height: 14) + RoundedRectangle(cornerRadius: 3) + .fill(Color.white.opacity(0.5)) + .frame(width: 28, height: 6) } private func compactToggle(_ title: String, isOn: Binding) -> some View { @@ -360,15 +358,16 @@ struct FloatingControlBarView: View { } } - /// Unified expanded voice view for the whole turn. The status indicator carries the - /// state (listening / thinking / speaking) visually; the text is just the helpful - /// detail (transcript, "Release to send", "Thinking…"). One element, no jarring swaps. 
- private var voiceActiveView: some View { + private var voiceListeningView: some View { HStack(spacing: 8) { - VoiceActivityIndicator(activity: state.voiceActivity, level: state.voiceLevel) - .frame(width: 34, height: 18) + // Playful realtime mic waveform (replaces the old pulsing red dot) + VoiceWaveformBars(isActive: state.isVoiceListening) - if state.isVoiceLocked && state.isVoiceListening { + Image(systemName: "mic.fill") + .scaledFont(size: 14, weight: .semibold) + .foregroundColor(.white) + + if state.isVoiceLocked { Text("LOCKED") .scaledFont(size: 10, weight: .bold) .foregroundColor(.orange) @@ -378,31 +377,21 @@ struct FloatingControlBarView: View { .cornerRadius(4) } - // Dim only the "Release to send" hint; live transcript / status reads brighter. - let isHint = state.voiceActivity == .listening && state.voiceTranscript.isEmpty - Text(voiceStatusText) - .scaledFont(size: 13) - .foregroundColor(.white.opacity(isHint ? 0.5 : 0.85)) - .lineLimit(1) - .truncationMode(.head) - } - } - - /// The detail text beside the indicator for the current voice state. The indicator - /// itself carries the state visually; this is just the helpful detail. - private var voiceStatusText: String { - switch state.voiceActivity { - case .listening: - if !state.voiceTranscript.isEmpty { return state.voiceTranscript } - return state.isVoiceLocked - ? "Tap \(shortcutSettings.pttShortcut.displayLabel) to send" - : "Release \(shortcutSettings.pttShortcut.displayLabel) to send" - case .thinking: - return "Thinking…" - case .speaking: - return "Speaking…" - case .idle: - return "" + if !state.voiceTranscript.isEmpty { + Text(state.voiceTranscript) + .scaledFont(size: 13) + .foregroundColor(.white.opacity(0.8)) + .lineLimit(1) + .truncationMode(.head) + } else { + Text( + state.isVoiceLocked + ? 
"Tap \(shortcutSettings.pttShortcut.displayLabel) to send" + : "Release \(shortcutSettings.pttShortcut.displayLabel) to send" + ) + .scaledFont(size: 13) + .foregroundColor(.white.opacity(0.5)) + } } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift index 7dc37d7d622..bd6bdf2c1ec 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/FloatingControlBarWindow.swift @@ -41,8 +41,6 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { private var suppressHoverResize = false private var inputHeightCancellable: AnyCancellable? private var responseHeightCancellable: AnyCancellable? - private var voiceActivityCancellable: AnyCancellable? - private var collapseWorkItem: DispatchWorkItem? private var resizeWorkItem: DispatchWorkItem? /// Saved center point from before chat opened, used to restore position on close. private var preChatCenter: NSPoint? @@ -87,7 +85,6 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { self.maxSize = FloatingControlBarWindow.maxBarSize setupViews() - setupVoiceActivityObserver() if ShortcutSettings.shared.draggableBarEnabled, let savedPosition = UserDefaults.standard.string(forKey: FloatingControlBarWindow.positionKey) { @@ -522,54 +519,6 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { inputHeightCancellable = nil } - /// Single owner of the voice-turn expand/collapse. The bar is wide whenever a voice - /// turn is active (`isVoiceActive` = listening || thinking || speaking) and collapses - /// to the resting sliver when it ends — derived reactively from the published flags - /// instead of imperative resize calls scattered across the PTT/hub code (which had to - /// coordinate via a `skipResize` flag). 
- private func setupVoiceActivityObserver() { - voiceActivityCancellable = state.$isVoiceListening - .combineLatest(state.$isVoiceThinking, state.$isVoiceSpeaking) - .map { $0 || $1 || $2 } - .removeDuplicates() - .receive(on: DispatchQueue.main) - .sink { [weak self] active in - self?.onVoiceActiveChanged(active) - } - } - - /// Expand immediately so the window is already wide when the indicator + text render - /// (a delayed expand flashes the content cramped in the sliver first). Defer the - /// collapse a beat so the transient listening→thinking dip on PTT-up — `isVoiceActive` - /// momentarily clears before commitTurn sets thinking — doesn't blink the bar shut. - private func onVoiceActiveChanged(_ active: Bool) { - collapseWorkItem?.cancel() - collapseWorkItem = nil - if active { - applyVoiceExpansion(true) - } else { - let work = DispatchWorkItem { [weak self] in self?.applyVoiceExpansion(false) } - collapseWorkItem = work - DispatchQueue.main.asyncAfter(deadline: .now() + 0.12, execute: work) - } - } - - private func applyVoiceExpansion(_ active: Bool) { - // Onboarding shows no separate bar; follow-up and the AI conversation own their - // own layout, so the voice indicator never drives the window size in those modes. - guard UserDefaults.standard.bool(forKey: "hasCompletedOnboarding"), - !state.isVoiceFollowUp else { return } - if active { - guard !state.showingAIConversation else { return } - resizeForPTTState(expanded: true, animated: false) // snap — content is ready now - } else { - // Collapse only when nothing else needs the window expanded. 
- guard !state.showingAIConversation, !state.showingAIResponse, - state.currentNotification == nil, !state.isHoveringBar else { return } - resizeForPTTState(expanded: false, animated: true) - } - } - func updateAIResponse(type: String, text: String) { guard state.showingAIConversation else { return } @@ -670,7 +619,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { /// Resize for hover expand/collapse — anchored from center so the circle grows outward. func resizeForHover(expanded: Bool) { - guard !state.showingAIConversation, !state.isVoiceActive, !state.isShowingNotification, !suppressHoverResize else { return } + guard !state.showingAIConversation, !state.isVoiceListening, !state.isShowingNotification, !suppressHoverResize else { return } resizeWorkItem?.cancel() resizeWorkItem = nil @@ -679,7 +628,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { let doResize: () -> Void = { [weak self] in guard let self = self else { return } guard !self.state.showingAIConversation, - !self.state.isVoiceActive, + !self.state.isVoiceListening, !self.state.isShowingNotification, !self.suppressHoverResize else { return } @@ -709,16 +658,12 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { } } - /// Resize window for PTT state (expanded when listening, compact circle when idle). - /// Expand snaps (animated:false) so the indicator + text never flash cramped while the - /// window grows; collapse animates for a smooth shrink back to the resting sliver. - func resizeForPTTState(expanded: Bool, animated: Bool = true) { + /// Resize window for PTT state (expanded when listening, compact circle when idle) + func resizeForPTTState(expanded: Bool) { let size = expanded ? NSSize(width: FloatingControlBarWindow.expandedWidth, height: FloatingControlBarWindow.expandedBarSize.height) : FloatingControlBarWindow.minBarSize - // Idempotent: skip when already at the target size (avoids a no-op resize). 
- if abs(frame.width - size.width) < 1, abs(frame.height - size.height) < 1 { return } - resizeAnchored(to: size, makeResizable: false, animated: animated) + resizeAnchored(to: size, makeResizable: false, animated: true) } func showNotification(_ notification: FloatingBarNotification, animated: Bool = true) { @@ -737,7 +682,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { state.currentNotification = nil let targetSize: NSSize - if state.isVoiceActive { + if state.isVoiceListening { targetSize = NSSize(width: Self.expandedWidth, height: Self.expandedBarSize.height) } else { targetSize = state.isHoveringBar ? Self.expandedBarSize : Self.minBarSize @@ -748,7 +693,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { /// Restore the compact pill size when we temporarily surface the bar outside /// of an active hover, notification, voice session, or AI conversation. func normalizeForTemporaryShow() { - guard !state.showingAIConversation, !state.isVoiceActive, state.currentNotification == nil else { return } + guard !state.showingAIConversation, !state.isVoiceListening, state.currentNotification == nil else { return } resizeAnchored(to: Self.minBarSize, makeResizable: false, animated: false, anchorTop: true) } @@ -892,7 +837,7 @@ class FloatingControlBarWindow: NSPanel, NSWindowDelegate { minimumWidth = FloatingControlBarWindow.expandedWidth } else if state.currentNotification != nil { minimumWidth = FloatingControlBarWindow.notificationWidth - } else if state.isVoiceActive { + } else if state.isVoiceListening { minimumWidth = FloatingControlBarWindow.expandedWidth } else if state.isHoveringBar { minimumWidth = FloatingControlBarWindow.expandedBarSize.width @@ -1380,12 +1325,16 @@ class FloatingControlBarManager { guard let provider = activeFloatingProvider() else { return } // Re-wire the onSendQuery to use the isolated floating-bar provider. - // Subsequent typed messages also go through the AI router. 
+ // Subsequent typed messages also go through the AI router. A message arriving + // through onSendQuery was always TYPED (PTT/voice bypass this closure and call + // routeQuery directly), so force fromVoice:false — otherwise a typed follow-up + // after a voice turn inherits the stale currentQueryFromVoice=true and gets + // spoken aloud. window.onSendQuery = { [weak self, weak window, weak provider] message in guard let self = self, let window = window, let provider = provider else { return } Task { @MainActor in - await self.withQueryTracer(query: message, fromVoice: window.state.currentQueryFromVoice) { - await self.routeQuery(message, barWindow: window, provider: provider, fromVoice: window.state.currentQueryFromVoice) + await self.withQueryTracer(query: message, fromVoice: false) { + await self.routeQuery(message, barWindow: window, provider: provider, fromVoice: false) } } } @@ -1737,6 +1686,11 @@ class FloatingControlBarManager { return window?.state } + /// Resize the floating bar for PTT state changes. 
+ func resizeForPTT(expanded: Bool) { + window?.resizeForPTTState(expanded: expanded) + } + // MARK: - AI Query private func prepareVisibleQueryState(_ message: String, in barWindow: FloatingControlBarWindow, fromVoice: Bool) { diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift index 3eca7fcdd98..0e7a283bd02 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/PushToTalkManager.swift @@ -523,10 +523,6 @@ class PushToTalkManager: ObservableObject { state = .finalizing finalizeWorkItem?.cancel() finalizeWorkItem = nil - // Flags only — the window keeps the bar expanded into "thinking" because commitTurn - // sets isVoiceThinking before the reactive resize observer settles (so isVoiceActive - // never dips), which is why there's no flicker and no skip-resize coordination here. - updateBarState() // Stop mic immediately — no more audio capture audioCaptureService?.stopCapture() @@ -563,10 +559,10 @@ class PushToTalkManager: ObservableObject { // Real speech — instant local ack + commit. The hub speaks the reply and // dispatches tools itself; no transcript/router/LLM hop here. if ShortcutSettings.shared.pttSoundsEnabled { ackSound?.play() } - barState?.voiceTranscript = "…" RealtimeHubController.shared.commitTurn() - // Leave the bar showing "…"; the hub controller exits the voice UI on turn - // completion (so we skip the clearing updateBarState()). + // Collapse the bar on release — the hub speaks its reply as audio (no inline + // status UI), the same as the legacy voice path. 
+ updateBarState() AnalyticsManager.shared.floatingBarPTTEnded( mode: finalizedMode, hadTranscript: true, transcriptLength: 0) log("PushToTalkManager: hub turn committed (instant ack)") @@ -729,14 +725,14 @@ class PushToTalkManager: ObservableObject { isCurrentSessionFollowUp = false - // Reset state. The reactive resize observer won't collapse the bar when a query is in - // flight or a conversation is open — it guards on showingAIConversation/showingAIResponse, - // which openAIInputWithQuery sets (to the correct response size) right after this. + // Reset state — skip PTT collapse resize when we have a query, + // because openAIInputWithQuery will resize to the correct size. + // Also skip resize when in follow-up mode (panel is already at response size). state = .idle transcriptSegments = [] lastInterimText = "" currentContextSnapshot = nil - updateBarState() + updateBarState(skipResize: hasQuery || wasFollowUp) guard hasQuery else { log("PushToTalkManager: no transcript to send") @@ -978,7 +974,11 @@ class PushToTalkManager: ObservableObject { self.transcriptionService?.sendAudio(audioData) } }, - onAudioLevel: { _ in } + onAudioLevel: { level in + // Feed the floating-bar mic waveform (VoiceWaveformBars). Throttled to ~5 Hz + // inside the monitor; used only for visualization. 
+ AudioLevelMonitor.shared.updateMicrophoneLevel(level) + } ) log("PushToTalkManager: mic capture started (batch=\(batchMode))") } catch { @@ -1037,8 +1037,9 @@ class PushToTalkManager: ObservableObject { // MARK: - Bar State Sync - private func updateBarState() { + private func updateBarState(skipResize: Bool = false) { guard let barState = barState else { return } + let wasListening = barState.isVoiceListening let isShowingVoiceUI = (state == .listening || state == .lockedListening) barState.isVoiceListening = isShowingVoiceUI barState.isVoiceLocked = (state == .lockedListening) @@ -1047,9 +1048,16 @@ class PushToTalkManager: ObservableObject { barState.voiceTranscript = "" barState.voiceFollowUpTranscript = "" } - // The bar's expand/collapse is derived reactively from these flags by the window - // (FloatingControlBarWindow.setupVoiceActivityObserver) — one resize per turn, no - // imperative calls or skip-flags to keep in sync here. + + // Skip resize when in follow-up mode, expanded AI conversation, or during onboarding + // (during onboarding the floating bar shouldn't appear as a separate window) + let isOnboarding = !UserDefaults.standard.bool(forKey: "hasCompletedOnboarding") + guard !skipResize && !barState.isVoiceFollowUp && !barState.showingAIConversation && !isOnboarding else { return } + if barState.isVoiceListening && !wasListening { + FloatingControlBarManager.shared.resizeForPTT(expanded: true) + } else if !barState.isVoiceListening && wasListening { + FloatingControlBarManager.shared.resizeForPTT(expanded: false) + } } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift index cf8e7356e4c..5d8a4ddafdd 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubController.swift @@ -46,6 +46,15 @@ final class RealtimeHubController: 
NSObject, RealtimeHubSessionDelegate { /// Consecutive failed (re)connects with no surviving session — caps churn on a hard /// failure. Reset when a socket survives past the idle window or a turn completes. private var hubReconnectStrikes = 0 + /// After this many consecutive fast failures (e.g. a stale/revoked key failing auth), + /// the hub stops re-warming so it doesn't hammer a dead endpoint. + private static let maxReconnectStrikes = 5 + /// True only while a session is connected + authenticated for `sessionProvider`. This is + /// what gates `isActive`: a PTT turn enters hub mode only when the hub is genuinely + /// connected right now; otherwise it transparently uses the legacy cascade. Set in + /// hubDidConnect (fires post-auth, on "ready") and cleared on teardown/error, so a + /// stale/revoked key — which never connects — never costs the user a turn. + private var hubConnected = false /// True between commit and turn-done — used to detect barge-in (a new PTT while /// the previous reply is still in flight). private var responding = false @@ -53,61 +62,70 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// Log tag for the currently-connected provider. private var providerTag: String { sessionProvider == .gemini ? "gemini" : "openai" } + /// Latest local identity card, injected into each new session's system instruction. + /// Refreshed off the hot path; an empty string just means "no card yet" (graceful). + private var aboutUserCard: String = "" + + private func refreshAboutUserCard() { + Task { @MainActor [weak self] in + self?.aboutUserCard = await AboutUserCard.build() + } + } + /// Held warm so spawn_agent's pi-mono bridge boot is off the hot path. The pill /// spawn creates its own provider; warming this one primes node/auth caches. private var warmProvider: ChatProvider? 
private override init() { super.init() - // Clear "speaking" when the AVSpeech fallback finishes (native audio uses the - // player's drain callback instead). - speech.delegate = self } /// In-flight ephemeral mint guard (managed users). private var minting = false /// True when the hub should drive this PTT turn. Read by PushToTalkManager at PTT - /// start. BYOK users are ready immediately (own key); managed users are ready only - /// once a warm session exists (token minted + connecting) — otherwise PTT falls - /// back to the legacy cascade for that turn. + /// start. The hub is the default voice path (no opt-in toggle). var isActive: Bool { - guard RealtimeHubSettings.shared.isEnabled else { return false } - let provider = RealtimeHubSettings.shared.provider - if APIKeyService.byokKey(provider.byokProvider) != nil { return true } - return session != nil && sessionProvider == provider + // Drive a turn only when the hub is actually CONNECTED + authenticated for the + // currently-selected provider. A turn never enters hub mode on a key/token that can't + // connect (stale/revoked key, failed mint, mid-reconnect, or a just-switched provider): + // PTT transparently uses the legacy cascade instead, so a broken hub never costs the + // user a turn. The hub re-warms in the background and flips this true once it connects. + hubConnected && sessionProvider == RealtimeHubSettings.shared.provider } func setup(barState: FloatingControlBarState) { self.barState = barState - // Register the observer exactly once — duplicate registrations (re-entrant - // setup) fired settingsChanged N times, each tearing down + recreating the - // socket, which orphaned a connecting session (Gemini 1001/1008 closes). + // The hub provider follows the "Voice Model" picker, so re-warm when it changes — + // observe the live settings notification (posted by the picker, RealtimeOmniSettings + // setters, and AutoModelSelector). 
Register exactly once — duplicate registrations + // (re-entrant setup) fired settingsChanged N times, each tearing down + recreating + // the socket, which orphaned a connecting session (Gemini 1001/1008 closes). NotificationCenter.default.removeObserver( - self, name: .realtimeHubSettingsDidChange, object: nil) + self, name: .realtimeOmniSettingsDidChange, object: nil) NotificationCenter.default.addObserver( self, selector: #selector(settingsChanged), - name: .realtimeHubSettingsDidChange, object: nil) + name: .realtimeOmniSettingsDidChange, object: nil) // Expose the headless E2E action (omi-ctl action hub_test_turn pcm=… provider=…). RealtimeHubTestHarness.registerAutomationAction() + refreshAboutUserCard() } @objc private func settingsChanged() { - // Only reconnect if enabled and the provider actually changed — avoids - // redundant teardown/recreate races on unrelated notifications. - if !RealtimeHubSettings.shared.isEnabled { teardownSession(); return } + // Only reconnect if the provider actually changed — avoids redundant + // teardown/recreate races on unrelated notifications. if session != nil, sessionProvider == RealtimeHubSettings.shared.provider { return } teardownSession() + refreshAboutUserCard() ensureWarm() } // MARK: - Warm session lifecycle (kept open between turns) - /// Open the WS now if it isn't already (no-op if disabled or already warm). - /// BYOK → connect client-direct with the user's key (Phase 1). Otherwise, if - /// signed in → mint a server-side ephemeral token (Phase 2) and connect with it. + /// Open the WS now if it isn't already (no-op if already warm). BYOK → connect + /// client-direct with the user's key. Otherwise, if signed in → mint a server-side + /// ephemeral token and connect with it. 
func ensureWarm() { - guard RealtimeHubSettings.shared.isEnabled else { return } let provider = RealtimeHubSettings.shared.provider if session != nil, sessionProvider == provider { return } if session != nil { teardownSession() } @@ -117,7 +135,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } else if AuthService.shared.isSignedIn { mintAndConnect(provider: provider) } else { - log("RealtimeHub: enabled but no BYOK key and not signed in — hub unavailable (cascade).") + log("RealtimeHub: no BYOK key and not signed in — hub unavailable (cascade).") } } @@ -137,39 +155,22 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { log("⚠️ RealtimeHub: ephemeral mint failed / not entitled — staying on cascade") return } - // Provider/enable may have changed while minting; only connect if still wanted. - guard RealtimeHubSettings.shared.isEnabled, - RealtimeHubSettings.shared.provider == provider, self.session == nil + // Provider may have changed while minting; only connect if still wanted. + guard RealtimeHubSettings.shared.provider == provider, self.session == nil else { return } self.startSession(provider: provider, auth: .ephemeral(token)) } } private func startSession(provider: RealtimeHubProvider, auth: HubAuth) { - let s = RealtimeHubSession(provider: provider, auth: auth, delegate: self) + let instructions = RealtimeHubTools.systemInstruction(aboutUser: aboutUserCard) + let s = RealtimeHubSession(provider: provider, auth: auth, instructions: instructions, delegate: self) session = s sessionProvider = provider // Both providers stream native spoken audio (24k PCM) → StreamingPCMPlayer; // AVSpeech is only a no-audio fallback. if pcmPlayer == nil { - let p = StreamingPCMPlayer(sampleRate: 24000) - // Feed the live output amplitude to the speaking waveform — but only while we're - // actually in the speaking state, so publishing `voiceLevel` never re-renders the - // bar outside that window. 
- p.onLevel = { [weak self] level in - guard let self, self.barState?.isVoiceSpeaking == true else { return } - self.barState?.voiceLevel = CGFloat(level) - } - // The reply isn't truly over until the buffered audio finishes draining — only - // then do we drop "speaking" and let the bar collapse back to idle. - p.onPlayingChanged = { [weak self] playing in - guard let self, let barState = self.barState else { return } - if !playing { - barState.isVoiceSpeaking = false - barState.voiceLevel = 0 - } - } - pcmPlayer = p + pcmPlayer = StreamingPCMPlayer(sampleRate: 24000) } s.start() log( @@ -184,6 +185,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { session?.stop() session = nil sessionProvider = nil + hubConnected = false // no live session → PTT falls back to the cascade until re-warm } // MARK: - PTT integration @@ -202,9 +204,6 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { audioReceivedThisTurn = false turnRecorded = false lastTurnAt = Date() - barState?.isVoiceThinking = false // new turn → we're recording again, not waiting - barState?.isVoiceSpeaking = false // any prior reply is being cut off below - barState?.voiceLevel = 0 pcmPlayer?.stop() // stop any prior reply locally if speech.isSpeaking { speech.stopSpeaking(at: .immediate) } if bargeIn { @@ -246,11 +245,6 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { /// PTT-up: end the turn; the model now responds (and may call tools). func commitTurn() { responding = true - // Show a distinct "waiting on the model" state (not the red recording dot, which - // reads as "still listening") so the user knows to wait rather than re-press. Setting - // this keeps the bar's `isVoiceActive` true across the PTT-up → thinking handoff, so - // the window stays expanded (the window observes the flags and resizes itself). 
- barState?.isVoiceThinking = true // (The screen frame is sent at turn START — see beginTurn — so it has time to // upload/decode before the model answers. Nothing to attach here.) session?.commitInputTurn() @@ -273,6 +267,7 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { func hubDidConnect() { lastWarmAt = Date() + hubConnected = true // authenticated + ready — PTT may now route turns to the hub log("RealtimeHub: connected (\(sessionProvider?.displayName ?? "?"))") } @@ -297,11 +292,6 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } func hubDidReceiveAudio(_ pcm24k: Data) { - if !audioReceivedThisTurn { - // First audio of the turn: it's no longer thinking, it's speaking. - barState?.isVoiceThinking = false - barState?.isVoiceSpeaking = true - } audioReceivedThisTurn = true pcmPlayer?.enqueue(pcm24k) // native spoken audio (OpenAI + Gemini) } @@ -318,6 +308,25 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } } + /// Run an async tool `body`, then speak its result: on throw → `errorText`, on an + /// empty/whitespace result → `emptyText`. Shared by the data read/write tool cases so the + /// Task / do-catch / blank-check / log / sendToolResult tail lives in exactly one place. + private func runToolAndSpeak( + callId: String, name: String, detail: String = "", + emptyText: String, errorText: String, + _ body: @escaping () async throws -> String + ) { + Task { [weak self] in + guard let self else { return } + var out: String + do { out = try await body() } catch { out = errorText } + if out.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty { out = emptyText } + let suffix = detail.isEmpty ? "" : " \(detail)" + log("RealtimeHub[\(self.providerTag)]: tool \(name)\(suffix) → \(out.prefix(60))") + self.session?.sendToolResult(callId: callId, name: name, output: out) + } + } + func hubDidRequestTool(name: String, callId: String, argumentsJSON: String) { let arguments = (try? 
JSONSerialization.jsonObject(with: Data(argumentsJSON.utf8)) as? [String: Any]) ?? [:] @@ -326,13 +335,19 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { session?.sendToolResult(callId: callId, name: name, output: "Unknown tool.") return } + func arg(_ key: String) -> String { (arguments[key] as? String) ?? turnTranscript } + func argInt(_ key: String) -> Int? { (arguments[key] as? Int) ?? (arguments[key] as? NSNumber)?.intValue } switch tool { case .askHigherModel: - let query = (arguments["query"] as? String) ?? turnTranscript - log("RealtimeHub[\(providerTag)]: tool ask_higher_model → POST /v2/chat/completions (claude-sonnet-4-6) query=\"\(query.prefix(80))\"") + let query = arg("query") + let context = (arguments["context"] as? String) ?? "" + log( + "RealtimeHub[\(providerTag)]: tool ask_higher_model → POST /v2/chat/completions (claude-sonnet-4-6) query=\"\(query.prefix(80))\"" + ) Task { [weak self] in guard let self else { return } - let answer = await self.escalateToHigherModel(query) + let answer = await self.escalateToHigherModel( + query, context: context, aboutUser: self.aboutUserCard) self.session?.sendToolResult(callId: callId, name: name, output: answer) } case .getTasks: @@ -343,8 +358,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { await TasksStore.shared.loadDashboardTasks() let overdue = TasksStore.shared.overdueTasks let today = TasksStore.shared.todaysTasks + // Include the task id (for update_action_item) — the model is told never to speak ids. 
func list(_ items: [TaskActionItem]) -> String { - items.prefix(15).map { "- \($0.description)" }.joined(separator: "\n") + items.prefix(15).map { "- \($0.description) [id:\($0.id)]" }.joined(separator: "\n") } var out = "" if !overdue.isEmpty { out += "Overdue (\(overdue.count)):\n\(list(overdue))\n" } @@ -353,16 +369,136 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { log("RealtimeHub[\(self.providerTag)]: tool get_tasks → \(overdue.count) overdue, \(today.count) today") self.session?.sendToolResult(callId: callId, name: name, output: out) } + case .getMemories: + // Fast READ — "who am I" / "what do you know about me". Backend memories+facts. + runToolAndSpeak( + callId: callId, name: name, + emptyText: "I don't have any memories saved about you yet.", + errorText: "Could not read your memories right now." + ) { try await APIClient.shared.toolGetMemories(limit: 15).resultText } + case .searchMemories: + let query = arg("query") + runToolAndSpeak( + callId: callId, name: name, detail: "q=\"\(query.prefix(60))\"", + emptyText: "I couldn't find anything about that.", + errorText: "Could not search your memories right now." + ) { try await APIClient.shared.toolSearchMemories(query: query, limit: 5).resultText } + case .searchConversations: + // Capped for voice: top 5, summaries only (no full transcripts). + let query = arg("query") + runToolAndSpeak( + callId: callId, name: name, detail: "q=\"\(query.prefix(60))\"", + emptyText: "I couldn't find a conversation about that.", + errorText: "Could not search your conversations right now." + ) { + try await APIClient.shared.toolSearchConversations( + query: query, limit: 5, includeTranscript: false + ).resultText + } + case .getConversations: + // Fast READ — most recent conversations, newest first (backend orders created_at DESC). + // Capped for voice: top 3, summaries only. This is the recency path; search_conversations + // is semantic and must NOT be used for "most recent". 
+ runToolAndSpeak( + callId: callId, name: name, + emptyText: "I don't see any recent conversations.", + errorText: "Could not read your recent conversations right now." + ) { + try await APIClient.shared.toolGetConversations( + limit: 3, includeTranscript: false + ).resultText + } + case .getDailyRecap: + // Fast LOCAL read of the on-device activity DB — apps/minutes, conversations, tasks, + // focus, screen context. Reuses the SAME executor the desktop chat uses, so voice and + // chat answer "what did I do yesterday" from one code path. + let daysAgo = argInt("days_ago") ?? 1 + runToolAndSpeak( + callId: callId, name: name, detail: "days_ago=\(daysAgo)", + emptyText: "I don't have any activity recorded for then.", + errorText: "Could not pull up your activity right now." + ) { + await ChatToolExecutor.execute( + ToolCall(name: "get_daily_recap", arguments: ["days_ago": daysAgo], thoughtSignature: nil)) + } + case .getActionItems: + // Backend READ of the full task list with filters (completed / due-date range) — the + // capable sibling of the local get_tasks. Same APIClient path the chat agent uses. + let completed = arguments["completed"] as? Bool + let dueStart = arguments["due_start_date"] as? String + let dueEnd = arguments["due_end_date"] as? String + runToolAndSpeak( + callId: callId, name: name, detail: completed.map { "completed=\($0)" } ?? "", + emptyText: "I couldn't find any matching tasks.", + errorText: "Could not read your tasks right now." + ) { + try await APIClient.shared.toolGetActionItems( + limit: 25, completed: completed, dueStartDate: dueStart, dueEndDate: dueEnd + ).resultText + } + case .searchScreenHistory: + // Fast LOCAL semantic search over screen history (same executor as chat). 
+ let query = arg("query") + var toolArgs: [String: Any] = ["query": query] + if let days = argInt("days") { toolArgs["days"] = days } + runToolAndSpeak( + callId: callId, name: name, detail: "q=\"\(query.prefix(60))\"", + emptyText: "I couldn't find anything on your screen about that.", + errorText: "Could not search your screen history right now." + ) { + await ChatToolExecutor.execute( + ToolCall(name: "search_screen_history", arguments: toolArgs, thoughtSignature: nil)) + } + case .createActionItem: + let description = (arguments["description"] as? String)? + .trimmingCharacters(in: .whitespacesAndNewlines) ?? "" + let dueAt = arguments["due_at"] as? String + guard !description.isEmpty else { + session?.sendToolResult( + callId: callId, name: name, output: "No task description was given.") + return + } + runToolAndSpeak( + callId: callId, name: name, detail: "\"\(description.prefix(60))\"", + emptyText: "Task created.", + errorText: "Could not create the task right now." + ) { + try await APIClient.shared.toolCreateActionItem( + description: description, dueAt: dueAt + ).resultText + } + case .updateActionItem: + guard let id = (arguments["id"] as? String), !id.isEmpty else { + session?.sendToolResult( + callId: callId, name: name, + output: "Missing the task id — call get_tasks first to find it.") + return + } + let completed = arguments["completed"] as? Bool + let newDescription = arguments["description"] as? String + let dueAt = arguments["due_at"] as? String + runToolAndSpeak( + callId: callId, name: name, detail: "id=\(id.prefix(8))", + emptyText: "Task updated.", + errorText: "Could not update the task right now." + ) { + try await APIClient.shared.toolUpdateActionItem( + id: id, completed: completed, description: newDescription, dueAt: dueAt + ).resultText + } case .spawnAgent: - let brief = (arguments["brief"] as? String) ?? turnTranscript + let brief = arg("brief") + let title = (arguments["title"] as? 
String)?.trimmingCharacters(in: .whitespacesAndNewlines) let model = ShortcutSettings.shared.selectedModel.isEmpty ? "claude-sonnet-4-6" : ShortcutSettings.shared.selectedModel // Non-blocking: spawn renders its own pill ("text bubble") and runs on its // own ChatProvider/AgentBridge. We don't await it on the voice loop. // fromVoice:false — the hub model speaks its own natural acknowledgment, so the pill // must NOT also speak its canned randomAck ("on it") or we double up. - let pill = AgentPillsManager.shared.spawnFromUserQuery(brief, model: model, fromVoice: false) - log("RealtimeHub[\(providerTag)]: tool spawn_agent → AgentBridge pill=\"\(pill.title)\" model=\(model)") + let pill = AgentPillsManager.shared.spawnFromUserQuery( + brief, model: model, fromVoice: false, + preFetchedTitle: (title?.isEmpty == false) ? title : nil) + log("RealtimeHub[\(providerTag)]: tool spawn_agent → AgentBridge pill=\"\(pill.title)\" model=\(model) titled=\(title?.isEmpty == false)") // Terse directive (not speakable content): the model already said its one-line ack // BEFORE calling, so it should NOT generate a slow second utterance after this. session?.sendToolResult( @@ -412,51 +548,58 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { // land here. responding = false logError("RealtimeHub: session error — \(message)") - // The reply is dead — stop any buffered audio and drop the speaking state before - // collapsing (the drain callback won't fire for a torn-down engine). + // The reply is dead — stop any buffered audio before collapsing. pcmPlayer?.stop() if speech.isSpeaking { speech.stopSpeaking(at: .immediate) } - barState?.isVoiceSpeaking = false - barState?.voiceLevel = 0 exitVoiceUI() let aliveFor = lastWarmAt.map { Date().timeIntervalSince($0) } ?? 0 teardownSession() // Re-warm so the NEXT PTT uses the hub, not the STT cascade. 
Gemini idle-closes // the socket (~2.5 min, close 1008) even before the first turn; managed users have // no BYOK key, so once `session` is nil `isActive` is false and PTT silently falls - // back to omni STT. So gate on isEnabled (NOT isActive, which needs a live session). + // back to omni STT. So always try to re-warm (the hub is the default voice path). // A socket that survived past the idle window was a normal idle-close → reset the // strike budget and keep re-warming forever; one that died fast is likely a config/ // auth failure → let the strikes cap stop the churn. if aliveFor > 60 { hubReconnectStrikes = 0 } - guard RealtimeHubSettings.shared.isEnabled, !reconnectPending, hubReconnectStrikes < 5 else { return } + guard !reconnectPending, hubReconnectStrikes < Self.maxReconnectStrikes else { return } hubReconnectStrikes += 1 reconnectPending = true DispatchQueue.main.asyncAfter(deadline: .now() + 1.5) { [weak self] in guard let self else { return } self.reconnectPending = false - if RealtimeHubSettings.shared.isEnabled, self.session == nil { self.ensureWarm() } + if self.session == nil { self.ensureWarm() } } } /// Return the floating bar from its PTT voice state to compact after a hub turn. - /// Leaves `isVoiceSpeaking` alone — the turn can finish generating while the buffered - /// reply is still playing; the player's drain callback drops speaking when it ends. The - /// window observes these flags and collapses itself once `isVoiceActive` goes false. private func exitVoiceUI() { guard let barState else { return } + // Capture before clearing: a mid-turn error or silent-tap cancel clears the + // listening flag here, so PushToTalkManager.updateBarState() (which resizes only + // on a wasListening→false transition) would see no change and leave the bar wide. 
+ let wasExpandedForVoice = barState.isVoiceListening barState.voiceTranscript = "" - barState.isVoiceThinking = false barState.isVoiceListening = false barState.isVoiceLocked = false barState.isVoiceFollowUp = false + // Collapse the bar ourselves in that case — guarded so we never shrink the bar out + // from under an open conversation, response, notification, hover, or onboarding. + guard wasExpandedForVoice, + !barState.showingAIConversation, !barState.showingAIResponse, + barState.currentNotification == nil, !barState.isHoveringBar, + UserDefaults.standard.bool(forKey: "hasCompletedOnboarding") + else { return } + FloatingControlBarManager.shared.resizeForPTT(expanded: false) } // MARK: - Tools /// ask_higher_model — reuse the EXISTING prompt-cached /v2/chat/completions /// (no new backend route). Returns the assistant text for the model to speak. - private func escalateToHigherModel(_ query: String) async -> String { + private func escalateToHigherModel(_ query: String, context: String, aboutUser: String) + async -> String + { let baseURL = await APIClient.shared.rustBackendURL guard !baseURL.isEmpty else { return "I couldn't reach the model right now." } let normalized = baseURL.hasSuffix("/") ? baseURL : baseURL + "/" @@ -472,18 +615,8 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { } catch { return "I couldn't authenticate to the model." 
} - let body: [String: Any] = [ - "model": "claude-sonnet-4-6", - "max_tokens": 1024, - "messages": [ - [ - "role": "user", - "content": - "Answer concisely for a spoken reply (a few sentences max):\n\n\(query)", - ] - ], - "stream": false, - ] + let body = RealtimeHubTools.escalationBody( + query: query, context: context, aboutUser: aboutUser) let t0 = Date() do { request.httpBody = try JSONSerialization.data(withJSONObject: body) @@ -523,17 +656,9 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { utterance.voice = AVSpeechSynthesisVoice(language: AVSpeechSynthesisVoice.currentLanguageCode()) ?? AVSpeechSynthesisVoice(language: "en-US") - barState?.isVoiceThinking = false - barState?.isVoiceSpeaking = true speech.speak(utterance) } - /// Drop the speaking state once the AVSpeech fallback stops talking. - private func finishedSpeaking() { - barState?.isVoiceSpeaking = false - barState?.voiceLevel = 0 - } - /// Local synthetic mouse click (point_click tool). @discardableResult static func click(at point: CGPoint) -> Bool { @@ -549,19 +674,3 @@ final class RealtimeHubController: NSObject, RealtimeHubSessionDelegate { return true } } - -// MARK: - AVSpeech fallback completion - -extension RealtimeHubController: AVSpeechSynthesizerDelegate { - nonisolated func speechSynthesizer( - _ synthesizer: AVSpeechSynthesizer, didFinish utterance: AVSpeechUtterance - ) { - Task { @MainActor [weak self] in self?.finishedSpeaking() } - } - - nonisolated func speechSynthesizer( - _ synthesizer: AVSpeechSynthesizer, didCancel utterance: AVSpeechUtterance - ) { - Task { @MainActor [weak self] in self?.finishedSpeaking() } - } -} diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift index 7dc991138c0..86fe07fdc0b 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift +++ 
b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSession.swift @@ -1,12 +1,12 @@ import Foundation import Network -// MARK: - Realtime Hub Session (Phase 1, CLIENT-DIRECT) +// MARK: - Realtime Hub Session // -// One persistent WebSocket to a realtime provider, opened with the user's own -// BYOK key (dev/test only — gated by RealtimeHubSettings.canConnect). The model -// is the hub: it does in-session STT + reasoning + routing (via tool calls) and -// speaks the answer. +// One persistent WebSocket to a realtime provider, opened either with the user's +// own BYOK key (client-direct, gated by RealtimeHubSettings.canConnect) or with a +// server-minted ephemeral token (managed users). The model is the hub: it does +// in-session STT + reasoning + routing (via tool calls) and speaks the answer. // // Two providers, normalized to ONE internal stream surface // (RealtimeHubSessionDelegate): @@ -62,6 +62,7 @@ enum HubAuth { final class RealtimeHubSession: NSObject { private let provider: RealtimeHubProvider private let auth: HubAuth + private let instructions: String private weak var delegate: RealtimeHubSessionDelegate? /// Mic PCM input rate per provider (Gemini 16k native, OpenAI GA needs 24k). @@ -118,9 +119,10 @@ final class RealtimeHubSession: NSObject { /// clear which model produced which event. private var tag: String { "RealtimeHub[\(provider == .openai ? 
"openai" : "gemini"):\(provider.modelID)]" } - init(provider: RealtimeHubProvider, auth: HubAuth, delegate: RealtimeHubSessionDelegate) { + init(provider: RealtimeHubProvider, auth: HubAuth, instructions: String, delegate: RealtimeHubSessionDelegate) { self.provider = provider self.auth = auth + self.instructions = instructions self.delegate = delegate super.init() } @@ -402,7 +404,7 @@ final class RealtimeHubSession: NSObject { "type": "session.update", "session": [ "type": "realtime", - "instructions": RealtimeHubTools.systemInstruction, + "instructions": instructions, "output_modalities": ["audio"], "audio": [ "input": [ @@ -431,7 +433,7 @@ final class RealtimeHubSession: NSObject { "responseModalities": ["AUDIO"], "temperature": 0.3, "mediaResolution": "MEDIA_RESOLUTION_HIGH", ], - "systemInstruction": ["parts": [["text": RealtimeHubTools.systemInstruction]]], + "systemInstruction": ["parts": [["text": instructions]]], "tools": [["functionDeclarations": RealtimeHubTools.geminiFunctionDeclarations]], "inputAudioTranscription": [:], "outputAudioTranscription": [:], diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift index 4f4b5952e44..3ae0e0ec18e 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubSettings.swift @@ -1,18 +1,20 @@ import Foundation -// MARK: - Realtime Hub (Phase 1) +// MARK: - Realtime Hub // // "Realtime-as-hub": instead of the cascade (STT → router → Claude → TTS), one // realtime model is the single hub. It does in-session STT, reasoning, routing // (as tool choice), and speaks the answer. Its tools call the EXISTING backend // endpoints / app code — no new backend routes. // -// Phase 1 is CLIENT-DIRECT + dev/test only: the realtime WS connects straight to -// the provider with the user's own BYOK key (see APIKeyService). 
It is gated so -// it never runs for managed (non-BYOK) users. Phase 2 will replace the BYOK key -// with a server-minted ephemeral token to make it shippable. +// The hub is the default voice path — there is no opt-in toggle. Every PTT turn +// routes through it whenever it can connect: BYOK users connect client-direct with +// their own key (see APIKeyService); managed users connect with a server-minted +// ephemeral token. When neither is available (no key, mint fails / not entitled) the +// turn falls back to the legacy STT cascade. The provider follows the user's "Voice +// Model" choice in Advanced settings (RealtimeOmniSettings) — no separate picker. -enum RealtimeHubProvider: String, CaseIterable, Sendable { +enum RealtimeHubProvider: String, Sendable { case openai case gemini @@ -23,13 +25,6 @@ enum RealtimeHubProvider: String, CaseIterable, Sendable { } } - var subtitle: String { - switch self { - case .openai: return "gpt-realtime-2 · native spoken audio" - case .gemini: return "gemini native-audio Live · spoken audio + tools" - } - } - /// Concrete model identifier sent to the provider. var modelID: String { switch self { @@ -58,48 +53,22 @@ enum RealtimeHubProvider: String, CaseIterable, Sendable { final class RealtimeHubSettings { static let shared = RealtimeHubSettings() - private let enabledKey = "realtimeHubEnabled" - private let providerKey = "realtimeHubProvider" - - private init() { - UserDefaults.standard.register(defaults: [ - enabledKey: false, - providerKey: RealtimeHubProvider.openai.rawValue, - ]) - } - - /// Master switch. When off, the floating bar uses the legacy STT → router → - /// Claude → TTS cascade. Ships behind this flag. 
- var isEnabled: Bool { - get { UserDefaults.standard.bool(forKey: enabledKey) } - set { - UserDefaults.standard.set(newValue, forKey: enabledKey) - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) - } - } + private init() {} + /// The hub provider follows the user's "Voice Model" choice in Advanced settings — + /// there is no separate hub picker. The two map 1:1 (same underlying models), and + /// `.auto` is already resolved to a concrete provider by `effectiveProvider`. var provider: RealtimeHubProvider { - get { - let raw = UserDefaults.standard.string(forKey: providerKey) - return raw.flatMap(RealtimeHubProvider.init(rawValue:)) ?? .openai - } - set { - UserDefaults.standard.set(newValue.rawValue, forKey: providerKey) - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) + switch RealtimeOmniSettings.shared.effectiveProvider { + case .gptRealtime2: return .openai + case .geminiFlashLive, .auto: return .gemini } } - /// The hub may only run client-direct when the user has supplied the selected - /// provider's own key (BYOK / dev key). This is the managed-user gate: managed - /// users have no BYOK key, so the hub stays off and the cascade is used. + /// True when the hub can connect client-direct with the user's own provider key + /// (BYOK / dev key). Managed users without a key connect via a minted ephemeral + /// token instead (see RealtimeHubController.ensureWarm); both reach the hub. var canConnect: Bool { APIKeyService.byokKey(provider.byokProvider) != nil } - - /// True when the hub should drive this PTT turn (enabled + a usable key). 
- var isActive: Bool { isEnabled && canConnect } -} - -extension Notification.Name { - static let realtimeHubSettingsDidChange = Notification.Name("realtimeHubSettingsDidChange") } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift index 8585d3033b3..46384dd9caf 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTestHarness.swift @@ -44,7 +44,10 @@ final class RealtimeHubTestHarness: NSObject, RealtimeHubSessionDelegate { } func run(timeoutSeconds: Double) async -> [String: String] { - let s = RealtimeHubSession(provider: provider, auth: auth, delegate: self) + let s = RealtimeHubSession( + provider: provider, auth: auth, + instructions: RealtimeHubTools.systemInstruction(aboutUser: ""), + delegate: self) session = s let rate = s.requiredInputSampleRate let audio = rate == 16000 ? pcm16k : PushToTalkManager.resamplePCM16(pcm16k, from: 16000, to: rate) @@ -118,7 +121,16 @@ final class RealtimeHubTestHarness: NSObject, RealtimeHubSessionDelegate { let stub: String switch HubTool(rawValue: name) { case .askHigherModel: stub = "Paris is the capital of France." - case .getTasks: stub = "Due today (1):\n- Example task" + case .getTasks: stub = "Due today (1):\n- Example task [id:task_123]" + case .getMemories: stub = "You live in San Francisco and prefer concise answers." + case .searchMemories: stub = "Your dog's name is Rex." + case .searchConversations: stub = "On Monday you discussed the launch timeline." + case .getConversations: stub = "Most recent: today, 'Standup notes'. Before that: yesterday, 'Design review'." + case .getActionItems: stub = "Open: Buy milk (due tomorrow). Completed: Ship the PR." + case .getDailyRecap: stub = "Yesterday: 3 hrs in Xcode, 1 hr in Safari; 2 conversations; 1 task created." 
+ case .searchScreenHistory: stub = "Found it: yesterday afternoon you were reading the launch doc in Safari." + case .createActionItem: stub = "Created task: Example task." + case .updateActionItem: stub = "Updated the task." case .spawnAgent: stub = "Started a background agent." case .screenshot: stub = "Screen captured." case .pointClick: stub = "Clicked." diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift index dc526d867cf..98850b0be19 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/RealtimeHubTools.swift @@ -3,9 +3,12 @@ import Foundation // MARK: - Realtime Hub tool surface // // The realtime model IS the router: instead of a separate Haiku classify() call, -// the model decides what to do by choosing a tool. The same four tools are +// the model decides what to do by choosing a tool. The same tool surface is // declared to both providers (OpenAI Realtime `tools`, Gemini `functionDeclarations`); // `RealtimeHubController` executes them by calling EXISTING app code / endpoints. +// Reads (get_tasks, get_memories, search_memories, search_conversations) and simple +// writes (create_action_item, update_action_item) run synchronously and speak their +// result; multi-step / other-app work still goes to spawn_agent. enum HubTool: String { /// Escalate a hard / knowledge-heavy question to the smarter Claude model via @@ -15,8 +18,34 @@ enum HubTool: String { /// Non-blocking: the model acknowledges and moves on. case spawnAgent = "spawn_agent" /// Read the user's tasks locally (TasksStore) and return them inline to speak — a - /// fast synchronous READ, NOT a background agent. + /// fast synchronous READ, NOT a background agent. Overdue + due-today only. 
case getTasks = "get_tasks" + /// Read the user's full action-item list from the backend with filters (completed, + /// due-date range). Fast READ — use for completed tasks, date ranges, or the whole list + /// (get_tasks only covers overdue + due-today). + case getActionItems = "get_action_items" + /// Read what Omi knows about the user (memories / facts) and return it inline to speak. + /// Fast synchronous READ — the answer to "who am I" / "what do you know about me". + case getMemories = "get_memories" + /// Semantically search the user's memories / facts for something specific. Fast READ. + case searchMemories = "search_memories" + /// Semantically search the user's past conversations (titles + summaries, no transcripts). + /// Fast synchronous READ. + case searchConversations = "search_conversations" + /// List the user's MOST RECENT conversations, newest first (titles + summaries, no + /// transcripts). Fast READ — the answer to "most recent / latest / last conversation". + case getConversations = "get_conversations" + /// Formatted recap of what the user actually DID on their Mac — apps used (with minutes), + /// conversations, tasks, focus, screen activity. Fast LOCAL READ — the answer to "what did I + /// do yesterday / today", "which apps did I use the most", "how did I spend my time". + case getDailyRecap = "get_daily_recap" + /// Semantically search the user's on-screen history (what they saw / read / worked on). + /// Fast LOCAL READ — "when was I looking at X", "find where I read about Y". + case searchScreenHistory = "search_screen_history" + /// Create a new task / to-do / reminder for the user. Fast synchronous WRITE. + case createActionItem = "create_action_item" + /// Update an existing task (mark done, change text/due). Needs the task id from get_tasks. + case updateActionItem = "update_action_item" /// Capture the user's screen so the model can see what they're looking at. 
case screenshot = "screenshot" /// Click at on-screen coordinates (local). @@ -25,59 +54,109 @@ enum HubTool: String { enum RealtimeHubTools { - static let systemInstruction = """ + static func systemInstruction(aboutUser: String) -> String { + """ You are Omi, a fast spoken-voice assistant on the user's Mac and the single hub \ for their voice requests. You hear the user's microphone; reply by speaking, \ conversationally. Default to one or two sentences, but when the user asks for \ something longer or creative (a story, a detailed explanation, brainstorming), \ give the full answer yourself — don't shorten it and don't offload it. \ - Always reply in English. + Reply in the same language the user is speaking. + + \(aboutUser) - IMPORTANT: You have NO direct access to the user's personal data or their apps. \ - You cannot see their tasks, to-dos, calendar, notes, emails, messages, past \ - conversations, memories, files, or reminders on your own. The spawn_agent tool \ - CAN — it hands the request to a background agent that has all of those tools and \ - can act in the user's apps and browser. + IMPORTANT: You CAN read the user's Omi data directly with fast tools — their tasks \ + (get_tasks), what Omi knows about them / their memories & facts (get_memories, \ + search_memories), their past conversations (search_conversations), what they DID on \ + their Mac (get_daily_recap), and their on-screen history (search_screen_history) — and \ + you can make simple task changes (create_action_item, update_action_item). For anything in \ + their OTHER apps (calendar, notes, emails, messages, files, reminders, browser) or any \ + multi-step "do X for me" work, use spawn_agent — it hands the request to a background \ + agent that has those tools and can act in the user's apps. 
- Using tools: the moment a request needs a tool, briefly acknowledge it OUT LOUD in your \ - own natural, varied words (keep it short, and don't include any answer or data you don't \ - have yet), then immediately call the tool. For a data tool (get_tasks, ask_higher_model), \ - speak its result after it returns. NEVER put an answer — real or guessed — in that \ - acknowledgment, NEVER skip the tool call, and never read tool JSON aloud. You cannot see \ - tasks, data, or the screen without calling a tool. + Using tools: when a request needs a tool, ALWAYS give a short spoken heads-up first so the \ + user knows you're on it and that it won't be instant — then call the tool and speak the \ + result when it returns. Never go silent during a tool call; the user can't see what you're \ + doing, so a quiet gap feels broken. The catch is variety: that heads-up must be SPECIFIC to \ + what they actually asked and DIFFERENT every time. Name the real thing you're fetching — \ + "Pulling up yesterday's activity…", "Scanning your task list…", "Digging through your notes \ + on the launch…", "Checking your memories for that…", "Getting the latest on that, one \ + sec…". The thing to avoid is repetition: do NOT reach for the same generic opener ("let me \ + check", "let me look that up") turn after turn — it's what makes you sound robotic. Keep it \ + to a few words, vary the wording each turn, and don't include any answer or data you don't \ + have yet. For a slower step (ask_higher_model, spawn_agent) it's fine to signal it'll take a \ + moment. NEVER speak an answer — real or guessed — before the tool returns, NEVER skip the \ + tool call, and never read tool JSON or ids aloud. You cannot see the user's data or screen \ + without calling a tool. 
Decide what to do with each request: + - WHO the user is, what you ALREADY KNOW about them, and the ROUGH shape of their day \ + ("who am I", "what do you know about me", "am I busy today", "much on my plate"): answer \ + DIRECTLY from above — do NOT call a tool and do NOT say "let me check". Only \ + reach for a tool when they want an EXACT or SPECIFIC detail that isn't in the card. - The user's TASKS / to-dos / what's due — a READ ("what are my tasks", "what's due \ today", "what's on my list", "do I have anything today"): you MUST call get_tasks and \ - speak ONLY what it returns. You CANNOT see their tasks any other way — never guess, \ - summarize from memory, or make up tasks. Always call get_tasks; do NOT use an agent. - - DOING something for the user, or their OTHER personal data (calendar, notes, emails, \ - messages, conversations, memories, files, reminders) — create/send/open/edit/search/ \ - schedule/automate/"do X for me"/any multi-step work: you CANNOT do these yourself. You \ - MUST actually EMIT the spawn_agent function call (with a clear, self-contained `brief`). \ - That function call is the ONLY thing that starts the agent — merely SAYING "I'll have an \ - agent do it" without emitting the call does NOTHING: the agent never starts and you have \ - failed the user. So always emit the spawn_agent call. You may add one short natural \ - sentence as you call it, but never instead of it. Do NOT ask clarifying questions before \ - spawning — spawn with what you have. Do NOT wait for it, narrate its steps, refuse, or \ - claim you can't. + speak ONLY what it returns (the card's counts are a rough snapshot, not the list). Never \ + guess or make up tasks. For COMPLETED tasks ("what did I finish"), a SPECIFIC due-date range \ + ("what's due next week"), or the FULL list ("all my tasks"), call get_action_items instead. 
+ - A SPECIFIC fact about the user that isn't already in the card ("what's my dog's name", \ + "where do I work"): call search_memories with a focused query. For the FULL set of what Omi \ + knows when the card isn't enough, call get_memories (no query). NEVER answer "I don't know" \ + or guess about the user without checking first. + - The user's MOST RECENT / latest / last conversation ("what was my most recent \ + conversation", "what did we just talk about", "my recent conversations"): call \ + get_conversations (newest first) — NOT search_conversations, which is semantic and does \ + NOT sort by time. Speak the latest one. + - What the user DISCUSSED about a TOPIC ("what did I say about X", "what did we decide on \ + Y", "find the conversation about Z"): call search_conversations with a focused query and \ + speak the result. + - The user's own ACTIVITY / what they DID / how they spent their time ("what did I do \ + yesterday", "what did I do today", "which apps did I use the most", "how did I spend my \ + morning", "summarize my day"): you MUST call get_daily_recap (days_ago: 0 = today, 1 = \ + yesterday) and speak a SHORT spoken summary of the highlights it returns — top apps, key \ + conversations, tasks. Do NOT use search_conversations or spawn_agent for this, and never \ + guess; this is exactly what get_daily_recap is for. + - What the user SAW / read / worked on ON SCREEN ("when was I looking at X", "find where I \ + read about Y", "what was I doing in app Z"): call search_screen_history with a focused \ + query and speak the result. + - ADVICE about the user's OWN productivity / workflow / habits / focus ("how can I improve \ + my workflow", "how can I be more productive", "what should I change", "how am I doing", \ + "where am I wasting time"): do NOT answer generically.
FIRST call get_daily_recap (days_ago: \ + 0 for today, 7 for the week) — and get_action_items when tasks matter — then base EVERY \ + suggestion on what they ACTUALLY did: their apps, distracted vs focused sessions, and \ + overdue / duplicate tasks. Generic advice with no tool call is a failure here. + - ADD a task / to-do / reminder ("remind me to…", "add … to my list", "I need to…"): \ + call create_action_item with a clear `description` (and `due_at` if a time was given), \ + then confirm out loud. CHANGE an existing task (mark done, edit, reschedule): first \ + call get_tasks to get the matching task's id, then call update_action_item with that id. + - DOING something for the user in their OTHER apps (calendar, notes, emails, messages, \ + files, browser) or any multi-step work — create/send/open/edit/search/schedule/automate/ \ + "do X for me": you CANNOT do these yourself. You MUST actually EMIT the spawn_agent \ + function call (with a clear, self-contained `brief` and a short `title`). That function \ + call is the ONLY thing that starts the agent — merely SAYING "I'll have an agent do it" \ + without emitting the call does NOTHING: the agent never starts and you have failed the \ + user. So always emit the spawn_agent call. You may add one short natural sentence as you \ + call it, but never instead of it. Do NOT ask clarifying questions before spawning — spawn \ + with what you have. Do NOT wait for it, narrate its steps, refuse, or claim you can't. - Everything else — general questions, facts, chit-chat, explanations, advice, jokes, \ and creative or long-form requests (stories, brainstorming, drafts): ANSWER YOURSELF. \ You are fully capable; do it directly, even when the ask is long or open-ended. Do \ NOT escalate just because a request seems long or hard.
- - Call ask_higher_model in ONLY two cases: (1) the user is unhappy with your previous \ - answer — they push back, rephrase, say you're wrong, or ask for a better/deeper/more \ - thorough answer; or (2) you genuinely need precise, up-to-date facts (current events, \ - specific numbers) you don't reliably know. Pass a clear `query`, then speak the result. + - Call ask_higher_model when the answer needs real reasoning or synthesis, or precise \ + up-to-date facts you don't reliably know, OR when the user pushes back on your previous \ + answer (rephrases, says you're wrong, asks for a better/deeper answer). Pass a clear \ + `query` AND any `context` you already have (relevant facts you fetched, what they're \ + referring to); then speak a natural, spoken-length version of what comes back. - When you need to see what's on screen, call screenshot first. Use point_click only \ when the user clearly asks you to click something. Keep latency low: prefer answering directly when you can. """ + } - /// OpenAI Realtime GA `session.tools` entries. - static var openAITools: [[String: Any]] { - [ + /// OpenAI Realtime GA `session.tools` entries. Static `let` — built once, not rebuilt on + /// every session (re)connect that reads it. + static let openAITools: [[String: Any]] = [ [ "type": "function", "name": HubTool.askHigherModel.rawValue, @@ -89,7 +168,14 @@ enum RealtimeHubTools { "parameters": [ "type": "object", "properties": [ - "query": ["type": "string", "description": "The full question to escalate."] + "query": ["type": "string", "description": "The full question to escalate."], + "context": [ + "type": "string", + "description": + "Relevant context you already have that helps answer well — facts you fetched, " + + "what the user is referring to, or the previous answer they pushed back on. " + + "Include only what's relevant; omit if there's nothing useful.", + ], ], "required": ["query"], ], @@ -103,6 +189,152 @@ enum RealtimeHubTools { + "my list'. 
Do NOT use spawn_agent for reading tasks.", "parameters": ["type": "object", "properties": [:]], ], + [ + "type": "function", + "name": HubTool.getMemories.rawValue, + "description": + "Read what Omi knows about the user — their memories and facts (preferences, " + + "background, people, habits). Fast synchronous read with NO query. Use this for " + + "'who am I', 'what do you know about me', 'what are my preferences'. Speak what it returns.", + "parameters": ["type": "object", "properties": [:]], + ], + [ + "type": "function", + "name": HubTool.searchMemories.rawValue, + "description": + "Search the user's memories / facts for a SPECIFIC thing ('what's my dog's name', " + + "'where do I work', 'what's my partner's name'). Fast synchronous read. Speak the result.", + "parameters": [ + "type": "object", + "properties": [ + "query": ["type": "string", "description": "What to look up about the user."] + ], + "required": ["query"], + ], + ], + [ + "type": "function", + "name": HubTool.searchConversations.rawValue, + "description": + "Search the user's past conversations for what they discussed ('what did I say about X', " + + "'what did we decide', 'summarize my last meeting'). Returns titles + summaries only " + + "(no full transcripts). Fast synchronous read. Speak the result.", + "parameters": [ + "type": "object", + "properties": [ + "query": ["type": "string", "description": "What topic / conversation to find."] + ], + "required": ["query"], + ], + ], + [ + "type": "function", + "name": HubTool.getConversations.rawValue, + "description": + "List the user's MOST RECENT conversations, newest first (titles + summaries, no full " + + "transcripts). Use this — NOT search_conversations — for 'what was my most recent / " + + "latest / last conversation', 'what did we just talk about', or 'my recent conversations'. " + + "search_conversations is semantic and does NOT order by time, so it's wrong for 'recent'. " + + "Fast synchronous read. 
Speak the result.", + "parameters": ["type": "object", "properties": [:]], + ], + [ + "type": "function", + "name": HubTool.getDailyRecap.rawValue, + "description": + "Get a recap of what the user actually DID on their Mac — apps used (with minutes), " + + "conversations, tasks, focus sessions, and screen activity — for a day. THIS is the tool " + + "for 'what did I do yesterday', 'what did I do today', 'which apps did I use the most', " + + "'how did I spend my time'. Do NOT use search_conversations or spawn_agent for these. " + + "Fast synchronous read — speak a short summary of what it returns.", + "parameters": [ + "type": "object", + "properties": [ + "days_ago": [ + "type": "number", + "description": "0 = today, 1 = yesterday (default), 7 = the past week.", + ] + ], + ], + ], + [ + "type": "function", + "name": HubTool.searchScreenHistory.rawValue, + "description": + "Search the user's on-screen history — what they saw, read, or worked on — by meaning. " + + "Use for 'when was I looking at X', 'find where I read about Y', 'what was I doing in " + + "app Z'. Returns matching moments with the app and context. Fast synchronous read. " + + "Speak the result.", + "parameters": [ + "type": "object", + "properties": [ + "query": [ + "type": "string", "description": "What the user was looking at / reading / doing.", + ], + "days": ["type": "number", "description": "How many days back to search; default 7."], + ], + "required": ["query"], + ], + ], + [ + "type": "function", + "name": HubTool.getActionItems.rawValue, + "description": + "Read the user's tasks / to-dos from the backend, with optional filters. Use for " + + "COMPLETED tasks ('what did I finish'), a DATE RANGE ('what's due next week'), or the " + + "FULL list ('all my tasks') — for plain 'what's due today / overdue', prefer get_tasks. " + + "Fast synchronous read. 
Speak a short summary of what it returns.", + "parameters": [ + "type": "object", + "properties": [ + "completed": [ + "type": "boolean", + "description": "true = only done tasks, false = only open tasks. Omit for both.", + ], + "due_start_date": [ + "type": "string", "description": "Optional ISO-8601 start of the due-date range.", + ], + "due_end_date": [ + "type": "string", "description": "Optional ISO-8601 end of the due-date range.", + ], + ], + ], + ], + [ + "type": "function", + "name": HubTool.createActionItem.rawValue, + "description": + "Create a new task / to-do / reminder for the user ('remind me to…', 'add … to my " + + "list', 'I need to…'). Fast synchronous write. Confirm out loud after it returns.", + "parameters": [ + "type": "object", + "properties": [ + "description": ["type": "string", "description": "The task text."], + "due_at": [ + "type": "string", + "description": "Optional ISO-8601 due date/time, only if the user gave one.", + ], + ], + "required": ["description"], + ], + ], + [ + "type": "function", + "name": HubTool.updateActionItem.rawValue, + "description": + "Update an existing task: mark it done, edit its text, or reschedule it. You MUST first " + + "call get_tasks to get the matching task's id, then pass that id here. 
Fast synchronous write.", + "parameters": [ + "type": "object", + "properties": [ + "id": ["type": "string", "description": "The task id from get_tasks."], + "completed": ["type": "boolean", "description": "Set true to mark the task done."], + "description": ["type": "string", "description": "New task text, if changing it."], + "due_at": ["type": "string", "description": "New ISO-8601 due date/time, if rescheduling."], + ], + "required": ["id"], + ], + ], [ "type": "function", "name": HubTool.spawnAgent.rawValue, @@ -116,7 +348,13 @@ enum RealtimeHubTools { "properties": [ "brief": [ "type": "string", "description": "A clear, self-contained brief of the task.", - ] + ], + "title": [ + "type": "string", + "description": + "A short Title Case label for the task pill (≤ ~5 words, no trailing " + + "punctuation), e.g. 'Draft Launch Email'.", + ], ], "required": ["brief"], ], @@ -140,12 +378,11 @@ enum RealtimeHubTools { "required": ["x", "y"], ], ], - ] - } + ] - /// Gemini Live `setup.tools[0].functionDeclarations` entries (same surface). - static var geminiFunctionDeclarations: [[String: Any]] { - openAITools.map { tool in + /// Gemini Live `setup.tools[0].functionDeclarations` entries (same surface). Derived once + /// from `openAITools`. + static let geminiFunctionDeclarations: [[String: Any]] = openAITools.map { tool in // Gemini wants {name, description, parameters} without the OpenAI "type" wrapper. var decl: [String: Any] = [ "name": tool["name"] as? String ?? "", @@ -159,7 +396,6 @@ enum RealtimeHubTools { } return decl } - } /// Recursively uppercase every `type` value in a JSON-schema dict so it matches Gemini's /// Schema enum (object → OBJECT, string → STRING, …). @@ -176,4 +412,33 @@ enum RealtimeHubTools { if let items = schema["items"] as? [String: Any] { out["items"] = upcasedSchemaTypes(items) } return out } + + /// System prompt for an escalated (ask_higher_model) answer. 
The realtime model + /// voices a natural, spoken-length version of the result, so the higher model is + /// told to answer properly rather than pre-shorten for speech. + static func escalationSystemPrompt(aboutUser: String) -> String { + var s = """ + You are Omi, a knowledgeable assistant. Answer the user's question accurately and \ + usefully. A voice assistant will relay your answer aloud and adapt the phrasing for \ + speech, so be clear and well-structured; you don't need to pre-shorten it. + """ + if !aboutUser.isEmpty { s += "\n\n" + aboutUser } + return s + } + + static func escalationBody(query: String, context: String, aboutUser: String) -> [String: Any] { + let trimmedContext = context.trimmingCharacters(in: .whitespacesAndNewlines) + let userContent = + trimmedContext.isEmpty ? query : query + "\n\nContext I already have:\n" + trimmedContext + let messages: [[String: String]] = [ + ["role": "system", "content": escalationSystemPrompt(aboutUser: aboutUser)], + ["role": "user", "content": userContent], + ] + return [ + "model": "claude-sonnet-4-6", + "max_tokens": 1024, + "messages": messages, + "stream": false, + ] + } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift index 28f4ab4d60e..0836dea0aa2 100644 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift +++ b/desktop/macos/Desktop/Sources/FloatingControlBar/StreamingPCMPlayer.swift @@ -14,27 +14,6 @@ final class StreamingPCMPlayer { private let format: AVAudioFormat private var configObserver: NSObjectProtocol? - /// Smoothed 0…1 output amplitude, delivered on the main thread (~40×/s) while the - /// engine runs. Driven by a tap on the mixer so it tracks what's *actually audible*, - /// not what's been buffered ahead. Used to make the speaking waveform audio-reactive. - var onLevel: ((Float) -> Void)? 
- /// Fires on the main thread when playback starts (false→true) and when the queue - /// fully drains (true→false). Lets the caller mark "speaking" precisely — including - /// the silent tail after the last chunk arrives but before it finishes playing. - var onPlayingChanged: ((Bool) -> Void)? - - /// Outstanding scheduled buffers (incremented on enqueue, decremented when each - /// finishes). Guarded by `bufferLock` because completion handlers run off-main. - private var pendingBuffers = 0 - private let bufferLock = NSLock() - private var isPlayingState = false - // Exponential moving average of the output RMS (smoothed so the waveform never jitters). - private var smoothedLevel: Float = 0 - // Last value handed to `onLevel`, so we skip main-thread hops while the level is flat - // (e.g. the silent tail of a reply) instead of publishing the same number ~40×/s. - private var lastDispatchedLevel: Float = -1 - private var levelTapInstalled = false - init(sampleRate: Double = 24000) { // Float32 mono at the source rate; the mixer resamples to the device rate. format = AVAudioFormat( @@ -55,8 +34,6 @@ final class StreamingPCMPlayer { log("StreamingPCMPlayer: audio config changed — rebuilding engine") self.player.stop() self.engine.stop() - // The rebuilt graph loses the old tap; let ensureRunning() reinstall it. - self.removeLevelTap() self.engine.disconnectNodeOutput(self.player) self.engine.connect(self.player, to: self.engine.mainMixerNode, format: self.format) self.ensureRunning() @@ -69,42 +46,6 @@ final class StreamingPCMPlayer { } } - /// Tap the mixer output once the engine is live so `onLevel` reflects the audio the - /// user actually hears. Cheap: one RMS pass per ~1024-frame buffer, EMA-smoothed. 
- private func installLevelTapIfNeeded() { - guard !levelTapInstalled, engine.isRunning else { return } - levelTapInstalled = true - engine.mainMixerNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { - [weak self] buffer, _ in - guard let self, self.onLevel != nil, let data = buffer.floatChannelData else { return } - let frames = Int(buffer.frameLength) - guard frames > 0 else { return } - let samples = data[0] - var sumSquares: Float = 0 - for i in 0.. self.smoothedLevel ? 0.35 : 0.12 - self.smoothedLevel += (target - self.smoothedLevel) * alpha - let out = self.smoothedLevel - // Only hop to main when the level actually moved — flat/silent stretches stay quiet. - guard abs(out - self.lastDispatchedLevel) > 0.01 else { return } - self.lastDispatchedLevel = out - DispatchQueue.main.async { self.onLevel?(out) } - } - } - - /// Detach the level tap (call when playback stops; reinstalled on the next play). - private func removeLevelTap() { - guard levelTapInstalled else { return } - engine.mainMixerNode.removeTap(onBus: 0) - levelTapInstalled = false - smoothedLevel = 0 - lastDispatchedLevel = -1 - } - /// Ensure the engine + player are actually running before scheduling. Checking /// the real `isRunning`/`isPlaying` state (not a one-shot flag) is what makes /// playback survive past the first turn: AVAudioEngine auto-suspends when idle @@ -125,19 +66,6 @@ final class StreamingPCMPlayer { if !player.isPlaying { player.play() } - installLevelTapIfNeeded() - } - - /// Adjust the outstanding-buffer count and emit `onPlayingChanged` on the edges. 
- private func adjustPending(by delta: Int) { - bufferLock.lock() - pendingBuffers = max(0, pendingBuffers + delta) - let nowPlaying = pendingBuffers > 0 - let changed = nowPlaying != isPlayingState - if changed { isPlayingState = nowPlaying } - bufferLock.unlock() - guard changed else { return } - DispatchQueue.main.async { [weak self] in self?.onPlayingChanged?(nowPlaying) } } /// `data` = little-endian Int16 PCM, mono, at the configured sample rate. @@ -156,22 +84,11 @@ final class StreamingPCMPlayer { channel[i] = max(-1.0, min(1.0, Float(src[i]) / 32768.0)) } } - adjustPending(by: 1) - player.scheduleBuffer(buffer, completionHandler: { [weak self] in self?.adjustPending(by: -1) }) + player.scheduleBuffer(buffer) } func stop() { - removeLevelTap() // no playback → no reason to keep tapping (reinstalled on next play) player.stop() engine.stop() - bufferLock.lock() - pendingBuffers = 0 - let wasPlaying = isPlayingState - isPlayingState = false - bufferLock.unlock() - smoothedLevel = 0 - if wasPlaying { - DispatchQueue.main.async { [weak self] in self?.onPlayingChanged?(false) } - } } } diff --git a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift b/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift deleted file mode 100644 index 46f0eb99883..00000000000 --- a/desktop/macos/Desktop/Sources/FloatingControlBar/VoiceActivityIndicator.swift +++ /dev/null @@ -1,199 +0,0 @@ -import SwiftUI - -/// The floating bar's single status element. 
One coherent shape that changes its -/// motion law, color, and energy per state — never a hard icon swap — so the user -/// always knows, at a glance and without labels, whether the assistant is: -/// -/// • idle — a calm, barely-breathing sliver (nearly still, muted) -/// • listening — a red waveform reacting to "you" (red is reserved for recording) -/// • thinking — a cool blue→violet gradient sweeping on its own fixed clock; the -/// self-driven motion (no audio) reads as "working, wait" — critical -/// so a late reply never looks like "done / idle" -/// • speaking — a green waveform driven by the model's actual output amplitude -/// ("it's talking", clearly distinct from the red "you" waveform) -/// -/// Performance: idle (the long-lived resting state) uses a single Core Animation -/// property animation — no per-frame redraw. The active states use one -/// `TimelineView(.animation)` + `Canvas` (a single GPU-friendly draw pass, no -/// view-graph diffing per frame), and only run for the few seconds a turn is live. -/// No blur/shadow/material (those force offscreen passes) — glow is faked with -/// translucent gradient fills. -struct VoiceActivityIndicator: View { - let activity: VoiceActivity - /// Smoothed 0…1 amplitude of the model's spoken reply (drives the speaking waveform). - var level: CGFloat = 0 - - var body: some View { - ZStack { - switch activity { - case .idle: - IdleBreath() - case .listening: - WaveformBars(palette: .listening, level: 0, reactive: false) - case .thinking: - ThinkingSweep() - case .speaking: - WaveformBars(palette: .speaking, level: level, reactive: true) - } - } - // Cross-fade + gentle scale between states so energy "ramps" rather than snaps. - .transition(.opacity.combined(with: .scale(scale: 0.7))) - .animation(.spring(response: 0.4, dampingFraction: 0.86), value: activity) - } -} - -// MARK: - Idle - -/// A short muted capsule that breathes very slowly. Intentionally low-energy so the -/// resting bar never pulls the eye. 
Pure Core Animation — no redraw loop. -private struct IdleBreath: View { - @State private var breathing = false - - var body: some View { - Capsule() - .fill(Color.white.opacity(breathing ? 0.55 : 0.26)) - .frame(width: 26, height: 5) - .scaleEffect(x: 1, y: breathing ? 1.0 : 0.7, anchor: .center) - .onAppear { - withAnimation(.easeInOut(duration: 2.8).repeatForever(autoreverses: true)) { - breathing = true - } - } - } -} - -// MARK: - Thinking - -/// A cool blue→violet gradient that pans continuously across a capsule at a fixed, -/// self-driven rate. The autonomous (non-audio) motion is the cue that the model is -/// working — so a slow reply reads as "wait", never as "done". -private struct ThinkingSweep: View { - // Hoisted: the colors are state-constant, so only the gradient positions change - // per frame — no point rebuilding these Gradient values 60–120×/s. - private static let sweepGradient = Gradient(colors: [ - Color(red: 0.70, green: 0.49, blue: 1.0), // violet - Color(red: 0.43, green: 0.55, blue: 1.0), // blue - Color(red: 0.70, green: 0.49, blue: 1.0), - Color(red: 0.43, green: 0.55, blue: 1.0), - Color(red: 0.70, green: 0.49, blue: 1.0), - ]) - private static let glowGradient = Gradient(colors: [ - Color.white.opacity(0.45), Color.white.opacity(0), - ]) - - var body: some View { - TimelineView(.animation) { timeline in - Canvas { context, size in - let t = timeline.date.timeIntervalSinceReferenceDate - let rect = CGRect(origin: .zero, size: size) - let capsule = Capsule().path(in: rect) - - // Dim base track so the capsule reads even at the low point of the sweep. - context.fill(capsule, with: .color(.white.opacity(0.10))) - - context.drawLayer { layer in - layer.clip(to: capsule) - - // Pan a symmetric violet→blue→violet gradient horizontally. Symmetric - // stops + a span twice the width mean the loop has no visible seam. 
- let period = 2.2 // seconds per full pan - let phase = (t.truncatingRemainder(dividingBy: period)) / period - let span = size.width * 2 - let shift = CGFloat(phase) * span - layer.fill( - Rectangle().path(in: rect), - with: .linearGradient( - Self.sweepGradient, - startPoint: CGPoint(x: -span + shift, y: 0), - endPoint: CGPoint(x: shift, y: 0))) - - // Soft moving highlight (faked glow) gliding with an eased ping-pong - // so it slows at the ends instead of snapping back. - let eased = 0.5 - 0.5 * cos(phase * 2 * .pi) - let cx = size.width * CGFloat(eased) - let glowR = max(size.height, size.width * 0.32) - layer.fill( - Rectangle().path(in: rect), - with: .radialGradient( - Self.glowGradient, - center: CGPoint(x: cx, y: size.height / 2), - startRadius: 0, endRadius: glowR)) - } - } - } - .frame(width: 34, height: 8) - } -} - -// MARK: - Waveform (listening + speaking) - -/// Color treatment for a waveform state — a precomputed top→bottom gradient (constant -/// per state, so it's built once here, not per-bar per-frame inside the Canvas). -private struct WaveformPalette { - let gradient: Gradient - - /// Red — reserved exclusively for recording the user ("you"). - static let listening = WaveformPalette(gradient: Gradient(colors: [ - Color(red: 1.0, green: 0.42, blue: 0.42), - Color(red: 1.0, green: 0.18, blue: 0.33), - ])) - - /// Green/mint — the assistant speaking ("it"); clearly not the red "you" or blue "thinking". - static let speaking = WaveformPalette(gradient: Gradient(colors: [ - Color(red: 0.46, green: 0.93, blue: 0.74), - Color(red: 0.20, green: 0.83, blue: 0.60), - ])) -} - -/// A small centered equalizer. `reactive` bars track the live `level` (speaking); -/// non-reactive bars animate on a lively synthetic clock (listening). A per-bar phase -/// + center weighting gives an organic "voice blob" rather than a marching pattern. 
-private struct WaveformBars: View { - let palette: WaveformPalette - var level: CGFloat - var reactive: Bool - - private let barCount = 5 - - var body: some View { - TimelineView(.animation) { timeline in - Canvas { context, size in - let t = timeline.date.timeIntervalSinceReferenceDate - // Equal bars and gaps: n bars, n-1 gaps, all one unit wide. - let unit = size.width / CGFloat(barCount * 2 - 1) - let radius = unit / 2 - let minH = size.height * 0.28 - - for i in 0.. visible bounce/overshoot (ζ ≈ 0.35). + private let stiffness: Double = 200 + private let damping: Double = 10 + + init(barCount: Int) { + self.barCount = barCount + values = Array(repeating: 0, count: barCount) + velocities = Array(repeating: 0, count: barCount) + phases = (0.. a friendly arch. + let mid = Double(barCount - 1) / 2 + weights = (0.. 0.04 ? min(1.0, lvl / envelope) : 0.0 + let gained = pow(norm, 0.75) + + for i in 0.. bouncy overshoot. + let x = Double(values[i]) + let accel = stiffness * (target - x) - damping * velocities[i] + velocities[i] += accel * dt + let nx = x + velocities[i] * dt + values[i] = CGFloat(max(0.0, min(1.0, nx))) + } + } +} diff --git a/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift b/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift index fdb8bc462e7..79fa7458530 100644 --- a/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift +++ b/desktop/macos/Desktop/Sources/MainWindow/Pages/SettingsPage.swift @@ -271,12 +271,7 @@ struct SettingsContentView: View { // AI Chat settings @AppStorage("chatBridgeMode") private var chatBridgeMode: String = "piMono" - @AppStorage("realtimeOmniProvider") private var realtimeOmniProvider: String = RealtimeOmniProvider.auto.rawValue - // Realtime-as-hub (Phase 1, dev/BYOK only): the realtime model is the single - // tool-dispatching voice hub. Provider toggle persisted here; RealtimeHubSession - // reads it at connect. 
- @AppStorage("realtimeHubEnabled") private var realtimeHubEnabled = false - @AppStorage("realtimeHubProvider") private var realtimeHubProvider: String = RealtimeHubProvider.openai.rawValue + @AppStorage("realtimeOmniProvider") private var realtimeOmniProvider: String = RealtimeOmniProvider.gptRealtime2.rawValue @AppStorage("askModeEnabled") private var askModeEnabled = false @AppStorage("claudeMdEnabled") private var claudeMdEnabled = true @AppStorage("projectClaudeMdEnabled") private var projectClaudeMdEnabled = true @@ -2534,75 +2529,6 @@ struct SettingsContentView: View { voiceSpeedSlider(settingId: "floatingbar.voicespeed") .opacity(shortcutSettings.hasAnyFloatingBarVoiceAnswersEnabled ? 1 : 0.55) .disabled(!shortcutSettings.hasAnyFloatingBarVoiceAnswersEnabled) - - realtimeHubCard - realtimeHubProviderCard - .opacity(realtimeHubEnabled ? 1 : 0.55) - .disabled(!realtimeHubEnabled) - } - } - - // MARK: Realtime-as-hub (Phase 1, dev/BYOK only) - - /// The realtime model becomes the single voice hub: in-session STT + reasoning - /// + tool-choice routing + spoken reply, bypassing the STT→Haiku router→Claude - /// cascade. Client-direct using the user's own BYOK key (dev/test only). - private var realtimeHubCard: some View { - settingsCard(settingId: "floatingbar.realtimehub") { - HStack(spacing: 16) { - VStack(alignment: .leading, spacing: 4) { - Text("Realtime Voice Hub (experimental)") - .scaledFont(size: 16, weight: .semibold) - .foregroundColor(OmiColors.textPrimary) - Text( - "Let the realtime model run the whole voice turn — listen, decide, and speak — " - + "instead of the slower transcribe→route→answer pipeline. Uses your own provider key." 
- ) - .scaledFont(size: 13) - .foregroundColor(OmiColors.textSecondary) - } - Spacer() - Toggle("", isOn: $realtimeHubEnabled) - .toggleStyle(.switch) - .tint(OmiColors.purplePrimary) - .onChange(of: realtimeHubEnabled) { _ in - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) - } - } - } - } - - private var realtimeHubProviderCard: some View { - let provider = RealtimeHubProvider(rawValue: realtimeHubProvider) ?? .openai - let hasKey = APIKeyService.byokKey(provider.byokProvider) != nil - return settingsCard(settingId: "floatingbar.realtimehub.provider") { - HStack(spacing: 16) { - VStack(alignment: .leading, spacing: 4) { - Text("Hub Provider") - .scaledFont(size: 16, weight: .semibold) - .foregroundColor(OmiColors.textPrimary) - Text( - hasKey - ? provider.subtitle - : "⚠️ No \(provider.byokProvider.displayName) key set — add one in Developer settings to use this provider." - ) - .scaledFont(size: 13) - .foregroundColor(hasKey ? OmiColors.textSecondary : OmiColors.purplePrimary) - } - Spacer() - Picker("", selection: $realtimeHubProvider) { - ForEach(RealtimeHubProvider.allCases, id: \.rawValue) { p in - Text(p.displayName).tag(p.rawValue) - } - } - .pickerStyle(.menu) - .labelsHidden() - .frame(width: 180) - .tint(OmiColors.purplePrimary) - .onChange(of: realtimeHubProvider) { _ in - NotificationCenter.default.post(name: .realtimeHubSettingsDidChange, object: nil) - } - } } } @@ -3494,6 +3420,10 @@ struct SettingsContentView: View { if newValue == RealtimeOmniProvider.auto.rawValue { AutoModelSelector.shared.refreshIfStale() } + // The picker writes @AppStorage directly (bypassing the RealtimeOmniSettings + // setter), so post the change ourselves — this is what re-warms the realtime + // hub on the newly selected provider (and is a no-op for unchanged providers). 
+ NotificationCenter.default.post(name: .realtimeOmniSettingsDidChange, object: nil) } } diff --git a/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift b/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift index 393feeaa853..5edc05c7094 100644 --- a/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift +++ b/desktop/macos/Desktop/Sources/RealtimeOmni/RealtimeOmniSettings.swift @@ -59,7 +59,9 @@ final class RealtimeOmniSettings { private init() { UserDefaults.standard.register(defaults: [ - providerKey: RealtimeOmniProvider.auto.rawValue, + // Default to OpenAI (GPT Realtime 2); the user can switch to Gemini or Auto + // in Advanced → Voice Model. This default also drives the realtime hub provider. + providerKey: RealtimeOmniProvider.gptRealtime2.rawValue, enabledKey: false, ]) } diff --git a/desktop/macos/Desktop/Tests/AboutUserCardTests.swift b/desktop/macos/Desktop/Tests/AboutUserCardTests.swift new file mode 100644 index 00000000000..df06db6bdbd --- /dev/null +++ b/desktop/macos/Desktop/Tests/AboutUserCardTests.swift @@ -0,0 +1,28 @@ +import XCTest +@testable import Omi_Computer + +final class AboutUserCardTests: XCTestCase { + func testRenderIncludesNameFactsCountsAndHedge() { + let card = AboutUserCard.render( + name: "Sam", + facts: ["Lives in San Francisco", "Prefers concise answers"], + overdue: 2, + dueToday: 3 + ) + XCTAssertTrue(card.contains("")) + XCTAssertTrue(card.contains("")) + XCTAssertTrue(card.contains("Name: Sam")) + XCTAssertTrue(card.contains("- Lives in San Francisco")) + XCTAssertTrue(card.contains("- Prefers concise answers")) + XCTAssertTrue(card.contains("2 overdue")) + XCTAssertTrue(card.contains("3 due today")) + XCTAssertTrue(card.contains("quick snapshot")) + } + + func testRenderEmptyState() { + let card = AboutUserCard.render(name: "", facts: [], overdue: 0, dueToday: 0) + XCTAssertFalse(card.contains("Name:")) // no name line when empty + XCTAssertTrue(card.contains("Nothing 
saved")) // facts empty-state + XCTAssertTrue(card.contains("nothing overdue or due today")) + } +} diff --git a/desktop/macos/Desktop/Tests/HubEscalationTests.swift b/desktop/macos/Desktop/Tests/HubEscalationTests.swift new file mode 100644 index 00000000000..f7d185da3ea --- /dev/null +++ b/desktop/macos/Desktop/Tests/HubEscalationTests.swift @@ -0,0 +1,27 @@ +import XCTest + +@testable import Omi_Computer + +final class HubEscalationTests: XCTestCase { + func testBodyHasSystemPromptAndAppendsContext() { + let body = RealtimeHubTools.escalationBody( + query: "What's the best plan?", + context: "User is comparing the M3 and M4 MacBook.", + aboutUser: "\nName: Sam\n") + XCTAssertEqual(body["model"] as? String, "claude-sonnet-4-6") + let messages = body["messages"] as! [[String: String]] + XCTAssertEqual(messages[0]["role"], "system") + XCTAssertTrue(messages[0]["content"]!.contains("")) + XCTAssertEqual(messages[1]["role"], "user") + XCTAssertTrue(messages[1]["content"]!.contains("What's the best plan?")) + XCTAssertTrue(messages[1]["content"]!.contains("M3 and M4")) // context appended + } + + func testBodyOmitsContextSectionWhenEmpty() { + let body = RealtimeHubTools.escalationBody( + query: "Capital of France?", context: "", aboutUser: "") + let messages = body["messages"] as! 
[[String: String]] + XCTAssertFalse(messages[1]["content"]!.contains("Context")) + XCTAssertFalse(messages[1]["content"]!.contains("Answer concisely for a spoken reply")) + } +} diff --git a/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift b/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift new file mode 100644 index 00000000000..52e2ff52e29 --- /dev/null +++ b/desktop/macos/Desktop/Tests/HubSystemInstructionTests.swift @@ -0,0 +1,16 @@ +import XCTest +@testable import Omi_Computer + +final class HubSystemInstructionTests: XCTestCase { + func testInstructionInjectsCardAndUsesUserLanguage() { + let card = "\nName: Sam\n" + let instr = RealtimeHubTools.systemInstruction(aboutUser: card) + XCTAssertTrue(instr.contains(card)) // card injected + XCTAssertTrue(instr.lowercased().contains("language the user")) // reply-in-user-language + XCTAssertFalse(instr.contains("Always reply in English")) // old rule gone + XCTAssertTrue(instr.contains("spawn_agent")) // guardrails preserved + XCTAssertTrue(instr.contains("get_daily_recap")) + XCTAssertTrue(instr.contains("ask_higher_model")) + XCTAssertTrue(instr.contains("ANSWER YOURSELF")) + } +}