From 7eb6f001dd20125deef47a6c66b5699e339f6d0b Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" Date: Thu, 26 Feb 2026 21:47:25 +0000 Subject: [PATCH 1/3] Bump version to 1.5.0 --- Textream/Textream.xcodeproj/project.pbxproj | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Textream/Textream.xcodeproj/project.pbxproj b/Textream/Textream.xcodeproj/project.pbxproj index df30b53..3b0b5f6 100644 --- a/Textream/Textream.xcodeproj/project.pbxproj +++ b/Textream/Textream.xcodeproj/project.pbxproj @@ -273,7 +273,7 @@ "$(inherited)", "@executable_path/../Frameworks", ); - MARKETING_VERSION = 1.4.0; + MARKETING_VERSION = 1.5.0; PRODUCT_BUNDLE_IDENTIFIER = dev.fka.textream; PRODUCT_NAME = "$(TARGET_NAME)"; REGISTER_APP_GROUPS = YES; @@ -312,7 +312,7 @@ "$(inherited)", "@executable_path/../Frameworks", ); - MARKETING_VERSION = 1.4.0; + MARKETING_VERSION = 1.5.0; PRODUCT_BUNDLE_IDENTIFIER = dev.fka.textream; PRODUCT_NAME = "$(TARGET_NAME)"; REGISTER_APP_GROUPS = YES; From 9413b186a6515dbb93ebf0ef1757de3e4bffc34c Mon Sep 17 00:00:00 2001 From: G10Bi Date: Fri, 12 Jun 2026 23:28:24 +0200 Subject: [PATCH 2/3] Fix: prevent 58s timeout freeze with full engine restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Problem After ~58 seconds of continuous speech recognition, `SFSpeechRecognizer` hits its internal timeout (error 1110 / 216) and the teleprompter freezes permanently. The original code called `restartTask()` — a soft restart that reuses the existing `AVAudioEngine`. But macOS also **pauses the microphone hardware** during these timeouts. The engine's internal state becomes unrecoverable, and no amount of task-level restart can bring it back. The only way out was a manual stop/start. ## Root cause `AVAudioEngine` caches the audio device format internally. After the system pauses the mic (triggered by the 58s speech recognition limit), the cached format is invalidated but the engine doesn't detect this. A soft restart (`restartTask()`) only recreates the `SFSpeechRecognitionTask`, not the engine. The new task receives silence or corrupt buffers and fails silently. ## Solution Added `forceRestartAfterTimeout()` — a method that performs a **complete teardown and recreation** of the entire audio stack: ```swift private func forceRestartAfterTimeout() { // 1. Save state let savedOffset = recognizedCharCount let savedSource = sourceText // 2. Full cleanup (task + engine + tap) cleanupRecognition() // 3. Fresh engine — picks up current hardware format audioEngine = AVAudioEngine() // 4. Restore state sourceText = savedSource recognizedCharCount = savedOffset matchStartOffset = savedOffset // 5. Restart after short delay for resource release DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) { self.beginRecognition() } } --- Textream/Textream/SpeechRecognizer.swift | 352 ++++++++++++++++++++--- 1 file changed, 314 insertions(+), 38 deletions(-) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index 0730b1c..455ac6e 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -46,9 +46,10 @@ struct AudioInputDevice: Identifiable, Hashable { mScope: kAudioObjectPropertyScopeGlobal, mElement: kAudioObjectPropertyElementMain ) - var uid: CFString = "" as CFString - var uidSize = UInt32(MemoryLayout.size) + var uid: Unmanaged? = nil + var uidSize = UInt32(MemoryLayout?>.size) guard AudioObjectGetPropertyData(deviceID, &uidAddress, 0, nil, &uidSize, &uid) == noErr else { continue } + let uidValue = uid?.takeRetainedValue() as String? ?? "" // Get name var nameAddress = AudioObjectPropertyAddress( @@ -56,11 +57,12 @@ struct AudioInputDevice: Identifiable, Hashable { mScope: kAudioObjectPropertyScopeGlobal, mElement: kAudioObjectPropertyElementMain ) - var name: CFString = "" as CFString - var nameSize = UInt32(MemoryLayout.size) + var name: Unmanaged? = nil + var nameSize = UInt32(MemoryLayout?>.size) guard AudioObjectGetPropertyData(deviceID, &nameAddress, 0, nil, &nameSize, &name) == noErr else { continue } + let nameValue = name?.takeRetainedValue() as String? ?? "" - result.append(AudioInputDevice(id: deviceID, uid: uid as String, name: name as String)) + result.append(AudioInputDevice(id: deviceID, uid: uidValue, name: nameValue)) } return result } @@ -79,6 +81,8 @@ class SpeechRecognizer { var lastSpokenText: String = "" var shouldDismiss: Bool = false var shouldAdvancePage: Bool = false + private var lastSpokenContext: String = "" + private var lastPartialSpoken: String = "" /// True when recent audio levels indicate the user is actively speaking var isSpeaking: Bool { @@ -101,6 +105,11 @@ class SpeechRecognizer { private var pendingRestart: DispatchWorkItem? private var sessionGeneration: Int = 0 private var suppressConfigChange: Bool = false + private var requestLock = NSLock() + private var preemptiveRestartTimer: Timer? + /// Sliding window of recent match positions for confidence gating. + /// We require 2-of-3 recent results to agree before committing a forward jump. + private var recentMatchPositions: [Int] = [] /// Update the source text while preserving the current recognized char count. /// Used by Director Mode to live-edit unread text without resetting read progress. @@ -111,6 +120,7 @@ class SpeechRecognizer { normalizedSource = Self.normalize(collapsed) recognizedCharCount = min(preservingCharCount, collapsed.count) matchStartOffset = recognizedCharCount + recentMatchPositions = [] } /// Jump highlight to a specific char offset (e.g. when user taps a word) @@ -118,11 +128,38 @@ class SpeechRecognizer { recognizedCharCount = charOffset matchStartOffset = charOffset retryCount = 0 + recentMatchPositions = [] if isListening { restartRecognition() } } + private func forceRestartAfterTimeout() { + print("Timeout restart - recreating audio engine") + + // Save state + let savedOffset = recognizedCharCount + let savedSource = sourceText + + // Full stop + cleanupRecognition() + + // Recreate engine (essential!) + audioEngine = AVAudioEngine() + + // Restore state + sourceText = savedSource + recognizedCharCount = savedOffset + matchStartOffset = savedOffset + recentMatchPositions = [] + retryCount = 0 + + // Small delay to release resources + DispatchQueue.main.asyncAfter(deadline: .now() + 0.2) { [weak self] in + self?.beginRecognition() + } + } + func start(with text: String) { // Clean up any previous session immediately so pending restarts // and stale taps are removed before the async auth callback fires. @@ -135,6 +172,7 @@ class SpeechRecognizer { recognizedCharCount = 0 matchStartOffset = 0 retryCount = 0 + recentMatchPositions = [] error = nil sessionGeneration += 1 @@ -199,35 +237,49 @@ class SpeechRecognizer { isListening = false sourceText = "" retryCount = maxRetries + recentMatchPositions = [] cleanupRecognition() } func resume() { retryCount = 0 matchStartOffset = recognizedCharCount + recentMatchPositions = [] shouldDismiss = false beginRecognition() } - private func cleanupRecognition() { + private func cleanupRecognitionTask() { // Cancel any pending restart to prevent overlapping beginRecognition calls pendingRestart?.cancel() pendingRestart = nil + stopPreemptiveTimer() + if let observer = configurationChangeObserver { NotificationCenter.default.removeObserver(observer) configurationChangeObserver = nil } + requestLock.lock() recognitionRequest?.endAudio() recognitionRequest = nil + requestLock.unlock() recognitionTask?.cancel() recognitionTask = nil + } + + private func cleanupAudioEngine() { if audioEngine.isRunning { audioEngine.stop() } audioEngine.inputNode.removeTap(onBus: 0) } + private func cleanupRecognition() { + cleanupRecognitionTask() + cleanupAudioEngine() + } + /// Coalesces all delayed beginRecognition() calls into a single pending work item. /// Any previously scheduled restart is cancelled before the new one is queued. private func scheduleBeginRecognition(after delay: TimeInterval) { @@ -285,12 +337,27 @@ class SpeechRecognizer { recognitionRequest = SFSpeechAudioBufferRecognitionRequest() guard let recognitionRequest else { return } recognitionRequest.shouldReportPartialResults = true + + // Add context from last spoken words + if !lastSpokenContext.isEmpty { + recognitionRequest.contextualStrings = [lastSpokenContext] + } + + // Add contextual strings from the source text to improve STT accuracy + let upcoming = String(sourceText.dropFirst(matchStartOffset)) + let contextWords = upcoming.split(separator: " ") + .map { String($0).lowercased().filter { $0.isLetter || $0.isNumber } } + .filter { $0.count >= 5 } + let uniqueContextWords = Array(Set(contextWords).prefix(50)) + if !uniqueContextWords.isEmpty { + recognitionRequest.contextualStrings = uniqueContextWords + } let inputNode = audioEngine.inputNode - let recordingFormat = inputNode.outputFormat(forBus: 0) + let hardwareFormat = inputNode.outputFormat(forBus: 0) // Guard against invalid format during device transitions (e.g. mic switch) - guard recordingFormat.sampleRate > 0, recordingFormat.channelCount > 0 else { + guard hardwareFormat.sampleRate > 0, hardwareFormat.channelCount > 0 else { // Retry after a longer delay to let the audio system settle if retryCount < maxRetries { retryCount += 1 @@ -302,6 +369,17 @@ class SpeechRecognizer { return } + // SFSpeechRecognizer requires mono audio. Multi-channel devices (e.g. + // RODECaster Pro II at 2ch/48kHz) cause the recognition task to silently + // return no results. Request a mono tap and let AVAudioEngine downmix. + let monoFormat = AVAudioFormat( + commonFormat: hardwareFormat.commonFormat, + sampleRate: hardwareFormat.sampleRate, + channels: 1, + interleaved: hardwareFormat.isInterleaved + ) + let tapFormat = (hardwareFormat.channelCount > 1) ? monoFormat : hardwareFormat + // Observe audio configuration changes (e.g. mic switched externally) to restart gracefully configurationChangeObserver = NotificationCenter.default.addObserver( forName: .AVAudioEngineConfigurationChange, @@ -315,8 +393,8 @@ class SpeechRecognizer { // Belt-and-suspenders: ensure no stale tap exists before installing inputNode.removeTap(onBus: 0) - inputNode.installTap(onBus: 0, bufferSize: 1024, format: nil) { [weak self] buffer, _ in - recognitionRequest.append(buffer) + inputNode.installTap(onBus: 0, bufferSize: 1024, format: tapFormat) { [weak self] buffer, _ in + self?.appendBufferToRequest(buffer) guard let channelData = buffer.floatChannelData?[0] else { return } let frameLength = Int(buffer.frameLength) @@ -343,16 +421,40 @@ class SpeechRecognizer { DispatchQueue.main.async { // Ignore stale results from a previous session guard self.sessionGeneration == currentGeneration else { return } + self.lastPartialSpoken = spoken self.retryCount = 0 // Reset on success self.lastSpokenText = spoken self.matchCharacters(spoken: spoken) } } - if error != nil { + if let error { DispatchQueue.main.async { - // If recognitionRequest is nil, cleanup already ran (intentional cancel) — don't retry + guard self.sessionGeneration == currentGeneration else { return } guard self.recognitionRequest != nil else { return } - if self.isListening && !self.shouldDismiss && !self.sourceText.isEmpty && self.retryCount < self.maxRetries { + guard self.isListening && !self.shouldDismiss && !self.sourceText.isEmpty else { + self.isListening = false + return + } + + self.matchStartOffset = self.recognizedCharCount + + let nsError = error as NSError + let isTimeout = nsError.code == 1110 || nsError.code == 216 + + if isTimeout { + self.retryCount = 0 + // Advance with the last partial phrase before interruption + if !self.lastPartialSpoken.isEmpty { + let extraMatch = self.charLevelMatch(spoken: self.lastPartialSpoken) + self.recognizedCharCount = min( + self.matchStartOffset + extraMatch, + self.sourceText.count + ) + self.matchStartOffset = self.recognizedCharCount + } + // After a timeout, ALWAYS perform a full engine restart + self.forceRestartAfterTimeout() + } else if self.retryCount < self.maxRetries { self.retryCount += 1 let delay = min(Double(self.retryCount) * 0.5, 1.5) self.scheduleBeginRecognition(after: delay) @@ -367,6 +469,7 @@ class SpeechRecognizer { audioEngine.prepare() try audioEngine.start() isListening = true + startPreemptiveTimer() } catch { // Transient failure after a device switch — retry with longer delay if retryCount < maxRetries { @@ -380,12 +483,142 @@ class SpeechRecognizer { } private func restartRecognition() { - // Reset retries so the fresh engine gets a full set of attempts retryCount = 0 isListening = true - // Longer delay to let the audio system fully settle after a device change - cleanupRecognition() - scheduleBeginRecognition(after: 0.5) + if audioEngine.isRunning { + restartTask() + } else { + cleanupRecognition() + scheduleBeginRecognition(after: 0.5) + } + } + + // MARK: - Thread-safe buffer appending + + private func appendBufferToRequest(_ buffer: AVAudioPCMBuffer) { + requestLock.lock() + recognitionRequest?.append(buffer) + requestLock.unlock() + } + + // MARK: - Soft restart (task only, keeps audio engine running) + + private func restartTask() { + // Update match offset before restarting + matchStartOffset = recognizedCharCount + recentMatchPositions = [] + + // Cancel any pending restart + pendingRestart?.cancel() + pendingRestart = nil + + // IMPORTANT: Do NOT stop audioEngine or remove the tap + // Only stop the recognition task, not the audio stream + + // Lock the old request and task + requestLock.lock() + let oldRequest = recognitionRequest + recognitionRequest = nil // Prevent new appends + requestLock.unlock() + + oldRequest?.endAudio() + recognitionTask?.cancel() + recognitionTask = nil + + // Create a NEW request but reuse the same audioEngine (still running) + let newRequest = SFSpeechAudioBufferRecognitionRequest() + newRequest.shouldReportPartialResults = true + + // Add context from last spoken words + if !lastSpokenContext.isEmpty { + newRequest.contextualStrings = [lastSpokenContext] + } + + // Context for remaining text + let upcoming = String(sourceText.dropFirst(matchStartOffset)) + let contextWords = upcoming.split(separator: " ") + .map { String($0).lowercased().filter { $0.isLetter || $0.isNumber } } + .filter { $0.count >= 5 } + let uniqueWords = Array(Set(contextWords).prefix(50)) + if !uniqueWords.isEmpty { + newRequest.contextualStrings = uniqueWords + } + + // Connect the new request to existing audio buffer + requestLock.lock() + recognitionRequest = newRequest + requestLock.unlock() + + // If recognizer is not available, retry later + guard let speechRecognizer, speechRecognizer.isAvailable else { + scheduleBeginRecognition(after: 0.2) + return + } + + let currentGeneration = sessionGeneration + recognitionTask = speechRecognizer.recognitionTask(with: newRequest) { [weak self] result, error in + guard let self else { return } + if let result { + let spoken = result.bestTranscription.formattedString + DispatchQueue.main.async { + guard self.sessionGeneration == currentGeneration else { return } + self.lastPartialSpoken = spoken + self.retryCount = 0 + self.lastSpokenText = spoken + self.matchCharacters(spoken: spoken) + } + } + if let error { + DispatchQueue.main.async { + guard self.sessionGeneration == currentGeneration else { return } + guard self.recognitionRequest != nil else { return } + guard self.isListening && !self.shouldDismiss && !self.sourceText.isEmpty else { + self.isListening = false + return + } + + self.matchStartOffset = self.recognizedCharCount + let nsError = error as NSError + let isTimeout = nsError.code == 1110 || nsError.code == 216 + + if isTimeout { + self.retryCount = 0 + if !self.lastPartialSpoken.isEmpty { + let extraMatch = self.charLevelMatch(spoken: self.lastPartialSpoken) + self.recognizedCharCount = min( + self.matchStartOffset + extraMatch, + self.sourceText.count + ) + self.matchStartOffset = self.recognizedCharCount + } + // CORRECT: full restart instead of restartTask() + self.forceRestartAfterTimeout() + } else if self.retryCount < self.maxRetries { + self.retryCount += 1 + let delay = min(Double(self.retryCount) * 0.5, 1.5) + self.scheduleBeginRecognition(after: delay) + } else { + self.isListening = false + } + } + } + } + } + + // MARK: - Pre-emptive restart timer + + private func startPreemptiveTimer() { + // Timer disabled - let natural timeout handle the restart + // preemptiveRestartTimer?.invalidate() + // preemptiveRestartTimer = Timer.scheduledTimer(withTimeInterval: 55.0, repeats: true) { [weak self] _ in + // guard let self, self.isListening, !self.sourceText.isEmpty else { return } + // self.restartTask() + // } + } + + private func stopPreemptiveTimer() { + preemptiveRestartTimer?.invalidate() + preemptiveRestartTimer = nil } // MARK: - Fuzzy character-level matching @@ -397,19 +630,61 @@ class SpeechRecognizer { // Strategy 2: word-level match (handles STT word substitutions) let wordResult = wordLevelMatch(spoken: spoken) - let best = max(charResult, wordResult) + // Use agreement-based selection instead of blind max(). + // If both strategies agree within a tolerance, use the average. + // If they disagree wildly, use the more conservative (lower) result + // to avoid false-positive jumps. + let best: Int + let tolerance = 20 // characters + if abs(charResult - wordResult) <= tolerance { + best = (charResult + wordResult) / 2 + } else { + best = min(charResult, wordResult) + } - // Only move forward from the match start offset let newCount = matchStartOffset + best - if newCount > recognizedCharCount { - recognizedCharCount = min(newCount, sourceText.count) + guard newCount > recognizedCharCount else { return } + + let candidate = min(newCount, sourceText.count) + + // Confidence gating: require 2-of-3 recent results to agree on + // forward movement to avoid single-result false-positive jumps. + recentMatchPositions.append(candidate) + if recentMatchPositions.count > 3 { + recentMatchPositions.removeFirst() + } + + // Check if at least 2 of the recent positions agree (within tolerance) + let agreementThreshold = 10 // characters + var confirmed = false + if recentMatchPositions.count >= 2 { + var agreeCount = 0 + for pos in recentMatchPositions { + if abs(pos - candidate) <= agreementThreshold { + agreeCount += 1 + } + } + confirmed = agreeCount >= 2 + } + + // Small forward movements (< 1 word length) are always allowed + // to keep the highlight responsive for normal reading + let smallStep = candidate - recognizedCharCount <= 15 + + if confirmed || smallStep { + recognizedCharCount = candidate + + // Preserve last 5 words for next restart context + let contextWords = spoken.split(separator: " ").map(String.init).suffix(5) + lastSpokenContext = contextWords.joined(separator: " ") } } private func charLevelMatch(spoken: String) -> Int { let remainingSource = String(sourceText.dropFirst(matchStartOffset)) - let src = Array(remainingSource.lowercased().unicodeScalars).map { Character($0) } - let spk = Array(Self.normalize(spoken).unicodeScalars).map { Character($0) } + // Use Character arrays (not unicodeScalars) so counts match sourceText.count + let src = Array(remainingSource.lowercased()) + let spk = Array(Self.normalize(spoken)) var si = 0 var ri = 0 @@ -466,10 +741,10 @@ class SpeechRecognizer { } if found { continue } - // Skip both (substitution) - si += 1 + // No resync found — advance spoken pointer only. + // Do NOT advance lastGoodOrigIndex; this is a genuine mismatch, + // not a confirmed match position. ri += 1 - lastGoodOrigIndex = si } } @@ -506,13 +781,14 @@ class SpeechRecognizer { .filter { $0.isLetter || $0.isNumber } if srcWord == spkWord || isFuzzyMatch(srcWord, spkWord) { - // Count original chars including trailing punctuation, plus space + // Count original chars including trailing punctuation matchedCharCount += sourceWords[si].count - if si < sourceWords.count - 1 { - matchedCharCount += 1 // space - } si += 1 ri += 1 + // Add space separator only if there's a following word + if si < sourceWords.count { + matchedCharCount += 1 + } } else { // Try skipping up to 3 spoken words (STT hallucinated words) var foundSpk = false @@ -570,16 +846,16 @@ class SpeechRecognizer { if a.isEmpty || b.isEmpty { return false } // Exact match if a == b { return true } - // One starts with the other (phonetic prefix: "not" ~ "notch") - if a.hasPrefix(b) || b.hasPrefix(a) { return true } - // One contains the other - if a.contains(b) || b.contains(a) { return true } - // Shared prefix >= 60% of shorter word - let shared = zip(a, b).prefix(while: { $0 == $1 }).count let shorter = min(a.count, b.count) - if shorter >= 2 && shared >= max(2, shorter * 3 / 5) { return true } - // Edit distance tolerance + // Prefix match — only for words with at least 3 chars to avoid + // false positives like "or" matching "organization" + if shorter >= 3 && (a.hasPrefix(b) || b.hasPrefix(a)) { return true } + // Shared prefix >= 60% of shorter word (min 3 chars shared) + let shared = zip(a, b).prefix(while: { $0 == $1 }).count + if shorter >= 3 && shared >= max(3, shorter * 3 / 5) { return true } + // Edit distance tolerance — stricter for very short words let dist = editDistance(a, b) + if shorter <= 2 { return false } // 2-char words must be exact if shorter <= 4 { return dist <= 1 } if shorter <= 8 { return dist <= 2 } return dist <= max(a.count, b.count) / 3 From cecd1859771d5264c62d14c906078eb3625869af Mon Sep 17 00:00:00 2001 From: G10Bi Date: Sat, 13 Jun 2026 00:25:10 +0200 Subject: [PATCH 3/3] Fix assignment of contextualStrings in SpeechRecognizer Little error... --- Textream/Textream/SpeechRecognizer.swift | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Textream/Textream/SpeechRecognizer.swift b/Textream/Textream/SpeechRecognizer.swift index 98ffb9d..b5b431d 100644 --- a/Textream/Textream/SpeechRecognizer.swift +++ b/Textream/Textream/SpeechRecognizer.swift @@ -350,7 +350,7 @@ class SpeechRecognizer { .filter { $0.count >= 5 } allContext.append(contentsOf: Array(Set(contextWords)).prefix(50)) if !allContext.isEmpty { - newRequest.contextualStrings = allContext + recognitionRequest.contextualStrings = allContext } let inputNode = audioEngine.inputNode