From f123f4e00129a94ae7199b0b43ae0fb81336e53e Mon Sep 17 00:00:00 2001 From: Matt Van Horn <455140+mvanhorn@users.noreply.github.com> Date: Fri, 12 Jun 2026 00:25:34 -0700 Subject: [PATCH] fix: double-resume of checked continuation in ScreenTextExtractor.extractText(from:) crashes app on tiny OCR screenshots Fixes #502 --- Cotabby.xcodeproj/project.pbxproj | 4 + .../Services/Visual/ScreenTextExtractor.swift | 91 ++++++++++++++----- CotabbyTests/ScreenTextExtractorTests.swift | 59 ++++++++++++ 3 files changed, 133 insertions(+), 21 deletions(-) create mode 100644 CotabbyTests/ScreenTextExtractorTests.swift diff --git a/Cotabby.xcodeproj/project.pbxproj b/Cotabby.xcodeproj/project.pbxproj index 71efd8f4..ff78861d 100644 --- a/Cotabby.xcodeproj/project.pbxproj +++ b/Cotabby.xcodeproj/project.pbxproj @@ -446,6 +446,7 @@ A736C52C0D280A35946E37A3 /* SurfaceContextComposer.swift in Sources */ = {isa = PBXBuildFile; fileRef = C602357DDED5D11C8B4567FB /* SurfaceContextComposer.swift */; }; A773D96AC9EC16231633542C /* DownloadOutcomeClassifier.swift in Sources */ = {isa = PBXBuildFile; fileRef = 3DE1975F3B5F4A70478DBF41 /* DownloadOutcomeClassifier.swift */; }; A7A1B9BE242959B3EE396D27 /* LaunchAtLogin in Frameworks */ = {isa = PBXBuildFile; productRef = 7B0278A63FEEE8DEDB6C50DB /* LaunchAtLogin */; }; + A81E3481B0067E9D6F9D1325 /* ScreenTextExtractorTests.swift in Sources */ = {isa = PBXBuildFile; fileRef = 6C34A98E5B05DB3F4FE81E70 /* ScreenTextExtractorTests.swift */; }; A87978083EBE1AC294377F7C /* HuggingFaceSearchService.swift in Sources */ = {isa = PBXBuildFile; fileRef = 8F426127917FCB1096134732 /* HuggingFaceSearchService.swift */; }; A8A294534897C461CA5968F3 /* UnitConversionEvaluator.swift in Sources */ = {isa = PBXBuildFile; fileRef = 82E7794DF60664B1FA8F6E7B /* UnitConversionEvaluator.swift */; }; A93A741C8C3973687D28F0B6 /* SuggestionEngineModels.swift in Sources */ = {isa = PBXBuildFile; fileRef = ADBE3E6CC585C1683787C877 /* SuggestionEngineModels.swift */; }; @@ -823,6 +824,7 @@ 684737E62EE6495A71344923 /* DeepGeometryWalkThrottle.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = DeepGeometryWalkThrottle.swift; sourceTree = ""; }; 6A44BEC8C23FF227731DD0CD /* FocusCapabilityFlickerGate.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = FocusCapabilityFlickerGate.swift; sourceTree = ""; }; 6B2D97BAA3618A7D0357AC44 /* SuggestionWorkController.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = SuggestionWorkController.swift; sourceTree = ""; }; + 6C34A98E5B05DB3F4FE81E70 /* ScreenTextExtractorTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ScreenTextExtractorTests.swift; sourceTree = ""; }; 6CF1FBAABEF545B620AF8D78 /* ru-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "ru-100k.txt"; sourceTree = ""; }; 6D4C1EF008B9DFA753D561D3 /* LlamaEvalScoringTests.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = LlamaEvalScoringTests.swift; sourceTree = ""; }; 6DB982BF30B3601F57277776 /* fr-100k.txt */ = {isa = PBXFileReference; lastKnownFileType = text; path = "fr-100k.txt"; sourceTree = ""; }; @@ -1468,6 +1470,7 @@ 1909DF39C47A113382BB53B6 /* RequestIDTests.swift */, B1E6D9CCC0AA3674FEE57AE0 /* RuntimeBootstrapModelTests.swift */, B2BFD19A159680A495EE02FD /* ScreenshotContextGeneratorTests.swift */, + 6C34A98E5B05DB3F4FE81E70 /* ScreenTextExtractorTests.swift */, 474560E524C1D74BAB1570DA /* SecureFieldDetectorTests.swift */, D5A5591BEB9EE7B6E9064412 /* SelfCaptureGateTests.swift */, 2D7360A6D4261989A66658ED /* SentenceBoundaryClassifierTests.swift */, @@ -2489,6 +2492,7 @@ 7EB20783E0D36715D1230A5C /* PromptSectionBudgetTests.swift in Sources */, 1C46642846D8FD1475AA5CCF /* RequestIDTests.swift in Sources */, AD6E005ABE34AB7EBD92A30D /* RuntimeBootstrapModelTests.swift in Sources */, + A81E3481B0067E9D6F9D1325 /* ScreenTextExtractorTests.swift in Sources */, 1B3FFCB9A979F49BF86EAAD4 /* ScreenshotContextGeneratorTests.swift in Sources */, 4FC52FB28AFC013F000D8FF9 /* SecureFieldDetectorTests.swift in Sources */, AF26E77871200BB1FAAEBE79 /* SelfCaptureGateTests.swift in Sources */, diff --git a/Cotabby/Services/Visual/ScreenTextExtractor.swift b/Cotabby/Services/Visual/ScreenTextExtractor.swift index 2b6b301f..2111627b 100644 --- a/Cotabby/Services/Visual/ScreenTextExtractor.swift +++ b/Cotabby/Services/Visual/ScreenTextExtractor.swift @@ -49,7 +49,33 @@ enum ScreenTextExtractionError: LocalizedError { } } +/// Guards the callback-to-async bridge for a single OCR request. +/// +/// Vision can report a request failure through `VNRecognizeTextRequest`'s completion handler and +/// then rethrow that same failure from `VNImageRequestHandler.perform(_:)`. Swift checked +/// continuations must resume exactly once, so both paths share this short-lived gate. +private final class OCRContinuationResumer { + private let lock = NSLock() + private var hasResumed = false + + func resume(_ action: () -> Void) { + lock.lock() + let shouldResume = !hasResumed + if shouldResume { + hasResumed = true + } + lock.unlock() + + guard shouldResume else { return } + action() + } +} + struct ScreenTextExtractor: ScreenTextExtracting { + /// Vision cannot produce useful text from near-zero-sized request images. Treating those as + /// empty OCR keeps degenerate screenshots on the same unavailable-context path as blank windows. + private static let minimumOCRImageDimension = 4 + let maxImageDimension: Int let maxRecognizedCharacters: Int @@ -72,13 +98,28 @@ struct ScreenTextExtractor: ScreenTextExtracting { "downsampled=\(wasDownsampled)" ) + guard preparedImage.width >= Self.minimumOCRImageDimension, + preparedImage.height >= Self.minimumOCRImageDimension else { + log( + "ocr-skipped-too-small input=\(image.width)x\(image.height) " + + "prepared=\(preparedImage.width)x\(preparedImage.height)" + ) + throw ScreenTextExtractionError.noRecognizedText + } + return try await withCheckedThrowingContinuation { continuation in + let resumer = OCRContinuationResumer() + DispatchQueue.global(qos: .userInitiated).async { let request = VNRecognizeTextRequest { request, error in if let error { - let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) - self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)") - continuation.resume(throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription)) + resumer.resume { + let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) + self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)") + continuation.resume( + throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription) + ) + } return } @@ -105,25 +146,29 @@ struct ScreenTextExtractor: ScreenTextExtracting { let cappedText = String(joinedText.prefix(maxRecognizedCharacters)) guard !cappedText.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty else { - let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) - self.log("ocr-empty elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count)") - continuation.resume(throwing: ScreenTextExtractionError.noRecognizedText) + resumer.resume { + let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) + self.log("ocr-empty elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count)") + continuation.resume(throwing: ScreenTextExtractionError.noRecognizedText) + } return } - let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) - self.log( - "ocr-success elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count) chars=\(cappedText.count) " + - "preview=\(self.preview(cappedText))" - ) - - continuation.resume( - returning: ExtractedScreenText( - text: cappedText, - lineCount: recognizedLines.count, - lines: recognizedLines + resumer.resume { + let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) + self.log( + "ocr-success elapsed_ms=\(elapsedMilliseconds) lines=\(recognizedLines.count) chars=\(cappedText.count) " + + "preview=\(self.preview(cappedText))" + ) + + continuation.resume( + returning: ExtractedScreenText( + text: cappedText, + lineCount: recognizedLines.count, + lines: recognizedLines + ) ) - ) + } } // Accurate OCR is slower, but visual context is only captured once per focused @@ -139,9 +184,13 @@ struct ScreenTextExtractor: ScreenTextExtracting { let handler = VNImageRequestHandler(cgImage: preparedImage, options: [:]) try handler.perform([request]) } catch { - let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) - self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)") - continuation.resume(throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription)) + resumer.resume { + let elapsedMilliseconds = Int(Date().timeIntervalSince(startedAt) * 1000) + self.log("ocr-failed elapsed_ms=\(elapsedMilliseconds) reason=\(error.localizedDescription)") + continuation.resume( + throwing: ScreenTextExtractionError.ocrFailed(error.localizedDescription) + ) + } } } } diff --git a/CotabbyTests/ScreenTextExtractorTests.swift b/CotabbyTests/ScreenTextExtractorTests.swift new file mode 100644 index 00000000..05adebad --- /dev/null +++ b/CotabbyTests/ScreenTextExtractorTests.swift @@ -0,0 +1,59 @@ +import CoreGraphics +import XCTest +@testable import Cotabby + +/// Direct tests for the Vision-backed OCR service boundary. +/// +/// Screenshot-context tests usually stub OCR so they can focus on orchestration. This file exists +/// separately because the crash happened inside `ScreenTextExtractor`'s callback-to-async bridge, +/// which only runs when the real Vision request path is exercised. +final class ScreenTextExtractorTests: XCTestCase { + func test_extractText_tinyImagesReturnNoRecognizedTextWithoutCrashing() async throws { + let extractor = ScreenTextExtractor() + + for dimension in [1, 2] { + let image = try makeSolidImage(width: dimension, height: dimension) + await assertNoRecognizedText(from: image, extractor: extractor) + } + } + + func test_extractText_blankNormalImageReturnsNoRecognizedText() async throws { + let image = try makeSolidImage(width: 128, height: 128) + + await assertNoRecognizedText(from: image, extractor: ScreenTextExtractor()) + } + + private func assertNoRecognizedText( + from image: CGImage, + extractor: ScreenTextExtractor, + file: StaticString = #filePath, + line: UInt = #line + ) async { + do { + _ = try await extractor.extractText(from: image) + XCTFail("Expected OCR to report no recognized text.", file: file, line: line) + } catch ScreenTextExtractionError.noRecognizedText { + // Expected: blank or degenerate screenshots should be treated as unavailable context. + } catch { + XCTFail("Expected noRecognizedText, got \(error).", file: file, line: line) + } + } + + private func makeSolidImage(width: Int, height: Int) throws -> CGImage { + let colorSpace = CGColorSpaceCreateDeviceRGB() + let context = try XCTUnwrap( + CGContext( + data: nil, + width: width, + height: height, + bitsPerComponent: 8, + bytesPerRow: width * 4, + space: colorSpace, + bitmapInfo: CGImageAlphaInfo.premultipliedLast.rawValue + ) + ) + context.setFillColor(CGColor(gray: 1, alpha: 1)) + context.fill(CGRect(x: 0, y: 0, width: width, height: height)) + return try XCTUnwrap(context.makeImage()) + } +}