From 14545572062f3b90bf5e8156f3535ccb7ca07dca Mon Sep 17 00:00:00 2001 From: Saksham Bhutani Date: Mon, 29 Jun 2026 10:36:19 -0400 Subject: [PATCH 1/2] added multimodal image input for all providers --- .../Attachments/CoachAttachmentStore.swift | 145 +++++++++++++++ .../Coach/Config/CoachFeatureFlags.swift | 1 + PulseLoop/Coach/Config/CoachSettings.swift | 4 + .../Coach/Config/CoachSettingsSection.swift | 4 + PulseLoop/Coach/Gemini/GeminiClient.swift | 53 +++++- PulseLoop/Coach/OpenAI/ResponsesTypes.swift | 14 +- .../Coach/OpenRouter/OpenRouterClient.swift | 34 +++- .../Orchestration/CoachOrchestrator.swift | 20 ++- .../Coach/ViewModels/CoachViewModel.swift | 24 ++- PulseLoop/Info.plist | 2 + PulseLoop/Models/PulseModels.swift | 7 +- PulseLoop/Views/CameraPicker.swift | 48 +++++ PulseLoop/Views/CoachView.swift | 168 +++++++++++++++--- PulseLoopTests/CoachMultimodalTests.swift | 142 +++++++++++++++ 14 files changed, 616 insertions(+), 50 deletions(-) create mode 100644 PulseLoop/Coach/Attachments/CoachAttachmentStore.swift create mode 100644 PulseLoop/Views/CameraPicker.swift create mode 100644 PulseLoopTests/CoachMultimodalTests.swift diff --git a/PulseLoop/Coach/Attachments/CoachAttachmentStore.swift b/PulseLoop/Coach/Attachments/CoachAttachmentStore.swift new file mode 100644 index 0000000..7dbaaaf --- /dev/null +++ b/PulseLoop/Coach/Attachments/CoachAttachmentStore.swift @@ -0,0 +1,145 @@ +import Foundation +import UIKit + +/// A reference to an image attached to a `CoachMessage`. The bytes live on disk in +/// `Documents/coach_attachments/`; the message persists only this small ref +/// (as JSON in `CoachMessage.attachmentsJSON`). Mirrors the `*JSON` ref convention +/// already used for `PendingAction` / `CoachTurnError` — no SwiftData blob, no +/// `@Attribute(.externalStorage)`, so the store stays small and fast. +struct CoachAttachmentRef: Codable, Equatable, Hashable { + /// Filename within `coach_attachments/` (e.g. `<uuid>.jpg`). 
+ let file: String + /// MIME type of the stored bytes (always `image/jpeg` in v1). + let mime: String + let width: Int + let height: Int + + init(file: String, mime: String = "image/jpeg", width: Int, height: Int) { + self.file = file + self.mime = mime + self.width = width + self.height = height + } + + /// JSON form for the (array-valued) `CoachMessage.attachmentsJSON` field. + static func encode(_ refs: [CoachAttachmentRef]) -> String? { + guard !refs.isEmpty, let data = try? JSONEncoder().encode(refs) else { return nil } + return String(data: data, encoding: .utf8) + } + + static func decode(fromJSON json: String?) -> [CoachAttachmentRef] { + guard let json, let data = json.data(using: .utf8) else { return [] } + return (try? JSONDecoder().decode([CoachAttachmentRef].self, from: data)) ?? [] + } +} + +/// The wire-ready forms of one image, built once from a `CoachAttachmentRef`'s +/// bytes and handed to the request builders. Each provider picks the shape it +/// needs: OpenAI/OpenRouter take the `data:` URL; Gemini takes the raw base64 + +/// `mimeType`. Sendable so it can cross the orchestrator's concurrency boundary. +struct CoachImagePayload: Sendable, Equatable { + /// `data:image/jpeg;base64,<…>` — used by OpenAI `input_image` and OpenRouter `image_url`. + let dataURL: String + /// Bare base64 (no `data:` prefix) — used by Gemini `inlineData.data`. + let rawBase64: String + let mimeType: String +} + +/// On-device store for coach image attachments: compresses + writes incoming +/// images, loads them back for the chat bubble, and produces the base64 payloads +/// the model clients send. Uses `FileManager` + the app Documents directory (the +/// same primitive `DiagnosticsExporter` already relies on). +enum CoachAttachmentStore { + /// Longest-edge cap applied before JPEG-encoding. Keeps request payloads small + /// (all three providers bill by image size / cap total request bytes) while + /// staying sharp enough for the model to read charts and labels. 
+ private static let maxDimension: CGFloat = 1024 + private static let jpegQuality: CGFloat = 0.7 + static let mimeType = "image/jpeg" + + /// `Documents/coach_attachments/`, created lazily. + private static func directory() -> URL? { + guard let docs = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first else { + return nil + } + let dir = docs.appendingPathComponent("coach_attachments", isDirectory: true) + if !FileManager.default.fileExists(atPath: dir.path) { + try? FileManager.default.createDirectory(at: dir, withIntermediateDirectories: true) + } + return dir + } + + private static func url(for ref: CoachAttachmentRef) -> URL? { + directory()?.appendingPathComponent(ref.file, isDirectory: false) + } + + // MARK: - Save + + /// Downscales + JPEG-compresses `image`, writes it to a new `<uuid>.jpg`, and + /// returns the ref. Returns nil if the bytes can't be produced or written. + static func save(_ image: UIImage) -> CoachAttachmentRef? { + let scaled = downscaled(image) + guard let data = scaled.jpegData(compressionQuality: jpegQuality), + let dir = directory() else { return nil } + let file = "\(UUID().uuidString).jpg" + let dest = dir.appendingPathComponent(file, isDirectory: false) + do { + try data.write(to: dest, options: .atomic) + } catch { + return nil + } + return CoachAttachmentRef( + file: file, + mime: mimeType, + width: Int(scaled.size.width * scaled.scale), + height: Int(scaled.size.height * scaled.scale) + ) + } + + private static func downscaled(_ image: UIImage) -> UIImage { + let size = image.size + let longEdge = max(size.width, size.height) + guard longEdge > maxDimension else { return image } + let ratio = maxDimension / longEdge + let target = CGSize(width: size.width * ratio, height: size.height * ratio) + let format = UIGraphicsImageRendererFormat.default() + format.scale = 1 + return UIGraphicsImageRenderer(size: target, format: format).image { _ in + image.draw(in: CGRect(origin: .zero, size: target)) + } + } + + // 
MARK: - Load + + static func data(for ref: CoachAttachmentRef) -> Data? { + guard let url = url(for: ref) else { return nil } + return try? Data(contentsOf: url) + } + + static func loadImage(_ ref: CoachAttachmentRef) -> UIImage? { + guard let data = data(for: ref) else { return nil } + return UIImage(data: data) + } + + static func delete(_ ref: CoachAttachmentRef) { + guard let url = url(for: ref) else { return } + try? FileManager.default.removeItem(at: url) + } + + // MARK: - Wire payloads + + /// Builds the model-ready payload (data URL + raw base64) for a stored ref. + static func payload(for ref: CoachAttachmentRef) -> CoachImagePayload? { + guard let data = data(for: ref) else { return nil } + let base64 = data.base64EncodedString() + return CoachImagePayload( + dataURL: "data:\(ref.mime);base64,\(base64)", + rawBase64: base64, + mimeType: ref.mime + ) + } + + static func payloads(for refs: [CoachAttachmentRef]) -> [CoachImagePayload] { + refs.compactMap { payload(for: $0) } + } +} diff --git a/PulseLoop/Coach/Config/CoachFeatureFlags.swift b/PulseLoop/Coach/Config/CoachFeatureFlags.swift index 5034e84..889418c 100644 --- a/PulseLoop/Coach/Config/CoachFeatureFlags.swift +++ b/PulseLoop/Coach/Config/CoachFeatureFlags.swift @@ -29,6 +29,7 @@ struct CoachFeatureFlags { var webSearchEnabled: Bool { settings.enableWebSearch } var writeToolsEnabled: Bool { settings.enableWriteTools } var liveMeasurementsEnabled: Bool { settings.enableLiveMeasurements } + var imageInputEnabled: Bool { settings.enableImageInput } var maxToolCalls: Int { max(1, settings.maxToolCalls) } var maxRounds: Int { max(1, settings.maxRounds) } diff --git a/PulseLoop/Coach/Config/CoachSettings.swift b/PulseLoop/Coach/Config/CoachSettings.swift index efc34a0..9f0ba9f 100644 --- a/PulseLoop/Coach/Config/CoachSettings.swift +++ b/PulseLoop/Coach/Config/CoachSettings.swift @@ -126,6 +126,9 @@ struct CoachSettings: Codable, Equatable { /// until Milestone B wires confirmation gates. 
var enableWriteTools: Bool = false var enableLiveMeasurements: Bool = false + /// When true, the coach composer shows a camera/photo button so the user can + /// attach an image to a message (multimodal input). Off by default. + var enableImageInput: Bool = false var maxToolCalls: Int = 8 var maxRounds: Int = 4 // Milestone D — automated daily check-in notifications. @@ -158,6 +161,7 @@ struct CoachSettings: Codable, Equatable { orProviderSort = try c.decodeIfPresent(String.self, forKey: .orProviderSort) enableWriteTools = try c.decodeIfPresent(Bool.self, forKey: .enableWriteTools) ?? d.enableWriteTools enableLiveMeasurements = try c.decodeIfPresent(Bool.self, forKey: .enableLiveMeasurements) ?? d.enableLiveMeasurements + enableImageInput = try c.decodeIfPresent(Bool.self, forKey: .enableImageInput) ?? d.enableImageInput maxToolCalls = try c.decodeIfPresent(Int.self, forKey: .maxToolCalls) ?? d.maxToolCalls maxRounds = try c.decodeIfPresent(Int.self, forKey: .maxRounds) ?? d.maxRounds notificationsEnabled = try c.decodeIfPresent(Bool.self, forKey: .notificationsEnabled) ?? 
d.notificationsEnabled diff --git a/PulseLoop/Coach/Config/CoachSettingsSection.swift b/PulseLoop/Coach/Config/CoachSettingsSection.swift index 487efdc..5d9f43b 100644 --- a/PulseLoop/Coach/Config/CoachSettingsSection.swift +++ b/PulseLoop/Coach/Config/CoachSettingsSection.swift @@ -161,6 +161,7 @@ struct CoachSettingsSection: View { toggleRow("AI actions (set goals, log, edit)", isOn: writeToolsBinding) toggleRow("Live ring measurements", isOn: liveMeasurementsBinding) + toggleRow("Image input (attach photos)", isOn: imageInputBinding) if !memories.isEmpty { SectionHeader(title: "Coach memory", action: nil) @@ -398,6 +399,9 @@ struct CoachSettingsSection: View { private var liveMeasurementsBinding: Binding { Binding(get: { store.settings.enableLiveMeasurements }, set: { store.settings.enableLiveMeasurements = $0 }) } + private var imageInputBinding: Binding { + Binding(get: { store.settings.enableImageInput }, set: { store.settings.enableImageInput = $0 }) + } // MARK: - Key actions diff --git a/PulseLoop/Coach/Gemini/GeminiClient.swift b/PulseLoop/Coach/Gemini/GeminiClient.swift index e114bfa..e6adef7 100644 --- a/PulseLoop/Coach/Gemini/GeminiClient.swift +++ b/PulseLoop/Coach/Gemini/GeminiClient.swift @@ -120,13 +120,52 @@ final class GeminiClient: ResponsesClient, @unchecked Sendable { systemText = systemParts.joined(separator: "\n\n") for item in conversationItems { - guard let role = item["role"] as? String, - let content = item["content"] as? String else { continue } + guard let role = item["role"] as? String else { continue } + let parts = geminiParts(from: item) + guard !parts.isEmpty else { continue } let geminiRole = role == "assistant" ? "model" : "user" - contents.append(["role": geminiRole, "parts": [["text": content]]]) + contents.append(["role": geminiRole, "parts": parts]) } } + /// Converts a Responses-API message item's `content` into Gemini `parts`. Text + /// items keep `content` as a String → `[{"text": …}]` (unchanged path). 
Image + /// items carry `content` as the OpenAI content-part array (`input_text` + + /// `input_image`), which we map to `{"text": …}` + `{"inlineData": {mimeType, data}}`. + private func geminiParts(from item: [String: Any]) -> [[String: Any]] { + if let text = item["content"] as? String { + return [["text": text]] + } + guard let parts = item["content"] as? [[String: Any]] else { return [] } + var out: [[String: Any]] = [] + for part in parts { + switch part["type"] as? String { + case "input_text", "text": + if let text = part["text"] as? String { out.append(["text": text]) } + case "input_image": + if let inline = inlineData(fromImageURL: part["image_url"] as? String) { + out.append(["inlineData": inline]) + } + default: + break + } + } + return out + } + + /// Splits an `input_image` `data:<mime>;base64,<data>` URL into Gemini's + /// `inlineData` object (`mimeType` + bare base64 `data`). + private func inlineData(fromImageURL url: String?) -> [String: String]? { + guard let url, url.hasPrefix("data:"), + let comma = url.firstIndex(of: ","), + let semicolon = url.firstIndex(of: ";"), + url.distance(from: url.startIndex, to: semicolon) < url.distance(from: url.startIndex, to: comma) + else { return nil } + let mime = String(url[url.index(url.startIndex, offsetBy: 5).. [String: Any] { - ["role": role, "content": content] + /// One input message item. The text path keeps `content` a plain String so the + /// adapter clients' `content as? String` branches are untouched; images are + /// purely additive — only when `images` is non-empty does `content` become the + /// Responses-API content-part array (`input_text` + `input_image`). 
+ static func message(role: String, content: String, images: [CoachImagePayload] = []) -> [String: Any] { + guard !images.isEmpty else { return ["role": role, "content": content] } + var parts: [[String: Any]] = [["type": "input_text", "text": content]] + for img in images { + parts.append(["type": "input_image", "image_url": img.dataURL]) + } + return ["role": role, "content": parts] } /// A function-call result item to feed back into the next turn. diff --git a/PulseLoop/Coach/OpenRouter/OpenRouterClient.swift b/PulseLoop/Coach/OpenRouter/OpenRouterClient.swift index f110c4d..28c301a 100644 --- a/PulseLoop/Coach/OpenRouter/OpenRouterClient.swift +++ b/PulseLoop/Coach/OpenRouter/OpenRouterClient.swift @@ -108,9 +108,8 @@ final class OpenRouterClient: ResponsesClient, @unchecked Sendable { messages = [] storedAssistantMessage = [:] for item in input { - guard let role = item["role"] as? String, - let content = item["content"] as? String else { continue } - messages.append(["role": chatRole(role), "content": content]) + guard let role = item["role"] as? String, item["content"] != nil else { continue } + messages.append(["role": chatRole(role), "content": chatContent(from: item)]) } // Unlike the native OpenAI/Gemini clients, OpenRouter sends no enforced // `response_format` (several catalog models reject this app's schema), so @@ -132,9 +131,8 @@ final class OpenRouterClient: ResponsesClient, @unchecked Sendable { let callId = item["call_id"] as? String, let output = item["output"] as? String { messages.append(["role": "tool", "tool_call_id": callId, "content": output]) - } else if let role = item["role"] as? String, - let content = item["content"] as? String { - messages.append(["role": chatRole(role), "content": content]) + } else if let role = item["role"] as? 
String, item["content"] != nil { + messages.append(["role": chatRole(role), "content": chatContent(from: item)]) } } } @@ -143,6 +141,30 @@ final class OpenRouterClient: ResponsesClient, @unchecked Sendable { responsesRole == "developer" ? "system" : responsesRole } + /// Converts a Responses-API message item's `content` into Chat Completions + /// `content`. Text items keep `content` a plain String (unchanged path, so the + /// cache-control rewrite still applies). Image items carry the OpenAI + /// content-part array (`input_text` + `input_image`), which we map to Chat + /// Completions parts (`{type:text}` + `{type:image_url, image_url:{url}}`). + private func chatContent(from item: [String: Any]) -> Any { + if let text = item["content"] as? String { return text } + guard let parts = item["content"] as? [[String: Any]] else { return "" } + var out: [[String: Any]] = [] + for part in parts { + switch part["type"] as? String { + case "input_text", "text": + if let text = part["text"] as? String { out.append(["type": "text", "text": text]) } + case "input_image": + if let url = part["image_url"] as? String { + out.append(["type": "image_url", "image_url": ["url": url]]) + } + default: + break + } + } + return out + } + // MARK: - Tool conversion (Responses flat → Chat Completions nested) /// Converts the app's flat Responses function specs diff --git a/PulseLoop/Coach/Orchestration/CoachOrchestrator.swift b/PulseLoop/Coach/Orchestration/CoachOrchestrator.swift index f974e92..8b4767c 100644 --- a/PulseLoop/Coach/Orchestration/CoachOrchestrator.swift +++ b/PulseLoop/Coach/Orchestration/CoachOrchestrator.swift @@ -23,19 +23,24 @@ struct CoachOrchestrator { var error: CoachTurnError? 
= nil } - struct PriorMessage { let role: String; let text: String } + struct PriorMessage { let role: String; let text: String; var images: [CoachImagePayload] = [] } + + /// Substituted as the user prompt when an image is sent with no text, so the + /// schema/tool loop still has a non-empty user turn to anchor on. + private static let imageOnlyPrompt = "Please look at the attached image." func runTurn( userText: String, packet: CoachContextPacket, recentMessages: [PriorMessage], + userImages: [CoachImagePayload] = [], onTrace: @escaping (CoachTraceEvent) -> Void = { _ in } ) async -> TurnResult { guard flags.coachEnabled else { return TurnResult(assistant: CoachFallbacks.scripted(packet: packet), trace: []) } do { - return try await runOpenAI(userText: userText, packet: packet, recentMessages: recentMessages, onTrace: onTrace) + return try await runOpenAI(userText: userText, packet: packet, recentMessages: recentMessages, userImages: userImages, onTrace: onTrace) } catch { onTrace(CoachTraceEvent(label: "Something went wrong", status: .failedTool)) return TurnResult(assistant: CoachFallbacks.fallback(), trace: [], error: CoachTurnError(error)) @@ -46,20 +51,27 @@ struct CoachOrchestrator { userText: String, packet: CoachContextPacket, recentMessages: [PriorMessage], + userImages: [CoachImagePayload], onTrace: @escaping (CoachTraceEvent) -> Void ) async throws -> TurnResult { let toolSpecs = registry.toolSpecs let textFormat = CoachResponseSchema.textFormat // Initial input: system + developer + recent turns + the new user message. + // Images only ever ride on user turns (system/developer/assistant stay text). var input: [[String: Any]] = [ OpenAIRequestBuilder.message(role: "system", content: CoachPromptBuilder.systemPrompt), OpenAIRequestBuilder.message(role: "developer", content: CoachPromptBuilder.developerMessage(packet: packet)), ] for m in recentMessages { - input.append(OpenAIRequestBuilder.message(role: m.role == "user" ? 
"user" : "assistant", content: m.text)) + let isUser = m.role == "user" + input.append(OpenAIRequestBuilder.message( + role: isUser ? "user" : "assistant", + content: m.text, + images: isUser ? m.images : [])) } - input.append(OpenAIRequestBuilder.message(role: "user", content: userText)) + let userContent = userText.isEmpty && !userImages.isEmpty ? Self.imageOnlyPrompt : userText + input.append(OpenAIRequestBuilder.message(role: "user", content: userContent, images: userImages)) onTrace(CoachTraceEvent(label: "Thinking about your question…", status: .thinking)) diff --git a/PulseLoop/Coach/ViewModels/CoachViewModel.swift b/PulseLoop/Coach/ViewModels/CoachViewModel.swift index 63d57bf..7f4b77b 100644 --- a/PulseLoop/Coach/ViewModels/CoachViewModel.swift +++ b/PulseLoop/Coach/ViewModels/CoachViewModel.swift @@ -36,17 +36,21 @@ final class CoachViewModel { _ text: String, conversationId: UUID, context: ModelContext, + attachments: [CoachAttachmentRef] = [], coordinator: RingSyncCoordinator? = nil ) async { let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) - guard !trimmed.isEmpty, !isSending else { return } + // Allow image-only sends: require either text or an attachment. + guard !(trimmed.isEmpty && attachments.isEmpty), !isSending else { return } isSending = true traceEvents = [] errorBanner = nil defer { isSending = false } // Optimistically persist the user message so the UI shows it immediately. - let userMessage = CoachMessage(conversationId: conversationId, role: "user", body: trimmed) + let userMessage = CoachMessage( + conversationId: conversationId, role: "user", body: trimmed, + attachmentsJSON: CoachAttachmentRef.encode(attachments)) context.insert(userMessage) try? 
context.save() @@ -54,6 +58,7 @@ final class CoachViewModel { let flags = CoachFeatureFlags(settings: settingsStore.settings, hasAPIKey: apiKey != nil) let packet = CoachContextBuilder.build(context: context) let recent = recentMessages(conversationId: conversationId, excluding: userMessage.id, context: context) + let userImages = CoachAttachmentStore.payloads(for: attachments) let orchestrator = CoachOrchestrator( client: activeClient, @@ -65,7 +70,8 @@ final class CoachViewModel { let result = await orchestrator.runTurn( userText: trimmed, packet: packet, - recentMessages: recent + recentMessages: recent, + userImages: userImages ) { [weak self] event in self?.traceEvents.append(event) } @@ -173,10 +179,18 @@ final class CoachViewModel { ) descriptor.fetchLimit = 40 let rows = (try? context.fetch(descriptor)) ?? [] - return rows + let recent = rows .filter { $0.id != excludedId && $0.role != "error" } // never replay error bubbles to the model .suffix(limit) - .map { CoachOrchestrator.PriorMessage(role: $0.role, text: $0.body) } + // Replay images only on the most recent prior user turn that has them, to + // keep context coherent without ballooning the payload with old base64. + let lastImageRowId = recent.last { CoachAttachmentRef.decode(fromJSON: $0.attachmentsJSON).isEmpty == false }?.id + return recent.map { row in + let images = row.id == lastImageRowId + ? CoachAttachmentStore.payloads(for: CoachAttachmentRef.decode(fromJSON: row.attachmentsJSON)) + : [] + return CoachOrchestrator.PriorMessage(role: row.role, text: row.body, images: images) + } } private func fetchConversation(_ id: UUID, context: ModelContext) -> CoachConversation? { diff --git a/PulseLoop/Info.plist b/PulseLoop/Info.plist index 6c54f59..ef95f5a 100644 --- a/PulseLoop/Info.plist +++ b/PulseLoop/Info.plist @@ -2,6 +2,8 @@ + NSCameraUsageDescription + Attach a photo to ask the AI Coach about it. 
UIBackgroundModes location diff --git a/PulseLoop/Models/PulseModels.swift b/PulseLoop/Models/PulseModels.swift index 520aebf..84c94b2 100644 --- a/PulseLoop/Models/PulseModels.swift +++ b/PulseLoop/Models/PulseModels.swift @@ -694,15 +694,20 @@ final class CoachMessage { var cardsJSON: String? /// Encoded `PendingAction` awaiting a Confirm/Cancel tap (Milestone B). var pendingActionJSON: String? = nil + /// Encoded `[CoachAttachmentRef]` for images attached to this message. The + /// bytes live in `Documents/coach_attachments/`; this holds only the refs. + /// Optional with a default keeps the SwiftData migration lightweight. + var attachmentsJSON: String? = nil var createdAt: Date - init(id: UUID = UUID(), conversationId: UUID, role: String, body: String, cardsJSON: String? = nil, pendingActionJSON: String? = nil, createdAt: Date = Date()) { + init(id: UUID = UUID(), conversationId: UUID, role: String, body: String, cardsJSON: String? = nil, pendingActionJSON: String? = nil, attachmentsJSON: String? = nil, createdAt: Date = Date()) { self.id = id self.conversationId = conversationId self.role = role self.body = body self.cardsJSON = cardsJSON self.pendingActionJSON = pendingActionJSON + self.attachmentsJSON = attachmentsJSON self.createdAt = createdAt } } diff --git a/PulseLoop/Views/CameraPicker.swift b/PulseLoop/Views/CameraPicker.swift new file mode 100644 index 0000000..3fc046a --- /dev/null +++ b/PulseLoop/Views/CameraPicker.swift @@ -0,0 +1,48 @@ +import SwiftUI +import UIKit + +/// Thin SwiftUI wrapper over `UIImagePickerController` for the device camera. +/// SwiftUI's `PhotosPicker` covers the photo library, but it can't capture from +/// the camera, so this handles that one case. Returns the picked `UIImage` via +/// `onPick` and dismisses itself. 
+struct CameraPicker: UIViewControllerRepresentable { + var onPick: (UIImage) -> Void + @Environment(\.dismiss) private var dismiss + + func makeUIViewController(context: Context) -> UIImagePickerController { + let picker = UIImagePickerController() + picker.sourceType = .camera + picker.delegate = context.coordinator + return picker + } + + func updateUIViewController(_ uiViewController: UIImagePickerController, context: Context) {} + + func makeCoordinator() -> Coordinator { Coordinator(self) } + + final class Coordinator: NSObject, UIImagePickerControllerDelegate, UINavigationControllerDelegate { + private let parent: CameraPicker + init(_ parent: CameraPicker) { self.parent = parent } + + func imagePickerController( + _ picker: UIImagePickerController, + didFinishPickingMediaWithInfo info: [UIImagePickerController.InfoKey: Any] + ) { + if let image = info[.originalImage] as? UIImage { + parent.onPick(image) + } + parent.dismiss() + } + + func imagePickerControllerDidCancel(_ picker: UIImagePickerController) { + parent.dismiss() + } + } +} + +extension UIImagePickerController { + /// Whether a hardware camera is available (false on most simulators). + static var cameraAvailable: Bool { + UIImagePickerController.isSourceTypeAvailable(.camera) + } +} diff --git a/PulseLoop/Views/CoachView.swift b/PulseLoop/Views/CoachView.swift index 6af4f16..1c3570a 100644 --- a/PulseLoop/Views/CoachView.swift +++ b/PulseLoop/Views/CoachView.swift @@ -1,6 +1,7 @@ import SwiftUI import SwiftData import UIKit +import PhotosUI private let coldStartPrompts = [ "How am I doing today?", @@ -21,8 +22,19 @@ struct CoachView: View { @State private var showHistory = false @State private var keyboardHeight: CGFloat = 0 @State private var nav = CoachNavigation.shared + @State private var settingsStore = CoachSettingsStore.shared @FocusState private var composerFocused: Bool + // Image attachment (multimodal input). One staged image per message. + @State private var stagedImage: UIImage? 
+ @State private var stagedAttachment: CoachAttachmentRef? + @State private var showImageSourceDialog = false + @State private var showPhotosPicker = false + @State private var showCamera = false + @State private var photosPickerItem: PhotosPickerItem? + + private var imageInputEnabled: Bool { settingsStore.settings.enableImageInput } + /// Bottom inset for the composer: clears the overlaid nav bar (~60) when the /// keyboard is hidden, and sits just above the keyboard when shown. Computed /// manually because the tab layout pins the keyboard safe area (see RootViews). @@ -156,49 +168,129 @@ struct CoachView: View { } private var composer: some View { - HStack(spacing: 8) { - Image(systemName: "plus") - .font(.system(size: 18)).foregroundStyle(PulseColors.textMuted) - .frame(width: 36, height: 36).background(PulseColors.card, in: Circle()).opacity(0.6) - TextField("Ask the coach...", text: $draft) - .focused($composerFocused) - .textFieldStyle(.plain) - .font(.system(size: 14)) - .padding(.horizontal, 16).padding(.vertical, 10) - .background(PulseColors.card, in: Capsule()) - .overlay(Capsule().stroke(PulseColors.borderSubtle, lineWidth: 1)) - .onSubmit { send(draft) } - Button { send(draft) } label: { - Image(systemName: "arrow.up") - .font(.system(size: 16, weight: .semibold)) - .foregroundStyle(canSend ? .white : PulseColors.textMuted) - .frame(width: 36, height: 36) - .background(canSend ? 
PulseColors.accent : PulseColors.card, in: Circle()) + VStack(spacing: 8) { + if let stagedImage { stagedThumbnail(stagedImage) } + + HStack(spacing: 8) { + if imageInputEnabled { + Button { composerFocused = false; showImageSourceDialog = true } label: { + Image(systemName: "photo.on.rectangle") + .font(.system(size: 17)).foregroundStyle(PulseColors.textSecondary) + .frame(width: 36, height: 36).background(PulseColors.card, in: Circle()) + .overlay(Circle().stroke(PulseColors.borderSubtle, lineWidth: 1)) + } + .buttonStyle(.plain) + } else { + Image(systemName: "plus") + .font(.system(size: 18)).foregroundStyle(PulseColors.textMuted) + .frame(width: 36, height: 36).background(PulseColors.card, in: Circle()).opacity(0.6) + } + TextField("Ask the coach...", text: $draft) + .focused($composerFocused) + .textFieldStyle(.plain) + .font(.system(size: 14)) + .padding(.horizontal, 16).padding(.vertical, 10) + .background(PulseColors.card, in: Capsule()) + .overlay(Capsule().stroke(PulseColors.borderSubtle, lineWidth: 1)) + .onSubmit { send(draft) } + Button { send(draft) } label: { + Image(systemName: "arrow.up") + .font(.system(size: 16, weight: .semibold)) + .foregroundStyle(canSend ? .white : PulseColors.textMuted) + .frame(width: 36, height: 36) + .background(canSend ? 
PulseColors.accent : PulseColors.card, in: Circle()) + } + .buttonStyle(.plain) + .disabled(!canSend) } - .buttonStyle(.plain) - .disabled(!canSend) } .padding(.horizontal, 12).padding(.vertical, 10) + .confirmationDialog("Add image", isPresented: $showImageSourceDialog, titleVisibility: .visible) { + Button("Photo Library") { showPhotosPicker = true } + if UIImagePickerController.cameraAvailable { + Button("Camera") { showCamera = true } + } + Button("Cancel", role: .cancel) {} + } + .photosPicker(isPresented: $showPhotosPicker, selection: $photosPickerItem, matching: .images) + .onChange(of: photosPickerItem) { _, item in + guard let item else { return } + Task { await loadPickedPhoto(item) } + } + .fullScreenCover(isPresented: $showCamera) { + CameraPicker { image in stage(image) } + .ignoresSafeArea() + } + } + + /// Small preview chip for the staged image, with a remove button. + private func stagedThumbnail(_ image: UIImage) -> some View { + HStack { + ZStack(alignment: .topTrailing) { + Image(uiImage: image) + .resizable().scaledToFill() + .frame(width: 64, height: 64) + .clipShape(RoundedRectangle(cornerRadius: 14, style: .continuous)) + .overlay(RoundedRectangle(cornerRadius: 14, style: .continuous).stroke(PulseColors.borderSubtle, lineWidth: 1)) + Button { clearStagedImage() } label: { + Image(systemName: "xmark.circle.fill") + .font(.system(size: 18)) + .foregroundStyle(.white, Color.black.opacity(0.55)) + } + .buttonStyle(.plain) + .offset(x: 6, y: -6) + } + Spacer(minLength: 0) + } } private var canSend: Bool { - !draft.trimmingCharacters(in: .whitespaces).isEmpty && !viewModel.isSending + guard !viewModel.isSending else { return false } + return !draft.trimmingCharacters(in: .whitespaces).isEmpty || stagedAttachment != nil + } + + /// Compresses + persists the picked image and stages it for the next send. 
+ private func stage(_ image: UIImage) { + guard let ref = CoachAttachmentStore.save(image) else { return } + stagedImage = image + stagedAttachment = ref + } + + private func loadPickedPhoto(_ item: PhotosPickerItem) async { + if let data = try? await item.loadTransferable(type: Data.self), + let image = UIImage(data: data) { + stage(image) + } + photosPickerItem = nil + } + + /// Removes the staged image and deletes its on-disk file (it was never sent). + private func clearStagedImage() { + if let ref = stagedAttachment { CoachAttachmentStore.delete(ref) } + stagedImage = nil + stagedAttachment = nil } private func send(_ text: String) { let trimmed = text.trimmingCharacters(in: .whitespacesAndNewlines) - guard !trimmed.isEmpty, !viewModel.isSending else { return } + let attachment = stagedAttachment + // Allow image-only sends: require either text or a staged image. + guard !(trimmed.isEmpty && attachment == nil), !viewModel.isSending else { return } let conversationId = resolveConversationId() // Title a fresh conversation from its opening message. if let convo = conversations.first(where: { $0.id == conversationId }), isDefaultTitle(convo.title), !allMessages.contains(where: { $0.conversationId == conversationId }) { - convo.title = String(trimmed.prefix(40)) + let seed = trimmed.isEmpty ? "Photo" : String(trimmed.prefix(40)) + convo.title = seed try? modelContext.save() } draft = "" + stagedImage = nil + stagedAttachment = nil composerFocused = false - Task { await viewModel.send(trimmed, conversationId: conversationId, context: modelContext, coordinator: coordinator) } + let attachments = attachment.map { [$0] } ?? [] + Task { await viewModel.send(trimmed, conversationId: conversationId, context: modelContext, attachments: attachments, coordinator: coordinator) } } /// The active conversation, creating one on first use. @@ -341,10 +433,15 @@ struct CoachBubble: View { message.role == "error" ? 
CoachTurnError.decode(fromJSON: message.cardsJSON) : nil } + private var attachments: [CoachAttachmentRef] { + CoachAttachmentRef.decode(fromJSON: message.attachmentsJSON) + } + var body: some View { HStack { if message.role == "user" { Spacer(minLength: 40) } - VStack(alignment: .leading, spacing: 8) { + VStack(alignment: message.role == "user" ? .trailing : .leading, spacing: 8) { + ForEach(attachments, id: \.file) { ref in attachmentImage(ref) } content if let pendingAction { CoachActionCardView( @@ -367,6 +464,9 @@ struct CoachBubble: View { .background(PulseColors.card) .clipShape(RoundedRectangle(cornerRadius: 18, style: .continuous)) .overlay(RoundedRectangle(cornerRadius: 18, style: .continuous).stroke(PulseColors.borderSubtle, lineWidth: 1)) + } else if message.role == "user" && message.body.isEmpty && !attachments.isEmpty { + // Image-only message: the image is the bubble, no empty text bubble below. + EmptyView() } else { (message.role == "user" ? Text(message.body) : Text(coachMarkdown: message.body)) .font(.system(size: 14)) @@ -380,6 +480,24 @@ struct CoachBubble: View { ) } } + + /// Renders an attached image (loaded from `CoachAttachmentStore`) as part of + /// the message bubble. Falls back to a placeholder if the file is missing. 
+ @ViewBuilder private func attachmentImage(_ ref: CoachAttachmentRef) -> some View { + if let image = CoachAttachmentStore.loadImage(ref) { + Image(uiImage: image) + .resizable().scaledToFill() + .frame(maxWidth: 240, maxHeight: 220) + .clipShape(RoundedRectangle(cornerRadius: 18, style: .continuous)) + .overlay(RoundedRectangle(cornerRadius: 18, style: .continuous).stroke(PulseColors.borderSubtle, lineWidth: 1)) + } else { + RoundedRectangle(cornerRadius: 18, style: .continuous) + .fill(PulseColors.card) + .frame(width: 120, height: 90) + .overlay(Image(systemName: "photo").foregroundStyle(PulseColors.textMuted)) + .overlay(RoundedRectangle(cornerRadius: 18, style: .continuous).stroke(PulseColors.borderSubtle, lineWidth: 1)) + } + } } /// Red-bordered error bubble shown when a coach turn fails. Displays the error diff --git a/PulseLoopTests/CoachMultimodalTests.swift b/PulseLoopTests/CoachMultimodalTests.swift new file mode 100644 index 0000000..f2437b8 --- /dev/null +++ b/PulseLoopTests/CoachMultimodalTests.swift @@ -0,0 +1,142 @@ +import XCTest +@testable import PulseLoop + +/// Verifies multimodal (image) input across the canonical request builder and the +/// Gemini / OpenRouter adapters. Two invariants matter: +/// 1. Image-bearing user turns serialize to each provider's correct image shape. +/// 2. Text-only turns serialize to the *old* `"content": ""` form — proof +/// that adding images didn't change the existing (text + tool-call) path. +/// +/// Reuses `StubURLProtocol` from CoachTests to capture the translated request body. 
+final class CoachMultimodalTests: XCTestCase { + + private let img = CoachImagePayload( + dataURL: "data:image/jpeg;base64,QUJD", // base64 of "ABC" + rawBase64: "QUJD", + mimeType: "image/jpeg" + ) + + private func session() -> URLSession { + let config = URLSessionConfiguration.ephemeral + config.protocolClasses = [StubURLProtocol.self] + return URLSession(configuration: config) + } + + // MARK: - Canonical builder (OpenAI Responses shape) + + func testBuilder_textOnly_keepsStringContent() { + let msg = OpenAIRequestBuilder.message(role: "user", content: "hello") + XCTAssertEqual(msg["content"] as? String, "hello", + "Text-only content must remain a plain String (unchanged path).") + } + + func testBuilder_withImage_emitsInputImagePart() throws { + let msg = OpenAIRequestBuilder.message(role: "user", content: "what is this?", images: [img]) + let parts = try XCTUnwrap(msg["content"] as? [[String: Any]]) + XCTAssertEqual(parts.count, 2) + XCTAssertEqual(parts[0]["type"] as? String, "input_text") + XCTAssertEqual(parts[0]["text"] as? String, "what is this?") + XCTAssertEqual(parts[1]["type"] as? String, "input_image") + XCTAssertEqual(parts[1]["image_url"] as? 
String, "data:image/jpeg;base64,QUJD") + } + + // MARK: - Attachment ref round-trip + + func testAttachmentRef_encodeDecodeRoundTrip() { + let refs = [CoachAttachmentRef(file: "a.jpg", width: 100, height: 80)] + let json = CoachAttachmentRef.encode(refs) + XCTAssertNotNil(json) + XCTAssertEqual(CoachAttachmentRef.decode(fromJSON: json), refs) + XCTAssertNil(CoachAttachmentRef.encode([]), "Empty refs encode to nil (no field written).") + XCTAssertEqual(CoachAttachmentRef.decode(fromJSON: nil), []) + } + + // MARK: - Gemini adapter translation + + func testGemini_imageItem_emitsInlineDataPart() async throws { + StubURLProtocol.statusCode = 200 + StubURLProtocol.responseBody = Data(#"{"candidates":[{"content":{"parts":[{"text":"ok"}]}}]}"#.utf8) + + let client = GeminiClient(apiKey: "AIza-test", session: session()) + let input = [OpenAIRequestBuilder.message(role: "user", content: "describe", images: [img])] + let body = try OpenAIRequestBuilder.data( + model: "gemini-2.5-flash", input: input, tools: [], textFormat: nil, + previousResponseId: nil, reasoningEffort: nil) + _ = try await client.send(requestBody: body) + + let json = try captured() + let contents = try XCTUnwrap(json["contents"] as? [[String: Any]]) + let parts = try XCTUnwrap(contents.first?["parts"] as? [[String: Any]]) + XCTAssertEqual(parts[0]["text"] as? String, "describe") + let inline = try XCTUnwrap(parts[1]["inlineData"] as? [String: Any]) + XCTAssertEqual(inline["mimeType"] as? String, "image/jpeg") + XCTAssertEqual(inline["data"] as? 
String, "QUJD", "Gemini gets bare base64, no data: prefix.") + } + + func testGemini_textOnly_emitsPlainTextPart() async throws { + StubURLProtocol.statusCode = 200 + StubURLProtocol.responseBody = Data(#"{"candidates":[{"content":{"parts":[{"text":"ok"}]}}]}"#.utf8) + + let client = GeminiClient(apiKey: "AIza-test", session: session()) + let input = [OpenAIRequestBuilder.message(role: "user", content: "hi")] + let body = try OpenAIRequestBuilder.data( + model: "gemini-2.5-flash", input: input, tools: [], textFormat: nil, + previousResponseId: nil, reasoningEffort: nil) + _ = try await client.send(requestBody: body) + + let json = try captured() + let contents = try XCTUnwrap(json["contents"] as? [[String: Any]]) + let parts = try XCTUnwrap(contents.first?["parts"] as? [[String: Any]]) + XCTAssertEqual(parts.count, 1) + XCTAssertEqual(parts[0]["text"] as? String, "hi") + XCTAssertNil(parts[0]["inlineData"]) + } + + // MARK: - OpenRouter adapter translation + + func testOpenRouter_imageItem_emitsImageURLPart() async throws { + StubURLProtocol.statusCode = 200 + StubURLProtocol.responseBody = Data(#"{"choices":[{"message":{"content":"ok"}}]}"#.utf8) + + let client = OpenRouterClient(apiKey: "sk-or-test", session: session()) + let input = [OpenAIRequestBuilder.message(role: "user", content: "look", images: [img])] + let body = try OpenAIRequestBuilder.data( + model: "anthropic/claude-sonnet-4.6", input: input, tools: [], textFormat: nil, + previousResponseId: nil, reasoningEffort: nil) + _ = try await client.send(requestBody: body) + + let json = try captured() + let messages = try XCTUnwrap(json["messages"] as? [[String: Any]]) + let userMsg = try XCTUnwrap(messages.first { ($0["role"] as? String) == "user" }) + let parts = try XCTUnwrap(userMsg["content"] as? [[String: Any]]) + XCTAssertEqual(parts[0]["type"] as? String, "text") + XCTAssertEqual(parts[1]["type"] as? String, "image_url") + let imageURL = try XCTUnwrap(parts[1]["image_url"] as? 
[String: Any]) + XCTAssertEqual(imageURL["url"] as? String, "data:image/jpeg;base64,QUJD") + } + + func testOpenRouter_textOnly_keepsStringContent() async throws { + StubURLProtocol.statusCode = 200 + StubURLProtocol.responseBody = Data(#"{"choices":[{"message":{"content":"ok"}}]}"#.utf8) + + let client = OpenRouterClient(apiKey: "sk-or-test", session: session()) + let input = [OpenAIRequestBuilder.message(role: "user", content: "hi")] + let body = try OpenAIRequestBuilder.data( + model: "anthropic/claude-sonnet-4.6", input: input, tools: [], textFormat: nil, + previousResponseId: nil, reasoningEffort: nil) + _ = try await client.send(requestBody: body) + + let json = try captured() + let messages = try XCTUnwrap(json["messages"] as? [[String: Any]]) + let userMsg = try XCTUnwrap(messages.first { ($0["role"] as? String) == "user" }) + XCTAssertEqual(userMsg["content"] as? String, "hi", + "Text-only user content must stay a String so cache-control still applies.") + } + + // MARK: - Helper + + private func captured() throws -> [String: Any] { + let data = try XCTUnwrap(StubURLProtocol.lastRequestBody) + return try XCTUnwrap(try JSONSerialization.jsonObject(with: data) as? [String: Any]) + } +} From b477e6562e072b4a18925e3f13f2a5085e7ca4e0 Mon Sep 17 00:00:00 2001 From: Saksham Bhutani Date: Mon, 29 Jun 2026 10:38:40 -0400 Subject: [PATCH 2/2] fixed input bar --- PulseLoop/Views/CoachView.swift | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/PulseLoop/Views/CoachView.swift b/PulseLoop/Views/CoachView.swift index 1c3570a..033e294 100644 --- a/PulseLoop/Views/CoachView.swift +++ b/PulseLoop/Views/CoachView.swift @@ -28,7 +28,6 @@ struct CoachView: View { // Image attachment (multimodal input). One staged image per message. @State private var stagedImage: UIImage? @State private var stagedAttachment: CoachAttachmentRef? 
- @State private var showImageSourceDialog = false @State private var showPhotosPicker = false @State private var showCamera = false @State private var photosPickerItem: PhotosPickerItem? @@ -172,18 +171,21 @@ struct CoachView: View { if let stagedImage { stagedThumbnail(stagedImage) } HStack(spacing: 8) { + // Camera button — only shown when image input is enabled in + // Settings. Nothing is shown when off. Opens the camera directly; + // falls back to the photo library where no camera exists (simulator). if imageInputEnabled { - Button { composerFocused = false; showImageSourceDialog = true } label: { - Image(systemName: "photo.on.rectangle") + Button { + composerFocused = false + if UIImagePickerController.cameraAvailable { showCamera = true } + else { showPhotosPicker = true } + } label: { + Image(systemName: "camera") .font(.system(size: 17)).foregroundStyle(PulseColors.textSecondary) .frame(width: 36, height: 36).background(PulseColors.card, in: Circle()) .overlay(Circle().stroke(PulseColors.borderSubtle, lineWidth: 1)) } .buttonStyle(.plain) - } else { - Image(systemName: "plus") - .font(.system(size: 18)).foregroundStyle(PulseColors.textMuted) - .frame(width: 36, height: 36).background(PulseColors.card, in: Circle()).opacity(0.6) } TextField("Ask the coach...", text: $draft) .focused($composerFocused) @@ -205,13 +207,6 @@ struct CoachView: View { } } .padding(.horizontal, 12).padding(.vertical, 10) - .confirmationDialog("Add image", isPresented: $showImageSourceDialog, titleVisibility: .visible) { - Button("Photo Library") { showPhotosPicker = true } - if UIImagePickerController.cameraAvailable { - Button("Camera") { showCamera = true } - } - Button("Cancel", role: .cancel) {} - } .photosPicker(isPresented: $showPhotosPicker, selection: $photosPickerItem, matching: .images) .onChange(of: photosPickerItem) { _, item in guard let item else { return }