diff --git a/Packages/RelayInterface/Sources/RelayInterface/Models/RoomSummary.swift b/Packages/RelayInterface/Sources/RelayInterface/Models/RoomSummary.swift index 7065ab4..c5f3aa5 100644 --- a/Packages/RelayInterface/Sources/RelayInterface/Models/RoomSummary.swift +++ b/Packages/RelayInterface/Sources/RelayInterface/Models/RoomSummary.swift @@ -152,6 +152,12 @@ public final class RoomSummary: Identifiable { /// space filter bar. public var parentSpaceIds: Set + /// Whether this room currently has an ongoing MatrixRTC call. + /// + /// Mirrors the SDK's `RoomInfo.hasRoomCall`, which is true whenever any + /// non-expired `m.call.member` state event exists in the room. + public var hasRoomCall: Bool + /// Creates a new ``RoomSummary`` instance. /// /// - Parameters: @@ -192,7 +198,8 @@ public final class RoomSummary: Identifiable { inviterName: String? = nil, inviterAvatarURL: String? = nil, isSpace: Bool = false, - parentSpaceIds: Set = [] + parentSpaceIds: Set = [], + hasRoomCall: Bool = false ) { self.id = id self.name = name @@ -213,5 +220,6 @@ public final class RoomSummary: Identifiable { self.inviterAvatarURL = inviterAvatarURL self.isSpace = isSpace self.parentSpaceIds = parentSpaceIds + self.hasRoomCall = hasRoomCall } } diff --git a/Packages/RelayInterface/Sources/RelayInterface/Protocols/CallViewModelProtocol.swift b/Packages/RelayInterface/Sources/RelayInterface/Protocols/CallViewModelProtocol.swift index 20c72b3..0b018e3 100644 --- a/Packages/RelayInterface/Sources/RelayInterface/Protocols/CallViewModelProtocol.swift +++ b/Packages/RelayInterface/Sources/RelayInterface/Protocols/CallViewModelProtocol.swift @@ -83,6 +83,12 @@ public protocol CallViewModelProtocol: AnyObject, Observable { /// The identity of the local participant, set after connection. var localParticipantID: String? { get } + /// Human-readable description of the current connection step. Only + /// non-nil while `state == .connecting`. 
The UI is expected to hide + /// transient phases (steps shorter than ~300ms) so the indicator + /// only surfaces during genuinely slow joins on poor networks. + var connectingPhase: String? { get } + /// A monotonically increasing counter that is bumped whenever video tracks change /// (publish, unpublish, camera toggle, etc.). SwiftUI views should read this value /// to ensure ``NSViewRepresentable`` bridges receive `updateNSView` calls when the diff --git a/Relay/Info.plist b/Relay/Info.plist index aeadf07..cd860fa 100644 --- a/Relay/Info.plist +++ b/Relay/Info.plist @@ -4,6 +4,10 @@ ITSAppUsesNonExemptEncryption + CFBundleHelpBookFolder + Relay.help + CFBundleHelpBookName + app.subpop.Relay.help UTExportedTypeDeclarations diff --git a/Relay/Resources/Relay.help/Contents/Info.plist b/Relay/Resources/Relay.help/Contents/Info.plist new file mode 100644 index 0000000..e0e911c --- /dev/null +++ b/Relay/Resources/Relay.help/Contents/Info.plist @@ -0,0 +1,30 @@ + + + + + CFBundleDevelopmentRegion + en + CFBundleIdentifier + app.subpop.Relay.help + CFBundleInfoDictionaryVersion + 6.0 + CFBundleName + Relay Help + CFBundlePackageType + BNDL + CFBundleShortVersionString + 1.0 + CFBundleSignature + hbwr + HPDBookAccessPath + Relay.html + HPDBookIconPath + Shared/icon.png + HPDBookIndexPath + Relay.helpindex + HPDBookTitle + Relay Help + HPDBookType + 3 + + diff --git a/Relay/Resources/Relay.help/Contents/Resources/Shared/relay-help.css b/Relay/Resources/Relay.help/Contents/Resources/Shared/relay-help.css new file mode 100644 index 0000000..44de1b6 --- /dev/null +++ b/Relay/Resources/Relay.help/Contents/Resources/Shared/relay-help.css @@ -0,0 +1,160 @@ +/* Apple Help Book styling for Relay. Apple Help Viewer renders + the page inside a fixed-width WebKit pane (~520pt); these styles + match the default macOS help look-and-feel. 
*/ + +:root { + color-scheme: light dark; + --fg: #1d1d1f; + --fg-muted: #6e6e73; + --bg: #ffffff; + --rule: #d2d2d7; + --accent: #0066cc; + --code-bg: #f5f5f7; +} + +@media (prefers-color-scheme: dark) { + :root { + --fg: #f5f5f7; + --fg-muted: #a1a1a6; + --bg: #1d1d1f; + --rule: #3a3a3c; + --accent: #2997ff; + --code-bg: #2c2c2e; + } +} + +html, body { + margin: 0; + padding: 0; + background: var(--bg); + color: var(--fg); + font: 13px -apple-system, BlinkMacSystemFont, "Helvetica Neue", Helvetica, sans-serif; + line-height: 1.5; +} + +body { + padding: 24px 32px 48px 32px; + max-width: 640px; +} + +h1, h2, h3 { + color: var(--fg); + font-weight: 600; + line-height: 1.25; + margin-top: 1.6em; +} + +h1 { + font-size: 22px; + margin-top: 0; +} + +h2 { + font-size: 16px; + border-bottom: 1px solid var(--rule); + padding-bottom: 4px; +} + +h3 { + font-size: 14px; +} + +p, ul, ol, table { + margin: 0.6em 0; +} + +ul, ol { + padding-left: 24px; +} + +li { + margin: 0.2em 0; +} + +code, kbd { + font-family: ui-monospace, Menlo, monospace; + font-size: 0.95em; + background: var(--code-bg); + padding: 1px 5px; + border-radius: 4px; +} + +pre { + background: var(--code-bg); + border: 1px solid var(--rule); + border-radius: 6px; + padding: 10px 12px; + overflow-x: auto; + font-family: ui-monospace, Menlo, monospace; + font-size: 0.95em; + line-height: 1.45; +} + +pre code { + background: none; + padding: 0; +} + +a { + color: var(--accent); + text-decoration: none; +} + +a:hover { + text-decoration: underline; +} + +table { + border-collapse: collapse; + width: 100%; + margin: 0.8em 0; +} + +th, td { + border: 1px solid var(--rule); + padding: 6px 10px; + text-align: left; + vertical-align: top; +} + +th { + background: var(--code-bg); + font-weight: 600; +} + +.callout { + background: var(--code-bg); + border-left: 3px solid var(--accent); + padding: 8px 12px; + margin: 1em 0; + border-radius: 4px; +} + +.muted { + color: var(--fg-muted); +} + +nav.topic-list { + 
list-style: none; + padding: 0; +} + +nav.topic-list li { + border-bottom: 1px solid var(--rule); + padding: 10px 0; +} + +nav.topic-list li:last-child { + border-bottom: none; +} + +nav.topic-list a { + display: block; + font-weight: 500; +} + +nav.topic-list .subtitle { + color: var(--fg-muted); + font-weight: 400; + margin-top: 2px; +} diff --git a/Relay/Resources/Relay.help/Contents/Resources/en.lproj/Relay.html b/Relay/Resources/Relay.help/Contents/Resources/en.lproj/Relay.html new file mode 100644 index 0000000..c23803a --- /dev/null +++ b/Relay/Resources/Relay.help/Contents/Resources/en.lproj/Relay.html @@ -0,0 +1,31 @@ + + + + + + + + + + + Relay Help + + +

Relay Help

+ +

Relay is a native macOS client for the Matrix chat network. This help book collects troubleshooting guides and notes that go beyond what fits inline in the app.

+ +

Topics

+ + +

Getting more help

+

If a topic in this help book doesn't cover your problem, file an issue at github.com/subpop/Relay/issues or join the conversation in #relayapp:matrix.org.

+ + diff --git a/Relay/Resources/Relay.help/Contents/Resources/en.lproj/pages/troubleshooting-calls.html b/Relay/Resources/Relay.help/Contents/Resources/en.lproj/pages/troubleshooting-calls.html new file mode 100644 index 0000000..ed152ea --- /dev/null +++ b/Relay/Resources/Relay.help/Contents/Resources/en.lproj/pages/troubleshooting-calls.html @@ -0,0 +1,159 @@ + + + + + + + + + + Troubleshooting MatrixRTC calls + + +

Relay Help › Troubleshooting calls

+ +

Troubleshooting MatrixRTC calls

+ +

If your calls fail to connect, or connect but show no audio or video, this page walks you through capturing the data Relay's maintainers need to diagnose the issue.

+ +

Quick capture (3 minutes)

+ +
    +
  1. Open Window › Activity Log (or press ⌥⌘A).
  2. +
In the search bar, click the filter chip and limit the list to the Call category.
  4. +
  5. Leave the Activity Log window open and reproduce the failing call.
  6. +
  7. Once the call has failed (or you have media issues), press ⌘S in the Activity Log window to export the filtered events.
  8. +
  9. Save the file (default name relay-activity-log.json) and attach it to your bug report.
  10. +
+ +

That export is usually everything the maintainers need to triage a calling problem; the rare exceptions are covered under "When the Activity Log isn't enough" below.

+ +

What's in the export

+ +

The file is pretty-printed JSON: an array of events, each with timestamp (ISO 8601), category (will be "call" after filtering), severity (debug / info / warning / error), source (which subsystem logged it), summary, optional detail, optional roomId, and a metadata key-value map.

+ +

Sample event:

+ +
{
+  "timestamp": "2026-06-12T14:30:05.123Z",
+  "category": "call",
+  "severity": "info",
+  "source": "LiveKitCredentialService",
+  "summary": "SFU URL discovered",
+  "roomId": "!abc:example.org",
+  "metadata": {}
+}
+ +

What's safe to share

+ +

The export contains:

+
    +
  • Your Matrix room ID (!…:server) and device IDs
  • +
  • Per-call membership UUIDs and key indices
  • +
  • Your homeserver hostname
  • +
  • SHA-256 fingerprints (first 8 hex chars) of encryption keys, never the keys themselves
  • +
+ +

It does not contain:

+
    +
  • Raw E2EE keys
  • +
  • OpenID tokens or LiveKit JWTs
  • +
  • Message contents, names, or avatars
  • +
  • The OpenID access token used for SFU auth
  • +
+ +
+ If you don't want your room IDs or device IDs in a public bug report, ask the maintainers in #relayapp:matrix.org for a direct message and share the file in that private conversation. +
+ +

Reading the export yourself

+ +

A few specific log lines act as signposts. If your file contains any of these, you can pre-diagnose your own issue.

+ +

Connection-time signals

+ + + + + + + + + + + + + + + + + + + + + + + +
Look for | What it means
Fetching call credentials | The call attempt started; subsequent events should show whether discovery and token exchange succeeded.
SFU URL discovered | Your homeserver advertises a LiveKit SFU. Good.
Failed to fetch call credentials with "This homeserver has no LiveKit call server configured…" | Your homeserver doesn't expose org.matrix.msc4143.rtc_foci in .well-known/matrix/client, and the unstable transports endpoint isn't supported. Ask your homeserver admin to configure MatrixRTC.
Call credentials obtained | Token exchange succeeded. If the call still fails after this, the problem is downstream of credential acquisition.
+ +

Connected-but-no-media signals

+ +

If the call reaches the Connected to call event but you can't see or hear anyone, the failure is in the encryption-key exchange or frame routing.

+ + + + + + + + + + + + + + + + + + + +
Look for | What it means
Distributed E2EE key to N user(s) followed by Received E2EE key from … for each peer | Key exchange is happening. If you still have no media, the problem is in the frame-decoder routing — note the participant identity in the event's detail; this is the identity LiveKit assigned to the peer.
No Received E2EE key from … events at all | Peers aren't sending you their keys, or the widget bridge isn't running. Check whether the room is configured as encrypted (E2EE is enabled only for encrypted Matrix rooms).
Widget bridge started but no later events | The widget driver is waiting for capability negotiation that never completes. Likely an SDK or homeserver-side issue.
+ +

Patterns worth flagging in a bug report

+ +

Each of these event sequences points to a known class of failure:

+ +
    +
  1. No Call credentials obtained event after Fetching call credentials. Credential exchange is failing. Almost always a homeserver-side or SFU-side configuration problem; the maintainers will need to know which homeserver you're on.
  2. +
Connected to call but no Distributed E2EE key event. The Matrix call-member state event went out, but either no peers existed at the time you connected or the local cache of call members is stale. If others were already in the call, this is a Relay bug worth reporting.
  4. +
  5. Received E2EE key from … events present, but you still see no media from those peers. Frame-cryptor routing is misaligned with the LiveKit participant identity. Please attach the export and note the LiveKit participant identity you see in those events' detail field.
  6. +
+ +

When the Activity Log isn't enough

+ +

For really hard cases (the SFU is rejecting our JWT with no useful error, or the LiveKit room itself never finishes initialising) the maintainers sometimes need a unified-log capture, which records the low-level RTC trace from inside the LiveKit SDK.

+ +

While you reproduce the issue, run this command in a terminal:

+ +
log stream --predicate 'subsystem == "RelayKit" AND category BEGINSWITH "Call"' \
+           --level info > relay-call-trace.log
+ +

Stop with ⌃C once the call has failed, then share relay-call-trace.log alongside the Activity Log JSON.

+ +

The unified-log capture contains a more verbose internal trace, including LiveKit SDK output. It's safe in the same way the Activity Log is (no key material, no tokens), but it does include more detailed timing and routing data. Share it through the same channel you'd share the JSON.

+ +

Reporting

+ +

File an issue at github.com/subpop/Relay/issues or message #relayapp:matrix.org. Please include:

+ +
    +
  • The relay-activity-log.json export (filtered to the Call category)
  • +
  • Your homeserver hostname (e.g. matrix.example.org)
  • +
  • Whether other clients (Element X, Element Web) succeed at calling on the same account
  • +
  • A one-line description of what you saw: "fails to connect", "connects but no audio", "connects but no video", etc.
  • +
+ +

If you'd rather not put logs in a public issue, ask the maintainers in the Matrix room for a direct message first and send the logs there privately.

+ + diff --git a/Relay/ViewModels/PreviewCallViewModel.swift b/Relay/ViewModels/PreviewCallViewModel.swift index 5017083..2bbeb04 100644 --- a/Relay/ViewModels/PreviewCallViewModel.swift +++ b/Relay/ViewModels/PreviewCallViewModel.swift @@ -29,6 +29,7 @@ final class PreviewCallViewModel: CallViewModelProtocol { var isLocalMicrophoneEnabled: Bool = false var localParticipantID: String? = nil var videoTrackRevision: UInt = 0 + var connectingPhase: String? = nil func connect(url: String, token: String, sfuServiceURL: String) async throws { state = .connecting diff --git a/Relay/Views/Call/CallView.swift b/Relay/Views/Call/CallView.swift index f9cddc6..3e99184 100644 --- a/Relay/Views/Call/CallView.swift +++ b/Relay/Views/Call/CallView.swift @@ -35,6 +35,11 @@ struct CallView: View { @State private var serverURL: String = "" @State private var accessToken: String = "" @State private var isJoining: Bool = false + /// Connecting-phase label that has stuck around long enough to be + /// worth showing the user. Updated from + /// ``CallViewModelProtocol/connectingPhase`` with a ~300 ms reveal + /// delay so brief phases on a fast network never flash on screen. + @State private var visibleConnectingPhase: String? // NOTE: The earlier implementation auto-hid the control bar after a // timeout using a `controlsVisible` @State + `.animation(.easeInOut(..), // value: controlsVisible)` on the control bar's opacity, plus a @@ -379,6 +384,14 @@ struct CallView: View { Text("Joining call…") .font(.headline) .foregroundStyle(.white.opacity(0.7)) + // Step indicator — only appears once a phase has been the + // current phase for ~300 ms. Hidden on fast networks. 
+ if let visibleConnectingPhase { + Text(visibleConnectingPhase) + .font(.subheadline) + .foregroundStyle(.white.opacity(0.55)) + .transition(.opacity) + } Button("Cancel") { Task { await viewModel.disconnect() } } @@ -386,6 +399,31 @@ struct CallView: View { .foregroundStyle(.white) Spacer() } + .animation(.easeInOut(duration: 0.2), value: visibleConnectingPhase) + .onChange(of: viewModel.connectingPhase, initial: true) { _, newPhase in + scheduleConnectingPhaseReveal(newPhase) + } + } + + /// Reveals `phase` as the visible connecting-phase label after a + /// short delay (~300 ms). If `connectingPhase` changes again before + /// the delay fires, the previous phase is never shown — so brief + /// steps on a fast network don't flash on screen. + private func scheduleConnectingPhaseReveal(_ phase: String?) { + guard let phase else { + visibleConnectingPhase = nil + return + } + // Already showing it — nothing to do. + if visibleConnectingPhase == phase { return } + Task { @MainActor in + try? await Task.sleep(for: .milliseconds(300)) + // Confirm the view model is still on the same phase before + // committing it to the visible state. + if viewModel.connectingPhase == phase { + visibleConnectingPhase = phase + } + } } // MARK: - Failed Overlay diff --git a/Relay/Views/MainView.swift b/Relay/Views/MainView.swift index 4ccc457..08d417c 100644 --- a/Relay/Views/MainView.swift +++ b/Relay/Views/MainView.swift @@ -376,25 +376,39 @@ struct MainView: View { // swiftlint:disable:this type_body_length } private func startCallButton(roomId: String) -> some View { - Button { + let hasOngoingCall = currentRoom?.hasRoomCall ?? false + let label = hasOngoingCall ? "Join Call" : "Start Call" + let confirmTitle = hasOngoingCall ? "Join Call" : "Start Call" + let confirmAction = hasOngoingCall ? 
"Join" : "Call" + return Button { showCallConfirmation = true } label: { - Label("Start Call", systemImage: "phone.fill") + // Force the title to render alongside the icon on + // ongoing-call state so the toolbar pill visibly changes + // — default macOS toolbar style would hide the title and + // leave the pill indistinguishable from the idle state. + if hasOngoingCall { + Label(label, systemImage: "phone.fill") + .labelStyle(.titleAndIcon) + .foregroundStyle(Color.accentColor) + } else { + Label(label, systemImage: "phone.fill") + } } - .help("Start Call") + .help(label) .disabled(callManager.hasActiveCall) .confirmationDialog( - "Start Call", + confirmTitle, isPresented: $showCallConfirmation ) { - Button("Call") { + Button(confirmAction) { startCall(roomId: roomId) } } message: { if let name = currentRoom?.name { - Text("Start a call in \(name)?") + Text(hasOngoingCall ? "Join the call in \(name)?" : "Start a call in \(name)?") } else { - Text("Start a call in this room?") + Text(hasOngoingCall ? "Join the call in this room?" : "Start a call in this room?") } } } diff --git a/RelayKit/Call/CallEncryptionService.swift b/RelayKit/Call/CallEncryptionService.swift index eb176dd..9665e2f 100644 --- a/RelayKit/Call/CallEncryptionService.swift +++ b/RelayKit/Call/CallEncryptionService.swift @@ -16,11 +16,8 @@ import CryptoKit import Foundation import LiveKit import MatrixRustSDK -import OSLog import RelayInterface -private let logger = Logger(subsystem: "RelayKit", category: "CallEncryption") - /// Helpers for MatrixRTC call-member state signaling, power-level bootstrap, /// and LiveKit key provider plumbing. /// @@ -116,13 +113,9 @@ struct CallEncryptionService { let jsonData = try JSONSerialization.data(withJSONObject: body, options: [.sortedKeys]) let jsonString = String(data: jsonData, encoding: .utf8) ?? 
"{}" - activityLog?.log( - category: .call, severity: .debug, source: "CallEncryptionService", - summary: "Sending call member event", - detail: "stateKey: \(stateKey)\nbody: \(jsonString)", - roomId: roomID - ) - + // Body + state key contain device IDs and per-call membership UUIDs; + // not raw secrets but routing data — the post-send Activity Log + // entry below captures the same fields without the full body. _ = try await sdkRoom.sendStateEventRaw( eventType: Self.callMemberEventType, stateKey: stateKey, @@ -131,6 +124,7 @@ struct CallEncryptionService { activityLog?.log( category: .call, severity: .debug, source: "CallEncryptionService", summary: "Sent call membership state event", + detail: "state_key: \(stateKey), membershipID: \(membership), foci_preferred SFU: \(serviceURL).", roomId: roomID ) } @@ -173,69 +167,77 @@ struct CallEncryptionService { return } + struct MemberSummary { + let stateKey: String + let isActive: Bool + let sfuURL: String? + let membershipID: String? + } + var summaries: [MemberSummary] = [] + for event in events { guard let type = event["type"] as? String, type == Self.callMemberEventType else { continue } let stateKey = event["state_key"] as? String ?? "(none)" - if let content = event["content"], - let contentData = try? JSONSerialization.data(withJSONObject: content, options: [.sortedKeys]), - let contentStr = String(data: contentData, encoding: .utf8) { - activityLog?.log( - category: .call, severity: .debug, source: "CallEncryptionService", - summary: "Existing call member [key=\(stateKey)]", - detail: contentStr, - roomId: roomID - ) + let contentDict = event["content"] as? [String: Any] ?? [:] + let isActive = !contentDict.isEmpty + let sfu = (contentDict["foci_preferred"] as? [[String: Any]])? + .first(where: { ($0["type"] as? String) == "livekit" })?["livekit_service_url"] as? String + let membership = contentDict["membershipID"] as? 
String + summaries.append(MemberSummary( + stateKey: stateKey, + isActive: isActive, + sfuURL: sfu, + membershipID: membership + )) + } + + let active = summaries.filter { $0.isActive } + let tombstoned = summaries.count - active.count + if active.isEmpty { + activityLog?.log( + category: .call, severity: .debug, source: "CallEncryptionService", + summary: "No active call members in room", + detail: "Total `m.call.member` events scanned: \(summaries.count) (\(tombstoned) tombstoned).", + roomId: roomID + ) + } else { + let lines = active.map { summary -> String in + let sfu = summary.sfuURL ?? "(no SFU advertised)" + let mid = summary.membershipID ?? "(no membershipID)" + return " \(summary.stateKey) — SFU: \(sfu), membershipID: \(mid)" } + activityLog?.log( + category: .call, severity: .debug, source: "CallEncryptionService", + summary: "Active call members in room: \(active.count)", + detail: "Scanned \(summaries.count) `m.call.member` events (\(tombstoned) tombstoned).\n\(lines.joined(separator: "\n"))", + roomId: roomID + ) } } /// Returns a `userId -> [deviceId]` map of *other* users currently in the - /// call, parsed from `org.matrix.msc3401.call.member` state events. + /// call, sourced from the SDK's `RoomInfo.activeRoomCallParticipants`. + /// + /// The SDK's call-membership view is user-level only — no device IDs — + /// so each user's device list is `["*"]` (the to-device wildcard) and + /// the SDK fans out the Olm-encrypted to-device payload to all of that + /// user's devices. Matches `matrix-js-sdk/src/matrixrtc/ + /// ToDeviceKeyTransport.ts`. Some of those devices won't be in the + /// call, but the AES key we're broadcasting is per-call and the receiver + /// only consumes it if their LiveKit cryptor expects it — so the extra + /// Olm sessions are wasted, not unsafe. /// - /// Element-X writes per-device call-member events with state key - /// `___m.call`. 
We walk the full room state, filter for - /// non-empty call-member content (empty content means the participant - /// has left), and extract `(userId, deviceId)` from the state key. /// Our own `userID` is excluded. func fetchCallTargets() async -> [String: [String]] { - let base = homeserver.trimmingCharacters(in: .init(charactersIn: "/")) - let encodedRoomID = roomID.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) ?? roomID - - guard let url = URL(string: "\(base)/_matrix/client/v3/rooms/\(encodedRoomID)/state") else { return [:] } - - var request = URLRequest(url: url) - request.setValue("Bearer \(accessToken)", forHTTPHeaderField: "Authorization") + guard let sdkRoom else { return [:] } + guard let info = try? await sdkRoom.roomInfo() else { return [:] } - guard let (data, response) = try? await URLSession.shared.data(for: request), - let http = response as? HTTPURLResponse, http.statusCode == 200, - let events = try? JSONSerialization.jsonObject(with: data) as? [[String: Any]] else { - return [:] + var targets: [String: [String]] = [:] + for participantUserID in info.activeRoomCallParticipants where participantUserID != self.userID { + targets[participantUserID] = ["*"] } - - var targets: [String: Set] = [:] - for event in events { - guard let type = event["type"] as? String, - type == Self.callMemberEventType, - let stateKey = event["state_key"] as? String, - let content = event["content"] as? [String: Any], - !content.isEmpty else { continue } - - // State key format: `___m.call` where userId is - // itself `@localpart:server.tld`. Strip the leading underscore - // and the trailing `_m.call` marker, then split on the *last* - // underscore to separate deviceId from userId. 
- guard stateKey.hasPrefix("_"), stateKey.hasSuffix("_m.call") else { continue } - let trimmed = String(stateKey.dropFirst().dropLast("_m.call".count)) - guard let lastUnderscore = trimmed.lastIndex(of: "_") else { continue } - let userId = String(trimmed[..:` (the legacy shape) silently misroutes every + /// frame on v2-only deployments. + /// + /// Inputs are all ASCII (Matrix IDs, device IDs, UUIDs), so Swift's + /// `JSONSerialization` produces byte-identical output to Go's + /// `json.Marshal` for the same array. Reference: + /// `lk-jwt-service/helper.go::LiveKitIdentityFor`. + static func liveKitIdentity( + matrixID: String, + claimedDeviceID: String, + memberID: String + ) -> String { + let parts: [String] = [matrixID, claimedDeviceID, memberID] + guard let jsonData = try? JSONSerialization.data( + withJSONObject: parts, + options: [] + ) else { + return "" + } + let digest = SHA256.hash(data: jsonData) + // SHA-256 outputs 32 bytes; standard base64 = 44 chars with exactly + // one '=' of padding. Strip it to match Go's `unpaddedBase64`. + return Data(digest) + .base64EncodedString() + .replacing("=", with: "") + } + // MARK: - Key Provider Setup /// Builds a `BaseKeyProvider` whose internal `LKRTCFrameCryptorKeyProvider` @@ -274,7 +312,7 @@ struct CallEncryptionService { static func makeHKDFKeyProvider( ratchetWindowSize: Int32 = 10, keyRingSize: Int32 = 256 - ) -> BaseKeyProvider { + ) -> (provider: BaseKeyProvider, hkdfInstalled: Bool, fallbackReason: String?) { let options = KeyProviderOptions( sharedKey: false, ratchetWindowSize: ratchetWindowSize, @@ -283,8 +321,7 @@ struct CallEncryptionService { let provider = BaseKeyProvider(options: options) guard let cls = NSClassFromString("LKRTCFrameCryptorKeyProvider") as? 
NSObject.Type else { - logger.error("[RTC]LKRTCFrameCryptorKeyProvider class not found at runtime; HKDF swap skipped — E2EE interop with Element Call will fail (PBKDF2 vs HKDF mismatch)") - return provider + return (provider, false, "LKRTCFrameCryptorKeyProvider class not found at runtime") } let initSel = NSSelectorFromString( @@ -299,8 +336,7 @@ struct CallEncryptionService { ) let allocated = allocImp(cls, allocSel) guard (allocated as AnyObject).responds(to: initSel) else { - logger.error("[RTC]LKRTCFrameCryptorKeyProvider does not expose keyDerivationAlgorithm: init; webrtc-xcframework may be < 144.x — falling back to PBKDF2 (Element Call interop will fail)") - return provider + return (provider, false, "LKRTCFrameCryptorKeyProvider does not expose keyDerivationAlgorithm: init (webrtc-xcframework may be < 144.x)") } typealias InitFunc = @convention(c) ( @@ -326,12 +362,10 @@ struct CallEncryptionService { ) guard let ivar = class_getInstanceVariable(BaseKeyProvider.self, "rtcKeyProvider") else { - logger.error("[RTC]rtcKeyProvider ivar not found on BaseKeyProvider; HKDF swap skipped") - return provider + return (provider, false, "rtcKeyProvider ivar not found on BaseKeyProvider") } object_setIvar(provider, ivar, hkdfRtc) - logger.info("[RTC]Installed HKDF-backed LKRTCFrameCryptorKeyProvider (Element Call interop path)") - return provider + return (provider, true, nil) } /// Sets a raw key on a `BaseKeyProvider` for the given participant, bypassing @@ -341,15 +375,22 @@ struct CallEncryptionService { /// `BaseKeyProvider` is decorated with `@objcMembers`, so its internal /// `rtcKeyProvider` (an `LKRTCFrameCryptorKeyProvider`) is accessible via KVC. /// The ObjC provider accepts `NSData` directly. + /// Sets a raw AES key on the provider for `participantId`. Returns + /// `nil` on success, or a short failure reason string the caller can + /// surface in the Activity Log. 
The fingerprint of the raw IKM is + /// computed by the caller (via the SHA-256 it already keeps for its + /// own bookkeeping) — diverging fingerprints across local/peer + /// records are the #1 root cause of "maximum ratchet attempts + /// exceeded" on an otherwise-correct key-exchange handshake. + @discardableResult static func setRawKey( _ keyData: Data, on keyProvider: BaseKeyProvider, participantId: String, index: Int32 = 0 - ) { + ) -> String? { guard let rtcProvider = keyProvider.value(forKey: "rtcKeyProvider") as AnyObject? else { - logger.error("[RTC]Could not access rtcKeyProvider via KVC") - return + return "Could not access rtcKeyProvider via KVC" } // LKRTCFrameCryptorKeyProvider is an ObjC class with: @@ -359,8 +400,7 @@ struct CallEncryptionService { typealias SetKeyFunc = @convention(c) (AnyObject, Selector, NSData, Int32, NSString) -> Void let selector = NSSelectorFromString("setKey:withIndex:forParticipant:") guard (rtcProvider as? NSObject)?.responds(to: selector) == true else { - logger.error("[RTC]rtcKeyProvider does not respond to setKey:withIndex:forParticipant:") - return + return "rtcKeyProvider does not respond to setKey:withIndex:forParticipant:" } let imp = unsafeBitCast( @@ -368,28 +408,22 @@ struct CallEncryptionService { to: SetKeyFunc.self ) imp(rtcProvider, selector, keyData as NSData, index, participantId as NSString) - // SHA-256 fingerprint of the raw IKM so we can confirm the exact same - // 16 bytes end up on the wire. Matches the fingerprint logged in - // CallWidgetBridge.sendEncryptionKey. Diverging fingerprints mean - // our local frame cryptor and the peer are using different keys — - // the #1 root cause of "maximum ratchet attempts exceeded" on an - // otherwise-correct key-exchange handshake. 
- let fp = SHA256.hash(data: keyData).prefix(8).map { String(format: "%02x", $0) }.joined() - logger.info("[RTC]Set raw encryption key for participant \(participantId, privacy: .public) at index \(index) bytes=\(keyData.count) sha256[0..8]=\(fp, privacy: .public)") + return nil } /// Convenience: sets a raw key using base64-encoded key data. + /// Returns `nil` on success or a short failure reason. + @discardableResult static func setRawKey( base64Key: String, on keyProvider: BaseKeyProvider, participantId: String, index: Int32 = 0 - ) { + ) -> String? { guard let keyData = Data(base64Encoded: base64Key) else { - logger.error("[RTC]Invalid base64 key for participant \(participantId, privacy: .private)") - return + return "Invalid base64 key for participant \(participantId)" } - setRawKey(keyData, on: keyProvider, participantId: participantId, index: index) + return setRawKey(keyData, on: keyProvider, participantId: participantId, index: index) } } diff --git a/RelayKit/Call/CallViewModel.swift b/RelayKit/Call/CallViewModel.swift index 8b86ba1..e58188a 100644 --- a/RelayKit/Call/CallViewModel.swift +++ b/RelayKit/Call/CallViewModel.swift @@ -34,6 +34,11 @@ public final class CallViewModel: CallViewModelProtocol { public private(set) var isLocalCameraEnabled: Bool = false public private(set) var isLocalMicrophoneEnabled: Bool = false public private(set) var localParticipantID: String? + /// Human-readable label for the current step inside `.connecting`. + /// Updated as the connect path moves through credential exchange, + /// LiveKit attach, membership publish, key distribution, and media + /// start. Cleared when the call reaches `.connected` or `.failed`. + public private(set) var connectingPhase: String? /// Incremented whenever video tracks change, triggering SwiftUI to /// re-evaluate `videoContent(for:)` and pick up new or removed tracks. 
public private(set) var videoTrackRevision: UInt = 0 @@ -69,6 +74,11 @@ public final class CallViewModel: CallViewModelProtocol { /// The LiveKit key provider used for per-participant AES-GCM frame encryption. @ObservationIgnored private var keyProvider: BaseKeyProvider? + /// `true` when the HKDF-SHA256 LKRTCFrameCryptorKeyProvider was + /// successfully swapped in. `false` means we fell back to the + /// default PBKDF2 provider and interop with Element Call will fail. + @ObservationIgnored + private var hkdfKeyProviderInstalled: Bool = false /// The local participant's current encryption key (raw 16 bytes). @ObservationIgnored private var localEncryptionKey: Data? @@ -185,10 +195,12 @@ public final class CallViewModel: CallViewModelProtocol { // so the two sides produce different AES keys from matching // fingerprints, and every frame's auth tag fails on the peer. // See CallEncryptionService.makeHKDFKeyProvider for details. - self.keyProvider = CallEncryptionService.makeHKDFKeyProvider( + let result = CallEncryptionService.makeHKDFKeyProvider( ratchetWindowSize: 10, keyRingSize: 256 ) + self.keyProvider = result.provider + self.hkdfKeyProviderInstalled = result.hkdfInstalled } self.matrixRoom = encryptionContext.matrixRoom } @@ -197,6 +209,7 @@ public final class CallViewModel: CallViewModelProtocol { public func connect(url: String, token: String, sfuServiceURL: String = "") async throws { state = .connecting + connectingPhase = "Joining call server…" activityLog?.log( category: .call, severity: .info, source: "CallViewModel", summary: "Connecting to call", @@ -221,6 +234,24 @@ public final class CallViewModel: CallViewModelProtocol { let encryptionOpts: EncryptionOptions? = keyProvider.map { EncryptionOptions(keyProvider: $0, encryptionType: .gcm) } + if isE2eeEnabled { + let kdfDetail = hkdfKeyProviderInstalled + ? "HKDF-SHA256 key derivation active (Element Call interop path)." + : "WARNING: HKDF swap failed — using default PBKDF2. 
Element Call peers will produce different AES keys from the same IKM and frames will fail to decrypt." + activityLog?.log( + category: .call, severity: hkdfKeyProviderInstalled ? .debug : .warning, source: "CallViewModel", + summary: "LiveKit E2EE enabled", + detail: "GCM frame encryption active. \(kdfDetail)", + roomId: roomID + ) + } else { + activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "LiveKit E2EE disabled", + detail: "Unencrypted Matrix room — frames sent in the clear to the SFU.", + roomId: roomID + ) + } let roomOpts = RoomOptions( defaultVideoPublishOptions: VideoPublishOptions( preferredCodec: .vp8 @@ -239,10 +270,12 @@ public final class CallViewModel: CallViewModelProtocol { connectOptions: connectOpts, roomOptions: roomOpts ) + connectingPhase = "Preparing encryption…" localParticipantID = room.localParticipant.identity?.stringValue activityLog?.log( category: .call, severity: .debug, source: "CallViewModel", - summary: "Connected with LiveKit identity: \(localParticipantID ?? "unknown")", + summary: "Connected to LiveKit", + detail: "Local identity: \(localParticipantID ?? "unknown"). Peers reading our `m.call.member` event expect this to match `${sender}:${device_id}` for legacy session events.", roomId: roomID ) @@ -261,13 +294,16 @@ public final class CallViewModel: CallViewModelProtocol { keyProvider: self.keyProvider ) bridge.activityLog = self.activityLog + bridge.onCallMemberStateChanged = { [weak self] in + self?.redistributeKeyOnMembershipChange() + } bridge.start() self.widgetBridge = bridge } catch { activityLog?.log( category: .call, severity: .error, source: "CallViewModel", summary: "Failed to create CallWidgetBridge", - detail: error.localizedDescription, + detail: "E2EE key exchange will not work; remote tiles will stay black. 
Error: \(error.localizedDescription)", roomId: roomID ) } @@ -280,36 +316,59 @@ public final class CallViewModel: CallViewModelProtocol { // frames is encrypted with nothing the remote peer can decrypt — // and Element-X's video decoder stalls on that first undecodable // frame, resulting in perpetual black video. - if self.isE2eeEnabled, let keyProvider = self.keyProvider, let encryptionService { + // + // Key under the identity LiveKit assigned us. This was the JWT + // `sub` claim: `::` on the legacy + // `/sfu/get` path, or the unpadded-base64 SHA-256 hash of + // `[user, device, member_id]` on v2 `/get_token`. The cryptor + // routes frames to remote peers' decoders using the *same* + // identity string LiveKit hands the SFU, so registering under + // the matrix-shaped `:` silently misroutes + // outbound frames on v2. + if self.isE2eeEnabled, let keyProvider = self.keyProvider { let key = CallEncryptionService.generateKey() self.localEncryptionKey = key - // Legacy `m.call.member` rtcBackendIdentity is always - // `${sender}:${device_id}` (matrix-js-sdk CallMembership.ts - // line 101). This is what remote peers route our frames under, - // so our local sender cryptor MUST be keyed under the same - // byte sequence — do not trust `localParticipantID` (the - // identity LiveKit assigns from the SFU JWT), since a - // mismatched JWT identity would silently break decrypt. - let localIdentity = "\(encryptionService.userID):\(encryptionService.deviceID)" - if let livekitIdentity = self.localParticipantID, livekitIdentity != localIdentity { + // Diagnostic: warn when the LiveKit-assigned identity + // doesn't match what peers will compute from our + // session-kind `m.call.member` event + // (`${sender}:${device_id}`, per matrix-js-sdk + // `CallMembership.parseFromEvent`). 
The legacy-first + // credential path keeps us on the colon shape, so this + // is normally silent; if it fires we've landed on the + // v2 hash identity and peer-side decryption will fail + // until we also publish MSC4143 sticky events. + let matrixSidIdentity: String? = encryptionService.map { "\($0.userID):\($0.deviceID)" } + if let livekitIdentity = self.localParticipantID, + let matrixSidIdentity, + livekitIdentity != matrixSidIdentity { activityLog?.log( category: .call, severity: .warning, source: "CallViewModel", summary: "LiveKit identity mismatch — frame encryption may misroute", - detail: "LiveKit: \(livekitIdentity), Matrix: \(localIdentity)", + detail: "LiveKit: \(livekitIdentity), peers expect: \(matrixSidIdentity)", roomId: roomID ) } let keyIndex = self.localKeyIndex - CallEncryptionService.setRawKey( + guard let livekitIdentity = self.localParticipantID, !livekitIdentity.isEmpty else { + activityLog?.log( + category: .call, severity: .error, source: "CallViewModel", + summary: "LiveKit assigned no local identity", + detail: "Cannot install local E2EE key; outbound frames will be undecodable.", + roomId: roomID + ) + throw CallViewModelError.missingLocalParticipantIdentity + } + let setKeyFailure = CallEncryptionService.setRawKey( key, on: keyProvider, - participantId: localIdentity, + participantId: livekitIdentity, index: Int32(keyIndex) ) + let failureNote = setKeyFailure.map { " setRawKey failure: \($0)." } ?? "" activityLog?.log( - category: .call, severity: .debug, source: "CallViewModel", - summary: "Local E2EE key set (index \(keyIndex))", - detail: "participantId: \(localIdentity)", + category: .call, severity: setKeyFailure == nil ? .debug : .error, source: "CallViewModel", + summary: "Local E2EE key installed", + detail: "Index: \(keyIndex), participantId: \(livekitIdentity). 
Frame cryptor will use this key for outbound frames before camera/mic publish.\(failureNote)", roomId: roomID ) } @@ -345,18 +404,15 @@ public final class CallViewModel: CallViewModelProtocol { // `MatrixService.callPowerLevels`); we no longer try to // mutate them at join time, matching Element Call. let membershipId = bridge?.membershipId + connectingPhase = "Announcing presence to the room…" do { try await encryptionService.sendCallMemberEvent( sfuServiceURL: sfuServiceURL, membershipId: membershipId ) } catch { - activityLog?.log( - category: .call, severity: .warning, source: "CallViewModel", - summary: "Call membership event failed", - detail: error.localizedDescription, - roomId: roomID - ) + let description = String(reflecting: error) + self.logCallMembershipFailure(error, description: description) } // 2. Start the membership heartbeat. matrix-js-sdk's @@ -379,24 +435,29 @@ public final class CallViewModel: CallViewModelProtocol { // state events already present on the room. The SDK // then Olm-encrypts the payload per-device. if self.isE2eeEnabled, let bridge, let localKey { + connectingPhase = "Distributing encryption keys…" let targets = await encryptionService.fetchCallTargets() self.callMembers = targets + let targetList = targets.keys.sorted().joined(separator: ", ") + activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "Distributing E2EE key to \(targets.count) user(s) before media publish", + detail: "Recipients: \(targetList.isEmpty ? "(none)" : targetList).", + roomId: roomID + ) do { try await bridge.sendEncryptionKey( localKey, keyIndex: keyIndex, toMembers: targets ) - activityLog?.log( - category: .call, severity: .debug, source: "CallViewModel", - summary: "Distributed E2EE key to \(targets.count) user(s)", - roomId: roomID - ) + // Success entry — including fp — already written by + // CallWidgetBridge.sendEncryptionKey. 
} catch { activityLog?.log( category: .call, severity: .warning, source: "CallViewModel", - summary: "Widget-bridge key distribution failed", - detail: error.localizedDescription, + summary: "E2EE key distribution failed", + detail: "Tried sending to \(targets.count) user(s): \(targetList). Peers will see `missing_key` and our media will appear as black tiles to them. Error: \(error.localizedDescription)", roomId: roomID ) } @@ -405,12 +466,14 @@ public final class CallViewModel: CallViewModelProtocol { // Key is now installed locally and (best-effort) distributed to // any existing call participants. Safe to publish media. + connectingPhase = "Starting camera & microphone…" try await room.localParticipant.setMicrophone(enabled: true) try await room.localParticipant.setCamera(enabled: true) isLocalCameraEnabled = true isLocalMicrophoneEnabled = true state = .connected + connectingPhase = nil videoTrackRevision += 1 activityLog?.log( category: .call, severity: .info, source: "CallViewModel", @@ -430,6 +493,7 @@ public final class CallViewModel: CallViewModelProtocol { } state = .failed(message) + connectingPhase = nil activityLog?.log( category: .call, severity: .error, source: "CallViewModel", summary: "Call connection failed", @@ -449,6 +513,7 @@ public final class CallViewModel: CallViewModelProtocol { // Update UI state immediately — SwiftUI re-renders to the // disconnected state while the awaited cleanup runs. state = .disconnected + connectingPhase = nil participants = [] isLocalCameraEnabled = false isLocalMicrophoneEnabled = false @@ -503,19 +568,15 @@ public final class CallViewModel: CallViewModelProtocol { sfuServiceURL: sfuServiceURL, membershipId: membershipId ) - Task { @MainActor in - activityLog?.log( - category: .call, severity: .debug, source: "CallViewModel", - summary: "Heartbeat refreshed call.member state event", - roomId: roomID - ) - } + // Success entry already written by + // `CallEncryptionService.sendCallMemberEvent`. 
} catch { - Task { @MainActor in + let description = error.localizedDescription + await MainActor.run { activityLog?.log( category: .call, severity: .warning, source: "CallViewModel", - summary: "Heartbeat refresh failed", - detail: error.localizedDescription, + summary: "Call membership heartbeat refresh failed", + detail: "Other participants may treat us as having left when our event expires. Error: \(description)", roomId: roomID ) } @@ -616,47 +677,142 @@ public final class CallViewModel: CallViewModelProtocol { // MARK: - E2EE Key Redistribution - /// Re-sends the local encryption key to a newly joined participant so they - /// can decrypt our media. Routes through the widget bridge so the SDK - /// Olm-encrypts the to-device payload. - fileprivate func redistributeKey(to participantIdentity: String) { - guard let key = localEncryptionKey, let bridge = widgetBridge else { return } + /// Re-sends the local encryption key to all current call members so a + /// peer that just joined LiveKit can decrypt our media. + /// + /// Previously this method parsed the LiveKit participant identity + /// (`@user:server:device`) to recover a single user/device target. On + /// v2 the identity is an opaque base64 hash, so the parse fails and the + /// new peer never receives our key. Re-fetching `m.call.member` state + /// and broadcasting to everyone matches Element Call's + /// `RTCEncryptionManager` behaviour on membership changes — slightly + /// inefficient (existing peers receive our key twice) but correct on + /// both legacy and v2 paths. + /// + /// The `participantIdentity` parameter is now only used for logging. + /// Surfaces a `sendCallMemberEvent` failure to the Activity Log. 
The most + /// common failure shape in the wild is M_FORBIDDEN because the room's + /// `power_levels.events.org.matrix.msc3401.call.member` defaults to + /// `state_default` (50) instead of being explicitly lowered to 0 — when + /// hit, peers running Element Call / Element X have no Matrix-level + /// record of us joining the call, so they never send us their E2EE key + /// and our tiles stay black. Relay-created rooms set the override at + /// creation (see `MatrixService.callPowerLevels`); rooms created + /// elsewhere may not. + fileprivate func logCallMembershipFailure(_ error: Error, description: String) { + let isPowerLevelDenial = description.contains("M_FORBIDDEN") + && description.contains("org.matrix.msc3401.call.member") + && description.contains("power") + let summary = "Call membership state event rejected" + let detail: String + if isPowerLevelDenial { + detail = "Homeserver returned M_FORBIDDEN: this room requires a higher power level to send `org.matrix.msc3401.call.member`. Ask a room admin to set its required power level to 0 (Relay-created rooms do this automatically). Without this event in room state, other participants can't send you E2EE keys and your tiles will stay black on encrypted calls. Raw error: \(description)" + } else { + detail = "Without a successful call membership state event, peers can't see you as a call participant and won't send you E2EE keys. Raw error: \(description)" + } + activityLog?.log( + category: .call, severity: .error, source: "CallViewModel", + summary: summary, + detail: detail, + roomId: roomID + ) + } - // Parse "user:device" from the LiveKit identity - // (format: `@userId:server:deviceId`). Element Call uses identities - // like `@user:server:DEVICEID`. 
- let components = participantIdentity.components(separatedBy: ":") - guard components.count >= 3 else { - activityLog?.log( - category: .call, severity: .warning, source: "CallViewModel", - summary: "Cannot parse participant identity for key redistribution", - roomId: roomID - ) + /// Re-distributes our local E2EE key in response to an inbound + /// `m.call.member` state change. The widget bridge fires the + /// callback whenever it sees one of these events; we use that as a + /// signal to refresh our recipient set, because the SDK's + /// `RoomInfo.activeRoomCallParticipants` accessor lags behind + /// LiveKit's `participantDidConnect` (which is what + /// ``redistributeKey(to:)`` keys off). + /// + /// Guarded against heartbeat refreshes: skips when the *user* set + /// of targets hasn't changed since the last send. + fileprivate func redistributeKeyOnMembershipChange() { + guard let key = localEncryptionKey, + let bridge = widgetBridge, + let encryptionService else { return } - let userId = components[0] + ":" + components[1] - let deviceId = components.dropFirst(2).joined(separator: ":") let index = localKeyIndex Task { [weak self] in + guard let self else { return } + let targets = await encryptionService.fetchCallTargets() + let targetUserIDs = Set(targets.keys) + let previousUserIDs = await MainActor.run { Set(self.callMembers.keys) } + // Heartbeat / unchanged-member case: no new peer, nothing to do. 
+ if targetUserIDs.isEmpty || targetUserIDs == previousUserIDs { return } + + let targetList = targets.keys.sorted().joined(separator: ", ") do { try await bridge.sendEncryptionKey( key, keyIndex: index, - toMembers: [userId: [deviceId]] - ) - self?.activityLog?.log( - category: .call, severity: .debug, source: "CallViewModel", - summary: "Redistributed key to \(participantIdentity)", - roomId: self?.roomID + toMembers: targets ) + await MainActor.run { + self.callMembers = targets + self.activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "Redistributed E2EE key on m.call.member change", + detail: "Recipients: \(targetList). Index: \(index).", + roomId: self.roomID + ) + } } catch { - self?.activityLog?.log( - category: .call, severity: .warning, source: "CallViewModel", - summary: "Key redistribution failed", - detail: error.localizedDescription, - roomId: self?.roomID + let description = error.localizedDescription + await MainActor.run { + self.activityLog?.log( + category: .call, severity: .warning, source: "CallViewModel", + summary: "E2EE key redistribution failed (m.call.member trigger)", + detail: "Targets: \(targetList). Error: \(description)", + roomId: self.roomID + ) + } + } + } + } + + fileprivate func redistributeKey(to participantIdentity: String) { + guard let key = localEncryptionKey, + let bridge = widgetBridge, + let encryptionService else { + return + } + let index = localKeyIndex + + Task { + let targets = await encryptionService.fetchCallTargets() + guard !targets.isEmpty else { + await MainActor.run { + activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "No call targets to redistribute key to", + detail: "Trigger: new participant \(participantIdentity). 
`fetchCallTargets` returned an empty map.", + roomId: roomID + ) + } + return + } + let targetList = targets.keys.sorted().joined(separator: ", ") + do { + try await bridge.sendEncryptionKey( + key, + keyIndex: index, + toMembers: targets ) + // Success entry — including fp — already written by + // CallWidgetBridge.sendEncryptionKey. + } catch { + await MainActor.run { + activityLog?.log( + category: .call, severity: .warning, source: "CallViewModel", + summary: "E2EE key redistribution failed", + detail: "Trigger: new participant \(participantIdentity). Targets: \(targetList). Error: \(error.localizedDescription)", + roomId: roomID + ) + } } } } @@ -744,6 +900,12 @@ public final class CallViewModel: CallViewModelProtocol { if viewModel.state == .connected { viewModel.state = .disconnected } + viewModel.activityLog?.log( + category: .call, severity: .warning, source: "CallViewModel", + summary: "LiveKit connection disconnected", + detail: "Previous state: \(Self.describe(oldValue))", + roomId: viewModel.roomID + ) case .reconnecting: viewModel.activityLog?.log( category: .call, severity: .warning, source: "CallViewModel", @@ -756,14 +918,82 @@ public final class CallViewModel: CallViewModelProtocol { } } + /// Fires when the SFU rejects the initial connection (auth, transport, + /// codec negotiation). Distinct from `didDisconnectWithError`, which + /// fires after a successful connect terminates. + func room(_ room: LiveKit.Room, didFailToConnectWithError error: LiveKitError?) { + let description = error?.localizedDescription ?? "no error reported" + Task { @MainActor [weak viewModel] in + guard let viewModel else { return } + viewModel.activityLog?.log( + category: .call, severity: .error, source: "CallViewModel", + summary: "LiveKit connection rejected", + detail: description, + roomId: viewModel.roomID + ) + } + } + + /// Fires when an already-connected room disconnects, with an optional + /// error explaining why. 
A `nil` error indicates a clean local + /// disconnect; a non-nil error is the most useful signal we get when + /// a call drops mid-session. + func room(_ room: LiveKit.Room, didDisconnectWithError error: LiveKitError?) { + let description = error?.localizedDescription + Task { @MainActor [weak viewModel] in + guard let viewModel else { return } + if let description { + viewModel.activityLog?.log( + category: .call, severity: .error, source: "CallViewModel", + summary: "LiveKit connection lost", + detail: description, + roomId: viewModel.roomID + ) + } else { + viewModel.activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "LiveKit disconnected cleanly", + roomId: viewModel.roomID + ) + } + } + } + + /// Human-readable label for a `LiveKit.ConnectionState` enum value. + /// Lives on the delegate so the activity-log detail strings stay + /// stable across LiveKit SDK updates. + nonisolated private static func describe(_ state: LiveKit.ConnectionState) -> String { + switch state { + case .connected: "connected" + case .disconnected: "disconnected" + case .reconnecting: "reconnecting" + case .connecting: "connecting" + case .disconnecting: "disconnecting" + } + } + + /// Human-readable label for a `LiveKit.Track.Kind`. The raw value is + /// `Int`-backed (`audio=0`, `video=1`, `none=2`) which is useless in + /// logs. + nonisolated fileprivate static func describe(_ kind: Track.Kind) -> String { + switch kind { + case .audio: "audio" + case .video: "video" + case .none: "none" + default: "unknown(\(kind.rawValue))" + } + } + func room(_ room: LiveKit.Room, participantDidConnect participant: RemoteParticipant) { Task { @MainActor [weak viewModel] in guard let viewModel else { return } let identityStr = participant.identity?.stringValue ?? "(none)" + let sidStr = participant.sid?.stringValue ?? "(none)" + let displayName = participant.name ?? 
"(none)" viewModel.activityLog?.log( category: .call, severity: .debug, source: "CallViewModel", summary: "Remote participant connected", - detail: "Identity: \(identityStr)", + detail: "Identity: \(identityStr), sid: \(sidStr), name: \(displayName)", roomId: viewModel.roomID ) viewModel.syncParticipants(trackChanged: true) @@ -775,15 +1005,36 @@ public final class CallViewModel: CallViewModelProtocol { func room(_ room: LiveKit.Room, participant: RemoteParticipant, didSubscribeTrack publication: RemoteTrackPublication) { observeDimensions(of: publication) + let identityStr = participant.identity?.stringValue ?? "(none)" + let kind = Self.describe(publication.kind) + let sid = publication.sid Task { @MainActor [weak viewModel] in - let identityStr = participant.identity?.stringValue ?? "(none)" - let kind = publication.kind.rawValue - viewModel?.activityLog?.log( + guard let viewModel else { return } + viewModel.activityLog?.log( category: .call, severity: .debug, source: "CallViewModel", - summary: "Subscribed to \(kind) track from \(identityStr)", - roomId: viewModel?.roomID + summary: "Subscribed to remote \(kind) track", + detail: "Identity: \(identityStr), trackSid: \(sid)", + roomId: viewModel.roomID + ) + viewModel.syncParticipants(trackChanged: true) + } + } + + /// Fires when LiveKit can't subscribe to a remote track — the most + /// common cause is firewall / NAT blocking the media path while + /// signalling completes. This is the strongest signal for the + /// "connected, no media" failure shape. + func room(_ room: LiveKit.Room, participant: RemoteParticipant, didFailToSubscribeTrackWithSid trackSid: Track.Sid, error: LiveKitError) { + let identityStr = participant.identity?.stringValue ?? 
"(none)" + let description = error.localizedDescription + Task { @MainActor [weak viewModel] in + guard let viewModel else { return } + viewModel.activityLog?.log( + category: .call, severity: .error, source: "CallViewModel", + summary: "Failed to subscribe to remote track", + detail: "Identity: \(identityStr), trackSid: \(trackSid), error: \(description)", + roomId: viewModel.roomID ) - viewModel?.syncParticipants(trackChanged: true) } } @@ -811,14 +1062,69 @@ public final class CallViewModel: CallViewModelProtocol { func room(_ room: LiveKit.Room, localParticipant: LocalParticipant, didPublishTrack publication: LocalTrackPublication) { observeDimensions(of: publication) + let kind = Self.describe(publication.kind) + let sid = publication.sid Task { @MainActor [weak viewModel] in - viewModel?.videoTrackRevision += 1 + guard let viewModel else { return } + viewModel.activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "Published local \(kind) track", + detail: "trackSid: \(sid)", + roomId: viewModel.roomID + ) + viewModel.videoTrackRevision += 1 } } func room(_ room: LiveKit.Room, participant: RemoteParticipant, didPublishTrack publication: RemoteTrackPublication) { + let identityStr = participant.identity?.stringValue ?? "(none)" + let kind = Self.describe(publication.kind) + let sid = publication.sid Task { @MainActor [weak viewModel] in - viewModel?.syncParticipants(trackChanged: true) + guard let viewModel else { return } + viewModel.activityLog?.log( + category: .call, severity: .debug, source: "CallViewModel", + summary: "Remote published \(kind) track", + detail: "Identity: \(identityStr), trackSid: \(sid)", + roomId: viewModel.roomID + ) + viewModel.syncParticipants(trackChanged: true) + } + } + + /// Per-track LiveKit E2EE state transitions. Only fires when E2EE is + /// enabled on the room. Normal lifecycle is `.new` → `.ok`. 
Any other + /// terminal state (`.missing_key`, `.encryption_failed`, + /// `.decryption_failed`, `.internal_error`) is the canonical signal + /// for "connected but no media" on encrypted rooms — surface them + /// loudly so users on Element-Call interop calls can see the + /// cryptor failing without having to read os_log. + func room(_ room: LiveKit.Room, trackPublication: TrackPublication, didUpdateE2EEState state: E2EEState) { + let stateLabel = state.toString() + let trackSid = trackPublication.sid + let trackKind = Self.describe(trackPublication.kind) + Task { @MainActor [weak viewModel] in + guard let viewModel else { return } + switch state { + case .ok, .new, .key_ratcheted: + return + case .missing_key: + viewModel.activityLog?.log( + category: .call, severity: .warning, source: "CallViewModel", + summary: "E2EE missing key for \(trackKind) track", + detail: "trackSid: \(trackSid). Remote peer's encryption key hasn't been received yet or was rejected.", + roomId: viewModel.roomID + ) + case .encryption_failed, .decryption_failed, .internal_error: + viewModel.activityLog?.log( + category: .call, severity: .error, source: "CallViewModel", + summary: "E2EE failure on \(trackKind) track", + detail: "State: \(stateLabel), trackSid: \(trackSid)", + roomId: viewModel.roomID + ) + @unknown default: + return + } } } @@ -863,3 +1169,20 @@ public final class CallViewModel: CallViewModelProtocol { } } } + +// MARK: - Errors + +/// Errors raised by `CallViewModel.connect`. Only the cases that surface to +/// the user via the error reporter or the call sheet need a +/// `LocalizedError`; internal-only failures can be plain `Swift.Error`. +enum CallViewModelError: LocalizedError { + case missingLocalParticipantIdentity + + var errorDescription: String? { + switch self { + case .missingLocalParticipantIdentity: + return "LiveKit didn't assign an identity to the local participant; " + + "the call can't be encrypted. Try reconnecting." 
+ } + } +} diff --git a/RelayKit/Call/CallWidgetBridge.swift b/RelayKit/Call/CallWidgetBridge.swift index 326b603..3bb23a0 100644 --- a/RelayKit/Call/CallWidgetBridge.swift +++ b/RelayKit/Call/CallWidgetBridge.swift @@ -82,6 +82,11 @@ public final class CallWidgetBridge: @unchecked Sendable { private let roomId: String /// Activity log for surfacing widget bridge events in the Activity Log window. weak var activityLog: ActivityLog? + /// Fires whenever an `org.matrix.msc3401.call.member` state event is + /// observed via the widget driver — used by ``CallViewModel`` to retry + /// E2EE key distribution after a peer's membership lands in room + /// state. Closing on `[weak self]` is the caller's responsibility. + var onCallMemberStateChanged: (() -> Void)? /// Per-call MatrixRTC membership UUID. Must match the `membershipID` /// field in the `org.matrix.msc3401.call.member` state event and the /// `member.id` field in outbound `io.element.call.encryption_keys` @@ -189,15 +194,15 @@ public final class CallWidgetBridge: @unchecked Sendable { let capabilitiesProvider = self.capabilitiesProvider driverTask = Task { [weak self] in await driver.run(room: room, capabilitiesProvider: capabilitiesProvider) - self?.resolveReady() - Task { @MainActor [weak self] in - guard let self else { return } + guard let self else { return } + await MainActor.run { self.activityLog?.log( category: .call, severity: .debug, source: "CallWidgetBridge", - summary: "Widget driver exited", + summary: "WidgetDriver.run returned (driver exited)", roomId: self.roomId ) } + self.resolveReady() } recvTask = Task { [weak self] in @@ -207,15 +212,25 @@ public final class CallWidgetBridge: @unchecked Sendable { // Kick the state machine off the "Unset" state. Fire-and-forget — // the response just echoes back through recvLoop. 
Task { [weak self] in + guard let self else { return } + let widgetId = self.widgetId do { - try await self?.sendRequest(action: "content_loaded", data: [:]) + try await self.sendRequest(action: "content_loaded", data: [:]) + await MainActor.run { + self.activityLog?.log( + category: .call, severity: .debug, source: "CallWidgetBridge", + summary: "Widget content_loaded acknowledged by driver", + detail: "widgetId: \(widgetId)", + roomId: self.roomId + ) + } } catch { - Task { @MainActor [weak self] in - guard let self else { return } + let description = error.localizedDescription + await MainActor.run { self.activityLog?.log( category: .call, severity: .warning, source: "CallWidgetBridge", summary: "content_loaded failed", - detail: error.localizedDescription, + detail: "widgetId: \(widgetId). Error: \(description)", roomId: self.roomId ) } @@ -227,6 +242,7 @@ public final class CallWidgetBridge: @unchecked Sendable { self.activityLog?.log( category: .call, severity: .debug, source: "CallWidgetBridge", summary: "Widget bridge started", + detail: "widgetId: \(self.widgetId)", roomId: self.roomId ) } @@ -368,7 +384,7 @@ public final class CallWidgetBridge: @unchecked Sendable { self.activityLog?.log( category: .call, severity: .debug, source: "CallWidgetBridge", summary: "Sent E2EE key to \(toMembers.count) user(s)", - detail: "Key index: \(keyIndex), fingerprint: \(fp)", + detail: "Key index: \(keyIndex), member.id: \(self.membershipId), sha256[0..8]: \(fp).", roomId: self.roomId ) } @@ -395,7 +411,7 @@ public final class CallWidgetBridge: @unchecked Sendable { guard let self else { return } self.activityLog?.log( category: .call, severity: .debug, source: "CallWidgetBridge", - summary: "Sent call member state event", + summary: "Sent call member state event via widget", detail: "state_key: \(stateKey)", roomId: self.roomId ) @@ -445,14 +461,20 @@ public final class CallWidgetBridge: @unchecked Sendable { Task { @MainActor [weak self] in guard let self else { return 
} self.activityLog?.log( - category: .call, severity: .debug, source: "CallWidgetBridge", + category: .call, severity: .info, source: "CallWidgetBridge", summary: "Widget driver recv loop exited", + detail: "WidgetDriverHandle.recv returned nil.", roomId: self.roomId ) } break } + // SECURITY: never surface raw widget JSON. Outbound and inbound + // `send_to_device` payloads of type `io.element.call.encryption_keys` + // carry raw AES keys in the `keys.key` field — those would land + // unredacted in any log sink. We track action / type only. + guard let data = raw.data(using: .utf8), let msg = try? JSONSerialization.jsonObject(with: data) as? [String: Any] else { Task { @MainActor [weak self] in @@ -460,6 +482,7 @@ public final class CallWidgetBridge: @unchecked Sendable { self.activityLog?.log( category: .call, severity: .warning, source: "CallWidgetBridge", summary: "Non-JSON message from widget driver", + detail: "Length: \(raw.count) bytes.", roomId: self.roomId ) } @@ -491,6 +514,7 @@ public final class CallWidgetBridge: @unchecked Sendable { self.activityLog?.log( category: .call, severity: .warning, source: "CallWidgetBridge", summary: "Widget message missing action", + detail: "Message has neither `response` nor `action` keys; ignoring.", roomId: self.roomId ) } @@ -534,15 +558,27 @@ public final class CallWidgetBridge: @unchecked Sendable { case "send_event", "update_state": // Incoming Matrix events observed by the widget driver. // MatrixRTC member state is handled by Element Call peers - // directly; we just need to ack these. Log and move on. + // directly; we just need to ack these. 
Log and — for + // `org.matrix.msc3401.call.member` — also poke the view model + // to retry E2EE key distribution, since the SDK's + // `RoomInfo.activeRoomCallParticipants` accessor lags behind + // LiveKit's `participantDidConnect` signal: a peer can join + // the SFU before their membership state event arrives, which + // leaves our `redistributeKey(to:)` path unable to find them. if let type = data["type"] as? String { + let logAction = action + let logType = type + let isMemberEvent = (type == CallEncryptionService.callMemberEventType) Task { @MainActor [weak self] in guard let self else { return } self.activityLog?.log( category: .call, severity: .debug, source: "CallWidgetBridge", - summary: "Widget incoming \(action) type=\(type)", + summary: "Widget incoming \(logAction) (\(logType))", roomId: self.roomId ) + if isMemberEvent { + self.onCallMemberStateChanged?() + } } } responseBody = [:] @@ -551,11 +587,13 @@ public final class CallWidgetBridge: @unchecked Sendable { responseBody = [:] default: + let logAction = action Task { @MainActor [weak self] in guard let self else { return } self.activityLog?.log( category: .call, severity: .debug, source: "CallWidgetBridge", - summary: "Widget unhandled action: \(action)", + summary: "Widget unhandled action: \(logAction)", + detail: "Acking with empty response.", roomId: self.roomId ) } @@ -597,12 +635,13 @@ public final class CallWidgetBridge: @unchecked Sendable { } let ok = await handle.send(msg: json) if !ok { - let action = original["action"] as? String ?? "?" + let originalAction = original["action"] as? String ?? "?" 
Task { @MainActor [weak self] in guard let self else { return } self.activityLog?.log( category: .call, severity: .warning, source: "CallWidgetBridge", - summary: "handle.send returned false replying to action=\(action)", + summary: "Widget handle.send returned false", + detail: "Replying to action=\(originalAction).", roomId: self.roomId ) } @@ -623,7 +662,8 @@ public final class CallWidgetBridge: @unchecked Sendable { guard let self else { return } self.activityLog?.log( category: .call, severity: .warning, source: "CallWidgetBridge", - summary: "No keyProvider; dropping inbound key", + summary: "Dropping inbound encryption key — no keyProvider", + detail: "Sender: \(sender). The local frame cryptor isn't wired up yet.", roomId: self.roomId ) } @@ -645,7 +685,8 @@ public final class CallWidgetBridge: @unchecked Sendable { guard let self else { return } self.activityLog?.log( category: .call, severity: .warning, source: "CallWidgetBridge", - summary: "encryption_keys to-device missing keys", + summary: "encryption_keys to-device missing `keys` payload", + detail: "Sender: \(sender).", roomId: self.roomId ) } @@ -658,23 +699,36 @@ public final class CallWidgetBridge: @unchecked Sendable { let topDeviceId = (content["device_id"] as? String) ?? "" let deviceId = !claimedDeviceId.isEmpty ? claimedDeviceId : topDeviceId - // LiveKit participant identity lookup order. Element Call connects to - // the SFU with identity `@user:server:deviceId` (confirmed in the - // MatrixRTC JWT grant), so that's what we need to key on for the - // LKRTCFrameCryptorKeyProvider to route the key to the right - // participant's decoder. + // Register the inbound key under every plausible LiveKit + // participant identity for this peer. 
Which shape LiveKit assigned + // depends on which credential path (legacy or v2) the peer took + // when they joined the SFU — we don't necessarily know that from + // the to-device payload alone, so register under every candidate + // and let the cryptor pick the one whose participantId matches the + // SFU-assigned identity. // - // `member.id` is the MSC4143 per-membership UUID — an *event*-level - // identifier, not a LiveKit participant identity. It only enters the - // fallback chain so older peers that somehow omit the device id still - // get routed. - let participantIdentity: String + // - Legacy (`/sfu/get`): identity = `<sender>:<device_id>`. + // - v2 (`/get_token`): identity = unpadded-base64 SHA-256 of + // `[sender, claimed_device_id, member.id]` per + // `lk-jwt-service/helper.go::LiveKitIdentityFor`. + var participantIdentities: [String] = [] if !deviceId.isEmpty { - participantIdentity = "\(sender):\(deviceId)" - } else if !memberId.isEmpty { - participantIdentity = memberId - } else { - participantIdentity = sender + participantIdentities.append("\(sender):\(deviceId)") + } + if !claimedDeviceId.isEmpty && !memberId.isEmpty { + let v2Identity = CallEncryptionService.liveKitIdentity( + matrixID: sender, + claimedDeviceID: claimedDeviceId, + memberID: memberId + ) + if !v2Identity.isEmpty { + participantIdentities.append(v2Identity) + } + } + if participantIdentities.isEmpty { + // Last-resort fallback — older peers that omit both device_id + // and member.id. Element Call's parser does the same. + participantIdentities.append(memberId.isEmpty ?
sender : memberId) } for entry in keyEntries { @@ -683,18 +737,26 @@ public final class CallWidgetBridge: @unchecked Sendable { let keyData = Data(base64Encoded: base64Key) else { continue } - CallEncryptionService.setRawKey( - keyData, - on: keyProvider, - participantId: participantIdentity, - index: Int32(index) - ) + var setFailures: [String] = [] + for participantIdentity in participantIdentities { + if let reason = CallEncryptionService.setRawKey( + keyData, + on: keyProvider, + participantId: participantIdentity, + index: Int32(index) + ) { + setFailures.append("\(participantIdentity): \(reason)") + } + } + let identitiesJoined = participantIdentities.joined(separator: ", ") + let fp = SHA256.hash(data: keyData).prefix(8).map { String(format: "%02x", $0) }.joined() + let failureNote = setFailures.isEmpty ? "" : " setRawKey failures: \(setFailures.joined(separator: "; "))." Task { @MainActor [weak self] in guard let self else { return } self.activityLog?.log( - category: .call, severity: .debug, source: "CallWidgetBridge", + category: .call, severity: setFailures.isEmpty ? .debug : .warning, source: "CallWidgetBridge", summary: "Received E2EE key from \(sender)", - detail: "Participant: \(participantIdentity), key index: \(index)", + detail: "Routed to LiveKit participantIds: [\(identitiesJoined)]. 
Sender: \(sender), device: \(deviceId), member: \(memberId), index: \(index), sha256[0..8]: \(fp).\(failureNote)", roomId: self.roomId ) } diff --git a/RelayKit/Call/LiveKitCredentialService.swift b/RelayKit/Call/LiveKitCredentialService.swift index edef014..0521235 100644 --- a/RelayKit/Call/LiveKitCredentialService.swift +++ b/RelayKit/Call/LiveKitCredentialService.swift @@ -54,13 +54,15 @@ struct LiveKitCredentialService { activityLog?.log( category: .call, severity: .info, source: "LiveKitCredentialService", summary: "Fetching call credentials", + detail: "Room: \(roomID)", roomId: roomID ) do { - let sfuURL = try await discoverSFUURL() + let sfuURL = try await discoverSFUURL(roomID: roomID) activityLog?.log( category: .call, severity: .debug, source: "LiveKitCredentialService", summary: "SFU URL discovered", + detail: "SFU: \(sfuURL)", roomId: roomID ) let openIDToken = try await requestOpenIDToken() @@ -89,12 +91,21 @@ struct LiveKitCredentialService { // MARK: - Step 1: Discover SFU URL - private func discoverSFUURL() async throws -> String { - // Prefer the MSC4143 transports endpoint + private func discoverSFUURL(roomID: String) async throws -> String { + // Existing-call SFU wins. Per `focus_active.focus_selection == + // "oldest_membership"` (matrix-js-sdk's `MatrixRTCSession`), every + // joiner must converge on the SFU advertised by the *oldest* + // active call membership. If we picked our own homeserver's SFU + // here instead, a federated call would split across two SFUs and + // media never reaches the other side. + if let url = try? await fetchSFUFromCallMembers(roomID: roomID) { + return url + } + // Bootstrap path — we're the first to join. Prefer MSC4143 + // transports endpoint, fall back to `.well-known`. if let url = try? await fetchRTCTransportsURL() { return url } - // Fall back to .well-known if let url = try? 
await fetchWellKnownSFUURL() { return url } @@ -139,6 +150,76 @@ struct LiveKitCredentialService { return first.livekitServiceUrl } + /// Reads `m.call.member` state events on the room and returns the + /// `foci_preferred[].livekit_service_url` advertised by the **oldest** + /// active call membership — the SFU every joiner is supposed to + /// converge on per `focus_active.focus_selection == "oldest_membership"`. + /// + /// Skips tombstoned (empty-content) and expired memberships so a stale + /// leftover doesn't outvote the live participants. Returns + /// `sfuURLNotFound` when nobody is in the call yet, signalling the + /// caller to bootstrap via local discovery. + private func fetchSFUFromCallMembers(roomID: String) async throws -> String { + let base = homeserver.trimmingCharacters(in: .init(charactersIn: "/")) + let encodedRoomID = roomID.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) ?? roomID + guard let url = URL(string: "\(base)/_matrix/client/v3/rooms/\(encodedRoomID)/state") else { + throw LiveKitCredentialError.invalidURL + } + var request = URLRequest(url: url) + request.setValue("Bearer \(accessToken)", forHTTPHeaderField: "Authorization") + + let (data, response) = try await URLSession.shared.data(for: request) + guard let http = response as? HTTPURLResponse, http.statusCode == 200 else { + throw LiveKitCredentialError.serverError + } + + guard let events = try JSONSerialization.jsonObject(with: data) as? [[String: Any]] else { + throw LiveKitCredentialError.sfuURLNotFound + } + + struct Candidate { + let createdTs: Int64 + let sfuURL: String + } + var candidates: [Candidate] = [] + let nowMs = Int64(Date().timeIntervalSince1970 * 1000) + + for event in events { + guard let type = event["type"] as? String, + type == "org.matrix.msc3401.call.member", + let content = event["content"] as? [String: Any], + !content.isEmpty, + let fociPreferred = content["foci_preferred"] as? 
[[String: Any]], + let focus = fociPreferred.first(where: { ($0["type"] as? String) == "livekit" }), + let serviceURL = focus["livekit_service_url"] as? String, + !serviceURL.isEmpty + else { continue } + + // Drop expired memberships. Default `expires` is 4h + // (14400000ms). `created_ts` falls back to event-level + // `origin_server_ts` so very old non-tombstoned events still + // get pruned. + let createdTs = (content["created_ts"] as? NSNumber)?.int64Value + ?? (event["origin_server_ts"] as? NSNumber)?.int64Value + ?? 0 + let expires = (content["expires"] as? NSNumber)?.int64Value ?? 14400000 + if createdTs > 0 && createdTs + expires < nowMs { + continue + } + candidates.append(Candidate(createdTs: createdTs, sfuURL: serviceURL)) + } + + guard let oldest = candidates.min(by: { $0.createdTs < $1.createdTs }) else { + throw LiveKitCredentialError.sfuURLNotFound + } + activityLog?.log( + category: .call, severity: .debug, source: "LiveKitCredentialService", + summary: "Joining existing call SFU (oldest_membership)", + detail: "SFU: \(oldest.sfuURL). Picked from \(candidates.count) active member(s)." + ) + return oldest.sfuURL + } + // MARK: - Step 2: Request OpenID Token private func requestOpenIDToken() async throws -> OpenIDTokenPayload { @@ -167,11 +248,46 @@ struct LiveKitCredentialService { roomID: String, openIDToken: OpenIDTokenPayload ) async throws -> (url: String, token: String) { - // Try the v2 endpoint first, fall back to legacy - if let result = try? await fetchLiveKitTokenV2(sfuURL: sfuURL, roomID: roomID, openIDToken: openIDToken) { - return result + // Try legacy `/sfu/get` first. It assigns LiveKit identity + // `${user}:${device}` — which matches what matrix-js-sdk peers + // (Element Call / Element X / Element Web) compute as + // `rtcBackendIdentity` from our `org.matrix.msc3401.call.member` + // event (see `CallMembership.parseFromEvent` — + // `MembershipKind.Session` branch is the plain-concat form, not the + // hashed v2 form). 
If we use v2 `/get_token` we land on a hashed + // identity that peers reading our legacy session event cannot + // reconcile, breaking video routing. v2 only becomes viable once we + // also publish MSC4143 sticky `m.rtc.member` events. + do { + return try await fetchLiveKitTokenLegacy( + sfuURL: sfuURL, + roomID: roomID, + openIDToken: openIDToken + ) + } catch let legacyError { + logLegacyFailure(legacyError, sfuURL: sfuURL) } - return try await fetchLiveKitTokenLegacy(sfuURL: sfuURL, roomID: roomID, openIDToken: openIDToken) + return try await fetchLiveKitTokenV2(sfuURL: sfuURL, roomID: roomID, openIDToken: openIDToken) + } + + /// Surfaces a `/sfu/get` failure so the fall-forward to v2 is visible + /// after the fact. Format-aware: a + /// `LiveKitCredentialError.tokenExchangeRejected` carries structured + /// detail; anything else falls through to its `localizedDescription`. + private func logLegacyFailure(_ error: Error, sfuURL: String) { + let detail: String + if case let LiveKitCredentialError.tokenExchangeRejected(status, errcode, message, _) = error { + let errcodePart = errcode.map { " \($0)" } ?? "" + let messagePart = message.map { ": \($0)" } ?? "" + detail = "HTTP \(status)\(errcodePart)\(messagePart)" + } else { + detail = error.localizedDescription + } + activityLog?.log( + category: .call, severity: .warning, source: "LiveKitCredentialService", + summary: "Legacy /sfu/get rejected; trying v2", + detail: detail + ) } private func fetchLiveKitTokenV2( @@ -189,19 +305,35 @@ struct LiveKitCredentialService { let body = GetTokenRequest( roomId: roomID, + // Element Call hardcodes "m.call#ROOM" for the application slot + // on the v2 endpoint. lk-jwt-service `SFURequest.Validate()` + // rejects requests where `slot_id` is empty with HTTP 400 + // M_BAD_JSON, which is what forced every previous Relay call + // to silently fall back to legacy `/sfu/get`. 
+ slotId: "m.call#ROOM", openidToken: openIDToken, member: .init(id: "\(userID):\(deviceID)", claimedUserId: userID, claimedDeviceId: deviceID) ) request.httpBody = try JSONEncoder().encode(body) let (data, response) = try await URLSession.shared.data(for: request) - guard let http = response as? HTTPURLResponse, http.statusCode == 200 else { - throw LiveKitCredentialError.tokenExchangeFailed + guard let http = response as? HTTPURLResponse else { + throw LiveKitCredentialError.serverError + } + guard http.statusCode == 200 else { + let (errcode, message) = Self.parseMatrixError(data) + throw LiveKitCredentialError.tokenExchangeRejected( + status: http.statusCode, + errcode: errcode, + message: message, + endpoint: "/get_token" + ) } let decoded = try JSONDecoder().decode(LiveKitTokenResponse.self, from: data) activityLog?.log( category: .call, severity: .debug, source: "LiveKitCredentialService", - summary: "LiveKit credentials obtained via /get_token", + summary: "Credentials obtained via v2 /get_token", + detail: "LiveKit URL: \(decoded.url)", roomId: roomID ) return (decoded.url, decoded.jwt) @@ -224,17 +356,43 @@ struct LiveKitCredentialService { request.httpBody = try JSONEncoder().encode(body) let (data, response) = try await URLSession.shared.data(for: request) - guard let http = response as? HTTPURLResponse, http.statusCode == 200 else { - throw LiveKitCredentialError.tokenExchangeFailed + guard let http = response as? 
HTTPURLResponse else { + throw LiveKitCredentialError.serverError + } + guard http.statusCode == 200 else { + let (errcode, message) = Self.parseMatrixError(data) + throw LiveKitCredentialError.tokenExchangeRejected( + status: http.statusCode, + errcode: errcode, + message: message, + endpoint: "/sfu/get" + ) } let decoded = try JSONDecoder().decode(LiveKitTokenResponse.self, from: data) activityLog?.log( category: .call, severity: .debug, source: "LiveKitCredentialService", - summary: "LiveKit credentials obtained via legacy /sfu/get", + summary: "Credentials obtained via legacy /sfu/get", + detail: "LiveKit URL: \(decoded.url)", roomId: roomID ) return (decoded.url, decoded.jwt) } + + /// Extracts `(errcode, error)` from a Matrix-style error response body. + /// Used to turn lk-jwt-service responses like + /// `{"errcode":"M_BAD_JSON","error":"The request body is missing..."}` + /// into a single human-readable line. Returns `(nil, nil)` if the body + /// isn't a Matrix error envelope. + private static func parseMatrixError(_ data: Data) -> (errcode: String?, message: String?) { + struct MatrixError: Decodable { + let errcode: String? + let error: String? + } + guard let parsed = try? JSONDecoder().decode(MatrixError.self, from: data) else { + return (nil, nil) + } + return (parsed.errcode, parsed.error) + } } // MARK: - Errors @@ -244,7 +402,11 @@ enum LiveKitCredentialError: LocalizedError { case invalidURL case serverError case openIDTokenFailed - case tokenExchangeFailed + /// The LiveKit JWT service rejected our request. Carries the HTTP + /// status, Matrix `errcode`/`error` if present, and which endpoint + /// produced the failure (`/get_token` or `/sfu/get`) so a user + /// support trace can identify both the path taken and the reason. + case tokenExchangeRejected(status: Int, errcode: String?, message: String?, endpoint: String) var errorDescription: String? 
{ switch self { @@ -257,8 +419,10 @@ enum LiveKitCredentialError: LocalizedError { return "The homeserver returned an error while fetching call credentials." case .openIDTokenFailed: return "Failed to obtain an OpenID token from the homeserver." - case .tokenExchangeFailed: - return "The call server rejected the credential exchange." + case .tokenExchangeRejected(let status, let errcode, let message, let endpoint): + let errcodePart = errcode.map { " \($0)" } ?? "" + let messagePart = message.map { ": \($0)" } ?? "" + return "Call server rejected \(endpoint) with HTTP \(status)\(errcodePart)\(messagePart)" } } } @@ -308,6 +472,7 @@ struct OpenIDTokenPayload: Codable { private struct GetTokenRequest: Encodable { let roomId: String + let slotId: String let openidToken: OpenIDTokenPayload let member: Member struct Member: Encodable { @@ -322,6 +487,7 @@ private struct GetTokenRequest: Encodable { } enum CodingKeys: String, CodingKey { case roomId = "room_id" + case slotId = "slot_id" case openidToken = "openid_token" case member } diff --git a/RelayKit/Services/RoomListManager.swift b/RelayKit/Services/RoomListManager.swift index b2cc28e..c9b0d2a 100644 --- a/RelayKit/Services/RoomListManager.swift +++ b/RelayKit/Services/RoomListManager.swift @@ -532,6 +532,7 @@ private final class RoomEntry: Identifiable { summary.pinnedEventIds = info.pinnedEventIds summary.isFavourite = info.isFavourite summary.successorRoomId = info.successorRoom?.roomId + summary.hasRoomCall = info.hasRoomCall // Map SDK membership to RelayInterface type switch info.membership {