From 60232f55db2fa27488574f6d2aeaf7f9f16300db Mon Sep 17 00:00:00 2001 From: Max Heimbrock <43608204+MaxHeimbrock@users.noreply.github.com> Date: Fri, 3 Jul 2026 16:12:39 +0200 Subject: [PATCH] Surface FFI panic events instead of silently dropping them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit FfiEvent.Panic is the FFI layer's unrecoverable-error notification: a background task died (watch_panic), a request handler panicked, or the FFI hit a state error with no callback left to carry it (e.g. the ready-handshake timeout force-closing a room after ConnectCallback). Unity's DispatchEvent had `case Panic: break` — a stub from an early protocol sync — so all of these were fully silent: rooms kept looking connected while their event pipeline was dead, and in-flight requests hung forever. DispatchEvent now logs the panic, cancels all pending callbacks (so awaiting instructions resolve with IsError instead of hanging; done before raising PanicReceived so requests issued by reacting user handlers are not swept up), and raises a new internal FfiClient.PanicReceived event. Each connected Room subscribes and tears itself down through the disconnect path apps already handle: Disconnected / DisconnectedWithReason(UnknownReason) + cleanup. Note release FFI binaries build with panic=abort, so genuine Rust panics crash the process before an event can be sent; this wiring covers debug builds and the deliberate send_panic error paths, which include the ready-timeout zombie-room case. Co-Authored-By: Claude Fable 5 --- Runtime/Scripts/Core/Room.cs | 14 +++ Runtime/Scripts/Internal/FFI/FFIClient.cs | 10 ++ Runtime/Scripts/Internal/FFI/FFIEvents.cs | 2 + Tests/EditMode/PanicEventTests.cs | 101 ++++++++++++++++++ Tests/EditMode/PanicEventTests.cs.meta | 11 ++ Tests/PlayMode/PanicRoomTeardownTests.cs | 65 +++++++++++ Tests/PlayMode/PanicRoomTeardownTests.cs.meta | 11 ++ 7 files changed, 214 insertions(+) create mode 100644 Tests/EditMode/PanicEventTests.cs create mode 100644 Tests/EditMode/PanicEventTests.cs.meta create mode 100644 Tests/PlayMode/PanicRoomTeardownTests.cs create mode 100644 Tests/PlayMode/PanicRoomTeardownTests.cs.meta diff --git a/Runtime/Scripts/Core/Room.cs b/Runtime/Scripts/Core/Room.cs index eec304c3..b2422334 100644 --- a/Runtime/Scripts/Core/Room.cs +++ b/Runtime/Scripts/Core/Room.cs @@ -220,6 +220,7 @@ private void Cleanup() FfiClient.Instance.RoomEventReceived -= OnEventReceived; FfiClient.Instance.RpcMethodInvocationReceived -= OnRpcMethodInvocationReceived; FfiClient.Instance.DisconnectReceived -= OnDisconnectReceived; + FfiClient.Instance.PanicReceived -= OnPanicReceived; // Participant + track + publication FFI handles are independent entries in the // Rust handle table — dropping the room handle alone does not cascade to them, so @@ -598,6 +599,7 @@ internal void OnConnect(ConnectCallback info) FfiClient.Instance.RoomEventReceived += OnEventReceived; FfiClient.Instance.DisconnectReceived += OnDisconnectReceived; + FfiClient.Instance.PanicReceived += OnPanicReceived; FfiClient.Instance.RpcMethodInvocationReceived += OnRpcMethodInvocationReceived; // Signal Rust that listeners are installed and it can start forwarding room events. @@ -619,6 +621,18 @@ private void OnDisconnectReceived(DisconnectCallback e) Utils.Debug($"OnDisconnect.... {e}"); } + private void OnPanicReceived(Panic e) + { + // A panic means the FFI layer's background tasks may have died: this + // room could silently stop receiving events (including Disconnected + // itself), so the panic is surfaced through the disconnect path apps + // already handle. + DisconnectReason = DisconnectReason.UnknownReason; + Disconnected?.Invoke(this); + DisconnectedWithReason?.Invoke(this, DisconnectReason); + OnDisconnect(); + } + private void OnDisconnect() { Cleanup(); diff --git a/Runtime/Scripts/Internal/FFI/FFIClient.cs b/Runtime/Scripts/Internal/FFI/FFIClient.cs index 61dc2b71..8d360ea6 100644 --- a/Runtime/Scripts/Internal/FFI/FFIClient.cs +++ b/Runtime/Scripts/Internal/FFI/FFIClient.cs @@ -50,6 +50,7 @@ internal sealed class FfiClient : IFFIClient private readonly ConcurrentDictionary pendingCallbacks = new(); public event DisconnectReceivedDelegate? DisconnectReceived; + public event PanicReceivedDelegate? PanicReceived; public event RoomEventReceivedDelegate? RoomEventReceived; public event TrackEventReceivedDelegate? TrackEventReceived; public event RpcMethodInvocationReceivedDelegate? RpcMethodInvocationReceived; @@ -466,6 +467,15 @@ private static void DispatchEvent(FfiEvent ffiEvent) Instance.DataTrackStreamEventReceived?.Invoke(ffiEvent.DataTrackStreamEvent!); break; case FfiEvent.MessageOneofCase.Panic: + // The FFI layer declares its state unrecoverable after a panic: + // background tasks may have died, so rooms can silently stop + // receiving events and in-flight requests may never complete. + // Pending callbacks are cancelled before PanicReceived so that + // requests issued by user handlers reacting to the panic (e.g. a + // reconnect attempt from a Disconnected handler) are not swept up. + Utils.Error($"FFI panic: {ffiEvent.Panic.Message} — native SDK state is unrecoverable; cancelling pending requests and disconnecting rooms"); + Instance.ClearPendingCallbacks(); + Instance.PanicReceived?.Invoke(ffiEvent.Panic); break; default: break; diff --git a/Runtime/Scripts/Internal/FFI/FFIEvents.cs b/Runtime/Scripts/Internal/FFI/FFIEvents.cs index 8e182a59..e2794147 100644 --- a/Runtime/Scripts/Internal/FFI/FFIEvents.cs +++ b/Runtime/Scripts/Internal/FFI/FFIEvents.cs @@ -20,6 +20,8 @@ namespace LiveKit.Internal.FFI internal delegate void DisconnectReceivedDelegate(DisconnectCallback e); + internal delegate void PanicReceivedDelegate(Panic e); + internal delegate void GetSessionStatsDelegate(GetStatsCallback e); internal delegate void SetLocalMetadataReceivedDelegate(SetLocalMetadataCallback e); diff --git a/Tests/EditMode/PanicEventTests.cs b/Tests/EditMode/PanicEventTests.cs new file mode 100644 index 00000000..6056e72b --- /dev/null +++ b/Tests/EditMode/PanicEventTests.cs @@ -0,0 +1,101 @@ +using System; +using System.Collections.Generic; +using System.Text.RegularExpressions; +using System.Threading; +using LiveKit.Proto; +using NUnit.Framework; +using UnityEngine; +using UnityEngine.TestTools; + +using LiveKit.Internal.FFI; +namespace LiveKit.EditModeTests +{ + public class PanicEventTests + { + private sealed class RecordingSyncContext : SynchronizationContext + { + public readonly List<(SendOrPostCallback callback, object state)> Posts = new(); + public override void Post(SendOrPostCallback d, object state) + { + Posts.Add((d, state)); + } + public void DrainOnce() + { + foreach (var (cb, st) in Posts) cb(st); + Posts.Clear(); + } + } + + // Seeded in a different high-bit range than SkipDispatchTests so the two + // fixtures cannot collide in FfiClient.Instance's shared pendingCallbacks map. + private static long _asyncIdSeed = 0x7FE0_0000_0000_0000L; + private static ulong NextAsyncId() => (ulong)Interlocked.Increment(ref _asyncIdSeed); + + [Test] + public void RouteFfiEvent_Panic_RaisesPanicReceivedOnMainThread_AndCancelsPendingCallbacks() + { + var asyncId = NextAsyncId(); + var completed = false; + var canceled = false; + + // Stands in for any in-flight async request (e.g. a pending + // ConnectInstruction). After a panic its Rust-side task may be dead, + // so it must be cancelled rather than left to hang forever. + FfiClient.Instance.RegisterPendingCallback( + asyncId, + static e => e.UnpublishTrack, + cb => { completed = true; }, + onCancel: () => { canceled = true; }); + + string receivedMessage = null; + PanicReceivedDelegate handler = e => receivedMessage = e.Message; + FfiClient.Instance.PanicReceived += handler; + + var recording = new RecordingSyncContext(); + var originalContext = FfiClient.Instance._context; + FfiClient.Instance._context = recording; + try + { + LogAssert.Expect(LogType.Error, new Regex("FFI panic")); + + var ev = new FfiEvent { Panic = new Panic { Message = "test panic from FFI" } }; + var dispatcher = new Thread(() => FfiClient.RouteFfiEvent(ev)); + dispatcher.Start(); + Assert.IsTrue(dispatcher.Join(TimeSpan.FromSeconds(2)), + "Dispatcher thread did not finish within 2s."); + + // Panic handling fires user-facing events (room teardown), so it must + // be marshalled to the main thread, not run on the FFI callback thread. + Assert.AreEqual(1, recording.Posts.Count, + "RouteFfiEvent should post the panic to the main-thread sync context."); + Assert.IsNull(receivedMessage, + "PanicReceived ran on the FFI callback thread instead of the main-thread drain."); + Assert.IsFalse(canceled, + "Pending callbacks were cancelled before the main-thread drain."); + + recording.DrainOnce(); + + Assert.AreEqual("test panic from FFI", receivedMessage, + "PanicReceived did not fire with the panic message."); + Assert.IsTrue(canceled, + "Pending callbacks were not cancelled on panic — awaiting instructions would hang forever."); + Assert.IsFalse(completed, + "The pending callback completed instead of being cancelled."); + + // The pending entry must be gone: a late callback for it is a no-op. + var lateCallback = new FfiEvent + { + UnpublishTrack = new UnpublishTrackCallback { AsyncId = asyncId } + }; + Assert.IsFalse(FfiClient.Instance.TryDispatchPendingCallback(asyncId, lateCallback), + "The cancelled pending entry is still registered."); + } + finally + { + FfiClient.Instance._context = originalContext; + FfiClient.Instance.PanicReceived -= handler; + FfiClient.Instance.CancelPendingCallback(asyncId); + } + } + } +} diff --git a/Tests/EditMode/PanicEventTests.cs.meta b/Tests/EditMode/PanicEventTests.cs.meta new file mode 100644 index 00000000..47cf7f1f --- /dev/null +++ b/Tests/EditMode/PanicEventTests.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 43df0c93530a4c698356db201dd39859 +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: diff --git a/Tests/PlayMode/PanicRoomTeardownTests.cs b/Tests/PlayMode/PanicRoomTeardownTests.cs new file mode 100644 index 00000000..a8c20019 --- /dev/null +++ b/Tests/PlayMode/PanicRoomTeardownTests.cs @@ -0,0 +1,65 @@ +using System.Collections; +using System.Text.RegularExpressions; +using LiveKit.Internal.FFI; +using LiveKit.PlayModeTests.Utils; +using LiveKit.Proto; +using NUnit.Framework; +using UnityEngine; +using UnityEngine.TestTools; + +namespace LiveKit.PlayModeTests +{ + /// + /// An FfiEvent.Panic means the FFI layer's state is unrecoverable — its + /// background tasks may have died, leaving rooms that silently stop + /// receiving events (they would not even get a Disconnected event). The + /// SDK must convert that into an observable disconnect on every live + /// room instead of dropping the event. + /// + public class PanicRoomTeardownTests + { + const float DisconnectTimeoutSeconds = 10f; + + [UnityTest, Category("E2E")] + public IEnumerator Panic_DisconnectsConnectedRoom() + { + var options = TestRoomContext.ConnectionOptions.Default; + options.Identity = "panic-teardown"; + using var context = new TestRoomContext(options); + + yield return context.ConnectRoom(0); + Assert.IsNull(context.ConnectionError, context.ConnectionError); + + var room = context.Rooms[0]; + Room disconnectedRoom = null; + DisconnectReason? reason = null; + room.Disconnected += r => disconnectedRoom = r; + room.DisconnectedWithReason += (r, dr) => reason = dr; + + LogAssert.Expect(LogType.Error, new Regex("FFI panic")); + + // Inject a synthetic panic through the same entry point the native + // callback uses; it is posted to the main thread like real events. + FfiClient.RouteFfiEvent(new FfiEvent + { + Panic = new Panic { Message = "synthetic panic (test)" } + }); + + var disconnected = new Expectation( + predicate: () => disconnectedRoom != null, + timeoutSeconds: DisconnectTimeoutSeconds); + yield return disconnected.Wait(); + + Assert.IsNull(disconnected.Error, + "Room did not raise Disconnected after an FFI panic event."); + Assert.AreSame(room, disconnectedRoom, + "Disconnected fired for a different room instance."); + Assert.AreEqual(DisconnectReason.UnknownReason, reason, + "DisconnectedWithReason did not carry the expected reason."); + Assert.AreEqual(DisconnectReason.UnknownReason, room.DisconnectReason, + "Room.DisconnectReason was not set by the panic teardown."); + Assert.IsFalse(room.IsConnected, + "Room still reports IsConnected after the panic teardown."); + } + } +} diff --git a/Tests/PlayMode/PanicRoomTeardownTests.cs.meta b/Tests/PlayMode/PanicRoomTeardownTests.cs.meta new file mode 100644 index 00000000..6f8b34d4 --- /dev/null +++ b/Tests/PlayMode/PanicRoomTeardownTests.cs.meta @@ -0,0 +1,11 @@ +fileFormatVersion: 2 +guid: 4a0a0a2e224f4d30ad6c86b4e578324d +MonoImporter: + externalObjects: {} + serializedVersion: 2 + defaultReferences: [] + executionOrder: 0 + icon: {instanceID: 0} + userData: + assetBundleName: + assetBundleVariant: