From 00c278c48c038f2dc8a73df39b7777e159492521 Mon Sep 17 00:00:00 2001 From: Alex Tercete Date: Thu, 28 May 2026 17:27:27 +0100 Subject: [PATCH 1/4] remote/coordinator: report acquire and release failures Replace boolean success/failure handling in the resource acquire and release helpers with CoordinatorError exceptions. This lets AcquirePlace and ReleasePlace return coordinator-side failure reasons to the client. This makes coordinator-side acquire/release failures actionable for users instead of only reporting a generic failed operation. Signed-off-by: Alex Tercete Reviewed-by: Alex Tercete # gatekeeper Co-authored-by: Idan Saadon --- labgrid/remote/coordinator.py | 41 +++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/labgrid/remote/coordinator.py b/labgrid/remote/coordinator.py index ea60f4933..af3c02a07 100644 --- a/labgrid/remote/coordinator.py +++ b/labgrid/remote/coordinator.py @@ -205,6 +205,10 @@ async def wait(self): self.expired = True +class CoordinatorError(Exception): + pass + + class ExporterError(Exception): pass @@ -664,7 +668,7 @@ async def _acquire_resources(self, place, resources): # all resources need to be free for resource in resources: if resource.acquired: - return False + raise CoordinatorError(f"{resource} is already acquired by {resource.acquired}") for otherplace in self.places.values(): for oldres in otherplace.acquired_resources: @@ -672,7 +676,9 @@ async def _acquire_resources(self, place, resources): logging.info( "Conflicting orphaned resource %s for acquire request for place %s", oldres, place.name ) - return False + raise CoordinatorError( + f"conflicting orphaned resource {oldres} for acquire request for place {place.name}" + ) # acquire resources acquired = [] @@ -683,8 +689,11 @@ async def _acquire_resources(self, place, resources): except Exception: logging.exception("failed to acquire %s", resource) # cleanup - await self._release_resources(place, acquired) - return False + try: + await self._release_resources(place, acquired) + except CoordinatorError: + logging.exception("failed to release acquired resources during acquire cleanup") + raise CoordinatorError() for resource in resources: place.acquired_resources.append(resource) @@ -702,6 +711,7 @@ async def _release_resources(self, place, resources, callback=True): except ValueError: pass + failed = False for resource in resources: if resource.orphaned: continue @@ -721,12 +731,15 @@ async def _release_resources(self, place, resources, callback=True): if resource.acquired: logging.warning("resource %s still acquired after release request", resource) except (ExporterError, TimeoutError): + failed = True logging.exception("failed to release %s", resource) # at leaset try to notify the clients try: self._publish_resource(resource) except: logging.exception("failed to publish released resource %s", resource) + if failed: + raise CoordinatorError(f"failed to release resources for place {place.name}") async def _synchronize_resources(self): assert self.lock.locked() @@ -878,10 +891,18 @@ async def AcquirePlace(self, request, context): if not place.hasmatch(resource.path): continue resources.append(resource) - if not await self._acquire_resources(place, resources): + try: + await self._acquire_resources(place, resources) + except CoordinatorError as e: # revert earlier change place.acquired = None - await context.abort(grpc.StatusCode.FAILED_PRECONDITION, f"Failed to acquire resources for place {name}") + message = f"Failed to acquire resources for place {name}" + if str(e): + message += f": {e}" + await context.abort( + grpc.StatusCode.FAILED_PRECONDITION, + message, + ) place.touch() self._publish_place(place) self.save_later() @@ -905,7 +926,13 @@ async def ReleasePlace(self, request, context): if fromuser and place.acquired != fromuser: return labgrid_coordinator_pb2.ReleasePlaceResponse() - await self._release_resources(place, place.acquired_resources) + try: + await self._release_resources(place, place.acquired_resources) + except CoordinatorError as e: + message = f"Failed to release resources for place {name}" + if str(e): + message += f": {e}" + await context.abort(grpc.StatusCode.FAILED_PRECONDITION, message) place.acquired = None place.allowed = set() From 2fe34c5f0d1d5d67f2f186b31c149f221736cd9a Mon Sep 17 00:00:00 2001 From: Alex Tercete Date: Thu, 28 May 2026 17:27:48 +0100 Subject: [PATCH 2/4] remote/coordinator: preserve exporter failure reasons Signed-off-by: Alex Tercete Reviewed-by: Alex Tercete # gatekeeper Co-authored-by: Idan Saadon --- labgrid/remote/coordinator.py | 21 +++++++++++++-------- labgrid/remote/exporter.py | 5 ++++- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/labgrid/remote/coordinator.py b/labgrid/remote/coordinator.py index af3c02a07..bd6565197 100644 --- a/labgrid/remote/coordinator.py +++ b/labgrid/remote/coordinator.py @@ -209,7 +209,7 @@ class CoordinatorError(Exception): pass -class ExporterError(Exception): +class ExporterError(CoordinatorError): pass @@ -686,14 +686,16 @@ async def _acquire_resources(self, place, resources): for resource in resources: await self._acquire_resource(place, resource) acquired.append(resource) - except Exception: + except Exception as e: logging.exception("failed to acquire %s", resource) # cleanup try: await self._release_resources(place, acquired) except CoordinatorError: logging.exception("failed to release acquired resources during acquire cleanup") - raise CoordinatorError() + if isinstance(e, CoordinatorError): + raise + raise CoordinatorError(f"failed to acquire {resource}") from e for resource in resources: place.acquired_resources.append(resource) @@ -711,7 +713,7 @@ async def _release_resources(self, place, resources, callback=True): except ValueError: pass - failed = False + failure = None for resource in resources: if resource.orphaned: continue @@ -730,16 +732,19 @@ async def _release_resources(self, place, resources, callback=True): raise ExporterError(f"failed to release {resource} ({cmd.response.reason})") if resource.acquired: logging.warning("resource %s still acquired after release request", resource) - except (ExporterError, TimeoutError): - failed = True + except (ExporterError, TimeoutError) as e: + if failure is None or isinstance(e, ExporterError): + failure = e logging.exception("failed to release %s", resource) # at leaset try to notify the clients try: self._publish_resource(resource) except: logging.exception("failed to publish released resource %s", resource) - if failed: - raise CoordinatorError(f"failed to release resources for place {place.name}") + if failure: + if isinstance(failure, ExporterError): + raise failure + raise CoordinatorError(f"failed to release resources for place {place.name}") from failure async def _synchronize_resources(self): assert self.lock.locked() diff --git a/labgrid/remote/exporter.py b/labgrid/remote/exporter.py index 68c4fa708..c94dc8991 100755 --- a/labgrid/remote/exporter.py +++ b/labgrid/remote/exporter.py @@ -926,8 +926,11 @@ async def message_pump(self): ) success = True except (BrokenResourceError, InvalidResourceRequestError, UnknownResourceError) as e: - reason = e.args[0] + reason = str(e) logging.warning("set_acquired_request failed: %s", reason) + except Exception as e: + reason = str(e) or repr(e) + logging.exception("failed to handle set_acquired_request: %s", reason) finally: in_message = labgrid_coordinator_pb2.ExporterInMessage() in_message.response.success = success From d69c3a9e7ebbc9c766f43fa9848ffd78cb180586 Mon Sep 17 00:00:00 2001 From: Alex Tercete Date: Thu, 28 May 2026 17:28:07 +0100 Subject: [PATCH 3/4] remote/coordinator: report exporter command timeouts Convert exporter acquire/release command timeouts into ExporterError failures. This returns a clear timeout reason to the client instead of relying on an empty TimeoutError message. Signed-off-by: Alex Tercete Reviewed-by: Alex Tercete # gatekeeper Co-authored-by: Idan Saadon --- labgrid/remote/coordinator.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/labgrid/remote/coordinator.py b/labgrid/remote/coordinator.py index bd6565197..774f615ea 100644 --- a/labgrid/remote/coordinator.py +++ b/labgrid/remote/coordinator.py @@ -655,7 +655,10 @@ async def _acquire_resource(self, place, resource): request.place_name = place.name cmd = ExporterCommand(request) self.get_exporter_by_name(resource.path[0]).queue.put_nowait(cmd) - await cmd.wait() + try: + await cmd.wait() + except asyncio.TimeoutError as e: + raise ExporterError("timed out waiting for exporter while acquiring resource") from e if not cmd.response.success: raise ExporterError(f"failed to acquire {resource} ({cmd.response.reason})") if resource.acquired != place.name: @@ -727,13 +730,16 @@ async def _release_resources(self, place, resources, callback=True): # request.place_name is left unset to indicate release cmd = ExporterCommand(request) self.get_exporter_by_name(resource.path[0]).queue.put_nowait(cmd) - await cmd.wait() + try: + await cmd.wait() + except asyncio.TimeoutError as e: + raise ExporterError("timed out waiting for exporter while releasing resource") from e if not cmd.response.success: raise ExporterError(f"failed to release {resource} ({cmd.response.reason})") if resource.acquired: logging.warning("resource %s still acquired after release request", resource) - except (ExporterError, TimeoutError) as e: - if failure is None or isinstance(e, ExporterError): + except ExporterError as e: + if failure is None: failure = e logging.exception("failed to release %s", resource) # at leaset try to notify the clients From 205371678b19d5b05eade328f1a23505f1c817fd Mon Sep 17 00:00:00 2001 From: Alex Tercete Date: Thu, 28 May 2026 17:28:35 +0100 Subject: [PATCH 4/4] remote/coordinator: reduce exporter failure details Avoid including the full resource representation in exporter acquire/release failure messages returned to the client. The exporter-provided reason is enough for users, while the full resource details can be noisy and may expose more information than needed. Signed-off-by: Alex Tercete Reviewed-by: Alex Tercete # gatekeeper Co-authored-by: Idan Saadon --- labgrid/remote/coordinator.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/labgrid/remote/coordinator.py b/labgrid/remote/coordinator.py index 774f615ea..edad1bec0 100644 --- a/labgrid/remote/coordinator.py +++ b/labgrid/remote/coordinator.py @@ -660,7 +660,7 @@ async def _acquire_resource(self, place, resource): except asyncio.TimeoutError as e: raise ExporterError("timed out waiting for exporter while acquiring resource") from e if not cmd.response.success: - raise ExporterError(f"failed to acquire {resource} ({cmd.response.reason})") + raise ExporterError(cmd.response.reason or "exporter returned failure without a reason") if resource.acquired != place.name: logging.warning("resource %s not acquired by this place after acquire request", resource) @@ -735,7 +735,7 @@ async def _release_resources(self, place, resources, callback=True): except asyncio.TimeoutError as e: raise ExporterError("timed out waiting for exporter while releasing resource") from e if not cmd.response.success: - raise ExporterError(f"failed to release {resource} ({cmd.response.reason})") + raise ExporterError(cmd.response.reason or "exporter returned failure without a reason") if resource.acquired: logging.warning("resource %s still acquired after release request", resource) except ExporterError as e: