From 503f0136999f22f5a02748e7fe058a8409f6abe0 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 18:10:40 +0000 Subject: [PATCH 1/5] ref: Stop arroyo ChildProcessTerminated from creating Sentry issues A multiprocessing worker in a consumer being killed by a signal (almost always an OOM-kill of the worker, sometimes a native crash) makes arroyo raise ChildProcessTerminated. arroyo logs it at ERROR via logger.exception and then re-raises it, so a single worker death surfaces as several ERROR-level Sentry issues (SNUBA-BA7/BA8/BA9/BAD/B1T) -- four from the "Caught exception, shutting down..." ERROR log (grouped by wherever the SIGCHLD interrupted the run loop) plus one from the re-raised, unhandled exception. The consumer simply shuts down and is restarted by the orchestrator, so these are transient/operational noise, not actionable bugs. These are ERROR-level, so the WARN->log policy from #8077 does not cover them. Filter them by exception type in before_send (an isinstance check on the cause/context chain -- the robust, non-string approach), alongside the existing AllocationPolicyViolations / RPCAllocationPolicyException filtering. The worker death is still observable via logs and arroyo's sigchld.detected metric. Fixes SNUBA-BA7 Fixes SNUBA-BA8 Fixes SNUBA-BA9 Fixes SNUBA-BAD Fixes SNUBA-B1T Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NmYA6zVesfHV8aXRSFGkjB --- snuba/environment.py | 56 +++++++++++++++++++++++------- tests/test_environment.py | 73 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 117 insertions(+), 12 deletions(-) create mode 100644 tests/test_environment.py diff --git a/snuba/environment.py b/snuba/environment.py index b8311e2754..ab3d781308 100644 --- a/snuba/environment.py +++ b/snuba/environment.py @@ -78,21 +78,53 @@ def setup_logging(level: Optional[str] = None) -> None: def before_send(event: Event, hint: Hint) -> Event | None: - """Filter out AllocationPolicyViolations and RPCAllocationPolicyException from being sent to Sentry""" - if "exc_info" in hint: - _, exc_value, _ = hint["exc_info"] - # Check if it's an AllocationPolicyViolations in the cause chain - if exc_value is not None: - from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException + """ + Drop transient/operational exceptions that recover on their own and aren't + actionable as Sentry issues. They are still captured as logs/breadcrumbs. + + - ``AllocationPolicyViolations`` / ``RPCAllocationPolicyException``: expected + query rejections, not bugs. + - arroyo ``ChildProcessTerminated``: a multiprocessing worker in a consumer + was killed by a signal (almost always an OOM-kill of the worker, sometimes + a native crash). arroyo logs this at ERROR via ``logger.exception`` and + then re-raises it, so a single worker death surfaces as several ERROR-level + Sentry issues (e.g. SNUBA-BA7/BA8/BA9/BAD/B1T, grouped by wherever the + SIGCHLD happened to interrupt the run loop) even though the consumer simply + shuts down and is restarted by the orchestrator. Because these are + ERROR-level they are not covered by the WARN->log policy, so filter them by + type here. The underlying worker death is still observable via logs and + arroyo's ``sigchld.detected`` metric. + """ + if "exc_info" not in hint: + return event - if isinstance(exc_value, RPCAllocationPolicyException): - return None # Don't send to Sentry + _, exc_value, _ = hint["exc_info"] + if exc_value is None: + return event + + from arroyo.processing.strategies.run_task_with_multiprocessing import ( + ChildProcessTerminated, + ) + + from snuba.query.allocation_policies import AllocationPolicyViolations + from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException + + noise_types = ( + RPCAllocationPolicyException, + AllocationPolicyViolations, + ChildProcessTerminated, + ) - cause = getattr(exc_value, "__cause__", None) - from snuba.query.allocation_policies import AllocationPolicyViolations + # Walk the exception chain (the exception itself plus its __cause__ / + # __context__) and drop the event if any link is a known noise type. + seen: set[int] = set() + exc: Optional[BaseException] = exc_value + while exc is not None and id(exc) not in seen: + seen.add(id(exc)) + if isinstance(exc, noise_types): + return None # Don't send to Sentry + exc = exc.__cause__ or exc.__context__ - if isinstance(cause, AllocationPolicyViolations): - return None # Don't send to Sentry return event diff --git a/tests/test_environment.py b/tests/test_environment.py new file mode 100644 index 0000000000..7b5329f2fb --- /dev/null +++ b/tests/test_environment.py @@ -0,0 +1,73 @@ +from arroyo.processing.strategies.run_task_with_multiprocessing import ( + ChildProcessTerminated, +) + +from snuba.environment import before_send +from snuba.query.allocation_policies import AllocationPolicyViolations +from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException + + +def _hint_for(exc: BaseException) -> dict: + return {"exc_info": (type(exc), exc, exc.__traceback__)} + + +def test_before_send_passes_through_without_exc_info() -> None: + event = {"message": "hello"} + assert before_send(dict(event), {}) == event + + +def test_before_send_passes_through_unrelated_exception() -> None: + try: + raise ValueError("a real bug") + except ValueError as err: + assert before_send({"message": "boom"}, _hint_for(err)) is not None + + +def test_before_send_drops_child_process_terminated() -> None: + """ + A consumer multiprocessing worker dying (SIGCHLD) is logged at ERROR and + re-raised by arroyo; it recovers on restart and is not an actionable issue. + """ + try: + raise ChildProcessTerminated(17) + except ChildProcessTerminated as err: + assert before_send({"message": "Caught exception, shutting down..."}, _hint_for(err)) is None + + +def test_before_send_drops_child_process_terminated_in_cause_chain() -> None: + try: + try: + raise ChildProcessTerminated(17) + except ChildProcessTerminated as inner: + raise RuntimeError("wrapped") from inner + except RuntimeError as err: + assert before_send({"message": "wrapped"}, _hint_for(err)) is None + + +def test_before_send_drops_allocation_policy_violations() -> None: + try: + raise AllocationPolicyViolations("rejected") + except AllocationPolicyViolations as err: + assert before_send({"message": "rejected"}, _hint_for(err)) is None + + +def test_before_send_drops_rpc_allocation_policy_exception() -> None: + try: + raise RPCAllocationPolicyException("rejected") + except RPCAllocationPolicyException as err: + assert before_send({"message": "rejected"}, _hint_for(err)) is None + + +def test_before_send_handles_none_exc_value() -> None: + event = {"message": "no exc"} + assert before_send(dict(event), {"exc_info": (None, None, None)}) == event + + +def test_before_send_terminates_on_cyclic_cause_chain() -> None: + # Defensive: a self-referential context must not loop forever. + err = ValueError("self") + try: + raise err + except ValueError: + err.__context__ = err + assert before_send({"message": "cycle"}, _hint_for(err)) is not None From 92c513c3f8a163000736d9be07c886a4e2927704 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 18:13:49 +0000 Subject: [PATCH 2/5] ref: respect __suppress_context__ when walking exception chain in before_send After `raise X from None`, Python keeps the prior exception on __context__ but sets __suppress_context__ = True. The chain walk used `__cause__ or __context__`, which followed that suppressed context and could drop a legitimate Sentry event whose explicitly-suppressed context happened to contain a noise type. Follow the chain the way CPython's own traceback printer does: explicit cause wins, else the implicit context unless it was suppressed. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NmYA6zVesfHV8aXRSFGkjB --- snuba/environment.py | 12 +++++++++++- tests/test_environment.py | 17 +++++++++++++++++ 2 files changed, 28 insertions(+), 1 deletion(-) diff --git a/snuba/environment.py b/snuba/environment.py index ab3d781308..ee6719b98d 100644 --- a/snuba/environment.py +++ b/snuba/environment.py @@ -123,7 +123,17 @@ def before_send(event: Event, hint: Hint) -> Event | None: seen.add(id(exc)) if isinstance(exc, noise_types): return None # Don't send to Sentry - exc = exc.__cause__ or exc.__context__ + # Follow the chain the way Python itself displays it: an explicit cause + # (`raise ... from other`) wins, otherwise the implicit context -- unless + # it was suppressed via `raise ... from None`, in which case we must not + # follow __context__ or we could drop an unrelated event whose suppressed + # context happens to contain a noise type. + if exc.__cause__ is not None: + exc = exc.__cause__ + elif not exc.__suppress_context__: + exc = exc.__context__ + else: + exc = None return event diff --git a/tests/test_environment.py b/tests/test_environment.py index 7b5329f2fb..375a9e3eb3 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -44,6 +44,23 @@ def test_before_send_drops_child_process_terminated_in_cause_chain() -> None: assert before_send({"message": "wrapped"}, _hint_for(err)) is None +def test_before_send_keeps_event_when_noise_is_in_suppressed_context() -> None: + """ + `raise ... from None` keeps the prior exception on __context__ but suppresses + it; we must not follow it, otherwise a legitimate error would be dropped just + because a noise type sits in its (explicitly suppressed) context. + """ + try: + try: + raise ChildProcessTerminated(17) + except ChildProcessTerminated: + raise ValueError("a real bug") from None + except ValueError as err: + assert err.__suppress_context__ is True + assert isinstance(err.__context__, ChildProcessTerminated) + assert before_send({"message": "a real bug"}, _hint_for(err)) is not None + + def test_before_send_drops_allocation_policy_violations() -> None: try: raise AllocationPolicyViolations("rejected") From 667868708e177dad29df40ecbee60d19c04ea6c3 Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 18:15:15 +0000 Subject: [PATCH 3/5] test: fix RPCAllocationPolicyException instantiation in before_send test RPCAllocationPolicyException.__init__ requires (message, routing_decision_dict). The test passed only the message, which would raise TypeError before reaching before_send. Pass an empty routing_decision_dict. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NmYA6zVesfHV8aXRSFGkjB --- tests/test_environment.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_environment.py b/tests/test_environment.py index 375a9e3eb3..2b785ca252 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -70,7 +70,7 @@ def test_before_send_drops_allocation_policy_violations() -> None: def test_before_send_drops_rpc_allocation_policy_exception() -> None: try: - raise RPCAllocationPolicyException("rejected") + raise RPCAllocationPolicyException("rejected", {}) except RPCAllocationPolicyException as err: assert before_send({"message": "rejected"}, _hint_for(err)) is None From 31d225a3a3df5019bb3fc4d2d54e859d99f085a7 Mon Sep 17 00:00:00 2001 From: "getsantry[bot]" <66042841+getsantry[bot]@users.noreply.github.com> Date: Tue, 23 Jun 2026 18:23:19 +0000 Subject: [PATCH 4/5] [getsentry/action-github-commit] Auto commit --- tests/test_environment.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_environment.py b/tests/test_environment.py index 2b785ca252..75db8c6ee5 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -31,7 +31,9 @@ def test_before_send_drops_child_process_terminated() -> None: try: raise ChildProcessTerminated(17) except ChildProcessTerminated as err: - assert before_send({"message": "Caught exception, shutting down..."}, _hint_for(err)) is None + assert ( + before_send({"message": "Caught exception, shutting down..."}, _hint_for(err)) is None + ) def test_before_send_drops_child_process_terminated_in_cause_chain() -> None: From 741fecca6295f734ba2f823bc543d6e01c49e37a Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 23 Jun 2026 18:29:11 +0000 Subject: [PATCH 5/5] test: satisfy strict mypy in before_send tests Build on the pre-commit autofix (ruff line-wrap) with the type fixes the autofix did not make: annotate event/hint as sentry_sdk Event/Hint, drop the dict(event) calls (a plain dict isn't assignable to the Event TypedDict param) and the bare `-> dict` return (disallow_any_generics), asserting identity against the typed event instead. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01NmYA6zVesfHV8aXRSFGkjB --- tests/test_environment.py | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) diff --git a/tests/test_environment.py b/tests/test_environment.py index 75db8c6ee5..38e0b197cd 100644 --- a/tests/test_environment.py +++ b/tests/test_environment.py @@ -1,26 +1,28 @@ from arroyo.processing.strategies.run_task_with_multiprocessing import ( ChildProcessTerminated, ) +from sentry_sdk.types import Event, Hint from snuba.environment import before_send from snuba.query.allocation_policies import AllocationPolicyViolations from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException -def _hint_for(exc: BaseException) -> dict: +def _hint_for(exc: BaseException) -> Hint: return {"exc_info": (type(exc), exc, exc.__traceback__)} def test_before_send_passes_through_without_exc_info() -> None: - event = {"message": "hello"} - assert before_send(dict(event), {}) == event + event: Event = {"message": "hello"} + assert before_send(event, {}) is event def test_before_send_passes_through_unrelated_exception() -> None: + event: Event = {"message": "boom"} try: raise ValueError("a real bug") except ValueError as err: - assert before_send({"message": "boom"}, _hint_for(err)) is not None + assert before_send(event, _hint_for(err)) is event def test_before_send_drops_child_process_terminated() -> None: @@ -28,22 +30,22 @@ def test_before_send_drops_child_process_terminated() -> None: A consumer multiprocessing worker dying (SIGCHLD) is logged at ERROR and re-raised by arroyo; it recovers on restart and is not an actionable issue. """ + event: Event = {"message": "Caught exception, shutting down..."} try: raise ChildProcessTerminated(17) except ChildProcessTerminated as err: - assert ( - before_send({"message": "Caught exception, shutting down..."}, _hint_for(err)) is None - ) + assert before_send(event, _hint_for(err)) is None def test_before_send_drops_child_process_terminated_in_cause_chain() -> None: + event: Event = {"message": "wrapped"} try: try: raise ChildProcessTerminated(17) except ChildProcessTerminated as inner: raise RuntimeError("wrapped") from inner except RuntimeError as err: - assert before_send({"message": "wrapped"}, _hint_for(err)) is None + assert before_send(event, _hint_for(err)) is None def test_before_send_keeps_event_when_noise_is_in_suppressed_context() -> None: @@ -52,6 +54,7 @@ def test_before_send_keeps_event_when_noise_is_in_suppressed_context() -> None: it; we must not follow it, otherwise a legitimate error would be dropped just because a noise type sits in its (explicitly suppressed) context. """ + event: Event = {"message": "a real bug"} try: try: raise ChildProcessTerminated(17) @@ -60,33 +63,37 @@ def test_before_send_keeps_event_when_noise_is_in_suppressed_context() -> None: except ValueError as err: assert err.__suppress_context__ is True assert isinstance(err.__context__, ChildProcessTerminated) - assert before_send({"message": "a real bug"}, _hint_for(err)) is not None + assert before_send(event, _hint_for(err)) is event def test_before_send_drops_allocation_policy_violations() -> None: + event: Event = {"message": "rejected"} try: raise AllocationPolicyViolations("rejected") except AllocationPolicyViolations as err: - assert before_send({"message": "rejected"}, _hint_for(err)) is None + assert before_send(event, _hint_for(err)) is None def test_before_send_drops_rpc_allocation_policy_exception() -> None: + event: Event = {"message": "rejected"} try: raise RPCAllocationPolicyException("rejected", {}) except RPCAllocationPolicyException as err: - assert before_send({"message": "rejected"}, _hint_for(err)) is None + assert before_send(event, _hint_for(err)) is None def test_before_send_handles_none_exc_value() -> None: - event = {"message": "no exc"} - assert before_send(dict(event), {"exc_info": (None, None, None)}) == event + event: Event = {"message": "no exc"} + hint: Hint = {"exc_info": (None, None, None)} + assert before_send(event, hint) is event def test_before_send_terminates_on_cyclic_cause_chain() -> None: # Defensive: a self-referential context must not loop forever. + event: Event = {"message": "cycle"} err = ValueError("self") try: raise err except ValueError: err.__context__ = err - assert before_send({"message": "cycle"}, _hint_for(err)) is not None + assert before_send(event, _hint_for(err)) is event