diff --git a/snuba/environment.py b/snuba/environment.py index b8311e2754..ee6719b98d 100644 --- a/snuba/environment.py +++ b/snuba/environment.py @@ -78,21 +78,63 @@ def setup_logging(level: Optional[str] = None) -> None: def before_send(event: Event, hint: Hint) -> Event | None: - """Filter out AllocationPolicyViolations and RPCAllocationPolicyException from being sent to Sentry""" - if "exc_info" in hint: - _, exc_value, _ = hint["exc_info"] - # Check if it's an AllocationPolicyViolations in the cause chain - if exc_value is not None: - from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException + """ + Drop transient/operational exceptions that recover on their own and aren't + actionable as Sentry issues. They are still captured as logs/breadcrumbs. + + - ``AllocationPolicyViolations`` / ``RPCAllocationPolicyException``: expected + query rejections, not bugs. + - arroyo ``ChildProcessTerminated``: a multiprocessing worker in a consumer + was killed by a signal (almost always an OOM-kill of the worker, sometimes + a native crash). arroyo logs this at ERROR via ``logger.exception`` and + then re-raises it, so a single worker death surfaces as several ERROR-level + Sentry issues (e.g. SNUBA-BA7/BA8/BA9/BAD/B1T, grouped by wherever the + SIGCHLD happened to interrupt the run loop) even though the consumer simply + shuts down and is restarted by the orchestrator. Because these are + ERROR-level they are not covered by the WARN->log policy, so filter them by + type here. The underlying worker death is still observable via logs and + arroyo's ``sigchld.detected`` metric. + """ + if "exc_info" not in hint: + return event - if isinstance(exc_value, RPCAllocationPolicyException): - return None # Don't send to Sentry + _, exc_value, _ = hint["exc_info"] + if exc_value is None: + return event + + from arroyo.processing.strategies.run_task_with_multiprocessing import ( + ChildProcessTerminated, + ) + + from snuba.query.allocation_policies import AllocationPolicyViolations + from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException + + noise_types = ( + RPCAllocationPolicyException, + AllocationPolicyViolations, + ChildProcessTerminated, + ) - cause = getattr(exc_value, "__cause__", None) - from snuba.query.allocation_policies import AllocationPolicyViolations + # Walk the exception chain (the exception itself plus its __cause__ / + # __context__) and drop the event if any link is a known noise type. + seen: set[int] = set() + exc: Optional[BaseException] = exc_value + while exc is not None and id(exc) not in seen: + seen.add(id(exc)) + if isinstance(exc, noise_types): + return None # Don't send to Sentry + # Follow the chain the way Python itself displays it: an explicit cause + # (`raise ... from other`) wins, otherwise the implicit context -- unless + # it was suppressed via `raise ... from None`, in which case we must not + # follow __context__ or we could drop an unrelated event whose suppressed + # context happens to contain a noise type. + if exc.__cause__ is not None: + exc = exc.__cause__ + elif not exc.__suppress_context__: + exc = exc.__context__ + else: + exc = None - if isinstance(cause, AllocationPolicyViolations): - return None # Don't send to Sentry return event diff --git a/tests/test_environment.py b/tests/test_environment.py new file mode 100644 index 0000000000..38e0b197cd --- /dev/null +++ b/tests/test_environment.py @@ -0,0 +1,99 @@ +from arroyo.processing.strategies.run_task_with_multiprocessing import ( + ChildProcessTerminated, +) +from sentry_sdk.types import Event, Hint + +from snuba.environment import before_send +from snuba.query.allocation_policies import AllocationPolicyViolations +from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException + + +def _hint_for(exc: BaseException) -> Hint: + return {"exc_info": (type(exc), exc, exc.__traceback__)} + + +def test_before_send_passes_through_without_exc_info() -> None: + event: Event = {"message": "hello"} + assert before_send(event, {}) is event + + +def test_before_send_passes_through_unrelated_exception() -> None: + event: Event = {"message": "boom"} + try: + raise ValueError("a real bug") + except ValueError as err: + assert before_send(event, _hint_for(err)) is event + + +def test_before_send_drops_child_process_terminated() -> None: + """ + A consumer multiprocessing worker dying (SIGCHLD) is logged at ERROR and + re-raised by arroyo; it recovers on restart and is not an actionable issue. + """ + event: Event = {"message": "Caught exception, shutting down..."} + try: + raise ChildProcessTerminated(17) + except ChildProcessTerminated as err: + assert before_send(event, _hint_for(err)) is None + + +def test_before_send_drops_child_process_terminated_in_cause_chain() -> None: + event: Event = {"message": "wrapped"} + try: + try: + raise ChildProcessTerminated(17) + except ChildProcessTerminated as inner: + raise RuntimeError("wrapped") from inner + except RuntimeError as err: + assert before_send(event, _hint_for(err)) is None + + +def test_before_send_keeps_event_when_noise_is_in_suppressed_context() -> None: + """ + `raise ... from None` keeps the prior exception on __context__ but suppresses + it; we must not follow it, otherwise a legitimate error would be dropped just + because a noise type sits in its (explicitly suppressed) context. + """ + event: Event = {"message": "a real bug"} + try: + try: + raise ChildProcessTerminated(17) + except ChildProcessTerminated: + raise ValueError("a real bug") from None + except ValueError as err: + assert err.__suppress_context__ is True + assert isinstance(err.__context__, ChildProcessTerminated) + assert before_send(event, _hint_for(err)) is event + + +def test_before_send_drops_allocation_policy_violations() -> None: + event: Event = {"message": "rejected"} + try: + raise AllocationPolicyViolations("rejected") + except AllocationPolicyViolations as err: + assert before_send(event, _hint_for(err)) is None + + +def test_before_send_drops_rpc_allocation_policy_exception() -> None: + event: Event = {"message": "rejected"} + try: + raise RPCAllocationPolicyException("rejected", {}) + except RPCAllocationPolicyException as err: + assert before_send(event, _hint_for(err)) is None + + +def test_before_send_handles_none_exc_value() -> None: + event: Event = {"message": "no exc"} + hint: Hint = {"exc_info": (None, None, None)} + assert before_send(event, hint) is event + + +def test_before_send_terminates_on_cyclic_cause_chain() -> None: + # Defensive: a self-referential context must not loop forever. + event: Event = {"message": "cycle"} + err = ValueError("self") + try: + raise err + except ValueError: + err.__context__ = err + assert before_send(event, _hint_for(err)) is event