Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 54 additions & 12 deletions snuba/environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,21 +78,63 @@ def setup_logging(level: Optional[str] = None) -> None:


def before_send(event: Event, hint: Hint) -> Event | None:
"""Filter out AllocationPolicyViolations and RPCAllocationPolicyException from being sent to Sentry"""
if "exc_info" in hint:
_, exc_value, _ = hint["exc_info"]
# Check if it's an AllocationPolicyViolations in the cause chain
if exc_value is not None:
from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException
"""
Drop transient/operational exceptions that recover on their own and aren't
actionable as Sentry issues. They are still captured as logs/breadcrumbs.

- ``AllocationPolicyViolations`` / ``RPCAllocationPolicyException``: expected
query rejections, not bugs.
- arroyo ``ChildProcessTerminated``: a multiprocessing worker in a consumer
was killed by a signal (almost always an OOM-kill of the worker, sometimes
a native crash). arroyo logs this at ERROR via ``logger.exception`` and
then re-raises it, so a single worker death surfaces as several ERROR-level
Sentry issues (e.g. SNUBA-BA7/BA8/BA9/BAD/B1T, grouped by wherever the
SIGCHLD happened to interrupt the run loop) even though the consumer simply
shuts down and is restarted by the orchestrator. Because these are
ERROR-level they are not covered by the WARN->log policy, so filter them by
type here. The underlying worker death is still observable via logs and
arroyo's ``sigchld.detected`` metric.
"""
if "exc_info" not in hint:
return event

if isinstance(exc_value, RPCAllocationPolicyException):
return None # Don't send to Sentry
_, exc_value, _ = hint["exc_info"]
if exc_value is None:
return event

from arroyo.processing.strategies.run_task_with_multiprocessing import (
ChildProcessTerminated,
)

from snuba.query.allocation_policies import AllocationPolicyViolations
from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException

noise_types = (
RPCAllocationPolicyException,
AllocationPolicyViolations,
ChildProcessTerminated,
)

cause = getattr(exc_value, "__cause__", None)
from snuba.query.allocation_policies import AllocationPolicyViolations
# Walk the exception chain (the exception itself plus its __cause__ /
# __context__) and drop the event if any link is a known noise type.
seen: set[int] = set()
exc: Optional[BaseException] = exc_value
while exc is not None and id(exc) not in seen:
seen.add(id(exc))
if isinstance(exc, noise_types):
return None # Don't send to Sentry
# Follow the chain the way Python itself displays it: an explicit cause
# (`raise ... from other`) wins, otherwise the implicit context -- unless
# it was suppressed via `raise ... from None`, in which case we must not
# follow __context__ or we could drop an unrelated event whose suppressed
# context happens to contain a noise type.
if exc.__cause__ is not None:
exc = exc.__cause__
elif not exc.__suppress_context__:
exc = exc.__context__
else:
exc = None

if isinstance(cause, AllocationPolicyViolations):
return None # Don't send to Sentry
return event


Expand Down
99 changes: 99 additions & 0 deletions tests/test_environment.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
from arroyo.processing.strategies.run_task_with_multiprocessing import (
ChildProcessTerminated,
)
from sentry_sdk.types import Event, Hint

from snuba.environment import before_send
from snuba.query.allocation_policies import AllocationPolicyViolations
from snuba.web.rpc.common.exceptions import RPCAllocationPolicyException


def _hint_for(exc: BaseException) -> Hint:
return {"exc_info": (type(exc), exc, exc.__traceback__)}


def test_before_send_passes_through_without_exc_info() -> None:
event: Event = {"message": "hello"}
assert before_send(event, {}) is event


def test_before_send_passes_through_unrelated_exception() -> None:
event: Event = {"message": "boom"}
try:
raise ValueError("a real bug")
except ValueError as err:
assert before_send(event, _hint_for(err)) is event


def test_before_send_drops_child_process_terminated() -> None:
"""
A consumer multiprocessing worker dying (SIGCHLD) is logged at ERROR and
re-raised by arroyo; it recovers on restart and is not an actionable issue.
"""
event: Event = {"message": "Caught exception, shutting down..."}
try:
raise ChildProcessTerminated(17)
except ChildProcessTerminated as err:
assert before_send(event, _hint_for(err)) is None


def test_before_send_drops_child_process_terminated_in_cause_chain() -> None:
event: Event = {"message": "wrapped"}
try:
try:
raise ChildProcessTerminated(17)
except ChildProcessTerminated as inner:
raise RuntimeError("wrapped") from inner
except RuntimeError as err:
assert before_send(event, _hint_for(err)) is None


def test_before_send_keeps_event_when_noise_is_in_suppressed_context() -> None:
"""
`raise ... from None` keeps the prior exception on __context__ but suppresses
it; we must not follow it, otherwise a legitimate error would be dropped just
because a noise type sits in its (explicitly suppressed) context.
"""
event: Event = {"message": "a real bug"}
try:
try:
raise ChildProcessTerminated(17)
except ChildProcessTerminated:
raise ValueError("a real bug") from None
except ValueError as err:
assert err.__suppress_context__ is True
assert isinstance(err.__context__, ChildProcessTerminated)
assert before_send(event, _hint_for(err)) is event


def test_before_send_drops_allocation_policy_violations() -> None:
event: Event = {"message": "rejected"}
try:
raise AllocationPolicyViolations("rejected")
except AllocationPolicyViolations as err:
assert before_send(event, _hint_for(err)) is None


def test_before_send_drops_rpc_allocation_policy_exception() -> None:
event: Event = {"message": "rejected"}
try:
raise RPCAllocationPolicyException("rejected", {})
except RPCAllocationPolicyException as err:
assert before_send(event, _hint_for(err)) is None


def test_before_send_handles_none_exc_value() -> None:
event: Event = {"message": "no exc"}
hint: Hint = {"exc_info": (None, None, None)}
assert before_send(event, hint) is event


def test_before_send_terminates_on_cyclic_cause_chain() -> None:
# Defensive: a self-referential context must not loop forever.
event: Event = {"message": "cycle"}
err = ValueError("self")
try:
raise err
except ValueError:
err.__context__ = err
assert before_send(event, _hint_for(err)) is event
Loading