From 0e30cb05e0d7d40b734ff02a4dfc92084f1ddb69 Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Wed, 17 Jun 2026 16:40:20 -0700 Subject: [PATCH 1/8] feat(eap-items): use indexed_name column for sentry.op/sentry.metric.name filters Filtering on the per-item-type primary name attribute (sentry.op for spans, sentry.metric.name for metrics) was translated into an unindexed attributes_string bucket map lookup (arrayElement(attributes_string_N, ...)). The same value is also stored in the dedicated indexed_name String column, which has a bloom filter index (migration 0057). Add an IndexedNameOptimizer logical query processor that rewrites attributes_string['sentry.op'|'sentry.metric.name'] SubscriptableReferences to the indexed_name column so the bloom filter index is used. Because indexed_name is item-type specific, the rewrite is only applied when the query is unambiguously scoped to the owning item type via an equals(item_type, N) condition; otherwise it safely falls back to the bucket lookup. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01GXBnLMCV79qaAkVRkQu7Nw Agent transcript: https://claudescope.sentry.dev/share/cdjPeZ3xwrigGK-uGIWE0P3ZX1YWtOxOZwNTOWPmZMo --- .../entities/eap_items.yaml | 1 + .../logical/indexed_name_optimizer.py | 91 +++++++++ .../processors/test_indexed_name_optimizer.py | 173 ++++++++++++++++++ 3 files changed, 265 insertions(+) create mode 100644 snuba/query/processors/logical/indexed_name_optimizer.py create mode 100644 tests/query/processors/test_indexed_name_optimizer.py diff --git a/snuba/datasets/configuration/events_analytics_platform/entities/eap_items.yaml b/snuba/datasets/configuration/events_analytics_platform/entities/eap_items.yaml index d47659d1555..bab229eb410 100644 --- a/snuba/datasets/configuration/events_analytics_platform/entities/eap_items.yaml +++ b/snuba/datasets/configuration/events_analytics_platform/entities/eap_items.yaml @@ -109,6 +109,7 @@ storage_selector: selector: EAPItemsStorageSelector query_processors: + - processor: IndexedNameOptimizer - processor: HashBucketFunctionTransformer args: hash_bucket_names: diff --git a/snuba/query/processors/logical/indexed_name_optimizer.py b/snuba/query/processors/logical/indexed_name_optimizer.py new file mode 100644 index 00000000000..a44a9948959 --- /dev/null +++ b/snuba/query/processors/logical/indexed_name_optimizer.py @@ -0,0 +1,91 @@ +from typing import Optional + +from sentry_protos.snuba.v1.request_common_pb2 import TraceItemType + +from snuba.query.conditions import get_first_level_and_conditions +from snuba.query.expressions import ( + Column, + Expression, + FunctionCall, + Literal, + SubscriptableReference, +) +from snuba.query.logical import Query +from snuba.query.processors.logical import LogicalQueryProcessor +from snuba.query.query_settings import QuerySettings + + +class IndexedNameOptimizer(LogicalQueryProcessor): + """ + In eap_items, the per-item-type primary name attribute is promoted to a + dedicated ``indexed_name`` String column with a bloom filter index: + ``sentry.op`` for spans and ``sentry.metric.name`` for metrics. The same + value is still written to the hashed ``attributes_string`` buckets. + + Filtering ``attributes_string['sentry.op']`` (resp. + ``attributes_string['sentry.metric.name']``) would otherwise be translated + into a bucket map lookup (``arrayElement(attributes_string_N, ...)``) which + has no index. This processor rewrites those subscriptable accesses into a + reference to the indexed ``indexed_name`` column so the bloom filter index + can be used. + + The promotion is item-type specific, so the rewrite is only applied when the + query is unambiguously scoped (via an ``item_type`` equality condition) to + the item type that owns the attribute. Otherwise the access is left as is and + falls back to the bucket lookup. + """ + + # item_type proto enum value -> attribute promoted into ``indexed_name`` + INDEXED_NAME_KEY_BY_ITEM_TYPE: dict[int, str] = { + TraceItemType.TRACE_ITEM_TYPE_SPAN: "sentry.op", + TraceItemType.TRACE_ITEM_TYPE_METRIC: "sentry.metric.name", + } + + INDEXED_NAME_COLUMN = "indexed_name" + ATTRIBUTES_STRING_COLUMN = "attributes_string" + + def _indexed_name_key(self, query: Query) -> Optional[str]: + """Return the attribute name promoted into ``indexed_name`` for this + query, or ``None`` if the query is not unambiguously scoped to a single + supported item type.""" + condition = query.get_condition() + if condition is None: + return None + + item_types: set[int] = set() + for cond in get_first_level_and_conditions(condition): + if not isinstance(cond, FunctionCall) or cond.function_name != "equals": + continue + if len(cond.parameters) != 2: + continue + lhs, rhs = cond.parameters + if not isinstance(lhs, Column) or lhs.column_name != "item_type": + continue + if not isinstance(rhs, Literal) or not isinstance(rhs.value, int): + continue + item_types.add(rhs.value) + + if len(item_types) != 1: + return None + return self.INDEXED_NAME_KEY_BY_ITEM_TYPE.get(item_types.pop()) + + def process_query(self, query: Query, query_settings: QuerySettings) -> None: + key = self._indexed_name_key(query) + if key is None: + return + + def transform(exp: Expression) -> Expression: + if ( + isinstance(exp, SubscriptableReference) + and exp.column.column_name == self.ATTRIBUTES_STRING_COLUMN + and isinstance(exp.key, Literal) + and exp.key.value == key + ): + return Column( + alias=exp.alias, + table_name=exp.column.table_name, + column_name=self.INDEXED_NAME_COLUMN, + ) + return exp + + query.transform_expressions(transform) diff --git a/tests/query/processors/test_indexed_name_optimizer.py b/tests/query/processors/test_indexed_name_optimizer.py new file mode 100644 index 00000000000..34438a59565 --- /dev/null +++ b/tests/query/processors/test_indexed_name_optimizer.py @@ -0,0 +1,173 @@ +from copy import deepcopy + +import pytest + +from snuba.clickhouse.columns import ColumnSet +from snuba.datasets.entities.entity_key import EntityKey +from snuba.query import SelectedExpression +from snuba.query.data_source.simple import Entity as QueryEntity +from snuba.query.dsl import and_cond, column, equals, literal +from snuba.query.expressions import ( + Column, + Expression, + Literal, + SubscriptableReference, +) +from snuba.query.logical import Query +from snuba.query.processors.logical.indexed_name_optimizer import ( + IndexedNameOptimizer, +) +from snuba.query.query_settings import HTTPQuerySettings + +SPAN = 1 +METRIC = 8 +LOG = 2 + + +def _attr_str(key: str, alias: str | None = None) -> SubscriptableReference: + return SubscriptableReference( + alias=alias, + column=column("attributes_string"), + key=literal(key), + ) + + +def _query( + condition: Expression, + selected_columns: list[SelectedExpression] | None = None, +) -> Query: + return Query( + QueryEntity(EntityKey.EAP_ITEMS, ColumnSet([])), + selected_columns=selected_columns or [SelectedExpression("c", column("project_id"))], + condition=condition, + ) + + +test_data = [ + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_str("sentry.op"), literal("db.query")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(Column(None, None, "indexed_name"), literal("db.query")), + ) + ), + id="span sentry.op rewritten to indexed_name", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(METRIC)), + equals(_attr_str("sentry.metric.name"), literal("my.metric")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(METRIC)), + equals(Column(None, None, "indexed_name"), literal("my.metric")), + ) + ), + id="metric sentry.metric.name rewritten to indexed_name", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_str("sentry.metric.name"), literal("my.metric")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_str("sentry.metric.name"), literal("my.metric")), + ) + ), + id="span query with metric key is left untouched", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_str("foo"), literal("bar")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_str("foo"), literal("bar")), + ) + ), + id="non-indexed attribute is left untouched", + ), + pytest.param( + _query(equals(_attr_str("sentry.op"), literal("db.query"))), + _query(equals(_attr_str("sentry.op"), literal("db.query"))), + id="no item_type condition leaves bucket lookup", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(LOG)), + equals(_attr_str("sentry.op"), literal("db.query")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(LOG)), + equals(_attr_str("sentry.op"), literal("db.query")), + ) + ), + id="unsupported item_type leaves bucket lookup", + ), + pytest.param( + _query( + condition=equals(column("item_type"), literal(SPAN)), + selected_columns=[ + SelectedExpression("op", _attr_str("sentry.op", alias="op")), + ], + ), + _query( + condition=equals(column("item_type"), literal(SPAN)), + selected_columns=[ + SelectedExpression("op", Column("op", None, "indexed_name")), + ], + ), + id="select preserves alias on rewrite", + ), +] + + +@pytest.mark.parametrize("pre_format, expected_query", test_data) +def test_indexed_name_optimizer(pre_format: Query, expected_query: Query) -> None: + copy = deepcopy(pre_format) + IndexedNameOptimizer().process_query(copy, HTTPQuerySettings()) + assert copy.get_selected_columns() == expected_query.get_selected_columns() + assert copy.get_condition() == expected_query.get_condition() + + +def test_contradictory_item_types_left_untouched() -> None: + query = _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + and_cond( + equals(column("item_type"), literal(METRIC)), + equals(_attr_str("sentry.op"), literal("db.query")), + ), + ) + ) + copy = deepcopy(query) + IndexedNameOptimizer().process_query(copy, HTTPQuerySettings()) + # ambiguous item_type -> no rewrite; the subscriptable access survives + condition = copy.get_condition() + assert condition is not None + assert any( + isinstance(e, SubscriptableReference) + and isinstance(e.key, Literal) + and e.key.value == "sentry.op" + for e in condition + ) From 32ce6d81b081c69750efaf6f7cc19812c3846bf5 Mon Sep 17 00:00:00 2001 From: Pierre Massat Date: Wed, 17 Jun 2026 17:05:02 -0700 Subject: [PATCH 2/8] test: use protobuf TraceItemType enum instead of hardcoded item_type values Keeps the optimizer tests in sync with the sentry_protos enum values they key off of, so a divergence in the upstream enum fails the tests instead of silently degrading the optimization. Co-Authored-By: Claude Opus 4.8 (1M context) Claude-Session: https://claude.ai/code/session_01GXBnLMCV79qaAkVRkQu7Nw --- tests/query/processors/test_indexed_name_optimizer.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/query/processors/test_indexed_name_optimizer.py b/tests/query/processors/test_indexed_name_optimizer.py index 34438a59565..79d4745e99f 100644 --- a/tests/query/processors/test_indexed_name_optimizer.py +++ b/tests/query/processors/test_indexed_name_optimizer.py @@ -1,6 +1,7 @@ from copy import deepcopy import pytest +from sentry_protos.snuba.v1.request_common_pb2 import TraceItemType from snuba.clickhouse.columns import ColumnSet from snuba.datasets.entities.entity_key import EntityKey @@ -19,9 +20,11 @@ ) from snuba.query.query_settings import HTTPQuerySettings -SPAN = 1 -METRIC = 8 -LOG = 2 +# Use the protobuf enum values directly so the tests stay in sync with the +# production dependency the optimizer keys off of. +SPAN = TraceItemType.TRACE_ITEM_TYPE_SPAN +METRIC = TraceItemType.TRACE_ITEM_TYPE_METRIC +LOG = TraceItemType.TRACE_ITEM_TYPE_LOG def _attr_str(key: str, alias: str | None = None) -> SubscriptableReference: From 10147ea38db37291c647cc076ef906c404b2c9b2 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 03:13:41 +0000 Subject: [PATCH 3/8] ref(eap-items): use ConditionFunctions.EQ instead of hardcoded "equals" Address review nit: reference the ConditionFunctions.EQ constant rather than the hardcoded "equals" string in IndexedNameOptimizer. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01CajnvSCYTxvJ9Hw5jELRPH --- .../query/processors/logical/indexed_name_optimizer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/snuba/query/processors/logical/indexed_name_optimizer.py b/snuba/query/processors/logical/indexed_name_optimizer.py index a44a9948959..c2187b3abdb 100644 --- a/snuba/query/processors/logical/indexed_name_optimizer.py +++ b/snuba/query/processors/logical/indexed_name_optimizer.py @@ -2,7 +2,10 @@ from sentry_protos.snuba.v1.request_common_pb2 import TraceItemType -from snuba.query.conditions import get_first_level_and_conditions +from snuba.query.conditions import ( + ConditionFunctions, + get_first_level_and_conditions, +) from snuba.query.expressions import ( Column, Expression, @@ -54,7 +57,10 @@ def _indexed_name_key(self, query: Query) -> Optional[str]: item_types: set[int] = set() for cond in get_first_level_and_conditions(condition): - if not isinstance(cond, FunctionCall) or cond.function_name != "equals": + if ( + not isinstance(cond, FunctionCall) + or cond.function_name != ConditionFunctions.EQ + ): continue if len(cond.parameters) != 2: continue From 77b3cde305c5cac19fbd9fb7a293705681a81fb2 Mon Sep 17 00:00:00 2001 From: "getsantry[bot]" <66042841+getsantry[bot]@users.noreply.github.com> Date: Fri, 19 Jun 2026 03:18:33 +0000 Subject: [PATCH 4/8] [getsentry/action-github-commit] Auto commit --- snuba/query/processors/logical/indexed_name_optimizer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/snuba/query/processors/logical/indexed_name_optimizer.py b/snuba/query/processors/logical/indexed_name_optimizer.py index c2187b3abdb..2de19c1613c 100644 --- a/snuba/query/processors/logical/indexed_name_optimizer.py +++ b/snuba/query/processors/logical/indexed_name_optimizer.py @@ -57,10 +57,7 @@ def _indexed_name_key(self, query: Query) -> Optional[str]: item_types: set[int] = set() for cond in get_first_level_and_conditions(condition): - if ( - not isinstance(cond, FunctionCall) - or cond.function_name != ConditionFunctions.EQ - ): + if not isinstance(cond, FunctionCall) or cond.function_name != ConditionFunctions.EQ: continue if len(cond.parameters) != 2: continue From 147b34d587d5f6be0525806e8b86e51a05324796 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 04:25:04 +0000 Subject: [PATCH 5/8] feat(eap-items): keep attributes_string fallback and match the real RPC filter shape The IndexedNameOptimizer matched SubscriptableReference, but the EAP RPC builder emits map-backed filters as arrayElement(attributes_string, key) (since #8035), so the optimizer never fired on real traffic. It also replaced the bucket lookup outright, which would drop rows whose indexed_name is empty (migration 0057 adds the column with no backfill). Rewrite a WHERE-clause equals/in filter on the indexed attribute into or(, ): the bloom filter index can prune granules via indexed_name while the bucket lookup keeps results correct for non-backfilled rows. Only non-empty string literals are pushed onto the index (indexed_name = '' would over-match every non-backfilled row), and SELECT/GROUP BY references keep reading the bucket. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Hax3wseAtUEKY7GfpcJNTx --- .../logical/indexed_name_optimizer.py | 148 +++++++++++-- .../processors/test_indexed_name_optimizer.py | 204 ++++++++++++++++-- 2 files changed, 310 insertions(+), 42 deletions(-) diff --git a/snuba/query/processors/logical/indexed_name_optimizer.py b/snuba/query/processors/logical/indexed_name_optimizer.py index 2de19c1613c..a9a3e3a3735 100644 --- a/snuba/query/processors/logical/indexed_name_optimizer.py +++ b/snuba/query/processors/logical/indexed_name_optimizer.py @@ -4,8 +4,10 @@ from snuba.query.conditions import ( ConditionFunctions, + combine_and_conditions, get_first_level_and_conditions, ) +from snuba.query.dsl import or_cond from snuba.query.expressions import ( Column, Expression, @@ -25,12 +27,41 @@ class IndexedNameOptimizer(LogicalQueryProcessor): ``sentry.op`` for spans and ``sentry.metric.name`` for metrics. The same value is still written to the hashed ``attributes_string`` buckets. - Filtering ``attributes_string['sentry.op']`` (resp. - ``attributes_string['sentry.metric.name']``) would otherwise be translated - into a bucket map lookup (``arrayElement(attributes_string_N, ...)``) which - has no index. This processor rewrites those subscriptable accesses into a - reference to the indexed ``indexed_name`` column so the bloom filter index - can be used. + A filter on ``attributes_string['sentry.op']`` (resp. + ``attributes_string['sentry.metric.name']``) is otherwise translated into a + bucket map lookup (``arrayElement(attributes_string_N, ...)``) which has no + index. This processor rewrites such a filter into an ``OR`` that also probes + the indexed ``indexed_name`` column, so the bloom filter index can prune + granules:: + + equals(arrayElement(attributes_string, 'sentry.op'), 'db.query') + -> + or( + equals(indexed_name, 'db.query'), + equals(arrayElement(attributes_string, 'sentry.op'), 'db.query'), + ) + + The bucket lookup is intentionally kept as a fallback: ``indexed_name`` was + added (with no backfill) by migration ``0057_add_name_column_and_index``, so + rows written before that migration — or before the value started being + populated — have an empty ``indexed_name`` while still carrying the value in + ``attributes_string``. Filtering on ``indexed_name`` alone would silently + drop those rows; the ``OR`` keeps the query correct while still letting the + index help for the rows that do have it. + + Only the WHERE clause is rewritten (the bloom filter index only helps + filtering), and only the positive ``equals`` / ``in`` operators against + non-empty string literals are touched — for those the ``OR`` fallback is + correctness-preserving (see ``_is_index_safe_value`` for why the empty-string + default is excluded). Everything else (negations, guarded forms, comparisons + to the column default, SELECT/GROUP BY references) keeps reading the bucket + and is left untouched. + + The EAP RPC builder emits the ``arrayElement(attributes_string, key)`` form + (see ``snuba.web.rpc.common.common._map_backed_operands``); the equivalent + hand-written SnQL ``SubscriptableReference`` form is handled too. This runs + before ``HashBucketFunctionTransformer`` so the bucket branch still gets + mapped to its hashed bucket column downstream. The promotion is item-type specific, so the rewrite is only applied when the query is unambiguously scoped (via an ``item_type`` equality condition) to @@ -47,6 +78,10 @@ class IndexedNameOptimizer(LogicalQueryProcessor): INDEXED_NAME_COLUMN = "indexed_name" ATTRIBUTES_STRING_COLUMN = "attributes_string" + # Operators whose granule pruning the ``indexed_name`` bloom filter can serve + # and for which an OR fallback to the bucket lookup is correctness-preserving. + OPTIMIZABLE_OPS = frozenset({ConditionFunctions.EQ, ConditionFunctions.IN}) + def _indexed_name_key(self, query: Query) -> Optional[str]: """Return the attribute name promoted into ``indexed_name`` for this query, or ``None`` if the query is not unambiguously scoped to a single @@ -72,23 +107,94 @@ def _indexed_name_key(self, query: Query) -> Optional[str]: return None return self.INDEXED_NAME_KEY_BY_ITEM_TYPE.get(item_types.pop()) + def _indexed_name_ref(self, exp: Expression, key: str) -> Optional[Column]: + """If ``exp`` is an ``attributes_string[key]`` access — either the + ``arrayElement(attributes_string, key)`` form emitted by the EAP RPC + builder or the hand-written ``SubscriptableReference`` form — return the + equivalent reference to the ``indexed_name`` column (carrying over the + table name). Otherwise return ``None``.""" + if ( + isinstance(exp, FunctionCall) + and exp.function_name == "arrayElement" + and len(exp.parameters) == 2 + and isinstance(exp.parameters[0], Column) + and exp.parameters[0].column_name == self.ATTRIBUTES_STRING_COLUMN + and isinstance(exp.parameters[1], Literal) + and exp.parameters[1].value == key + ): + return Column(None, exp.parameters[0].table_name, self.INDEXED_NAME_COLUMN) + + if ( + isinstance(exp, SubscriptableReference) + and exp.column.column_name == self.ATTRIBUTES_STRING_COLUMN + and isinstance(exp.key, Literal) + and exp.key.value == key + ): + return Column(None, exp.column.table_name, self.INDEXED_NAME_COLUMN) + + return None + + def _is_index_safe_value(self, function_name: str, rhs: Expression) -> bool: + """An empty string is the ``String`` column default, which non-backfilled + rows (empty ``indexed_name``) also read as — so ``indexed_name = ''`` would + match every such row regardless of its real value. Only probe + ``indexed_name`` for non-empty string literals, where a match means the + value was actually populated; the bucket fallback still covers the rest. + This mirrors ``_comparison_can_match_column_default`` on the RPC side: the + cases it guards are exactly the ones we must not push onto the index.""" + + def safe_literal(exp: Expression) -> bool: + return isinstance(exp, Literal) and isinstance(exp.value, str) and exp.value != "" + + if function_name == ConditionFunctions.EQ: + return safe_literal(rhs) + # IN: the right-hand side is an ``array(...)`` of literals. + if function_name == ConditionFunctions.IN: + return ( + isinstance(rhs, FunctionCall) + and rhs.function_name == "array" + and len(rhs.parameters) > 0 + and all(safe_literal(p) for p in rhs.parameters) + ) + return False + + def _maybe_rewrite_conjunct(self, conjunct: Expression, key: str) -> Expression: + """Rewrite a top-level ``equals``/``in`` filter on the indexed attribute + into ``or(, )``. Any other + conjunct is returned unchanged.""" + if ( + not isinstance(conjunct, FunctionCall) + or conjunct.function_name not in self.OPTIMIZABLE_OPS + or len(conjunct.parameters) != 2 + ): + return conjunct + + indexed_ref = self._indexed_name_ref(conjunct.parameters[0], key) + if indexed_ref is None: + return conjunct + + if not self._is_index_safe_value(conjunct.function_name, conjunct.parameters[1]): + return conjunct + + indexed_branch = FunctionCall( + None, + conjunct.function_name, + (indexed_ref, conjunct.parameters[1]), + ) + return or_cond(indexed_branch, conjunct) + def process_query(self, query: Query, query_settings: QuerySettings) -> None: key = self._indexed_name_key(query) if key is None: return - def transform(exp: Expression) -> Expression: - if ( - isinstance(exp, SubscriptableReference) - and exp.column.column_name == self.ATTRIBUTES_STRING_COLUMN - and isinstance(exp.key, Literal) - and exp.key.value == key - ): - return Column( - alias=exp.alias, - table_name=exp.column.table_name, - column_name=self.INDEXED_NAME_COLUMN, - ) - return exp - - query.transform_expressions(transform) + condition = query.get_condition() + if condition is None: + return + + conjuncts = list(get_first_level_and_conditions(condition)) + rewritten = [self._maybe_rewrite_conjunct(cond, key) for cond in conjuncts] + if rewritten == conjuncts: + return + + query.set_ast_condition(combine_and_conditions(rewritten)) diff --git a/tests/query/processors/test_indexed_name_optimizer.py b/tests/query/processors/test_indexed_name_optimizer.py index 79d4745e99f..2dd9a074ade 100644 --- a/tests/query/processors/test_indexed_name_optimizer.py +++ b/tests/query/processors/test_indexed_name_optimizer.py @@ -7,10 +7,20 @@ from snuba.datasets.entities.entity_key import EntityKey from snuba.query import SelectedExpression from snuba.query.data_source.simple import Entity as QueryEntity -from snuba.query.dsl import and_cond, column, equals, literal +from snuba.query.dsl import ( + and_cond, + arrayElement, + column, + equals, + in_cond, + literal, + literals_array, + or_cond, +) from snuba.query.expressions import ( Column, Expression, + FunctionCall, Literal, SubscriptableReference, ) @@ -27,7 +37,14 @@ LOG = TraceItemType.TRACE_ITEM_TYPE_LOG +def _attr_arr(key: str) -> FunctionCall: + """The ``arrayElement(attributes_string, key)`` form the EAP RPC builder + emits for a map-backed string filter (see ``_map_backed_operands``).""" + return arrayElement(None, column("attributes_string"), literal(key)) + + def _attr_str(key: str, alias: str | None = None) -> SubscriptableReference: + """The hand-written SnQL ``attributes_string[key]`` form.""" return SubscriptableReference( alias=alias, column=column("attributes_string"), @@ -47,6 +64,24 @@ def _query( test_data = [ + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_arr("sentry.op"), literal("db.query")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + or_cond( + equals(column("indexed_name"), literal("db.query")), + equals(_attr_arr("sentry.op"), literal("db.query")), + ), + ) + ), + id="span sentry.op (arrayElement) gets an indexed_name OR fallback", + ), pytest.param( _query( and_cond( @@ -57,37 +92,70 @@ def _query( _query( and_cond( equals(column("item_type"), literal(SPAN)), - equals(Column(None, None, "indexed_name"), literal("db.query")), + or_cond( + equals(column("indexed_name"), literal("db.query")), + equals(_attr_str("sentry.op"), literal("db.query")), + ), ) ), - id="span sentry.op rewritten to indexed_name", + id="span sentry.op (SubscriptableReference) gets an indexed_name OR fallback", ), pytest.param( _query( and_cond( equals(column("item_type"), literal(METRIC)), - equals(_attr_str("sentry.metric.name"), literal("my.metric")), + equals(_attr_arr("sentry.metric.name"), literal("my.metric")), ) ), _query( and_cond( equals(column("item_type"), literal(METRIC)), - equals(Column(None, None, "indexed_name"), literal("my.metric")), + or_cond( + equals(column("indexed_name"), literal("my.metric")), + equals(_attr_arr("sentry.metric.name"), literal("my.metric")), + ), + ) + ), + id="metric sentry.metric.name gets an indexed_name OR fallback", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + in_cond( + _attr_arr("sentry.op"), + literals_array(None, [literal("db.query"), literal("http.client")]), + ), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + or_cond( + in_cond( + column("indexed_name"), + literals_array(None, [literal("db.query"), literal("http.client")]), + ), + in_cond( + _attr_arr("sentry.op"), + literals_array(None, [literal("db.query"), literal("http.client")]), + ), + ), ) ), - id="metric sentry.metric.name rewritten to indexed_name", + id="span sentry.op IN gets an indexed_name OR fallback", ), pytest.param( _query( and_cond( equals(column("item_type"), literal(SPAN)), - equals(_attr_str("sentry.metric.name"), literal("my.metric")), + equals(_attr_arr("sentry.metric.name"), literal("my.metric")), ) ), _query( and_cond( equals(column("item_type"), literal(SPAN)), - equals(_attr_str("sentry.metric.name"), literal("my.metric")), + equals(_attr_arr("sentry.metric.name"), literal("my.metric")), ) ), id="span query with metric key is left untouched", @@ -96,37 +164,128 @@ def _query( _query( and_cond( equals(column("item_type"), literal(SPAN)), - equals(_attr_str("foo"), literal("bar")), + equals(_attr_arr("foo"), literal("bar")), ) ), _query( and_cond( equals(column("item_type"), literal(SPAN)), - equals(_attr_str("foo"), literal("bar")), + equals(_attr_arr("foo"), literal("bar")), ) ), id="non-indexed attribute is left untouched", ), pytest.param( - _query(equals(_attr_str("sentry.op"), literal("db.query"))), - _query(equals(_attr_str("sentry.op"), literal("db.query"))), + _query(equals(_attr_arr("sentry.op"), literal("db.query"))), + _query(equals(_attr_arr("sentry.op"), literal("db.query"))), id="no item_type condition leaves bucket lookup", ), pytest.param( _query( and_cond( equals(column("item_type"), literal(LOG)), - equals(_attr_str("sentry.op"), literal("db.query")), + equals(_attr_arr("sentry.op"), literal("db.query")), ) ), _query( and_cond( equals(column("item_type"), literal(LOG)), - equals(_attr_str("sentry.op"), literal("db.query")), + equals(_attr_arr("sentry.op"), literal("db.query")), ) ), id="unsupported item_type leaves bucket lookup", ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + FunctionCall( + None, + "notEquals", + (_attr_arr("sentry.op"), literal("db.query")), + ), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + FunctionCall( + None, + "notEquals", + (_attr_arr("sentry.op"), literal("db.query")), + ), + ) + ), + id="negated filter is left untouched (OR fallback would be incorrect)", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_arr("sentry.op"), literal("")), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_arr("sentry.op"), literal("")), + ) + ), + id="empty-string equals is left untouched (indexed_name='' over-matches)", + ), + pytest.param( + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + in_cond( + _attr_arr("sentry.op"), + literals_array(None, [literal("db.query"), literal("")]), + ), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + in_cond( + _attr_arr("sentry.op"), + literals_array(None, [literal("db.query"), literal("")]), + ), + ) + ), + id="IN containing the empty string is left untouched", + ), + pytest.param( + # The guarded form the RPC emits when the value is the column default: + # and(mapContains(...), equals(arrayElement(...), '')). The empty value + # makes the indexed_name probe unsafe, so the whole thing is left alone. + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + and_cond( + FunctionCall( + None, + "mapContains", + (column("attributes_string"), literal("sentry.op")), + ), + equals(_attr_arr("sentry.op"), literal("")), + ), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + and_cond( + FunctionCall( + None, + "mapContains", + (column("attributes_string"), literal("sentry.op")), + ), + equals(_attr_arr("sentry.op"), literal("")), + ), + ) + ), + id="guarded empty-value form is left untouched", + ), pytest.param( _query( condition=equals(column("item_type"), literal(SPAN)), @@ -137,10 +296,10 @@ def _query( _query( condition=equals(column("item_type"), literal(SPAN)), selected_columns=[ - SelectedExpression("op", Column("op", None, "indexed_name")), + SelectedExpression("op", _attr_str("sentry.op", alias="op")), ], ), - id="select preserves alias on rewrite", + id="SELECT references are left on the bucket (index only helps WHERE)", ), ] @@ -159,18 +318,21 @@ def test_contradictory_item_types_left_untouched() -> None: equals(column("item_type"), literal(SPAN)), and_cond( equals(column("item_type"), literal(METRIC)), - equals(_attr_str("sentry.op"), literal("db.query")), + equals(_attr_arr("sentry.op"), literal("db.query")), ), ) ) copy = deepcopy(query) IndexedNameOptimizer().process_query(copy, HTTPQuerySettings()) - # ambiguous item_type -> no rewrite; the subscriptable access survives + # ambiguous item_type -> no rewrite; no indexed_name reference is introduced. condition = copy.get_condition() assert condition is not None + assert not any(isinstance(e, Column) and e.column_name == "indexed_name" for e in condition) + # the original bucket lookup survives untouched. assert any( - isinstance(e, SubscriptableReference) - and isinstance(e.key, Literal) - and e.key.value == "sentry.op" + isinstance(e, FunctionCall) + and e.function_name == "arrayElement" + and isinstance(e.parameters[1], Literal) + and e.parameters[1].value == "sentry.op" for e in condition ) From b535ee4d6d4eb7e0c9e10ebe26b329c8601da59a Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 04:38:47 +0000 Subject: [PATCH 6/8] ref(eap-items): gate indexed_name rewrite behind a config flag, drop the fallback Replace the OR bucket fallback with a direct index-only rewrite (attributes_string[key] -> indexed_name), gated behind the eap_items_use_indexed_name runtime flag (default off). The fallback forced ClickHouse to scan every granule (an OR with the unindexed bucket branch is never prunable), so it negated the bloom filter index it was meant to use. Index-only filtering is only correct once indexed_name is fully populated across the retention window (migration 0057 adds the column without a backfill), so the flag must stay off until then; flipping it on lets the bloom filter index serve sentry.op / sentry.metric.name filters. The rewrite still matches the real arrayElement(attributes_string, key) shape emitted by the RPC builder (and the SnQL SubscriptableReference form). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Hax3wseAtUEKY7GfpcJNTx --- .../logical/indexed_name_optimizer.py | 125 ++++--------- .../processors/test_indexed_name_optimizer.py | 169 ++++++------------ 2 files changed, 81 insertions(+), 213 deletions(-) diff --git a/snuba/query/processors/logical/indexed_name_optimizer.py b/snuba/query/processors/logical/indexed_name_optimizer.py index a9a3e3a3735..9bc44810d17 100644 --- a/snuba/query/processors/logical/indexed_name_optimizer.py +++ b/snuba/query/processors/logical/indexed_name_optimizer.py @@ -2,12 +2,11 @@ from sentry_protos.snuba.v1.request_common_pb2 import TraceItemType +from snuba import state from snuba.query.conditions import ( ConditionFunctions, - combine_and_conditions, get_first_level_and_conditions, ) -from snuba.query.dsl import or_cond from snuba.query.expressions import ( Column, Expression, @@ -30,38 +29,26 @@ class IndexedNameOptimizer(LogicalQueryProcessor): A filter on ``attributes_string['sentry.op']`` (resp. ``attributes_string['sentry.metric.name']``) is otherwise translated into a bucket map lookup (``arrayElement(attributes_string_N, ...)``) which has no - index. This processor rewrites such a filter into an ``OR`` that also probes - the indexed ``indexed_name`` column, so the bloom filter index can prune - granules:: + index. This processor rewrites that access into a reference to the indexed + ``indexed_name`` column so the bloom filter index can be used:: equals(arrayElement(attributes_string, 'sentry.op'), 'db.query') - -> - or( - equals(indexed_name, 'db.query'), - equals(arrayElement(attributes_string, 'sentry.op'), 'db.query'), - ) - - The bucket lookup is intentionally kept as a fallback: ``indexed_name`` was - added (with no backfill) by migration ``0057_add_name_column_and_index``, so - rows written before that migration — or before the value started being - populated — have an empty ``indexed_name`` while still carrying the value in - ``attributes_string``. Filtering on ``indexed_name`` alone would silently - drop those rows; the ``OR`` keeps the query correct while still letting the - index help for the rows that do have it. - - Only the WHERE clause is rewritten (the bloom filter index only helps - filtering), and only the positive ``equals`` / ``in`` operators against - non-empty string literals are touched — for those the ``OR`` fallback is - correctness-preserving (see ``_is_index_safe_value`` for why the empty-string - default is excluded). Everything else (negations, guarded forms, comparisons - to the column default, SELECT/GROUP BY references) keeps reading the bucket - and is left untouched. + -> equals(indexed_name, 'db.query') The EAP RPC builder emits the ``arrayElement(attributes_string, key)`` form (see ``snuba.web.rpc.common.common._map_backed_operands``); the equivalent hand-written SnQL ``SubscriptableReference`` form is handled too. This runs - before ``HashBucketFunctionTransformer`` so the bucket branch still gets - mapped to its hashed bucket column downstream. + before ``HashBucketFunctionTransformer``, so untouched accesses still get + mapped to their hashed bucket column downstream. + + ``indexed_name`` was added (with no backfill) by migration + ``0057_add_name_column_and_index``, so rows written before that migration — + or before the value started being populated — have an empty ``indexed_name`` + while still carrying the value in ``attributes_string``. Reading + ``indexed_name`` for those rows would silently drop them, so the rewrite is + gated behind the ``CONFIG_KEY`` runtime flag (default off) and must only be + enabled once ``indexed_name`` is fully populated for the live retention + window. The promotion is item-type specific, so the rewrite is only applied when the query is unambiguously scoped (via an ``item_type`` equality condition) to @@ -69,6 +56,11 @@ class IndexedNameOptimizer(LogicalQueryProcessor): falls back to the bucket lookup. """ + # Runtime flag gating the rewrite. Off (0) until ``indexed_name`` has been + # backfilled across the retention window; flipping it on lets the bloom + # filter index serve ``sentry.op`` / ``sentry.metric.name`` filters. + CONFIG_KEY = "eap_items_use_indexed_name" + # item_type proto enum value -> attribute promoted into ``indexed_name`` INDEXED_NAME_KEY_BY_ITEM_TYPE: dict[int, str] = { TraceItemType.TRACE_ITEM_TYPE_SPAN: "sentry.op", @@ -78,10 +70,6 @@ class IndexedNameOptimizer(LogicalQueryProcessor): INDEXED_NAME_COLUMN = "indexed_name" ATTRIBUTES_STRING_COLUMN = "attributes_string" - # Operators whose granule pruning the ``indexed_name`` bloom filter can serve - # and for which an OR fallback to the bucket lookup is correctness-preserving. - OPTIMIZABLE_OPS = frozenset({ConditionFunctions.EQ, ConditionFunctions.IN}) - def _indexed_name_key(self, query: Query) -> Optional[str]: """Return the attribute name promoted into ``indexed_name`` for this query, or ``None`` if the query is not unambiguously scoped to a single @@ -111,8 +99,8 @@ def _indexed_name_ref(self, exp: Expression, key: str) -> Optional[Column]: """If ``exp`` is an ``attributes_string[key]`` access — either the ``arrayElement(attributes_string, key)`` form emitted by the EAP RPC builder or the hand-written ``SubscriptableReference`` form — return the - equivalent reference to the ``indexed_name`` column (carrying over the - table name). Otherwise return ``None``.""" + equivalent reference to the ``indexed_name`` column (preserving the alias + and table name). Otherwise return ``None``.""" if ( isinstance(exp, FunctionCall) and exp.function_name == "arrayElement" @@ -122,7 +110,7 @@ def _indexed_name_ref(self, exp: Expression, key: str) -> Optional[Column]: and isinstance(exp.parameters[1], Literal) and exp.parameters[1].value == key ): - return Column(None, exp.parameters[0].table_name, self.INDEXED_NAME_COLUMN) + return Column(exp.alias, exp.parameters[0].table_name, self.INDEXED_NAME_COLUMN) if ( isinstance(exp, SubscriptableReference) @@ -130,71 +118,20 @@ def _indexed_name_ref(self, exp: Expression, key: str) -> Optional[Column]: and isinstance(exp.key, Literal) and exp.key.value == key ): - return Column(None, exp.column.table_name, self.INDEXED_NAME_COLUMN) + return Column(exp.alias, exp.column.table_name, self.INDEXED_NAME_COLUMN) return None - def _is_index_safe_value(self, function_name: str, rhs: Expression) -> bool: - """An empty string is the ``String`` column default, which non-backfilled - rows (empty ``indexed_name``) also read as — so ``indexed_name = ''`` would - match every such row regardless of its real value. Only probe - ``indexed_name`` for non-empty string literals, where a match means the - value was actually populated; the bucket fallback still covers the rest. - This mirrors ``_comparison_can_match_column_default`` on the RPC side: the - cases it guards are exactly the ones we must not push onto the index.""" - - def safe_literal(exp: Expression) -> bool: - return isinstance(exp, Literal) and isinstance(exp.value, str) and exp.value != "" - - if function_name == ConditionFunctions.EQ: - return safe_literal(rhs) - # IN: the right-hand side is an ``array(...)`` of literals. - if function_name == ConditionFunctions.IN: - return ( - isinstance(rhs, FunctionCall) - and rhs.function_name == "array" - and len(rhs.parameters) > 0 - and all(safe_literal(p) for p in rhs.parameters) - ) - return False - - def _maybe_rewrite_conjunct(self, conjunct: Expression, key: str) -> Expression: - """Rewrite a top-level ``equals``/``in`` filter on the indexed attribute - into ``or(, )``. Any other - conjunct is returned unchanged.""" - if ( - not isinstance(conjunct, FunctionCall) - or conjunct.function_name not in self.OPTIMIZABLE_OPS - or len(conjunct.parameters) != 2 - ): - return conjunct - - indexed_ref = self._indexed_name_ref(conjunct.parameters[0], key) - if indexed_ref is None: - return conjunct - - if not self._is_index_safe_value(conjunct.function_name, conjunct.parameters[1]): - return conjunct - - indexed_branch = FunctionCall( - None, - conjunct.function_name, - (indexed_ref, conjunct.parameters[1]), - ) - return or_cond(indexed_branch, conjunct) - def process_query(self, query: Query, query_settings: QuerySettings) -> None: - key = self._indexed_name_key(query) - if key is None: + if state.get_int_config(self.CONFIG_KEY, 0) == 0: return - condition = query.get_condition() - if condition is None: + key = self._indexed_name_key(query) + if key is None: return - conjuncts = list(get_first_level_and_conditions(condition)) - rewritten = [self._maybe_rewrite_conjunct(cond, key) for cond in conjuncts] - if rewritten == conjuncts: - return + def transform(exp: Expression) -> Expression: + indexed_ref = self._indexed_name_ref(exp, key) + return indexed_ref if indexed_ref is not None else exp - query.set_ast_condition(combine_and_conditions(rewritten)) + query.transform_expressions(transform) diff --git a/tests/query/processors/test_indexed_name_optimizer.py b/tests/query/processors/test_indexed_name_optimizer.py index 2dd9a074ade..cf8cea31236 100644 --- a/tests/query/processors/test_indexed_name_optimizer.py +++ b/tests/query/processors/test_indexed_name_optimizer.py @@ -15,7 +15,6 @@ in_cond, literal, literals_array, - or_cond, ) from snuba.query.expressions import ( Column, @@ -29,6 +28,7 @@ IndexedNameOptimizer, ) from snuba.query.query_settings import HTTPQuerySettings +from snuba.state import set_config # Use the protobuf enum values directly so the tests stay in sync with the # production dependency the optimizer keys off of. @@ -74,13 +74,10 @@ def _query( _query( and_cond( equals(column("item_type"), literal(SPAN)), - or_cond( - equals(column("indexed_name"), literal("db.query")), - equals(_attr_arr("sentry.op"), literal("db.query")), - ), + equals(column("indexed_name"), literal("db.query")), ) ), - id="span sentry.op (arrayElement) gets an indexed_name OR fallback", + id="span sentry.op (arrayElement) rewritten to indexed_name", ), pytest.param( _query( @@ -92,13 +89,10 @@ def _query( _query( and_cond( equals(column("item_type"), literal(SPAN)), - or_cond( - equals(column("indexed_name"), literal("db.query")), - equals(_attr_str("sentry.op"), literal("db.query")), - ), + equals(column("indexed_name"), literal("db.query")), ) ), - id="span sentry.op (SubscriptableReference) gets an indexed_name OR fallback", + id="span sentry.op (SubscriptableReference) rewritten to indexed_name", ), pytest.param( _query( @@ -110,13 +104,10 @@ def _query( _query( and_cond( equals(column("item_type"), literal(METRIC)), - or_cond( - equals(column("indexed_name"), literal("my.metric")), - equals(_attr_arr("sentry.metric.name"), literal("my.metric")), - ), + equals(column("indexed_name"), literal("my.metric")), ) ), - id="metric sentry.metric.name gets an indexed_name OR fallback", + id="metric sentry.metric.name rewritten to indexed_name", ), pytest.param( _query( @@ -131,19 +122,31 @@ def _query( _query( and_cond( equals(column("item_type"), literal(SPAN)), - or_cond( - in_cond( - column("indexed_name"), - literals_array(None, [literal("db.query"), literal("http.client")]), - ), - in_cond( - _attr_arr("sentry.op"), - literals_array(None, [literal("db.query"), literal("http.client")]), - ), + in_cond( + column("indexed_name"), + literals_array(None, [literal("db.query"), literal("http.client")]), ), ) ), - id="span sentry.op IN gets an indexed_name OR fallback", + id="span sentry.op IN rewritten to indexed_name", + ), + pytest.param( + # The rewrite is a value substitution, so it applies to any operator the + # access appears under (the index only helps equals/in, but substituting + # is always correct once indexed_name is populated). + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + FunctionCall(None, "notEquals", (_attr_arr("sentry.op"), literal("db.query"))), + ) + ), + _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + FunctionCall(None, "notEquals", (column("indexed_name"), literal("db.query"))), + ) + ), + id="any operator on the access is rewritten (notEquals)", ), pytest.param( _query( @@ -195,97 +198,6 @@ def _query( ), id="unsupported item_type leaves bucket lookup", ), - pytest.param( - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - FunctionCall( - None, - "notEquals", - (_attr_arr("sentry.op"), literal("db.query")), - ), - ) - ), - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - FunctionCall( - None, - "notEquals", - (_attr_arr("sentry.op"), literal("db.query")), - ), - ) - ), - id="negated filter is left untouched (OR fallback would be incorrect)", - ), - pytest.param( - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - equals(_attr_arr("sentry.op"), literal("")), - ) - ), - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - equals(_attr_arr("sentry.op"), literal("")), - ) - ), - id="empty-string equals is left untouched (indexed_name='' over-matches)", - ), - pytest.param( - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - in_cond( - _attr_arr("sentry.op"), - literals_array(None, [literal("db.query"), literal("")]), - ), - ) - ), - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - in_cond( - _attr_arr("sentry.op"), - literals_array(None, [literal("db.query"), literal("")]), - ), - ) - ), - id="IN containing the empty string is left untouched", - ), - pytest.param( - # The guarded form the RPC emits when the value is the column default: - # and(mapContains(...), equals(arrayElement(...), '')). The empty value - # makes the indexed_name probe unsafe, so the whole thing is left alone. - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - and_cond( - FunctionCall( - None, - "mapContains", - (column("attributes_string"), literal("sentry.op")), - ), - equals(_attr_arr("sentry.op"), literal("")), - ), - ) - ), - _query( - and_cond( - equals(column("item_type"), literal(SPAN)), - and_cond( - FunctionCall( - None, - "mapContains", - (column("attributes_string"), literal("sentry.op")), - ), - equals(_attr_arr("sentry.op"), literal("")), - ), - ) - ), - id="guarded empty-value form is left untouched", - ), pytest.param( _query( condition=equals(column("item_type"), literal(SPAN)), @@ -296,23 +208,42 @@ def _query( _query( condition=equals(column("item_type"), literal(SPAN)), selected_columns=[ - SelectedExpression("op", _attr_str("sentry.op", alias="op")), + SelectedExpression("op", Column("op", None, "indexed_name")), ], ), - id="SELECT references are left on the bucket (index only helps WHERE)", + id="select preserves alias on rewrite", ), ] +@pytest.mark.redis_db @pytest.mark.parametrize("pre_format, expected_query", test_data) def test_indexed_name_optimizer(pre_format: Query, expected_query: Query) -> None: + set_config(IndexedNameOptimizer.CONFIG_KEY, 1) copy = deepcopy(pre_format) IndexedNameOptimizer().process_query(copy, HTTPQuerySettings()) assert copy.get_selected_columns() == expected_query.get_selected_columns() assert copy.get_condition() == expected_query.get_condition() +@pytest.mark.redis_db +def test_disabled_by_default_leaves_query_untouched() -> None: + set_config(IndexedNameOptimizer.CONFIG_KEY, 0) + query = _query( + and_cond( + equals(column("item_type"), literal(SPAN)), + equals(_attr_arr("sentry.op"), literal("db.query")), + ) + ) + copy = deepcopy(query) + IndexedNameOptimizer().process_query(copy, HTTPQuerySettings()) + assert copy.get_condition() == query.get_condition() + assert copy.get_selected_columns() == query.get_selected_columns() + + +@pytest.mark.redis_db def test_contradictory_item_types_left_untouched() -> None: + set_config(IndexedNameOptimizer.CONFIG_KEY, 1) query = _query( and_cond( equals(column("item_type"), literal(SPAN)), From 0ad79ee954068507deacd480df42756ac1f8b2f2 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 04:56:09 +0000 Subject: [PATCH 7/8] fix(eap-items): keep indexed_name key typed as str inside the transform closure mypy widens the narrowed Optional[str] back to Optional[str] inside the nested transform() closure, failing the pre-commit mypy hook. Bind it to a str-typed local before the closure. Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Hax3wseAtUEKY7GfpcJNTx --- snuba/query/processors/logical/indexed_name_optimizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/snuba/query/processors/logical/indexed_name_optimizer.py b/snuba/query/processors/logical/indexed_name_optimizer.py index 9bc44810d17..bec0de541e7 100644 --- a/snuba/query/processors/logical/indexed_name_optimizer.py +++ b/snuba/query/processors/logical/indexed_name_optimizer.py @@ -130,8 +130,12 @@ def process_query(self, query: Query, query_settings: QuerySettings) -> None: if key is None: return + # Bind to a str-typed local: mypy widens a narrowed enclosing-scope + # variable back to its declared Optional[str] inside the nested closure. + indexed_key: str = key + def transform(exp: Expression) -> Expression: - indexed_ref = self._indexed_name_ref(exp, key) + indexed_ref = self._indexed_name_ref(exp, indexed_key) return indexed_ref if indexed_ref is not None else exp query.transform_expressions(transform) From 3f6fe40178f752bc7c92c7f87dfdb229b4bc9a71 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 19 Jun 2026 14:45:55 +0000 Subject: [PATCH 8/8] test(eap-items): end-to-end time series test for metric-name filter via indexed_name Add an EndpointTimeSeries test that writes METRIC items and runs a sentry.metric.name-filtered SUM time series with the IndexedNameOptimizer flag enabled. Asserts the name filter still returns the correct rows once it's rewritten onto the indexed_name column, and that results are identical with the flag off (bucket lookup) and on (indexed_name). Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Hax3wseAtUEKY7GfpcJNTx --- .../test_endpoint_time_series_indexed_name.py | 130 ++++++++++++++++++ 1 file changed, 130 insertions(+) create mode 100644 tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_indexed_name.py diff --git a/tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_indexed_name.py b/tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_indexed_name.py new file mode 100644 index 00000000000..8ed925873f8 --- /dev/null +++ b/tests/web/rpc/v1/test_endpoint_time_series/test_endpoint_time_series_indexed_name.py @@ -0,0 +1,130 @@ +from datetime import UTC, datetime, timedelta + +import pytest +from google.protobuf.timestamp_pb2 import Timestamp +from sentry_protos.snuba.v1.endpoint_time_series_pb2 import ( + TimeSeriesRequest, + TimeSeriesResponse, +) +from sentry_protos.snuba.v1.request_common_pb2 import RequestMeta, TraceItemType +from sentry_protos.snuba.v1.trace_item_attribute_pb2 import ( + AttributeAggregation, + AttributeKey, + AttributeValue, + ExtrapolationMode, + Function, +) +from sentry_protos.snuba.v1.trace_item_filter_pb2 import ( + ComparisonFilter, + TraceItemFilter, +) +from sentry_protos.snuba.v1.trace_item_pb2 import AnyValue + +from snuba.datasets.storages.factory import get_storage +from snuba.datasets.storages.storage_key import StorageKey +from snuba.query.processors.logical.indexed_name_optimizer import IndexedNameOptimizer +from snuba.state import set_config +from snuba.web.rpc.v1.endpoint_time_series import EndpointTimeSeries +from tests.base import BaseApiTest +from tests.helpers import write_raw_unprocessed_events +from tests.web.rpc.v1.test_utils import gen_item_message + +BASE_TIME = datetime.utcnow().replace( + hour=8, minute=0, second=0, microsecond=0, tzinfo=UTC +) - timedelta(hours=24) + +GRANULARITY_SECS = 300 +QUERY_DURATION = 3600 + +# A matching metric is written this many times; each carries value=1.0, so a SUM +# over the matching items totals MATCHING_COUNT. +MATCHING_COUNT = 6 +NON_MATCHING_COUNT = 4 + + +def _store_metrics() -> None: + """Write METRIC items: ``MATCHING_COUNT`` named ``my.metric`` plus + ``NON_MATCHING_COUNT`` named ``other.metric``. The rust processor promotes + ``sentry.metric.name`` into the ``indexed_name`` column (see + ``rust_snuba/src/processors/eap_items.rs``), which is what the optimizer + rewrites the name filter onto.""" + messages: list[bytes] = [] + for name, count in (("my.metric", MATCHING_COUNT), ("other.metric", NON_MATCHING_COUNT)): + for _ in range(count): + messages.append( + gen_item_message( + start_timestamp=BASE_TIME + timedelta(seconds=60), + type=TraceItemType.TRACE_ITEM_TYPE_METRIC, + remove_default_attributes=True, + attributes={ + "sentry.metric.name": AnyValue(string_value=name), + "value": AnyValue(double_value=1.0), + }, + ) + ) + write_raw_unprocessed_events(get_storage(StorageKey("eap_items")), messages) # type: ignore + + +def _request() -> TimeSeriesRequest: + """SUM(value) over metrics named ``my.metric``. The name filter is what the + IndexedNameOptimizer rewrites onto the indexed_name column for metrics.""" + return TimeSeriesRequest( + meta=RequestMeta( + project_ids=[1], + organization_id=1, + cogs_category="something", + referrer="something", + start_timestamp=Timestamp(seconds=int(BASE_TIME.timestamp())), + end_timestamp=Timestamp(seconds=int(BASE_TIME.timestamp()) + QUERY_DURATION), + trace_item_type=TraceItemType.TRACE_ITEM_TYPE_METRIC, + ), + filter=TraceItemFilter( + comparison_filter=ComparisonFilter( + key=AttributeKey(type=AttributeKey.TYPE_STRING, name="sentry.metric.name"), + op=ComparisonFilter.OP_EQUALS, + value=AttributeValue(val_str="my.metric"), + ) + ), + aggregations=[ + AttributeAggregation( + aggregate=Function.FUNCTION_SUM, + key=AttributeKey(type=AttributeKey.TYPE_FLOAT, name="value"), + label="sum", + extrapolation_mode=ExtrapolationMode.EXTRAPOLATION_MODE_NONE, + ), + ], + granularity_secs=GRANULARITY_SECS, + ) + + +def _total(response: TimeSeriesResponse) -> float: + return float(sum(dp.data for ts in response.result_timeseries for dp in ts.data_points)) + + +@pytest.mark.eap +@pytest.mark.redis_db +class TestTimeSeriesIndexedName(BaseApiTest): + def test_metric_name_filter_with_indexed_name_enabled(self) -> None: + """A metric-name-filtered time series returns the right rows when the + optimizer rewrites the name filter onto the indexed_name column.""" + _store_metrics() + set_config(IndexedNameOptimizer.CONFIG_KEY, 1) + + response = EndpointTimeSeries().execute(_request()) + + # Only the my.metric items (value=1.0 each) match; other.metric is excluded. + assert _total(response) == float(MATCHING_COUNT) + + def test_indexed_name_rewrite_is_result_preserving(self) -> None: + """The rewrite must not change results: the same request returns the same + time series with the flag off (bucket lookup) and on (indexed_name).""" + _store_metrics() + + set_config(IndexedNameOptimizer.CONFIG_KEY, 0) + disabled = EndpointTimeSeries().execute(_request()) + + set_config(IndexedNameOptimizer.CONFIG_KEY, 1) + enabled = EndpointTimeSeries().execute(_request()) + + assert _total(disabled) == float(MATCHING_COUNT) + assert list(enabled.result_timeseries) == list(disabled.result_timeseries)