From 81a64db9a69930dcf64ecc1baf3a4bb80eb2a397 Mon Sep 17 00:00:00 2001 From: Alberto Leal Date: Wed, 15 Apr 2026 14:21:32 -0400 Subject: [PATCH 1/4] feat(outcomes): Route queries to daily table when time range exceeds retention Queries whose lower timestamp bound is older than 90 days are now routed to the outcomes_daily table instead of outcomes_hourly. The hourly table only retains ~90 days of data, so older queries would return incomplete results. Uses get_time_range() to extract the lower timestamp bound from the query AST. TimeSeriesProcessor runs before storage selection in the entity pipeline, so datetime Literals are available for inspection. Existing behavior is preserved: - OutcomesQuerySettings(use_daily=True) still routes to daily - OutcomesQuerySettings(use_daily=False) still routes to hourly - Queries within the 90-day window still route to hourly --- .../entities/storage_selectors/outcomes.py | 30 ++++- .../storage_selectors/test_outcomes.py | 125 +++++++++++++++--- 2 files changed, 135 insertions(+), 20 deletions(-) diff --git a/snuba/datasets/entities/storage_selectors/outcomes.py b/snuba/datasets/entities/storage_selectors/outcomes.py index 4ad94ec2d03..528a6d90b16 100644 --- a/snuba/datasets/entities/storage_selectors/outcomes.py +++ b/snuba/datasets/entities/storage_selectors/outcomes.py @@ -1,5 +1,7 @@ +from datetime import datetime, timedelta, timezone from typing import Sequence +from snuba.clickhouse.query_dsl.accessors import get_time_range from snuba.datasets.entities.storage_selectors import QueryStorageSelector from snuba.datasets.entities.storage_selectors.selector import QueryStorageSelectorError from snuba.datasets.storage import EntityStorageConnection, ReadableTableStorage @@ -7,11 +9,22 @@ from snuba.query.logical import Query from snuba.query.query_settings import OutcomesQuerySettings, QuerySettings +# Queries starting earlier than this threshold are routed to the daily table +# because the hourly table only retains ~90 days of data. +_DAILY_THRESHOLD = timedelta(days=90) + class OutcomesStorageSelector(QueryStorageSelector): """ - Outcomes storage selector to decide whether to query the hourly or daily - outcomes tables + Outcomes storage selector that decides whether to query the hourly or + daily outcomes tables. + + Routing priority: + 1. OutcomesQuerySettings — honours the explicit use_daily flag. + 2. Time-range — if the query's lower timestamp bound is older than 90 + days, route to daily (the hourly table does not retain data that far + back). + 3. Default — hourly. """ def __init__(self) -> None: @@ -29,7 +42,7 @@ def select_storage( self.daily_storage if query_settings.get_use_daily() else self.hourly_storage ) else: - outcomes_key = self.hourly_storage + outcomes_key = self._route_by_time_range(query) for storage_connection in storage_connections: assert isinstance(storage_connection.storage, ReadableTableStorage) @@ -39,3 +52,14 @@ def select_storage( raise QueryStorageSelectorError( "The specified storage in selector does not exist in storage list." ) + + def _route_by_time_range(self, query: Query) -> StorageKey: + # Route to daily if the query reaches beyond the hourly table's + # retention window (~90 days). + lower_bound, _ = get_time_range(query, "timestamp") + if lower_bound is not None: + cutoff = datetime.now(timezone.utc) - _DAILY_THRESHOLD + if lower_bound < cutoff: + return self.daily_storage + + return self.hourly_storage diff --git a/tests/datasets/entities/storage_selectors/test_outcomes.py b/tests/datasets/entities/storage_selectors/test_outcomes.py index 6518710f293..3a280f211e2 100644 --- a/tests/datasets/entities/storage_selectors/test_outcomes.py +++ b/tests/datasets/entities/storage_selectors/test_outcomes.py @@ -1,3 +1,5 @@ +from datetime import datetime, timedelta, timezone + import pytest from snuba.datasets.entities.entity_key import EntityKey @@ -6,7 +8,9 @@ from snuba.datasets.storage import Storage from snuba.datasets.storages.factory import get_storage from snuba.datasets.storages.storage_key import StorageKey +from snuba.query.conditions import BooleanFunctions, ConditionFunctions, binary_condition from snuba.query.data_source.simple import Entity +from snuba.query.expressions import Column, Literal from snuba.query.logical import Query from snuba.query.query_settings import HTTPQuerySettings, OutcomesQuerySettings @@ -18,30 +22,117 @@ DAILY = get_storage(StorageKey.OUTCOMES_DAILY) HOURLY = get_storage(StorageKey.OUTCOMES_HOURLY) -TEST_CASES = [ - pytest.param(OutcomesQuerySettings(), HOURLY), - pytest.param(OutcomesQuerySettings(use_daily=True), DAILY), - pytest.param(HTTPQuerySettings(), HOURLY), + +def _make_timestamp_condition(start: datetime, end: datetime): + """Build a ``timestamp >= start AND timestamp < end`` condition node.""" + return binary_condition( + BooleanFunctions.AND, + binary_condition( + ConditionFunctions.GTE, + Column(None, None, "timestamp"), + Literal(None, start), + ), + binary_condition( + ConditionFunctions.LT, + Column(None, None, "timestamp"), + Literal(None, end), + ), + ) + + +def _query_with_timestamps(start: datetime, end: datetime) -> Query: + """Return a Query whose WHERE clause contains a timestamp range.""" + return Query( + from_clause=OUTCOMES_ENTITY, + condition=_make_timestamp_condition(start, end), + ) + + +_NOW = datetime.now(timezone.utc) +_OLD_START = _NOW - timedelta(days=120) # >90 days ago +_OLD_END = _NOW - timedelta(days=90) +_RECENT_START = _NOW - timedelta(days=30) # <90 days ago +_RECENT_END = _NOW - timedelta(days=1) + +# --- Test cases without timestamp conditions (query is irrelevant) ---------- + +NO_TIMESTAMP_CASES = [ + pytest.param(OutcomesQuerySettings(), HOURLY, id="outcomes_settings_default_hourly"), + pytest.param(OutcomesQuerySettings(use_daily=True), DAILY, id="outcomes_settings_use_daily"), + pytest.param(OutcomesQuerySettings(use_daily=False), HOURLY, id="outcomes_settings_no_daily"), + pytest.param(HTTPQuerySettings(), HOURLY, id="no_timestamp_default_hourly"), + pytest.param( + HTTPQuerySettings(referrer="outcomes.timeseries"), + HOURLY, + id="no_timestamp_non_billing_hourly", + ), ] -@pytest.mark.parametrize( - "settings, expected_storage", - TEST_CASES, -) -def test_storage_selector( +@pytest.mark.parametrize("settings, expected_storage", NO_TIMESTAMP_CASES) +def test_storage_selector_no_timestamp( settings: HTTPQuerySettings, expected_storage: Storage, ) -> None: """ - Test that we route queries to either hourly or daily outcomes tables - based on the `use_daily` setting passed through in OutcomesQuerySettings - If just HTTPQuerySettings, then uses hourly table. + Routing without timestamp conditions in the query. + + - OutcomesQuerySettings with use_daily=True -> daily. + - Everything else -> hourly. """ - unimportant_query = Query(from_clause=OUTCOMES_ENTITY) + query = Query(from_clause=OUTCOMES_ENTITY) connections = get_entity(EntityKey.OUTCOMES).get_all_storage_connections() - selected_storage = OutcomesStorageSelector().select_storage( - unimportant_query, settings, connections - ) - assert selected_storage.storage == expected_storage + selected = OutcomesStorageSelector().select_storage(query, settings, connections) + assert selected.storage == expected_storage + + +# --- Test cases with timestamp conditions (time-range routing) ---------------- + +TIMESTAMP_CASES = [ + # >90 days ago -> daily, regardless of referrer + pytest.param( + _query_with_timestamps(_OLD_START, _OLD_END), + HTTPQuerySettings(referrer="outcomes.timeseries"), + DAILY, + id="old_range_non_billing_daily", + ), + pytest.param( + _query_with_timestamps(_OLD_START, _OLD_END), + HTTPQuerySettings(), + DAILY, + id="old_range_default_daily", + ), + # <90 days ago -> hourly + pytest.param( + _query_with_timestamps(_RECENT_START, _RECENT_END), + HTTPQuerySettings(referrer="outcomes.timeseries"), + HOURLY, + id="recent_range_non_billing_hourly", + ), + pytest.param( + _query_with_timestamps(_RECENT_START, _RECENT_END), + HTTPQuerySettings(), + HOURLY, + id="recent_range_default_hourly", + ), +] + + +@pytest.mark.parametrize("query, settings, expected_storage", TIMESTAMP_CASES) +def test_storage_selector_with_timestamps( + query: Query, + settings: HTTPQuerySettings, + expected_storage: Storage, +) -> None: + """ + Time-range routing: queries reaching beyond the hourly table's 90-day + retention are routed to the daily table. + + - Query start >90 days ago -> daily (hourly table lacks the data). + - Query start <90 days ago -> hourly. + """ + connections = get_entity(EntityKey.OUTCOMES).get_all_storage_connections() + + selected = OutcomesStorageSelector().select_storage(query, settings, connections) + assert selected.storage == expected_storage From f2d038c52d87df81b0b2ef63f2f0079a4f52ba74 Mon Sep 17 00:00:00 2001 From: Alberto Leal Date: Wed, 15 Apr 2026 17:29:55 -0400 Subject: [PATCH 2/4] feat(outcomes): Add billing referrer fallback for daily table routing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Billing queries need 13-month retention available only in the daily table. Even when the query's time range is within the 90-day hourly window, billing queries should route to daily for consistent results across the full billing history. Add a referrer-based fallback: queries with a referrer starting with "billing." are routed to the daily table. This is triggered by sentry's UsageService which uses referrer "billing.usage_service.clickhouse". Routing priority: 1. OutcomesQuerySettings — explicit use_daily flag 2. Time-range — lower timestamp bound older than 90 days 3. Referrer — starts with "billing." 4. Default — hourly --- .../entities/storage_selectors/outcomes.py | 16 +++++++-- .../storage_selectors/test_outcomes.py | 35 ++++++++++++------- 2 files changed, 36 insertions(+), 15 deletions(-) diff --git a/snuba/datasets/entities/storage_selectors/outcomes.py b/snuba/datasets/entities/storage_selectors/outcomes.py index 528a6d90b16..ce987d29ce0 100644 --- a/snuba/datasets/entities/storage_selectors/outcomes.py +++ b/snuba/datasets/entities/storage_selectors/outcomes.py @@ -24,7 +24,10 @@ class OutcomesStorageSelector(QueryStorageSelector): 2. Time-range — if the query's lower timestamp bound is older than 90 days, route to daily (the hourly table does not retain data that far back). - 3. Default — hourly. + 3. Referrer — if the referrer starts with "billing.", route to daily + (billing queries need 13-month retention only available in the daily + table). + 4. Default — hourly. """ def __init__(self) -> None: @@ -42,7 +45,7 @@ def select_storage( self.daily_storage if query_settings.get_use_daily() else self.hourly_storage ) else: - outcomes_key = self._route_by_time_range(query) + outcomes_key = self._route_by_time_and_referrer(query, query_settings) for storage_connection in storage_connections: assert isinstance(storage_connection.storage, ReadableTableStorage) @@ -53,7 +56,9 @@ def select_storage( "The specified storage in selector does not exist in storage list." ) - def _route_by_time_range(self, query: Query) -> StorageKey: + def _route_by_time_and_referrer( + self, query: Query, query_settings: QuerySettings + ) -> StorageKey: # Route to daily if the query reaches beyond the hourly table's # retention window (~90 days). lower_bound, _ = get_time_range(query, "timestamp") @@ -62,4 +67,9 @@ def _route_by_time_range(self, query: Query) -> StorageKey: if lower_bound < cutoff: return self.daily_storage + # Billing queries need 13-month retention available only in the + # daily table. The referrer is set by sentry's UsageService. + if hasattr(query_settings, "referrer") and query_settings.referrer.startswith("billing."): + return self.daily_storage + return self.hourly_storage diff --git a/tests/datasets/entities/storage_selectors/test_outcomes.py b/tests/datasets/entities/storage_selectors/test_outcomes.py index 3a280f211e2..004872df009 100644 --- a/tests/datasets/entities/storage_selectors/test_outcomes.py +++ b/tests/datasets/entities/storage_selectors/test_outcomes.py @@ -66,6 +66,16 @@ def _query_with_timestamps(start: datetime, end: datetime) -> Query: HOURLY, id="no_timestamp_non_billing_hourly", ), + pytest.param( + HTTPQuerySettings(referrer="billing.usage_service.clickhouse"), + DAILY, + id="no_timestamp_billing_referrer_daily", + ), + pytest.param( + HTTPQuerySettings(referrer="billing.anything"), + DAILY, + id="no_timestamp_billing_prefix_daily", + ), ] @@ -78,6 +88,7 @@ def test_storage_selector_no_timestamp( Routing without timestamp conditions in the query. - OutcomesQuerySettings with use_daily=True -> daily. + - Referrers starting with "billing." -> daily (13-month retention). - Everything else -> hourly. """ query = Query(from_clause=OUTCOMES_ENTITY) @@ -87,7 +98,7 @@ def test_storage_selector_no_timestamp( assert selected.storage == expected_storage -# --- Test cases with timestamp conditions (time-range routing) ---------------- +# --- Test cases with timestamp conditions (hybrid routing) ------------------ TIMESTAMP_CASES = [ # >90 days ago -> daily, regardless of referrer @@ -99,22 +110,22 @@ def test_storage_selector_no_timestamp( ), pytest.param( _query_with_timestamps(_OLD_START, _OLD_END), - HTTPQuerySettings(), + HTTPQuerySettings(referrer="billing.anything"), DAILY, - id="old_range_default_daily", + id="old_range_billing_daily", ), - # <90 days ago -> hourly + # <90 days ago -> referrer fallback pytest.param( _query_with_timestamps(_RECENT_START, _RECENT_END), - HTTPQuerySettings(referrer="outcomes.timeseries"), - HOURLY, - id="recent_range_non_billing_hourly", + HTTPQuerySettings(referrer="billing.anything"), + DAILY, + id="recent_range_billing_daily", ), pytest.param( _query_with_timestamps(_RECENT_START, _RECENT_END), - HTTPQuerySettings(), + HTTPQuerySettings(referrer="outcomes.timeseries"), HOURLY, - id="recent_range_default_hourly", + id="recent_range_non_billing_hourly", ), ] @@ -126,11 +137,11 @@ def test_storage_selector_with_timestamps( expected_storage: Storage, ) -> None: """ - Time-range routing: queries reaching beyond the hourly table's 90-day - retention are routed to the daily table. + Hybrid routing: time-range check takes priority over referrer. - Query start >90 days ago -> daily (hourly table lacks the data). - - Query start <90 days ago -> hourly. + - Query start <90 days ago + billing referrer -> daily (referrer fallback). + - Query start <90 days ago + non-billing referrer -> hourly. """ connections = get_entity(EntityKey.OUTCOMES).get_all_storage_connections() From 68b639bf07af993615bbcacbd764eb47d3d31520 Mon Sep 17 00:00:00 2001 From: Alberto Leal Date: Thu, 16 Apr 2026 20:20:49 -0400 Subject: [PATCH 3/4] fix(tests): add return type annotation for _make_timestamp_condition --- tests/datasets/entities/storage_selectors/test_outcomes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/datasets/entities/storage_selectors/test_outcomes.py b/tests/datasets/entities/storage_selectors/test_outcomes.py index 004872df009..a098ce1f6bb 100644 --- a/tests/datasets/entities/storage_selectors/test_outcomes.py +++ b/tests/datasets/entities/storage_selectors/test_outcomes.py @@ -10,7 +10,7 @@ from snuba.datasets.storages.storage_key import StorageKey from snuba.query.conditions import BooleanFunctions, ConditionFunctions, binary_condition from snuba.query.data_source.simple import Entity -from snuba.query.expressions import Column, Literal +from snuba.query.expressions import Column, FunctionCall, Literal from snuba.query.logical import Query from snuba.query.query_settings import HTTPQuerySettings, OutcomesQuerySettings @@ -23,7 +23,7 @@ HOURLY = get_storage(StorageKey.OUTCOMES_HOURLY) -def _make_timestamp_condition(start: datetime, end: datetime): +def _make_timestamp_condition(start: datetime, end: datetime) -> FunctionCall: """Build a ``timestamp >= start AND timestamp < end`` condition node.""" return binary_condition( BooleanFunctions.AND, From 418568b98fcc6d34856c816443ce2753669a5617 Mon Sep 17 00:00:00 2001 From: Meredith Heller Date: Thu, 23 Apr 2026 12:43:02 -0700 Subject: [PATCH 4/4] fix test --- snuba/datasets/entities/storage_selectors/outcomes.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/snuba/datasets/entities/storage_selectors/outcomes.py b/snuba/datasets/entities/storage_selectors/outcomes.py index ce987d29ce0..7f89709cadb 100644 --- a/snuba/datasets/entities/storage_selectors/outcomes.py +++ b/snuba/datasets/entities/storage_selectors/outcomes.py @@ -1,4 +1,4 @@ -from datetime import datetime, timedelta, timezone +from datetime import UTC, datetime, timedelta from typing import Sequence from snuba.clickhouse.query_dsl.accessors import get_time_range @@ -63,8 +63,11 @@ def _route_by_time_and_referrer( # retention window (~90 days). lower_bound, _ = get_time_range(query, "timestamp") if lower_bound is not None: - cutoff = datetime.now(timezone.utc) - _DAILY_THRESHOLD - if lower_bound < cutoff: + lower_bound_tz = ( + lower_bound if lower_bound.tzinfo is not None else lower_bound.replace(tzinfo=UTC) + ) + cutoff = datetime.now(UTC) - _DAILY_THRESHOLD + if lower_bound_tz < cutoff: return self.daily_storage # Billing queries need 13-month retention available only in the