From 1767ad2c4adcd7a21bb70cf512e9e8379f10a11b Mon Sep 17 00:00:00 2001 From: aschumann-virtualcable Date: Fri, 8 May 2026 16:25:45 +0200 Subject: [PATCH] Optimizes report queries and improves session pairing logic Refactors report generation to reduce database queries by batching and bucketizing event data in Python, significantly improving performance for time-based aggregations. Updates session calculation to use SQL window functions for accurate LOGIN/LOGOUT pairing, ensuring correctness even with out-of-order events. Standardizes time zone handling and interval validation for consistency across reports. --- .../uds/reports/stats/pool_users_summary.py | 53 ++++----- .../uds/reports/stats/pools_performance.py | 111 ++++++++++-------- server/src/uds/reports/stats/usage_by_pool.py | 89 +++++++------- server/src/uds/reports/stats/user_access.py | 85 ++++++++------ 4 files changed, 184 insertions(+), 154 deletions(-) diff --git a/server/src/uds/reports/stats/pool_users_summary.py b/server/src/uds/reports/stats/pool_users_summary.py index 5cf7977dc..e137188c2 100644 --- a/server/src/uds/reports/stats/pool_users_summary.py +++ b/server/src/uds/reports/stats/pool_users_summary.py @@ -29,11 +29,14 @@ """ Author: Adolfo Gómez, dkmaster at dkmon dot com """ +import collections import csv import io import logging import typing +from django.db.models import F, Window +from django.db.models.functions import Lag from django.utils.translation import gettext from django.utils.translation import gettext_lazy as _ @@ -76,48 +79,44 @@ def get_pool_data(self, pool: 'ServicePool') -> tuple[list[dict[str, typing.Any] end = self.end_date.as_timestamp() logger.debug(self.pool.value) + login = events.types.stats.EventType.LOGIN + logout = events.types.stats.EventType.LOGOUT + + # LOGIN/LOGOUT pairing pushed to DB via window LAG over fld4 (single pool, no need + # to partition by owner_id). Preserves "last login wins" semantics: only LOGOUT + # rows whose previous event for the same user is a LOGIN are accepted as a pair. items = ( StatsManager.manager() .enumerate_events( events.types.stats.EventOwnerType.SERVICEPOOL, - (events.types.stats.EventType.LOGIN, events.types.stats.EventType.LOGOUT), + (login, logout), owner_id=pool.id, since=start, to=end, ) - .order_by('stamp') + .annotate( + prev_type=Window(Lag('event_type'), partition_by=[F('fld4')], order_by=[F('stamp')]), + prev_stamp=Window(Lag('stamp'), partition_by=[F('fld4')], order_by=[F('stamp')]), + ) + .values('event_type', 'stamp', 'fld4', 'prev_type', 'prev_stamp') ) - logins: dict[str, int] = {} - users: dict[str, dict[str, typing.Any]] = {} + users: dict[str, dict[str, int]] = collections.defaultdict( + lambda: {'sessions': 0, 'time': 0} + ) for i in items: - # if '\\' in i.fld1: - # continue - username = i.fld4 - if i.event_type == events.types.stats.EventType.LOGIN: - logins[username] = i.stamp - else: - if username in logins: - stamp = logins[username] - del logins[username] - total = i.stamp - stamp - if username not in users: - users[username] = {'sessions': 0, 'time': 0} - users[username]['sessions'] += 1 - users[username]['time'] += total - # data.append({ - # 'name': i.fld4, - # 'date': datetime.datetime.fromtimestamp(stamp), - # 'time': total - # }) - - # Extract different number of users + if i['event_type'] != logout or i['prev_type'] != login: + continue + entry = users[i['fld4']] + entry['sessions'] += 1 + entry['time'] += i['stamp'] - i['prev_stamp'] + data = [ { 'user': k, 'sessions': v['sessions'], - 'hours': '{:.2f}'.format(float(v['time']) / 3600), - 'average': '{:.2f}'.format(float(v['time']) / 3600 / v['sessions']), + 'hours': '{:.2f}'.format(v['time'] / 3600), + 'average': '{:.2f}'.format(v['time'] / 3600 / v['sessions']), } for k, v in users.items() ] diff --git a/server/src/uds/reports/stats/pools_performance.py b/server/src/uds/reports/stats/pools_performance.py index c350dc030..74d9036b7 100644 --- a/server/src/uds/reports/stats/pools_performance.py +++ b/server/src/uds/reports/stats/pools_performance.py @@ -35,10 +35,9 @@ import io import logging import typing -import collections.abc import django.template.defaultfilters as filters -from django.db.models import Count +from django.utils import timezone from django.utils.translation import gettext from django.utils.translation import gettext_lazy as _ @@ -77,14 +76,12 @@ def init_gui(self) -> None: ] self.pools.set_choices(vals) - def list_pools(self) -> collections.abc.Iterable[tuple[int, str]]: + def list_pools(self) -> list[tuple[int, str]]: if '0-0-0-0' in self.pools.value: - pools = ServicePool.objects.all() + qs = ServicePool.objects.all() else: - pools = ServicePool.objects.filter(uuid__in=self.pools.value) - - for p in pools: - yield (p.id, p.name) + qs = ServicePool.objects.filter(uuid__in=self.pools.value) + return list(qs.values_list('id', 'name')) def get_range_data( self, @@ -93,12 +90,7 @@ def get_range_data( ]: # pylint: disable=too-many-locals start = self.start_date.as_timestamp() end = self.end_date.as_timestamp() - if self.sampling_points.as_int() < 2: - self.sampling_points.value = 2 - if self.sampling_points.as_int() > 128: - self.sampling_points.value = 128 - - sampling_points = self.sampling_points.as_int() + sampling_points = max(2, min(128, self.sampling_points.as_int())) # x axis label format if end - start > 3600 * 24 * 2: @@ -106,56 +98,73 @@ def get_range_data( else: x_label_format = 'SHORT_DATETIME_FORMAT' - sampling_intervals: list[tuple[int, int]] = [] sampling_interval_seconds = (end - start) / sampling_points + # Precompute per-bucket (start, end, midpoint) once. + bucket_bounds: list[tuple[int, int, int]] = [] for i in range(sampling_points): - sampling_intervals.append( - (int(start + i * sampling_interval_seconds), int(start + (i + 1) * sampling_interval_seconds)) - ) - - # Store dataUsers for all pools - pools_data: list[dict[str, typing.Any]] = [] + b_start = int(start + i * sampling_interval_seconds) + b_end = int(start + (i + 1) * sampling_interval_seconds) + bucket_bounds.append((b_start, b_end, (b_start + b_end) // 2)) fld = StatsManager.manager().get_event_field_for('username') + pools = self.list_pools() + last_idx = sampling_points - 1 + + # Single query covering all selected pools. Bucketize in Python keyed by owner_id. + rows = ( + StatsManager.manager() + .enumerate_events( + events.types.stats.EventOwnerType.SERVICEPOOL, + events.types.stats.EventType.ACCESS, + since=start, + to=end, + owner_id=[p[0] for p in pools], + ) + .values('owner_id', 'stamp', fld) + ) + distinct_users: dict[int, list[set[str]]] = { + p[0]: [set() for _ in range(sampling_points)] for p in pools + } + accesses_count: dict[int, list[int]] = { + p[0]: [0] * sampling_points for p in pools + } + for row in rows: + idx = int((row['stamp'] - start) // sampling_interval_seconds) + if idx < 0: + continue + if idx > last_idx: + idx = last_idx + owner_id = row['owner_id'] + distinct_users[owner_id][idx].add(row[fld]) + accesses_count[owner_id][idx] += 1 + + pools_data: list[dict[str, typing.Any]] = [] report_data: list[dict[str, typing.Any]] = [] - for p in self.list_pools(): + for pool_id, pool_name in pools: + users_buckets = distinct_users[pool_id] + access_buckets = accesses_count[pool_id] data_users: list[tuple[int, int]] = [] data_accesses: list[tuple[int, int]] = [] - for interval in sampling_intervals: - key = (interval[0] + interval[1]) // 2 - q = ( - StatsManager.manager() - .enumerate_events( - events.types.stats.EventOwnerType.SERVICEPOOL, - events.types.stats.EventType.ACCESS, - since=interval[0], - to=interval[1], - owner_id=p[0], - ) - .values(fld) - .annotate(cnt=Count(fld)) - ) - accesses = 0 - for v in q: - accesses += v['cnt'] - - data_users.append((key, len(q))) # Store number of users + for i, (b_start, b_end, key) in enumerate(bucket_bounds): + users_n = len(users_buckets[i]) + accesses = access_buckets[i] + data_users.append((key, users_n)) data_accesses.append((key, accesses)) report_data.append( { - 'name': p[1], - 'date': utils.timestamp_as_str(interval[0], 'SHORT_DATETIME_FORMAT') + 'name': pool_name, + 'date': utils.timestamp_as_str(b_start, 'SHORT_DATETIME_FORMAT') + ' - ' - + utils.timestamp_as_str(interval[1], 'SHORT_DATETIME_FORMAT'), - 'users': len(q), + + utils.timestamp_as_str(b_end, 'SHORT_DATETIME_FORMAT'), + 'users': users_n, 'accesses': accesses, } ) pools_data.append( { - 'pool': p[0], - 'name': p[1], + 'pool': pool_id, + 'name': pool_name, 'dataUsers': data_users, 'dataAccesses': data_accesses, } @@ -178,7 +187,7 @@ def generate(self) -> bytes: # l is the index of the x value # returns the date in the x value to be used as label on the x axis def _tick_fnc1(l: int) -> str: - return filters.date(datetime.datetime.fromtimestamp(x[l]), x_label_format) if int(x[l]) >= 0 else '' + return filters.date(timezone.make_aware(datetime.datetime.fromtimestamp(x[l])), x_label_format) if int(x[l]) >= 0 else '' data = { 'title': _('Distinct Users'), @@ -194,7 +203,7 @@ def _tick_fnc1(l: int) -> str: x = [v[0] for v in pools_data[0]['dataAccesses']] def _tick_fnc2(l: int) -> str: - return filters.date(datetime.datetime.fromtimestamp(x[l]), x_label_format) if int(x[l]) >= 0 else '' + return filters.date(timezone.make_aware(datetime.datetime.fromtimestamp(x[l])), x_label_format) if int(x[l]) >= 0 else '' data = { 'title': _('Accesses'), @@ -213,10 +222,10 @@ def _tick_fnc2(l: int) -> str: 'uds/reports/stats/pools-performance.html', dct={ 'data': report_data, - 'pools': [i[1] for i in self.list_pools()], + 'pools': [p['name'] for p in pools_data], 'beginning': self.start_date.as_date(), 'ending': self.end_date.as_date(), - 'intervals': self.sampling_points.as_int(), + 'intervals': max(2, min(128, self.sampling_points.as_int())), }, header=gettext('UDS Pools Performance Report'), water=gettext('Pools Performance'), diff --git a/server/src/uds/reports/stats/usage_by_pool.py b/server/src/uds/reports/stats/usage_by_pool.py index bf24839f9..d7aec1535 100644 --- a/server/src/uds/reports/stats/usage_by_pool.py +++ b/server/src/uds/reports/stats/usage_by_pool.py @@ -34,8 +34,10 @@ import io import logging import typing -from collections import defaultdict +from django.db.models import F, Window +from django.db.models.functions import Lag +from django.utils import timezone from django.utils.translation import gettext from django.utils.translation import gettext_lazy as _ @@ -70,64 +72,65 @@ def init_gui(self) -> None: self.pool.set_choices(vals) def get_data(self) -> tuple[list[dict[str, typing.Any]], str]: - # Generate the sampling intervals and get dataUsers from db start = self.start_date.as_timestamp() end = self.end_date.as_timestamp() logger.debug(self.pool.value) if '0-0-0-0' in self.pool.value: - pools = ServicePool.objects.all() + qs = ServicePool.objects.all() else: - pools = ServicePool.objects.filter(uuid__in=self.pool.value) - - # Build pool id -> pool map for fast lookup - pool_map: dict[int, ServicePool] = {p.id: p for p in pools} - - # Single query for ALL pools (eliminates N+1) + values() avoids model instantiation overhead + qs = ServicePool.objects.filter(uuid__in=self.pool.value) + + # (uuid, name) per pool id. values_list avoids instantiating ServicePool + # rows just to read 2 fields. + pool_map: dict[int, tuple[str, str]] = { + p_id: (p_uuid, p_name) + for p_id, p_uuid, p_name in qs.values_list('id', 'uuid', 'name') + } + + login = stats.events.types.stats.EventType.LOGIN + logout = stats.events.types.stats.EventType.LOGOUT + + # LOGIN/LOGOUT pairing pushed to DB via window LAG over (owner_id, fld4) ordered by stamp. + # Preserves "last login wins" semantics: if prev event of same (pool, user) is LOGIN -> pair. + # Portable across MySQL 8+, PostgreSQL, SQLite 3.25+, Oracle (ANSI window functions). + partition = [F('owner_id'), F('fld4')] items = ( StatsManager.manager() .enumerate_events( stats.events.types.stats.EventOwnerType.SERVICEPOOL, - (stats.events.types.stats.EventType.LOGIN, stats.events.types.stats.EventType.LOGOUT), + (login, logout), owner_id=list(pool_map.keys()), since=start, to=end, ) - .order_by('stamp') - .values('owner_id', 'event_type', 'stamp', 'fld2', 'fld4') + .annotate( + prev_type=Window(Lag('event_type'), partition_by=partition, order_by=[F('stamp')]), + prev_stamp=Window(Lag('stamp'), partition_by=partition, order_by=[F('stamp')]), + ) + .values('owner_id', 'event_type', 'stamp', 'fld2', 'fld4', 'prev_type', 'prev_stamp') ) - # Group events by pool_id for correct LOGIN/LOGOUT pairing per pool - pool_events: dict[int, list[dict[str, typing.Any]]] = defaultdict(list) - for item in items: - pool_events[item['owner_id']].append(item) - data: list[dict[str, typing.Any]] = [] - for pool_id, evts in pool_events.items(): - pool = pool_map[pool_id] - logins: dict[str, int] = {} - for i in evts: - username = i['fld4'] # full_username - if i['event_type'] == stats.events.types.stats.EventType.LOGIN: - logins[username] = i['stamp'] - else: - stamp = logins.pop(username, None) # pop avoids double lookup + del - if stamp is not None: - total = i['stamp'] - stamp - # src_ip logic: if IPv6 (contains '[') return as-is, else split port - fld2 = i['fld2'] - origin = fld2 if '[' in fld2 else fld2.split(':')[0] - data.append( - { - 'name': username, - 'origin': origin, - 'date': datetime.datetime.fromtimestamp(stamp), - 'time': total, - 'pool': pool.uuid, - 'pool_name': pool.name, - } - ) - - return data, ','.join([p.name for p in pools]) + for i in items: + if i['event_type'] != logout or i['prev_type'] != login: + continue + pool_uuid, pool_name = pool_map[i['owner_id']] + login_stamp = i['prev_stamp'] + fld2 = i['fld2'] + # ipv6 handled inline (was StatsEvents.src_ip property; we use .values()). + origin = fld2 if '[' in fld2 else fld2.split(':')[0] + data.append( + { + 'name': i['fld4'], + 'origin': origin, + 'date': timezone.make_aware(datetime.datetime.fromtimestamp(login_stamp)), + 'time': i['stamp'] - login_stamp, + 'pool': pool_uuid, + 'pool_name': pool_name, + } + ) + + return data, ','.join(name for _uuid, name in pool_map.values()) def generate(self) -> bytes: items, poolname = self.get_data() diff --git a/server/src/uds/reports/stats/user_access.py b/server/src/uds/reports/stats/user_access.py index 8c8639d86..c1235e253 100644 --- a/server/src/uds/reports/stats/user_access.py +++ b/server/src/uds/reports/stats/user_access.py @@ -36,6 +36,7 @@ import typing import django.template.defaultfilters as filters +from django.utils import timezone from django.utils.translation import gettext from django.utils.translation import gettext_lazy as _ @@ -68,7 +69,7 @@ class StatsReportLogin(StatsReport): order=4, label=_('Number of intervals'), length=3, - min_value=0, + min_value=2, max_value=128, tooltip=_('Number of sampling points used in charts'), default=64, @@ -77,12 +78,7 @@ class StatsReportLogin(StatsReport): def get_range_data(self) -> tuple[str, list[tuple[int, int]], list[dict[str, typing.Any]]]: start = self.start_date.as_timestamp() end = self.end_date.as_timestamp() - if self.sampling_points.as_int() < 2: - self.sampling_points.value = 2 - if self.sampling_points.as_int() > 128: - self.sampling_points.value = 128 - - sampling_points = self.sampling_points.as_int() + sampling_points = max(2, min(128, self.sampling_points.as_int())) # x axis label format if end - start > 3600 * 24 * 2: @@ -90,33 +86,44 @@ def get_range_data(self) -> tuple[str, list[tuple[int, int]], list[dict[str, typ else: x_label_format = 'SHORT_DATETIME_FORMAT' - sampling_intervals: list[tuple[int, int]] = [] sampling_interval_seconds = (end - start) / sampling_points + bucket_bounds: list[tuple[int, int, int]] = [] for i in range(sampling_points): - sampling_intervals.append( - (int(start + i * sampling_interval_seconds), int(start + (i + 1) * sampling_interval_seconds)) + b_start = int(start + i * sampling_interval_seconds) + b_end = int(start + (i + 1) * sampling_interval_seconds) + bucket_bounds.append((b_start, b_end, (b_start + b_end) // 2)) + + # Single query covering the full range; bucketize in Python. + # Was: sampling_points queries (up to 128) doing COUNT(*) each. + counts = [0] * sampling_points + last_idx = sampling_points - 1 + for row in ( + StatsManager.manager() + .enumerate_events( + stats.events.types.stats.EventOwnerType.AUTHENTICATOR, + stats.events.types.stats.EventType.LOGIN, + since=start, + to=end, ) + .values('stamp') + ): + idx = int((row['stamp'] - start) // sampling_interval_seconds) + if idx < 0: + continue + if idx > last_idx: + idx = last_idx + counts[idx] += 1 data: list[tuple[int, int]] = [] report_data: list[dict[str, typing.Any]] = [] - for interval in sampling_intervals: - key = (interval[0] + interval[1]) // 2 - val = ( - StatsManager.manager() - .enumerate_events( - stats.events.types.stats.EventOwnerType.AUTHENTICATOR, - stats.events.types.stats.EventType.LOGIN, - since=interval[0], - to=interval[1], - ) - .count() - ) + for i, (b_start, b_end, key) in enumerate(bucket_bounds): + val = counts[i] data.append((key, val)) report_data.append( { - 'date': utils.timestamp_as_str(interval[0], 'SHORT_DATETIME_FORMAT') + 'date': utils.timestamp_as_str(b_start, 'SHORT_DATETIME_FORMAT') + ' - ' - + utils.timestamp_as_str(interval[1], 'SHORT_DATETIME_FORMAT'), + + utils.timestamp_as_str(b_end, 'SHORT_DATETIME_FORMAT'), 'users': val, } ) @@ -130,14 +137,26 @@ def get_week_hourly_data(self) -> tuple[list[int], list[int], list[list[int]]]: data_week = [0] * 7 data_hour = [0] * 24 data_week_hour = [[0] * 24 for _ in range(7)] - for val in StatsManager.manager().enumerate_events( - stats.events.types.stats.EventOwnerType.AUTHENTICATOR, stats.events.types.stats.EventType.LOGIN, since=start, to=end + # Hoisted: 1 tz lookup + tz-aware fromtimestamp per row instead of + # make_aware() (which re-resolves the tz internally on every call). + tz = timezone.get_current_timezone() + from_ts = datetime.datetime.fromtimestamp + for row in ( + StatsManager.manager() + .enumerate_events( + stats.events.types.stats.EventOwnerType.AUTHENTICATOR, + stats.events.types.stats.EventType.LOGIN, + since=start, + to=end, + ) + .values('stamp') ): - s = datetime.datetime.fromtimestamp(val.stamp) - data_week[s.weekday()] += 1 - data_hour[s.hour] += 1 - data_week_hour[s.weekday()][s.hour] += 1 - logger.debug('Data: %s %s', s.weekday(), s.hour) + s = from_ts(row['stamp'], tz) + wd = s.weekday() + hr = s.hour + data_week[wd] += 1 + data_hour[hr] += 1 + data_week_hour[wd][hr] += 1 return data_week, data_hour, data_week_hour @@ -150,7 +169,7 @@ def generate(self) -> bytes: graph1 = io.BytesIO() def _tick_fnc1(l: int) -> str: - return filters.date(datetime.datetime.fromtimestamp(l), x_label_format) + return filters.date(timezone.make_aware(datetime.datetime.fromtimestamp(l)), x_label_format) x = [v[0] for v in data] d: dict[str, typing.Any] = { @@ -229,7 +248,7 @@ def _tick_fnc3(l: int) -> str: 'data': report_data, 'beginning': self.start_date.as_date(), 'ending': self.end_date.as_date(), - 'intervals': self.sampling_points.as_int(), + 'intervals': max(2, min(128, self.sampling_points.as_int())), }, header=gettext('Users access to UDS'), water=gettext('UDS Report for users access'),