From 7e58856eaf61005603d4d4861488ed243198b93d Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 6 Mar 2026 01:24:53 +0900 Subject: [PATCH 1/3] refactor(BA-4910): Use set comprehension for container_ids deduplication Change container_ids collection from list to set comprehension in collect_container_stat() and collect_process_stat() to automatically deduplicate container IDs and prevent redundant stat collection. Benefits: - Automatic deduplication of container IDs - More efficient (no duplicate processing) - Semantically clearer (expresses intent of unique IDs) - Defensive against edge cases in kernel registry Changes: - src/ai/backend/agent/agent.py:1391: collect_container_stat() - src/ai/backend/agent/agent.py:1403: collect_process_stat() Co-Authored-By: Claude Sonnet 4.5 --- src/ai/backend/agent/agent.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 9c65caaa754..b022dcd1ade 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1388,25 +1388,25 @@ async def collect_node_stat(self, resource_scaling_factors: Mapping[SlotName, De async def collect_container_stat(self, interval: float) -> None: if self.local_config.debug.log_stats: log.debug("collecting container statistics") - container_ids: list[ContainerId] = [] - for kernel_obj in [*self.kernel_registry.values()]: - if not kernel_obj.stats_enabled or kernel_obj.container_id is None: - continue - container_ids.append(ContainerId(kernel_obj.container_id)) + container_ids = { + ContainerId(kernel_obj.container_id) + for kernel_obj in self.kernel_registry.values() + if kernel_obj.stats_enabled and kernel_obj.container_id is not None + } async with asyncio.timeout(STAT_COLLECTION_TIMEOUT): - await self.stat_ctx.collect_container_stat(container_ids) + await self.stat_ctx.collect_container_stat(list(container_ids)) @_observe_stat_task(stat_scope=StatScope.PROCESS) async def collect_process_stat(self, interval: float) -> None: if self.local_config.debug.log_stats: log.debug("collecting process statistics in container") - container_ids = [] - for kernel_obj in [*self.kernel_registry.values()]: - if not kernel_obj.stats_enabled or kernel_obj.container_id is None: - continue - container_ids.append(ContainerId(kernel_obj.container_id)) + container_ids = { + ContainerId(kernel_obj.container_id) + for kernel_obj in self.kernel_registry.values() + if kernel_obj.stats_enabled and kernel_obj.container_id is not None + } async with asyncio.timeout(STAT_COLLECTION_TIMEOUT): - await self.stat_ctx.collect_per_container_process_stat(container_ids) + await self.stat_ctx.collect_per_container_process_stat(list(container_ids)) def _get_public_host(self) -> str: agent_config = self.local_config.agent From 3b540ca9603d86c5742f4cf74dddb659e824b2f4 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 6 Mar 2026 01:31:46 +0900 Subject: [PATCH 2/3] changelog: add news fragment for PR #9718 --- changes/9718.enhance.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changes/9718.enhance.md diff --git a/changes/9718.enhance.md b/changes/9718.enhance.md new file mode 100644 index 00000000000..ac96b29519e --- /dev/null +++ b/changes/9718.enhance.md @@ -0,0 +1 @@ +Deduplicate container IDs in agent stat collection to avoid redundant processing From 129c844800a42c6752733063c1c12d0cc17a2032 Mon Sep 17 00:00:00 2001 From: Sanghun Lee Date: Fri, 6 Mar 2026 15:43:21 +0900 Subject: [PATCH 3/3] revert to simple loop --- src/ai/backend/agent/agent.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index b022dcd1ade..8dca4e7c8b6 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1388,11 +1388,11 @@ async def collect_node_stat(self, resource_scaling_factors: Mapping[SlotName, De async def collect_container_stat(self, interval: float) -> None: if self.local_config.debug.log_stats: log.debug("collecting container statistics") - container_ids = { - ContainerId(kernel_obj.container_id) - for kernel_obj in self.kernel_registry.values() - if kernel_obj.stats_enabled and kernel_obj.container_id is not None - } + container_ids: set[ContainerId] = set() + for kernel_obj in [*self.kernel_registry.values()]: + if not kernel_obj.stats_enabled or kernel_obj.container_id is None: + continue + container_ids.add(ContainerId(kernel_obj.container_id)) async with asyncio.timeout(STAT_COLLECTION_TIMEOUT): await self.stat_ctx.collect_container_stat(list(container_ids)) @@ -1400,11 +1400,11 @@ async def collect_container_stat(self, interval: float) -> None: async def collect_process_stat(self, interval: float) -> None: if self.local_config.debug.log_stats: log.debug("collecting process statistics in container") - container_ids = { - ContainerId(kernel_obj.container_id) - for kernel_obj in self.kernel_registry.values() - if kernel_obj.stats_enabled and kernel_obj.container_id is not None - } + container_ids: set[ContainerId] = set() + for kernel_obj in [*self.kernel_registry.values()]: + if not kernel_obj.stats_enabled or kernel_obj.container_id is None: + continue + container_ids.add(ContainerId(kernel_obj.container_id)) async with asyncio.timeout(STAT_COLLECTION_TIMEOUT): await self.stat_ctx.collect_per_container_process_stat(list(container_ids))