diff --git a/changes/9718.enhance.md b/changes/9718.enhance.md new file mode 100644 index 00000000000..ac96b29519e --- /dev/null +++ b/changes/9718.enhance.md @@ -0,0 +1 @@ +Deduplicate container IDs in agent stat collection to avoid redundant processing diff --git a/src/ai/backend/agent/agent.py b/src/ai/backend/agent/agent.py index 9c65caaa754..8dca4e7c8b6 100644 --- a/src/ai/backend/agent/agent.py +++ b/src/ai/backend/agent/agent.py @@ -1388,25 +1388,25 @@ async def collect_node_stat(self, resource_scaling_factors: Mapping[SlotName, De async def collect_container_stat(self, interval: float) -> None: if self.local_config.debug.log_stats: log.debug("collecting container statistics") - container_ids: list[ContainerId] = [] + container_ids: set[ContainerId] = set() for kernel_obj in [*self.kernel_registry.values()]: if not kernel_obj.stats_enabled or kernel_obj.container_id is None: continue - container_ids.append(ContainerId(kernel_obj.container_id)) + container_ids.add(ContainerId(kernel_obj.container_id)) async with asyncio.timeout(STAT_COLLECTION_TIMEOUT): - await self.stat_ctx.collect_container_stat(container_ids) + await self.stat_ctx.collect_container_stat(list(container_ids)) @_observe_stat_task(stat_scope=StatScope.PROCESS) async def collect_process_stat(self, interval: float) -> None: if self.local_config.debug.log_stats: log.debug("collecting process statistics in container") - container_ids = [] + container_ids: set[ContainerId] = set() for kernel_obj in [*self.kernel_registry.values()]: if not kernel_obj.stats_enabled or kernel_obj.container_id is None: continue - container_ids.append(ContainerId(kernel_obj.container_id)) + container_ids.add(ContainerId(kernel_obj.container_id)) async with asyncio.timeout(STAT_COLLECTION_TIMEOUT): - await self.stat_ctx.collect_per_container_process_stat(container_ids) + await self.stat_ctx.collect_per_container_process_stat(list(container_ids)) def _get_public_host(self) -> str: agent_config = self.local_config.agent