From 539bb00476fb26ed47bfe0b4fd41bad3fbc70ebf Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 8 Jun 2026 09:02:47 -0400 Subject: [PATCH 1/2] feat(memory-map): container slot mem attribution via podman cgroup (#660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Populate per-slot mem_mb for container slots from the podman/docker cgroup (memory.current) so the memory map shows real resident bytes instead of a file-size estimate. Backend (capacity.py): - Add `_container_cgroup_mem_bytes(slot_name)`: resolves podman/docker runtime via shutil.which, runs ` inspect -f {{.State.Pid}} hal0-slot-`, walks to cgroupv2 memory.current. Returns 0 on any error (container absent = lemond slot, fallback is automatic). - `build_per_slot` path 2: try cgroup probe for every non-NPU slot; use result if >0 (no KV estimate — cgroup already includes all resident bytes). Path 3 (lemond file-size + KV estimate) unchanged. UI (memory-map.jsx): - Add `slotLegendSub(slot)`: container slots show image tag (truncated to last 32 chars) or profile slug in legend; lemond slots keep device token as before. - Expand slot model with `isContainer`, `image`, `profile` fields. - Both sidebar and expanded legend variants use slotLegendSub. Tests: - 12 unit tests for _container_cgroup_mem_bytes (happy path, no runtime, container absent, timeout, cgroupv1, unreadable, docker fallback) and build_per_slot container vs lemond vs NPU paths. - 2 Playwright specs: container legend shows image tag (HAL0_DATA injection via addInitScript); pool bar has ≥2 segments. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/hal0/slots/capacity.py | 115 +++++++++- tests/slots/test_container_cgroup_mem.py | 262 +++++++++++++++++++++++ ui/src/dash/memory-map.jsx | 30 ++- ui/tests/e2e/specs/memory-map-v3.spec.ts | 55 +++++ 4 files changed, 456 insertions(+), 6 deletions(-) create mode 100644 tests/slots/test_container_cgroup_mem.py diff --git a/src/hal0/slots/capacity.py b/src/hal0/slots/capacity.py index acb15692..66620b9f 100644 --- a/src/hal0/slots/capacity.py +++ b/src/hal0/slots/capacity.py @@ -32,6 +32,10 @@ if TYPE_CHECKING: from hal0.hardware.probe import HardwareInfo +# Container-name prefix matches the convention in providers/container.py: +# ``ExecStop = stop -t 20 hal0-slot-``. +_CONTAINER_NAME_PREFIX = "hal0-slot-" + # NOTE: We code against ``hal0.hardware.probe.HardwareInfo`` as the contract # even though the probe itself is currently a stub (raises NotImplementedError). @@ -131,6 +135,79 @@ def _ctx_tokens_for(model_meta: dict[str, Any] | None) -> int: return _DEFAULT_CTX_TOKENS +async def _container_cgroup_mem_bytes(slot_name: str) -> int: + """Cgroup-wide ``memory.current`` for the podman/docker container backing *slot_name*. + + Container name convention: ``hal0-slot-`` (matches the + ``ExecStop`` line written by :mod:`hal0.providers.container`). + + Resolution path: + 1. Detect runtime (podman → docker) via the same logic as + :func:`hal0.providers.container._container_runtime`. + 2. Run `` inspect -f {{.State.Pid}} hal0-slot-`` + to get the container init PID. + 3. Read ``/proc//cgroup`` for the cgroupv2 unified path. + 4. Read ``/sys/fs/cgroup//memory.current``. + + Returns 0 on any error so the caller can fall back gracefully — + a missing/stopped container is not exceptional; it just means the + slot is not backed by a container runtime. + + The returned value includes model weights + KV-cache + runtime + overhead as measured by the cgroup; callers MUST NOT add an + additional KV estimate on top. + """ + import shutil + + # Resolve the container runtime binary (podman preferred over docker). + runtime = None + for candidate in ("podman", "docker"): + found = shutil.which(candidate) + if found: + runtime = found + break + if runtime is None: + return 0 + + container_name = f"{_CONTAINER_NAME_PREFIX}{slot_name}" + try: + proc = await asyncio.create_subprocess_exec( + runtime, + "inspect", + "-f", + "{{.State.Pid}}", + container_name, + stdin=asyncio.subprocess.DEVNULL, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.DEVNULL, + ) + out, _ = await asyncio.wait_for(proc.communicate(), timeout=1.5) + except (TimeoutError, FileNotFoundError, OSError): + return 0 + if proc.returncode != 0: + return 0 + try: + pid = int(out.decode("utf-8", errors="replace").strip() or 0) + except ValueError: + pid = 0 + if pid <= 0: + return 0 + try: + with open(f"/proc/{pid}/cgroup", encoding="utf-8") as f: + cg_line = f.readline().strip() + except OSError: + return 0 + # cgroupv2 unified hierarchy line: "0::/system.slice/podman-.scope" + if "::" not in cg_line: + return 0 + cg_rel = cg_line.split("::", 1)[1].lstrip("/") + try: + with open(f"/sys/fs/cgroup/{cg_rel}/memory.current", encoding="utf-8") as f: + return int(f.read().strip() or 0) + except (OSError, ValueError): + return 0 + + async def build_per_slot( slots: list[Any], *, @@ -145,10 +222,19 @@ async def build_per_slot( {slot_name: {"vram_mb", "ram_mb", "mem_mb", "state", "model_id"}} where ``mem_mb`` (== ``vram_mb`` on UMA) is the best-estimate resident - footprint: model file size (from the registry, or the FLM footprint - for NPU tags) plus a coarse KV-cache estimate scaled by the model's - context window. A loaded 25B Q5 model therefore reports ~tens of GB - rather than 0 — the number the dashboard memory map needs. + footprint. Three attribution paths, in priority order: + + 1. **NPU / FLM slots**: FLM catalog footprint_gb (includes runtime + KV). + 2. **Container slots** (podman ``hal0-slot-``): live cgroup + ``memory.current`` — real resident bytes including weights + KV + + runtime overhead. Probed via :func:`_container_cgroup_mem_bytes`. + 3. **Lemonade / lemond slots** (fallback): model file size from the + registry plus a coarse KV-cache estimate scaled by context window. + + The cgroup probe is attempted for every non-NPU slot and naturally + returns 0 for lemond slots (no matching container exists), so the + container → lemond fallback is automatic — no explicit runtime-type + detection required. Non-resident slots are omitted so the caller can render them as holding no memory. Never raises: a registry miss yields a 0-size row @@ -196,6 +282,26 @@ async def build_per_slot( } continue model_mb = (entry.get("size_bytes") or 0) / (1024 * 1024) + # ── Container cgroup probe (path 2) ──────────────────────────────── + # Try the live podman/docker cgroup first. Returns 0 when no + # container named hal0-slot- exists (i.e. for lemond slots), + # so the probe is naturally a no-op for the lemond path — no + # explicit runtime-type detection required. + cgroup_bytes = await _container_cgroup_mem_bytes(s.name) + if cgroup_bytes > 0: + # cgroup memory.current already includes weights + KV + runtime; + # do NOT add an additional KV estimate on top. + cgroup_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1) + out[s.name] = { + "vram_mb": cgroup_mb, + "ram_mb": 0.0, + "mem_mb": cgroup_mb, + "state": state, + "model_id": model_id, + } + continue + + # ── Lemond / file-size fallback (path 3) ─────────────────────────── if model_mb <= 0 and registry is not None: try: m = registry.get(model_id) @@ -353,5 +459,6 @@ def as_dict(self) -> dict[str, Any]: __all__ = [ "CapacityProbeError", "CapacitySnapshot", + "_container_cgroup_mem_bytes", "build_per_slot", ] diff --git a/tests/slots/test_container_cgroup_mem.py b/tests/slots/test_container_cgroup_mem.py new file mode 100644 index 00000000..67ad5fa1 --- /dev/null +++ b/tests/slots/test_container_cgroup_mem.py @@ -0,0 +1,262 @@ +"""Tests for podman cgroup mem probe in hal0.slots.capacity. + +Covers :func:`hal0.slots.capacity._container_cgroup_mem_bytes` — the +function that reads live ``memory.current`` for a podman/docker container +named ``hal0-slot-``. All subprocess and file I/O is mocked so +these tests run without a real container runtime. +""" + +from __future__ import annotations + +from unittest.mock import AsyncMock, MagicMock, mock_open, patch + +import pytest + +from hal0.slots.capacity import _container_cgroup_mem_bytes, build_per_slot + +# ── helpers ──────────────────────────────────────────────────────────────── + + +def _make_proc(returncode: int = 0, stdout: bytes = b"") -> AsyncMock: + """Return a fake asyncio.subprocess.Process mock.""" + proc = AsyncMock() + proc.returncode = returncode + proc.communicate = AsyncMock(return_value=(stdout, b"")) + return proc + + +# ── _container_cgroup_mem_bytes ───────────────────────────────────────────── + + +class TestContainerCgroupMemBytes: + """Unit tests for the podman cgroup probe.""" + + @pytest.mark.asyncio + async def test_returns_bytes_on_happy_path(self, tmp_path): + """Full happy-path: podman inspect → PID → cgroup path → memory.current.""" + cg_rel = "system.slice/libpod-abc123.scope" + cg_line = f"0::/{cg_rel}\n" + mem_current = "12345678\n" + + def _open_side_effect(path, *args, **kwargs): + # /proc//cgroup: contains "cgroup" but NOT "memory" + if "cgroup" in str(path) and "memory" not in str(path): + return mock_open(read_data=cg_line)() + # /sys/fs/cgroup/.../memory.current: contains "memory.current" + if "memory.current" in str(path): + return mock_open(read_data=mem_current)() + raise FileNotFoundError(path) + + with ( + patch("shutil.which", return_value="/usr/bin/podman"), + patch("asyncio.create_subprocess_exec") as mock_exec, + patch("builtins.open", side_effect=_open_side_effect), + ): + mock_exec.return_value = _make_proc(returncode=0, stdout=b"12345\n") + result = await _container_cgroup_mem_bytes("primary") + + assert result == 12345678 + + @pytest.mark.asyncio + async def test_returns_zero_when_no_runtime(self): + """No podman or docker binary → 0.""" + with patch("shutil.which", return_value=None): + result = await _container_cgroup_mem_bytes("primary") + assert result == 0 + + @pytest.mark.asyncio + async def test_returns_zero_when_container_not_found(self): + """Container inspect returns non-zero (container doesn't exist) → 0.""" + with ( + patch("shutil.which", return_value="/usr/bin/podman"), + patch("asyncio.create_subprocess_exec") as mock_exec, + ): + mock_exec.return_value = _make_proc(returncode=1, stdout=b"") + result = await _container_cgroup_mem_bytes("primary") + assert result == 0 + + @pytest.mark.asyncio + async def test_returns_zero_when_pid_is_zero(self): + """Inspect returns PID 0 (stopped container) → 0.""" + with ( + patch("shutil.which", return_value="/usr/bin/podman"), + patch("asyncio.create_subprocess_exec") as mock_exec, + ): + mock_exec.return_value = _make_proc(returncode=0, stdout=b"0\n") + result = await _container_cgroup_mem_bytes("primary") + assert result == 0 + + @pytest.mark.asyncio + async def test_returns_zero_on_inspect_timeout(self): + """asyncio.wait_for timeout during inspect → 0.""" + with ( + patch("shutil.which", return_value="/usr/bin/podman"), + patch("asyncio.create_subprocess_exec") as mock_exec, + patch("asyncio.wait_for", side_effect=TimeoutError), + ): + mock_exec.return_value = _make_proc(returncode=0, stdout=b"12\n") + result = await _container_cgroup_mem_bytes("primary") + assert result == 0 + + @pytest.mark.asyncio + async def test_returns_zero_on_cgroup_v1(self): + """cgroupv1 line lacks '::' → probe cannot walk path → 0.""" + cg_line = "1:cpu,cpuacct:/system.slice/podman-abc.scope\n" + + def _open_side_effect(path, *args, **kwargs): + if "cgroup" in str(path): + return mock_open(read_data=cg_line)() + raise FileNotFoundError(path) + + with ( + patch("shutil.which", return_value="/usr/bin/podman"), + patch("asyncio.create_subprocess_exec") as mock_exec, + patch("builtins.open", side_effect=_open_side_effect), + ): + mock_exec.return_value = _make_proc(returncode=0, stdout=b"99\n") + result = await _container_cgroup_mem_bytes("primary") + assert result == 0 + + @pytest.mark.asyncio + async def test_returns_zero_when_memory_current_unreadable(self): + """memory.current read fails (OSError) → 0.""" + cg_line = "0::/system.slice/libpod-abc.scope\n" + + def _open_side_effect(path, *args, **kwargs): + if "cgroup" in str(path) and "memory" not in str(path): + return mock_open(read_data=cg_line)() + raise OSError("no such file") + + with ( + patch("shutil.which", return_value="/usr/bin/podman"), + patch("asyncio.create_subprocess_exec") as mock_exec, + patch("builtins.open", side_effect=_open_side_effect), + ): + mock_exec.return_value = _make_proc(returncode=0, stdout=b"42\n") + result = await _container_cgroup_mem_bytes("primary") + assert result == 0 + + @pytest.mark.asyncio + async def test_docker_fallback_when_podman_absent(self): + """Falls back to docker when podman is not found.""" + cg_rel = "system.slice/docker-xyz.scope" + cg_line = f"0::/{cg_rel}\n" + mem_current = "999999\n" + + def _which(cmd): + return "/usr/bin/docker" if cmd == "docker" else None + + def _open_side_effect(path, *args, **kwargs): + if "cgroup" in str(path) and "memory" not in str(path): + return mock_open(read_data=cg_line)() + if "memory.current" in str(path): + return mock_open(read_data=mem_current)() + raise FileNotFoundError(path) + + with ( + patch("shutil.which", side_effect=_which), + patch("asyncio.create_subprocess_exec") as mock_exec, + patch("builtins.open", side_effect=_open_side_effect), + ): + mock_exec.return_value = _make_proc(returncode=0, stdout=b"77\n") + result = await _container_cgroup_mem_bytes("primary") + + assert result == 999999 + # Verify docker was called (not podman) + call_args = mock_exec.call_args[0] + assert call_args[0] == "/usr/bin/docker" + + +# ── build_per_slot integration ────────────────────────────────────────────── + + +class TestBuildPerSlotContainerPath: + """Verify build_per_slot uses cgroup bytes for container slots + and falls back to file-size estimate for lemond slots.""" + + def _make_slot(self, name, state="ready", model_id="mymodel", backend="rocm"): + slot = MagicMock() + slot.name = name + slot.state = state + slot.model_id = model_id + slot.backend = backend + slot.metadata = {"provider": "lemonade", "backend": backend} + return slot + + @pytest.mark.asyncio + async def test_container_slot_uses_cgroup_bytes(self): + """When cgroup probe returns bytes, build_per_slot uses them (no KV estimate).""" + slot = self._make_slot("primary") + cgroup_bytes = 20 * 1024 * 1024 * 1024 # 20 GiB + + with patch( + "hal0.slots.capacity._container_cgroup_mem_bytes", + new_callable=AsyncMock, + return_value=cgroup_bytes, + ): + result = await build_per_slot([slot]) + + assert "primary" in result + row = result["primary"] + expected_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1) + assert row["mem_mb"] == expected_mb + assert row["vram_mb"] == expected_mb + assert row["ram_mb"] == 0.0 + + @pytest.mark.asyncio + async def test_lemond_slot_falls_back_to_registry(self): + """When cgroup probe returns 0, build_per_slot uses registry file size.""" + slot = self._make_slot("primary") + model_mock = MagicMock() + model_mock.size_bytes = 15 * 1024 * 1024 * 1024 # 15 GiB + model_mock.model_dump = lambda: {} + + registry = MagicMock() + registry.get = MagicMock(return_value=model_mock) + + with patch( + "hal0.slots.capacity._container_cgroup_mem_bytes", + new_callable=AsyncMock, + return_value=0, + ): + result = await build_per_slot([slot], registry=registry) + + assert "primary" in result + row = result["primary"] + # Registry file size in MiB (no KV estimate for 0-token default context) + file_mb = model_mock.size_bytes / (1024 * 1024) + # KV estimate is added — just check it's ≥ file_mb + assert row["mem_mb"] >= file_mb + + @pytest.mark.asyncio + async def test_npu_slot_skips_cgroup_probe(self): + """NPU/FLM slots use the FLM footprint path and never call the cgroup probe.""" + slot = self._make_slot("npu-chat", backend="flm") + slot.metadata = {"provider": "flm", "backend": "flm"} + + flm_catalog = { + "mymodel": {"footprint_gb": 4.5, "size_bytes": 0}, + } + + with patch( + "hal0.slots.capacity._container_cgroup_mem_bytes", + new_callable=AsyncMock, + return_value=99999, # should never be called + ) as mock_probe: + result = await build_per_slot([slot], flm_catalog=flm_catalog) + + mock_probe.assert_not_called() + assert "npu-chat" in result + assert result["npu-chat"]["mem_mb"] == round(4.5 * 1024, 1) + + @pytest.mark.asyncio + async def test_offline_slot_omitted(self): + """Slots in non-resident states produce no row.""" + slot = self._make_slot("primary", state="offline") + with patch( + "hal0.slots.capacity._container_cgroup_mem_bytes", + new_callable=AsyncMock, + return_value=0, + ): + result = await build_per_slot([slot]) + assert result == {} diff --git a/ui/src/dash/memory-map.jsx b/ui/src/dash/memory-map.jsx index ea0a8125..d94a49e8 100644 --- a/ui/src/dash/memory-map.jsx +++ b/ui/src/dash/memory-map.jsx @@ -271,6 +271,13 @@ export function useMemoryMapModel() { bytesGb: a.bytesGb, modelId: a.slot.model || '', approx: a.approx, + // Container runtime fields — present when slot.runtime="container". + // `image` is the full image tag; `profile` is the profile slug. + // Used by the legend to show context relevant for container slots + // (device chip is uninformative for containers — all are "rocm"). + isContainer: a.slot.runtime === 'container' || a.slot.container_status != null, + image: a.slot.image || null, + profile: a.slot.profile || null, })) })(), }, @@ -374,6 +381,25 @@ function HostBar({ model }) { ) } +// Per-slot legend sub-label. +// Container slots: show image tag (truncated at 32 chars) or profile slug — +// more informative than "rocm" which every container shares. +// Lemond slots: keep the device token (rocm/vulkan/cpu/npu) as before. +function slotLegendSub(slot) { + if (slot.isContainer) { + // Prefer image tag (e.g. "ghcr.io/hal0ai/amd-strix-halo-toolboxes:rocm-7.2.4-server") + // Truncate to last 32 chars so very long registry paths stay readable. + if (slot.image) { + const img = String(slot.image) + return img.length > 32 ? '…' + img.slice(-32) : img + } + if (slot.profile) return String(slot.profile) + return 'container' + } + // Lemond / legacy path: show device + approx marker. + return `${slot.device}${slot.approx ? ' · ≈' : ''}` +} + function LegendRow({ swatch, name, sub, sz }) { return (
@@ -418,7 +444,7 @@ export function MemoryMap({ variant = 'sidebar', onConfigure }) { key={s.name} swatch={s.color} name={s.name} - sub={`${s.device}${s.approx ? ' · ≈' : ''}${s.modelId ? ' · ' + s.modelId : ''}`} + sub={`${slotLegendSub(s)}${s.modelId ? ' · ' + s.modelId : ''}`} sz={s.bytesGb} /> ))} @@ -480,7 +506,7 @@ export function MemoryMap({ variant = 'sidebar', onConfigure }) { key={s.name} swatch={s.color} name={s.name} - sub={s.device} + sub={slotLegendSub(s)} sz={s.bytesGb} /> ))} diff --git a/ui/tests/e2e/specs/memory-map-v3.spec.ts b/ui/tests/e2e/specs/memory-map-v3.spec.ts index 8c4e6d50..31851784 100644 --- a/ui/tests/e2e/specs/memory-map-v3.spec.ts +++ b/ui/tests/e2e/specs/memory-map-v3.spec.ts @@ -187,6 +187,61 @@ test.describe('Memory map — sidebar', () => { const c1 = await swatches.nth(1).evaluate((el) => getComputedStyle(el).backgroundColor) expect(c0).not.toBe(c1) }) + + test('container slot legend shows image tag, not device token', async ({ page }) => { + // #660: container slots are uninformative with "device=rocm" in the + // legend sub — every GPU container slot shares the same device token. + // The backend emits image + profile for container slots; the legend + // must show the (truncated) image tag instead. + // + // With VITE_MOCK_LEMONADE=1 the mock shim reads HAL0_DATA directly; + // page.route() never fires. Inject the container slot via addInitScript + // before data.jsx sets window.HAL0_DATA, so the mock harness picks it up. + const containerSlot = { + name: 'primary-container', type: 'llm', device: 'gpu-rocm', + model: 'qwen3.6-35b-a3b-q4_k_m', model_id: 'qwen3.6-35b-a3b', + group: 'chat', state: 'ready', port: 8096, + runtime: 'container', + profile: 'rocmfp4-mtp', + image: 'ghcr.io/hal0ai/amd-strix-halo-toolboxes:rocm-7.2.4-rocmfp4-server', + image_status: 'present', + container_status: 'running', + container_health: true, + mem_mb: 22400, + } + await page.addInitScript((slot) => { + // Intercept the HAL0_DATA setter that data.jsx calls, prepend our slot. + let stored: any = undefined + Object.defineProperty(window, 'HAL0_DATA', { + set(v: any) { + stored = { ...v, slots: [slot, ...(v.slots || [])] } + }, + get() { return stored }, + configurable: true, + }) + }, containerSlot) + await mockStatsHardware(page, { configured: false, detected: false }) + await page.goto('/#dashboard') + const legend = page.locator('.memmap-sidebar .memmap-legend') + await expect(legend).toBeVisible() + // The container slot's legend sub must contain part of the image tag. + // slotLegendSub truncates to the last 32 chars of the image string: + // "…rocm-7.2.4-rocmfp4-server" + await expect(legend).toContainText('rocm-7.2.4-rocmfp4-server') + }) + + test('container slot mem_mb attributed in pool bar', async ({ page }) => { + // #660: container slots with mem_mb > 0 must register as a non-zero + // segment in the pool bar — i.e. the bar has ≥2 width segments + // (at least one slot + the free remainder). + await mockStatsHardware(page, { configured: false, detected: false }) + await page.goto('/#dashboard') + const bar = page.locator('.memmap-sidebar .memmap-bar') + await expect(bar).toBeVisible() + const segments = bar.locator('i[style*="width"]') + const count = await segments.count() + expect(count).toBeGreaterThanOrEqual(2) + }) }) // NOTE: the `Memory map — expanded` variant (with its Proxmox host-pressure From 2d98928a25747a1b1fa00b9dbe65e989abcb2dfc Mon Sep 17 00:00:00 2001 From: Alexander Date: Mon, 8 Jun 2026 09:11:36 -0400 Subject: [PATCH 2/2] fix(memory-map): floor container cgroup mem at registry estimate (#660) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #672 review: on Strix Halo (UMA) model weights live in GTT (system RAM via amdgpu/TTM) and are often NOT charged to the process memory cgroup. A live container can report a cgroup of only ~2 GB (runtime/buffers) while holding a ~22 GB model. The previous `if cgroup_bytes > 0` used the cgroup unconditionally, REPLACING the correct registry estimate with a wrong-LOW value — under-reporting container memory. Fix: compute BOTH the cgroup value and the registry file-size + KV estimate for container slots, and use `max(cgroup_mb, estimate_mb)`. When the cgroup accounts for weights it wins (>= estimate); when GTT isn't charged the estimate wins → never under-reports. lemond/NPU paths unchanged. Tests: add under-report case (2 GB cgroup, 22 GB model -> estimate wins) and cgroup-wins case (24 GB cgroup > 22 GB model -> cgroup wins). Co-Authored-By: Claude Opus 4.8 (1M context) --- src/hal0/slots/capacity.py | 54 +++++++++++++---------- tests/slots/test_container_cgroup_mem.py | 56 +++++++++++++++++++++++- 2 files changed, 85 insertions(+), 25 deletions(-) diff --git a/src/hal0/slots/capacity.py b/src/hal0/slots/capacity.py index 66620b9f..32b7e9a6 100644 --- a/src/hal0/slots/capacity.py +++ b/src/hal0/slots/capacity.py @@ -225,9 +225,14 @@ async def build_per_slot( footprint. Three attribution paths, in priority order: 1. **NPU / FLM slots**: FLM catalog footprint_gb (includes runtime + KV). - 2. **Container slots** (podman ``hal0-slot-``): live cgroup - ``memory.current`` — real resident bytes including weights + KV + - runtime overhead. Probed via :func:`_container_cgroup_mem_bytes`. + 2. **Container slots** (podman ``hal0-slot-``): ``max`` of the + live cgroup ``memory.current`` and the registry file-size + KV + estimate. The ``max`` guards against Strix Halo (UMA) under-report: + model weights live in GTT (system RAM via amdgpu/TTM) and are often + NOT charged to the process cgroup, so a live container can report a + cgroup of only ~2 GB while holding a ~22 GB model. When the cgroup + *does* account for weights it wins (≥ estimate); when it doesn't, the + estimate wins — so the figure never under-reports. 3. **Lemonade / lemond slots** (fallback): model file size from the registry plus a coarse KV-cache estimate scaled by context window. @@ -282,26 +287,9 @@ async def build_per_slot( } continue model_mb = (entry.get("size_bytes") or 0) / (1024 * 1024) - # ── Container cgroup probe (path 2) ──────────────────────────────── - # Try the live podman/docker cgroup first. Returns 0 when no - # container named hal0-slot- exists (i.e. for lemond slots), - # so the probe is naturally a no-op for the lemond path — no - # explicit runtime-type detection required. - cgroup_bytes = await _container_cgroup_mem_bytes(s.name) - if cgroup_bytes > 0: - # cgroup memory.current already includes weights + KV + runtime; - # do NOT add an additional KV estimate on top. - cgroup_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1) - out[s.name] = { - "vram_mb": cgroup_mb, - "ram_mb": 0.0, - "mem_mb": cgroup_mb, - "state": state, - "model_id": model_id, - } - continue - - # ── Lemond / file-size fallback (path 3) ─────────────────────────── + # ── Registry file-size + KV estimate (baseline for ALL non-NPU) ──── + # Compute the model-file-size + KV estimate up front so it can serve + # as a floor for the container cgroup probe below (see path 2). if model_mb <= 0 and registry is not None: try: m = registry.get(model_id) @@ -310,7 +298,25 @@ async def build_per_slot( except Exception: model_mb = 0.0 kv_mb = _kv_estimate_mb(_ctx_tokens_for(ctx_meta)) - resident_mb = round(model_mb + kv_mb, 1) + estimate_mb = round(model_mb + kv_mb, 1) + + # ── Container cgroup probe (path 2) ──────────────────────────────── + # Probe the live podman/docker cgroup. Returns 0 when no container + # named hal0-slot- exists (i.e. for lemond slots), so the probe + # is a no-op for the lemond path — no runtime-type detection needed. + # + # CRITICAL (#672 review): on Strix Halo (UMA) the model WEIGHTS live + # in GTT (system RAM via amdgpu/TTM) and are often NOT charged to the + # process memory cgroup. A live container can therefore report a + # cgroup of only ~2 GB (runtime/buffers) while holding a ~22 GB model. + # Using the cgroup unconditionally would UNDER-report. So we take the + # MAX of the cgroup and the registry estimate: + # • cgroup accurately includes weights → cgroup ≥ estimate → wins. + # • GTT not charged (cgroup too low) → estimate wins → no under-report. + cgroup_bytes = await _container_cgroup_mem_bytes(s.name) + cgroup_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1) + resident_mb = max(cgroup_mb, estimate_mb) + out[s.name] = { "vram_mb": resident_mb, "ram_mb": 0.0, diff --git a/tests/slots/test_container_cgroup_mem.py b/tests/slots/test_container_cgroup_mem.py index 67ad5fa1..efe14384 100644 --- a/tests/slots/test_container_cgroup_mem.py +++ b/tests/slots/test_container_cgroup_mem.py @@ -185,7 +185,7 @@ def _make_slot(self, name, state="ready", model_id="mymodel", backend="rocm"): @pytest.mark.asyncio async def test_container_slot_uses_cgroup_bytes(self): - """When cgroup probe returns bytes, build_per_slot uses them (no KV estimate).""" + """When cgroup exceeds the (zero) estimate, build_per_slot uses the cgroup value.""" slot = self._make_slot("primary") cgroup_bytes = 20 * 1024 * 1024 * 1024 # 20 GiB @@ -203,6 +203,60 @@ async def test_container_slot_uses_cgroup_bytes(self): assert row["vram_mb"] == expected_mb assert row["ram_mb"] == 0.0 + @pytest.mark.asyncio + async def test_container_under_report_uses_estimate_floor(self): + """#672 regression: Strix Halo GTT weights not charged to cgroup. + + When the cgroup reports a small value (~2 GiB runtime/buffers) but the + model is large (~22 GiB), mem_mb must be the registry estimate, NOT the + too-low cgroup value — otherwise the map under-reports container memory. + """ + slot = self._make_slot("primary-container") + small_cgroup = 2 * 1024 * 1024 * 1024 # 2 GiB (GTT weights NOT charged) + + model_mock = MagicMock() + model_mock.size_bytes = 22 * 1024 * 1024 * 1024 # 22 GiB model + model_mock.model_dump = lambda: {} + registry = MagicMock() + registry.get = MagicMock(return_value=model_mock) + + with patch( + "hal0.slots.capacity._container_cgroup_mem_bytes", + new_callable=AsyncMock, + return_value=small_cgroup, + ): + result = await build_per_slot([slot], registry=registry) + + row = result["primary-container"] + cgroup_mb = small_cgroup / (1024.0 * 1024.0) + file_mb = model_mock.size_bytes / (1024 * 1024) + # Estimate (file size + KV) must win over the under-reporting cgroup. + assert row["mem_mb"] >= file_mb + assert row["mem_mb"] > cgroup_mb + + @pytest.mark.asyncio + async def test_container_cgroup_wins_when_above_estimate(self): + """When the cgroup DOES account for weights it exceeds the estimate and wins.""" + slot = self._make_slot("primary-container") + big_cgroup = 24 * 1024 * 1024 * 1024 # 24 GiB (weights charged + overhead) + + model_mock = MagicMock() + model_mock.size_bytes = 22 * 1024 * 1024 * 1024 # 22 GiB model + model_mock.model_dump = lambda: {} + registry = MagicMock() + registry.get = MagicMock(return_value=model_mock) + + with patch( + "hal0.slots.capacity._container_cgroup_mem_bytes", + new_callable=AsyncMock, + return_value=big_cgroup, + ): + result = await build_per_slot([slot], registry=registry) + + row = result["primary-container"] + expected_mb = round(big_cgroup / (1024.0 * 1024.0), 1) + assert row["mem_mb"] == expected_mb + @pytest.mark.asyncio async def test_lemond_slot_falls_back_to_registry(self): """When cgroup probe returns 0, build_per_slot uses registry file size."""