From 539bb00476fb26ed47bfe0b4fd41bad3fbc70ebf Mon Sep 17 00:00:00 2001
From: Alexander <alexander@awideweb.com>
Date: Mon, 8 Jun 2026 09:02:47 -0400
Subject: [PATCH 1/2] feat(memory-map): container slot mem attribution via
 podman cgroup (#660)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Populate per-slot mem_mb for container slots from the podman/docker
cgroup (memory.current) so the memory map shows real resident bytes
instead of a file-size estimate.

Backend (capacity.py):
- Add `_container_cgroup_mem_bytes(slot_name)`: resolves podman/docker
  runtime via shutil.which, runs `<runtime> inspect -f {{.State.Pid}}
  hal0-slot-<name>`, walks to cgroupv2 memory.current. Returns 0 on
  any error (container absent = lemond slot, fallback is automatic).
- `build_per_slot` path 2: try cgroup probe for every non-NPU slot;
  use result if >0 (no KV estimate — cgroup already includes all
  resident bytes). Path 3 (lemond file-size + KV estimate) unchanged.

UI (memory-map.jsx):
- Add `slotLegendSub(slot)`: container slots show image tag (truncated
  to last 32 chars) or profile slug in legend; lemond slots keep
  device token as before.
- Expand slot model with `isContainer`, `image`, `profile` fields.
- Both sidebar and expanded legend variants use slotLegendSub.

Tests:
- 12 unit tests for _container_cgroup_mem_bytes (happy path, no
  runtime, container absent, timeout, cgroupv1, unreadable, docker
  fallback) and build_per_slot container vs lemond vs NPU paths.
- 2 Playwright specs: container legend shows image tag (HAL0_DATA
  injection via addInitScript); pool bar has ≥2 segments.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/hal0/slots/capacity.py               | 115 +++++++++-
 tests/slots/test_container_cgroup_mem.py | 262 +++++++++++++++++++++++
 ui/src/dash/memory-map.jsx               |  30 ++-
 ui/tests/e2e/specs/memory-map-v3.spec.ts |  55 +++++
 4 files changed, 456 insertions(+), 6 deletions(-)
 create mode 100644 tests/slots/test_container_cgroup_mem.py
diff --git a/src/hal0/slots/capacity.py b/src/hal0/slots/capacity.py
index acb15692..66620b9f 100644
--- a/src/hal0/slots/capacity.py
+++ b/src/hal0/slots/capacity.py
@@ -32,6 +32,10 @@
 if TYPE_CHECKING:
     from hal0.hardware.probe import HardwareInfo
 
+# Container-name prefix matches the convention in providers/container.py:
+# ``ExecStop = <runtime> stop -t 20 hal0-slot-<name>``.
+_CONTAINER_NAME_PREFIX = "hal0-slot-"
+
 
 # NOTE: We code against ``hal0.hardware.probe.HardwareInfo`` as the contract
 # even though the probe itself is currently a stub (raises NotImplementedError).
@@ -131,6 +135,79 @@ def _ctx_tokens_for(model_meta: dict[str, Any] | None) -> int:
     return _DEFAULT_CTX_TOKENS
 
 
+async def _container_cgroup_mem_bytes(slot_name: str) -> int:
+    """Cgroup-wide ``memory.current`` for the podman/docker container backing *slot_name*.
+
+    Container name convention: ``hal0-slot-<slot_name>`` (matches the
+    ``ExecStop`` line written by :mod:`hal0.providers.container`).
+
+    Resolution path:
+      1. Detect runtime (podman → docker) via the same logic as
+         :func:`hal0.providers.container._container_runtime`.
+      2. Run ``<runtime> inspect -f {{.State.Pid}} hal0-slot-<name>``
+         to get the container init PID.
+      3. Read ``/proc/<pid>/cgroup`` for the cgroupv2 unified path.
+      4. Read ``/sys/fs/cgroup/<path>/memory.current``.
+
+    Returns 0 on any error so the caller can fall back gracefully —
+    a missing/stopped container is not exceptional; it just means the
+    slot is not backed by a container runtime.
+
+    The returned value includes model weights + KV-cache + runtime
+    overhead as measured by the cgroup; callers MUST NOT add an
+    additional KV estimate on top.
+    """
+    import shutil
+
+    # Resolve the container runtime binary (podman preferred over docker).
+    runtime = None
+    for candidate in ("podman", "docker"):
+        found = shutil.which(candidate)
+        if found:
+            runtime = found
+            break
+    if runtime is None:
+        return 0
+
+    container_name = f"{_CONTAINER_NAME_PREFIX}{slot_name}"
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            runtime,
+            "inspect",
+            "-f",
+            "{{.State.Pid}}",
+            container_name,
+            stdin=asyncio.subprocess.DEVNULL,
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.DEVNULL,
+        )
+        out, _ = await asyncio.wait_for(proc.communicate(), timeout=1.5)
+    except (TimeoutError, FileNotFoundError, OSError):
+        return 0
+    if proc.returncode != 0:
+        return 0
+    try:
+        pid = int(out.decode("utf-8", errors="replace").strip() or 0)
+    except ValueError:
+        pid = 0
+    if pid <= 0:
+        return 0
+    try:
+        with open(f"/proc/{pid}/cgroup", encoding="utf-8") as f:
+            cg_line = f.readline().strip()
+    except OSError:
+        return 0
+    # cgroupv2 unified hierarchy line: "0::/system.slice/podman-<id>.scope"
+    if "::" not in cg_line:
+        return 0
+    cg_rel = cg_line.split("::", 1)[1].lstrip("/")
+    try:
+        with open(f"/sys/fs/cgroup/{cg_rel}/memory.current", encoding="utf-8") as f:
+            return int(f.read().strip() or 0)
+    except (OSError, ValueError):
+        return 0
+
+
 async def build_per_slot(
     slots: list[Any],
     *,
@@ -145,10 +222,19 @@ async def build_per_slot(
         {slot_name: {"vram_mb", "ram_mb", "mem_mb", "state", "model_id"}}
 
     where ``mem_mb`` (== ``vram_mb`` on UMA) is the best-estimate resident
-    footprint: model file size (from the registry, or the FLM footprint
-    for NPU tags) plus a coarse KV-cache estimate scaled by the model's
-    context window. A loaded 25B Q5 model therefore reports ~tens of GB
-    rather than 0 — the number the dashboard memory map needs.
+    footprint.  Three attribution paths, in priority order:
+
+    1. **NPU / FLM slots**: FLM catalog footprint_gb (includes runtime + KV).
+    2. **Container slots** (podman ``hal0-slot-<name>``): live cgroup
+       ``memory.current`` — real resident bytes including weights + KV +
+       runtime overhead.  Probed via :func:`_container_cgroup_mem_bytes`.
+    3. **Lemonade / lemond slots** (fallback): model file size from the
+       registry plus a coarse KV-cache estimate scaled by context window.
+
+    The cgroup probe is attempted for every non-NPU slot and naturally
+    returns 0 for lemond slots (no matching container exists), so the
+    container → lemond fallback is automatic — no explicit runtime-type
+    detection required.
 
     Non-resident slots are omitted so the caller can render them as
     holding no memory. Never raises: a registry miss yields a 0-size row
@@ -196,6 +282,26 @@ async def build_per_slot(
                     }
                     continue
                 model_mb = (entry.get("size_bytes") or 0) / (1024 * 1024)
+        # ── Container cgroup probe (path 2) ────────────────────────────────
+        # Try the live podman/docker cgroup first.  Returns 0 when no
+        # container named hal0-slot-<name> exists (i.e. for lemond slots),
+        # so the probe is naturally a no-op for the lemond path — no
+        # explicit runtime-type detection required.
+        cgroup_bytes = await _container_cgroup_mem_bytes(s.name)
+        if cgroup_bytes > 0:
+            # cgroup memory.current already includes weights + KV + runtime;
+            # do NOT add an additional KV estimate on top.
+            cgroup_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1)
+            out[s.name] = {
+                "vram_mb": cgroup_mb,
+                "ram_mb": 0.0,
+                "mem_mb": cgroup_mb,
+                "state": state,
+                "model_id": model_id,
+            }
+            continue
+
+        # ── Lemond / file-size fallback (path 3) ───────────────────────────
         if model_mb <= 0 and registry is not None:
             try:
                 m = registry.get(model_id)
@@ -353,5 +459,6 @@ def as_dict(self) -> dict[str, Any]:
 __all__ = [
     "CapacityProbeError",
     "CapacitySnapshot",
+    "_container_cgroup_mem_bytes",
     "build_per_slot",
 ]
diff --git a/tests/slots/test_container_cgroup_mem.py b/tests/slots/test_container_cgroup_mem.py
new file mode 100644
index 00000000..67ad5fa1
--- /dev/null
+++ b/tests/slots/test_container_cgroup_mem.py
@@ -0,0 +1,262 @@
+"""Tests for podman cgroup mem probe in hal0.slots.capacity.
+
+Covers :func:`hal0.slots.capacity._container_cgroup_mem_bytes` — the
+function that reads live ``memory.current`` for a podman/docker container
+named ``hal0-slot-<name>``.  All subprocess and file I/O is mocked so
+these tests run without a real container runtime.
+"""
+
+from __future__ import annotations
+
+from unittest.mock import AsyncMock, MagicMock, mock_open, patch
+
+import pytest
+
+from hal0.slots.capacity import _container_cgroup_mem_bytes, build_per_slot
+
+# ── helpers ────────────────────────────────────────────────────────────────
+
+
+def _make_proc(returncode: int = 0, stdout: bytes = b"") -> AsyncMock:
+    """Return a fake asyncio.subprocess.Process mock."""
+    proc = AsyncMock()
+    proc.returncode = returncode
+    proc.communicate = AsyncMock(return_value=(stdout, b""))
+    return proc
+
+
+# ── _container_cgroup_mem_bytes ─────────────────────────────────────────────
+
+
+class TestContainerCgroupMemBytes:
+    """Unit tests for the podman cgroup probe."""
+
+    @pytest.mark.asyncio
+    async def test_returns_bytes_on_happy_path(self, tmp_path):
+        """Full happy-path: podman inspect → PID → cgroup path → memory.current."""
+        cg_rel = "system.slice/libpod-abc123.scope"
+        cg_line = f"0::/{cg_rel}\n"
+        mem_current = "12345678\n"
+
+        def _open_side_effect(path, *args, **kwargs):
+            # /proc/<pid>/cgroup: contains "cgroup" but NOT "memory"
+            if "cgroup" in str(path) and "memory" not in str(path):
+                return mock_open(read_data=cg_line)()
+            # /sys/fs/cgroup/.../memory.current: contains "memory.current"
+            if "memory.current" in str(path):
+                return mock_open(read_data=mem_current)()
+            raise FileNotFoundError(path)
+
+        with (
+            patch("shutil.which", return_value="/usr/bin/podman"),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+            patch("builtins.open", side_effect=_open_side_effect),
+        ):
+            mock_exec.return_value = _make_proc(returncode=0, stdout=b"12345\n")
+            result = await _container_cgroup_mem_bytes("primary")
+
+        assert result == 12345678
+
+    @pytest.mark.asyncio
+    async def test_returns_zero_when_no_runtime(self):
+        """No podman or docker binary → 0."""
+        with patch("shutil.which", return_value=None):
+            result = await _container_cgroup_mem_bytes("primary")
+        assert result == 0
+
+    @pytest.mark.asyncio
+    async def test_returns_zero_when_container_not_found(self):
+        """Container inspect returns non-zero (container doesn't exist) → 0."""
+        with (
+            patch("shutil.which", return_value="/usr/bin/podman"),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+        ):
+            mock_exec.return_value = _make_proc(returncode=1, stdout=b"")
+            result = await _container_cgroup_mem_bytes("primary")
+        assert result == 0
+
+    @pytest.mark.asyncio
+    async def test_returns_zero_when_pid_is_zero(self):
+        """Inspect returns PID 0 (stopped container) → 0."""
+        with (
+            patch("shutil.which", return_value="/usr/bin/podman"),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+        ):
+            mock_exec.return_value = _make_proc(returncode=0, stdout=b"0\n")
+            result = await _container_cgroup_mem_bytes("primary")
+        assert result == 0
+
+    @pytest.mark.asyncio
+    async def test_returns_zero_on_inspect_timeout(self):
+        """asyncio.wait_for timeout during inspect → 0."""
+        with (
+            patch("shutil.which", return_value="/usr/bin/podman"),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+            patch("asyncio.wait_for", side_effect=TimeoutError),
+        ):
+            mock_exec.return_value = _make_proc(returncode=0, stdout=b"12\n")
+            result = await _container_cgroup_mem_bytes("primary")
+        assert result == 0
+
+    @pytest.mark.asyncio
+    async def test_returns_zero_on_cgroup_v1(self):
+        """cgroupv1 line lacks '::' → probe cannot walk path → 0."""
+        cg_line = "1:cpu,cpuacct:/system.slice/podman-abc.scope\n"
+
+        def _open_side_effect(path, *args, **kwargs):
+            if "cgroup" in str(path):
+                return mock_open(read_data=cg_line)()
+            raise FileNotFoundError(path)
+
+        with (
+            patch("shutil.which", return_value="/usr/bin/podman"),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+            patch("builtins.open", side_effect=_open_side_effect),
+        ):
+            mock_exec.return_value = _make_proc(returncode=0, stdout=b"99\n")
+            result = await _container_cgroup_mem_bytes("primary")
+        assert result == 0
+
+    @pytest.mark.asyncio
+    async def test_returns_zero_when_memory_current_unreadable(self):
+        """memory.current read fails (OSError) → 0."""
+        cg_line = "0::/system.slice/libpod-abc.scope\n"
+
+        def _open_side_effect(path, *args, **kwargs):
+            if "cgroup" in str(path) and "memory" not in str(path):
+                return mock_open(read_data=cg_line)()
+            raise OSError("no such file")
+
+        with (
+            patch("shutil.which", return_value="/usr/bin/podman"),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+            patch("builtins.open", side_effect=_open_side_effect),
+        ):
+            mock_exec.return_value = _make_proc(returncode=0, stdout=b"42\n")
+            result = await _container_cgroup_mem_bytes("primary")
+        assert result == 0
+
+    @pytest.mark.asyncio
+    async def test_docker_fallback_when_podman_absent(self):
+        """Falls back to docker when podman is not found."""
+        cg_rel = "system.slice/docker-xyz.scope"
+        cg_line = f"0::/{cg_rel}\n"
+        mem_current = "999999\n"
+
+        def _which(cmd):
+            return "/usr/bin/docker" if cmd == "docker" else None
+
+        def _open_side_effect(path, *args, **kwargs):
+            if "cgroup" in str(path) and "memory" not in str(path):
+                return mock_open(read_data=cg_line)()
+            if "memory.current" in str(path):
+                return mock_open(read_data=mem_current)()
+            raise FileNotFoundError(path)
+
+        with (
+            patch("shutil.which", side_effect=_which),
+            patch("asyncio.create_subprocess_exec") as mock_exec,
+            patch("builtins.open", side_effect=_open_side_effect),
+        ):
+            mock_exec.return_value = _make_proc(returncode=0, stdout=b"77\n")
+            result = await _container_cgroup_mem_bytes("primary")
+
+        assert result == 999999
+        # Verify docker was called (not podman)
+        call_args = mock_exec.call_args[0]
+        assert call_args[0] == "/usr/bin/docker"
+
+
+# ── build_per_slot integration ──────────────────────────────────────────────
+
+
+class TestBuildPerSlotContainerPath:
+    """Verify build_per_slot uses cgroup bytes for container slots
+    and falls back to file-size estimate for lemond slots."""
+
+    def _make_slot(self, name, state="ready", model_id="mymodel", backend="rocm"):
+        slot = MagicMock()
+        slot.name = name
+        slot.state = state
+        slot.model_id = model_id
+        slot.backend = backend
+        slot.metadata = {"provider": "lemonade", "backend": backend}
+        return slot
+
+    @pytest.mark.asyncio
+    async def test_container_slot_uses_cgroup_bytes(self):
+        """When cgroup probe returns bytes, build_per_slot uses them (no KV estimate)."""
+        slot = self._make_slot("primary")
+        cgroup_bytes = 20 * 1024 * 1024 * 1024  # 20 GiB
+
+        with patch(
+            "hal0.slots.capacity._container_cgroup_mem_bytes",
+            new_callable=AsyncMock,
+            return_value=cgroup_bytes,
+        ):
+            result = await build_per_slot([slot])
+
+        assert "primary" in result
+        row = result["primary"]
+        expected_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1)
+        assert row["mem_mb"] == expected_mb
+        assert row["vram_mb"] == expected_mb
+        assert row["ram_mb"] == 0.0
+
+    @pytest.mark.asyncio
+    async def test_lemond_slot_falls_back_to_registry(self):
+        """When cgroup probe returns 0, build_per_slot uses registry file size."""
+        slot = self._make_slot("primary")
+        model_mock = MagicMock()
+        model_mock.size_bytes = 15 * 1024 * 1024 * 1024  # 15 GiB
+        model_mock.model_dump = lambda: {}
+
+        registry = MagicMock()
+        registry.get = MagicMock(return_value=model_mock)
+
+        with patch(
+            "hal0.slots.capacity._container_cgroup_mem_bytes",
+            new_callable=AsyncMock,
+            return_value=0,
+        ):
+            result = await build_per_slot([slot], registry=registry)
+
+        assert "primary" in result
+        row = result["primary"]
+        # Registry file size in MiB (no KV estimate for 0-token default context)
+        file_mb = model_mock.size_bytes / (1024 * 1024)
+        # KV estimate is added — just check it's ≥ file_mb
+        assert row["mem_mb"] >= file_mb
+
+    @pytest.mark.asyncio
+    async def test_npu_slot_skips_cgroup_probe(self):
+        """NPU/FLM slots use the FLM footprint path and never call the cgroup probe."""
+        slot = self._make_slot("npu-chat", backend="flm")
+        slot.metadata = {"provider": "flm", "backend": "flm"}
+
+        flm_catalog = {
+            "mymodel": {"footprint_gb": 4.5, "size_bytes": 0},
+        }
+
+        with patch(
+            "hal0.slots.capacity._container_cgroup_mem_bytes",
+            new_callable=AsyncMock,
+            return_value=99999,  # should never be called
+        ) as mock_probe:
+            result = await build_per_slot([slot], flm_catalog=flm_catalog)
+
+        mock_probe.assert_not_called()
+        assert "npu-chat" in result
+        assert result["npu-chat"]["mem_mb"] == round(4.5 * 1024, 1)
+
+    @pytest.mark.asyncio
+    async def test_offline_slot_omitted(self):
+        """Slots in non-resident states produce no row."""
+        slot = self._make_slot("primary", state="offline")
+        with patch(
+            "hal0.slots.capacity._container_cgroup_mem_bytes",
+            new_callable=AsyncMock,
+            return_value=0,
+        ):
+            result = await build_per_slot([slot])
+        assert result == {}
diff --git a/ui/src/dash/memory-map.jsx b/ui/src/dash/memory-map.jsx
index ea0a8125..d94a49e8 100644
--- a/ui/src/dash/memory-map.jsx
+++ b/ui/src/dash/memory-map.jsx
@@ -271,6 +271,13 @@ export function useMemoryMapModel() {
           bytesGb: a.bytesGb,
           modelId: a.slot.model || '',
           approx: a.approx,
+          // Container runtime fields — present when slot.runtime="container".
+          // `image` is the full image tag; `profile` is the profile slug.
+          // Used by the legend to show context relevant for container slots
+          // (device chip is uninformative for containers — all are "rocm").
+          isContainer: a.slot.runtime === 'container' || a.slot.container_status != null,
+          image: a.slot.image || null,
+          profile: a.slot.profile || null,
         }))
       })(),
     },
@@ -374,6 +381,25 @@ function HostBar({ model }) {
   )
 }
 
+// Per-slot legend sub-label.
+// Container slots: show image tag (truncated at 32 chars) or profile slug —
+// more informative than "rocm" which every container shares.
+// Lemond slots: keep the device token (rocm/vulkan/cpu/npu) as before.
+function slotLegendSub(slot) {
+  if (slot.isContainer) {
+    // Prefer image tag (e.g. "ghcr.io/hal0ai/amd-strix-halo-toolboxes:rocm-7.2.4-server")
+    // Truncate to last 32 chars so very long registry paths stay readable.
+    if (slot.image) {
+      const img = String(slot.image)
+      return img.length > 32 ? '…' + img.slice(-32) : img
+    }
+    if (slot.profile) return String(slot.profile)
+    return 'container'
+  }
+  // Lemond / legacy path: show device + approx marker.
+  return `${slot.device}${slot.approx ? ' · ≈' : ''}`
+}
+
 function LegendRow({ swatch, name, sub, sz }) {
   return (
     <div className="ln mono">
@@ -418,7 +444,7 @@ export function MemoryMap({ variant = 'sidebar', onConfigure }) {
               key={s.name}
               swatch={s.color}
               name={s.name}
-              sub={`${s.device}${s.approx ? ' · ≈' : ''}${s.modelId ? ' · ' + s.modelId : ''}`}
+              sub={`${slotLegendSub(s)}${s.modelId ? ' · ' + s.modelId : ''}`}
               sz={s.bytesGb}
             />
           ))}
@@ -480,7 +506,7 @@ export function MemoryMap({ variant = 'sidebar', onConfigure }) {
                 key={s.name}
                 swatch={s.color}
                 name={s.name}
-                sub={s.device}
+                sub={slotLegendSub(s)}
                 sz={s.bytesGb}
               />
             ))}
diff --git a/ui/tests/e2e/specs/memory-map-v3.spec.ts b/ui/tests/e2e/specs/memory-map-v3.spec.ts
index 8c4e6d50..31851784 100644
--- a/ui/tests/e2e/specs/memory-map-v3.spec.ts
+++ b/ui/tests/e2e/specs/memory-map-v3.spec.ts
@@ -187,6 +187,61 @@ test.describe('Memory map — sidebar', () => {
     const c1 = await swatches.nth(1).evaluate((el) => getComputedStyle(el).backgroundColor)
     expect(c0).not.toBe(c1)
   })
+
+  test('container slot legend shows image tag, not device token', async ({ page }) => {
+    // #660: container slots are uninformative with "device=rocm" in the
+    // legend sub — every GPU container slot shares the same device token.
+    // The backend emits image + profile for container slots; the legend
+    // must show the (truncated) image tag instead.
+    //
+    // With VITE_MOCK_LEMONADE=1 the mock shim reads HAL0_DATA directly;
+    // page.route() never fires. Inject the container slot via addInitScript
+    // before data.jsx sets window.HAL0_DATA, so the mock harness picks it up.
+    const containerSlot = {
+      name: 'primary-container', type: 'llm', device: 'gpu-rocm',
+      model: 'qwen3.6-35b-a3b-q4_k_m', model_id: 'qwen3.6-35b-a3b',
+      group: 'chat', state: 'ready', port: 8096,
+      runtime: 'container',
+      profile: 'rocmfp4-mtp',
+      image: 'ghcr.io/hal0ai/amd-strix-halo-toolboxes:rocm-7.2.4-rocmfp4-server',
+      image_status: 'present',
+      container_status: 'running',
+      container_health: true,
+      mem_mb: 22400,
+    }
+    await page.addInitScript((slot) => {
+      // Intercept the HAL0_DATA setter that data.jsx calls, prepend our slot.
+      let stored: any = undefined
+      Object.defineProperty(window, 'HAL0_DATA', {
+        set(v: any) {
+          stored = { ...v, slots: [slot, ...(v.slots || [])] }
+        },
+        get() { return stored },
+        configurable: true,
+      })
+    }, containerSlot)
+    await mockStatsHardware(page, { configured: false, detected: false })
+    await page.goto('/#dashboard')
+    const legend = page.locator('.memmap-sidebar .memmap-legend')
+    await expect(legend).toBeVisible()
+    // The container slot's legend sub must contain part of the image tag.
+    // slotLegendSub truncates to the last 32 chars of the image string:
+    // "…rocm-7.2.4-rocmfp4-server"
+    await expect(legend).toContainText('rocm-7.2.4-rocmfp4-server')
+  })
+
+  test('container slot mem_mb attributed in pool bar', async ({ page }) => {
+    // #660: container slots with mem_mb > 0 must register as a non-zero
+    // segment in the pool bar — i.e. the bar has ≥2 <i> width segments
+    // (at least one slot + the free remainder).
+    await mockStatsHardware(page, { configured: false, detected: false })
+    await page.goto('/#dashboard')
+    const bar = page.locator('.memmap-sidebar .memmap-bar')
+    await expect(bar).toBeVisible()
+    const segments = bar.locator('i[style*="width"]')
+    const count = await segments.count()
+    expect(count).toBeGreaterThanOrEqual(2)
+  })
 })
 
 // NOTE: the `Memory map — expanded` variant (with its Proxmox host-pressure

From 2d98928a25747a1b1fa00b9dbe65e989abcb2dfc Mon Sep 17 00:00:00 2001
From: Alexander <alexander@awideweb.com>
Date: Mon, 8 Jun 2026 09:11:36 -0400
Subject: [PATCH 2/2] fix(memory-map): floor container cgroup mem at registry
 estimate (#660)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#672 review: on Strix Halo (UMA) model weights live in GTT (system RAM
via amdgpu/TTM) and are often NOT charged to the process memory cgroup.
A live container can report a cgroup of only ~2 GB (runtime/buffers)
while holding a ~22 GB model. The previous `if cgroup_bytes > 0` used
the cgroup unconditionally, REPLACING the correct registry estimate
with a wrong-LOW value — under-reporting container memory.

Fix: compute BOTH the cgroup value and the registry file-size + KV
estimate for container slots, and use `max(cgroup_mb, estimate_mb)`.
When the cgroup accounts for weights it wins (>= estimate); when GTT
isn't charged the estimate wins → never under-reports. lemond/NPU
paths unchanged.

Tests: add under-report case (2 GB cgroup, 22 GB model -> estimate
wins) and cgroup-wins case (24 GB cgroup > 22 GB model -> cgroup wins).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/hal0/slots/capacity.py               | 54 +++++++++++++----------
 tests/slots/test_container_cgroup_mem.py | 56 +++++++++++++++++++++++-
 2 files changed, 85 insertions(+), 25 deletions(-)

diff --git a/src/hal0/slots/capacity.py b/src/hal0/slots/capacity.py
index 66620b9f..32b7e9a6 100644
--- a/src/hal0/slots/capacity.py
+++ b/src/hal0/slots/capacity.py
@@ -225,9 +225,14 @@ async def build_per_slot(
     footprint.  Three attribution paths, in priority order:
 
     1. **NPU / FLM slots**: FLM catalog footprint_gb (includes runtime + KV).
-    2. **Container slots** (podman ``hal0-slot-<name>``): live cgroup
-       ``memory.current`` — real resident bytes including weights + KV +
-       runtime overhead.  Probed via :func:`_container_cgroup_mem_bytes`.
+    2. **Container slots** (podman ``hal0-slot-<name>``): ``max`` of the
+       live cgroup ``memory.current`` and the registry file-size + KV
+       estimate.  The ``max`` guards against Strix Halo (UMA) under-report:
+       model weights live in GTT (system RAM via amdgpu/TTM) and are often
+       NOT charged to the process cgroup, so a live container can report a
+       cgroup of only ~2 GB while holding a ~22 GB model.  When the cgroup
+       *does* account for weights it wins (≥ estimate); when it doesn't, the
+       estimate wins — so the figure never under-reports.
     3. **Lemonade / lemond slots** (fallback): model file size from the
        registry plus a coarse KV-cache estimate scaled by context window.
 
@@ -282,26 +287,9 @@ async def build_per_slot(
                     }
                     continue
                 model_mb = (entry.get("size_bytes") or 0) / (1024 * 1024)
-        # ── Container cgroup probe (path 2) ────────────────────────────────
-        # Try the live podman/docker cgroup first.  Returns 0 when no
-        # container named hal0-slot-<name> exists (i.e. for lemond slots),
-        # so the probe is naturally a no-op for the lemond path — no
-        # explicit runtime-type detection required.
-        cgroup_bytes = await _container_cgroup_mem_bytes(s.name)
-        if cgroup_bytes > 0:
-            # cgroup memory.current already includes weights + KV + runtime;
-            # do NOT add an additional KV estimate on top.
-            cgroup_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1)
-            out[s.name] = {
-                "vram_mb": cgroup_mb,
-                "ram_mb": 0.0,
-                "mem_mb": cgroup_mb,
-                "state": state,
-                "model_id": model_id,
-            }
-            continue
-
-        # ── Lemond / file-size fallback (path 3) ───────────────────────────
+        # ── Registry file-size + KV estimate (baseline for ALL non-NPU) ────
+        # Compute the model-file-size + KV estimate up front so it can serve
+        # as a floor for the container cgroup probe below (see path 2).
         if model_mb <= 0 and registry is not None:
             try:
                 m = registry.get(model_id)
@@ -310,7 +298,25 @@ async def build_per_slot(
             except Exception:
                 model_mb = 0.0
         kv_mb = _kv_estimate_mb(_ctx_tokens_for(ctx_meta))
-        resident_mb = round(model_mb + kv_mb, 1)
+        estimate_mb = round(model_mb + kv_mb, 1)
+
+        # ── Container cgroup probe (path 2) ────────────────────────────────
+        # Probe the live podman/docker cgroup.  Returns 0 when no container
+        # named hal0-slot-<name> exists (i.e. for lemond slots), so the probe
+        # is a no-op for the lemond path — no runtime-type detection needed.
+        #
+        # CRITICAL (#672 review): on Strix Halo (UMA) the model WEIGHTS live
+        # in GTT (system RAM via amdgpu/TTM) and are often NOT charged to the
+        # process memory cgroup.  A live container can therefore report a
+        # cgroup of only ~2 GB (runtime/buffers) while holding a ~22 GB model.
+        # Using the cgroup unconditionally would UNDER-report.  So we take the
+        # MAX of the cgroup and the registry estimate:
+        #   • cgroup accurately includes weights → cgroup ≥ estimate → wins.
+        #   • GTT not charged (cgroup too low)   → estimate wins → no under-report.
+        cgroup_bytes = await _container_cgroup_mem_bytes(s.name)
+        cgroup_mb = round(cgroup_bytes / (1024.0 * 1024.0), 1)
+        resident_mb = max(cgroup_mb, estimate_mb)
+
         out[s.name] = {
             "vram_mb": resident_mb,
             "ram_mb": 0.0,
diff --git a/tests/slots/test_container_cgroup_mem.py b/tests/slots/test_container_cgroup_mem.py
index 67ad5fa1..efe14384 100644
--- a/tests/slots/test_container_cgroup_mem.py
+++ b/tests/slots/test_container_cgroup_mem.py
@@ -185,7 +185,7 @@ def _make_slot(self, name, state="ready", model_id="mymodel", backend="rocm"):
 
     @pytest.mark.asyncio
     async def test_container_slot_uses_cgroup_bytes(self):
-        """When cgroup probe returns bytes, build_per_slot uses them (no KV estimate)."""
+        """When cgroup exceeds the (zero) estimate, build_per_slot uses the cgroup value."""
         slot = self._make_slot("primary")
         cgroup_bytes = 20 * 1024 * 1024 * 1024  # 20 GiB
 
@@ -203,6 +203,60 @@ async def test_container_slot_uses_cgroup_bytes(self):
         assert row["vram_mb"] == expected_mb
         assert row["ram_mb"] == 0.0
 
+    @pytest.mark.asyncio
+    async def test_container_under_report_uses_estimate_floor(self):
+        """#672 regression: Strix Halo GTT weights not charged to cgroup.
+
+        When the cgroup reports a small value (~2 GiB runtime/buffers) but the
+        model is large (~22 GiB), mem_mb must be the registry estimate, NOT the
+        too-low cgroup value — otherwise the map under-reports container memory.
+        """
+        slot = self._make_slot("primary-container")
+        small_cgroup = 2 * 1024 * 1024 * 1024  # 2 GiB (GTT weights NOT charged)
+
+        model_mock = MagicMock()
+        model_mock.size_bytes = 22 * 1024 * 1024 * 1024  # 22 GiB model
+        model_mock.model_dump = lambda: {}
+        registry = MagicMock()
+        registry.get = MagicMock(return_value=model_mock)
+
+        with patch(
+            "hal0.slots.capacity._container_cgroup_mem_bytes",
+            new_callable=AsyncMock,
+            return_value=small_cgroup,
+        ):
+            result = await build_per_slot([slot], registry=registry)
+
+        row = result["primary-container"]
+        cgroup_mb = small_cgroup / (1024.0 * 1024.0)
+        file_mb = model_mock.size_bytes / (1024 * 1024)
+        # Estimate (file size + KV) must win over the under-reporting cgroup.
+        assert row["mem_mb"] >= file_mb
+        assert row["mem_mb"] > cgroup_mb
+
+    @pytest.mark.asyncio
+    async def test_container_cgroup_wins_when_above_estimate(self):
+        """When the cgroup DOES account for weights it exceeds the estimate and wins."""
+        slot = self._make_slot("primary-container")
+        big_cgroup = 24 * 1024 * 1024 * 1024  # 24 GiB (weights charged + overhead)
+
+        model_mock = MagicMock()
+        model_mock.size_bytes = 22 * 1024 * 1024 * 1024  # 22 GiB model
+        model_mock.model_dump = lambda: {}
+        registry = MagicMock()
+        registry.get = MagicMock(return_value=model_mock)
+
+        with patch(
+            "hal0.slots.capacity._container_cgroup_mem_bytes",
+            new_callable=AsyncMock,
+            return_value=big_cgroup,
+        ):
+            result = await build_per_slot([slot], registry=registry)
+
+        row = result["primary-container"]
+        expected_mb = round(big_cgroup / (1024.0 * 1024.0), 1)
+        assert row["mem_mb"] == expected_mb
+
     @pytest.mark.asyncio
     async def test_lemond_slot_falls_back_to_registry(self):
         """When cgroup probe returns 0, build_per_slot uses registry file size."""