konjoai · konjoinfinity · May 19, 2026 · May 19, 2026 · May 19, 2026 · May 19, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -62,20 +62,23 @@ jobs:
         #   test_git_integration.py - live-server integration requiring model weights.
         #                             Run: pytest tests/test_git_integration.py --model qwen3:8b
         #   tests/quant/test_int4_loader.py - requires squish_quant Rust extension (cargo/maturin).
-        #   The following three files have `import mlx.core` at module level (inside try/except).
+        #   The following files have module-level Metal/MLX imports (directly or via squish modules).
         #   On macOS CI runners the Metal GPU context is not ready at pytest collection time;
         #   the bare import triggers Metal init → SIGABRT (exit 134) killing the entire process.
-        #   All three must be run manually on real Apple Silicon hardware with a warmed Metal context.
+        #   All must be run manually on real Apple Silicon hardware with a warmed Metal context.
         #   tests/test_sqint2_linear.py   - module-level mlx.core import → Metal SIGABRT
         #   tests/test_backend_unit.py    - module-level mlx.core import → Metal SIGABRT
         #   tests/quant/test_int3_linear_unit.py - module-level mlx.core import → Metal SIGABRT
+        #   tests/hardware/test_fused_kernels_unit.py - calls importorskip(mlx.core), then imports
+        #       squish.hardware.fused_kernels, which imports mlx.nn at module level → Metal SIGABRT
         run: |
           pytest tests/ \
             --ignore=tests/test_git_integration.py \
             --ignore=tests/quant/test_int4_loader.py \
             --ignore=tests/test_sqint2_linear.py \
             --ignore=tests/test_backend_unit.py \
             --ignore=tests/quant/test_int3_linear_unit.py \
+            --ignore=tests/hardware/test_fused_kernels_unit.py \
             --timeout=60 \
             -v --tb=short -q
 
@@ -116,14 +119,16 @@ jobs:
       - name: Run full test suite (Rust extension available)
         # test_int4_loader.py is now included — requires squish_quant built above.
         # test_git_integration.py still excluded (needs live model weights).
-        # Metal SIGABRT exclusions (module-level mlx.core import → exit 134 at collection time):
-        #   test_sqint2_linear.py, test_backend_unit.py, test_int3_linear_unit.py
+        # Metal SIGABRT exclusions (module-level mlx.core/mlx.nn import → exit 134 at collection time):
+        #   test_sqint2_linear.py, test_backend_unit.py, test_int3_linear_unit.py,
+        #   tests/hardware/test_fused_kernels_unit.py (importorskip → mlx.nn via fused_kernels)
         run: |
           .venv/bin/pytest tests/ \
             --ignore=tests/test_git_integration.py \
             --ignore=tests/test_sqint2_linear.py \
             --ignore=tests/test_backend_unit.py \
             --ignore=tests/quant/test_int3_linear_unit.py \
+            --ignore=tests/hardware/test_fused_kernels_unit.py \
             --timeout=120 \
             -v --tb=short -q
 

diff --git a/demo/index.html b/demo/index.html
@@ -870,6 +870,22 @@ <h1>Compression, made visible.</h1>
 </main>
 
 <script>
+  /* ── Architecture lookup table (mirrors demo/server.py _ARCH_TABLE) ─
+     Kept in sync with the Python table so the savings calculator can
+     project KV memory before the user submits the form to the backend.
+     test_wave108_calculator.py::TestArchTableJsParity enforces parity. */
+  const ARCH_TABLE = [
+    { cap: 0.6,      n_layers: 24, n_kv_heads: 2, head_dim: 64,  label: '0.5B-class' },
+    { cap: 1.0,      n_layers: 24, n_kv_heads: 2, head_dim: 128, label: '1B-class' },
+    { cap: 2.0,      n_layers: 28, n_kv_heads: 2, head_dim: 128, label: '1.5B–2B-class' },
+    { cap: 4.0,      n_layers: 32, n_kv_heads: 4, head_dim: 128, label: '3B-class' },
+    { cap: 9.0,      n_layers: 32, n_kv_heads: 8, head_dim: 128, label: '7B–8B-class' },
+    { cap: 15.0,     n_layers: 40, n_kv_heads: 8, head_dim: 128, label: '13B-class' },
+    { cap: 35.0,     n_layers: 48, n_kv_heads: 8, head_dim: 128, label: '30B-class' },
+    { cap: 75.0,     n_layers: 80, n_kv_heads: 8, head_dim: 128, label: '70B-class' },
+    { cap: Infinity, n_layers: 96, n_kv_heads: 8, head_dim: 128, label: '100B+-class' },
+  ];
+
   /* ════════════════════════════════════════════════════════════════════
      W109 dashboard logic
      The visual is pure CSS — JS only updates a few CSS custom props

diff --git a/squish/kv/kv_cache.py b/squish/kv/kv_cache.py
diff --git a/squish/server.py b/squish/server.py
@@ -128,6 +128,31 @@ def _require(pkg: str, install: str | None = None) -> None:
         print(f"  {_C.PK}✗  Missing dependency:{_C.R}  {_C.W}{pkg}{_C.R}  {_C.DIM}→  pip install {hint}{_C.R}")
         sys.exit(1)
 
+
+# mlx-lm 0.31.0 was yanked (March 2026) for a batched KV cache cross-contamination
+# bug — a correctness regression in server mode where different requests in a batch
+# could overwrite each other's KV state.  0.31.1+ is safe.  0.31.2+ also adds
+# native MTP speculative decoding for Qwen3.5/3.6.
+_MLX_LM_BAD_VERSION = "0.31.0"
+
+def _check_mlx_lm_version() -> None:
+    """Print an upgrade warning on macOS when the installed mlx-lm is the yanked 0.31.0 release."""
+    if sys.platform != "darwin":
+        return  # the check only runs on macOS ("darwin")
+    try:
+        import importlib.metadata as _im
+        ver = _im.version("mlx-lm")
+    except Exception:
+        return  # best-effort: skip silently if mlx-lm is not installed or its metadata is unavailable
+    if ver == _MLX_LM_BAD_VERSION:  # exact string match; no other version is flagged
+        print(
+            f"\n  {_C.PK}⚠  mlx-lm {ver} is UNSAFE and was yanked from PyPI.{_C.R}\n"
+            f"  {_C.DIM}Batched KV cache cross-contamination bug: different requests\n"
+            f"  can corrupt each other's KV state in server mode.{_C.R}\n"
+            f"  {_C.W}Upgrade immediately:{_C.R}  {_C.DIM}pip install 'mlx-lm>=0.31.1'{_C.R}\n"
+        )
+
+
 _require("fastapi")
 
 from fastapi import FastAPI, HTTPException, Request, Security  # noqa: E402
@@ -3993,6 +4018,7 @@ def main():  # pragma: no cover
                 "[paged-attention] could not initialise (%s) — disabled", _paged_err
             )
 
+    _check_mlx_lm_version()
     _print_banner()
 
     if getattr(args, "mlx_model_dir", ""):