Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -62,20 +62,23 @@ jobs:
# test_git_integration.py - live-server integration requiring model weights.
# Run: pytest tests/test_git_integration.py --model qwen3:8b
# tests/quant/test_int4_loader.py - requires squish_quant Rust extension (cargo/maturin).
# The following three files have `import mlx.core` at module level (inside try/except).
# The following files have module-level Metal/MLX imports (directly or via squish modules).
# On macOS CI runners the Metal GPU context is not ready at pytest collection time;
# the bare import triggers Metal init → SIGABRT (exit 134) killing the entire process.
# All three must be run manually on real Apple Silicon hardware with a warmed Metal context.
# All must be run manually on real Apple Silicon hardware with a warmed Metal context.
# tests/test_sqint2_linear.py - module-level mlx.core import → Metal SIGABRT
# tests/test_backend_unit.py - module-level mlx.core import → Metal SIGABRT
# tests/quant/test_int3_linear_unit.py - module-level mlx.core import → Metal SIGABRT
# tests/hardware/test_fused_kernels_unit.py - importorskip(mlx.core) → imports
# squish.hardware.fused_kernels which has module-level mlx.nn → Metal SIGABRT
run: |
pytest tests/ \
--ignore=tests/test_git_integration.py \
--ignore=tests/quant/test_int4_loader.py \
--ignore=tests/test_sqint2_linear.py \
--ignore=tests/test_backend_unit.py \
--ignore=tests/quant/test_int3_linear_unit.py \
--ignore=tests/hardware/test_fused_kernels_unit.py \
--timeout=60 \
-v --tb=short -q

Expand Down Expand Up @@ -116,14 +119,16 @@ jobs:
- name: Run full test suite (Rust extension available)
# test_int4_loader.py is now included — requires squish_quant built above.
# test_git_integration.py still excluded (needs live model weights).
# Metal SIGABRT exclusions (module-level mlx.core import → exit 134 at collection time):
# test_sqint2_linear.py, test_backend_unit.py, test_int3_linear_unit.py
# Metal SIGABRT exclusions (module-level mlx.core/mlx.nn import → exit 134 at collection time):
# test_sqint2_linear.py, test_backend_unit.py, test_int3_linear_unit.py,
# tests/hardware/test_fused_kernels_unit.py (importorskip → mlx.nn via fused_kernels)
run: |
.venv/bin/pytest tests/ \
--ignore=tests/test_git_integration.py \
--ignore=tests/test_sqint2_linear.py \
--ignore=tests/test_backend_unit.py \
--ignore=tests/quant/test_int3_linear_unit.py \
--ignore=tests/hardware/test_fused_kernels_unit.py \
--timeout=120 \
-v --tb=short -q

Expand Down
16 changes: 16 additions & 0 deletions demo/index.html
Original file line number Diff line number Diff line change
Expand Up @@ -870,6 +870,22 @@ <h1>Compression, made visible.</h1>
</main>

<script>
/* ── Architecture lookup table (mirrors demo/server.py _ARCH_TABLE) ─
Kept in sync with the Python table so the savings calculator can
project KV memory before the user submits the form to the backend.
test_wave108_calculator.py::TestArchTableJsParity enforces parity.

Each row carries the attention geometry (n_layers, n_kv_heads, head_dim)
used for the projection, plus a human-readable size label.
NOTE(review): `cap` looks like an upper bound on model size in billions
of parameters (the labels suggest so), with the final Infinity row
acting as a catch-all — confirm against the lookup code before relying
on it. Any edit here must be mirrored in the Python table. */
const ARCH_TABLE = [
{ cap: 0.6, n_layers: 24, n_kv_heads: 2, head_dim: 64, label: '0.5B-class' },
{ cap: 1.0, n_layers: 24, n_kv_heads: 2, head_dim: 128, label: '1B-class' },
{ cap: 2.0, n_layers: 28, n_kv_heads: 2, head_dim: 128, label: '1.5B–2B-class' },
{ cap: 4.0, n_layers: 32, n_kv_heads: 4, head_dim: 128, label: '3B-class' },
{ cap: 9.0, n_layers: 32, n_kv_heads: 8, head_dim: 128, label: '7B–8B-class' },
{ cap: 15.0, n_layers: 40, n_kv_heads: 8, head_dim: 128, label: '13B-class' },
{ cap: 35.0, n_layers: 48, n_kv_heads: 8, head_dim: 128, label: '30B-class' },
{ cap: 75.0, n_layers: 80, n_kv_heads: 8, head_dim: 128, label: '70B-class' },
{ cap: Infinity, n_layers: 96, n_kv_heads: 8, head_dim: 128, label: '100B+-class' },
];

/* ════════════════════════════════════════════════════════════════════
W109 dashboard logic
The visual is pure CSS — JS only updates a few CSS custom props
Expand Down
290 changes: 284 additions & 6 deletions squish/kv/kv_cache.py

Large diffs are not rendered by default.

26 changes: 26 additions & 0 deletions squish/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,31 @@ def _require(pkg: str, install: str | None = None) -> None:
print(f" {_C.PK}✗ Missing dependency:{_C.R} {_C.W}{pkg}{_C.R} {_C.DIM}→ pip install {hint}{_C.R}")
sys.exit(1)


# mlx-lm 0.31.0 was yanked (March 2026) for a batched KV cache cross-contamination
# bug — a correctness regression in server mode where different requests in a batch
# could overwrite each other's KV state. 0.31.1+ is safe. 0.31.2+ also adds
# native MTP speculative decoding for Qwen3.5/3.6.
#
# The installed version string is compared against this value by exact string
# equality, so only this precise release triggers the startup warning.
_MLX_LM_BAD_VERSION = "0.31.0"

def _check_mlx_lm_version() -> None:
    """Print an upgrade warning if the yanked mlx-lm release is installed.

    The check only runs on macOS (``sys.platform == "darwin"``).  It is
    strictly best-effort: if the package metadata cannot be read for any
    reason (package not installed, metadata missing, import failure) the
    function returns silently.  Nothing is raised and the process is not
    terminated; the warning is purely informational.
    """
    if sys.platform == "darwin":
        try:
            from importlib import metadata as _metadata

            installed = _metadata.version("mlx-lm")
        except Exception:
            # Not installed or metadata unavailable — nothing to warn about.
            return

        if installed != _MLX_LM_BAD_VERSION:
            return

        # Assemble the banner first, then emit it with a single print call.
        warning_parts = (
            f"\n {_C.PK}⚠ mlx-lm {installed} is UNSAFE and was yanked from PyPI.{_C.R}\n",
            f" {_C.DIM}Batched KV cache cross-contamination bug: different requests\n",
            f" can corrupt each other's KV state in server mode.{_C.R}\n",
            f" {_C.W}Upgrade immediately:{_C.R} {_C.DIM}pip install 'mlx-lm>=0.31.1'{_C.R}\n",
        )
        print("".join(warning_parts))


_require("fastapi")

from fastapi import FastAPI, HTTPException, Request, Security # noqa: E402
Expand Down Expand Up @@ -3993,6 +4018,7 @@ def main(): # pragma: no cover
"[paged-attention] could not initialise (%s) — disabled", _paged_err
)

_check_mlx_lm_version()
_print_banner()

if getattr(args, "mlx_model_dir", ""):
Expand Down
Loading
Loading