From b08f5edde367047983dd540e1510c439d39dd73e Mon Sep 17 00:00:00 2001
From: Fabian Smith <33810210+smithfabian@users.noreply.github.com>
Date: Fri, 24 Apr 2026 11:26:20 +0200
Subject: [PATCH 1/2] fix(tests): repair default test failures

---
 omlx/admin/routes.py                    |  2 ++
 omlx/cache/boundary_snapshot_store.py   | 14 +++++++++++---
 omlx/model_profiles.py                  |  1 +
 pyproject.toml                          |  2 ++
 tests/integration/test_e2e_streaming.py |  5 +++++
 tests/test_accuracy_benchmark.py        |  2 +-
 6 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/omlx/admin/routes.py b/omlx/admin/routes.py
index cf2318696..582778fc6 100644
--- a/omlx/admin/routes.py
+++ b/omlx/admin/routes.py
@@ -1412,6 +1412,7 @@ async def list_models(is_admin: bool = Depends(require_admin)):
                 "force_sampling": settings.force_sampling,
                 "max_tool_result_tokens": settings.max_tool_result_tokens,
                 "enable_thinking": settings.enable_thinking,
+                "preserve_thinking": settings.preserve_thinking,
                 "thinking_budget_enabled": settings.thinking_budget_enabled,
                 "thinking_budget_tokens": settings.thinking_budget_tokens,
                 "reasoning_parser": settings.reasoning_parser,
@@ -1421,6 +1422,7 @@ async def list_models(is_admin: bool = Depends(require_admin)):
                 "index_cache_freq": settings.index_cache_freq,
                 "turboquant_kv_enabled": settings.turboquant_kv_enabled,
                 "turboquant_kv_bits": settings.turboquant_kv_bits,
+                "turboquant_skip_last": settings.turboquant_skip_last,
                 "specprefill_enabled": settings.specprefill_enabled,
                 "specprefill_draft_model": settings.specprefill_draft_model,
                 "specprefill_keep_pct": settings.specprefill_keep_pct,
diff --git a/omlx/cache/boundary_snapshot_store.py b/omlx/cache/boundary_snapshot_store.py
index 8593696e9..c39b6282a 100644
--- a/omlx/cache/boundary_snapshot_store.py
+++ b/omlx/cache/boundary_snapshot_store.py
@@ -263,16 +263,22 @@ def cleanup_request(self, request_id: str) -> None:
 
     def cleanup_all(self) -> None:
         """Delete all snapshot files (for reset/startup)."""
-        # Drain write queue so the writer thread doesn't process stale
-        # items after the directory is deleted.
+        # Drain queued writes, then join() so any item the writer already
+        # dequeued finishes first; otherwise an in-flight write can recreate
+        # a request directory after cleanup. The join is skipped on a sentinel.
+        saw_sentinel = False
         while True:
             try:
                 item = self._write_queue.get_nowait()
+                self._write_queue.task_done()
                 if item is None:  # Sentinel — put it back for shutdown.
                     self._write_queue.put(item)
+                    saw_sentinel = True
                     break
             except queue.Empty:
                 break
+        if not saw_sentinel:
+            self._write_queue.join()
 
         with self._pending_lock:
             self._pending_writes.clear()
@@ -320,6 +326,7 @@ def _writer_loop(self) -> None:
                 continue
 
             if item is None:  # Sentinel
+                self._write_queue.task_done()
                 break
 
             pw_key, tensors_raw, metadata, file_path = item
@@ -335,6 +342,7 @@ def _writer_loop(self) -> None:
                 except Exception:
                     pass
                 self._dec_cancelled(pw_key[0])
+                self._write_queue.task_done()
                 continue
 
             temp_path = None
@@ -392,7 +400,7 @@ def _writer_loop(self) -> None:
                     # If file was written successfully, remove entirely.
                     if file_path.exists():
                         self._pending_writes.pop(pw_key, None)
-
+                self._write_queue.task_done()
 
     def _serialize_extracted(
         self,
diff --git a/omlx/model_profiles.py b/omlx/model_profiles.py
index 00d39aef8..637cefb76 100644
--- a/omlx/model_profiles.py
+++ b/omlx/model_profiles.py
@@ -28,6 +28,7 @@
     "presence_penalty",
     "force_sampling",
     "enable_thinking",
+    "preserve_thinking",
     "thinking_budget_enabled",
     "thinking_budget_tokens",
     "reasoning_parser",
diff --git a/pyproject.toml b/pyproject.toml
index af3a1cac4..d6421dc2a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -89,6 +89,7 @@ audio = [
 dev = [
     "pytest>=7.0.0",
     "pytest-asyncio>=0.21.0",
+    "python-multipart>=0.0.5",
     "black>=23.0.0",
     "ruff>=0.1.0",
     "mypy>=1.0.0",
@@ -101,6 +102,7 @@ dev = [
 dev = [
     "pytest>=7.0.0",
     "pytest-asyncio>=0.21.0",
+    "python-multipart>=0.0.5",
     "black>=23.0.0",
     "ruff>=0.1.0",
     "mypy>=1.0.0",
diff --git a/tests/integration/test_e2e_streaming.py b/tests/integration/test_e2e_streaming.py
index 73c6d253e..e4518a7e8 100644
--- a/tests/integration/test_e2e_streaming.py
+++ b/tests/integration/test_e2e_streaming.py
@@ -174,6 +174,11 @@ def get_model_ids(self) -> List[str]:
     def get_status(self) -> Dict[str, Any]:
         return {"models": self._models}
 
+    def get_entry(self, model_id: str):
+        if model_id in self.get_model_ids():
+            return MagicMock(config_model_type="")
+        return None
+
     async def get_engine(self, model_id: str):
         return self._engine
 
diff --git a/tests/test_accuracy_benchmark.py b/tests/test_accuracy_benchmark.py
index 95f24da5d..8159b0121 100644
--- a/tests/test_accuracy_benchmark.py
+++ b/tests/test_accuracy_benchmark.py
@@ -58,7 +58,7 @@ def test_all_valid_benchmarks(self):
             model_id="test-model",
             benchmarks={b: 100 for b in VALID_BENCHMARKS},
         )
-        assert len(req.benchmarks) == 12
+        assert len(req.benchmarks) == len(VALID_BENCHMARKS)
 
     def test_enable_thinking_default_false(self):
         req = AccuracyBenchmarkRequest(

From 15c69256fc7c6361c53b41c44105cfc69562893f Mon Sep 17 00:00:00 2001
From: Fabian Smith <33810210+smithfabian@users.noreply.github.com>
Date: Fri, 24 Apr 2026 12:50:36 +0200
Subject: [PATCH 2/2] test(streaming): use explicit engine entry mock

---
 tests/integration/test_e2e_streaming.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/integration/test_e2e_streaming.py b/tests/integration/test_e2e_streaming.py
index e4518a7e8..c66dcfa41 100644
--- a/tests/integration/test_e2e_streaming.py
+++ b/tests/integration/test_e2e_streaming.py
@@ -9,6 +9,7 @@
 import json
 import pytest
 from dataclasses import dataclass, field
+from types import SimpleNamespace
 from typing import Any, AsyncIterator, Dict, List, Optional
 from unittest.mock import AsyncMock, MagicMock, patch
 
@@ -176,7 +177,10 @@ def get_status(self) -> Dict[str, Any]:
 
     def get_entry(self, model_id: str):
         if model_id in self.get_model_ids():
-            return MagicMock(config_model_type="")
+            return SimpleNamespace(
+                config_model_type="",
+                preserve_thinking_default=None,
+            )
         return None
 
     async def get_engine(self, model_id: str):