Hal0ai · thinmintdev · Jun 7, 2026 · Jun 7, 2026
diff --git a/ARCHITECTURE.md b/ARCHITECTURE.md
@@ -93,19 +93,39 @@ is no longer valid — primarily for FLM model-tag namespace drift.
   doesn't import HTTP client code, doesn't know about models other than
   via the registry, and doesn't make assumptions about backends beyond
   the provider ABC.
+- **Two orthogonal routing axes — do not conflate them.**
+  - `Dispatcher.dispatch()` is the **transport axis**: given a model id
+    (or path default), it resolves which upstream URL to forward to using
+    three ordered tiers — registry lookup → passthrough on warm caches →
+    cold-cache prefetch.  If all three miss, it raises `NoRouteFound`.
+    It does not know about capability types or slot selection policy.
+  - `SlotManager.route_for_request()` (via `omni_router`) is the
+    **capability axis**: given a capability label (chat / embed / asr /
+    tts / image) it selects *which slot* is the right destination, then
+    calls `dispatch()` to resolve the upstream for it.  It runs before
+    `dispatch()` and is what multi-modal "omni" requests use to fan out
+    across capability groups.  Do **not** remove or bypass it —
+    `omni_router` / `route_for_request` are verified load-bearing for
+    omni's tool selection and dispatch (ADR-0022).
+  The legacy fourth-tier path/name heuristics (`dispatcher/proxy.py`) that
+  blurred these axes were retired in #624, so there is no heuristic
+  fallback: image-gen and embed models must now have explicit registry
+  bindings so that `dispatch()` resolves them via Tier 1 (registry lookup).
 - **Dispatcher is HTTP-only.** It does not start/stop slots. It reads
-  slot status from the slot manager and routes requests. If a slot is
-  offline, it returns a structured error; restarting is a separate API
-  call.
-- **Providers are stateless.** Each provider (`LlamaServerProvider`,
-  `FLMProvider`, `MoonshineProvider`, `KokoroProvider`,
-  `ComfyUIProvider`) is a class with `build_env()`, `start_cmd()`,
-  `health()`, `infer()`. They don't hold connection state, don't manage
-  systemd, and don't share globals. One provider per backend type.
-  `FLMProvider` additionally probes `flm list -j` inside the toolbox
+  slot status from the slot manager (`SlotManager.state()`) and routes
+  requests. If a slot is offline, it returns a structured error;
+  restarting is a separate API call.
+- **Providers are stateless.** Each live provider (`LlamaServerProvider`,
+  `FLMProvider`, `ComfyUIProvider`) is a class with `build_env()`,
+  `start_cmd()`, `health()`, `infer()`. They don't hold connection state,
+  don't manage systemd, and don't share globals. One provider per backend
+  type. `FLMProvider` additionally probes `flm list -j` inside the toolbox
   image to advertise its own model-tag namespace
   (`share/flm/model_list.json`) — it does **not** run arbitrary GGUFs
-  from the registry.
+  from the registry.  `LemonadeProvider` is the primary slot lifecycle
+  driver (v0.2+); `LlamaServerProvider` and `FLMProvider` are retained for
+  Vulkan and NPU slots respectively.  `MoonshineProvider` and
+  `KokoroProvider` were retired in #620 — lemond serves STT/TTS natively.
 - **The registry is the only source of truth for "what models exist."**
   Atomic TOML files under `/var/lib/hal0/registry/`. mtime-cached. Slot
   configs reference model IDs from the registry; if a model is deleted,

diff --git a/src/hal0/dispatcher/proxy.py b/src/hal0/dispatcher/proxy.py
diff --git a/src/hal0/dispatcher/router.py b/src/hal0/dispatcher/router.py
@@ -17,9 +17,10 @@
      Tier 2), then re-check passthrough.  The prefetch fanout is wrapped
      in :class:`SingleFlightGroup` (Tier 3) so 100 concurrent identical
      prefetches share a single upstream call.
-  4. **legacy fallback** — :func:`hal0.dispatcher.proxy.resolve_slot`
-     path-and-name heuristics from haloai ``lib/proxy.py``.  Kept until
-     v0.2.
+
+If all three steps miss, :class:`NoRouteFound` is raised immediately.
+The legacy path/name heuristics (``dispatcher/proxy.py``) were retired
+in #624 — image-gen and embed models must have explicit registry bindings.
 
 Decision logging: every routing decision emits one structured log line
 to journald with ``SYSLOG_IDENTIFIER=hal0-dispatch`` (PLAN.md §5 Tier 2),
@@ -43,7 +44,6 @@
 import structlog
 from fastapi.responses import Response, StreamingResponse
 
-from hal0.dispatcher.proxy import LegacyResolutionFailed, resolve_slot
 from hal0.dispatcher.single_flight import SingleFlightGroup
 from hal0.errors import Hal0Error
 from hal0.upstreams.registry import Upstream, UpstreamRegistry
@@ -544,52 +544,13 @@ async def dispatch(
                     self._log_decision(call, t0, cache_state="prefetched")
                     return call
 
-        # ── Step 4: legacy heuristics ────────────────────────────────────
-        try:  # TIER1 — narrow exception handling; log + re-raise typed errors
-            slot_upstream = resolve_slot(path, body, self._upstreams)
-        except LegacyResolutionFailed as exc:
-            # Bubble the typed error up after logging the decision point.
-            log.warning(
-                "legacy fallback exhausted",
-                model=model_id,
-                path=path,
-                error=exc.message,
-            )
-            raise NoRouteFound(
-                f"model {model_id!r} not found in registry, no upstream advertised it, "
-                f"and legacy slot resolution failed",
-                details={"model": model_id, "path": path, "legacy_error": exc.message},
-            ) from exc
-        except Hal0Error:
-            # Typed errors are caller-meaningful: re-raise unchanged.
-            raise
-        except Exception as exc:  # TIER1 — was: silent swallow at haloai dispatcher.py:291
-            log.warning(
-                "legacy fallback raised unexpectedly",
-                model=model_id,
-                path=path,
-                error=str(exc),
-                error_type=type(exc).__name__,
-            )
-            raise NoRouteFound(
-                f"model {model_id!r}: legacy slot resolution raised {type(exc).__name__}",
-                details={"model": model_id, "path": path, "error": str(exc)},
-            ) from exc
-
-        call = UpstreamCall(
-            upstream_name=slot_upstream.name,
-            target_url=_join_url(slot_upstream.url, path),
-            headers=self._build_headers(request, slot_upstream),
-            body=_remap_model(body, original_model),
-            streaming=streaming,
-            method=method,
-            resolved_model=original_model or model_id,
-            requested_model=original_model,
-            resolution_path=f"legacy_slot:{slot_upstream.name}",
-            slot_name=_slot_name_of(slot_upstream),
+        # Steps 1-3 exhausted — no route found.  The legacy path/name heuristics
+        # (proxy.resolve_slot) were retired in #624; image-gen and embed models
+        # must have explicit registry bindings.
+        raise NoRouteFound(
+            f"model {model_id!r} not found in registry and no upstream advertised it",
+            details={"model": model_id, "path": path},
         )
-        self._log_decision(call, t0, cache_state="legacy")
-        return call
 
     async def forward(self, call: UpstreamCall) -> Response:
         """Execute the HTTP forward and return a FastAPI Response.
@@ -666,7 +627,7 @@ async def _ensure_slot_loaded_backend_aware(self, call: UpstreamCall) -> None:
 
         assert self._slot_manager is not None  # narrowed by forward()
         slot_name = call.slot_name
-        current = self._slot_manager._current_state(slot_name)
+        current = self._slot_manager.state(slot_name)
         if current in (SlotState.READY, SlotState.SERVING, SlotState.IDLE):
             # Model is already loaded under whatever backend it loaded with;
             # nothing to do. (A declared≠actual drift is surfaced by the
@@ -701,7 +662,7 @@ def _check_slot_ready_for_dispatch(self, call: UpstreamCall) -> None:
         from hal0.slots.state import SlotState
 
         assert self._slot_manager is not None  # narrowed by caller
-        current = self._slot_manager._current_state(call.slot_name)
+        current = self._slot_manager.state(call.slot_name)
         if current in (SlotState.READY, SlotState.SERVING, SlotState.IDLE):
             return
 

diff --git a/src/hal0/slots/manager.py b/src/hal0/slots/manager.py
@@ -304,6 +304,18 @@ def _ensure_known(self, name: str) -> None:
 
     # ── state machine ────────────────────────────────────────────────────────
 
+    def state(self, name: str) -> SlotState:
+        """Return the current :class:`SlotState` for *name* (public API).
+
+        Delegates to ``_current_state``: in-memory cache first, then the
+        on-disk ``state.json`` if the slot has not been seen yet in this
+        process; :attr:`SlotState.OFFLINE` when no state record exists.
+
+        External callers (e.g. the Dispatcher) use this method instead of
+        reaching into the private ``_current_state``.
+        """
+        return self._current_state(name)
+
     def _current_state(self, name: str) -> SlotState:
         rec = self._states.get(name)
         if rec is None:

diff --git a/tests/api/test_v1_chat_slot_alias.py b/tests/api/test_v1_chat_slot_alias.py
@@ -167,39 +167,6 @@ async def test_rewrite_is_noop_without_slot_manager() -> None:
     assert body["model"] == "primary"
 
 
-# ── dispatcher / proxy non-regression ───────────────────────────────────────
-
-
-def test_resolve_slot_primary_still_falls_through_to_lemonade() -> None:
-    """``resolve_slot`` keeps the ``m != "primary"`` carve-out: a chat
-    request that reaches the legacy fallback selects ``primary`` and
-    (absent a real primary slot upstream) raises the typed legacy error,
-    which the dispatcher converts to NoRouteFound → lemonade fall-through.
-    No per-slot chat upstream is matched."""
-    from hal0.dispatcher.proxy import LegacyResolutionFailed, resolve_slot
-    from hal0.upstreams.registry import Upstream, UpstreamRegistry
-
-    reg = UpstreamRegistry()
-    # Only the composite hal0 upstream exists (no per-slot chat upstreams).
-    reg.upsert(
-        Upstream(
-            name="hal0",
-            kind="slot",
-            url="http://127.0.0.1:8080/v1",
-            slot_name=None,
-            auth_style="none",
-        )
-    )
-    with pytest.raises(LegacyResolutionFailed):
-        # model id form — not an alias, not a registered slot name → falls
-        # to the "primary" default which has no slot upstream → legacy error.
-        resolve_slot(
-            "/v1/chat/completions",
-            {"model": "hermes-4-14b-q5km", "messages": []},
-            reg,
-        )
-
-
 def _patch_alias(monkeypatch: pytest.MonkeyPatch) -> None:
     async def _fake(_sm: Any) -> dict[str, str]:
         return {

diff --git a/tests/dispatcher/test_composite_dispatch.py b/tests/dispatcher/test_composite_dispatch.py
@@ -70,6 +70,9 @@ class _OfflineSlotManager:
     def __init__(self) -> None:
         self.serving_calls: list[str] = []
 
+    def state(self, _slot_name: str) -> SlotState:
+        return SlotState.OFFLINE
+
     def _current_state(self, _slot_name: str) -> SlotState:
         return SlotState.OFFLINE