From 32d6ec78b46e57fa8eff5fd9b61470721ca3d86b Mon Sep 17 00:00:00 2001
From: Ryan <10638626+ryancee@users.noreply.github.com>
Date: Sat, 18 Apr 2026 12:49:55 -0400
Subject: [PATCH] fix(discovery): fall back to directory name for audio model
 type detection

Models whose config.json omits a top-level model_type field (e.g.
parakeet-tdt models, which use NeMo-format config files) were being
classified as 'llm' because detect_model_type() only inspects the
architectures[] and model_type fields from config.json.

This caused a KeyError('model_type') at inference time: the wrong
engine (BatchedEngine/LLM) was loaded for the model, and that engine's
LLM loading path called config['model_type'] without a .get() fallback.

Fix: after all config-based checks, extract the first hyphen-separated
segment of the lowercased model directory name and match it against the
same mlx-audio AUDIO_TTS/STT/STS_MODEL_TYPES sets used for config-based
detection.  Stems that appear in _LLM_TYPE_COLLISIONS are skipped to
avoid false positives (e.g. a directory named 'llama-...' should not be
detected as audio).

This matches how mlx_audio.utils.base_load_model already resolves the
model type from the path when the config is missing that field.
---
 omlx/model_discovery.py | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/omlx/model_discovery.py b/omlx/model_discovery.py
index ffa69e1f..48db97d7 100644
--- a/omlx/model_discovery.py
+++ b/omlx/model_discovery.py
@@ -490,6 +490,20 @@ def detect_model_type(model_path: Path) -> ModelType:
     if normalized_type.startswith("lfm") and normalized_type not in EMBEDDING_MODEL_TYPES:
         return "audio_sts"
 
+    # Directory-name fallback for audio models whose config.json omits model_type.
+    # Some model families (e.g. NeMo-format parakeet) do not include a top-level
+    # model_type field, so the checks above find nothing.  As a last resort, match
+    # the first hyphen-separated segment of the model directory name against the
+    # same mlx-audio model-type sets used above.
+    dir_stem = model_path.name.lower().split("-")[0]
+    if dir_stem and dir_stem not in _LLM_TYPE_COLLISIONS:
+        if dir_stem in AUDIO_TTS_MODEL_TYPES:
+            return "audio_tts"
+        if dir_stem in AUDIO_STT_MODEL_TYPES:
+            return "audio_stt"
+        if dir_stem in AUDIO_STS_MODEL_TYPES:
+            return "audio_sts"
+
     return "llm"