From dfa9c0f37506913fc1c41607399a08def3868153 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Mon, 22 Jun 2026 22:30:47 +0000
Subject: [PATCH] Add full Qwen 3 family GGUF support across core and SDKs

Extends Qwen 3 coverage from the four partially-wired sizes (1.7B/4B/8B/14B)
to the complete on-device, open-weight Qwen 3 GGUF lineup, and wires every
variant through all binding layers.

Models added:
- Dense: Qwen 3 0.6B and 32B (rounding out 0.6B/1.7B/4B/8B/14B/32B)
- Latest "2507" updates: Qwen 3 4B Instruct 2507 (non-thinking),
  4B Thinking 2507, and the 30B-A3B Instruct 2507 mixture-of-experts
  (loads via the qwen3moe GGUF architecture)

Wiring completed for all nine Qwen 3 variants:
- models.rs: repo/file/tokenizer constants, SUPPORTED_MODELS,
  SUPPORTED_MODEL_INFO, and tok_model_id_for_repo entries
- engine.rs: GgufModelConfig constructors, now Android-tokenizer-aware
  (the previous 1.7B/4B/8B/14B constructors passed tok_model_id: None even
  on Android, which breaks the candle GGUF backend)
- ffi.rs: #[uniffi::export] config functions (reaches Swift/Kotlin/Python)
- Kotlin OndeModels convenience wrappers, Dart FRB bridge, and the
  React Native C-ABI exports

Notes:
- The 3.5/3.6/3.7 point releases are excluded: they are either huge MoE
  (235B-A22B / 397B-A17B) or closed-weights, API-only checkpoints, none of
  which are loadable on-device through this GGUF SDK.
- expected_size_bytes for the new variants are approximate; the HuggingFace
  API was unreachable from the build environment, so exact siblings[].size
  values should be backfilled when network access is available.
- Added consistency tests: every supported model has display metadata, and
  every Qwen 3 repo resolves to an Android tokenizer.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01DkBxc4UETJyXpFUDnBSn3E
---
 .agents/AGENTS.md                             |  22 +-
 sdk/dart/rust/src/api.rs                      |  56 +++++
 .../com/ondeinference/onde/Convenience.kt     |  20 ++
 sdk/react-native/rust/src/lib.rs              |  56 +++++
 src/inference/engine.rs                       | 185 ++++++++++++++--
 src/inference/ffi.rs                          |  60 ++++++
 src/inference/models.rs                       | 204 ++++++++++++++++--
 7 files changed, 562 insertions(+), 41 deletions(-)

diff --git a/.agents/AGENTS.md b/.agents/AGENTS.md
index 7a75ba5..33eaf2a 100644
--- a/.agents/AGENTS.md
+++ b/.agents/AGENTS.md
@@ -147,10 +147,15 @@ All model constants live in `src/inference/models.rs`. When adding a new model:
 | Qwen 2.5 Coder 1.5B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF` | `Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf` | ~941 MB | All platforms (mobile default) |
 | Qwen 2.5 Coder 3B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-3B-Instruct-GGUF` | `Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf` | ~1.93 GB | All platforms (desktop default) |
 | Qwen 2.5 Coder 7B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-7B-Instruct-GGUF` | `Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf` | ~4.4 GB | Higher-memory devices |
-| Qwen 3 1.7B (GGUF Q4_K_M) | `bartowski/Qwen3-1.7B-GGUF` | `Qwen3-1.7B-Q4_K_M.gguf` | ~1.3 GB | All platforms |
-| Qwen 3 4B (GGUF Q4_K_M) | `bartowski/Qwen3-4B-GGUF` | `Qwen3-4B-Q4_K_M.gguf` | ~2.7 GB | All platforms |
-| Qwen 3 8B (GGUF Q4_K_M) | `bartowski/Qwen3-8B-GGUF` | `Qwen3-8B-Q4_K_M.gguf` | ~5 GB | Higher-memory devices |
-| Qwen 3 14B (GGUF Q4_K_M) | `bartowski/Qwen3-14B-GGUF` | `Qwen3-14B-Q4_K_M.gguf` | ~8.4 GB | Higher-memory devices |
+| Qwen 3 0.6B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-0.6B-GGUF` | `Qwen_Qwen3-0.6B-Q4_K_M.gguf` | ~0.5 GB | All platforms |
+| Qwen 3 1.7B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-1.7B-GGUF` | `Qwen_Qwen3-1.7B-Q4_K_M.gguf` | ~1.3 GB | All platforms |
+| Qwen 3 4B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-GGUF` | `Qwen_Qwen3-4B-Q4_K_M.gguf` | ~2.7 GB | All platforms |
+| Qwen 3 8B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-8B-GGUF` | `Qwen_Qwen3-8B-Q4_K_M.gguf` | ~5 GB | Higher-memory devices |
+| Qwen 3 14B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-14B-GGUF` | `Qwen_Qwen3-14B-Q4_K_M.gguf` | ~8.4 GB | Higher-memory devices |
+| Qwen 3 32B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-32B-GGUF` | `Qwen_Qwen3-32B-Q4_K_M.gguf` | ~19.8 GB | High-memory desktop (32+ GB) |
+| Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF` | `Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf` | ~2.5 GB | All platforms (latest non-thinking 4B) |
+| Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF` | `Qwen_Qwen3-4B-Thinking-2507-Q4_K_M.gguf` | ~2.5 GB | All platforms (latest reasoning 4B) |
+| Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M, MoE) | `bartowski/Qwen_Qwen3-30B-A3B-Instruct-2507-GGUF` | `Qwen_Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf` | ~18.6 GB | High-memory desktop (32+ GB), `qwen3moe` |
 | DeepSeek Coder 6.7B Instruct (GGUF Q4_K_M) | `bartowski/deepseek-coder-6.7b-instruct-GGUF` | `deepseek-coder-6.7b-instruct-Q4_K_M.gguf` | ~3.8 GB | Higher-memory devices, custom chat template |
 | Qwen 2.5 Coder 7B Instruct (ISQ) | `Qwen/Qwen2.5-Coder-7B-Instruct` | safetensors (ISQ in-situ) | ~8 GB | macOS (ISQ pipeline) |
 
@@ -374,6 +379,15 @@ await engine.unloadModel()
 | `defaultModelConfig()` | `GgufModelConfig` | Platform-aware Coder default (1.5B on iOS/tvOS/Android, 3B on desktop) |
 | `qwen251_5bConfig()` | `GgufModelConfig` | Forces Qwen 2.5 1.5B regardless of platform |
 | `qwen253bConfig()` | `GgufModelConfig` | Forces Qwen 2.5 3B regardless of platform |
+| `qwen306bConfig()` | `GgufModelConfig` | Qwen 3 0.6B (~0.5 GB) |
+| `qwen317bConfig()` | `GgufModelConfig` | Qwen 3 1.7B (~1.3 GB) |
+| `qwen34bConfig()` | `GgufModelConfig` | Qwen 3 4B (~2.7 GB) |
+| `qwen38bConfig()` | `GgufModelConfig` | Qwen 3 8B (~5 GB) |
+| `qwen314bConfig()` | `GgufModelConfig` | Qwen 3 14B (~8.4 GB) |
+| `qwen332bConfig()` | `GgufModelConfig` | Qwen 3 32B (~19.8 GB) |
+| `qwen34bInstruct2507Config()` | `GgufModelConfig` | Qwen 3 4B Instruct 2507 — latest non-thinking 4B (~2.5 GB) |
+| `qwen34bThinking2507Config()` | `GgufModelConfig` | Qwen 3 4B Thinking 2507 — latest reasoning 4B (~2.5 GB) |
+| `qwen330bA3bInstruct2507Config()` | `GgufModelConfig` | Qwen 3 30B-A3B Instruct 2507 MoE (~18.6 GB) |
 | `defaultSamplingConfig()` | `SamplingConfig` | temp=0.7, top_p=0.95, max_tokens=512 |
 | `deterministicSamplingConfig()` | `SamplingConfig` | temp=0.0, greedy |
 | `mobileSamplingConfig()` | `SamplingConfig` | temp=0.7, max_tokens=128 |
diff --git a/sdk/dart/rust/src/api.rs b/sdk/dart/rust/src/api.rs
index 69ee56f..aafae99 100644
--- a/sdk/dart/rust/src/api.rs
+++ b/sdk/dart/rust/src/api.rs
@@ -542,6 +542,62 @@ pub fn qwen25_coder_3b_config() -> GgufModelConfig {
     OndeGgufModelConfig::qwen25_coder_3b().into()
 }
 
+// ── Qwen 3 family ────────────────────────────────────────────────────────────
+
+/// `GgufModelConfig` for Qwen 3 0.6B Q4_K_M (~0.5 GB).
+#[frb(sync)]
+pub fn qwen3_0_6b_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_0_6b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 1.7B Q4_K_M (~1.3 GB).
+#[frb(sync)]
+pub fn qwen3_1_7b_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_1_7b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 4B Q4_K_M (~2.7 GB).
+#[frb(sync)]
+pub fn qwen3_4b_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_4b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 8B Q4_K_M (~5 GB).
+#[frb(sync)]
+pub fn qwen3_8b_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_8b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 14B Q4_K_M (~8.4 GB).
+#[frb(sync)]
+pub fn qwen3_14b_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_14b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 32B Q4_K_M (~19.8 GB).
+#[frb(sync)]
+pub fn qwen3_32b_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_32b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 4B Instruct 2507 Q4_K_M (~2.5 GB).
+#[frb(sync)]
+pub fn qwen3_4b_instruct_2507_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_4b_instruct_2507().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 4B Thinking 2507 Q4_K_M (~2.5 GB).
+#[frb(sync)]
+pub fn qwen3_4b_thinking_2507_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_4b_thinking_2507().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 30B-A3B Instruct 2507 (MoE) Q4_K_M (~18.6 GB).
+#[frb(sync)]
+pub fn qwen3_30b_a3b_instruct_2507_config() -> GgufModelConfig {
+    OndeGgufModelConfig::qwen3_30b_a3b_instruct_2507().into()
+}
+
 /// Default sampling config: `temperature=0.7`, `top_p=0.95`, `max_tokens=512`.
 #[frb(sync)]
 pub fn default_sampling_config() -> SamplingConfig {
diff --git a/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt b/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt
index d99a80e..23179f8 100644
--- a/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt
+++ b/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt
@@ -25,6 +25,26 @@ object OndeModels {
     fun qwen25_1_5b(): GgufModelConfig = uniffi.onde.qwen2515bConfig()
     /** Qwen 2.5 3B Instruct GGUF Q4_K_M (~1.93 GB). */
     fun qwen25_3b(): GgufModelConfig   = uniffi.onde.qwen253bConfig()
+
+    // ── Qwen 3 family ────────────────────────────────────────────────────────
+    /** Qwen 3 0.6B GGUF Q4_K_M (~0.5 GB) — smallest Qwen 3 variant. */
+    fun qwen3_0_6b(): GgufModelConfig = uniffi.onde.qwen306bConfig()
+    /** Qwen 3 1.7B GGUF Q4_K_M (~1.3 GB). */
+    fun qwen3_1_7b(): GgufModelConfig = uniffi.onde.qwen317bConfig()
+    /** Qwen 3 4B GGUF Q4_K_M (~2.7 GB). */
+    fun qwen3_4b(): GgufModelConfig   = uniffi.onde.qwen34bConfig()
+    /** Qwen 3 8B GGUF Q4_K_M (~5 GB). */
+    fun qwen3_8b(): GgufModelConfig   = uniffi.onde.qwen38bConfig()
+    /** Qwen 3 14B GGUF Q4_K_M (~8.4 GB). */
+    fun qwen3_14b(): GgufModelConfig  = uniffi.onde.qwen314bConfig()
+    /** Qwen 3 32B GGUF Q4_K_M (~19.8 GB) — largest dense Qwen 3. */
+    fun qwen3_32b(): GgufModelConfig  = uniffi.onde.qwen332bConfig()
+    /** Qwen 3 4B Instruct 2507 GGUF Q4_K_M (~2.5 GB) — latest non-thinking 4B. */
+    fun qwen3_4b_instruct_2507(): GgufModelConfig = uniffi.onde.qwen34bInstruct2507Config()
+    /** Qwen 3 4B Thinking 2507 GGUF Q4_K_M (~2.5 GB) — latest reasoning 4B. */
+    fun qwen3_4b_thinking_2507(): GgufModelConfig = uniffi.onde.qwen34bThinking2507Config()
+    /** Qwen 3 30B-A3B Instruct 2507 GGUF Q4_K_M (~18.6 GB) — flagship MoE. */
+    fun qwen3_30b_a3b_instruct_2507(): GgufModelConfig = uniffi.onde.qwen330bA3bInstruct2507Config()
 }
 
 /**
diff --git a/sdk/react-native/rust/src/lib.rs b/sdk/react-native/rust/src/lib.rs
index 7eff78b..e6c0fa2 100644
--- a/sdk/react-native/rust/src/lib.rs
+++ b/sdk/react-native/rust/src/lib.rs
@@ -470,6 +470,62 @@ pub extern "C" fn onde_qwen25_3b_config() -> *mut c_char {
     to_json_cstring(&GgufModelConfig::qwen25_3b())
 }
 
+// ── Qwen 3 family ────────────────────────────────────────────────────────────
+
+/// Return the Qwen 3 0.6B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_0_6b_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_0_6b())
+}
+
+/// Return the Qwen 3 1.7B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_1_7b_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_1_7b())
+}
+
+/// Return the Qwen 3 4B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_4b_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_4b())
+}
+
+/// Return the Qwen 3 8B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_8b_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_8b())
+}
+
+/// Return the Qwen 3 14B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_14b_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_14b())
+}
+
+/// Return the Qwen 3 32B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_32b_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_32b())
+}
+
+/// Return the Qwen 3 4B Instruct 2507 GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_4b_instruct_2507_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_4b_instruct_2507())
+}
+
+/// Return the Qwen 3 4B Thinking 2507 GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_4b_thinking_2507_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_4b_thinking_2507())
+}
+
+/// Return the Qwen 3 30B-A3B Instruct 2507 (MoE) GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_30b_a3b_instruct_2507_config() -> *mut c_char {
+    to_json_cstring(&GgufModelConfig::qwen3_30b_a3b_instruct_2507())
+}
+
 // ── Sampling presets ─────────────────────────────────────────────────────────
 
 /// Return the default sampling config as JSON.
diff --git a/src/inference/engine.rs b/src/inference/engine.rs
index 4380816..5b146d9 100644
--- a/src/inference/engine.rs
+++ b/src/inference/engine.rs
@@ -2039,7 +2039,43 @@ impl GgufModelConfig {
         }
     }
 
-    /// Qwen 3 4B Instruct (GGUF Q4_K_M) — ~2.7 GB.
+    /// Qwen 3 0.6B (GGUF Q4_K_M) — smallest Qwen 3 variant, ~0.5 GB.
+    ///
+    /// Lightest tool-capable Qwen 3 model; fits tvOS and the most
+    /// memory-constrained mobile devices. Extended thinking mode — load with
+    /// `max_tokens ≥ 4096`.
+    pub fn qwen3_0_6b() -> Self {
+        Self {
+            model_id: super::models::BARTOWSKI_QWEN3_0_6B_GGUF.into(),
+            files: vec![super::models::QWEN3_0_6B_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_0_6B_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 0.6B (Q4_K_M)".into(),
+            approx_memory: "~0.5 GB".into(),
+            chat_template: None,
+        }
+    }
+
+    /// Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling, ~1.3 GB.
+    pub fn qwen3_1_7b() -> Self {
+        Self {
+            model_id: super::models::BARTOWSKI_QWEN3_1_7B_GGUF.into(),
+            files: vec![super::models::QWEN3_1_7B_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_1_7B_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 1.7B (Q4_K_M)".into(),
+            approx_memory: "~1.3 GB".into(),
+            chat_template: None,
+        }
+    }
+
+    /// Qwen 3 4B (GGUF Q4_K_M) — ~2.7 GB.
     ///
     /// Full OpenAI-compatible tool calling with extended thinking mode.
     /// Always load with `max_tokens ≥ 4096`; the `<think>…</think>` block can
@@ -2048,49 +2084,127 @@ impl GgufModelConfig {
         Self {
             model_id: super::models::BARTOWSKI_QWEN3_4B_GGUF.into(),
             files: vec![super::models::QWEN3_4B_GGUF_FILE.into()],
-            tok_model_id: None,
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_4B_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
             display_name: "Qwen 3 4B (Q4_K_M)".into(),
             approx_memory: "~2.7 GB".into(),
             chat_template: None,
         }
     }
 
-    /// Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling, ~1.3 GB.
-    pub fn qwen3_1_7b() -> Self {
+    /// Qwen 3 8B (GGUF Q4_K_M) — strong tool-calling model with extended thinking,
+    /// ~5 GB. Best balance of quality and memory for macOS with 24+ GB RAM.
+    pub fn qwen3_8b() -> Self {
         Self {
-            model_id: super::models::BARTOWSKI_QWEN3_1_7B_GGUF.into(),
-            files: vec![super::models::QWEN3_1_7B_GGUF_FILE.into()],
-            tok_model_id: None,
-            display_name: "Qwen 3 1.7B (Q4_K_M)".into(),
-            approx_memory: "~1.3 GB".into(),
+            model_id: super::models::BARTOWSKI_QWEN3_8B_GGUF.into(),
+            files: vec![super::models::QWEN3_8B_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_8B_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 8B (Q4_K_M)".into(),
+            approx_memory: "~5 GB".into(),
             chat_template: None,
         }
     }
 
-    /// Qwen 3 14B Instruct (GGUF Q4_K_M) — ~8.4 GB.
+    /// Qwen 3 14B (GGUF Q4_K_M) — ~8.4 GB.
     ///
-    /// Strongest reasoning and tool-calling model with extended thinking.
+    /// Strong reasoning and tool-calling model with extended thinking.
     /// Best all-around model for macOS with 16+ GB RAM.
     pub fn qwen3_14b() -> Self {
         Self {
             model_id: super::models::BARTOWSKI_QWEN3_14B_GGUF.into(),
             files: vec![super::models::QWEN3_14B_GGUF_FILE.into()],
-            tok_model_id: None,
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_14B_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
             display_name: "Qwen 3 14B (Q4_K_M)".into(),
             approx_memory: "~8.4 GB".into(),
             chat_template: None,
         }
     }
 
-    /// Strong tool-calling model with extended thinking. Best balance of
-    /// quality and memory for macOS with 24+ GB RAM.
-    pub fn qwen3_8b() -> Self {
+    /// Qwen 3 32B (GGUF Q4_K_M) — largest dense Qwen 3, ~19.8 GB.
+    ///
+    /// Highest-quality dense Qwen 3 variant. Requires a high-memory desktop
+    /// (32+ GB RAM / unified memory). Extended thinking and full tool calling.
+    pub fn qwen3_32b() -> Self {
         Self {
-            model_id: super::models::BARTOWSKI_QWEN3_8B_GGUF.into(),
-            files: vec![super::models::QWEN3_8B_GGUF_FILE.into()],
-            tok_model_id: None,
-            display_name: "Qwen 3 8B (Q4_K_M)".into(),
-            approx_memory: "~5 GB".into(),
+            model_id: super::models::BARTOWSKI_QWEN3_32B_GGUF.into(),
+            files: vec![super::models::QWEN3_32B_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_32B_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 32B (Q4_K_M)".into(),
+            approx_memory: "~19.8 GB".into(),
+            chat_template: None,
+        }
+    }
+
+    /// Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) — latest non-thinking 4B, ~2.5 GB.
+    ///
+    /// Updated instruction-tuned checkpoint that does *not* emit a `<think>`
+    /// block, making it faster and more predictable for chat and tool use.
+    /// Recommended general-purpose Qwen 3 model.
+    pub fn qwen3_4b_instruct_2507() -> Self {
+        Self {
+            model_id: super::models::BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF.into(),
+            files: vec![super::models::QWEN3_4B_INSTRUCT_2507_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 4B Instruct 2507 (Q4_K_M)".into(),
+            approx_memory: "~2.5 GB".into(),
+            chat_template: None,
+        }
+    }
+
+    /// Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) — latest reasoning 4B, ~2.5 GB.
+    ///
+    /// Updated thinking-only checkpoint with stronger reasoning and tool-use
+    /// accuracy. Always emits a `<think>` block — load with `max_tokens ≥ 4096`.
+    pub fn qwen3_4b_thinking_2507() -> Self {
+        Self {
+            model_id: super::models::BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF.into(),
+            files: vec![super::models::QWEN3_4B_THINKING_2507_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_4B_THINKING_2507_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 4B Thinking 2507 (Q4_K_M)".into(),
+            approx_memory: "~2.5 GB".into(),
+            chat_template: None,
+        }
+    }
+
+    /// Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M) — flagship MoE, ~18.6 GB.
+    ///
+    /// Mixture-of-experts model: 30B total parameters, ~3B active per token, so
+    /// inference is far cheaper than a 30B dense model while quality rivals it.
+    /// Loads via the `qwen3moe` GGUF architecture. Requires 32+ GB RAM.
+    pub fn qwen3_30b_a3b_instruct_2507() -> Self {
+        Self {
+            model_id: super::models::BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF.into(),
+            files: vec![super::models::QWEN3_30B_A3B_INSTRUCT_2507_GGUF_FILE.into()],
+            tok_model_id: if cfg!(target_os = "android") {
+                Some(super::models::QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID.into())
+            } else {
+                None
+            },
+            display_name: "Qwen 3 30B-A3B Instruct 2507 (Q4_K_M)".into(),
+            approx_memory: "~18.6 GB".into(),
             chat_template: None,
         }
     }
@@ -2172,6 +2286,37 @@ mod tests {
         assert_eq!(cfg.files.len(), 1);
     }
 
+    #[test]
+    fn gguf_model_config_qwen3_variants() {
+        // Every Qwen 3 constructor must produce a valid, single-file config
+        // whose repo ID is registered in the supported-models list.
+        let configs = [
+            GgufModelConfig::qwen3_0_6b(),
+            GgufModelConfig::qwen3_1_7b(),
+            GgufModelConfig::qwen3_4b(),
+            GgufModelConfig::qwen3_8b(),
+            GgufModelConfig::qwen3_14b(),
+            GgufModelConfig::qwen3_32b(),
+            GgufModelConfig::qwen3_4b_instruct_2507(),
+            GgufModelConfig::qwen3_4b_thinking_2507(),
+            GgufModelConfig::qwen3_30b_a3b_instruct_2507(),
+        ];
+        for cfg in &configs {
+            assert!(
+                cfg.model_id.contains("Qwen3"),
+                "unexpected id: {}",
+                cfg.model_id
+            );
+            assert_eq!(cfg.files.len(), 1);
+            assert!(cfg.files[0].ends_with("-Q4_K_M.gguf"));
+            assert!(
+                super::super::models::SUPPORTED_MODELS.contains(&cfg.model_id.as_str()),
+                "{} missing from SUPPORTED_MODELS",
+                cfg.model_id
+            );
+        }
+    }
+
     #[test]
     fn gguf_model_config_platform_default() {
         let cfg = GgufModelConfig::platform_default();
diff --git a/src/inference/ffi.rs b/src/inference/ffi.rs
index 376aff0..12c5036 100644
--- a/src/inference/ffi.rs
+++ b/src/inference/ffi.rs
@@ -326,6 +326,66 @@ pub fn qwen25_3b_config() -> GgufModelConfig {
     GgufModelConfig::qwen25_3b()
 }
 
+// ── Qwen 3 family ────────────────────────────────────────────────────────────
+
+/// Return the Qwen 3 0.6B GGUF model configuration (~0.5 GB).
+#[uniffi::export]
+pub fn qwen3_0_6b_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_0_6b()
+}
+
+/// Return the Qwen 3 1.7B GGUF model configuration (~1.3 GB).
+#[uniffi::export]
+pub fn qwen3_1_7b_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_1_7b()
+}
+
+/// Return the Qwen 3 4B GGUF model configuration (~2.7 GB).
+#[uniffi::export]
+pub fn qwen3_4b_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_4b()
+}
+
+/// Return the Qwen 3 8B GGUF model configuration (~5 GB).
+#[uniffi::export]
+pub fn qwen3_8b_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_8b()
+}
+
+/// Return the Qwen 3 14B GGUF model configuration (~8.4 GB).
+#[uniffi::export]
+pub fn qwen3_14b_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_14b()
+}
+
+/// Return the Qwen 3 32B GGUF model configuration (~19.8 GB).
+#[uniffi::export]
+pub fn qwen3_32b_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_32b()
+}
+
+/// Return the Qwen 3 4B Instruct 2507 GGUF model configuration (~2.5 GB).
+///
+/// Latest non-thinking 4B checkpoint — recommended general-purpose Qwen 3 model.
+#[uniffi::export]
+pub fn qwen3_4b_instruct_2507_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_4b_instruct_2507()
+}
+
+/// Return the Qwen 3 4B Thinking 2507 GGUF model configuration (~2.5 GB).
+///
+/// Latest reasoning-focused 4B checkpoint; load with `max_tokens ≥ 4096`.
+#[uniffi::export]
+pub fn qwen3_4b_thinking_2507_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_4b_thinking_2507()
+}
+
+/// Return the Qwen 3 30B-A3B Instruct 2507 (MoE) GGUF model configuration (~18.6 GB).
+#[uniffi::export]
+pub fn qwen3_30b_a3b_instruct_2507_config() -> GgufModelConfig {
+    GgufModelConfig::qwen3_30b_a3b_instruct_2507()
+}
+
 /// Return default sampling parameters for creative chat.
 #[uniffi::export]
 pub fn default_sampling_config() -> SamplingConfig {
diff --git a/src/inference/models.rs b/src/inference/models.rs
index 31f83c6..b56e5bd 100644
--- a/src/inference/models.rs
+++ b/src/inference/models.rs
@@ -83,16 +83,47 @@ pub const QWEN25_3B_GGUF_FILE: &str = "Qwen2.5-3B-Instruct-Q4_K_M.gguf";
 /// Base model repo used for the HF tokenizer (tokenizer.json + tokenizer_config.json).
 pub const QWEN25_3B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-3B-Instruct";
 
-/// Pre-quantized Qwen 3 4B Instruct (GGUF Q4_K_M) — full OpenAI-compatible tool calling (~2.7 GB).
+// ── Qwen 3 family (GGUF Q4_K_M) ──────────────────────────────────────────────
+//
+// The Qwen 3 line uses the `qwen3` GGUF architecture (and `qwen3moe` for the
+// 30B-A3B mixture-of-experts variant), both supported by mistral.rs's quantized
+// loader. Except for the non-thinking `-Instruct-2507` checkpoints, Qwen 3 models
+// emit an extended thinking block (`<think>…</think>`); load those with
+// `max_tokens ≥ 4096` so thinking does not exhaust the budget before the reply.
+//
+// bartowski's repos embed the tokenizer and chat template inside the GGUF, so on
+// iOS/macOS no separate tokenizer download is needed. On Android the candle GGUF
+// backend cannot parse the embedded tokenizer, so each model also declares a
+// `TOK_MODEL_ID` pointing at the official Qwen repo for a standalone tokenizer.
+
+/// Pre-quantized Qwen 3 0.6B (GGUF Q4_K_M) — smallest Qwen 3 variant (~0.5 GB).
 ///
-/// Qwen 3 uses an extended thinking mode (`<think>…</think>`) that significantly improves
-/// reasoning and tool-use accuracy. Load with `max_tokens ≥ 4096` to avoid empty replies caused
-/// by the model exhausting its token budget on thinking before producing a response.
+/// Lightest tool-capable Qwen 3 model. Suitable for tvOS and the most
+/// memory-constrained mobile devices where even the 1.7B is too large.
+pub const BARTOWSKI_QWEN3_0_6B_GGUF: &str = "bartowski/Qwen_Qwen3-0.6B-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 0.6B repo.
+pub const QWEN3_0_6B_GGUF_FILE: &str = "Qwen_Qwen3-0.6B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_0_6B_TOK_MODEL_ID: &str = "Qwen/Qwen3-0.6B";
+
+/// Pre-quantized Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling model (~1.3 GB).
+///
+/// Smallest Qwen 3 variant with comfortable tool calling. Suitable for mobile
+/// devices where the 4B model would be too large.
+pub const BARTOWSKI_QWEN3_1_7B_GGUF: &str = "bartowski/Qwen_Qwen3-1.7B-GGUF";
+/// The specific GGUF filename for the Qwen3 1.7B repo.
+pub const QWEN3_1_7B_GGUF_FILE: &str = "Qwen_Qwen3-1.7B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_1_7B_TOK_MODEL_ID: &str = "Qwen/Qwen3-1.7B";
+
+/// Pre-quantized Qwen 3 4B (GGUF Q4_K_M) — full OpenAI-compatible tool calling (~2.7 GB).
 ///
 /// Recommended model for siGit Code (coding agent with tool calling on macOS/Linux/Windows).
 pub const BARTOWSKI_QWEN3_4B_GGUF: &str = "bartowski/Qwen_Qwen3-4B-GGUF";
 /// The specific GGUF filename to download from the bartowski Qwen 3 4B repo.
 pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_4B_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B";
 
 /// Pre-quantized Qwen 3 8B (GGUF Q4_K_M) — strong tool-calling model (~5 GB).
 ///
@@ -100,22 +131,70 @@ pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf";
 /// Full tool calling and extended thinking mode support.
 pub const BARTOWSKI_QWEN3_8B_GGUF: &str = "bartowski/Qwen_Qwen3-8B-GGUF";
 pub const QWEN3_8B_GGUF_FILE: &str = "Qwen_Qwen3-8B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_8B_TOK_MODEL_ID: &str = "Qwen/Qwen3-8B";
 
 /// Pre-quantized Qwen 3 14B (GGUF Q4_K_M) — strong reasoning and tool-calling model (~8.4 GB).
 ///
-/// Qwen 3 uses extended thinking mode (`<think>…</think>`) for improved reasoning.
 /// Best all-around model for macOS with 16+ GB RAM. Full tool calling support.
 pub const BARTOWSKI_QWEN3_14B_GGUF: &str = "bartowski/Qwen_Qwen3-14B-GGUF";
 /// The specific GGUF filename.
 pub const QWEN3_14B_GGUF_FILE: &str = "Qwen_Qwen3-14B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_14B_TOK_MODEL_ID: &str = "Qwen/Qwen3-14B";
 
-/// Pre-quantized Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling model (~1.3 GB).
+/// Pre-quantized Qwen 3 32B (GGUF Q4_K_M) — largest dense Qwen 3 model (~19.8 GB).
 ///
-/// Smallest Qwen 3 variant with tool calling support. Suitable for mobile devices
-/// where the 4B model would be too large.
-pub const BARTOWSKI_QWEN3_1_7B_GGUF: &str = "bartowski/Qwen_Qwen3-1.7B-GGUF";
-/// The specific GGUF filename for the Qwen3 1.7B repo.
-pub const QWEN3_1_7B_GGUF_FILE: &str = "Qwen_Qwen3-1.7B-Q4_K_M.gguf";
+/// Highest-quality dense Qwen 3 variant. Requires a high-memory desktop
+/// (32+ GB RAM / unified memory). Full tool calling and extended thinking.
+pub const BARTOWSKI_QWEN3_32B_GGUF: &str = "bartowski/Qwen_Qwen3-32B-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 32B repo.
+pub const QWEN3_32B_GGUF_FILE: &str = "Qwen_Qwen3-32B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_32B_TOK_MODEL_ID: &str = "Qwen/Qwen3-32B";
+
+// ── Qwen 3 "2507" updated releases ───────────────────────────────────────────
+//
+// The July/August 2025 refresh split Qwen 3 into dedicated non-thinking
+// (`-Instruct-2507`) and thinking-only (`-Thinking-2507`) checkpoints with
+// markedly improved instruction following, tool use, and long-context quality.
+// These are the latest open Qwen 3 weights available as on-device GGUF.
+
+/// Pre-quantized Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) — latest non-thinking 4B (~2.5 GB).
+///
+/// Updated instruction-tuned checkpoint. Unlike the base 4B it does *not* emit a
+/// `<think>` block, so it is faster and more predictable for chat and tool use.
+pub const BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF: &str =
+    "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 4B Instruct 2507 repo.
+pub const QWEN3_4B_INSTRUCT_2507_GGUF_FILE: &str = "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B-Instruct-2507";
+
+/// Pre-quantized Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) — latest reasoning 4B (~2.5 GB).
+///
+/// Updated thinking-only checkpoint with stronger reasoning and tool-use accuracy.
+/// Always emits a `<think>` block — load with `max_tokens ≥ 4096`.
+pub const BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF: &str =
+    "bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 4B Thinking 2507 repo.
+pub const QWEN3_4B_THINKING_2507_GGUF_FILE: &str = "Qwen_Qwen3-4B-Thinking-2507-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_4B_THINKING_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B-Thinking-2507";
+
+/// Pre-quantized Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M) — flagship MoE (~18.6 GB).
+///
+/// Mixture-of-experts model: 30B total parameters but only ~3B active per token,
+/// so inference is far cheaper than a 30B dense model while quality rivals it.
+/// Loads via the `qwen3moe` GGUF architecture in mistral.rs. Requires a
+/// high-memory desktop (32+ GB RAM / unified memory).
+pub const BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF: &str =
+    "bartowski/Qwen_Qwen3-30B-A3B-Instruct-2507-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 30B-A3B Instruct 2507 repo.
+pub const QWEN3_30B_A3B_INSTRUCT_2507_GGUF_FILE: &str =
+    "Qwen_Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-30B-A3B-Instruct-2507";
 
 /// DeepSeek Coder v1 6.7B Instruct (GGUF Q4_K_M) — dedicated code generation model (~3.8 GB).
 ///
@@ -136,10 +215,15 @@ pub const SUPPORTED_MODELS: &[&str] = &[
     BARTOWSKI_QWEN25_0_5B_INSTRUCT_GGUF,
     BARTOWSKI_QWEN25_1_5B_INSTRUCT_GGUF,
     BARTOWSKI_QWEN25_3B_INSTRUCT_GGUF,
+    BARTOWSKI_QWEN3_0_6B_GGUF,
+    BARTOWSKI_QWEN3_1_7B_GGUF,
     BARTOWSKI_QWEN3_4B_GGUF,
     BARTOWSKI_QWEN3_8B_GGUF,
     BARTOWSKI_QWEN3_14B_GGUF,
-    BARTOWSKI_QWEN3_1_7B_GGUF,
+    BARTOWSKI_QWEN3_32B_GGUF,
+    BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF,
+    BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF,
+    BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF,
     BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF,
     THEBLOKE_DEEPSEEK_CODER_6_7B_INSTRUCT_GGUF,
 ];
@@ -197,6 +281,23 @@ pub const SUPPORTED_MODEL_INFO: &[SupportedModelInfo] = &[
         // Exact file size from HuggingFace API siblings[].size.
         expected_size_bytes: 1_929_903_264,
     },
+    SupportedModelInfo {
+        id: BARTOWSKI_QWEN3_0_6B_GGUF,
+        name: "Qwen 3 0.6B (GGUF)",
+        org: "Qwen / Alibaba",
+        description: "Smallest Qwen 3 variant with tool calling (~0.5 GB). \
+             Suitable for tvOS and the most memory-constrained mobile devices.",
+        // Approximate Q4_K_M size; HF API unreachable at authoring time.
+        expected_size_bytes: 483_000_000,
+    },
+    SupportedModelInfo {
+        id: BARTOWSKI_QWEN3_1_7B_GGUF,
+        name: "Qwen 3 1.7B (GGUF)",
+        org: "Qwen / Alibaba",
+        description: "Lightweight tool-calling model for mobile (~1.3 GB). \
+             Smallest Qwen 3 variant with comfortable tool calling support.",
+        expected_size_bytes: 1_282_439_584,
+    },
     SupportedModelInfo {
         id: BARTOWSKI_QWEN3_4B_GGUF,
         name: "Qwen 3 4B (GGUF)",
@@ -225,12 +326,40 @@ pub const SUPPORTED_MODEL_INFO: &[SupportedModelInfo] = &[
         expected_size_bytes: 9_001_753_632,
     },
     SupportedModelInfo {
-        id: BARTOWSKI_QWEN3_1_7B_GGUF,
-        name: "Qwen 3 1.7B (GGUF)",
+        id: BARTOWSKI_QWEN3_32B_GGUF,
+        name: "Qwen 3 32B (GGUF)",
         org: "Qwen / Alibaba",
-        description: "Lightweight tool-calling model for mobile (~1.3 GB). \
-             Smallest Qwen 3 variant with tool calling support.",
-        expected_size_bytes: 1_282_439_584,
+        description: "Largest dense Qwen 3 model with extended thinking (~19.8 GB). \
+             Highest-quality dense variant; requires 32+ GB RAM.",
+        // Approximate Q4_K_M size (1 GB = 10^9 bytes); HF API unreachable at authoring time.
+        expected_size_bytes: 19_800_000_000,
+    },
+    SupportedModelInfo {
+        id: BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF,
+        name: "Qwen 3 4B Instruct 2507 (GGUF)",
+        org: "Qwen / Alibaba",
+        description: "Latest non-thinking 4B checkpoint — faster, predictable chat and \
+             tool use (~2.5 GB). Recommended general-purpose Qwen 3 model.",
+        // Approximate Q4_K_M size; HF API unreachable at authoring time.
+        expected_size_bytes: 2_500_000_000,
+    },
+    SupportedModelInfo {
+        id: BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF,
+        name: "Qwen 3 4B Thinking 2507 (GGUF)",
+        org: "Qwen / Alibaba",
+        description: "Latest reasoning-focused 4B checkpoint with extended thinking (~2.5 GB). \
+             Stronger reasoning and tool-use accuracy; load with max_tokens ≥ 4096.",
+        // Approximate Q4_K_M size; HF API unreachable at authoring time.
+        expected_size_bytes: 2_500_000_000,
+    },
+    SupportedModelInfo {
+        id: BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF,
+        name: "Qwen 3 30B-A3B Instruct 2507 (GGUF)",
+        org: "Qwen / Alibaba",
+        description: "Flagship mixture-of-experts model: 30B total / ~3B active (~18.6 GB). \
+             Near-dense quality at far lower inference cost; requires 32+ GB RAM.",
+        // Approximate Q4_K_M size (1 GB = 10^9 bytes); HF API unreachable at authoring time.
+        expected_size_bytes: 18_600_000_000,
     },
     SupportedModelInfo {
         id: BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF,
@@ -264,7 +393,48 @@ pub fn tok_model_id_for_repo(hf_repo_id: &str) -> Option<&'static str> {
         BARTOWSKI_QWEN25_CODER_1_5B_INSTRUCT_GGUF => Some(QWEN25_CODER_1_5B_TOK_MODEL_ID),
         BARTOWSKI_QWEN25_CODER_3B_INSTRUCT_GGUF => Some(QWEN25_CODER_3B_TOK_MODEL_ID),
         BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF => Some(QWEN25_CODER_7B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_0_6B_GGUF => Some(QWEN3_0_6B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_1_7B_GGUF => Some(QWEN3_1_7B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_4B_GGUF => Some(QWEN3_4B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_8B_GGUF => Some(QWEN3_8B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_14B_GGUF => Some(QWEN3_14B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_32B_GGUF => Some(QWEN3_32B_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF => Some(QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF => Some(QWEN3_4B_THINKING_2507_TOK_MODEL_ID),
+        BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF => {
+            Some(QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID)
+        }
         THEBLOKE_DEEPSEEK_CODER_6_7B_INSTRUCT_GGUF => Some(DEEPSEEK_CODER_6_7B_TOK_MODEL_ID),
         _ => None,
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Every ID in `SUPPORTED_MODELS` must also be the `id` of a `SUPPORTED_MODEL_INFO`
+    /// entry so the model-list UI can render it; the reverse direction is not checked.
+    #[test]
+    fn every_supported_model_has_info() {
+        for id in SUPPORTED_MODELS {
+            assert!(
+                SUPPORTED_MODEL_INFO.iter().any(|info| info.id == *id),
+                "{id} is in SUPPORTED_MODELS but missing from SUPPORTED_MODEL_INFO"
+            );
+        }
+    }
+
+    /// Every `SUPPORTED_MODELS` ID containing "Qwen3" must resolve to an Android tokenizer
+    /// ID (needed by the candle GGUF backend); vacuously passes if no ID matches the filter.
+    #[test]
+    fn qwen3_repos_have_android_tokenizer() {
+        let qwen3 = SUPPORTED_MODELS.iter().filter(|id| id.contains("Qwen3"));
+        for id in qwen3 {
+            assert!(
+                tok_model_id_for_repo(id).is_some(),
+                "{id} has no Android tokenizer mapping"
+            );
+        }
+    }
+}