diff --git a/.agents/AGENTS.md b/.agents/AGENTS.md
index 7a75ba5..33eaf2a 100644
--- a/.agents/AGENTS.md
+++ b/.agents/AGENTS.md
@@ -147,10 +147,15 @@ All model constants live in `src/inference/models.rs`. When adding a new model:
| Qwen 2.5 Coder 1.5B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF` | `Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf` | ~941 MB | All platforms (mobile default) |
| Qwen 2.5 Coder 3B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-3B-Instruct-GGUF` | `Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf` | ~1.93 GB | All platforms (desktop default) |
| Qwen 2.5 Coder 7B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-7B-Instruct-GGUF` | `Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf` | ~4.4 GB | Higher-memory devices |
-| Qwen 3 1.7B (GGUF Q4_K_M) | `bartowski/Qwen3-1.7B-GGUF` | `Qwen3-1.7B-Q4_K_M.gguf` | ~1.3 GB | All platforms |
-| Qwen 3 4B (GGUF Q4_K_M) | `bartowski/Qwen3-4B-GGUF` | `Qwen3-4B-Q4_K_M.gguf` | ~2.7 GB | All platforms |
-| Qwen 3 8B (GGUF Q4_K_M) | `bartowski/Qwen3-8B-GGUF` | `Qwen3-8B-Q4_K_M.gguf` | ~5 GB | Higher-memory devices |
-| Qwen 3 14B (GGUF Q4_K_M) | `bartowski/Qwen3-14B-GGUF` | `Qwen3-14B-Q4_K_M.gguf` | ~8.4 GB | Higher-memory devices |
+| Qwen 3 0.6B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-0.6B-GGUF` | `Qwen_Qwen3-0.6B-Q4_K_M.gguf` | ~0.5 GB | All platforms |
+| Qwen 3 1.7B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-1.7B-GGUF` | `Qwen_Qwen3-1.7B-Q4_K_M.gguf` | ~1.3 GB | All platforms |
+| Qwen 3 4B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-GGUF` | `Qwen_Qwen3-4B-Q4_K_M.gguf` | ~2.7 GB | All platforms |
+| Qwen 3 8B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-8B-GGUF` | `Qwen_Qwen3-8B-Q4_K_M.gguf` | ~5 GB | Higher-memory devices |
+| Qwen 3 14B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-14B-GGUF` | `Qwen_Qwen3-14B-Q4_K_M.gguf` | ~8.4 GB | Higher-memory devices |
+| Qwen 3 32B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-32B-GGUF` | `Qwen_Qwen3-32B-Q4_K_M.gguf` | ~19.8 GB | High-memory desktop (32+ GB) |
+| Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF` | `Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf` | ~2.5 GB | All platforms (latest non-thinking 4B) |
+| Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF` | `Qwen_Qwen3-4B-Thinking-2507-Q4_K_M.gguf` | ~2.5 GB | All platforms (latest reasoning 4B) |
+| Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M, MoE) | `bartowski/Qwen_Qwen3-30B-A3B-Instruct-2507-GGUF` | `Qwen_Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf` | ~18.6 GB | High-memory desktop (32+ GB), `qwen3moe` |
| DeepSeek Coder 6.7B Instruct (GGUF Q4_K_M) | `bartowski/deepseek-coder-6.7b-instruct-GGUF` | `deepseek-coder-6.7b-instruct-Q4_K_M.gguf` | ~3.8 GB | Higher-memory devices, custom chat template |
| Qwen 2.5 Coder 7B Instruct (ISQ) | `Qwen/Qwen2.5-Coder-7B-Instruct` | safetensors (ISQ in-situ) | ~8 GB | macOS (ISQ pipeline) |
@@ -374,6 +379,15 @@ await engine.unloadModel()
| `defaultModelConfig()` | `GgufModelConfig` | Platform-aware Coder default (1.5B on iOS/tvOS/Android, 3B on desktop) |
| `qwen251_5bConfig()` | `GgufModelConfig` | Forces Qwen 2.5 1.5B regardless of platform |
| `qwen253bConfig()` | `GgufModelConfig` | Forces Qwen 2.5 3B regardless of platform |
+| `qwen306bConfig()` | `GgufModelConfig` | Qwen 3 0.6B (~0.5 GB) |
+| `qwen317bConfig()` | `GgufModelConfig` | Qwen 3 1.7B (~1.3 GB) |
+| `qwen34bConfig()` | `GgufModelConfig` | Qwen 3 4B (~2.7 GB) |
+| `qwen38bConfig()` | `GgufModelConfig` | Qwen 3 8B (~5 GB) |
+| `qwen314bConfig()` | `GgufModelConfig` | Qwen 3 14B (~8.4 GB) |
+| `qwen332bConfig()` | `GgufModelConfig` | Qwen 3 32B (~19.8 GB) |
+| `qwen34bInstruct2507Config()` | `GgufModelConfig` | Qwen 3 4B Instruct 2507 — latest non-thinking 4B (~2.5 GB) |
+| `qwen34bThinking2507Config()` | `GgufModelConfig` | Qwen 3 4B Thinking 2507 — latest reasoning 4B (~2.5 GB) |
+| `qwen330bA3bInstruct2507Config()` | `GgufModelConfig` | Qwen 3 30B-A3B Instruct 2507 MoE (~18.6 GB) |
| `defaultSamplingConfig()` | `SamplingConfig` | temp=0.7, top_p=0.95, max_tokens=512 |
| `deterministicSamplingConfig()` | `SamplingConfig` | temp=0.0, greedy |
| `mobileSamplingConfig()` | `SamplingConfig` | temp=0.7, max_tokens=128 |
diff --git a/sdk/dart/rust/src/api.rs b/sdk/dart/rust/src/api.rs
index 69ee56f..aafae99 100644
--- a/sdk/dart/rust/src/api.rs
+++ b/sdk/dart/rust/src/api.rs
@@ -542,6 +542,62 @@ pub fn qwen25_coder_3b_config() -> GgufModelConfig {
OndeGgufModelConfig::qwen25_coder_3b().into()
}
+// ── Qwen 3 family ────────────────────────────────────────────────────────────
+
+/// `GgufModelConfig` for Qwen 3 0.6B Q4_K_M (~0.5 GB).
+#[frb(sync)]
+pub fn qwen3_0_6b_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_0_6b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 1.7B Q4_K_M (~1.3 GB).
+#[frb(sync)]
+pub fn qwen3_1_7b_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_1_7b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 4B Q4_K_M (~2.7 GB).
+#[frb(sync)]
+pub fn qwen3_4b_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_4b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 8B Q4_K_M (~5 GB).
+#[frb(sync)]
+pub fn qwen3_8b_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_8b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 14B Q4_K_M (~8.4 GB).
+#[frb(sync)]
+pub fn qwen3_14b_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_14b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 32B Q4_K_M (~19.8 GB).
+#[frb(sync)]
+pub fn qwen3_32b_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_32b().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 4B Instruct 2507 Q4_K_M (~2.5 GB).
+#[frb(sync)]
+pub fn qwen3_4b_instruct_2507_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_4b_instruct_2507().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 4B Thinking 2507 Q4_K_M (~2.5 GB).
+#[frb(sync)]
+pub fn qwen3_4b_thinking_2507_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_4b_thinking_2507().into()
+}
+
+/// `GgufModelConfig` for Qwen 3 30B-A3B Instruct 2507 (MoE) Q4_K_M (~18.6 GB).
+#[frb(sync)]
+pub fn qwen3_30b_a3b_instruct_2507_config() -> GgufModelConfig {
+ OndeGgufModelConfig::qwen3_30b_a3b_instruct_2507().into()
+}
+
/// Default sampling config: `temperature=0.7`, `top_p=0.95`, `max_tokens=512`.
#[frb(sync)]
pub fn default_sampling_config() -> SamplingConfig {
diff --git a/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt b/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt
index d99a80e..23179f8 100644
--- a/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt
+++ b/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt
@@ -25,6 +25,26 @@ object OndeModels {
fun qwen25_1_5b(): GgufModelConfig = uniffi.onde.qwen2515bConfig()
/** Qwen 2.5 3B Instruct GGUF Q4_K_M (~1.93 GB). */
fun qwen25_3b(): GgufModelConfig = uniffi.onde.qwen253bConfig()
+
+ // ── Qwen 3 family ────────────────────────────────────────────────────────
+ /** Qwen 3 0.6B GGUF Q4_K_M (~0.5 GB) — smallest Qwen 3 variant. */
+ fun qwen3_0_6b(): GgufModelConfig = uniffi.onde.qwen306bConfig()
+ /** Qwen 3 1.7B GGUF Q4_K_M (~1.3 GB). */
+ fun qwen3_1_7b(): GgufModelConfig = uniffi.onde.qwen317bConfig()
+ /** Qwen 3 4B GGUF Q4_K_M (~2.7 GB). */
+ fun qwen3_4b(): GgufModelConfig = uniffi.onde.qwen34bConfig()
+ /** Qwen 3 8B GGUF Q4_K_M (~5 GB). */
+ fun qwen3_8b(): GgufModelConfig = uniffi.onde.qwen38bConfig()
+ /** Qwen 3 14B GGUF Q4_K_M (~8.4 GB). */
+ fun qwen3_14b(): GgufModelConfig = uniffi.onde.qwen314bConfig()
+ /** Qwen 3 32B GGUF Q4_K_M (~19.8 GB) — largest dense Qwen 3. */
+ fun qwen3_32b(): GgufModelConfig = uniffi.onde.qwen332bConfig()
+ /** Qwen 3 4B Instruct 2507 GGUF Q4_K_M (~2.5 GB) — latest non-thinking 4B. */
+ fun qwen3_4b_instruct_2507(): GgufModelConfig = uniffi.onde.qwen34bInstruct2507Config()
+ /** Qwen 3 4B Thinking 2507 GGUF Q4_K_M (~2.5 GB) — latest reasoning 4B. */
+ fun qwen3_4b_thinking_2507(): GgufModelConfig = uniffi.onde.qwen34bThinking2507Config()
+ /** Qwen 3 30B-A3B Instruct 2507 GGUF Q4_K_M (~18.6 GB) — flagship MoE. */
+ fun qwen3_30b_a3b_instruct_2507(): GgufModelConfig = uniffi.onde.qwen330bA3bInstruct2507Config()
}
/**
diff --git a/sdk/react-native/rust/src/lib.rs b/sdk/react-native/rust/src/lib.rs
index 7eff78b..e6c0fa2 100644
--- a/sdk/react-native/rust/src/lib.rs
+++ b/sdk/react-native/rust/src/lib.rs
@@ -470,6 +470,62 @@ pub extern "C" fn onde_qwen25_3b_config() -> *mut c_char {
to_json_cstring(&GgufModelConfig::qwen25_3b())
}
+// ── Qwen 3 family ────────────────────────────────────────────────────────────
+
+/// Return the Qwen 3 0.6B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_0_6b_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_0_6b())
+}
+
+/// Return the Qwen 3 1.7B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_1_7b_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_1_7b())
+}
+
+/// Return the Qwen 3 4B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_4b_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_4b())
+}
+
+/// Return the Qwen 3 8B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_8b_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_8b())
+}
+
+/// Return the Qwen 3 14B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_14b_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_14b())
+}
+
+/// Return the Qwen 3 32B GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_32b_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_32b())
+}
+
+/// Return the Qwen 3 4B Instruct 2507 GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_4b_instruct_2507_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_4b_instruct_2507())
+}
+
+/// Return the Qwen 3 4B Thinking 2507 GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_4b_thinking_2507_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_4b_thinking_2507())
+}
+
+/// Return the Qwen 3 30B-A3B Instruct 2507 (MoE) GGUF model config as JSON.
+#[no_mangle]
+pub extern "C" fn onde_qwen3_30b_a3b_instruct_2507_config() -> *mut c_char {
+ to_json_cstring(&GgufModelConfig::qwen3_30b_a3b_instruct_2507())
+}
+
// ── Sampling presets ─────────────────────────────────────────────────────────
/// Return the default sampling config as JSON.
diff --git a/src/inference/engine.rs b/src/inference/engine.rs
index 4380816..5b146d9 100644
--- a/src/inference/engine.rs
+++ b/src/inference/engine.rs
@@ -2039,7 +2039,43 @@ impl GgufModelConfig {
}
}
- /// Qwen 3 4B Instruct (GGUF Q4_K_M) — ~2.7 GB.
+ /// Qwen 3 0.6B (GGUF Q4_K_M) — smallest Qwen 3 variant, ~0.5 GB.
+ ///
+ /// Lightest tool-capable Qwen 3 model; fits tvOS and the most
+ /// memory-constrained mobile devices. Extended thinking mode — load with
+ /// `max_tokens ≥ 4096`.
+ pub fn qwen3_0_6b() -> Self {
+ Self {
+ model_id: super::models::BARTOWSKI_QWEN3_0_6B_GGUF.into(),
+ files: vec![super::models::QWEN3_0_6B_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_0_6B_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 0.6B (Q4_K_M)".into(),
+ approx_memory: "~0.5 GB".into(),
+ chat_template: None,
+ }
+ }
+
+ /// Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling, ~1.3 GB.
+ pub fn qwen3_1_7b() -> Self {
+ Self {
+ model_id: super::models::BARTOWSKI_QWEN3_1_7B_GGUF.into(),
+ files: vec![super::models::QWEN3_1_7B_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_1_7B_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 1.7B (Q4_K_M)".into(),
+ approx_memory: "~1.3 GB".into(),
+ chat_template: None,
+ }
+ }
+
+ /// Qwen 3 4B (GGUF Q4_K_M) — ~2.7 GB.
///
/// Full OpenAI-compatible tool calling with extended thinking mode.
/// Always load with `max_tokens ≥ 4096`; the `<think>…</think>` block can
@@ -2048,49 +2084,127 @@ impl GgufModelConfig {
Self {
model_id: super::models::BARTOWSKI_QWEN3_4B_GGUF.into(),
files: vec![super::models::QWEN3_4B_GGUF_FILE.into()],
- tok_model_id: None,
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_4B_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
display_name: "Qwen 3 4B (Q4_K_M)".into(),
approx_memory: "~2.7 GB".into(),
chat_template: None,
}
}
- /// Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling, ~1.3 GB.
- pub fn qwen3_1_7b() -> Self {
+ /// Qwen 3 8B (GGUF Q4_K_M) — ~5 GB. Strong tool-calling model with extended
+ /// thinking. Best balance of quality and memory for macOS with 24+ GB RAM.
+ pub fn qwen3_8b() -> Self {
Self {
- model_id: super::models::BARTOWSKI_QWEN3_1_7B_GGUF.into(),
- files: vec![super::models::QWEN3_1_7B_GGUF_FILE.into()],
- tok_model_id: None,
- display_name: "Qwen 3 1.7B (Q4_K_M)".into(),
- approx_memory: "~1.3 GB".into(),
+ model_id: super::models::BARTOWSKI_QWEN3_8B_GGUF.into(),
+ files: vec![super::models::QWEN3_8B_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_8B_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 8B (Q4_K_M)".into(),
+ approx_memory: "~5 GB".into(),
chat_template: None,
}
}
- /// Qwen 3 14B Instruct (GGUF Q4_K_M) — ~8.4 GB.
+ /// Qwen 3 14B (GGUF Q4_K_M) — ~8.4 GB.
///
- /// Strongest reasoning and tool-calling model with extended thinking.
+ /// Strong reasoning and tool-calling model with extended thinking.
/// Best all-around model for macOS with 16+ GB RAM.
pub fn qwen3_14b() -> Self {
Self {
model_id: super::models::BARTOWSKI_QWEN3_14B_GGUF.into(),
files: vec![super::models::QWEN3_14B_GGUF_FILE.into()],
- tok_model_id: None,
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_14B_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
display_name: "Qwen 3 14B (Q4_K_M)".into(),
approx_memory: "~8.4 GB".into(),
chat_template: None,
}
}
- /// Strong tool-calling model with extended thinking. Best balance of
- /// quality and memory for macOS with 24+ GB RAM.
- pub fn qwen3_8b() -> Self {
+ /// Qwen 3 32B (GGUF Q4_K_M) — largest dense Qwen 3, ~19.8 GB.
+ ///
+ /// Highest-quality dense Qwen 3 variant. Requires a high-memory desktop
+ /// (32+ GB RAM / unified memory). Extended thinking and full tool calling.
+ pub fn qwen3_32b() -> Self {
Self {
- model_id: super::models::BARTOWSKI_QWEN3_8B_GGUF.into(),
- files: vec![super::models::QWEN3_8B_GGUF_FILE.into()],
- tok_model_id: None,
- display_name: "Qwen 3 8B (Q4_K_M)".into(),
- approx_memory: "~5 GB".into(),
+ model_id: super::models::BARTOWSKI_QWEN3_32B_GGUF.into(),
+ files: vec![super::models::QWEN3_32B_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_32B_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 32B (Q4_K_M)".into(),
+ approx_memory: "~19.8 GB".into(),
+ chat_template: None,
+ }
+ }
+
+ /// Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) — latest non-thinking 4B, ~2.5 GB.
+ ///
+ /// Updated instruction-tuned checkpoint that does *not* emit a `<think>`
+ /// block, making it faster and more predictable for chat and tool use.
+ /// Recommended general-purpose Qwen 3 model.
+ pub fn qwen3_4b_instruct_2507() -> Self {
+ Self {
+ model_id: super::models::BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF.into(),
+ files: vec![super::models::QWEN3_4B_INSTRUCT_2507_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 4B Instruct 2507 (Q4_K_M)".into(),
+ approx_memory: "~2.5 GB".into(),
+ chat_template: None,
+ }
+ }
+
+ /// Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) — latest reasoning 4B, ~2.5 GB.
+ ///
+ /// Updated thinking-only checkpoint with stronger reasoning and tool-use
+ /// accuracy. Always emits a `<think>` block — load with `max_tokens ≥ 4096`.
+ pub fn qwen3_4b_thinking_2507() -> Self {
+ Self {
+ model_id: super::models::BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF.into(),
+ files: vec![super::models::QWEN3_4B_THINKING_2507_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_4B_THINKING_2507_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 4B Thinking 2507 (Q4_K_M)".into(),
+ approx_memory: "~2.5 GB".into(),
+ chat_template: None,
+ }
+ }
+
+ /// Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M) — flagship MoE, ~18.6 GB.
+ ///
+ /// Mixture-of-experts model: 30B total parameters, ~3B active per token, so
+ /// inference is far cheaper than a 30B dense model while quality rivals it.
+ /// Loads via the `qwen3moe` GGUF architecture. Requires 32+ GB RAM.
+ pub fn qwen3_30b_a3b_instruct_2507() -> Self {
+ Self {
+ model_id: super::models::BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF.into(),
+ files: vec![super::models::QWEN3_30B_A3B_INSTRUCT_2507_GGUF_FILE.into()],
+ tok_model_id: if cfg!(target_os = "android") {
+ Some(super::models::QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID.into())
+ } else {
+ None
+ },
+ display_name: "Qwen 3 30B-A3B Instruct 2507 (Q4_K_M)".into(),
+ approx_memory: "~18.6 GB".into(),
chat_template: None,
}
}
@@ -2172,6 +2286,37 @@ mod tests {
assert_eq!(cfg.files.len(), 1);
}
+ #[test]
+ fn gguf_model_config_qwen3_variants() {
+ // Every Qwen 3 constructor must produce a valid, single-file config
+ // whose repo ID is registered in the supported-models list.
+ let configs = [
+ GgufModelConfig::qwen3_0_6b(),
+ GgufModelConfig::qwen3_1_7b(),
+ GgufModelConfig::qwen3_4b(),
+ GgufModelConfig::qwen3_8b(),
+ GgufModelConfig::qwen3_14b(),
+ GgufModelConfig::qwen3_32b(),
+ GgufModelConfig::qwen3_4b_instruct_2507(),
+ GgufModelConfig::qwen3_4b_thinking_2507(),
+ GgufModelConfig::qwen3_30b_a3b_instruct_2507(),
+ ];
+ for cfg in &configs {
+ assert!(
+ cfg.model_id.contains("Qwen3"),
+ "unexpected id: {}",
+ cfg.model_id
+ );
+ assert_eq!(cfg.files.len(), 1);
+ assert!(cfg.files[0].ends_with("-Q4_K_M.gguf"));
+ assert!(
+ super::super::models::SUPPORTED_MODELS.contains(&cfg.model_id.as_str()),
+ "{} missing from SUPPORTED_MODELS",
+ cfg.model_id
+ );
+ }
+ }
+
#[test]
fn gguf_model_config_platform_default() {
let cfg = GgufModelConfig::platform_default();
diff --git a/src/inference/ffi.rs b/src/inference/ffi.rs
index 376aff0..12c5036 100644
--- a/src/inference/ffi.rs
+++ b/src/inference/ffi.rs
@@ -326,6 +326,66 @@ pub fn qwen25_3b_config() -> GgufModelConfig {
GgufModelConfig::qwen25_3b()
}
+// ── Qwen 3 family ────────────────────────────────────────────────────────────
+
+/// Return the Qwen 3 0.6B GGUF model configuration (~0.5 GB).
+#[uniffi::export]
+pub fn qwen3_0_6b_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_0_6b()
+}
+
+/// Return the Qwen 3 1.7B GGUF model configuration (~1.3 GB).
+#[uniffi::export]
+pub fn qwen3_1_7b_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_1_7b()
+}
+
+/// Return the Qwen 3 4B GGUF model configuration (~2.7 GB).
+#[uniffi::export]
+pub fn qwen3_4b_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_4b()
+}
+
+/// Return the Qwen 3 8B GGUF model configuration (~5 GB).
+#[uniffi::export]
+pub fn qwen3_8b_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_8b()
+}
+
+/// Return the Qwen 3 14B GGUF model configuration (~8.4 GB).
+#[uniffi::export]
+pub fn qwen3_14b_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_14b()
+}
+
+/// Return the Qwen 3 32B GGUF model configuration (~19.8 GB).
+#[uniffi::export]
+pub fn qwen3_32b_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_32b()
+}
+
+/// Return the Qwen 3 4B Instruct 2507 GGUF model configuration (~2.5 GB).
+///
+/// Latest non-thinking 4B checkpoint — recommended general-purpose Qwen 3 model.
+#[uniffi::export]
+pub fn qwen3_4b_instruct_2507_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_4b_instruct_2507()
+}
+
+/// Return the Qwen 3 4B Thinking 2507 GGUF model configuration (~2.5 GB).
+///
+/// Latest reasoning-focused 4B checkpoint; load with `max_tokens ≥ 4096`.
+#[uniffi::export]
+pub fn qwen3_4b_thinking_2507_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_4b_thinking_2507()
+}
+
+/// Return the Qwen 3 30B-A3B Instruct 2507 (MoE) GGUF model configuration (~18.6 GB).
+#[uniffi::export]
+pub fn qwen3_30b_a3b_instruct_2507_config() -> GgufModelConfig {
+ GgufModelConfig::qwen3_30b_a3b_instruct_2507()
+}
+
/// Return default sampling parameters for creative chat.
#[uniffi::export]
pub fn default_sampling_config() -> SamplingConfig {
diff --git a/src/inference/models.rs b/src/inference/models.rs
index 31f83c6..b56e5bd 100644
--- a/src/inference/models.rs
+++ b/src/inference/models.rs
@@ -83,16 +83,47 @@ pub const QWEN25_3B_GGUF_FILE: &str = "Qwen2.5-3B-Instruct-Q4_K_M.gguf";
/// Base model repo used for the HF tokenizer (tokenizer.json + tokenizer_config.json).
pub const QWEN25_3B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-3B-Instruct";
-/// Pre-quantized Qwen 3 4B Instruct (GGUF Q4_K_M) — full OpenAI-compatible tool calling (~2.7 GB).
+// ── Qwen 3 family (GGUF Q4_K_M) ──────────────────────────────────────────────
+//
+// The Qwen 3 line uses the `qwen3` GGUF architecture (and `qwen3moe` for the
+// 30B-A3B mixture-of-experts variant), both supported by mistral.rs's quantized
+// loader. The base Qwen 3 releases below are hybrid reasoners with an extended
+// thinking mode (`<think>…</think>`); load them with `max_tokens ≥ 4096` so the
+// thinking block does not exhaust the token budget before the real reply.
+//
+// bartowski's repos embed the tokenizer and chat template inside the GGUF, so on
+// iOS/macOS no separate tokenizer download is needed. On Android the candle GGUF
+// backend cannot parse the embedded tokenizer, so each model also declares a
+// `TOK_MODEL_ID` pointing at the official Qwen repo for a standalone tokenizer.
+
+/// Pre-quantized Qwen 3 0.6B (GGUF Q4_K_M) — smallest Qwen 3 variant (~0.5 GB).
///
-/// Qwen 3 uses an extended thinking mode (`<think>…</think>`) that significantly improves
-/// reasoning and tool-use accuracy. Load with `max_tokens ≥ 4096` to avoid empty replies caused
-/// by the model exhausting its token budget on thinking before producing a response.
+/// Lightest tool-capable Qwen 3 model. Suitable for tvOS and the most
+/// memory-constrained mobile devices where even the 1.7B is too large.
+pub const BARTOWSKI_QWEN3_0_6B_GGUF: &str = "bartowski/Qwen_Qwen3-0.6B-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 0.6B repo.
+pub const QWEN3_0_6B_GGUF_FILE: &str = "Qwen_Qwen3-0.6B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_0_6B_TOK_MODEL_ID: &str = "Qwen/Qwen3-0.6B";
+
+/// Pre-quantized Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling model (~1.3 GB).
+///
+/// Smallest Qwen 3 variant with comfortable tool calling. Suitable for mobile
+/// devices where the 4B model would be too large.
+pub const BARTOWSKI_QWEN3_1_7B_GGUF: &str = "bartowski/Qwen_Qwen3-1.7B-GGUF";
+/// The specific GGUF filename for the Qwen3 1.7B repo.
+pub const QWEN3_1_7B_GGUF_FILE: &str = "Qwen_Qwen3-1.7B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_1_7B_TOK_MODEL_ID: &str = "Qwen/Qwen3-1.7B";
+
+/// Pre-quantized Qwen 3 4B (GGUF Q4_K_M) — full OpenAI-compatible tool calling (~2.7 GB).
///
/// Recommended model for siGit Code (coding agent with tool calling on macOS/Linux/Windows).
pub const BARTOWSKI_QWEN3_4B_GGUF: &str = "bartowski/Qwen_Qwen3-4B-GGUF";
/// The specific GGUF filename to download from the bartowski Qwen 3 4B repo.
pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_4B_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B";
/// Pre-quantized Qwen 3 8B (GGUF Q4_K_M) — strong tool-calling model (~5 GB).
///
@@ -100,22 +131,70 @@ pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf";
/// Full tool calling and extended thinking mode support.
pub const BARTOWSKI_QWEN3_8B_GGUF: &str = "bartowski/Qwen_Qwen3-8B-GGUF";
pub const QWEN3_8B_GGUF_FILE: &str = "Qwen_Qwen3-8B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_8B_TOK_MODEL_ID: &str = "Qwen/Qwen3-8B";
/// Pre-quantized Qwen 3 14B (GGUF Q4_K_M) — strong reasoning and tool-calling model (~8.4 GB).
///
-/// Qwen 3 uses extended thinking mode (`<think>…</think>`) for improved reasoning.
/// Best all-around model for macOS with 16+ GB RAM. Full tool calling support.
pub const BARTOWSKI_QWEN3_14B_GGUF: &str = "bartowski/Qwen_Qwen3-14B-GGUF";
/// The specific GGUF filename.
pub const QWEN3_14B_GGUF_FILE: &str = "Qwen_Qwen3-14B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_14B_TOK_MODEL_ID: &str = "Qwen/Qwen3-14B";
-/// Pre-quantized Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling model (~1.3 GB).
+/// Pre-quantized Qwen 3 32B (GGUF Q4_K_M) — largest dense Qwen 3 model (~19.8 GB).
///
-/// Smallest Qwen 3 variant with tool calling support. Suitable for mobile devices
-/// where the 4B model would be too large.
-pub const BARTOWSKI_QWEN3_1_7B_GGUF: &str = "bartowski/Qwen_Qwen3-1.7B-GGUF";
-/// The specific GGUF filename for the Qwen3 1.7B repo.
-pub const QWEN3_1_7B_GGUF_FILE: &str = "Qwen_Qwen3-1.7B-Q4_K_M.gguf";
+/// Highest-quality dense Qwen 3 variant. Requires a high-memory desktop
+/// (32+ GB RAM / unified memory). Full tool calling and extended thinking.
+pub const BARTOWSKI_QWEN3_32B_GGUF: &str = "bartowski/Qwen_Qwen3-32B-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 32B repo.
+pub const QWEN3_32B_GGUF_FILE: &str = "Qwen_Qwen3-32B-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_32B_TOK_MODEL_ID: &str = "Qwen/Qwen3-32B";
+
+// ── Qwen 3 "2507" updated releases ───────────────────────────────────────────
+//
+// The July/August 2025 refresh split Qwen 3 into dedicated non-thinking
+// (`-Instruct-2507`) and thinking-only (`-Thinking-2507`) checkpoints with
+// markedly improved instruction following, tool use, and long-context quality.
+// At the time of writing these are the latest open Qwen 3 weights usable as on-device GGUF.
+
+/// Pre-quantized Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) — latest non-thinking 4B (~2.5 GB).
+///
+/// Updated instruction-tuned checkpoint. Unlike the base 4B it does *not* emit a
+/// `<think>` block, so it is faster and more predictable for chat and tool use.
+pub const BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF: &str =
+ "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 4B Instruct 2507 repo.
+pub const QWEN3_4B_INSTRUCT_2507_GGUF_FILE: &str = "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B-Instruct-2507";
+
+/// Pre-quantized Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) — latest reasoning 4B (~2.5 GB).
+///
+/// Updated thinking-only checkpoint with stronger reasoning and tool-use accuracy.
+/// Always emits a `<think>` block — load with `max_tokens ≥ 4096`.
+pub const BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF: &str =
+ "bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 4B Thinking 2507 repo.
+pub const QWEN3_4B_THINKING_2507_GGUF_FILE: &str = "Qwen_Qwen3-4B-Thinking-2507-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_4B_THINKING_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B-Thinking-2507";
+
+/// Pre-quantized Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M) — flagship MoE (~18.6 GB).
+///
+/// Mixture-of-experts model: 30B total parameters but only ~3B active per token,
+/// so inference is far cheaper than a 30B dense model while quality rivals it.
+/// Loads via the `qwen3moe` GGUF architecture in mistral.rs. Requires a
+/// high-memory desktop (32+ GB RAM / unified memory).
+pub const BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF: &str =
+ "bartowski/Qwen_Qwen3-30B-A3B-Instruct-2507-GGUF";
+/// The specific GGUF filename for the bartowski Qwen 3 30B-A3B Instruct 2507 repo.
+pub const QWEN3_30B_A3B_INSTRUCT_2507_GGUF_FILE: &str =
+ "Qwen_Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf";
+/// Base model repo used for the HF tokenizer on Android.
+pub const QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-30B-A3B-Instruct-2507";
/// DeepSeek Coder v1 6.7B Instruct (GGUF Q4_K_M) — dedicated code generation model (~3.8 GB).
///
@@ -136,10 +215,15 @@ pub const SUPPORTED_MODELS: &[&str] = &[
BARTOWSKI_QWEN25_0_5B_INSTRUCT_GGUF,
BARTOWSKI_QWEN25_1_5B_INSTRUCT_GGUF,
BARTOWSKI_QWEN25_3B_INSTRUCT_GGUF,
+ BARTOWSKI_QWEN3_0_6B_GGUF,
+ BARTOWSKI_QWEN3_1_7B_GGUF,
BARTOWSKI_QWEN3_4B_GGUF,
BARTOWSKI_QWEN3_8B_GGUF,
BARTOWSKI_QWEN3_14B_GGUF,
- BARTOWSKI_QWEN3_1_7B_GGUF,
+ BARTOWSKI_QWEN3_32B_GGUF,
+ BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF,
+ BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF,
+ BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF,
BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF,
THEBLOKE_DEEPSEEK_CODER_6_7B_INSTRUCT_GGUF,
];
@@ -197,6 +281,23 @@ pub const SUPPORTED_MODEL_INFO: &[SupportedModelInfo] = &[
// Exact file size from HuggingFace API siblings[].size.
expected_size_bytes: 1_929_903_264,
},
+ SupportedModelInfo {
+ id: BARTOWSKI_QWEN3_0_6B_GGUF,
+ name: "Qwen 3 0.6B (GGUF)",
+ org: "Qwen / Alibaba",
+ description: "Smallest Qwen 3 variant with tool calling (~0.5 GB). \
+ Suitable for tvOS and the most memory-constrained mobile devices.",
+ // Approximate Q4_K_M size (HF API unreachable at authoring time) — TODO: replace with exact size.
+ expected_size_bytes: 483_000_000,
+ },
+ SupportedModelInfo {
+ id: BARTOWSKI_QWEN3_1_7B_GGUF,
+ name: "Qwen 3 1.7B (GGUF)",
+ org: "Qwen / Alibaba",
+ description: "Lightweight tool-calling model for mobile (~1.3 GB). \
+ Smallest Qwen 3 variant with comfortable tool calling support.",
+ expected_size_bytes: 1_282_439_584,
+ },
SupportedModelInfo {
id: BARTOWSKI_QWEN3_4B_GGUF,
name: "Qwen 3 4B (GGUF)",
@@ -225,12 +326,40 @@ pub const SUPPORTED_MODEL_INFO: &[SupportedModelInfo] = &[
expected_size_bytes: 9_001_753_632,
},
SupportedModelInfo {
- id: BARTOWSKI_QWEN3_1_7B_GGUF,
- name: "Qwen 3 1.7B (GGUF)",
+ id: BARTOWSKI_QWEN3_32B_GGUF,
+ name: "Qwen 3 32B (GGUF)",
org: "Qwen / Alibaba",
- description: "Lightweight tool-calling model for mobile (~1.3 GB). \
- Smallest Qwen 3 variant with tool calling support.",
- expected_size_bytes: 1_282_439_584,
+ description: "Largest dense Qwen 3 model with extended thinking (~19.8 GB). \
+ Highest-quality dense variant; requires 32+ GB RAM.",
+ // Approximate Q4_K_M size (HF API unreachable at authoring time) — TODO: replace with exact size.
+ expected_size_bytes: 21_260_000_000,
+ },
+ SupportedModelInfo {
+ id: BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF,
+ name: "Qwen 3 4B Instruct 2507 (GGUF)",
+ org: "Qwen / Alibaba",
+ description: "Latest non-thinking 4B checkpoint — faster, predictable chat and \
+ tool use (~2.5 GB). Recommended general-purpose Qwen 3 model.",
+ // Approximate Q4_K_M size (HF API unreachable at authoring time) — TODO: replace with exact size.
+ expected_size_bytes: 2_500_000_000,
+ },
+ SupportedModelInfo {
+ id: BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF,
+ name: "Qwen 3 4B Thinking 2507 (GGUF)",
+ org: "Qwen / Alibaba",
+ description: "Latest reasoning-focused 4B checkpoint with extended thinking (~2.5 GB). \
+ Stronger reasoning and tool-use accuracy; load with max_tokens ≥ 4096.",
+ // Approximate Q4_K_M size (HF API unreachable at authoring time) — TODO: replace with exact size.
+ expected_size_bytes: 2_500_000_000,
+ },
+ SupportedModelInfo {
+ id: BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF,
+ name: "Qwen 3 30B-A3B Instruct 2507 (GGUF)",
+ org: "Qwen / Alibaba",
+ description: "Flagship mixture-of-experts model: 30B total / ~3B active (~18.6 GB). \
+ Near-dense quality at far lower inference cost; requires 32+ GB RAM.",
+ // Approximate Q4_K_M size (HF API unreachable at authoring time) — TODO: replace with exact size.
+ expected_size_bytes: 19_971_600_000,
},
SupportedModelInfo {
id: BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF,
@@ -264,7 +393,48 @@ pub fn tok_model_id_for_repo(hf_repo_id: &str) -> Option<&'static str> {
BARTOWSKI_QWEN25_CODER_1_5B_INSTRUCT_GGUF => Some(QWEN25_CODER_1_5B_TOK_MODEL_ID),
BARTOWSKI_QWEN25_CODER_3B_INSTRUCT_GGUF => Some(QWEN25_CODER_3B_TOK_MODEL_ID),
BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF => Some(QWEN25_CODER_7B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_0_6B_GGUF => Some(QWEN3_0_6B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_1_7B_GGUF => Some(QWEN3_1_7B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_4B_GGUF => Some(QWEN3_4B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_8B_GGUF => Some(QWEN3_8B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_14B_GGUF => Some(QWEN3_14B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_32B_GGUF => Some(QWEN3_32B_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF => Some(QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF => Some(QWEN3_4B_THINKING_2507_TOK_MODEL_ID),
+ BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF => {
+ Some(QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID)
+ }
THEBLOKE_DEEPSEEK_CODER_6_7B_INSTRUCT_GGUF => Some(DEEPSEEK_CODER_6_7B_TOK_MODEL_ID),
_ => None,
}
}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ /// Every supported model must have a corresponding display-metadata entry
+ /// so the model-list UI can render it.
+ #[test]
+ fn every_supported_model_has_info() {
+ for id in SUPPORTED_MODELS {
+ assert!(
+ SUPPORTED_MODEL_INFO.iter().any(|info| info.id == *id),
+ "{id} is in SUPPORTED_MODELS but missing from SUPPORTED_MODEL_INFO"
+ );
+ }
+ }
+
+ /// Every Qwen 3 repo must resolve to an Android tokenizer ID so loading
+ /// works on the candle GGUF backend.
+ #[test]
+ fn qwen3_repos_have_android_tokenizer() {
+ let qwen3 = SUPPORTED_MODELS.iter().filter(|id| id.contains("Qwen3"));
+ for id in qwen3 {
+ assert!(
+ tok_model_id_for_repo(id).is_some(),
+ "{id} has no Android tokenizer mapping"
+ );
+ }
+ }
+}