From dfa9c0f37506913fc1c41607399a08def3868153 Mon Sep 17 00:00:00 2001
From: Claude
Date: Mon, 22 Jun 2026 22:30:47 +0000
Subject: [PATCH] Add full Qwen 3 family GGUF support across core and SDKs

Extends Qwen 3 coverage from the four partially-wired sizes (1.7B/4B/8B/14B)
to the complete on-device, open-weight Qwen 3 GGUF lineup, and wires every
variant through all binding layers.

Models added:
- Dense: Qwen 3 0.6B and 32B (rounding out 0.6B/1.7B/4B/8B/14B/32B)
- Latest "2507" updates: Qwen 3 4B Instruct 2507 (non-thinking), 4B Thinking
  2507, and the 30B-A3B Instruct 2507 mixture-of-experts (loads via the
  qwen3moe GGUF architecture)

Wiring completed for all nine Qwen 3 variants:
- models.rs: repo/file/tokenizer constants, SUPPORTED_MODELS,
  SUPPORTED_MODEL_INFO, and tok_model_id_for_repo entries
- engine.rs: GgufModelConfig constructors, now Android-tokenizer-aware (the
  previous 1.7B/4B/8B/14B constructors passed tok_model_id: None even on
  Android, which breaks the candle GGUF backend)
- ffi.rs: #[uniffi::export] config functions (reaches Swift/Kotlin/Python)
- Kotlin OndeModels convenience wrappers, Dart FRB bridge, and the React
  Native C-ABI exports

Notes:
- The 3.5/3.6/3.7 point releases are excluded: they are either huge MoE
  (235B-A22B / 397B-A17B) or closed-weights, API-only checkpoints, none of
  which are loadable on-device through this GGUF SDK.
- expected_size_bytes for the new variants are approximate; the HuggingFace
  API was unreachable from the build environment, so exact siblings[].size
  values should be backfilled when network access is available.
- Added consistency tests: every supported model has display metadata, and
  every Qwen 3 repo resolves to an Android tokenizer.

Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01DkBxc4UETJyXpFUDnBSn3E --- .agents/AGENTS.md | 22 +- sdk/dart/rust/src/api.rs | 56 +++++ .../com/ondeinference/onde/Convenience.kt | 20 ++ sdk/react-native/rust/src/lib.rs | 56 +++++ src/inference/engine.rs | 185 ++++++++++++++-- src/inference/ffi.rs | 60 ++++++ src/inference/models.rs | 204 ++++++++++++++++-- 7 files changed, 562 insertions(+), 41 deletions(-) diff --git a/.agents/AGENTS.md b/.agents/AGENTS.md index 7a75ba5..33eaf2a 100644 --- a/.agents/AGENTS.md +++ b/.agents/AGENTS.md @@ -147,10 +147,15 @@ All model constants live in `src/inference/models.rs`. When adding a new model: | Qwen 2.5 Coder 1.5B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-1.5B-Instruct-GGUF` | `Qwen2.5-Coder-1.5B-Instruct-Q4_K_M.gguf` | ~941 MB | All platforms (mobile default) | | Qwen 2.5 Coder 3B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-3B-Instruct-GGUF` | `Qwen2.5-Coder-3B-Instruct-Q4_K_M.gguf` | ~1.93 GB | All platforms (desktop default) | | Qwen 2.5 Coder 7B Instruct (GGUF Q4_K_M) | `bartowski/Qwen2.5-Coder-7B-Instruct-GGUF` | `Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf` | ~4.4 GB | Higher-memory devices | -| Qwen 3 1.7B (GGUF Q4_K_M) | `bartowski/Qwen3-1.7B-GGUF` | `Qwen3-1.7B-Q4_K_M.gguf` | ~1.3 GB | All platforms | -| Qwen 3 4B (GGUF Q4_K_M) | `bartowski/Qwen3-4B-GGUF` | `Qwen3-4B-Q4_K_M.gguf` | ~2.7 GB | All platforms | -| Qwen 3 8B (GGUF Q4_K_M) | `bartowski/Qwen3-8B-GGUF` | `Qwen3-8B-Q4_K_M.gguf` | ~5 GB | Higher-memory devices | -| Qwen 3 14B (GGUF Q4_K_M) | `bartowski/Qwen3-14B-GGUF` | `Qwen3-14B-Q4_K_M.gguf` | ~8.4 GB | Higher-memory devices | +| Qwen 3 0.6B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-0.6B-GGUF` | `Qwen_Qwen3-0.6B-Q4_K_M.gguf` | ~0.5 GB | All platforms | +| Qwen 3 1.7B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-1.7B-GGUF` | `Qwen_Qwen3-1.7B-Q4_K_M.gguf` | ~1.3 GB | All platforms | +| Qwen 3 4B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-GGUF` | `Qwen_Qwen3-4B-Q4_K_M.gguf` | ~2.7 
GB | All platforms | +| Qwen 3 8B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-8B-GGUF` | `Qwen_Qwen3-8B-Q4_K_M.gguf` | ~5 GB | Higher-memory devices | +| Qwen 3 14B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-14B-GGUF` | `Qwen_Qwen3-14B-Q4_K_M.gguf` | ~8.4 GB | Higher-memory devices | +| Qwen 3 32B (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-32B-GGUF` | `Qwen_Qwen3-32B-Q4_K_M.gguf` | ~19.8 GB | High-memory desktop (32+ GB) | +| Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF` | `Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf` | ~2.5 GB | All platforms (latest non-thinking 4B) | +| Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) | `bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF` | `Qwen_Qwen3-4B-Thinking-2507-Q4_K_M.gguf` | ~2.5 GB | All platforms (latest reasoning 4B) | +| Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M, MoE) | `bartowski/Qwen_Qwen3-30B-A3B-Instruct-2507-GGUF` | `Qwen_Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf` | ~18.6 GB | High-memory desktop (32+ GB), `qwen3moe` | | DeepSeek Coder 6.7B Instruct (GGUF Q4_K_M) | `bartowski/deepseek-coder-6.7b-instruct-GGUF` | `deepseek-coder-6.7b-instruct-Q4_K_M.gguf` | ~3.8 GB | Higher-memory devices, custom chat template | | Qwen 2.5 Coder 7B Instruct (ISQ) | `Qwen/Qwen2.5-Coder-7B-Instruct` | safetensors (ISQ in-situ) | ~8 GB | macOS (ISQ pipeline) | @@ -374,6 +379,15 @@ await engine.unloadModel() | `defaultModelConfig()` | `GgufModelConfig` | Platform-aware Coder default (1.5B on iOS/tvOS/Android, 3B on desktop) | | `qwen251_5bConfig()` | `GgufModelConfig` | Forces Qwen 2.5 1.5B regardless of platform | | `qwen253bConfig()` | `GgufModelConfig` | Forces Qwen 2.5 3B regardless of platform | +| `qwen306bConfig()` | `GgufModelConfig` | Qwen 3 0.6B (~0.5 GB) | +| `qwen317bConfig()` | `GgufModelConfig` | Qwen 3 1.7B (~1.3 GB) | +| `qwen34bConfig()` | `GgufModelConfig` | Qwen 3 4B (~2.7 GB) | +| `qwen38bConfig()` | `GgufModelConfig` | Qwen 3 8B (~5 GB) | +| `qwen314bConfig()` | `GgufModelConfig` | Qwen 3 14B (~8.4 GB) | +| 
`qwen332bConfig()` | `GgufModelConfig` | Qwen 3 32B (~19.8 GB) | +| `qwen34bInstruct2507Config()` | `GgufModelConfig` | Qwen 3 4B Instruct 2507 — latest non-thinking 4B (~2.5 GB) | +| `qwen34bThinking2507Config()` | `GgufModelConfig` | Qwen 3 4B Thinking 2507 — latest reasoning 4B (~2.5 GB) | +| `qwen330bA3bInstruct2507Config()` | `GgufModelConfig` | Qwen 3 30B-A3B Instruct 2507 MoE (~18.6 GB) | | `defaultSamplingConfig()` | `SamplingConfig` | temp=0.7, top_p=0.95, max_tokens=512 | | `deterministicSamplingConfig()` | `SamplingConfig` | temp=0.0, greedy | | `mobileSamplingConfig()` | `SamplingConfig` | temp=0.7, max_tokens=128 | diff --git a/sdk/dart/rust/src/api.rs b/sdk/dart/rust/src/api.rs index 69ee56f..aafae99 100644 --- a/sdk/dart/rust/src/api.rs +++ b/sdk/dart/rust/src/api.rs @@ -542,6 +542,62 @@ pub fn qwen25_coder_3b_config() -> GgufModelConfig { OndeGgufModelConfig::qwen25_coder_3b().into() } +// ── Qwen 3 family ──────────────────────────────────────────────────────────── + +/// `GgufModelConfig` for Qwen 3 0.6B Q4_K_M (~0.5 GB). +#[frb(sync)] +pub fn qwen3_0_6b_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_0_6b().into() +} + +/// `GgufModelConfig` for Qwen 3 1.7B Q4_K_M (~1.3 GB). +#[frb(sync)] +pub fn qwen3_1_7b_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_1_7b().into() +} + +/// `GgufModelConfig` for Qwen 3 4B Q4_K_M (~2.7 GB). +#[frb(sync)] +pub fn qwen3_4b_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_4b().into() +} + +/// `GgufModelConfig` for Qwen 3 8B Q4_K_M (~5 GB). +#[frb(sync)] +pub fn qwen3_8b_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_8b().into() +} + +/// `GgufModelConfig` for Qwen 3 14B Q4_K_M (~8.4 GB). +#[frb(sync)] +pub fn qwen3_14b_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_14b().into() +} + +/// `GgufModelConfig` for Qwen 3 32B Q4_K_M (~19.8 GB). 
+#[frb(sync)] +pub fn qwen3_32b_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_32b().into() +} + +/// `GgufModelConfig` for Qwen 3 4B Instruct 2507 Q4_K_M (~2.5 GB). +#[frb(sync)] +pub fn qwen3_4b_instruct_2507_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_4b_instruct_2507().into() +} + +/// `GgufModelConfig` for Qwen 3 4B Thinking 2507 Q4_K_M (~2.5 GB). +#[frb(sync)] +pub fn qwen3_4b_thinking_2507_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_4b_thinking_2507().into() +} + +/// `GgufModelConfig` for Qwen 3 30B-A3B Instruct 2507 (MoE) Q4_K_M (~18.6 GB). +#[frb(sync)] +pub fn qwen3_30b_a3b_instruct_2507_config() -> GgufModelConfig { + OndeGgufModelConfig::qwen3_30b_a3b_instruct_2507().into() +} + /// Default sampling config: `temperature=0.7`, `top_p=0.95`, `max_tokens=512`. #[frb(sync)] pub fn default_sampling_config() -> SamplingConfig { diff --git a/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt b/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt index d99a80e..23179f8 100644 --- a/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt +++ b/sdk/kotlin/lib/src/shared/kotlin/com/ondeinference/onde/Convenience.kt @@ -25,6 +25,26 @@ object OndeModels { fun qwen25_1_5b(): GgufModelConfig = uniffi.onde.qwen2515bConfig() /** Qwen 2.5 3B Instruct GGUF Q4_K_M (~1.93 GB). */ fun qwen25_3b(): GgufModelConfig = uniffi.onde.qwen253bConfig() + + // ── Qwen 3 family ──────────────────────────────────────────────────────── + /** Qwen 3 0.6B GGUF Q4_K_M (~0.5 GB) — smallest Qwen 3 variant. */ + fun qwen3_0_6b(): GgufModelConfig = uniffi.onde.qwen306bConfig() + /** Qwen 3 1.7B GGUF Q4_K_M (~1.3 GB). */ + fun qwen3_1_7b(): GgufModelConfig = uniffi.onde.qwen317bConfig() + /** Qwen 3 4B GGUF Q4_K_M (~2.7 GB). */ + fun qwen3_4b(): GgufModelConfig = uniffi.onde.qwen34bConfig() + /** Qwen 3 8B GGUF Q4_K_M (~5 GB). 
*/ + fun qwen3_8b(): GgufModelConfig = uniffi.onde.qwen38bConfig() + /** Qwen 3 14B GGUF Q4_K_M (~8.4 GB). */ + fun qwen3_14b(): GgufModelConfig = uniffi.onde.qwen314bConfig() + /** Qwen 3 32B GGUF Q4_K_M (~19.8 GB) — largest dense Qwen 3. */ + fun qwen3_32b(): GgufModelConfig = uniffi.onde.qwen332bConfig() + /** Qwen 3 4B Instruct 2507 GGUF Q4_K_M (~2.5 GB) — latest non-thinking 4B. */ + fun qwen3_4b_instruct_2507(): GgufModelConfig = uniffi.onde.qwen34bInstruct2507Config() + /** Qwen 3 4B Thinking 2507 GGUF Q4_K_M (~2.5 GB) — latest reasoning 4B. */ + fun qwen3_4b_thinking_2507(): GgufModelConfig = uniffi.onde.qwen34bThinking2507Config() + /** Qwen 3 30B-A3B Instruct 2507 GGUF Q4_K_M (~18.6 GB) — flagship MoE. */ + fun qwen3_30b_a3b_instruct_2507(): GgufModelConfig = uniffi.onde.qwen330bA3bInstruct2507Config() } /** diff --git a/sdk/react-native/rust/src/lib.rs b/sdk/react-native/rust/src/lib.rs index 7eff78b..e6c0fa2 100644 --- a/sdk/react-native/rust/src/lib.rs +++ b/sdk/react-native/rust/src/lib.rs @@ -470,6 +470,62 @@ pub extern "C" fn onde_qwen25_3b_config() -> *mut c_char { to_json_cstring(&GgufModelConfig::qwen25_3b()) } +// ── Qwen 3 family ──────────────────────────────────────────────────────────── + +/// Return the Qwen 3 0.6B GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_0_6b_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_0_6b()) +} + +/// Return the Qwen 3 1.7B GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_1_7b_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_1_7b()) +} + +/// Return the Qwen 3 4B GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_4b_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_4b()) +} + +/// Return the Qwen 3 8B GGUF model config as JSON. 
+#[no_mangle] +pub extern "C" fn onde_qwen3_8b_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_8b()) +} + +/// Return the Qwen 3 14B GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_14b_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_14b()) +} + +/// Return the Qwen 3 32B GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_32b_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_32b()) +} + +/// Return the Qwen 3 4B Instruct 2507 GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_4b_instruct_2507_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_4b_instruct_2507()) +} + +/// Return the Qwen 3 4B Thinking 2507 GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_4b_thinking_2507_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_4b_thinking_2507()) +} + +/// Return the Qwen 3 30B-A3B Instruct 2507 (MoE) GGUF model config as JSON. +#[no_mangle] +pub extern "C" fn onde_qwen3_30b_a3b_instruct_2507_config() -> *mut c_char { + to_json_cstring(&GgufModelConfig::qwen3_30b_a3b_instruct_2507()) +} + // ── Sampling presets ───────────────────────────────────────────────────────── /// Return the default sampling config as JSON. diff --git a/src/inference/engine.rs b/src/inference/engine.rs index 4380816..5b146d9 100644 --- a/src/inference/engine.rs +++ b/src/inference/engine.rs @@ -2039,7 +2039,43 @@ impl GgufModelConfig { } } - /// Qwen 3 4B Instruct (GGUF Q4_K_M) — ~2.7 GB. + /// Qwen 3 0.6B (GGUF Q4_K_M) — smallest Qwen 3 variant, ~0.5 GB. + /// + /// Lightest tool-capable Qwen 3 model; fits tvOS and the most + /// memory-constrained mobile devices. Extended thinking mode — load with + /// `max_tokens ≥ 4096`. 
+ pub fn qwen3_0_6b() -> Self { + Self { + model_id: super::models::BARTOWSKI_QWEN3_0_6B_GGUF.into(), + files: vec![super::models::QWEN3_0_6B_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_0_6B_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 0.6B (Q4_K_M)".into(), + approx_memory: "~0.5 GB".into(), + chat_template: None, + } + } + + /// Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling, ~1.3 GB. + pub fn qwen3_1_7b() -> Self { + Self { + model_id: super::models::BARTOWSKI_QWEN3_1_7B_GGUF.into(), + files: vec![super::models::QWEN3_1_7B_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_1_7B_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 1.7B (Q4_K_M)".into(), + approx_memory: "~1.3 GB".into(), + chat_template: None, + } + } + + /// Qwen 3 4B (GGUF Q4_K_M) — ~2.7 GB. /// /// Full OpenAI-compatible tool calling with extended thinking mode. /// Always load with `max_tokens ≥ 4096`; the `` block can @@ -2048,49 +2084,127 @@ impl GgufModelConfig { Self { model_id: super::models::BARTOWSKI_QWEN3_4B_GGUF.into(), files: vec![super::models::QWEN3_4B_GGUF_FILE.into()], - tok_model_id: None, + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_4B_TOK_MODEL_ID.into()) + } else { + None + }, display_name: "Qwen 3 4B (Q4_K_M)".into(), approx_memory: "~2.7 GB".into(), chat_template: None, } } - /// Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling, ~1.3 GB. - pub fn qwen3_1_7b() -> Self { + /// Strong tool-calling model with extended thinking. Best balance of + /// quality and memory for macOS with 24+ GB RAM. 
+ pub fn qwen3_8b() -> Self { Self { - model_id: super::models::BARTOWSKI_QWEN3_1_7B_GGUF.into(), - files: vec![super::models::QWEN3_1_7B_GGUF_FILE.into()], - tok_model_id: None, - display_name: "Qwen 3 1.7B (Q4_K_M)".into(), - approx_memory: "~1.3 GB".into(), + model_id: super::models::BARTOWSKI_QWEN3_8B_GGUF.into(), + files: vec![super::models::QWEN3_8B_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_8B_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 8B (Q4_K_M)".into(), + approx_memory: "~5 GB".into(), chat_template: None, } } - /// Qwen 3 14B Instruct (GGUF Q4_K_M) — ~8.4 GB. + /// Qwen 3 14B (GGUF Q4_K_M) — ~8.4 GB. /// - /// Strongest reasoning and tool-calling model with extended thinking. + /// Strong reasoning and tool-calling model with extended thinking. /// Best all-around model for macOS with 16+ GB RAM. pub fn qwen3_14b() -> Self { Self { model_id: super::models::BARTOWSKI_QWEN3_14B_GGUF.into(), files: vec![super::models::QWEN3_14B_GGUF_FILE.into()], - tok_model_id: None, + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_14B_TOK_MODEL_ID.into()) + } else { + None + }, display_name: "Qwen 3 14B (Q4_K_M)".into(), approx_memory: "~8.4 GB".into(), chat_template: None, } } - /// Strong tool-calling model with extended thinking. Best balance of - /// quality and memory for macOS with 24+ GB RAM. - pub fn qwen3_8b() -> Self { + /// Qwen 3 32B (GGUF Q4_K_M) — largest dense Qwen 3, ~19.8 GB. + /// + /// Highest-quality dense Qwen 3 variant. Requires a high-memory desktop + /// (32+ GB RAM / unified memory). Extended thinking and full tool calling. 
+ pub fn qwen3_32b() -> Self { Self { - model_id: super::models::BARTOWSKI_QWEN3_8B_GGUF.into(), - files: vec![super::models::QWEN3_8B_GGUF_FILE.into()], - tok_model_id: None, - display_name: "Qwen 3 8B (Q4_K_M)".into(), - approx_memory: "~5 GB".into(), + model_id: super::models::BARTOWSKI_QWEN3_32B_GGUF.into(), + files: vec![super::models::QWEN3_32B_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_32B_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 32B (Q4_K_M)".into(), + approx_memory: "~19.8 GB".into(), + chat_template: None, + } + } + + /// Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) — latest non-thinking 4B, ~2.5 GB. + /// + /// Updated instruction-tuned checkpoint that does *not* emit a thinking + /// block, making it faster and more predictable for chat and tool use. + /// Recommended general-purpose Qwen 3 model. + pub fn qwen3_4b_instruct_2507() -> Self { + Self { + model_id: super::models::BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF.into(), + files: vec![super::models::QWEN3_4B_INSTRUCT_2507_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 4B Instruct 2507 (Q4_K_M)".into(), + approx_memory: "~2.5 GB".into(), + chat_template: None, + } + } + + /// Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) — latest reasoning 4B, ~2.5 GB. + /// + /// Updated thinking-only checkpoint with stronger reasoning and tool-use + /// accuracy. Always emits a thinking block — load with `max_tokens ≥ 4096`. 
+ pub fn qwen3_4b_thinking_2507() -> Self { + Self { + model_id: super::models::BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF.into(), + files: vec![super::models::QWEN3_4B_THINKING_2507_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_4B_THINKING_2507_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 4B Thinking 2507 (Q4_K_M)".into(), + approx_memory: "~2.5 GB".into(), + chat_template: None, + } + } + + /// Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M) — flagship MoE, ~18.6 GB. + /// + /// Mixture-of-experts model: 30B total parameters, ~3B active per token, so + /// inference is far cheaper than a 30B dense model while quality rivals it. + /// Loads via the `qwen3moe` GGUF architecture. Requires 32+ GB RAM. + pub fn qwen3_30b_a3b_instruct_2507() -> Self { + Self { + model_id: super::models::BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF.into(), + files: vec![super::models::QWEN3_30B_A3B_INSTRUCT_2507_GGUF_FILE.into()], + tok_model_id: if cfg!(target_os = "android") { + Some(super::models::QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID.into()) + } else { + None + }, + display_name: "Qwen 3 30B-A3B Instruct 2507 (Q4_K_M)".into(), + approx_memory: "~18.6 GB".into(), chat_template: None, } } @@ -2172,6 +2286,37 @@ mod tests { assert_eq!(cfg.files.len(), 1); } + #[test] + fn gguf_model_config_qwen3_variants() { + // Every Qwen 3 constructor must produce a valid, single-file config + // whose repo ID is registered in the supported-models list. 
+ let configs = [ + GgufModelConfig::qwen3_0_6b(), + GgufModelConfig::qwen3_1_7b(), + GgufModelConfig::qwen3_4b(), + GgufModelConfig::qwen3_8b(), + GgufModelConfig::qwen3_14b(), + GgufModelConfig::qwen3_32b(), + GgufModelConfig::qwen3_4b_instruct_2507(), + GgufModelConfig::qwen3_4b_thinking_2507(), + GgufModelConfig::qwen3_30b_a3b_instruct_2507(), + ]; + for cfg in &configs { + assert!( + cfg.model_id.contains("Qwen3"), + "unexpected id: {}", + cfg.model_id + ); + assert_eq!(cfg.files.len(), 1); + assert!(cfg.files[0].ends_with("-Q4_K_M.gguf")); + assert!( + super::super::models::SUPPORTED_MODELS.contains(&cfg.model_id.as_str()), + "{} missing from SUPPORTED_MODELS", + cfg.model_id + ); + } + } + #[test] fn gguf_model_config_platform_default() { let cfg = GgufModelConfig::platform_default(); diff --git a/src/inference/ffi.rs b/src/inference/ffi.rs index 376aff0..12c5036 100644 --- a/src/inference/ffi.rs +++ b/src/inference/ffi.rs @@ -326,6 +326,66 @@ pub fn qwen25_3b_config() -> GgufModelConfig { GgufModelConfig::qwen25_3b() } +// ── Qwen 3 family ──────────────────────────────────────────────────────────── + +/// Return the Qwen 3 0.6B GGUF model configuration (~0.5 GB). +#[uniffi::export] +pub fn qwen3_0_6b_config() -> GgufModelConfig { + GgufModelConfig::qwen3_0_6b() +} + +/// Return the Qwen 3 1.7B GGUF model configuration (~1.3 GB). +#[uniffi::export] +pub fn qwen3_1_7b_config() -> GgufModelConfig { + GgufModelConfig::qwen3_1_7b() +} + +/// Return the Qwen 3 4B GGUF model configuration (~2.7 GB). +#[uniffi::export] +pub fn qwen3_4b_config() -> GgufModelConfig { + GgufModelConfig::qwen3_4b() +} + +/// Return the Qwen 3 8B GGUF model configuration (~5 GB). +#[uniffi::export] +pub fn qwen3_8b_config() -> GgufModelConfig { + GgufModelConfig::qwen3_8b() +} + +/// Return the Qwen 3 14B GGUF model configuration (~8.4 GB). 
+#[uniffi::export] +pub fn qwen3_14b_config() -> GgufModelConfig { + GgufModelConfig::qwen3_14b() +} + +/// Return the Qwen 3 32B GGUF model configuration (~19.8 GB). +#[uniffi::export] +pub fn qwen3_32b_config() -> GgufModelConfig { + GgufModelConfig::qwen3_32b() +} + +/// Return the Qwen 3 4B Instruct 2507 GGUF model configuration (~2.5 GB). +/// +/// Latest non-thinking 4B checkpoint — recommended general-purpose Qwen 3 model. +#[uniffi::export] +pub fn qwen3_4b_instruct_2507_config() -> GgufModelConfig { + GgufModelConfig::qwen3_4b_instruct_2507() +} + +/// Return the Qwen 3 4B Thinking 2507 GGUF model configuration (~2.5 GB). +/// +/// Latest reasoning-focused 4B checkpoint; load with `max_tokens ≥ 4096`. +#[uniffi::export] +pub fn qwen3_4b_thinking_2507_config() -> GgufModelConfig { + GgufModelConfig::qwen3_4b_thinking_2507() +} + +/// Return the Qwen 3 30B-A3B Instruct 2507 (MoE) GGUF model configuration (~18.6 GB). +#[uniffi::export] +pub fn qwen3_30b_a3b_instruct_2507_config() -> GgufModelConfig { + GgufModelConfig::qwen3_30b_a3b_instruct_2507() +} + /// Return default sampling parameters for creative chat. #[uniffi::export] pub fn default_sampling_config() -> SamplingConfig { diff --git a/src/inference/models.rs b/src/inference/models.rs index 31f83c6..b56e5bd 100644 --- a/src/inference/models.rs +++ b/src/inference/models.rs @@ -83,16 +83,47 @@ pub const QWEN25_3B_GGUF_FILE: &str = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"; /// Base model repo used for the HF tokenizer (tokenizer.json + tokenizer_config.json). pub const QWEN25_3B_TOK_MODEL_ID: &str = "Qwen/Qwen2.5-3B-Instruct"; -/// Pre-quantized Qwen 3 4B Instruct (GGUF Q4_K_M) — full OpenAI-compatible tool calling (~2.7 GB). +// ── Qwen 3 family (GGUF Q4_K_M) ────────────────────────────────────────────── +// +// The Qwen 3 line uses the `qwen3` GGUF architecture (and `qwen3moe` for the +// 30B-A3B mixture-of-experts variant), both supported by mistral.rs's quantized +// loader. 
Apart from the non-thinking `-Instruct-2507` checkpoints, Qwen 3 models emit an +// extended thinking block before the reply; load them with `max_tokens ≥ 4096` so the thinking +// block does not exhaust the token budget before the real reply. +// +// bartowski's repos embed the tokenizer and chat template inside the GGUF, so on +// iOS/macOS no separate tokenizer download is needed. On Android the candle GGUF +// backend cannot parse the embedded tokenizer, so each model also declares a +// `TOK_MODEL_ID` pointing at the official Qwen repo for a standalone tokenizer. + +/// Pre-quantized Qwen 3 0.6B (GGUF Q4_K_M) — smallest Qwen 3 variant (~0.5 GB). /// -/// Qwen 3 uses an extended thinking mode (``) that significantly improves -/// reasoning and tool-use accuracy. Load with `max_tokens ≥ 4096` to avoid empty replies caused -/// by the model exhausting its token budget on thinking before producing a response. +/// Lightest tool-capable Qwen 3 model. Suitable for tvOS and the most +/// memory-constrained mobile devices where even the 1.7B is too large. +pub const BARTOWSKI_QWEN3_0_6B_GGUF: &str = "bartowski/Qwen_Qwen3-0.6B-GGUF"; +/// The specific GGUF filename for the bartowski Qwen 3 0.6B repo. +pub const QWEN3_0_6B_GGUF_FILE: &str = "Qwen_Qwen3-0.6B-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_0_6B_TOK_MODEL_ID: &str = "Qwen/Qwen3-0.6B"; + +/// Pre-quantized Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling model (~1.3 GB). +/// +/// Smallest Qwen 3 variant with comfortable tool calling. Suitable for mobile +/// devices where the 4B model would be too large. +pub const BARTOWSKI_QWEN3_1_7B_GGUF: &str = "bartowski/Qwen_Qwen3-1.7B-GGUF"; +/// The specific GGUF filename for the Qwen3 1.7B repo. +pub const QWEN3_1_7B_GGUF_FILE: &str = "Qwen_Qwen3-1.7B-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. 
+pub const QWEN3_1_7B_TOK_MODEL_ID: &str = "Qwen/Qwen3-1.7B"; + +/// Pre-quantized Qwen 3 4B (GGUF Q4_K_M) — full OpenAI-compatible tool calling (~2.7 GB). /// /// Recommended model for siGit Code (coding agent with tool calling on macOS/Linux/Windows). pub const BARTOWSKI_QWEN3_4B_GGUF: &str = "bartowski/Qwen_Qwen3-4B-GGUF"; /// The specific GGUF filename to download from the bartowski Qwen 3 4B repo. pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_4B_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B"; /// Pre-quantized Qwen 3 8B (GGUF Q4_K_M) — strong tool-calling model (~5 GB). /// @@ -100,22 +131,70 @@ pub const QWEN3_4B_GGUF_FILE: &str = "Qwen_Qwen3-4B-Q4_K_M.gguf"; /// Full tool calling and extended thinking mode support. pub const BARTOWSKI_QWEN3_8B_GGUF: &str = "bartowski/Qwen_Qwen3-8B-GGUF"; pub const QWEN3_8B_GGUF_FILE: &str = "Qwen_Qwen3-8B-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_8B_TOK_MODEL_ID: &str = "Qwen/Qwen3-8B"; /// Pre-quantized Qwen 3 14B (GGUF Q4_K_M) — strong reasoning and tool-calling model (~8.4 GB). /// -/// Qwen 3 uses extended thinking mode (``) for improved reasoning. /// Best all-around model for macOS with 16+ GB RAM. Full tool calling support. pub const BARTOWSKI_QWEN3_14B_GGUF: &str = "bartowski/Qwen_Qwen3-14B-GGUF"; /// The specific GGUF filename. pub const QWEN3_14B_GGUF_FILE: &str = "Qwen_Qwen3-14B-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_14B_TOK_MODEL_ID: &str = "Qwen/Qwen3-14B"; -/// Pre-quantized Qwen 3 1.7B (GGUF Q4_K_M) — lightweight tool-calling model (~1.3 GB). +/// Pre-quantized Qwen 3 32B (GGUF Q4_K_M) — largest dense Qwen 3 model (~19.8 GB). /// -/// Smallest Qwen 3 variant with tool calling support. Suitable for mobile devices -/// where the 4B model would be too large. 
-pub const BARTOWSKI_QWEN3_1_7B_GGUF: &str = "bartowski/Qwen_Qwen3-1.7B-GGUF"; -/// The specific GGUF filename for the Qwen3 1.7B repo. -pub const QWEN3_1_7B_GGUF_FILE: &str = "Qwen_Qwen3-1.7B-Q4_K_M.gguf"; +/// Highest-quality dense Qwen 3 variant. Requires a high-memory desktop +/// (32+ GB RAM / unified memory). Full tool calling and extended thinking. +pub const BARTOWSKI_QWEN3_32B_GGUF: &str = "bartowski/Qwen_Qwen3-32B-GGUF"; +/// The specific GGUF filename for the bartowski Qwen 3 32B repo. +pub const QWEN3_32B_GGUF_FILE: &str = "Qwen_Qwen3-32B-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_32B_TOK_MODEL_ID: &str = "Qwen/Qwen3-32B"; + +// ── Qwen 3 "2507" updated releases ─────────────────────────────────────────── +// +// The July/August 2025 refresh split Qwen 3 into dedicated non-thinking +// (`-Instruct-2507`) and thinking-only (`-Thinking-2507`) checkpoints with +// markedly improved instruction following, tool use, and long-context quality. +// These are the latest open Qwen 3 weights available as on-device GGUF. + +/// Pre-quantized Qwen 3 4B Instruct 2507 (GGUF Q4_K_M) — latest non-thinking 4B (~2.5 GB). +/// +/// Updated instruction-tuned checkpoint. Unlike the base 4B it does *not* emit a +/// thinking block, so it is faster and more predictable for chat and tool use. +pub const BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF: &str = + "bartowski/Qwen_Qwen3-4B-Instruct-2507-GGUF"; +/// The specific GGUF filename for the bartowski Qwen 3 4B Instruct 2507 repo. +pub const QWEN3_4B_INSTRUCT_2507_GGUF_FILE: &str = "Qwen_Qwen3-4B-Instruct-2507-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B-Instruct-2507"; + +/// Pre-quantized Qwen 3 4B Thinking 2507 (GGUF Q4_K_M) — latest reasoning 4B (~2.5 GB). +/// +/// Updated thinking-only checkpoint with stronger reasoning and tool-use accuracy. 
+/// Always emits a thinking block — load with `max_tokens ≥ 4096`. +pub const BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF: &str = + "bartowski/Qwen_Qwen3-4B-Thinking-2507-GGUF"; +/// The specific GGUF filename for the bartowski Qwen 3 4B Thinking 2507 repo. +pub const QWEN3_4B_THINKING_2507_GGUF_FILE: &str = "Qwen_Qwen3-4B-Thinking-2507-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_4B_THINKING_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-4B-Thinking-2507"; + +/// Pre-quantized Qwen 3 30B-A3B Instruct 2507 (GGUF Q4_K_M) — flagship MoE (~18.6 GB). +/// +/// Mixture-of-experts model: 30B total parameters but only ~3B active per token, +/// so inference is far cheaper than a 30B dense model while quality rivals it. +/// Loads via the `qwen3moe` GGUF architecture in mistral.rs. Requires a +/// high-memory desktop (32+ GB RAM / unified memory). +pub const BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF: &str = + "bartowski/Qwen_Qwen3-30B-A3B-Instruct-2507-GGUF"; +/// The specific GGUF filename for the bartowski Qwen 3 30B-A3B Instruct 2507 repo. +pub const QWEN3_30B_A3B_INSTRUCT_2507_GGUF_FILE: &str = + "Qwen_Qwen3-30B-A3B-Instruct-2507-Q4_K_M.gguf"; +/// Base model repo used for the HF tokenizer on Android. +pub const QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID: &str = "Qwen/Qwen3-30B-A3B-Instruct-2507"; /// DeepSeek Coder v1 6.7B Instruct (GGUF Q4_K_M) — dedicated code generation model (~3.8 GB). 
/// @@ -136,10 +215,15 @@ pub const SUPPORTED_MODELS: &[&str] = &[ BARTOWSKI_QWEN25_0_5B_INSTRUCT_GGUF, BARTOWSKI_QWEN25_1_5B_INSTRUCT_GGUF, BARTOWSKI_QWEN25_3B_INSTRUCT_GGUF, + BARTOWSKI_QWEN3_0_6B_GGUF, + BARTOWSKI_QWEN3_1_7B_GGUF, BARTOWSKI_QWEN3_4B_GGUF, BARTOWSKI_QWEN3_8B_GGUF, BARTOWSKI_QWEN3_14B_GGUF, - BARTOWSKI_QWEN3_1_7B_GGUF, + BARTOWSKI_QWEN3_32B_GGUF, + BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF, + BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF, + BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF, BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF, THEBLOKE_DEEPSEEK_CODER_6_7B_INSTRUCT_GGUF, ]; @@ -197,6 +281,23 @@ pub const SUPPORTED_MODEL_INFO: &[SupportedModelInfo] = &[ // Exact file size from HuggingFace API siblings[].size. expected_size_bytes: 1_929_903_264, }, + SupportedModelInfo { + id: BARTOWSKI_QWEN3_0_6B_GGUF, + name: "Qwen 3 0.6B (GGUF)", + org: "Qwen / Alibaba", + description: "Smallest Qwen 3 variant with tool calling (~0.5 GB). \ + Suitable for tvOS and the most memory-constrained mobile devices.", + // Approximate Q4_K_M size; HF API unreachable at authoring time. + expected_size_bytes: 483_000_000, + }, + SupportedModelInfo { + id: BARTOWSKI_QWEN3_1_7B_GGUF, + name: "Qwen 3 1.7B (GGUF)", + org: "Qwen / Alibaba", + description: "Lightweight tool-calling model for mobile (~1.3 GB). \ + Smallest Qwen 3 variant with comfortable tool calling support.", + expected_size_bytes: 1_282_439_584, + }, SupportedModelInfo { id: BARTOWSKI_QWEN3_4B_GGUF, name: "Qwen 3 4B (GGUF)", @@ -225,12 +326,40 @@ pub const SUPPORTED_MODEL_INFO: &[SupportedModelInfo] = &[ expected_size_bytes: 9_001_753_632, }, SupportedModelInfo { - id: BARTOWSKI_QWEN3_1_7B_GGUF, - name: "Qwen 3 1.7B (GGUF)", + id: BARTOWSKI_QWEN3_32B_GGUF, + name: "Qwen 3 32B (GGUF)", org: "Qwen / Alibaba", - description: "Lightweight tool-calling model for mobile (~1.3 GB). 
\ - Smallest Qwen 3 variant with tool calling support.", - expected_size_bytes: 1_282_439_584, + description: "Largest dense Qwen 3 model with extended thinking (~19.8 GB). \ + Highest-quality dense variant; requires 32+ GB RAM.", + // Approximate Q4_K_M size; HF API unreachable at authoring time. + expected_size_bytes: 21_260_000_000, + }, + SupportedModelInfo { + id: BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF, + name: "Qwen 3 4B Instruct 2507 (GGUF)", + org: "Qwen / Alibaba", + description: "Latest non-thinking 4B checkpoint — faster, predictable chat and \ + tool use (~2.5 GB). Recommended general-purpose Qwen 3 model.", + // Approximate Q4_K_M size; HF API unreachable at authoring time. + expected_size_bytes: 2_500_000_000, + }, + SupportedModelInfo { + id: BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF, + name: "Qwen 3 4B Thinking 2507 (GGUF)", + org: "Qwen / Alibaba", + description: "Latest reasoning-focused 4B checkpoint with extended thinking (~2.5 GB). \ + Stronger reasoning and tool-use accuracy; load with max_tokens ≥ 4096.", + // Approximate Q4_K_M size; HF API unreachable at authoring time. + expected_size_bytes: 2_500_000_000, + }, + SupportedModelInfo { + id: BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF, + name: "Qwen 3 30B-A3B Instruct 2507 (GGUF)", + org: "Qwen / Alibaba", + description: "Flagship mixture-of-experts model: 30B total / ~3B active (~18.6 GB). \ + Near-dense quality at far lower inference cost; requires 32+ GB RAM.", + // Approximate Q4_K_M size; HF API unreachable at authoring time. 
+ expected_size_bytes: 19_971_600_000, }, SupportedModelInfo { id: BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF, @@ -264,7 +393,48 @@ pub fn tok_model_id_for_repo(hf_repo_id: &str) -> Option<&'static str> { BARTOWSKI_QWEN25_CODER_1_5B_INSTRUCT_GGUF => Some(QWEN25_CODER_1_5B_TOK_MODEL_ID), BARTOWSKI_QWEN25_CODER_3B_INSTRUCT_GGUF => Some(QWEN25_CODER_3B_TOK_MODEL_ID), BARTOWSKI_QWEN25_CODER_7B_INSTRUCT_GGUF => Some(QWEN25_CODER_7B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_0_6B_GGUF => Some(QWEN3_0_6B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_1_7B_GGUF => Some(QWEN3_1_7B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_4B_GGUF => Some(QWEN3_4B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_8B_GGUF => Some(QWEN3_8B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_14B_GGUF => Some(QWEN3_14B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_32B_GGUF => Some(QWEN3_32B_TOK_MODEL_ID), + BARTOWSKI_QWEN3_4B_INSTRUCT_2507_GGUF => Some(QWEN3_4B_INSTRUCT_2507_TOK_MODEL_ID), + BARTOWSKI_QWEN3_4B_THINKING_2507_GGUF => Some(QWEN3_4B_THINKING_2507_TOK_MODEL_ID), + BARTOWSKI_QWEN3_30B_A3B_INSTRUCT_2507_GGUF => { + Some(QWEN3_30B_A3B_INSTRUCT_2507_TOK_MODEL_ID) + } THEBLOKE_DEEPSEEK_CODER_6_7B_INSTRUCT_GGUF => Some(DEEPSEEK_CODER_6_7B_TOK_MODEL_ID), _ => None, } } + +#[cfg(test)] +mod tests { + use super::*; + + /// Every supported model must have a corresponding display-metadata entry + /// so the model-list UI can render it. + #[test] + fn every_supported_model_has_info() { + for id in SUPPORTED_MODELS { + assert!( + SUPPORTED_MODEL_INFO.iter().any(|info| info.id == *id), + "{id} is in SUPPORTED_MODELS but missing from SUPPORTED_MODEL_INFO" + ); + } + } + + /// Every Qwen 3 repo must resolve to an Android tokenizer ID so loading + /// works on the candle GGUF backend. + #[test] + fn qwen3_repos_have_android_tokenizer() { + let qwen3 = SUPPORTED_MODELS.iter().filter(|id| id.contains("Qwen3")); + for id in qwen3 { + assert!( + tok_model_id_for_repo(id).is_some(), + "{id} has no Android tokenizer mapping" + ); + } + } +}