From a86276306e3e780e4804214e034f8fc6d852b317 Mon Sep 17 00:00:00 2001
From: DeadByDawn101
Date: Fri, 12 Jun 2026 06:06:09 +0000
Subject: [PATCH 1/2] feat: Add Qwen3.6 MoE (qwen3_5_moe) GGUF export support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds GGUF export support for Qwen3.6 35B-A3B and other Qwen3.5/3.6
Mixture-of-Experts models.

Changes:
- fuse.py: Add 'qwen3_5_moe' to supported model types for --export-gguf
- gguf.py: Add tensor name mappings for Qwen3.6 MoE architecture:
  - Strip 'language_model.' prefix (ConditionalGeneration wrapper)
  - Map switch_mlp.{gate_up,down}_proj → ffn_{gate_up,down}_exps
  - Map shared_expert.{gate,down,up}_proj → ffn_{gate,down,up}_shexp
  - Map shared_expert_gate → ffn_gate_inp_shexp
  - Map mlp.gate → ffn_gate_inp (MoE router). The match is anchored to
    the whole name component so that the dense mlp.gate_proj tensor of
    llama/mistral models still maps to ffn_gate.
  - Map linear_attn (Gated DeltaNet) tensor names onto ssm_*/attn_* names
- gguf.py: Pre-process gate_proj + up_proj fusion into gate_up_proj
  before name translation (Qwen3.6 stores these separately but GGUF
  expects them concatenated along the intermediate_size dimension)

Background:
Qwen3.6 MoE uses a hybrid architecture (Gated DeltaNet, softmax attention
and MoE with shared experts) that has different tensor naming conventions
than Mixtral-style MoE models. The key differences are:
  1. 'switch_mlp' instead of 'block_sparse_moe.experts.{n}'
  2. Merged 3D expert tensors instead of per-expert 2D tensors
  3. Separate gate_proj and up_proj that need pre-fusion
  4. 'language_model.' prefix from the ConditionalGeneration wrapper
  5. Linear attention (Gated DeltaNet) tensors alongside standard attention

Tested with: Qwen3.6-35B-A3B fine-tuned models fused via mlx_lm.fuse

Co-authored-by: Claude (Anthropic)
---
 mlx_lm/fuse.py |  2 +-
 mlx_lm/gguf.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/mlx_lm/fuse.py b/mlx_lm/fuse.py
index 87f667752..75fdabadf 100644
--- a/mlx_lm/fuse.py
+++ b/mlx_lm/fuse.py
@@ -92,7 +92,7 @@ def main() -> None:
 
     if args.export_gguf:
         model_type = config["model_type"]
-        if model_type not in ["llama", "mixtral", "mistral"]:
+        if model_type not in ["llama", "mixtral", "mistral", "qwen3_5_moe"]:
             raise ValueError(
                 f"Model type {model_type} not supported for GGUF conversion."
             )
diff --git a/mlx_lm/gguf.py b/mlx_lm/gguf.py
index 241ac35a1..37e477e7b 100644
--- a/mlx_lm/gguf.py
+++ b/mlx_lm/gguf.py
@@ -101,6 +101,9 @@ def load(path: Path) -> "HfVocab":
 
 
 def translate_weight_names(name):
+    # Strip language_model. prefix (Qwen3.6 ConditionalGeneration wrapper)
+    name = name.replace("language_model.", "")
+
     name = name.replace("model.layers.", "blk.")
     # for mixtral gate
     name = name.replace("block_sparse_moe.gate", "ffn_gate_inp")
@@ -115,6 +118,27 @@ def translate_weight_names(name):
     replacement = r"ffn_up.\1.weight"
     name = re.sub(pattern, replacement, name)
 
+    # for Qwen3.6 MoE (switch_mlp merged expert tensors)
+    name = name.replace("mlp.switch_mlp.gate_up_proj", "ffn_gate_up_exps")
+    name = name.replace("mlp.switch_mlp.down_proj", "ffn_down_exps")
+    # Qwen3.6 shared experts
+    name = name.replace("mlp.shared_expert.gate_proj", "ffn_gate_shexp")
+    name = name.replace("mlp.shared_expert.down_proj", "ffn_down_shexp")
+    name = name.replace("mlp.shared_expert.up_proj", "ffn_up_shexp")
+    name = name.replace("mlp.shared_expert_gate", "ffn_gate_inp_shexp")
+    # Qwen3.6 MoE router; anchored so the dense "mlp.gate_proj" is left alone
+    name = re.sub(r"mlp\.gate(?=\.|$)", "ffn_gate_inp", name)
+    # Qwen3.6 linear attention (Gated DeltaNet), mapped onto ssm_*/attn_* names
+    name = name.replace("linear_attn.A_log", "ssm_a")
+    name = name.replace("linear_attn.conv1d", "ssm_conv1d")
+    name = name.replace("linear_attn.dt_bias", "ssm_dt.bias")
+    name = name.replace("linear_attn.in_proj_a", "ssm_alpha")
+    name = name.replace("linear_attn.in_proj_b", "ssm_beta")
+    name = name.replace("linear_attn.in_proj_qkv", "attn_qkv")
+    name = name.replace("linear_attn.in_proj_z", "attn_gate")
+    name = name.replace("linear_attn.norm", "ssm_norm")
+    name = name.replace("linear_attn.out_proj", "ssm_out")
+
     name = name.replace("mlp.gate_proj", "ffn_gate")
     name = name.replace("mlp.down_proj", "ffn_down")
     name = name.replace("mlp.up_proj", "ffn_up")
@@ -291,6 +315,27 @@ def convert_to_gguf(
         for k, v in weights.items()
     }
 
+    # Pre-process Qwen3.6 MoE: fuse gate_proj + up_proj → gate_up_proj
+    # switch_mlp stores gate and up projections as separate tensors,
+    # but GGUF expects them concatenated as gate_up_proj
+    fused_weights = {}
+    skip_keys = set()
+    for k, v in weights.items():
+        if "switch_mlp.gate_proj" in k:
+            up_key = k.replace("gate_proj", "up_proj")
+            if up_key in weights:
+                cat_dim = 1 if v.ndim == 3 else 0
+                fused = mx.concatenate([v, weights[up_key]], axis=cat_dim)
+                fused_key = k.replace("gate_proj", "gate_up_proj")
+                fused_weights[fused_key] = fused
+                skip_keys.add(k)
+                skip_keys.add(up_key)
+    if fused_weights:
+        weights = {
+            **(fused_weights),
+            **{k: v for k, v in weights.items() if k not in skip_keys},
+        }
+
     # rename weights for gguf format
     weights = {translate_weight_names(k): v for k, v in weights.items()}
 

From f92e5adc4cc5dba324920b22265f8909869c7e7f Mon Sep 17 00:00:00 2001
From: DeadByDawn101
Date: Fri, 12 Jun 2026 07:51:19 +0000
Subject: [PATCH 2/2] test: Add tests for Qwen3.6 MoE GGUF conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tests added:
- translate_weight_names strips language_model. prefix and still maps
  the dense mlp.gate_proj to ffn_gate
- translate_weight_names maps switch_mlp → ffn_*_exps
- translate_weight_names maps shared_expert → ffn_*_shexp
- translate_weight_names maps MoE router (mlp.gate)
- translate_weight_names maps linear_attn (SSM) tensors
- gate_proj + up_proj fusion produces correct gate_up_proj shape
---
 tests/test_gguf.py | 105 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/tests/test_gguf.py b/tests/test_gguf.py
index f7e789a00..21d220996 100644
--- a/tests/test_gguf.py
+++ b/tests/test_gguf.py
@@ -60,3 +60,108 @@ def test_convert_to_gguf(
 
 if __name__ == "__main__":
     unittest.main()
+
+
+class TestQwen36MoETensorMapping(unittest.TestCase):
+    """Tests for Qwen3.6 MoE (qwen3_5_moe) GGUF conversion support."""
+
+    def test_translate_weight_names_strips_language_model_prefix(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "language_model.model.layers.0.mlp.gate_proj.weight"
+        result = translate_weight_names(name)
+        self.assertNotIn("language_model", result)
+        self.assertIn("blk.0.ffn_gate.", result)
+
+    def test_translate_weight_names_maps_switch_mlp(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "model.layers.0.mlp.switch_mlp.down_proj.weight"
+        result = translate_weight_names(name)
+        self.assertIn("ffn_down_exps", result)
+        self.assertNotIn("switch_mlp", result)
+
+    def test_translate_weight_names_maps_switch_mlp_gate_up(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "model.layers.0.mlp.switch_mlp.gate_up_proj.weight"
+        result = translate_weight_names(name)
+        self.assertIn("ffn_gate_up_exps", result)
+
+    def test_translate_weight_names_maps_shared_expert(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        gate = translate_weight_names(
+            "model.layers.0.mlp.shared_expert.gate_proj.weight"
+        )
+        down = translate_weight_names(
+            "model.layers.0.mlp.shared_expert.down_proj.weight"
+        )
+        up = translate_weight_names(
+            "model.layers.0.mlp.shared_expert.up_proj.weight"
+        )
+        self.assertIn("ffn_gate_shexp", gate)
+        self.assertIn("ffn_down_shexp", down)
+        self.assertIn("ffn_up_shexp", up)
+
+    def test_translate_weight_names_maps_moe_router(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "model.layers.0.mlp.gate.weight"
+        result = translate_weight_names(name)
+        self.assertIn("ffn_gate_inp", result)
+
+    def test_translate_weight_names_maps_linear_attn(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        a_log = translate_weight_names("model.layers.0.linear_attn.A_log")
+        conv1d = translate_weight_names(
+            "model.layers.0.linear_attn.conv1d.weight"
+        )
+        self.assertIn("ssm_a", a_log)
+        self.assertIn("ssm_conv1d", conv1d)
+
+    def test_gate_up_proj_fusion_in_convert(self):
+        """Test that switch_mlp.gate_proj + up_proj are fused before name translation."""
+        # Simulate 3D expert tensors [n_experts, intermediate, hidden]
+        gate = mx.random.uniform(shape=[4, 512, 2048])
+        up = mx.random.uniform(shape=[4, 512, 2048])
+
+        # Simulate the fusion logic from convert_to_gguf
+        weights = {
+            "model.layers.0.mlp.switch_mlp.gate_proj.weight": gate,
+            "model.layers.0.mlp.switch_mlp.up_proj.weight": up,
+            "model.layers.0.mlp.switch_mlp.down_proj.weight": mx.random.uniform(
+                shape=[4, 2048, 512]
+            ),
+        }
+
+        # Apply fusion (same logic as in convert_to_gguf)
+        fused_weights = {}
+        skip_keys = set()
+        for k, v in weights.items():
+            if "switch_mlp.gate_proj" in k:
+                up_key = k.replace("gate_proj", "up_proj")
+                if up_key in weights:
+                    cat_dim = 1 if v.ndim == 3 else 0
+                    fused = mx.concatenate([v, weights[up_key]], axis=cat_dim)
+                    fused_key = k.replace("gate_proj", "gate_up_proj")
+                    fused_weights[fused_key] = fused
+                    skip_keys.add(k)
+                    skip_keys.add(up_key)
+        if fused_weights:
+            weights = {
+                **(fused_weights),
+                **{k: v for k, v in weights.items() if k not in skip_keys},
+            }
+
+        # Verify fusion result
+        fused_key = "model.layers.0.mlp.switch_mlp.gate_up_proj.weight"
+        self.assertIn(fused_key, weights)
+        self.assertEqual(tuple(weights[fused_key].shape), (4, 1024, 2048))
+        self.assertNotIn(
+            "model.layers.0.mlp.switch_mlp.gate_proj.weight", weights
+        )
+        self.assertNotIn(
+            "model.layers.0.mlp.switch_mlp.up_proj.weight", weights
+        )