From a86276306e3e780e4804214e034f8fc6d852b317 Mon Sep 17 00:00:00 2001
From: DeadByDawn101 <sanjosepcrepair@gmail.com>
Date: Fri, 12 Jun 2026 06:06:09 +0000
Subject: [PATCH 1/2] feat: Add Qwen3.6 MoE (qwen3_5_moe) GGUF export support
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds GGUF export support for Qwen3.6 35B-A3B and other Qwen3.5/3.6
Mixture-of-Experts models.

Changes:
- fuse.py: Add 'qwen3_5_moe' to supported model types for --export-gguf
- gguf.py: Add tensor name mappings for Qwen3.6 MoE architecture:
  - Strip 'language_model.' prefix (ConditionalGeneration wrapper)
  - Map switch_mlp.{gate_up,down}_proj → ffn_{gate_up,down}_exps
  - Map shared_expert.{gate,down,up}_proj → ffn_{gate,down,up}_shexp
  - Map shared_expert_gate → ffn_gate_inp_shexp
  - Map mlp.gate → ffn_gate_inp (MoE router)
  - Map linear_attn (Gated DeltaNet linear attention) tensor names
- gguf.py: Pre-process gate_proj + up_proj fusion into gate_up_proj
  before name translation (Qwen3.6 stores these separately but GGUF
  expects them concatenated along the intermediate_size dimension)

Background:
Qwen3.6 MoE uses a hybrid architecture (Gated DeltaNet + softmax
attention + MoE with shared experts) that has different tensor naming
conventions than Mixtral-style MoE models. The key differences are:
1. 'switch_mlp' instead of 'block_sparse_moe.experts.{n}'
2. Merged 3D expert tensors instead of per-expert 2D tensors
3. Separate gate_proj and up_proj that need pre-fusion
4. 'language_model.' prefix from the ConditionalGeneration wrapper
5. Linear attention (Gated DeltaNet) tensors alongside standard attention

Tested with: Qwen3.6-35B-A3B fine-tuned models fused via mlx_lm.fuse

Co-authored-by: Claude (Anthropic)
---
 mlx_lm/fuse.py |  2 +-
 mlx_lm/gguf.py | 45 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/mlx_lm/fuse.py b/mlx_lm/fuse.py
index 87f667752..75fdabadf 100644
--- a/mlx_lm/fuse.py
+++ b/mlx_lm/fuse.py
@@ -92,7 +92,7 @@ def main() -> None:
 
     if args.export_gguf:
         model_type = config["model_type"]
-        if model_type not in ["llama", "mixtral", "mistral"]:
+        if model_type not in ["llama", "mixtral", "mistral", "qwen3_5_moe"]:
             raise ValueError(
                 f"Model type {model_type} not supported for GGUF conversion."
             )
diff --git a/mlx_lm/gguf.py b/mlx_lm/gguf.py
index 241ac35a1..37e477e7b 100644
--- a/mlx_lm/gguf.py
+++ b/mlx_lm/gguf.py
@@ -101,6 +101,9 @@ def load(path: Path) -> "HfVocab":
 
 
 def translate_weight_names(name):
+    # Strip language_model. prefix (Qwen3.6 ConditionalGeneration wrapper)
+    name = name.replace("language_model.", "")
+
     name = name.replace("model.layers.", "blk.")
     # for mixtral gate
     name = name.replace("block_sparse_moe.gate", "ffn_gate_inp")
@@ -115,6 +118,27 @@ def translate_weight_names(name):
     replacement = r"ffn_up.\1.weight"
     name = re.sub(pattern, replacement, name)
 
+    # for Qwen3.6 MoE (switch_mlp merged expert tensors)
+    name = name.replace("mlp.switch_mlp.gate_up_proj", "ffn_gate_up_exps")
+    name = name.replace("mlp.switch_mlp.down_proj", "ffn_down_exps")
+    # Qwen3.6 shared experts
+    name = name.replace("mlp.shared_expert.gate_proj", "ffn_gate_shexp")
+    name = name.replace("mlp.shared_expert.down_proj", "ffn_down_shexp")
+    name = name.replace("mlp.shared_expert.up_proj", "ffn_up_shexp")
+    name = name.replace("mlp.shared_expert_gate", "ffn_gate_inp_shexp")
+    # Qwen3.6 MoE router; match the whole component so mlp.gate_proj is kept
+    name = re.sub(r"mlp\.gate(?=\.|$)", "ffn_gate_inp", name)
+    # Qwen3.6 linear attention (Gated DeltaNet) tensors
+    name = name.replace("linear_attn.A_log", "ssm_a")
+    name = name.replace("linear_attn.conv1d", "ssm_conv1d")
+    name = name.replace("linear_attn.dt_bias", "ssm_dt.bias")
+    name = name.replace("linear_attn.in_proj_a", "ssm_alpha")
+    name = name.replace("linear_attn.in_proj_b", "ssm_beta")
+    name = name.replace("linear_attn.in_proj_qkv", "attn_qkv")
+    name = name.replace("linear_attn.in_proj_z", "attn_gate")
+    name = name.replace("linear_attn.norm", "ssm_norm")
+    name = name.replace("linear_attn.out_proj", "ssm_out")
+
     name = name.replace("mlp.gate_proj", "ffn_gate")
     name = name.replace("mlp.down_proj", "ffn_down")
     name = name.replace("mlp.up_proj", "ffn_up")
@@ -291,6 +315,27 @@ def convert_to_gguf(
         for k, v in weights.items()
     }
 
+    # Pre-process Qwen3.6 MoE: fuse gate_proj + up_proj → gate_up_proj
+    # switch_mlp stores gate and up projections as separate tensors,
+    # but GGUF expects them concatenated as gate_up_proj
+    fused_weights = {}
+    skip_keys = set()
+    for k, v in weights.items():
+        if "switch_mlp.gate_proj" in k:
+            up_key = k.replace("gate_proj", "up_proj")
+            if up_key in weights:
+                cat_dim = 1 if v.ndim == 3 else 0
+                fused = mx.concatenate([v, weights[up_key]], axis=cat_dim)
+                fused_key = k.replace("gate_proj", "gate_up_proj")
+                fused_weights[fused_key] = fused
+                skip_keys.add(k)
+                skip_keys.add(up_key)
+    if fused_weights:
+        weights = {
+            **(fused_weights),
+            **{k: v for k, v in weights.items() if k not in skip_keys},
+        }
+
     # rename weights for gguf format
     weights = {translate_weight_names(k): v for k, v in weights.items()}
 

From f92e5adc4cc5dba324920b22265f8909869c7e7f Mon Sep 17 00:00:00 2001
From: DeadByDawn101 <sanjosepcrepair@gmail.com>
Date: Fri, 12 Jun 2026 07:51:19 +0000
Subject: [PATCH 2/2] test: Add tests for Qwen3.6 MoE GGUF conversion
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Tests added:
- translate_weight_names strips language_model. prefix
- translate_weight_names maps switch_mlp → ffn_*_exps
- translate_weight_names maps shared_expert → ffn_*_shexp
- translate_weight_names maps MoE router (mlp.gate)
- translate_weight_names maps linear_attn (SSM) tensors
- gate_proj + up_proj fusion produces correct gate_up_proj shape
---
 tests/test_gguf.py | 105 +++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 105 insertions(+)

diff --git a/tests/test_gguf.py b/tests/test_gguf.py
index f7e789a00..21d220996 100644
--- a/tests/test_gguf.py
+++ b/tests/test_gguf.py
@@ -60,3 +60,108 @@ def test_convert_to_gguf(
 
+class TestQwen36MoETensorMapping(unittest.TestCase):
+    """Tests for Qwen3.6 MoE (qwen3_5_moe) GGUF conversion support."""
+
+    def test_translate_weight_names_strips_language_model_prefix(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "language_model.model.layers.0.mlp.gate_proj.weight"
+        result = translate_weight_names(name)
+        self.assertNotIn("language_model", result)
+        self.assertIn("blk.0", result)
+
+    def test_translate_weight_names_maps_switch_mlp(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "model.layers.0.mlp.switch_mlp.down_proj.weight"
+        result = translate_weight_names(name)
+        self.assertIn("ffn_down_exps", result)
+        self.assertNotIn("switch_mlp", result)
+
+    def test_translate_weight_names_maps_switch_mlp_gate_up(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "model.layers.0.mlp.switch_mlp.gate_up_proj.weight"
+        result = translate_weight_names(name)
+        self.assertIn("ffn_gate_up_exps", result)
+
+    def test_translate_weight_names_maps_shared_expert(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        gate = translate_weight_names(
+            "model.layers.0.mlp.shared_expert.gate_proj.weight"
+        )
+        down = translate_weight_names(
+            "model.layers.0.mlp.shared_expert.down_proj.weight"
+        )
+        up = translate_weight_names(
+            "model.layers.0.mlp.shared_expert.up_proj.weight"
+        )
+        self.assertIn("ffn_gate_shexp", gate)
+        self.assertIn("ffn_down_shexp", down)
+        self.assertIn("ffn_up_shexp", up)
+
+    def test_translate_weight_names_maps_moe_router(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        name = "model.layers.0.mlp.gate.weight"
+        result = translate_weight_names(name)
+        self.assertIn("ffn_gate_inp", result)
+
+    def test_translate_weight_names_maps_linear_attn(self):
+        from mlx_lm.gguf import translate_weight_names
+
+        a_log = translate_weight_names("model.layers.0.linear_attn.A_log")
+        conv1d = translate_weight_names(
+            "model.layers.0.linear_attn.conv1d.weight"
+        )
+        self.assertIn("ssm_a", a_log)
+        self.assertIn("ssm_conv1d", conv1d)
+
+    def test_gate_up_proj_fusion_in_convert(self):
+        """Test that switch_mlp.gate_proj + up_proj are fused before name translation."""
+        # Simulate 3D expert tensors [n_experts, intermediate, hidden]
+        gate = mx.random.uniform(shape=[4, 512, 2048])
+        up = mx.random.uniform(shape=[4, 512, 2048])
+
+        # Simulate the fusion logic from convert_to_gguf
+        weights = {
+            "model.layers.0.mlp.switch_mlp.gate_proj.weight": gate,
+            "model.layers.0.mlp.switch_mlp.up_proj.weight": up,
+            "model.layers.0.mlp.switch_mlp.down_proj.weight": mx.random.uniform(
+                shape=[4, 2048, 512]
+            ),
+        }
+
+        # Apply fusion (same logic as in convert_to_gguf)
+        fused_weights = {}
+        skip_keys = set()
+        for k, v in weights.items():
+            if "switch_mlp.gate_proj" in k:
+                up_key = k.replace("gate_proj", "up_proj")
+                if up_key in weights:
+                    cat_dim = 1 if v.ndim == 3 else 0
+                    fused = mx.concatenate([v, weights[up_key]], axis=cat_dim)
+                    fused_key = k.replace("gate_proj", "gate_up_proj")
+                    fused_weights[fused_key] = fused
+                    skip_keys.add(k)
+                    skip_keys.add(up_key)
+        if fused_weights:
+            weights = {
+                **(fused_weights),
+                **{k: v for k, v in weights.items() if k not in skip_keys},
+            }
+
+        # Verify fusion result (shape normalised to a list before comparing)
+        fused_key = "model.layers.0.mlp.switch_mlp.gate_up_proj.weight"
+        self.assertIn(fused_key, weights)
+        self.assertEqual(list(weights[fused_key].shape), [4, 1024, 2048])
+        self.assertNotIn(
+            "model.layers.0.mlp.switch_mlp.gate_proj.weight", weights
+        )
+        self.assertNotIn(
+            "model.layers.0.mlp.switch_mlp.up_proj.weight", weights
+        )
+
+
 if __name__ == "__main__":
     unittest.main()