From 99e5d03e2188623f76509f853e79bc60574b5da6 Mon Sep 17 00:00:00 2001
From: Juste-Leo <leonard.adamo66@gmail.com>
Date: Fri, 8 May 2026 11:08:36 +0200
Subject: [PATCH 01/33] ops: add Conv1dGrouped operation

---
 ggml/include/ggml.h            |  15 ++++
 ggml/src/ggml.c                |  57 ++++++++++++
 tests/test-conv-1d-grouped.cpp | 154 +++++++++++++++++++++++++++++++++
 3 files changed, 226 insertions(+)
 create mode 100644 tests/test-conv-1d-grouped.cpp

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3357a0d9985..fec0287ae00 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2041,6 +2041,21 @@ extern "C" {
             int                   s0,  // stride
             int                   d0); // dilation
 
+    // grouped 1D convolution
+    // a: [K, IC/G, OC]   convolution kernel
+    // b: [L, IC,   N]    data
+    // groups must divide both IC and OC evenly
+    // when groups == 1, equivalent to ggml_conv_1d
+    // when groups == IC == OC, equivalent to ggml_conv_1d_dw
+    GGML_API struct ggml_tensor * ggml_conv_1d_grouped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,      // convolution kernel
+            struct ggml_tensor  * b,      // data
+            int                   s0,     // stride
+            int                   p0,     // padding
+            int                   d0,     // dilation
+            int                   groups); // number of groups
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 191cf2fa106..049f4952047 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4541,6 +4541,63 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
     return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
 }
 
+// ggml_conv_1d_grouped
+
+struct ggml_tensor * ggml_conv_1d_grouped(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0,
+        int                   groups) {
+    // a: [K, IC/groups, OC] kernel, b: [L, IC, N] data; the grouped convolution is
+    // assembled from views, ggml_conv_1d and ggml_concat, one convolution per group
+    GGML_ASSERT(groups > 0);
+
+    const int64_t IC   = b->ne[1]; // input channels over all groups
+    const int64_t OC   = a->ne[2]; // output channels over all groups
+    const int64_t IC_G = a->ne[1]; // input channels seen by each filter
+
+    GGML_ASSERT(IC % groups == 0);
+    GGML_ASSERT(OC % groups == 0);
+    GGML_ASSERT(IC_G == IC / groups);
+
+    if (groups == 1) {
+        // a single group is an ordinary convolution
+        return ggml_conv_1d(ctx, a, b, s0, p0, d0);
+    }
+    if (groups == IC && groups == OC) {
+        // one input and one output channel per group: depthwise convolution
+        return ggml_conv_1d_dw(ctx, a, b, s0, p0, d0);
+    }
+
+    const int64_t OC_G = OC / groups;
+
+    // byte distance between consecutive groups in the kernel (dim 2) and data (dim 1)
+    const size_t a_step = OC_G * a->nb[2];
+    const size_t b_step = IC_G * b->nb[1];
+
+    struct ggml_tensor * acc = NULL;
+
+    for (int g = 0; g < groups; ++g) {
+        // kernel slice [K, IC/groups, OC/groups]: the filters that belong to group g
+        struct ggml_tensor * kern = ggml_view_3d(ctx, a,
+            a->ne[0], IC_G, OC_G, a->nb[1], a->nb[2], g * a_step);
+
+        // data slice [L, IC/groups, N]: the input channels that feed group g
+        struct ggml_tensor * data = ggml_view_3d(ctx, b,
+            b->ne[0], IC_G, b->ne[2], b->nb[1], b->nb[2], g * b_step);
+
+        struct ggml_tensor * part = ggml_conv_1d(ctx, kern, data, s0, p0, d0);
+
+        // append this group's output channels (dim 1) to the ones gathered so far
+        acc = acc == NULL ? part : ggml_concat(ctx, acc, part, 1);
+    }
+
+    return acc;
+}
+
 // ggml_conv_transpose_1d
 
 static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
diff --git a/tests/test-conv-1d-grouped.cpp b/tests/test-conv-1d-grouped.cpp
new file mode 100644
index 00000000000..80b884804ec
--- /dev/null
+++ b/tests/test-conv-1d-grouped.cpp
@@ -0,0 +1,154 @@
+// Test for ggml_conv_1d_grouped
+//
+// Verifies grouped 1D convolution by comparing against manual per-group computation.
+
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <cmath>
+#include <vector>
+
+static void fill_random_f16(ggml_fp16_t * data, int n) {
+    // n values from rand(), mapped to [-1, 1] and stored as half precision
+    for (int left = n; left > 0; --left) {
+        *data++ = ggml_fp32_to_fp16(((float)rand() / RAND_MAX) * 2.0f - 1.0f);
+    }
+}
+
+static void fill_random_f32(float * data, int n) {
+    for (int left = n; left > 0; --left) { // n values from rand(), mapped to [-1, 1]
+        *data++ = ((float)rand() / RAND_MAX) * 2.0f - 1.0f;
+    }
+}
+
+static bool all_close(const float * a, const float * b, int n, float eps = 5e-3f) {
+    for (int i = 0; i < n; i++) { // stops at, and logs, the first element that differs
+        const float diff = fabsf(a[i] - b[i]);
+        if (!(diff <= eps)) { // negated form: a NaN difference must count as a mismatch
+            fprintf(stderr, "    mismatch at [%d]: %.6f vs %.6f (diff=%.6f)\n", i, a[i], b[i], diff);
+            return false;
+        }
+    }
+    return true;
+}
+
+// Compute grouped conv1d on CPU naively for reference
+// kernel (F16): [K, IC_G, OC], input (F32): [L, IC, N], output: [OL, OC, N]
+static void conv1d_grouped_ref(
+        const ggml_fp16_t * kernel, const float * input, float * output,
+        int K, int IC, int OC, int L, int N, int groups, int stride, int padding) {
+    // channels handled by one group, and the output length for dilation 1
+    const int ic_per_grp = IC / groups;
+    const int oc_per_grp = OC / groups;
+    const int out_len    = (L + 2 * padding - K) / stride + 1;
+
+    memset(output, 0, (size_t)out_len * OC * N * sizeof(float));
+
+    // every buffer is dense, with the first listed dimension varying fastest
+    for (int n = 0; n < N; n++) {
+        for (int g = 0; g < groups; g++) {
+            for (int j = 0; j < oc_per_grp; j++) {
+                const int           oc   = g * oc_per_grp + j; // global output channel
+                const ggml_fp16_t * filt = kernel + oc * (ic_per_grp * K);
+                for (int ol = 0; ol < out_len; ol++) {
+                    float acc = 0.0f;
+                    for (int ic = 0; ic < ic_per_grp; ic++) {
+                        const float * src = input + (g * ic_per_grp + ic) * L + n * (IC * L);
+                        for (int k = 0; k < K; k++) {
+                            const int il = ol * stride + k - padding;
+                            if (il < 0 || il >= L) {
+                                continue; // this tap lies in the zero padding
+                            }
+                            const float w = ggml_fp16_to_fp32(filt[k + ic * K]);
+                            acc += w * src[il];
+                        }
+                    }
+                    output[ol + oc * out_len + n * (OC * out_len)] = acc;
+                }
+            }
+        }
+    }
+}
+
+static bool run_test(const char * label, int IC, int OC, int K, int L, int groups, int stride, int padding) {
+    printf("  TEST: %s (IC=%d OC=%d K=%d L=%d G=%d s=%d p=%d)\n",
+           label, IC, OC, K, L, groups, stride, padding);
+
+    const int IC_G = IC / groups;
+    const int OL   = (L + 2 * padding - K) / stride + 1;
+
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ (size_t) 256 * 1024 * 1024,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    struct ggml_context * ctx     = ggml_init(params);
+    ggml_backend_t        backend = ggml_backend_cpu_init();
+    if (ctx == NULL || backend == NULL) {
+        fprintf(stderr, "    FAIL: could not create the ggml context or the CPU backend\n");
+        if (backend != NULL) { ggml_backend_free(backend); }
+        if (ctx     != NULL) { ggml_free(ctx); }
+        return false;
+    }
+
+    // kernel: [K, IC_G, OC] in F16 (like real models), input: [L, IC] in F32
+    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, K, IC_G, OC);
+    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, L, IC);
+
+    fill_random_f16((ggml_fp16_t *)a->data, K * IC_G * OC);
+    fill_random_f32((float *)b->data, L * IC);
+
+    // reference (single batch)
+    std::vector<float> ref(OL * OC);
+    conv1d_grouped_ref((ggml_fp16_t *)a->data, (float *)b->data, ref.data(),
+                       K, IC, OC, L, 1, groups, stride, padding);
+
+    // ggml (dilation 1)
+    struct ggml_tensor * result = ggml_conv_1d_grouped(ctx, a, b, stride, padding, 1, groups);
+    struct ggml_cgraph * gf     = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, result);
+
+    // result->data is only meaningful if the compute succeeded, so check the status
+    bool ok = ggml_backend_graph_compute(backend, gf) == GGML_STATUS_SUCCESS;
+    if (!ok) {
+        fprintf(stderr, "    FAIL: graph compute did not succeed\n");
+    } else if (result->ne[0] != OL || result->ne[1] != OC) {
+        fprintf(stderr, "    FAIL: shape [%lld, %lld], expected [%d, %d]\n",
+                (long long)result->ne[0], (long long)result->ne[1], OL, OC);
+        ok = false;
+    } else {
+        ok = all_close((float *)result->data, ref.data(), OL * OC);
+    }
+
+    printf("    %s\n", ok ? "PASS" : "FAIL");
+    ggml_backend_free(backend);
+    ggml_free(ctx);
+    return ok;
+}
+
+int main(void) {
+    srand(42);
+    printf("Testing ggml_conv_1d_grouped\n\n");
+
+    // one row per scenario, executed in this order (they draw from the same rand() stream)
+    static const struct { const char * label; int IC, OC, K, L, G, s, p; } cases[] = {
+        { "groups=1 (standard conv1d)", 128,  256,  3, 32,  1,  1, 0 },
+        { "ZAYA1-8B exact params",      1280, 1280, 2, 16,  10, 1, 0 },
+        { "small 2 groups",             4,    4,    2, 8,   2,  1, 0 },
+        { "with padding",               8,    8,    2, 16,  4,  1, 1 },
+        { "IC != OC",                   12,   6,    3, 10,  3,  1, 0 },
+        { "stride=2",                   8,    8,    2, 16,  4,  2, 0 },
+        { "longer sequence",            1280, 1280, 2, 128, 10, 1, 0 },
+    };
+    int n_pass = 0, n_fail = 0;
+    for (const auto & c : cases) {
+        if (run_test(c.label, c.IC, c.OC, c.K, c.L, c.G, c.s, c.p)) { n_pass++; } else { n_fail++; }
+    }
+
+    printf("\nResult: %d passed, %d failed\n", n_pass, n_fail);
+    return n_fail > 0 ? 1 : 0;
+}

From e0ac753e404962ace6c6e0535d38657cae7b0283 Mon Sep 17 00:00:00 2001
From: Juste-Leo <leonard.adamo66@gmail.com>
Date: Fri, 8 May 2026 15:07:17 +0200
Subject: [PATCH 02/33] initial implementation

---
 convert_hf_to_gguf.py          |  39 ++++++
 gguf-py/gguf/constants.py      |  28 +++++
 gguf-py/gguf/tensor_mapping.py |  23 ++++
 src/llama-arch.cpp             |  12 ++
 src/llama-arch.h               |   6 +
 src/llama-model.cpp            |   3 +
 src/llama-model.h              |   7 ++
 src/models/models.h            |  13 ++
 src/models/zaya.cpp            | 223 +++++++++++++++++++++++++++++++++
 9 files changed, 354 insertions(+)
 create mode 100644 src/models/zaya.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index fb1f5dd4473..33c74013fb3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6454,6 +6454,45 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("ZayaModel", "ZayaForCausalLM")
+class ZayaModel(TextModel):
+    """Zaya-1 model with Compressed Convolutional Attention"""
+    model_arch = gguf.MODEL_ARCH.ZAYA
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        
+        # ZAYA-specific params if any from config.json (e.g. ssm_d_conv)
+        if "ssm_d_conv" in self.hparams:
+            self.gguf_writer.add_ssm_conv_kernel(self.hparams["ssm_d_conv"])
+        else:
+            # Fallback if config is different
+            self.gguf_writer.add_ssm_conv_kernel(2) # Default for ZAYA1-8B
+            
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Tensors will be automatically mapped based on tensor_mapping.py if they match
+        
+        # We skip MoE FFN weights, unused biases, etc. temporarily since we are using dense FFN
+        skip_keywords = [
+            "zaya_block.experts", 
+            "res_scale.", 
+            "val_proj2"
+        ]
+        
+        if any(kw in name for kw in skip_keywords):
+            logger.info(f"Skipping tensor (dense FFN test): {name}")
+            return
+            
+        try:
+            yield from super().modify_tensors(data_torch, name, bid)
+        except ValueError as e:
+            if "Can not map tensor" in str(e):
+                logger.warning(f"Skipping unmapped tensor: {name}")
+            else:
+                raise
+
+
 @ModelBase.register("InternLM2ForCausalLM")
 class InternLM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.INTERNLM2
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 308ebe1f4a1..13bd3d1c8f0 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -503,6 +503,7 @@ class MODEL_ARCH(IntEnum):
     LLAMA_EMBED      = auto()
     MAINCODER        = auto()
     KIMI_LINEAR      = auto()
+    ZAYA             = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -610,6 +611,10 @@ class MODEL_TENSOR(IntEnum):
     SSM_BETA             = auto() # Kimi Linear qwen3.5
     SSM_G_A              = auto() # Kimi Linear
     SSM_G_B              = auto() # Kimi Linear
+    CCA_CONV_DW          = auto() # Zaya
+    CCA_CONV_GRP         = auto() # Zaya
+    CCA_QK_NORM          = auto() # Zaya
+    CCA_K_SCALE          = auto() # Zaya
     TIME_MIX_W0          = auto()
     TIME_MIX_W1          = auto()
     TIME_MIX_W2          = auto()
@@ -1018,6 +1023,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.LLAMA_EMBED:      "llama-embed",
     MODEL_ARCH.MAINCODER:        "maincoder",
     MODEL_ARCH.KIMI_LINEAR:      "kimi-linear",
+    MODEL_ARCH.ZAYA:             "zaya",
 }
 
 VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = {
@@ -1123,6 +1129,10 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SSM_BETA:                  "blk.{bid}.ssm_beta",             # Kimi Linear qwen3.5
     MODEL_TENSOR.SSM_G_A:                   "blk.{bid}.ssm_g_a",              # Kimi Linear
     MODEL_TENSOR.SSM_G_B:                   "blk.{bid}.ssm_g_b",              # Kimi Linear
+    MODEL_TENSOR.CCA_CONV_DW:               "blk.{bid}.cca_conv_dw",          # Zaya
+    MODEL_TENSOR.CCA_CONV_GRP:              "blk.{bid}.cca_conv_grp",         # Zaya
+    MODEL_TENSOR.CCA_QK_NORM:               "blk.{bid}.cca_qk_norm",          # Zaya
+    MODEL_TENSOR.CCA_K_SCALE:               "blk.{bid}.cca_k_scale",          # Zaya
     MODEL_TENSOR.TIME_MIX_W0:               "blk.{bid}.time_mix_w0",
     MODEL_TENSOR.TIME_MIX_W1:               "blk.{bid}.time_mix_w1",
     MODEL_TENSOR.TIME_MIX_W2:               "blk.{bid}.time_mix_w2",
@@ -3992,6 +4002,24 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_DOWN_SHEXP,
         MODEL_TENSOR.FFN_UP_SHEXP,
     ],
+    MODEL_ARCH.ZAYA: [
+        MODEL_TENSOR.TOKEN_EMBD,
+        MODEL_TENSOR.OUTPUT_NORM,
+        MODEL_TENSOR.OUTPUT,
+        MODEL_TENSOR.ATTN_NORM,
+        MODEL_TENSOR.ATTN_Q,
+        MODEL_TENSOR.ATTN_K,
+        MODEL_TENSOR.ATTN_V,
+        MODEL_TENSOR.ATTN_OUT,
+        MODEL_TENSOR.CCA_CONV_DW,
+        MODEL_TENSOR.CCA_CONV_GRP,
+        MODEL_TENSOR.CCA_QK_NORM,
+        MODEL_TENSOR.CCA_K_SCALE,
+        MODEL_TENSOR.FFN_NORM,
+        MODEL_TENSOR.FFN_GATE,
+        MODEL_TENSOR.FFN_DOWN,
+        MODEL_TENSOR.FFN_UP,
+    ],
     # TODO
 }
 
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f27f0e4c997..db99afd4cbb 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -259,6 +259,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.q_proj",                     # llada
             "layers.{bid}.self_attn.q_proj",                             # qwen3-embedding
             "backbone.layers.{bid}.mixer.q_proj",                        # nemotron-h
+            "model.layers.{bid}.self_attn.qkv.linear_q",                 # Zaya
         ),
 
         # Attention key
@@ -279,6 +280,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.k_proj",                   # llada
             "layers.{bid}.self_attn.k_proj",                           # qwen3-embedding
             "backbone.layers.{bid}.mixer.k_proj",                      # nemotron-h
+            "model.layers.{bid}.self_attn.qkv.linear_k",               # Zaya
         ),
 
         # Attention value
@@ -298,6 +300,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.v_proj",                     # llada
             "layers.{bid}.self_attn.v_proj",                             # qwen3-embedding
             "backbone.layers.{bid}.mixer.v_proj",                        # nemotron-h
+            "model.layers.{bid}.self_attn.qkv.val_proj1",                # Zaya
         ),
 
         # Attention output
@@ -336,6 +339,7 @@ class TensorNameMap:
             "layers.{bid}.self_attn.o_proj",                                # qwen3-embedding
             "backbone.layers.{bid}.mixer.o_proj",                           # nemotron-h
             "model.layers.{bid}.self_attn.language_expert_dense",           # cogvlm
+            "model.layers.{bid}.self_attn.o_proj",                          # Zaya
         ),
 
         # Attention output norm
@@ -854,6 +858,12 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.norm",     # mamba2
             "model.layers.{bid}.self_attn.o_norm",  # kimi
         ),
+        MODEL_TENSOR.ATTN_NORM: (
+            "model.layers.{bid}.input_layernorm",
+            "model.layers.{bid}.ln_1",
+            "model.layers.{bid}.norm1",
+            "model.layers.{bid}.input_norm",        # Zaya
+        ),
 
         MODEL_TENSOR.SSM_OUT: (
             "model.layers.{bid}.out_proj",               # mamba-hf
@@ -891,6 +901,19 @@ class TensorNameMap:
             "model.layers.{bid}.linear_attn.in_proj_b",  # qwen3.5
             "model.layers.{bid}.self_attn.b_proj",       # Kimi Linear
         ),
+        # ZAYA CCA
+        MODEL_TENSOR.CCA_CONV_DW: (
+            "model.layers.{bid}.self_attn.qkv.conv_qk.0", # Zaya
+        ),
+        MODEL_TENSOR.CCA_CONV_GRP: (
+            "model.layers.{bid}.self_attn.qkv.conv_qk.1", # Zaya
+        ),
+        MODEL_TENSOR.CCA_QK_NORM: (
+            "model.layers.{bid}.self_attn.qk_norm",   # Zaya
+        ),
+        MODEL_TENSOR.CCA_K_SCALE: (
+            "model.layers.{bid}.self_attn.qkv.temp",          # Zaya
+        ),
         MODEL_TENSOR.SSM_G_A: (
             "model.layers.{bid}.self_attn.g_a_proj",
         ),
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 59dde99e362..df91d973a3e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -133,6 +133,7 @@ static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
     { LLM_ARCH_LLAMA_EMBED,      "llama-embed"      },
     { LLM_ARCH_MAINCODER,        "maincoder"        },
     { LLM_ARCH_KIMI_LINEAR,      "kimi-linear"      },
+    { LLM_ARCH_ZAYA,              "zaya"             },
     { LLM_ARCH_UNKNOWN,          "(unknown)"        },
 };
 
@@ -417,6 +418,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_SSM_BETA,                               "blk.%d.ssm_beta" },
     { LLM_TENSOR_SSM_G_A,                                "blk.%d.ssm_g_a" },
     { LLM_TENSOR_SSM_G_B,                                "blk.%d.ssm_g_b" },
+    { LLM_TENSOR_CCA_CONV_DW,                            "blk.%d.cca_conv_dw" },
+    { LLM_TENSOR_CCA_CONV_GRP,                           "blk.%d.cca_conv_grp" },
+    { LLM_TENSOR_CCA_QK_NORM,                            "blk.%d.cca_qk_norm" },
+    { LLM_TENSOR_CCA_K_SCALE,                            "blk.%d.cca_k_scale" },
     { LLM_TENSOR_SSM_NORM,                               "blk.%d.ssm_norm" },
     { LLM_TENSOR_ATTN_Q_A_NORM,                          "blk.%d.attn_q_a_norm" },
     { LLM_TENSOR_ATTN_KV_A_NORM,                         "blk.%d.attn_kv_a_norm" },
@@ -659,6 +664,11 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_BETA,                   {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_G_A,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_G_B,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    // ZAYA CCA
+    {LLM_TENSOR_CCA_CONV_DW,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+    {LLM_TENSOR_CCA_CONV_GRP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CCA_QK_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CCA_K_SCALE,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_X,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
@@ -857,6 +867,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) {
         case LLM_ARCH_NEMOTRON_H_MOE:
         case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_ZAYA:
         case LLM_ARCH_QWEN35:
         case LLM_ARCH_QWEN35MOE:
             return true;
@@ -902,6 +913,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) {
         case LLM_ARCH_MINIMAX_M2:
         case LLM_ARCH_MISTRAL4:
         case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_ZAYA:
             return false;
         default:
             return true;
diff --git a/src/llama-arch.h b/src/llama-arch.h
index e37d548c98e..b11fa50c05f 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -137,6 +137,7 @@ enum llm_arch {
     LLM_ARCH_LLAMA_EMBED,
     LLM_ARCH_MAINCODER,
     LLM_ARCH_KIMI_LINEAR,
+    LLM_ARCH_ZAYA,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -444,6 +445,11 @@ enum llm_tensor {
     LLM_TENSOR_SSM_BETA,            // kimi: beta mixing coefficient and qwen3.5
     LLM_TENSOR_SSM_G_A,             // kimi: output gate projection A
     LLM_TENSOR_SSM_G_B,             // kimi: output gate projection B
+    // ZAYA CCA (Compressed Convolutional Attention)
+    LLM_TENSOR_CCA_CONV_DW,         // zaya: depthwise conv1d (conv_qk.0)
+    LLM_TENSOR_CCA_CONV_GRP,        // zaya: grouped conv1d  (conv_qk.1)
+    LLM_TENSOR_CCA_QK_NORM,         // zaya: RMSNorm on concat(Q,K)
+    LLM_TENSOR_CCA_K_SCALE,         // zaya: learned K temperature
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 9d011ff3464..656767318f2 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -282,6 +282,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params
             return new llama_model_mimo2(params);
         case LLM_ARCH_KIMI_LINEAR:
             return new llama_model_kimi_linear(params);
+        case LLM_ARCH_ZAYA:
+            return new llama_model_zaya(params);
         case LLM_ARCH_STEP35:
             return new llama_model_step35(params);
         default:
@@ -2206,6 +2208,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_NEMOTRON_H:
         case LLM_ARCH_NEMOTRON_H_MOE:
         case LLM_ARCH_KIMI_LINEAR:
+        case LLM_ARCH_ZAYA:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
diff --git a/src/llama-model.h b/src/llama-model.h
index d63c689185a..8e919e15159 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -477,6 +477,13 @@ struct llama_layer {
     struct ggml_tensor * ssm_g_b    = nullptr;
     struct ggml_tensor * ssm_o_norm = nullptr;
 
+    // ZAYA CCA (Compressed Convolutional Attention)
+    struct ggml_tensor * cca_conv_dw    = nullptr;  // depthwise conv (conv_qk.0)
+    struct ggml_tensor * cca_conv_grp   = nullptr;  // grouped conv   (conv_qk.1)
+    struct ggml_tensor * cca_conv_grp_b = nullptr;  // grouped conv bias
+    struct ggml_tensor * cca_qk_norm    = nullptr;  // RMSNorm on concat(Q,K)
+    struct ggml_tensor * cca_k_scale    = nullptr;  // learned K temperature
+
     // DSA (deepseek sparse attention)
     struct ggml_tensor * indexer_k_norm   = nullptr;
     struct ggml_tensor * indexer_k_norm_b = nullptr;
diff --git a/src/models/models.h b/src/models/models.h
index 6d5f18a8e20..507f903104b 100644
--- a/src/models/models.h
+++ b/src/models/models.h
@@ -1858,3 +1858,16 @@ struct llama_model_step35 : public llama_model_base {
 
     std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
 };
+
+
+struct llama_model_zaya : public llama_model_base {
+    llama_model_zaya(const struct llama_model_params & params) : llama_model_base(params) {}
+    void load_arch_hparams(llama_model_loader & ml) override;
+    void load_arch_tensors(llama_model_loader & ml) override;
+
+    struct graph : public llm_graph_context {
+        graph(const llama_model & model, const llm_graph_params & params);
+    };
+
+    std::unique_ptr<llm_graph_context> build_arch_graph(const llm_graph_params & params) const override;
+};
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
new file mode 100644
index 00000000000..0815fc1d449
--- /dev/null
+++ b/src/models/zaya.cpp
@@ -0,0 +1,223 @@
+#include "models.h"
+
+#include "ggml.h"
+
+void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
+    ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+    ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+
+    switch (hparams.n_layer) {
+        case 80: type = LLM_TYPE_8B; break;
+        default: type = LLM_TYPE_UNKNOWN;
+    }
+}
+
+void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
+    LLAMA_LOAD_LOCALS;
+
+    tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
+
+    // output
+    output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
+    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
+
+    const int64_t d_conv = hparams.ssm_d_conv;
+
+    for (int i = 0; i < n_layer; ++i) {
+        auto & layer = layers[i];
+
+        const int64_t n_head_i    = hparams.n_head(i);
+        const int64_t n_head_kv_i = hparams.n_head_kv(i);
+        const int64_t n_embd_q    = n_head_i    * n_embd_head_k;
+        const int64_t n_embd_k    = n_head_kv_i * n_embd_head_k;
+        const int64_t n_qk        = n_embd_q + n_embd_k;
+        const int64_t n_groups    = n_head_i + n_head_kv_i;
+
+        layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
+
+        // CCA projections (standard Q, K, V, O)
+        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0);
+        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
+        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_k}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
+
+        // CCA conv_qk.0 (depthwise, groups = n_qk, kernel = d_conv)
+        layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0);
+
+        // CCA conv_qk.1 (grouped, groups = n_groups, kernel = d_conv)
+        layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0);
+        layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias",   i), {n_qk}, 0);
+
+        // CCA normalization and scale
+        layer.cca_qk_norm = create_tensor(tn(LLM_TENSOR_CCA_QK_NORM, "weight", i), {n_qk}, 0);
+        layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_embd_k}, 0);
+
+        // FFN (dense SwiGLU for now; MoE can be added later)
+        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
+        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
+        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
+        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+    }
+}
+
+std::unique_ptr<llm_graph_context> llama_model_zaya::build_arch_graph(const llm_graph_params & params) const {
+    return std::make_unique<graph>(*this, params);
+}
+
+llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params & params)
+    : llm_graph_context(params) {
+
+    const int64_t n_embd_head = hparams.n_embd_head_k();
+    const int64_t d_conv      = hparams.ssm_d_conv;
+
+    ggml_tensor * cur;
+    ggml_tensor * inpL;
+
+    inpL = build_inp_embd(model.tok_embd);
+
+    auto * inp = build_inp_mem_hybrid();
+
+    ggml_tensor * inp_out_ids = build_inp_out_ids();
+
+    for (int il = 0; il < n_layer; ++il) {
+        const auto & layer = model.layers[il];
+
+        const int64_t n_head    = hparams.n_head(il);
+        const int64_t n_head_kv = hparams.n_head_kv(il);
+        const int64_t n_embd_q  = n_head    * n_embd_head;
+        const int64_t n_embd_k  = n_head_kv * n_embd_head;
+        const int64_t n_qk      = n_embd_q + n_embd_k;
+        const int64_t n_groups  = n_head + n_head_kv;
+
+        ggml_tensor * inpSA = inpL;
+
+        // norm
+        cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "attn_norm", il);
+
+        // --- CCA: Q, K, V projections ---
+        ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur);
+        cb(Qraw, "Qraw", il);
+        ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur);
+        cb(Kraw, "Kraw", il);
+        ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.wv, cur);
+        cb(Vcur, "Vcur", il);
+
+        // --- CCA: concat Q+K for conv ---
+        // QK: [n_qk, n_tokens]
+        ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0);
+        cb(QK, "QK_cat", il);
+
+        // --- CCA: conv_qk.0 (depthwise, causal) ---
+        // Reshape for ssm_conv: [n_tokens, n_qk] -> [n_tokens, n_qk, 1]
+        // ssm_conv expects [seq_len, channels, batch] with state already concatenated
+        // For prompt processing, we left-pad with (d_conv-1) zeros for causality
+        {
+            // Left-pad QK with zeros for causal convolution
+            ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));  // [n_tokens, n_qk]
+            ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk);
+            pad = ggml_scale(ctx0, pad, 0.0f);
+            ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);  // [d_conv-1 + n_tokens, n_qk]
+
+            QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw);
+            // ssm_conv output: [n_tokens, n_qk]
+            cb(QK, "QK_dw", il);
+        }
+
+        // --- CCA: conv_qk.1 (grouped, causal) ---
+        {
+            // Left-pad for second causal conv
+            ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk);
+            pad = ggml_scale(ctx0, pad, 0.0f);
+            ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0);  // [d_conv-1 + n_tokens, n_qk]
+
+            // ggml_conv_1d_grouped expects kernel [K, IC/G, OC] and input [L, IC]
+            // QK_padded is [d_conv-1 + n_tokens, n_qk] which matches [L, IC]
+            QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups);
+            QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b);
+            cb(QK, "QK_grp", il);
+        }
+
+        // QK is now [n_tokens, n_qk] from conv output, transpose back to [n_qk, n_tokens]
+        QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
+
+        // --- CCA: split Q_conv, K_conv ---
+        ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens,
+            QK->nb[1], 0);
+        ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens,
+            QK->nb[1], n_embd_q * ggml_element_size(QK));
+
+        // --- CCA: QK mean (skip connection) ---
+        ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f);
+        ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f);
+        cb(Qcur, "Qcur", il);
+        cb(Kcur, "Kcur", il);
+
+        // --- CCA: RMSNorm on concat(Q, K) ---
+        ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0);  // [n_qk, n_tokens]
+        QK_for_norm = build_norm(QK_for_norm, layer.cca_qk_norm, NULL, LLM_NORM_RMS, il);
+        cb(QK_for_norm, "QK_normed", il);
+
+        // Split back
+        Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens,
+            QK_for_norm->nb[1], 0);
+        Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens,
+            QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm));
+
+        // --- CCA: K temperature scaling ---
+        Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale);
+        cb(Kcur, "Kcur_scaled", il);
+
+        // Reshape for attention: [head_dim, n_heads, n_tokens]
+        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+        // --- GQA attention ---
+        cur = build_attn(inp->get_attn(), layer.wo, NULL, NULL,
+            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+            1.0f / sqrtf((float) n_embd_head), il);
+        cb(cur, "attn_out", il);
+
+        // select output tokens on last layer
+        if (il == n_layer - 1 && inp_out_ids) {
+            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
+            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
+        }
+
+        // residual
+        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+        cb(ffn_inp, "ffn_inp", il);
+
+        // --- FFN (dense SwiGLU) ---
+        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
+        cb(cur, "ffn_norm", il);
+
+        cur = build_ffn(cur,
+            layer.ffn_up, NULL, NULL,
+            layer.ffn_gate, NULL, NULL,
+            layer.ffn_down, NULL, NULL,
+            NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
+        cb(cur, "ffn_out", il);
+
+        // residual
+        cur = ggml_add(ctx0, cur, ffn_inp);
+        cb(cur, "l_out", il);
+
+        inpL = cur;
+    }
+
+    cur = inpL;
+
+    // final norm
+    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cb(cur, "result_norm", -1);
+    res->t_embd = cur;
+
+    // output
+    cur = ggml_mul_mat(ctx0, model.output, cur);
+    cb(cur, "result_output", -1);
+    res->t_logits = cur;
+
+    ggml_build_forward_expand(gf, cur);
+}

From 7cc554aab3435a800d673ea588d92034958bd3c7 Mon Sep 17 00:00:00 2001
From: Juste-Leo <leonard.adamo66@gmail.com>
Date: Fri, 8 May 2026 18:39:38 +0200
Subject: [PATCH 03/33] implementation checkpoint

---
 convert_hf_to_gguf.py          | 165 ++++++++++++++--
 gguf-py/gguf/constants.py      |  60 +++++-
 gguf-py/gguf/tensor_mapping.py |  15 +-
 src/llama-arch.cpp             |  34 ++++
 src/llama-arch.h               |  19 ++
 src/llama-model.h              |  21 ++
 src/models/zaya.cpp            | 350 +++++++++++++++++++++------------
 7 files changed, 497 insertions(+), 167 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 33c74013fb3..97a5889cce9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6456,34 +6456,150 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
 
 @ModelBase.register("ZayaModel", "ZayaForCausalLM")
 class ZayaModel(TextModel):
-    """Zaya-1 model with Compressed Convolutional Attention"""
+    """Zaya-1 model with Compressed Convolutional Attention and MoE"""
     model_arch = gguf.MODEL_ARCH.ZAYA
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Per-layer staging buffer for expert weights: block id -> {source tensor name -> tensor}; filled and drained by modify_tensors
+        self._experts: dict[int, dict[str, Tensor]] | None = {}
+
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
-        
-        # ZAYA-specific params if any from config.json (e.g. ssm_d_conv)
-        if "ssm_d_conv" in self.hparams:
-            self.gguf_writer.add_ssm_conv_kernel(self.hparams["ssm_d_conv"])
-        else:
-            # Fallback if config is different
-            self.gguf_writer.add_ssm_conv_kernel(2) # Default for ZAYA1-8B
-            
+
+        # n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate)
+        n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2
+        self.gguf_writer.add_feed_forward_length(n_ff)
+
+        # ssm_d_conv = conv_qk kernel size
+        self.gguf_writer.add_ssm_conv_kernel(5)
+
+        # partial_rotary_factor -> n_rot
+        head_dim = self.hparams.get("head_dim", 128)
+        partial_rotary = self.hparams.get("partial_rotary_factor", 0.5)
+        self.gguf_writer.add_rope_dimension_count(int(partial_rotary * head_dim))
+
+        # MoE params
+        n_expert = self.find_hparam(["num_experts"])
+        self.gguf_writer.add_expert_count(n_expert)
+        n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1
+        self.gguf_writer.add_expert_used_count(n_expert_used)
+
+    def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
+        """Map one CCA attention tensor to its GGUF name; names matching no rule yield nothing."""
+        mt = gguf.MODEL_TENSOR
+        # (substring, required name ending or None, GGUF tensor, extra format_tensor_name kwargs)
+        # Rules are tried top to bottom and only the first hit is emitted.
+        rules = (
+            ("linear_q",  None,      mt.ATTN_Q,        {}),
+            ("linear_k",  None,      mt.ATTN_K,        {}),
+            ("val_proj1", None,      mt.CCA_VAL_PROJ1, {}),
+            ("val_proj2", None,      mt.CCA_VAL_PROJ2, {}),
+            ("o_proj",    None,      mt.ATTN_OUT,      {}),
+            ("conv_qk.0", ".weight", mt.CCA_CONV_DW,   {}),
+            ("conv_qk.0", ".bias",   mt.CCA_CONV_DW_B, {"suffix": ".bias"}),
+            ("conv_qk.1", ".weight", mt.CCA_CONV_GRP,  {}),
+            ("conv_qk.1", ".bias",   mt.CCA_CONV_GRP,  {"suffix": ".bias"}),
+            ("temp",      None,      mt.CCA_K_SCALE,   {}),
+        )
+        for needle, ending, tensor, fmt_kwargs in rules:
+            if needle in name and (ending is None or name.endswith(ending)):
+                yield self.format_tensor_name(tensor, bid, **fmt_kwargs), data_torch
+                return
+
+    def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
+        """Map one MoE router tensor to its GGUF name; names matching no rule yield nothing."""
+        mt = gguf.MODEL_TENSOR
+        # (substring, GGUF tensor, extra format_tensor_name kwargs)
+        # Rules are tried top to bottom and only the first hit is emitted.
+        rules = (
+            ("down_proj.weight",    mt.ZAYA_ROUTER_DOWN,      {}),
+            ("down_proj.bias",      mt.ZAYA_ROUTER_DOWN_B,    {"suffix": ".bias"}),
+            ("rmsnorm_eda",         mt.ZAYA_ROUTER_NORM,      {}),
+            ("router_mlp.0.weight", mt.ZAYA_ROUTER_MLP0,      {}),
+            ("router_mlp.0.bias",   mt.ZAYA_ROUTER_MLP0_B,    {"suffix": ".bias"}),
+            ("router_mlp.2.weight", mt.ZAYA_ROUTER_MLP2,      {}),
+            ("router_mlp.2.bias",   mt.ZAYA_ROUTER_MLP2_B,    {"suffix": ".bias"}),
+            ("router_mlp.4.weight", mt.ZAYA_ROUTER_MLP4,      {}),
+            ("balancing_biases",    mt.ZAYA_ROUTER_BIASES,    {}),
+            ("router_states_scale", mt.ZAYA_ROUTER_EDA_SCALE, {}),
+        )
+        for needle, tensor, fmt_kwargs in rules:
+            if needle in name:
+                yield self.format_tensor_name(tensor, bid, **fmt_kwargs), data_torch
+                return
+
+    def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
+        """Map one residual-scaling tensor to its GGUF name; the first matching substring wins, no match yields nothing."""
+        mt = gguf.MODEL_TENSOR
+        rules = (("hidden_states_scale", mt.RES_SCALE_HS, {}), ("hidden_states_bias", mt.RES_SCALE_HS_B, {"suffix": ".bias"}),
+                 ("residual_scale", mt.RES_SCALE_RES, {}), ("residual_bias", mt.RES_SCALE_RES_B, {"suffix": ".bias"}))
+        hit = next(((tensor, fmt_kwargs) for needle, tensor, fmt_kwargs in rules if needle in name), None)
+        if hit is not None:
+            tensor, fmt_kwargs = hit
+            yield self.format_tensor_name(tensor, bid, **fmt_kwargs), data_torch
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
-        # Tensors will be automatically mapped based on tensor_mapping.py if they match
-        
-        # We skip MoE FFN weights, unused biases, etc. temporarily since we are using dense FFN
-        skip_keywords = [
-            "zaya_block.experts", 
-            "res_scale.", 
-            "val_proj2"
-        ]
-        
-        if any(kw in name for kw in skip_keywords):
-            logger.info(f"Skipping tensor (dense FFN test): {name}")
+        # Common tensors
+        if name == "model.embed_tokens.weight":
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch
             return
-            
+        if name == "model.final_norm.weight":
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch
+            return
+
+        # Block-level tensors
+        if bid is not None:
+            # CCA attention tensors
+            if "self_attn" in name:
+                yield from self._map_cca(name, data_torch, bid)
+                return
+
+            # Router tensors
+            if "router" in name:
+                yield from self._map_router(name, data_torch, bid)
+                return
+
+            # Input norm
+            if "input_norm" in name:
+                yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, bid), data_torch
+                return
+
+            # Residual scaling
+            if "res_scale" in name:
+                yield from self._map_res_scale(name, data_torch, bid)
+                return
+
+            # Expert stacking
+            if "zaya_block.experts" in name:
+                assert bid is not None
+                if self._experts is None:
+                    self._experts = {}
+                if bid not in self._experts:
+                    self._experts[bid] = {}
+                self._experts[bid][name] = data_torch
+
+                n_expert = self.find_hparam(["num_experts"])
+                # Each layer has 2 expert weights per expert (fc1, fc2) = 2 * n_expert tensors
+                if len(self._experts[bid]) >= n_expert * 2:
+                    for w_name, gguf_tensor, permute_dims in [
+                        ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None),
+                        ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, (0, 2, 1)),
+                    ]:
+                        datas: list[Tensor] = []
+                        for xid in range(n_expert):
+                            ename = f"model.layers.{bid}.zaya_block.experts.local_experts.{xid}.{w_name}.weight"
+                            datas.append(self._experts[bid][ename])
+                            del self._experts[bid][ename]
+                        data_torch_stacked = torch.stack(datas, dim=0)
+                        if permute_dims is not None:
+                            data_torch_stacked = data_torch_stacked.permute(*permute_dims)
+                        yield self.format_tensor_name(gguf_tensor, bid), data_torch_stacked
+                    del self._experts[bid]
+                return
+
+        # Fallback for any remaining tensors: use tensor_mapping
         try:
             yield from super().modify_tensors(data_torch, name, bid)
         except ValueError as e:
@@ -6492,6 +6608,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             else:
                 raise
 
+    def prepare_tensors(self):
+        super().prepare_tensors()
+        # Expert tensors are staged per layer until a full set arrives; anything still staged here was never emitted.
+        leftover = [ename for staged in (self._experts or {}).values() for ename in staged]
+        if leftover:
+            raise ValueError(f"Unprocessed expert tensors: {leftover}")
+
 
 @ModelBase.register("InternLM2ForCausalLM")
 class InternLM2Model(TextModel):
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 13bd3d1c8f0..de599da4a0b 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -613,8 +613,25 @@ class MODEL_TENSOR(IntEnum):
     SSM_G_B              = auto() # Kimi Linear
     CCA_CONV_DW          = auto() # Zaya
     CCA_CONV_GRP         = auto() # Zaya
-    CCA_QK_NORM          = auto() # Zaya
+    CCA_CONV_DW_B        = auto() # Zaya: conv_qk.0.bias
+    CCA_QK_NORM          = auto() # Zaya (weightless - unit RMSNorm)
     CCA_K_SCALE          = auto() # Zaya
+    CCA_VAL_PROJ1        = auto() # Zaya: CCA value projection stream 1
+    CCA_VAL_PROJ2        = auto() # Zaya: CCA value projection stream 2
+    RES_SCALE_HS         = auto() # Zaya: hidden_states_scale
+    RES_SCALE_HS_B       = auto() # Zaya: hidden_states_bias
+    RES_SCALE_RES        = auto() # Zaya: residual_scale
+    RES_SCALE_RES_B      = auto() # Zaya: residual_bias
+    ZAYA_ROUTER_DOWN     = auto() # Zaya
+    ZAYA_ROUTER_DOWN_B   = auto() # Zaya
+    ZAYA_ROUTER_NORM     = auto() # Zaya
+    ZAYA_ROUTER_MLP0     = auto() # Zaya
+    ZAYA_ROUTER_MLP0_B   = auto() # Zaya
+    ZAYA_ROUTER_MLP2     = auto() # Zaya
+    ZAYA_ROUTER_MLP2_B   = auto() # Zaya
+    ZAYA_ROUTER_MLP4     = auto() # Zaya
+    ZAYA_ROUTER_BIASES   = auto() # Zaya
+    ZAYA_ROUTER_EDA_SCALE = auto() # Zaya
     TIME_MIX_W0          = auto()
     TIME_MIX_W1          = auto()
     TIME_MIX_W2          = auto()
@@ -1130,9 +1147,26 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SSM_G_A:                   "blk.{bid}.ssm_g_a",              # Kimi Linear
     MODEL_TENSOR.SSM_G_B:                   "blk.{bid}.ssm_g_b",              # Kimi Linear
     MODEL_TENSOR.CCA_CONV_DW:               "blk.{bid}.cca_conv_dw",          # Zaya
+    MODEL_TENSOR.CCA_CONV_DW_B:             "blk.{bid}.cca_conv_dw_b",        # Zaya
     MODEL_TENSOR.CCA_CONV_GRP:              "blk.{bid}.cca_conv_grp",         # Zaya
     MODEL_TENSOR.CCA_QK_NORM:               "blk.{bid}.cca_qk_norm",          # Zaya
     MODEL_TENSOR.CCA_K_SCALE:               "blk.{bid}.cca_k_scale",          # Zaya
+    MODEL_TENSOR.CCA_VAL_PROJ1:             "blk.{bid}.cca_val_proj1",        # Zaya
+    MODEL_TENSOR.CCA_VAL_PROJ2:             "blk.{bid}.cca_val_proj2",        # Zaya
+    MODEL_TENSOR.RES_SCALE_HS:              "blk.{bid}.res_scale_hs",         # Zaya
+    MODEL_TENSOR.RES_SCALE_HS_B:            "blk.{bid}.res_scale_hs_b",       # Zaya
+    MODEL_TENSOR.RES_SCALE_RES:             "blk.{bid}.res_scale_res",        # Zaya
+    MODEL_TENSOR.RES_SCALE_RES_B:           "blk.{bid}.res_scale_res_b",      # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_DOWN:          "blk.{bid}.zaya_router_down",     # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_NORM:          "blk.{bid}.zaya_router_norm",     # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_MLP0:          "blk.{bid}.zaya_router_mlp0",     # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_MLP0_B:        "blk.{bid}.zaya_router_mlp0_b",   # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_MLP2:          "blk.{bid}.zaya_router_mlp2",     # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_MLP2_B:        "blk.{bid}.zaya_router_mlp2_b",   # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_MLP4:          "blk.{bid}.zaya_router_mlp4",     # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_BIASES:        "blk.{bid}.zaya_router_biases",   # Zaya
+    MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE:     "blk.{bid}.zaya_router_eda",      # Zaya
     MODEL_TENSOR.TIME_MIX_W0:               "blk.{bid}.time_mix_w0",
     MODEL_TENSOR.TIME_MIX_W1:               "blk.{bid}.time_mix_w1",
     MODEL_TENSOR.TIME_MIX_W2:               "blk.{bid}.time_mix_w2",
@@ -4009,16 +4043,30 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_NORM,
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
-        MODEL_TENSOR.ATTN_V,
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.CCA_CONV_DW,
+        MODEL_TENSOR.CCA_CONV_DW_B,
         MODEL_TENSOR.CCA_CONV_GRP,
         MODEL_TENSOR.CCA_QK_NORM,
         MODEL_TENSOR.CCA_K_SCALE,
-        MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.FFN_DOWN,
-        MODEL_TENSOR.FFN_UP,
+        MODEL_TENSOR.CCA_VAL_PROJ1,
+        MODEL_TENSOR.CCA_VAL_PROJ2,
+        MODEL_TENSOR.RES_SCALE_HS,
+        MODEL_TENSOR.RES_SCALE_HS_B,
+        MODEL_TENSOR.RES_SCALE_RES,
+        MODEL_TENSOR.RES_SCALE_RES_B,
+        MODEL_TENSOR.ZAYA_ROUTER_DOWN,
+        MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
+        MODEL_TENSOR.ZAYA_ROUTER_NORM,
+        MODEL_TENSOR.ZAYA_ROUTER_MLP0,
+        MODEL_TENSOR.ZAYA_ROUTER_MLP0_B,
+        MODEL_TENSOR.ZAYA_ROUTER_MLP2,
+        MODEL_TENSOR.ZAYA_ROUTER_MLP2_B,
+        MODEL_TENSOR.ZAYA_ROUTER_MLP4,
+        MODEL_TENSOR.ZAYA_ROUTER_BIASES,
+        MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE,
+        MODEL_TENSOR.FFN_GATE_UP_EXP,
+        MODEL_TENSOR.FFN_DOWN_EXP,
     ],
     # TODO
 }
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index db99afd4cbb..fbd22ccb6a3 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -107,6 +107,7 @@ class TensorNameMap:
             "model.transformer.ln_f",                  # llada
             "final_norm",                              # modern-bert
             "model.norm",                              # cogvlm
+            "model.final_norm",                        # Zaya
         ),
 
         # Rope frequencies
@@ -300,7 +301,6 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.v_proj",                     # llada
             "layers.{bid}.self_attn.v_proj",                             # qwen3-embedding
             "backbone.layers.{bid}.mixer.v_proj",                        # nemotron-h
-            "model.layers.{bid}.self_attn.qkv.val_proj1",                # Zaya
         ),
 
         # Attention output
@@ -901,19 +901,6 @@ class TensorNameMap:
             "model.layers.{bid}.linear_attn.in_proj_b",  # qwen3.5
             "model.layers.{bid}.self_attn.b_proj",       # Kimi Linear
         ),
-        # ZAYA CCA
-        MODEL_TENSOR.CCA_CONV_DW: (
-            "model.layers.{bid}.self_attn.qkv.conv_qk.0", # Zaya
-        ),
-        MODEL_TENSOR.CCA_CONV_GRP: (
-            "model.layers.{bid}.self_attn.qkv.conv_qk.1", # Zaya
-        ),
-        MODEL_TENSOR.CCA_QK_NORM: (
-            "model.layers.{bid}.self_attn.qk_norm",   # Zaya
-        ),
-        MODEL_TENSOR.CCA_K_SCALE: (
-            "model.layers.{bid}.self_attn.qkv.temp",          # Zaya
-        ),
         MODEL_TENSOR.SSM_G_A: (
             "model.layers.{bid}.self_attn.g_a_proj",
         ),
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index df91d973a3e..3bebc529300 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -419,9 +419,26 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_SSM_G_A,                                "blk.%d.ssm_g_a" },
     { LLM_TENSOR_SSM_G_B,                                "blk.%d.ssm_g_b" },
     { LLM_TENSOR_CCA_CONV_DW,                            "blk.%d.cca_conv_dw" },
+    { LLM_TENSOR_CCA_CONV_DW_B,                          "blk.%d.cca_conv_dw_b" },
     { LLM_TENSOR_CCA_CONV_GRP,                           "blk.%d.cca_conv_grp" },
     { LLM_TENSOR_CCA_QK_NORM,                            "blk.%d.cca_qk_norm" },
     { LLM_TENSOR_CCA_K_SCALE,                            "blk.%d.cca_k_scale" },
+    { LLM_TENSOR_CCA_VAL_PROJ1,                          "blk.%d.cca_val_proj1" },
+    { LLM_TENSOR_CCA_VAL_PROJ2,                          "blk.%d.cca_val_proj2" },
+    { LLM_TENSOR_RES_SCALE_HS,                           "blk.%d.res_scale_hs" },
+    { LLM_TENSOR_RES_SCALE_HS_B,                         "blk.%d.res_scale_hs_b" },
+    { LLM_TENSOR_RES_SCALE_RES,                          "blk.%d.res_scale_res" },
+    { LLM_TENSOR_RES_SCALE_RES_B,                        "blk.%d.res_scale_res_b" },
+    { LLM_TENSOR_ZAYA_ROUTER_DOWN,                       "blk.%d.zaya_router_down" },
+    { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
+    { LLM_TENSOR_ZAYA_ROUTER_NORM,                       "blk.%d.zaya_router_norm" },
+    { LLM_TENSOR_ZAYA_ROUTER_MLP0,                       "blk.%d.zaya_router_mlp0" },
+    { LLM_TENSOR_ZAYA_ROUTER_MLP0_B,                     "blk.%d.zaya_router_mlp0_b" },
+    { LLM_TENSOR_ZAYA_ROUTER_MLP2,                       "blk.%d.zaya_router_mlp2" },
+    { LLM_TENSOR_ZAYA_ROUTER_MLP2_B,                     "blk.%d.zaya_router_mlp2_b" },
+    { LLM_TENSOR_ZAYA_ROUTER_MLP4,                       "blk.%d.zaya_router_mlp4" },
+    { LLM_TENSOR_ZAYA_ROUTER_BIASES,                     "blk.%d.zaya_router_biases" },
+    { LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE,                  "blk.%d.zaya_router_eda" },
     { LLM_TENSOR_SSM_NORM,                               "blk.%d.ssm_norm" },
     { LLM_TENSOR_ATTN_Q_A_NORM,                          "blk.%d.attn_q_a_norm" },
     { LLM_TENSOR_ATTN_KV_A_NORM,                         "blk.%d.attn_kv_a_norm" },
@@ -666,9 +683,26 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_G_B,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // ZAYA CCA
     {LLM_TENSOR_CCA_CONV_DW,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
+    {LLM_TENSOR_CCA_CONV_DW_B,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_CCA_CONV_GRP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CCA_QK_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CCA_K_SCALE,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_CCA_VAL_PROJ1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_CCA_VAL_PROJ2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_RES_SCALE_HS,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_RES_SCALE_HS_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_RES_SCALE_RES,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_RES_SCALE_RES_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_ZAYA_ROUTER_DOWN,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_ZAYA_ROUTER_NORM,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
+    {LLM_TENSOR_ZAYA_ROUTER_MLP0,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ZAYA_ROUTER_MLP0_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_ZAYA_ROUTER_MLP2,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ZAYA_ROUTER_MLP2_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_ZAYA_ROUTER_MLP4,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
+    {LLM_TENSOR_ZAYA_ROUTER_BIASES,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LERP_X,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_TIME_MIX_LN,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CHANNEL_MIX_LERP_K,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index b11fa50c05f..72c5abddac1 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -447,9 +447,28 @@ enum llm_tensor {
     LLM_TENSOR_SSM_G_B,             // kimi: output gate projection B
     // ZAYA CCA (Compressed Convolutional Attention)
     LLM_TENSOR_CCA_CONV_DW,         // zaya: depthwise conv1d (conv_qk.0)
+    LLM_TENSOR_CCA_CONV_DW_B,       // zaya: depthwise conv1d bias
     LLM_TENSOR_CCA_CONV_GRP,        // zaya: grouped conv1d  (conv_qk.1)
     LLM_TENSOR_CCA_QK_NORM,         // zaya: RMSNorm on concat(Q,K)
     LLM_TENSOR_CCA_K_SCALE,         // zaya: learned K temperature
+    LLM_TENSOR_CCA_VAL_PROJ1,       // zaya: V projection 1
+    LLM_TENSOR_CCA_VAL_PROJ2,       // zaya: V projection 2
+    // ZAYA residual scaling
+    LLM_TENSOR_RES_SCALE_HS,        // zaya: hidden_states_scale
+    LLM_TENSOR_RES_SCALE_HS_B,      // zaya: hidden_states_bias
+    LLM_TENSOR_RES_SCALE_RES,       // zaya: residual_scale
+    LLM_TENSOR_RES_SCALE_RES_B,     // zaya: residual_bias
+    // ZAYA Router (MoE gating)
+    LLM_TENSOR_ZAYA_ROUTER_DOWN,      // zaya: router down_proj weight
+    LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
+    LLM_TENSOR_ZAYA_ROUTER_NORM,      // zaya: router rmsnorm_eda weight
+    LLM_TENSOR_ZAYA_ROUTER_MLP0,      // zaya: router MLP layer 0 weight
+    LLM_TENSOR_ZAYA_ROUTER_MLP0_B,    // zaya: router MLP layer 0 bias
+    LLM_TENSOR_ZAYA_ROUTER_MLP2,      // zaya: router MLP layer 2 weight
+    LLM_TENSOR_ZAYA_ROUTER_MLP2_B,    // zaya: router MLP layer 2 bias
+    LLM_TENSOR_ZAYA_ROUTER_MLP4,      // zaya: router MLP layer 4 weight
+    LLM_TENSOR_ZAYA_ROUTER_BIASES,    // zaya: router balancing_biases
+    LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, // zaya: router router_states_scale
     LLM_TENSOR_TIME_MIX_W0,
     LLM_TENSOR_TIME_MIX_W1,
     LLM_TENSOR_TIME_MIX_W2,
diff --git a/src/llama-model.h b/src/llama-model.h
index 8e919e15159..d9da4b318bd 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -479,10 +479,31 @@ struct llama_layer {
 
     // ZAYA CCA (Compressed Convolutional Attention)
     struct ggml_tensor * cca_conv_dw    = nullptr;  // depthwise conv (conv_qk.0)
+    struct ggml_tensor * cca_conv_dw_b  = nullptr;  // depthwise conv bias
     struct ggml_tensor * cca_conv_grp   = nullptr;  // grouped conv   (conv_qk.1)
     struct ggml_tensor * cca_conv_grp_b = nullptr;  // grouped conv bias
     struct ggml_tensor * cca_qk_norm    = nullptr;  // RMSNorm on concat(Q,K)
     struct ggml_tensor * cca_k_scale    = nullptr;  // learned K temperature
+    struct ggml_tensor * cca_val_proj1  = nullptr;  // V projection stream 1
+    struct ggml_tensor * cca_val_proj2  = nullptr;  // V projection stream 2
+
+    // ZAYA residual scaling
+    struct ggml_tensor * res_scale_hs   = nullptr;  // hidden_states_scale
+    struct ggml_tensor * res_scale_hs_b = nullptr;  // hidden_states_bias
+    struct ggml_tensor * res_scale_res  = nullptr;  // residual_scale
+    struct ggml_tensor * res_scale_res_b = nullptr; // residual_bias
+
+    // ZAYA Router (MoE gating)
+    struct ggml_tensor * zaya_router_down     = nullptr;  // router down_proj
+    struct ggml_tensor * zaya_router_down_b   = nullptr;  // router down_proj bias
+    struct ggml_tensor * zaya_router_norm     = nullptr;  // router rmsnorm_eda
+    struct ggml_tensor * zaya_router_mlp0     = nullptr;  // router MLP 0
+    struct ggml_tensor * zaya_router_mlp0_b   = nullptr;  // router MLP 0 bias
+    struct ggml_tensor * zaya_router_mlp2     = nullptr;  // router MLP 2
+    struct ggml_tensor * zaya_router_mlp2_b   = nullptr;  // router MLP 2 bias
+    struct ggml_tensor * zaya_router_mlp4     = nullptr;  // router MLP 4
+    struct ggml_tensor * zaya_router_biases   = nullptr;  // balancing_biases
+    struct ggml_tensor * zaya_router_eda_scale = nullptr; // router_states_scale
 
     // DSA (deepseek sparse attention)
     struct ggml_tensor * indexer_k_norm   = nullptr;
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 0815fc1d449..a6e77bbc198 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -17,46 +17,93 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-    // output
+    // output norm
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
-    output      = create_tensor(tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, 0);
 
-    const int64_t d_conv = hparams.ssm_d_conv;
+    // output (tied with tok_embd if not present)
+    output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
+    if (output == nullptr) {
+        output = tok_embd;
+    }
+
+    const int64_t n_embd_head = hparams.n_embd_head_k();
+    const int64_t d_conv      = hparams.ssm_d_conv;
+    // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B)
+    const int64_t n_ff_exp    = 256;
 
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];
 
-        const int64_t n_head_i    = hparams.n_head(i);
-        const int64_t n_head_kv_i = hparams.n_head_kv(i);
-        const int64_t n_embd_q    = n_head_i    * n_embd_head_k;
-        const int64_t n_embd_k    = n_head_kv_i * n_embd_head_k;
-        const int64_t n_qk        = n_embd_q + n_embd_k;
-        const int64_t n_groups    = n_head_i + n_head_kv_i;
+        const int64_t n_head    = hparams.n_head(i);
+        const int64_t n_head_kv = hparams.n_head_kv(i);
+        const int64_t n_embd_q  = n_head    * n_embd_head;
+        const int64_t n_embd_k  = n_head_kv * n_embd_head;
+        const int64_t n_qk      = n_embd_q + n_embd_k;
+        const int64_t n_groups  = n_head + n_head_kv;
+        const int64_t n_ff      = hparams.n_ff(i);
+        const int64_t n_expert  = hparams.n_expert;
 
         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-        // CCA projections (standard Q, K, V, O)
+        // CCA projections (present on all layers)
         layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0);
         layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
-        layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_k}, 0);
-        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
-
-        // CCA conv_qk.0 (depthwise, groups = n_qk, kernel = d_conv)
-        layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0);
 
-        // CCA conv_qk.1 (grouped, groups = n_groups, kernel = d_conv)
-        layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0);
-        layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias",   i), {n_qk}, 0);
+        // CCA: V = concat(val_proj1(x), val_proj2(x)) → {n_embd_k}
+        layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i),
+            {n_embd, n_embd_head}, 0);
+        layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i),
+            {n_embd, n_embd_head}, 0);
 
-        // CCA normalization and scale
-        layer.cca_qk_norm = create_tensor(tn(LLM_TENSOR_CCA_QK_NORM, "weight", i), {n_qk}, 0);
-        layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_embd_k}, 0);
+        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
 
-        // FFN (dense SwiGLU for now; MoE can be added later)
-        layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
-        layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0);
-        layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0);
-        layer.ffn_up   = create_tensor(tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, 0);
+        // CCA conv_qk.0 (depthwise, causal)
+        layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0);
+        layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
+
+        // CCA conv_qk.1 (grouped, groups = n_groups)
+        layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i),
+            {d_conv, n_qk / n_groups, n_qk}, 0);
+        layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0);
+
+        // CCA per-KV-head temperature
+        layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0);
+
+        // Residual scaling
+        layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
+        layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B, "bias", i), {n_embd}, 0);
+        layer.res_scale_res  = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+
+        // MoE layers (odd indices)
+        if (i % 2 == 1) {
+            // Router network
+            layer.zaya_router_down   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN, "weight", i),
+                {n_embd, n_ff_exp}, 0);
+            layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i),
+                {n_ff_exp}, 0);
+            layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_NORM, "weight", i),
+                {n_ff_exp}, 0);
+            layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i),
+                {n_ff_exp, n_ff_exp}, 0);
+            layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i),
+                {n_ff_exp}, 0);
+            layer.zaya_router_mlp2   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "weight", i),
+                {n_ff_exp, n_ff_exp}, 0);
+            layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "bias", i),
+                {n_ff_exp}, 0);
+            layer.zaya_router_mlp4   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i),
+                {n_ff_exp, n_expert + 1}, 0);
+            layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i),
+                {n_expert + 1}, TENSOR_NOT_REQUIRED);
+            layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i),
+                {n_ff_exp}, TENSOR_NOT_REQUIRED);
+
+            // MoE experts (fused gate_up and down)
+            create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0);
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i),
+                {n_ff, n_embd, n_expert}, 0);
+        }
     }
 }
 
@@ -69,6 +116,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
     const int64_t n_embd_head = hparams.n_embd_head_k();
     const int64_t d_conv      = hparams.ssm_d_conv;
+    const int64_t n_expert    = hparams.n_expert;
 
     ggml_tensor * cur;
     ggml_tensor * inpL;
@@ -91,117 +139,167 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
         ggml_tensor * inpSA = inpL;
 
-        // norm
-        cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il);
+        // Pre-norm
+        cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "attn_norm", il);
 
-        // --- CCA: Q, K, V projections ---
-        ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur);
-        cb(Qraw, "Qraw", il);
-        ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur);
-        cb(Kraw, "Kraw", il);
-        ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.wv, cur);
-        cb(Vcur, "Vcur", il);
-
-        // --- CCA: concat Q+K for conv ---
-        // QK: [n_qk, n_tokens]
-        ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0);
-        cb(QK, "QK_cat", il);
-
-        // --- CCA: conv_qk.0 (depthwise, causal) ---
-        // Reshape for ssm_conv: [n_tokens, n_qk] -> [n_tokens, n_qk, 1]
-        // ssm_conv expects [seq_len, channels, batch] with state already concatenated
-        // For prompt processing, we left-pad with (d_conv-1) zeros for causality
-        {
-            // Left-pad QK with zeros for causal convolution
-            ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));  // [n_tokens, n_qk]
-            ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk);
-            pad = ggml_scale(ctx0, pad, 0.0f);
-            ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);  // [d_conv-1 + n_tokens, n_qk]
-
-            QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw);
-            // ssm_conv output: [n_tokens, n_qk]
-            cb(QK, "QK_dw", il);
+        if (il % 2 == 0) {
+            // ===== CCA Attention =====
+
+            // Q, K projections
+            ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur);
+            cb(Qraw, "Qraw", il);
+            ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur);
+            cb(Kraw, "Kraw", il);
+
+            // V = concat(val_proj1(x), val_proj2(x)) → [n_embd_k, n_tokens]
+            ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur);
+            cb(V1, "V1", il);
+            ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, cur);
+            cb(V2, "V2", il);
+            ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0);
+            cb(Vcur, "Vcur", il);
+
+            // Concat Q+K for conv: [n_qk, n_tokens]
+            ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0);
+            cb(QK, "QK_cat", il);
+
+            // conv_qk.0 (depthwise, causal)
+            {
+                ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
+                ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk);
+                pad = ggml_scale(ctx0, pad, 0.0f);
+                ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);
+
+                QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw);
+                if (layer.cca_conv_dw_b) {
+                    QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b);
+                }
+                cb(QK, "QK_dw", il);
+            }
+
+            // conv_qk.1 (grouped, causal)
+            {
+                ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk);
+                pad = ggml_scale(ctx0, pad, 0.0f);
+                ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0);
+
+                QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups);
+                QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b);
+                cb(QK, "QK_grp", il);
+            }
+
+            // Transpose back to [n_qk, n_tokens]
+            QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
+
+            // Split Q_conv, K_conv
+            ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens,
+                QK->nb[1], 0);
+            ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens,
+                QK->nb[1], n_embd_q * ggml_element_size(QK));
+
+            // QK mean skip connection
+            ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f);
+            ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f);
+            cb(Qcur, "Qcur", il);
+            cb(Kcur, "Kcur", il);
+
+            // RMSNorm on concat(Q, K) — weightless (unit RMSNorm)
+            ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0);
+            QK_for_norm = build_norm(QK_for_norm, nullptr, nullptr, LLM_NORM_RMS, il);
+            cb(QK_for_norm, "QK_normed", il);
+
+            // Split back
+            Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens,
+                QK_for_norm->nb[1], 0);
+            Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens,
+                QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm));
+
+            // Per-KV-head temperature scaling on K
+            // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens]
+            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+            // cca_k_scale: [n_head_kv] → broadcast
+            Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale);
+            cb(Kcur, "Kcur_scaled", il);
+
+            // Reshape for attention
+            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+
+            // GQA attention
+            cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr,
+                Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
+                1.0f / sqrtf((float) n_embd_head), il);
+            cb(cur, "attn_out", il);
+
+        } else {
+            // ===== MoE Layer =====
+
+            // Build Zaya router network:
+            // down_proj → RMSNorm → SiLU(MLP0) → MLP2 → MLP4 → 17 logits → take first 16
+
+            ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur);
+            router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b);
+            cb(router_h, "router_down", il);
+
+            router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il);
+            cb(router_h, "router_norm", il);
+
+            router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h);
+            router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b);
+            router_h = ggml_silu(ctx0, router_h);
+            cb(router_h, "router_mlp0", il);
+
+            router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h);
+            router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b);
+            cb(router_h, "router_mlp2", il);
+
+            router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h);
+            // router_h now has shape [17, n_tokens] — 16 expert logits + 1 MOD skip
+            cb(router_h, "router_logits", il);
+
+            // Take only the first 16 logits (expert routing), ignore MOD skip (index 16)
+            ggml_tensor * gate_inp = ggml_view_2d(ctx0, router_h, n_expert, n_tokens,
+                router_h->nb[1], 0);
+            cb(gate_inp, "gate_inp", il);
+
+            // MoE FFN with topk=1 (pass router logits as probs_in)
+            cur = build_moe_ffn(cur,
+                /* gate_inp */        nullptr,
+                /* up_exps */         nullptr,
+                /* gate_exps */       nullptr,
+                /* down_exps */       layer.ffn_down_exps,
+                /* exp_probs_b */     nullptr,
+                /* n_expert */        n_expert,
+                /* n_expert_used */   hparams.n_expert_used,
+                /* type_op */         LLM_FFN_SILU,
+                /* norm_w */          false,
+                /* w_scale */         1.0f,
+                /* gating_op */       LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                /* il */              il,
+                /* probs_in */        gate_inp,
+                /* gate_up_exps */    layer.ffn_gate_up_exps);
+            cb(cur, "moe_out", il);
         }
 
-        // --- CCA: conv_qk.1 (grouped, causal) ---
-        {
-            // Left-pad for second causal conv
-            ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk);
-            pad = ggml_scale(ctx0, pad, 0.0f);
-            ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0);  // [d_conv-1 + n_tokens, n_qk]
-
-            // ggml_conv_1d_grouped expects kernel [K, IC/G, OC] and input [L, IC]
-            // QK_padded is [d_conv-1 + n_tokens, n_qk] which matches [L, IC]
-            QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups);
-            QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b);
-            cb(QK, "QK_grp", il);
-        }
-
-        // QK is now [n_tokens, n_qk] from conv output, transpose back to [n_qk, n_tokens]
-        QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
-
-        // --- CCA: split Q_conv, K_conv ---
-        ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens,
-            QK->nb[1], 0);
-        ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens,
-            QK->nb[1], n_embd_q * ggml_element_size(QK));
-
-        // --- CCA: QK mean (skip connection) ---
-        ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f);
-        ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f);
-        cb(Qcur, "Qcur", il);
-        cb(Kcur, "Kcur", il);
-
-        // --- CCA: RMSNorm on concat(Q, K) ---
-        ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0);  // [n_qk, n_tokens]
-        QK_for_norm = build_norm(QK_for_norm, layer.cca_qk_norm, NULL, LLM_NORM_RMS, il);
-        cb(QK_for_norm, "QK_normed", il);
-
-        // Split back
-        Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens,
-            QK_for_norm->nb[1], 0);
-        Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens,
-            QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm));
-
-        // --- CCA: K temperature scaling ---
-        Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale);
-        cb(Kcur, "Kcur_scaled", il);
-
-        // Reshape for attention: [head_dim, n_heads, n_tokens]
-        Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
-        Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-        Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
-
-        // --- GQA attention ---
-        cur = build_attn(inp->get_attn(), layer.wo, NULL, NULL,
-            Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
-            1.0f / sqrtf((float) n_embd_head), il);
-        cb(cur, "attn_out", il);
-
         // select output tokens on last layer
         if (il == n_layer - 1 && inp_out_ids) {
             cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
             inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
         }
 
-        // residual
-        ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
-        cb(ffn_inp, "ffn_inp", il);
+        // Residual scaling: cur = hs_scale * cur + hs_bias
+        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.res_scale_hs), layer.res_scale_hs_b);
+        cb(cur, "scaled_out", il);
 
-        // --- FFN (dense SwiGLU) ---
-        cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il);
-        cb(cur, "ffn_norm", il);
-
-        cur = build_ffn(cur,
-            layer.ffn_up, NULL, NULL,
-            layer.ffn_gate, NULL, NULL,
-            layer.ffn_down, NULL, NULL,
-            NULL, LLM_FFN_SILU, LLM_FFN_PAR, il);
-        cb(cur, "ffn_out", il);
+        // Residual scaling: inpSA = res_scale * inpSA + res_bias (if present)
+        if (layer.res_scale_res) {
+            inpSA = ggml_add(ctx0, ggml_mul(ctx0, inpSA, layer.res_scale_res), layer.res_scale_res_b);
+            cb(inpSA, "scaled_residual", il);
+        }
 
-        // residual
-        cur = ggml_add(ctx0, cur, ffn_inp);
+        // Residual add
+        cur = ggml_add(ctx0, cur, inpSA);
         cb(cur, "l_out", il);
 
         inpL = cur;
@@ -210,7 +308,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     cur = inpL;
 
     // final norm
-    cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1);
+    cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
 

From 02a9843498a8bfe3296fd2522b0ce372bb9e2e6d Mon Sep 17 00:00:00 2001
From: Juste-Leo <leonard.adamo66@gmail.com>
Date: Fri, 8 May 2026 19:19:41 +0200
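Subject: [PATCH 04/33] convert: add Zaya set_vocab override

Export the tokenizer through LlamaHfVocab (token text, scores and token
types), write the tokenizer model as "gemma4", add the special-token
metadata via SpecialVocab with merges loaded, and set
add_space_prefix=False and add_bos_token=True.
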
---
 convert_hf_to_gguf.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 97a5889cce9..52bddd7665e 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6608,6 +6608,30 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
             else:
                 raise
 
+    def set_vocab(self):
+        from gguf.vocab import LlamaHfVocab
+
+        vocab = LlamaHfVocab(self.dir_model)
+        tokens = []
+        scores = []
+        toktypes = []
+        for text, score, toktype in vocab.all_tokens():
+            tokens.append(text)
+            scores.append(score)
+            toktypes.append(toktype)
+
+        assert len(tokens) == vocab.vocab_size
+
+        self.gguf_writer.add_tokenizer_model("gemma4")
+        self.gguf_writer.add_token_list(tokens)
+        self.gguf_writer.add_token_scores(scores)
+        self.gguf_writer.add_token_types(toktypes)
+
+        special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
+        special_vocab.add_to_gguf(self.gguf_writer)
+        self.gguf_writer.add_add_space_prefix(False)
+        self.gguf_writer.add_add_bos_token(True)
+
     def prepare_tensors(self):
         super().prepare_tensors()
         if self._experts:

From 8362c10d438261e04bb66f3c37b3631507589a8f Mon Sep 17 00:00:00 2001
From: Juste-Leo <leonard.adamo66@gmail.com>
Date: Tue, 12 May 2026 00:30:59 +0200
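Subject: [PATCH 05/33] zaya: fix converter metadata and CCA graph tensor shapes

convert_hf_to_gguf.py:
- accept "norm_epsilon" as an RMS-norm epsilon key
- pre-load LlamaHfVocab and trim model.embed_tokens.weight to the
  tokenizer vocab size when the checkpoint has more rows
- write the conv kernel size from the "cca_time0" hparam (default 2)
  instead of the hard-coded 5
- squeeze the singleton dimension of the depthwise conv_qk.0 weight

src/models/zaya.cpp:
- load the CCA attention tensors on even layers only
- use LLM_TENSOR_FFN_DOWN_EXPS for the expert down projection
- give ggml_ssm_conv a 3D input and reshape both conv outputs to 2D
  before adding their biases
- make Q, K, V and the router gate view contiguous before they are
  reshaped or passed on

ggml/src/ggml.c:
- whitespace-only change to the ggml_add_impl parameter list
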
---
 convert_hf_to_gguf.py | 20 +++++++++++---
 ggml/src/ggml.c       |  6 ++---
 src/models/zaya.cpp   | 63 ++++++++++++++++++++++++-------------------
 3 files changed, 56 insertions(+), 33 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 52bddd7665e..41d150e30ac 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1183,7 +1183,7 @@ def set_gguf_parameters(self):
         if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
             self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
             logger.info(f"gguf: rope theta swa = {local_rope_theta}")
-        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
+        if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps", "norm_epsilon"], optional=True)) is not None:
             self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
             logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
         if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
@@ -6463,6 +6463,13 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         # Buffer for accumulating expert weights per layer
         self._experts: dict[int, dict[str, Tensor]] | None = {}
+        # Pre-load tokenizer to know the vocab count for embedding trimming
+        self._tokenizer_vocab_size: int | None = None
+        try:
+            from gguf.vocab import LlamaHfVocab
+            self._tokenizer_vocab_size = LlamaHfVocab(self.dir_model).vocab_size
+        except Exception:
+            pass
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
@@ -6472,8 +6479,9 @@ def set_gguf_parameters(self):
         n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2
         self.gguf_writer.add_feed_forward_length(n_ff)
 
-        # ssm_d_conv = conv_qk kernel size
-        self.gguf_writer.add_ssm_conv_kernel(5)
+        # ssm_d_conv = conv_qk kernel size (cca_time0 = first depthwise conv kernel)
+        cca_time0 = self.hparams.get("cca_time0", 2)
+        self.gguf_writer.add_ssm_conv_kernel(cca_time0)
 
         # partial_rotary_factor -> n_rot
         head_dim = self.hparams.get("head_dim", 128)
@@ -6498,10 +6506,13 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st
         elif "o_proj" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch
         elif "conv_qk.0" in name and name.endswith(".weight"):
+            # PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk}
+            data_torch = data_torch.squeeze(1).contiguous()
             yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch
         elif "conv_qk.0" in name and name.endswith(".bias"):
             yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch
         elif "conv_qk.1" in name and name.endswith(".weight"):
+            # PyTorch: [n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk}
             yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch
         elif "conv_qk.1" in name and name.endswith(".bias"):
             yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch
@@ -6543,6 +6554,9 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Common tensors
         if name == "model.embed_tokens.weight":
+            # Trim embedding to match tokenizer vocab size if needed
+            if self._tokenizer_vocab_size is not None and data_torch.shape[0] > self._tokenizer_vocab_size:
+                data_torch = data_torch[:self._tokenizer_vocab_size]
             yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch
             return
         if name == "model.final_norm.weight":
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 049f4952047..ae1fb2fa031 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -2018,9 +2018,9 @@ struct ggml_tensor * ggml_dup_inplace(
 
 static struct ggml_tensor * ggml_add_impl(
         struct ggml_context * ctx,
-        struct ggml_tensor  * a,
-        struct ggml_tensor  * b,
-        bool                  inplace) {
+        struct ggml_tensor * a,
+        struct ggml_tensor * b,
+        bool inplace) {
     GGML_ASSERT(ggml_can_repeat(b, a));
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index a6e77bbc198..434fa31585b 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -45,29 +45,27 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
-        // CCA projections (present on all layers)
-        layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0);
-        layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
+        // CCA attention layers (even indices only)
+        if (i % 2 == 0) {
+            layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0);
+            layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
 
-        // CCA: V = concat(val_proj1(x), val_proj2(x)) → {n_embd_k}
-        layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i),
-            {n_embd, n_embd_head}, 0);
-        layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i),
-            {n_embd, n_embd_head}, 0);
+            layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i),
+                {n_embd, n_embd_head}, 0);
+            layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i),
+                {n_embd, n_embd_head}, 0);
 
-        layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
+            layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
 
-        // CCA conv_qk.0 (depthwise, causal)
-        layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0);
-        layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
+            layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0);
+            layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
 
-        // CCA conv_qk.1 (grouped, groups = n_groups)
-        layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i),
-            {d_conv, n_qk / n_groups, n_qk}, 0);
-        layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0);
+            layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i),
+                {d_conv, n_qk / n_groups, n_qk}, 0);
+            layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0);
 
-        // CCA per-KV-head temperature
-        layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0);
+            layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0);
+        }
 
         // Residual scaling
         layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
@@ -101,7 +99,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
             // MoE experts (fused gate_up and down)
             create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0);
-            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i),
+            layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i),
                 {n_ff, n_embd, n_expert}, 0);
         }
     }
@@ -167,30 +165,37 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             // conv_qk.0 (depthwise, causal)
             {
                 ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
-                ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk);
+                // ggml_ssm_conv requires 3D input: {1 + n_tokens, n_qk, 1}
+                // Use view_3d on the contiguous 2D tensor to add a batch dimension
+                QK_t = ggml_view_3d(ctx0, QK_t, n_tokens, n_qk, 1, QK_t->nb[1], QK_t->nb[1] * n_qk, 0);
+                ggml_tensor * pad = ggml_new_tensor_3d(ctx0, QK_t->type, d_conv - 1, n_qk, 1);
                 pad = ggml_scale(ctx0, pad, 0.0f);
                 ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);
 
                 QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw);
+                // Reshape to 2D first, then apply bias to avoid 3D broadcasting
+                QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
                 if (layer.cca_conv_dw_b) {
                     QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b);
                 }
                 cb(QK, "QK_dw", il);
             }
 
-            // conv_qk.1 (grouped, causal)
+            // conv_qk.1 (grouped, causal) — operate on {n_tokens, n_qk} format
             {
-                ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk);
+                ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
+                ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk);
                 pad = ggml_scale(ctx0, pad, 0.0f);
-                ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0);
+                ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);
 
                 QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups);
+                // conv output is {OL, OC, N} -> reshape to {OC, OL}, then add bias
+                QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
                 QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b);
                 cb(QK, "QK_grp", il);
             }
 
-            // Transpose back to [n_qk, n_tokens]
-            QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
+            // QK is now [n_qk, n_tokens]
 
             // Split Q_conv, K_conv
             ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens,
@@ -217,13 +222,16 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
             // Per-KV-head temperature scaling on K
             // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens]
+            Kcur = ggml_cont(ctx0, Kcur);
             Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
             // cca_k_scale: [n_head_kv] → broadcast
             Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale);
             cb(Kcur, "Kcur_scaled", il);
 
             // Reshape for attention
+            Qcur = ggml_cont(ctx0, Qcur);
             Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+            Vcur = ggml_cont(ctx0, Vcur);
             Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
 
             // GQA attention
@@ -259,8 +267,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(router_h, "router_logits", il);
 
             // Take only the first 16 logits (expert routing), ignore MOD skip (index 16)
-            ggml_tensor * gate_inp = ggml_view_2d(ctx0, router_h, n_expert, n_tokens,
-                router_h->nb[1], 0);
+            ggml_tensor * gate_inp = ggml_cont(ctx0,
+                ggml_view_2d(ctx0, router_h, n_expert, n_tokens,
+                    router_h->nb[1], 0));
             cb(gate_inp, "gate_inp", il);
 
             // MoE FFN with topk=1 (pass router logits as probs_in)

From 109856e8fa688e9bf4453db98c687e2de85051b0 Mon Sep 17 00:00:00 2001
From: Ganesh Nanduru <ganesh@zyphra.com>
Date: Mon, 11 May 2026 21:42:49 -0600
Subject: [PATCH 06/33] zaya generation running

---
 common/debug.cpp          |  14 +-
 convert_hf_to_gguf.py     |  15 +-
 gguf-py/gguf/constants.py |  12 ++
 src/llama-arch.cpp        |   8 +
 src/llama-arch.h          |   4 +
 src/llama-graph.cpp       |   4 +
 src/llama-model.cpp       |   9 +-
 src/llama-model.h         |   6 +
 src/models/zaya.cpp       | 312 +++++++++++++++++++++++++-------------
 9 files changed, 270 insertions(+), 114 deletions(-)

diff --git a/common/debug.cpp b/common/debug.cpp
index 102c6924dc9..60cb5fd9b4a 100644
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -144,13 +144,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
     auto * cb_data = (common_debug_cb_user_data *) user_data;
     auto * pimpl = cb_data->pimpl.get();
 
-    const struct ggml_tensor * src0 = t->src[0];
-    const struct ggml_tensor * src1 = t->src[1];
-
-    if (ask) {
-        return true;  // Always retrieve data
-    }
-
     bool matches_filter = pimpl->tensor_filters.empty();
 
     if (!matches_filter) {
@@ -162,6 +155,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
         }
     }
 
+    if (ask) {
+        return matches_filter;
+    }
+
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
     char src1_str[128] = { 0 };
     if (src1) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 41d150e30ac..1e1adb10fe4 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6551,6 +6551,16 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu
         elif "residual_bias" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch
 
+    def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]:
+        if "hidden_states_scale" in name:
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch
+        elif "hidden_states_bias" in name:
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch
+        elif "residual_scale" in name:
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch
+        elif "residual_bias" in name:
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch
+
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Common tensors
         if name == "model.embed_tokens.weight":
@@ -6562,6 +6572,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         if name == "model.final_norm.weight":
             yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch
             return
+        if name.startswith("model.res_scale."):
+            yield from self._map_final_res_scale(name, data_torch)
+            return
 
         # Block-level tensors
         if bid is not None:
@@ -6599,7 +6612,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
                 if len(self._experts[bid]) >= n_expert * 2:
                     for w_name, gguf_tensor, permute_dims in [
                         ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None),
-                        ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, (0, 2, 1)),
+                        ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, None),
                     ]:
                         datas: list[Tensor] = []
                         for xid in range(n_expert):
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index de599da4a0b..57a67cb559f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -622,6 +622,10 @@ class MODEL_TENSOR(IntEnum):
     RES_SCALE_HS_B       = auto() # Zaya: hidden_states_bias
     RES_SCALE_RES        = auto() # Zaya: residual_scale
     RES_SCALE_RES_B      = auto() # Zaya: residual_bias
+    RES_SCALE_HS_FINAL   = auto() # Zaya: final hidden_states_scale
+    RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias
+    RES_SCALE_RES_FINAL  = auto() # Zaya: final residual_scale
+    RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias
     ZAYA_ROUTER_DOWN     = auto() # Zaya
     ZAYA_ROUTER_DOWN_B   = auto() # Zaya
     ZAYA_ROUTER_NORM     = auto() # Zaya
@@ -1157,6 +1161,10 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.RES_SCALE_HS_B:            "blk.{bid}.res_scale_hs_b",       # Zaya
     MODEL_TENSOR.RES_SCALE_RES:             "blk.{bid}.res_scale_res",        # Zaya
     MODEL_TENSOR.RES_SCALE_RES_B:           "blk.{bid}.res_scale_res_b",      # Zaya
+    MODEL_TENSOR.RES_SCALE_HS_FINAL:        "res_scale_hs",                   # Zaya
+    MODEL_TENSOR.RES_SCALE_HS_B_FINAL:      "res_scale_hs_b",                 # Zaya
+    MODEL_TENSOR.RES_SCALE_RES_FINAL:       "res_scale_res",                  # Zaya
+    MODEL_TENSOR.RES_SCALE_RES_B_FINAL:     "res_scale_res_b",                # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN:          "blk.{bid}.zaya_router_down",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_NORM:          "blk.{bid}.zaya_router_norm",     # Zaya
@@ -4055,6 +4063,10 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.RES_SCALE_HS_B,
         MODEL_TENSOR.RES_SCALE_RES,
         MODEL_TENSOR.RES_SCALE_RES_B,
+        MODEL_TENSOR.RES_SCALE_HS_FINAL,
+        MODEL_TENSOR.RES_SCALE_HS_B_FINAL,
+        MODEL_TENSOR.RES_SCALE_RES_FINAL,
+        MODEL_TENSOR.RES_SCALE_RES_B_FINAL,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
         MODEL_TENSOR.ZAYA_ROUTER_NORM,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 3bebc529300..9bdd0023028 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -429,6 +429,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_RES_SCALE_HS_B,                         "blk.%d.res_scale_hs_b" },
     { LLM_TENSOR_RES_SCALE_RES,                          "blk.%d.res_scale_res" },
     { LLM_TENSOR_RES_SCALE_RES_B,                        "blk.%d.res_scale_res_b" },
+    { LLM_TENSOR_RES_SCALE_HS_FINAL,                     "res_scale_hs" },
+    { LLM_TENSOR_RES_SCALE_HS_B_FINAL,                   "res_scale_hs_b" },
+    { LLM_TENSOR_RES_SCALE_RES_FINAL,                    "res_scale_res" },
+    { LLM_TENSOR_RES_SCALE_RES_B_FINAL,                  "res_scale_res_b" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN,                       "blk.%d.zaya_router_down" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
     { LLM_TENSOR_ZAYA_ROUTER_NORM,                       "blk.%d.zaya_router_norm" },
@@ -693,6 +697,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_RES_SCALE_HS_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_RES_SCALE_RES,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_RES_SCALE_RES_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
+    {LLM_TENSOR_RES_SCALE_HS_FINAL,         {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
+    {LLM_TENSOR_RES_SCALE_HS_B_FINAL,       {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
+    {LLM_TENSOR_RES_SCALE_RES_FINAL,        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
+    {LLM_TENSOR_RES_SCALE_RES_B_FINAL,      {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_NORM,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 72c5abddac1..30a3f9a444a 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -458,6 +458,10 @@ enum llm_tensor {
     LLM_TENSOR_RES_SCALE_HS_B,      // zaya: hidden_states_bias
     LLM_TENSOR_RES_SCALE_RES,       // zaya: residual_scale
     LLM_TENSOR_RES_SCALE_RES_B,     // zaya: residual_bias
+    LLM_TENSOR_RES_SCALE_HS_FINAL,  // zaya: final hidden_states_scale
+    LLM_TENSOR_RES_SCALE_HS_B_FINAL,// zaya: final hidden_states_bias
+    LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale
+    LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias
     // ZAYA Router (MoE gating)
     LLM_TENSOR_ZAYA_ROUTER_DOWN,      // zaya: router down_proj weight
     LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index fe155c92dea..e4f0ff98ef4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -1405,6 +1405,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn(
             {
                 probs = logits; // [n_expert, n_tokens]
             } break;
+        case LLAMA_EXPERT_GATING_FUNC_TYPE_NONE:
+            {
+                probs = logits; // already-normalized expert probabilities
+            } break;
         default:
             GGML_ABORT("fatal error");
     }
diff --git a/src/llama-model.cpp b/src/llama-model.cpp
index 656767318f2..3de55045f5c 100644
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -1957,6 +1957,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                     if (arch == LLM_ARCH_FALCON_H1) {
                         filter_attn = [&](int32_t) { return true; };
                         filter_recr = [&](int32_t) { return true; };
+                    } else if (arch == LLM_ARCH_ZAYA) {
+                        filter_attn = [&](int32_t il) {
+                            return il % 2 == 0;
+                        };
+                        filter_recr = [&](int32_t il) {
+                            return il % 2 == 0;
+                        };
                     } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) {
                         filter_attn = [&](int32_t il) {
                             return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0;
@@ -2208,7 +2215,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_NEMOTRON_H:
         case LLM_ARCH_NEMOTRON_H_MOE:
         case LLM_ARCH_KIMI_LINEAR:
-        case LLM_ARCH_ZAYA:
             return LLAMA_ROPE_TYPE_NONE;
 
         // use what we call a normal RoPE, operating on pairs of consecutive head values
@@ -2311,6 +2317,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) {
         case LLM_ARCH_QWEN3NEXT:
         case LLM_ARCH_MIMO2:
         case LLM_ARCH_STEP35:
+        case LLM_ARCH_ZAYA:
             return LLAMA_ROPE_TYPE_NEOX;
 
         case LLM_ARCH_QWEN2VL:
diff --git a/src/llama-model.h b/src/llama-model.h
index d9da4b318bd..01ce976fe3e 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -561,6 +561,12 @@ struct llama_model {
     struct ggml_tensor * output_b        = nullptr;
     struct ggml_tensor * output_norm_enc = nullptr;
 
+    // Zaya final residual scaling
+    struct ggml_tensor * zaya_res_scale_hs    = nullptr;
+    struct ggml_tensor * zaya_res_scale_hs_b  = nullptr;
+    struct ggml_tensor * zaya_res_scale_res   = nullptr;
+    struct ggml_tensor * zaya_res_scale_res_b = nullptr;
+
     // classifier
     struct ggml_tensor * cls       = nullptr;
     struct ggml_tensor * cls_b     = nullptr;
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 434fa31585b..89e354450bb 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -1,11 +1,23 @@
 #include "models.h"
 
 #include "ggml.h"
+#include "llama-memory-recurrent.h"
+
+#include <cmath>
 
 void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
 
+    const uint32_t n_qk = (hparams.n_head() + hparams.n_head_kv()) * hparams.n_embd_head_k();
+    hparams.ssm_d_inner = 2*n_qk + hparams.n_embd; // CCA conv state + delayed value stream state
+    hparams.ssm_d_state = 1;
+    hparams.ssm_n_group = 0;
+
+    for (uint32_t i = 0; i < hparams.n_layer; ++i) {
+        hparams.recurrent_layer_arr[i] = (i % 2) == 0;
+    }
+
     switch (hparams.n_layer) {
         case 80: type = LLM_TYPE_8B; break;
         default: type = LLM_TYPE_UNKNOWN;
@@ -26,6 +38,11 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
         output = tok_embd;
     }
 
+    zaya_res_scale_hs    = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
+    zaya_res_scale_hs_b  = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B_FINAL,  "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
+    zaya_res_scale_res   = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL,   "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
+    zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B_FINAL, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
+
     const int64_t n_embd_head = hparams.n_embd_head_k();
     const int64_t d_conv      = hparams.ssm_d_conv;
     // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B)
@@ -113,8 +130,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     : llm_graph_context(params) {
 
     const int64_t n_embd_head = hparams.n_embd_head_k();
-    const int64_t d_conv      = hparams.ssm_d_conv;
     const int64_t n_expert    = hparams.n_expert;
+    const int64_t n_seqs      = ubatch.n_seqs;
+
+    GGML_ASSERT(n_seqs != 0);
+    GGML_ASSERT(ubatch.equal_seqs());
+    GGML_ASSERT(n_tokens % n_seqs == 0);
+
+    const int64_t n_seq_tokens = n_tokens / n_seqs;
 
     ggml_tensor * cur;
     ggml_tensor * inpL;
@@ -122,8 +145,24 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     inpL = build_inp_embd(model.tok_embd);
 
     auto * inp = build_inp_mem_hybrid();
+    auto * inp_recr = inp->get_recr();
 
+    ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
+    ggml_tensor * residual    = nullptr;
+    ggml_tensor * prev_router = nullptr;
+
+    const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) {
+        if (scale == nullptr) {
+            return x;
+        }
+        if (bias != nullptr) {
+            x = ggml_add(ctx0, x, bias);
+        }
+        x = ggml_mul(ctx0, x, scale);
+        cb(x, name, il);
+        return x;
+    };
 
     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers[il];
@@ -134,15 +173,41 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         const int64_t n_embd_k  = n_head_kv * n_embd_head;
         const int64_t n_qk      = n_embd_q + n_embd_k;
         const int64_t n_groups  = n_head + n_head_kv;
+        const int64_t n_gqa     = n_head / n_head_kv;
 
-        ggml_tensor * inpSA = inpL;
+        ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
+        if (residual != nullptr) {
+            residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il);
+            residual = ggml_add(ctx0, hidden_states, residual);
+        } else {
+            residual = hidden_states;
+        }
+        cb(residual, "residual", il);
 
         // Pre-norm
-        cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
-        cb(cur, "attn_norm", il);
+        cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
+        cb(cur, "input_norm", il);
 
         if (il % 2 == 0) {
             // ===== CCA Attention =====
+            const int64_t conv_state_size = 2*n_qk;
+            const int64_t cca_state_size  = conv_state_size + n_embd;
+            GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size);
+
+            ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il);
+            ggml_tensor * cca_state     = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs);
+            cb(cca_state, "cca_state", il);
+
+            ggml_tensor * conv_state = ggml_view_3d(ctx0, cca_state, 2, n_qk, n_seqs,
+                    2*ggml_element_size(cca_state),
+                    cca_state->nb[1],
+                    0);
+            cb(conv_state, "cca_conv_state", il);
+
+            ggml_tensor * prev_hs = ggml_view_2d(ctx0, cca_state, n_embd, n_seqs,
+                    cca_state->nb[1],
+                    conv_state_size*ggml_element_size(cca_state));
+            cb(prev_hs, "cca_prev_hs", il);
 
             // Q, K projections
             ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur);
@@ -150,89 +215,121 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur);
             cb(Kraw, "Kraw", il);
 
-            // V = concat(val_proj1(x), val_proj2(x)) → [n_embd_k, n_tokens]
+            // HF uses a delayed hidden-state stream for val_proj2. During decode this
+            // comes from the recurrent state; during prefill it is a one-token shift.
+            ggml_tensor * cur_state_src = ggml_cont(ctx0, cur);
+            ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs);
+
+            ggml_tensor * hs_d = ggml_reshape_3d(ctx0, prev_hs, n_embd, 1, n_seqs);
+            if (n_seq_tokens > 1) {
+                ggml_tensor * cur_shift = ggml_view_3d(ctx0, cur_seq, n_embd, n_seq_tokens - 1, n_seqs,
+                        cur_seq->nb[1],
+                        cur_seq->nb[2],
+                        0);
+                hs_d = ggml_concat(ctx0, hs_d, cur_shift, 1);
+            }
+            hs_d = ggml_reshape_2d(ctx0, hs_d, n_embd, n_tokens);
+            cb(hs_d, "cca_hs_d", il);
+
+            // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens]
             ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur);
             cb(V1, "V1", il);
-            ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, cur);
+            ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d);
             cb(V2, "V2", il);
             ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0);
             cb(Vcur, "Vcur", il);
 
             // Concat Q+K for conv: [n_qk, n_tokens]
-            ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0);
-            cb(QK, "QK_cat", il);
-
-            // conv_qk.0 (depthwise, causal)
-            {
-                ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
-                // ggml_ssm_conv requires 3D input: {1 + n_tokens, n_qk, 1}
-                // Use view_3d on the contiguous 2D tensor to add a batch dimension
-                QK_t = ggml_view_3d(ctx0, QK_t, n_tokens, n_qk, 1, QK_t->nb[1], QK_t->nb[1] * n_qk, 0);
-                ggml_tensor * pad = ggml_new_tensor_3d(ctx0, QK_t->type, d_conv - 1, n_qk, 1);
-                pad = ggml_scale(ctx0, pad, 0.0f);
-                ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);
-
-                QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw);
-                // Reshape to 2D first, then apply bias to avoid 3D broadcasting
-                QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
-                if (layer.cca_conv_dw_b) {
-                    QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b);
-                }
-                cb(QK, "QK_dw", il);
+            ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0);
+            cb(QKraw, "QKraw", il);
+
+            ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens);
+            ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens);
+
+            ggml_tensor * Kpre_grouped = ggml_reshape_4d(ctx0, Kpre, n_embd_head, 1, n_head_kv, n_tokens);
+            Kpre_grouped = ggml_repeat_4d(ctx0, Kpre_grouped, n_embd_head, n_gqa, n_head_kv, n_tokens);
+            ggml_tensor * Kpre_rep = ggml_reshape_3d(ctx0, Kpre_grouped, n_embd_head, n_head, n_tokens);
+            ggml_tensor * qk_mean_q = ggml_scale(ctx0, ggml_add(ctx0, Qpre, Kpre_rep), 0.5f);
+            cb(qk_mean_q, "qk_mean_q", il);
+
+            ggml_tensor * Qgroup = ggml_reshape_4d(ctx0, Qpre, n_embd_head, n_gqa, n_head_kv, n_tokens);
+            Qgroup = ggml_permute(ctx0, Qgroup, 1, 0, 2, 3);
+            Qgroup = ggml_cont(ctx0, Qgroup);
+            ggml_tensor * Qmean = ggml_mean(ctx0, Qgroup);
+            Qmean = ggml_reshape_3d(ctx0, Qmean, n_embd_head, n_head_kv, n_tokens);
+            ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f);
+            cb(qk_mean_k, "qk_mean_k", il);
+
+            ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw));
+            QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs);
+
+            ggml_tensor * conv_input = ggml_concat(ctx0, conv_state, QKraw_t, 0);
+            cb(conv_input, "cca_conv_input", il);
+
+            ggml_tensor * last_conv_states = ggml_view_3d(ctx0, conv_input, 2, n_qk, n_seqs,
+                    conv_input->nb[1],
+                    conv_input->nb[2],
+                    n_seq_tokens*conv_input->nb[0]);
+            cb(last_conv_states, "cca_last_conv_states", il);
+
+            const auto kv_head = inp_recr->mctx->get_head();
+            ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs,
+                    cca_state_all->nb[1],
+                    kv_head*cca_state_size*ggml_element_size(cca_state_all));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, conv_state_update_target));
+
+            ggml_tensor * last_hs = ggml_view_2d(ctx0, cur_seq, n_embd, n_seqs,
+                    cur_seq->nb[2],
+                    (n_seq_tokens - 1)*cur_seq->nb[1]);
+            ggml_tensor * prev_hs_update_target = ggml_view_2d(ctx0, cca_state_all, n_embd, n_seqs,
+                    cca_state_all->nb[1],
+                    (kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all));
+            ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target));
+
+            ggml_tensor * conv_dw = layer.cca_conv_dw;
+            if (conv_dw->type != GGML_TYPE_F32) {
+                conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F32);
             }
-
-            // conv_qk.1 (grouped, causal) — operate on {n_tokens, n_qk} format
-            {
-                ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK));
-                ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk);
-                pad = ggml_scale(ctx0, pad, 0.0f);
-                ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0);
-
-                QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups);
-                // conv output is {OL, OC, N} -> reshape to {OC, OL}, then add bias
-                QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
-                QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b);
-                cb(QK, "QK_grp", il);
+            conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk);
+            ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1);
+            if (layer.cca_conv_dw_b) {
+                QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1));
             }
+            cb(QK, "QK_dw", il);
 
-            // QK is now [n_qk, n_tokens]
+            QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK, 1, 0, 1, n_groups);
+            QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1));
+            cb(QK, "QK_grp", il);
 
-            // Split Q_conv, K_conv
-            ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens,
-                QK->nb[1], 0);
-            ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens,
-                QK->nb[1], n_embd_q * ggml_element_size(QK));
+            QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
+            QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
 
-            // QK mean skip connection
-            ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f);
-            ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f);
+            ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0);
+            ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, QK->nb[1], n_embd_q*ggml_element_size(QK));
+
+            ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens);
+            ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens);
+
+            Qcur = ggml_add(ctx0, Qcur, qk_mean_q);
+            Kcur = ggml_add(ctx0, Kcur, qk_mean_k);
+
+            Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head));
+            Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head));
+            Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1));
+            cb(Qcur, "Qcur_pre_rope", il);
+            cb(Kcur, "Kcur_pre_rope", il);
+
+            ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
+            Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
+            Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors,
+                    n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
+                    ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);
             cb(Kcur, "Kcur", il);
 
-            // RMSNorm on concat(Q, K) — weightless (unit RMSNorm)
-            ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0);
-            QK_for_norm = build_norm(QK_for_norm, nullptr, nullptr, LLM_NORM_RMS, il);
-            cb(QK_for_norm, "QK_normed", il);
-
-            // Split back
-            Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens,
-                QK_for_norm->nb[1], 0);
-            Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens,
-                QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm));
-
-            // Per-KV-head temperature scaling on K
-            // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens]
-            Kcur = ggml_cont(ctx0, Kcur);
-            Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
-            // cca_k_scale: [n_head_kv] → broadcast
-            Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale);
-            cb(Kcur, "Kcur_scaled", il);
-
-            // Reshape for attention
-            Qcur = ggml_cont(ctx0, Qcur);
-            Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
-            Vcur = ggml_cont(ctx0, Vcur);
-            Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens);
+            Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens);
 
             // GQA attention
             cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr,
@@ -244,77 +341,82 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             // ===== MoE Layer =====
 
             // Build Zaya router network:
-            // down_proj → RMSNorm → SiLU(MLP0) → MLP2 → MLP4 → 17 logits → take first 16
+            // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits.
 
             ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b);
             cb(router_h, "router_down", il);
 
+            if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
+                router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale));
+                cb(router_h, "router_eda", il);
+            }
+
+            prev_router = router_h;
+
             router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il);
             cb(router_h, "router_norm", il);
 
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b);
-            router_h = ggml_silu(ctx0, router_h);
+            router_h = ggml_gelu(ctx0, router_h);
             cb(router_h, "router_mlp0", il);
 
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b);
+            router_h = ggml_gelu(ctx0, router_h);
             cb(router_h, "router_mlp2", il);
 
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h);
-            // router_h now has shape [17, n_tokens] — 16 expert logits + 1 MOD skip
             cb(router_h, "router_logits", il);
 
-            // Take only the first 16 logits (expert routing), ignore MOD skip (index 16)
-            ggml_tensor * gate_inp = ggml_cont(ctx0,
-                ggml_view_2d(ctx0, router_h, n_expert, n_tokens,
-                    router_h->nb[1], 0));
-            cb(gate_inp, "gate_inp", il);
+            ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h);
+            cb(router_probs, "router_probs", il);
+
+            // Keep the MOD skip expert in the softmax denominator, then route
+            // over real experts only. The checkpoint's skip bias keeps MOD unused.
+            ggml_tensor * gate_probs = ggml_cont(ctx0,
+                    ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0));
+            cb(gate_probs, "gate_probs", il);
+
+            ggml_tensor * expert_biases = nullptr;
+            if (layer.zaya_router_biases != nullptr) {
+                expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0);
+            }
 
-            // MoE FFN with topk=1 (pass router logits as probs_in)
             cur = build_moe_ffn(cur,
                 /* gate_inp */        nullptr,
                 /* up_exps */         nullptr,
                 /* gate_exps */       nullptr,
                 /* down_exps */       layer.ffn_down_exps,
-                /* exp_probs_b */     nullptr,
+                /* exp_probs_b */     expert_biases,
                 /* n_expert */        n_expert,
                 /* n_expert_used */   hparams.n_expert_used,
                 /* type_op */         LLM_FFN_SILU,
                 /* norm_w */          false,
                 /* w_scale */         1.0f,
-                /* gating_op */       LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX,
+                /* gating_op */       LLAMA_EXPERT_GATING_FUNC_TYPE_NONE,
                 /* il */              il,
-                /* probs_in */        gate_inp,
+                /* probs_in */        gate_probs,
                 /* gate_up_exps */    layer.ffn_gate_up_exps);
             cb(cur, "moe_out", il);
         }
 
-        // select output tokens on last layer
-        if (il == n_layer - 1 && inp_out_ids) {
-            cur   = ggml_get_rows(ctx0, cur,   inp_out_ids);
-            inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids);
-        }
-
-        // Residual scaling: cur = hs_scale * cur + hs_bias
-        cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.res_scale_hs), layer.res_scale_hs_b);
-        cb(cur, "scaled_out", il);
-
-        // Residual scaling: inpSA = res_scale * inpSA + res_bias (if present)
-        if (layer.res_scale_res) {
-            inpSA = ggml_add(ctx0, ggml_mul(ctx0, inpSA, layer.res_scale_res), layer.res_scale_res_b);
-            cb(inpSA, "scaled_residual", il);
-        }
-
-        // Residual add
-        cur = ggml_add(ctx0, cur, inpSA);
-        cb(cur, "l_out", il);
-
         inpL = cur;
     }
 
-    cur = inpL;
+    ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1);
+    if (residual != nullptr) {
+        residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1);
+        cur = ggml_add(ctx0, final_hidden, residual);
+    } else {
+        cur = final_hidden;
+    }
+    cb(cur, "final_residual", -1);
+
+    if (inp_out_ids) {
+        cur = ggml_get_rows(ctx0, cur, inp_out_ids);
+    }
 
     // final norm
     cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);

From c3ff41c0263b0ee73cec52ffe12776111f196db1 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 10:31:16 +0200
Subject: [PATCH 07/33] refactor: replace CCA_CONV_DW with generic SSM_CONV1D
 constant

- Remove LLM_TENSOR_CCA_CONV_DW and LLM_TENSOR_CCA_CONV_DW_B from llama-arch.h
- Update tensor name mappings in llama-arch.cpp to use SSM_CONV1D
- Remove CCA_CONV_DW and CCA_CONV_DW_B from gguf constants.py
- Update MODEL_ARCH.ZAYA1 tensor list to use SSM_CONV1D
- Update zaya.cpp to create tensors using LLM_TENSOR_SSM_CONV1D
- Update convert_hf_to_gguf.py to map conv_qk.0 to SSM_CONV1D
- Add HuggingFace tensor mapping for zaya conv_qk.0 to SSM_CONV1D in
  gguf-py/gguf/tensor_mapping.py

This improves consistency by reusing the SSM_CONV1D constant that other
SSM-based architectures (mamba, jamba, etc.) already use.
---
 convert_hf_to_gguf.py          | 4 ++--
 gguf-py/gguf/constants.py      | 7 +------
 gguf-py/gguf/tensor_mapping.py | 1 +
 src/llama-arch.cpp             | 4 ----
 src/llama-arch.h               | 2 --
 src/models/zaya.cpp            | 4 ++--
 6 files changed, 6 insertions(+), 16 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1e1adb10fe4..fc39900623e 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6508,9 +6508,9 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st
         elif "conv_qk.0" in name and name.endswith(".weight"):
             # PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk}
             data_torch = data_torch.squeeze(1).contiguous()
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.SSM_CONV1D, bid), data_torch
         elif "conv_qk.0" in name and name.endswith(".bias"):
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.SSM_CONV1D, bid, suffix=".bias"), data_torch
         elif "conv_qk.1" in name and name.endswith(".weight"):
             # PyTorch: [n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk}
             yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 57a67cb559f..f3cba8fd7d0 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -611,9 +611,7 @@ class MODEL_TENSOR(IntEnum):
     SSM_BETA             = auto() # Kimi Linear qwen3.5
     SSM_G_A              = auto() # Kimi Linear
     SSM_G_B              = auto() # Kimi Linear
-    CCA_CONV_DW          = auto() # Zaya
     CCA_CONV_GRP         = auto() # Zaya
-    CCA_CONV_DW_B        = auto() # Zaya: conv_qk.0.bias
     CCA_QK_NORM          = auto() # Zaya (weightless - unit RMSNorm)
     CCA_K_SCALE          = auto() # Zaya
     CCA_VAL_PROJ1        = auto() # Zaya: CCA value projection stream 1
@@ -1150,8 +1148,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SSM_BETA:                  "blk.{bid}.ssm_beta",             # Kimi Linear qwen3.5
     MODEL_TENSOR.SSM_G_A:                   "blk.{bid}.ssm_g_a",              # Kimi Linear
     MODEL_TENSOR.SSM_G_B:                   "blk.{bid}.ssm_g_b",              # Kimi Linear
-    MODEL_TENSOR.CCA_CONV_DW:               "blk.{bid}.cca_conv_dw",          # Zaya
-    MODEL_TENSOR.CCA_CONV_DW_B:             "blk.{bid}.cca_conv_dw_b",        # Zaya
     MODEL_TENSOR.CCA_CONV_GRP:              "blk.{bid}.cca_conv_grp",         # Zaya
     MODEL_TENSOR.CCA_QK_NORM:               "blk.{bid}.cca_qk_norm",          # Zaya
     MODEL_TENSOR.CCA_K_SCALE:               "blk.{bid}.cca_k_scale",          # Zaya
@@ -4052,8 +4048,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_Q,
         MODEL_TENSOR.ATTN_K,
         MODEL_TENSOR.ATTN_OUT,
-        MODEL_TENSOR.CCA_CONV_DW,
-        MODEL_TENSOR.CCA_CONV_DW_B,
+        MODEL_TENSOR.SSM_CONV1D,
         MODEL_TENSOR.CCA_CONV_GRP,
         MODEL_TENSOR.CCA_QK_NORM,
         MODEL_TENSOR.CCA_K_SCALE,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index fbd22ccb6a3..f89f483635c 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -800,6 +800,7 @@ class TensorNameMap:
             "model.layers.{bid}.mamba.conv1d",         # jamba falcon-h1 granite-hybrid
             "model.layers.layers.{bid}.mixer.conv1d",  # plamo2
             "model.layers.{bid}.linear_attn.conv1d",   # qwen3next
+            "model.layers.{bid}.self_attn.conv_qk.0",  # zaya
         ),
 
         MODEL_TENSOR.SSM_X: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 9bdd0023028..fa10603eb6a 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -418,8 +418,6 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_SSM_BETA,                               "blk.%d.ssm_beta" },
     { LLM_TENSOR_SSM_G_A,                                "blk.%d.ssm_g_a" },
     { LLM_TENSOR_SSM_G_B,                                "blk.%d.ssm_g_b" },
-    { LLM_TENSOR_CCA_CONV_DW,                            "blk.%d.cca_conv_dw" },
-    { LLM_TENSOR_CCA_CONV_DW_B,                         "blk.%d.cca_conv_dw_b" },
     { LLM_TENSOR_CCA_CONV_GRP,                           "blk.%d.cca_conv_grp" },
     { LLM_TENSOR_CCA_QK_NORM,                            "blk.%d.cca_qk_norm" },
     { LLM_TENSOR_CCA_K_SCALE,                            "blk.%d.cca_k_scale" },
@@ -686,8 +684,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_G_A,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_SSM_G_B,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // ZAYA CCA
-    {LLM_TENSOR_CCA_CONV_DW,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}},
-    {LLM_TENSOR_CCA_CONV_DW_B,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_CCA_CONV_GRP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CCA_QK_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CCA_K_SCALE,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 30a3f9a444a..07078d15e60 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -446,8 +446,6 @@ enum llm_tensor {
     LLM_TENSOR_SSM_G_A,             // kimi: output gate projection A
     LLM_TENSOR_SSM_G_B,             // kimi: output gate projection B
     // ZAYA CCA (Compressed Convolutional Attention)
-    LLM_TENSOR_CCA_CONV_DW,         // zaya: depthwise conv1d (conv_qk.0)
-    LLM_TENSOR_CCA_CONV_DW_B,       // zaya: depthwise conv1d bias
     LLM_TENSOR_CCA_CONV_GRP,        // zaya: grouped conv1d  (conv_qk.1)
     LLM_TENSOR_CCA_QK_NORM,         // zaya: RMSNorm on concat(Q,K)
     LLM_TENSOR_CCA_K_SCALE,         // zaya: learned K temperature
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 89e354450bb..da35514ce62 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -74,8 +74,8 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
 
-            layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0);
-            layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
+            layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, n_qk}, 0);
+            layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
 
             layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i),
                 {d_conv, n_qk / n_groups, n_qk}, 0);

From a06da04c0f30d80d9734b36c0bbae8ada2c4674f Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 10:40:16 +0200
Subject: [PATCH 08/33] refactor: replace ZAYA_ROUTER_NORM with generic
 FFN_NORM constant

- Remove LLM_TENSOR_ZAYA_ROUTER_NORM from llama-arch.h
- Remove the ZAYA_ROUTER_NORM entries from the tensor name/info tables in llama-arch.cpp
- Remove ZAYA_ROUTER_NORM from gguf constants.py
- Update MODEL_ARCH.ZAYA1 tensor list to use FFN_NORM
- Update zaya.cpp to create router norm tensor using LLM_TENSOR_FFN_NORM
- Update convert_hf_to_gguf.py to map rmsnorm_eda to FFN_NORM
- Add HuggingFace tensor mapping for zaya rmsnorm_eda to FFN_NORM

Router normalization is a standard FFN norm (RMSNorm), making this
a semantically correct replacement that reduces custom constants.
---
 convert_hf_to_gguf.py          | 2 +-
 gguf-py/gguf/constants.py      | 4 +---
 gguf-py/gguf/tensor_mapping.py | 1 +
 src/llama-arch.cpp             | 2 --
 src/llama-arch.h               | 1 -
 src/models/zaya.cpp            | 2 +-
 6 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index fc39900623e..bdc6bd24df5 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6525,7 +6525,7 @@ def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple
         elif "down_proj.bias" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch
         elif "rmsnorm_eda" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_NORM, bid), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch
         elif "router_mlp.0.weight" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch
         elif "router_mlp.0.bias" in name:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index f3cba8fd7d0..993a676cd11 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -626,7 +626,6 @@ class MODEL_TENSOR(IntEnum):
     RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias
     ZAYA_ROUTER_DOWN     = auto() # Zaya
     ZAYA_ROUTER_DOWN_B   = auto() # Zaya
-    ZAYA_ROUTER_NORM     = auto() # Zaya
     ZAYA_ROUTER_MLP0     = auto() # Zaya
     ZAYA_ROUTER_MLP0_B   = auto() # Zaya
     ZAYA_ROUTER_MLP2     = auto() # Zaya
@@ -1163,7 +1162,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.RES_SCALE_RES_B_FINAL:     "res_scale_res_b",                # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN:          "blk.{bid}.zaya_router_down",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
-    MODEL_TENSOR.ZAYA_ROUTER_NORM:          "blk.{bid}.zaya_router_norm",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP0:          "blk.{bid}.zaya_router_mlp0",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP0_B:        "blk.{bid}.zaya_router_mlp0_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP2:          "blk.{bid}.zaya_router_mlp2",     # Zaya
@@ -4064,7 +4062,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.RES_SCALE_RES_B_FINAL,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
-        MODEL_TENSOR.ZAYA_ROUTER_NORM,
+        MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.ZAYA_ROUTER_MLP0,
         MODEL_TENSOR.ZAYA_ROUTER_MLP0_B,
         MODEL_TENSOR.ZAYA_ROUTER_MLP2,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index f89f483635c..a2467f57132 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -403,6 +403,7 @@ class TensorNameMap:
             "model.layers.{bid}.feedforward_layernorm",                      # apertus
             "model.layers.{bid}.pre_mlp_layernorm",                          # kormo
+            "model.layers.{bid}.self_attn.rmsnorm_eda",                      # zaya
             "layers.{bid}.mlp_norm"                                          # modern-bert
         ),
 
         # Pre feed-forward norm
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index fa10603eb6a..7a06904d17e 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -433,7 +433,6 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_RES_SCALE_RES_B_FINAL,                  "res_scale_res_b" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN,                       "blk.%d.zaya_router_down" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
-    { LLM_TENSOR_ZAYA_ROUTER_NORM,                       "blk.%d.zaya_router_norm" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP0,                       "blk.%d.zaya_router_mlp0" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP0_B,                     "blk.%d.zaya_router_mlp0_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP2,                       "blk.%d.zaya_router_mlp2" },
@@ -699,7 +698,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_RES_SCALE_RES_B_FINAL,      {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_ZAYA_ROUTER_NORM,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP0,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP0_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP2,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 07078d15e60..a186a39c4b0 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -463,7 +463,6 @@ enum llm_tensor {
     // ZAYA Router (MoE gating)
     LLM_TENSOR_ZAYA_ROUTER_DOWN,      // zaya: router down_proj weight
     LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
-    LLM_TENSOR_ZAYA_ROUTER_NORM,      // zaya: router rmsnorm_eda weight
     LLM_TENSOR_ZAYA_ROUTER_MLP0,      // zaya: router MLP layer 0 weight
     LLM_TENSOR_ZAYA_ROUTER_MLP0_B,    // zaya: router MLP layer 0 bias
     LLM_TENSOR_ZAYA_ROUTER_MLP2,      // zaya: router MLP layer 2 weight
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index da35514ce62..239da53ae99 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -97,7 +97,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
                 {n_embd, n_ff_exp}, 0);
             layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i),
                 {n_ff_exp}, 0);
-            layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_NORM, "weight", i),
+            layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i),
                 {n_ff_exp}, 0);
             layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i),
                 {n_ff_exp, n_ff_exp}, 0);

From ed3820b43ef9234dfca5e30c8a919a71ceb6d3ee Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 10:48:46 +0200
Subject: [PATCH 09/33] refactor: replace ZAYA_ROUTER_DOWN with generic
 FFN_GATE_INP constant

- Remove LLM_TENSOR_ZAYA_ROUTER_DOWN from llama-arch.h
- Remove the ZAYA_ROUTER_DOWN entries from the tensor name/info tables in llama-arch.cpp
- Remove ZAYA_ROUTER_DOWN from gguf constants.py
- Update MODEL_ARCH.ZAYA1 tensor list to use FFN_GATE_INP
- Update zaya.cpp to create router down tensor using LLM_TENSOR_FFN_GATE_INP
- Update convert_hf_to_gguf.py to map down_proj.weight to FFN_GATE_INP
- Add HuggingFace tensor mapping for zaya router down_proj to FFN_GATE_INP

Router down projection is a linear projection similar to MoE gate input,
making this a semantically reasonable replacement.
---
 convert_hf_to_gguf.py          | 2 +-
 gguf-py/gguf/constants.py      | 4 +---
 gguf-py/gguf/tensor_mapping.py | 1 +
 src/llama-arch.cpp             | 2 --
 src/llama-arch.h               | 1 -
 src/models/zaya.cpp            | 2 +-
 6 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index bdc6bd24df5..1ca26918b06 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6521,7 +6521,7 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st
 
     def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
         if "down_proj.weight" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN, bid), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid), data_torch
         elif "down_proj.bias" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch
         elif "rmsnorm_eda" in name:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 993a676cd11..1d511fa6fbd 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -624,7 +624,6 @@ class MODEL_TENSOR(IntEnum):
     RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias
     RES_SCALE_RES_FINAL  = auto() # Zaya: final residual_scale
     RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias
-    ZAYA_ROUTER_DOWN     = auto() # Zaya
     ZAYA_ROUTER_DOWN_B   = auto() # Zaya
     ZAYA_ROUTER_MLP0     = auto() # Zaya
     ZAYA_ROUTER_MLP0_B   = auto() # Zaya
@@ -1160,7 +1159,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.RES_SCALE_HS_B_FINAL:      "res_scale_hs_b",                 # Zaya
     MODEL_TENSOR.RES_SCALE_RES_FINAL:       "res_scale_res",                  # Zaya
     MODEL_TENSOR.RES_SCALE_RES_B_FINAL:     "res_scale_res_b",                # Zaya
-    MODEL_TENSOR.ZAYA_ROUTER_DOWN:          "blk.{bid}.zaya_router_down",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP0:          "blk.{bid}.zaya_router_mlp0",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP0_B:        "blk.{bid}.zaya_router_mlp0_b",   # Zaya
@@ -4060,7 +4058,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.RES_SCALE_HS_B_FINAL,
         MODEL_TENSOR.RES_SCALE_RES_FINAL,
         MODEL_TENSOR.RES_SCALE_RES_B_FINAL,
-        MODEL_TENSOR.ZAYA_ROUTER_DOWN,
+        MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.ZAYA_ROUTER_MLP0,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a2467f57132..a3667c444dc 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -454,6 +454,7 @@ class TensorNameMap:
             "backbone.layers.{bid}.mixer.gate",                 # nemotron-h-moe
             "model.layers.{bid}.moe.gate",                      # step3.5
             "model.layers.{bid}.router.proj",                   # gemma4
+            "model.layers.{bid}.self_attn.router_mlp.down_proj", # zaya
         ),
 
         MODEL_TENSOR.FFN_GATE_INP_SHEXP: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 7a06904d17e..b2777fb15c7 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -431,7 +431,6 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_RES_SCALE_HS_B_FINAL,                   "res_scale_hs_b" },
     { LLM_TENSOR_RES_SCALE_RES_FINAL,                    "res_scale_res" },
     { LLM_TENSOR_RES_SCALE_RES_B_FINAL,                  "res_scale_res_b" },
-    { LLM_TENSOR_ZAYA_ROUTER_DOWN,                       "blk.%d.zaya_router_down" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP0,                       "blk.%d.zaya_router_mlp0" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP0_B,                     "blk.%d.zaya_router_mlp0_b" },
@@ -696,7 +695,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_RES_SCALE_HS_B_FINAL,       {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
     {LLM_TENSOR_RES_SCALE_RES_FINAL,        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
     {LLM_TENSOR_RES_SCALE_RES_B_FINAL,      {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
-    {LLM_TENSOR_ZAYA_ROUTER_DOWN,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP0,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP0_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a186a39c4b0..d0fb4c67cc7 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -461,7 +461,6 @@ enum llm_tensor {
     LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale
     LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias
     // ZAYA Router (MoE gating)
-    LLM_TENSOR_ZAYA_ROUTER_DOWN,      // zaya: router down_proj weight
     LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
     LLM_TENSOR_ZAYA_ROUTER_MLP0,      // zaya: router MLP layer 0 weight
     LLM_TENSOR_ZAYA_ROUTER_MLP0_B,    // zaya: router MLP layer 0 bias
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 239da53ae99..bf188000ad2 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -93,7 +93,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
         // MoE layers (odd indices)
         if (i % 2 == 1) {
             // Router network
-            layer.zaya_router_down   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN, "weight", i),
+            layer.zaya_router_down   = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),
                 {n_embd, n_ff_exp}, 0);
             layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i),
                 {n_ff_exp}, 0);

From 7de270f12c0534a855542e23402c4ed74be53210 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 10:55:23 +0200
Subject: [PATCH 10/33] refactor: replace ZAYA_ROUTER_MLP0 with generic
 FFN_GATE constant

- Remove LLM_TENSOR_ZAYA_ROUTER_MLP0 from llama-arch.h
- Remove the ZAYA_ROUTER_MLP0 entries from the tensor name/info tables in llama-arch.cpp
- Remove ZAYA_ROUTER_MLP0 from gguf constants.py
- Update MODEL_ARCH.ZAYA1 tensor list to use FFN_GATE
- Update zaya.cpp to create router mlp0 tensor using LLM_TENSOR_FFN_GATE
- Update convert_hf_to_gguf.py to map router_mlp.0.weight to FFN_GATE
- Add HuggingFace tensor mapping for zaya router_mlp.0 to FFN_GATE

Router MLP hidden layer is a linear projection similar to FFN gate,
making this a reasonable replacement for reducing custom constants.
---
 convert_hf_to_gguf.py          | 2 +-
 gguf-py/gguf/constants.py      | 4 +---
 gguf-py/gguf/tensor_mapping.py | 1 +
 src/llama-arch.cpp             | 2 --
 src/llama-arch.h               | 1 -
 src/models/zaya.cpp            | 2 +-
 6 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1ca26918b06..1f8fb5c1280 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6527,7 +6527,7 @@ def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple
         elif "rmsnorm_eda" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch
         elif "router_mlp.0.weight" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch
         elif "router_mlp.0.bias" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch
         elif "router_mlp.2.weight" in name:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1d511fa6fbd..494ca5fe0fe 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -625,7 +625,6 @@ class MODEL_TENSOR(IntEnum):
     RES_SCALE_RES_FINAL  = auto() # Zaya: final residual_scale
     RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias
     ZAYA_ROUTER_DOWN_B   = auto() # Zaya
-    ZAYA_ROUTER_MLP0     = auto() # Zaya
     ZAYA_ROUTER_MLP0_B   = auto() # Zaya
     ZAYA_ROUTER_MLP2     = auto() # Zaya
     ZAYA_ROUTER_MLP2_B   = auto() # Zaya
@@ -1160,7 +1159,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.RES_SCALE_RES_FINAL:       "res_scale_res",                  # Zaya
     MODEL_TENSOR.RES_SCALE_RES_B_FINAL:     "res_scale_res_b",                # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
-    MODEL_TENSOR.ZAYA_ROUTER_MLP0:          "blk.{bid}.zaya_router_mlp0",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP0_B:        "blk.{bid}.zaya_router_mlp0_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP2:          "blk.{bid}.zaya_router_mlp2",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP2_B:        "blk.{bid}.zaya_router_mlp2_b",   # Zaya
@@ -4061,7 +4059,7 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
         MODEL_TENSOR.FFN_NORM,
-        MODEL_TENSOR.ZAYA_ROUTER_MLP0,
+        MODEL_TENSOR.FFN_GATE,
         MODEL_TENSOR.ZAYA_ROUTER_MLP0_B,
         MODEL_TENSOR.ZAYA_ROUTER_MLP2,
         MODEL_TENSOR.ZAYA_ROUTER_MLP2_B,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index a3667c444dc..41cd9262434 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -567,6 +567,7 @@ class TensorNameMap:
             "model.transformer.blocks.{bid}.ff_proj",         # llada
             "layers.{bid}.mlp.gate_proj",                     # qwen3-embedding
             "model.layers.{bid}.mlp.language_mlp.gate_proj",  # cogvlm
+            "model.layers.{bid}.self_attn.router_mlp.0",      # zaya
         ),
 
         MODEL_TENSOR.FFN_GATE_EXP: (
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index b2777fb15c7..f8c3f57cb69 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -432,7 +432,6 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_RES_SCALE_RES_FINAL,                    "res_scale_res" },
     { LLM_TENSOR_RES_SCALE_RES_B_FINAL,                  "res_scale_res_b" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
-    { LLM_TENSOR_ZAYA_ROUTER_MLP0,                       "blk.%d.zaya_router_mlp0" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP0_B,                     "blk.%d.zaya_router_mlp0_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP2,                       "blk.%d.zaya_router_mlp2" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP2_B,                     "blk.%d.zaya_router_mlp2_b" },
@@ -696,7 +695,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_RES_SCALE_RES_FINAL,        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
     {LLM_TENSOR_RES_SCALE_RES_B_FINAL,      {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_ZAYA_ROUTER_MLP0,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP0_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP2,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP2_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index d0fb4c67cc7..20ee10a7402 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -462,7 +462,6 @@ enum llm_tensor {
     LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias
     // ZAYA Router (MoE gating)
     LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
-    LLM_TENSOR_ZAYA_ROUTER_MLP0,      // zaya: router MLP layer 0 weight
     LLM_TENSOR_ZAYA_ROUTER_MLP0_B,    // zaya: router MLP layer 0 bias
     LLM_TENSOR_ZAYA_ROUTER_MLP2,      // zaya: router MLP layer 2 weight
     LLM_TENSOR_ZAYA_ROUTER_MLP2_B,    // zaya: router MLP layer 2 bias
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index bf188000ad2..0f55d6570f7 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -99,7 +99,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
                 {n_ff_exp}, 0);
             layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i),
                 {n_ff_exp}, 0);
-            layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i),
+            layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i),
                 {n_ff_exp, n_ff_exp}, 0);
             layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i),
                 {n_ff_exp}, 0);

From a5c885bcc578663a0b0a3ec8579b453bfbc77774 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 11:08:26 +0200
Subject: [PATCH 11/33] refactor: merge RES_SCALE_*_B bias constants into
 RES_SCALE_* constants

- Remove LLM_TENSOR_RES_SCALE_HS_B, RES_SCALE_RES_B, RES_SCALE_HS_B_FINAL, RES_SCALE_RES_B_FINAL
- Use single RES_SCALE_HS for both weight and bias (same for RES_SCALE_RES)
- Remove the four *_B bias entries from the tensor name/info tables in llama-arch.cpp
- Remove bias constants from gguf constants.py
- Update MODEL_ARCH.ZAYA1 tensor list
- Update zaya.cpp to create bias tensors using same constant with 'bias' suffix
- Update convert_hf_to_gguf.py to map bias tensors with .bias suffix

This reduces 8 custom ZAYA constants to 4 by reusing the same constant
for both weight and bias tensors, differentiated by suffix.
---
 convert_hf_to_gguf.py     |  8 ++++----
 gguf-py/gguf/constants.py | 20 ++++----------------
 src/llama-arch.cpp        |  8 --------
 src/llama-arch.h          | 12 ++++--------
 src/models/zaya.cpp       |  8 ++++----
 5 files changed, 16 insertions(+), 40 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 1f8fb5c1280..382a3abcb6a 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6545,21 +6545,21 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu
         if "hidden_states_scale" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch
         elif "hidden_states_bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B, bid, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid, suffix=".bias"), data_torch
         elif "residual_scale" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch
         elif "residual_bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid, suffix=".bias"), data_torch
 
     def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]:
         if "hidden_states_scale" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch
         elif "hidden_states_bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL, suffix=".bias"), data_torch
         elif "residual_scale" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch
         elif "residual_bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL, suffix=".bias"), data_torch
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         # Common tensors
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 494ca5fe0fe..b42c58f1b2f 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -616,14 +616,10 @@ class MODEL_TENSOR(IntEnum):
     CCA_K_SCALE          = auto() # Zaya
     CCA_VAL_PROJ1        = auto() # Zaya: CCA value projection stream 1
     CCA_VAL_PROJ2        = auto() # Zaya: CCA value projection stream 2
-    RES_SCALE_HS         = auto() # Zaya: hidden_states_scale
-    RES_SCALE_HS_B       = auto() # Zaya: hidden_states_bias
-    RES_SCALE_RES        = auto() # Zaya: residual_scale
-    RES_SCALE_RES_B      = auto() # Zaya: residual_bias
-    RES_SCALE_HS_FINAL   = auto() # Zaya: final hidden_states_scale
-    RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias
-    RES_SCALE_RES_FINAL  = auto() # Zaya: final residual_scale
-    RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias
+    RES_SCALE_HS         = auto() # Zaya: hidden_states_scale (+ bias)
+    RES_SCALE_RES        = auto() # Zaya: residual_scale (+ bias)
+    RES_SCALE_HS_FINAL   = auto() # Zaya: final hidden_states_scale (+ bias)
+    RES_SCALE_RES_FINAL  = auto() # Zaya: final residual_scale (+ bias)
     ZAYA_ROUTER_DOWN_B   = auto() # Zaya
     ZAYA_ROUTER_MLP0_B   = auto() # Zaya
     ZAYA_ROUTER_MLP2     = auto() # Zaya
@@ -1151,13 +1147,9 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.CCA_VAL_PROJ1:             "blk.{bid}.cca_val_proj1",        # Zaya
     MODEL_TENSOR.CCA_VAL_PROJ2:             "blk.{bid}.cca_val_proj2",        # Zaya
     MODEL_TENSOR.RES_SCALE_HS:              "blk.{bid}.res_scale_hs",         # Zaya
-    MODEL_TENSOR.RES_SCALE_HS_B:            "blk.{bid}.res_scale_hs_b",       # Zaya
     MODEL_TENSOR.RES_SCALE_RES:             "blk.{bid}.res_scale_res",        # Zaya
-    MODEL_TENSOR.RES_SCALE_RES_B:           "blk.{bid}.res_scale_res_b",      # Zaya
     MODEL_TENSOR.RES_SCALE_HS_FINAL:        "res_scale_hs",                   # Zaya
-    MODEL_TENSOR.RES_SCALE_HS_B_FINAL:      "res_scale_hs_b",                 # Zaya
     MODEL_TENSOR.RES_SCALE_RES_FINAL:       "res_scale_res",                  # Zaya
-    MODEL_TENSOR.RES_SCALE_RES_B_FINAL:     "res_scale_res_b",                # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP0_B:        "blk.{bid}.zaya_router_mlp0_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP2:          "blk.{bid}.zaya_router_mlp2",     # Zaya
@@ -4049,13 +4041,9 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.CCA_VAL_PROJ1,
         MODEL_TENSOR.CCA_VAL_PROJ2,
         MODEL_TENSOR.RES_SCALE_HS,
-        MODEL_TENSOR.RES_SCALE_HS_B,
         MODEL_TENSOR.RES_SCALE_RES,
-        MODEL_TENSOR.RES_SCALE_RES_B,
         MODEL_TENSOR.RES_SCALE_HS_FINAL,
-        MODEL_TENSOR.RES_SCALE_HS_B_FINAL,
         MODEL_TENSOR.RES_SCALE_RES_FINAL,
-        MODEL_TENSOR.RES_SCALE_RES_B_FINAL,
         MODEL_TENSOR.FFN_GATE_INP,
         MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
         MODEL_TENSOR.FFN_NORM,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f8c3f57cb69..5af26e8e107 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -424,13 +424,9 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_CCA_VAL_PROJ1,                          "blk.%d.cca_val_proj1" },
     { LLM_TENSOR_CCA_VAL_PROJ2,                          "blk.%d.cca_val_proj2" },
     { LLM_TENSOR_RES_SCALE_HS,                           "blk.%d.res_scale_hs" },
-    { LLM_TENSOR_RES_SCALE_HS_B,                         "blk.%d.res_scale_hs_b" },
     { LLM_TENSOR_RES_SCALE_RES,                          "blk.%d.res_scale_res" },
-    { LLM_TENSOR_RES_SCALE_RES_B,                        "blk.%d.res_scale_res_b" },
     { LLM_TENSOR_RES_SCALE_HS_FINAL,                     "res_scale_hs" },
-    { LLM_TENSOR_RES_SCALE_HS_B_FINAL,                   "res_scale_hs_b" },
     { LLM_TENSOR_RES_SCALE_RES_FINAL,                    "res_scale_res" },
-    { LLM_TENSOR_RES_SCALE_RES_B_FINAL,                  "res_scale_res_b" },
     { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP0_B,                     "blk.%d.zaya_router_mlp0_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP2,                       "blk.%d.zaya_router_mlp2" },
@@ -687,13 +683,9 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_CCA_VAL_PROJ1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CCA_VAL_PROJ2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_RES_SCALE_HS,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_RES_SCALE_HS_B,             {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_RES_SCALE_RES,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_RES_SCALE_RES_B,            {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_RES_SCALE_HS_FINAL,         {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
-    {LLM_TENSOR_RES_SCALE_HS_B_FINAL,       {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
     {LLM_TENSOR_RES_SCALE_RES_FINAL,        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
-    {LLM_TENSOR_RES_SCALE_RES_B_FINAL,      {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP0_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP2,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 20ee10a7402..1e69f62ed53 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -452,14 +452,10 @@ enum llm_tensor {
     LLM_TENSOR_CCA_VAL_PROJ1,       // zaya: V projection 1
     LLM_TENSOR_CCA_VAL_PROJ2,       // zaya: V projection 2
     // ZAYA residual scaling
-    LLM_TENSOR_RES_SCALE_HS,        // zaya: hidden_states_scale
-    LLM_TENSOR_RES_SCALE_HS_B,      // zaya: hidden_states_bias
-    LLM_TENSOR_RES_SCALE_RES,       // zaya: residual_scale
-    LLM_TENSOR_RES_SCALE_RES_B,     // zaya: residual_bias
-    LLM_TENSOR_RES_SCALE_HS_FINAL,  // zaya: final hidden_states_scale
-    LLM_TENSOR_RES_SCALE_HS_B_FINAL,// zaya: final hidden_states_bias
-    LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale
-    LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias
+    LLM_TENSOR_RES_SCALE_HS,        // zaya: hidden_states_scale (+ bias)
+    LLM_TENSOR_RES_SCALE_RES,       // zaya: residual_scale (+ bias)
+    LLM_TENSOR_RES_SCALE_HS_FINAL,  // zaya: final hidden_states_scale (+ bias)
+    LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale (+ bias)
     // ZAYA Router (MoE gating)
     LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
     LLM_TENSOR_ZAYA_ROUTER_MLP0_B,    // zaya: router MLP layer 0 bias
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 0f55d6570f7..8ca33296407 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -39,9 +39,9 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
     }
 
     zaya_res_scale_hs    = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
-    zaya_res_scale_hs_b  = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B_FINAL,  "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
+    zaya_res_scale_hs_b  = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
     zaya_res_scale_res   = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL,   "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
-    zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B_FINAL, "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
+    zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL,   "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
 
     const int64_t n_embd_head = hparams.n_embd_head_k();
     const int64_t d_conv      = hparams.ssm_d_conv;
@@ -86,9 +86,9 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
         // Residual scaling
         layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
-        layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B, "bias", i), {n_embd}, 0);
+        layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res  = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
-        layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
 
         // MoE layers (odd indices)
         if (i % 2 == 1) {

From 45bf02136822992127fa8ee722c788f1590827bd Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 11:20:05 +0200
Subject: [PATCH 12/33] refactor: merge router bias constants into parent
 constants

- Remove ZAYA_ROUTER_DOWN_B, ZAYA_ROUTER_MLP0_B, ZAYA_ROUTER_MLP2_B
- Use FFN_GATE_INP for both router down weight and bias
- Use FFN_GATE for both router mlp0 weight and bias
- Use ZAYA_ROUTER_MLP2 for both router mlp2 weight and bias
- Update tensor mappings in llama-arch.cpp
- Remove bias constants from gguf constants.py
- Update MODEL_ARCH.ZAYA1 tensor list
- Update zaya.cpp to create bias tensors using same constant with 'bias' suffix
- Update convert_hf_to_gguf.py to map bias tensors with .bias suffix
- Add ZAYA_ROUTER_MLP2 tensor mapping for HuggingFace auto-detection

This removes 3 more custom constants by reusing the same constant
for both weight and bias tensors, differentiated by suffix.
---
 convert_hf_to_gguf.py          |  6 +++---
 gguf-py/gguf/constants.py      | 11 +----------
 gguf-py/gguf/tensor_mapping.py |  3 +++
 src/llama-arch.cpp             |  6 ------
 src/llama-arch.h               |  5 +----
 src/models/zaya.cpp            | 12 ++++++------
 6 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 382a3abcb6a..dda858537f9 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6523,17 +6523,17 @@ def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple
         if "down_proj.weight" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid), data_torch
         elif "down_proj.bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, suffix=".bias"), data_torch
         elif "rmsnorm_eda" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch
         elif "router_mlp.0.weight" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch
         elif "router_mlp.0.bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".bias"), data_torch
         elif "router_mlp.2.weight" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch
         elif "router_mlp.2.bias" in name:
-            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, bid, suffix=".bias"), data_torch
+            yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid, suffix=".bias"), data_torch
         elif "router_mlp.4.weight" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch
         elif "balancing_biases" in name:
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index b42c58f1b2f..a979d89c577 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -620,10 +620,7 @@ class MODEL_TENSOR(IntEnum):
     RES_SCALE_RES        = auto() # Zaya: residual_scale (+ bias)
     RES_SCALE_HS_FINAL   = auto() # Zaya: final hidden_states_scale (+ bias)
     RES_SCALE_RES_FINAL  = auto() # Zaya: final residual_scale (+ bias)
-    ZAYA_ROUTER_DOWN_B   = auto() # Zaya
-    ZAYA_ROUTER_MLP0_B   = auto() # Zaya
-    ZAYA_ROUTER_MLP2     = auto() # Zaya
-    ZAYA_ROUTER_MLP2_B   = auto() # Zaya
+    ZAYA_ROUTER_MLP2     = auto() # Zaya: router MLP layer 2 (+ bias)
     ZAYA_ROUTER_MLP4     = auto() # Zaya
     ZAYA_ROUTER_BIASES   = auto() # Zaya
     ZAYA_ROUTER_EDA_SCALE = auto() # Zaya
@@ -1150,10 +1147,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.RES_SCALE_RES:             "blk.{bid}.res_scale_res",        # Zaya
     MODEL_TENSOR.RES_SCALE_HS_FINAL:        "res_scale_hs",                   # Zaya
     MODEL_TENSOR.RES_SCALE_RES_FINAL:       "res_scale_res",                  # Zaya
-    MODEL_TENSOR.ZAYA_ROUTER_DOWN_B:        "blk.{bid}.zaya_router_down_b",   # Zaya
-    MODEL_TENSOR.ZAYA_ROUTER_MLP0_B:        "blk.{bid}.zaya_router_mlp0_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP2:          "blk.{bid}.zaya_router_mlp2",     # Zaya
-    MODEL_TENSOR.ZAYA_ROUTER_MLP2_B:        "blk.{bid}.zaya_router_mlp2_b",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_MLP4:          "blk.{bid}.zaya_router_mlp4",     # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_BIASES:        "blk.{bid}.zaya_router_biases",   # Zaya
     MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE:     "blk.{bid}.zaya_router_eda",      # Zaya
@@ -4045,12 +4039,9 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.RES_SCALE_HS_FINAL,
         MODEL_TENSOR.RES_SCALE_RES_FINAL,
         MODEL_TENSOR.FFN_GATE_INP,
-        MODEL_TENSOR.ZAYA_ROUTER_DOWN_B,
         MODEL_TENSOR.FFN_NORM,
         MODEL_TENSOR.FFN_GATE,
-        MODEL_TENSOR.ZAYA_ROUTER_MLP0_B,
         MODEL_TENSOR.ZAYA_ROUTER_MLP2,
-        MODEL_TENSOR.ZAYA_ROUTER_MLP2_B,
         MODEL_TENSOR.ZAYA_ROUTER_MLP4,
         MODEL_TENSOR.ZAYA_ROUTER_BIASES,
         MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE,
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 41cd9262434..5d235e46f58 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -171,6 +171,9 @@ class TensorNameMap:
         MODEL_TENSOR.A_QF_PROJ_LINEAR: (
             "projector.linear",
         ),
+        MODEL_TENSOR.ZAYA_ROUTER_MLP2: (
+            "model.layers.{bid}.self_attn.router_mlp.2",      # zaya
+        ),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 5af26e8e107..f3031dc32fc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -427,10 +427,7 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_RES_SCALE_RES,                          "blk.%d.res_scale_res" },
     { LLM_TENSOR_RES_SCALE_HS_FINAL,                     "res_scale_hs" },
     { LLM_TENSOR_RES_SCALE_RES_FINAL,                    "res_scale_res" },
-    { LLM_TENSOR_ZAYA_ROUTER_DOWN_B,                     "blk.%d.zaya_router_down_b" },
-    { LLM_TENSOR_ZAYA_ROUTER_MLP0_B,                     "blk.%d.zaya_router_mlp0_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP2,                       "blk.%d.zaya_router_mlp2" },
-    { LLM_TENSOR_ZAYA_ROUTER_MLP2_B,                     "blk.%d.zaya_router_mlp2_b" },
     { LLM_TENSOR_ZAYA_ROUTER_MLP4,                       "blk.%d.zaya_router_mlp4" },
     { LLM_TENSOR_ZAYA_ROUTER_BIASES,                     "blk.%d.zaya_router_biases" },
     { LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE,                  "blk.%d.zaya_router_eda" },
@@ -686,10 +683,7 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_RES_SCALE_RES,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_RES_SCALE_HS_FINAL,         {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
     {LLM_TENSOR_RES_SCALE_RES_FINAL,        {LLM_TENSOR_LAYER_OUTPUT,    GGML_OP_MUL}},
-    {LLM_TENSOR_ZAYA_ROUTER_DOWN_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
-    {LLM_TENSOR_ZAYA_ROUTER_MLP0_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP2,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
-    {LLM_TENSOR_ZAYA_ROUTER_MLP2_B,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_MLP4,           {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_ZAYA_ROUTER_BIASES,         {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}},
     {LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE,      {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 1e69f62ed53..4b2c8f83314 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -457,10 +457,7 @@ enum llm_tensor {
     LLM_TENSOR_RES_SCALE_HS_FINAL,  // zaya: final hidden_states_scale (+ bias)
     LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale (+ bias)
     // ZAYA Router (MoE gating)
-    LLM_TENSOR_ZAYA_ROUTER_DOWN_B,    // zaya: router down_proj bias
-    LLM_TENSOR_ZAYA_ROUTER_MLP0_B,    // zaya: router MLP layer 0 bias
-    LLM_TENSOR_ZAYA_ROUTER_MLP2,      // zaya: router MLP layer 2 weight
-    LLM_TENSOR_ZAYA_ROUTER_MLP2_B,    // zaya: router MLP layer 2 bias
+    LLM_TENSOR_ZAYA_ROUTER_MLP2,      // zaya: router MLP layer 2 weight (+ bias)
     LLM_TENSOR_ZAYA_ROUTER_MLP4,      // zaya: router MLP layer 4 weight
     LLM_TENSOR_ZAYA_ROUTER_BIASES,    // zaya: router balancing_biases
     LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, // zaya: router router_states_scale
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 8ca33296407..9b240884eed 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -95,18 +95,18 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
             // Router network
             layer.zaya_router_down   = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),
                 {n_embd, n_ff_exp}, 0);
-            layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i),
-                {n_ff_exp}, 0);
+            layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i),
+                {n_ff_exp}, TENSOR_NOT_REQUIRED);
             layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i),
                 {n_ff_exp}, 0);
             layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i),
                 {n_ff_exp, n_ff_exp}, 0);
-            layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i),
-                {n_ff_exp}, 0);
+            layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i),
+                {n_ff_exp}, TENSOR_NOT_REQUIRED);
             layer.zaya_router_mlp2   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "weight", i),
                 {n_ff_exp, n_ff_exp}, 0);
-            layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "bias", i),
-                {n_ff_exp}, 0);
+            layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "bias", i),
+                {n_ff_exp}, TENSOR_NOT_REQUIRED);
             layer.zaya_router_mlp4   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i),
                 {n_ff_exp, n_expert + 1}, 0);
             layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i),

From fede4c6774f2d9d3bee77bc70c16fca051917f33 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 12:24:01 +0200
Subject: [PATCH 13/33] zaya: remove unused CCA_QK_NORM tensor constant

---
 gguf-py/gguf/constants.py | 3 ---
 src/llama-arch.cpp        | 2 --
 src/llama-arch.h          | 1 -
 src/llama-model.h         | 1 -
 4 files changed, 7 deletions(-)

diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index a979d89c577..eeb14f6aa76 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -612,7 +612,6 @@ class MODEL_TENSOR(IntEnum):
     SSM_G_A              = auto() # Kimi Linear
     SSM_G_B              = auto() # Kimi Linear
     CCA_CONV_GRP         = auto() # Zaya
-    CCA_QK_NORM          = auto() # Zaya (weightless - unit RMSNorm)
     CCA_K_SCALE          = auto() # Zaya
     CCA_VAL_PROJ1        = auto() # Zaya: CCA value projection stream 1
     CCA_VAL_PROJ2        = auto() # Zaya: CCA value projection stream 2
@@ -1139,7 +1138,6 @@ class MODEL_TENSOR(IntEnum):
     MODEL_TENSOR.SSM_G_A:                   "blk.{bid}.ssm_g_a",              # Kimi Linear
     MODEL_TENSOR.SSM_G_B:                   "blk.{bid}.ssm_g_b",              # Kimi Linear
     MODEL_TENSOR.CCA_CONV_GRP:              "blk.{bid}.cca_conv_grp",         # Zaya
-    MODEL_TENSOR.CCA_QK_NORM:               "blk.{bid}.cca_qk_norm",          # Zaya
     MODEL_TENSOR.CCA_K_SCALE:               "blk.{bid}.cca_k_scale",          # Zaya
     MODEL_TENSOR.CCA_VAL_PROJ1:             "blk.{bid}.cca_val_proj1",        # Zaya
     MODEL_TENSOR.CCA_VAL_PROJ2:             "blk.{bid}.cca_val_proj2",        # Zaya
@@ -4030,7 +4028,6 @@ class MODEL_TENSOR(IntEnum):
         MODEL_TENSOR.ATTN_OUT,
         MODEL_TENSOR.SSM_CONV1D,
         MODEL_TENSOR.CCA_CONV_GRP,
-        MODEL_TENSOR.CCA_QK_NORM,
         MODEL_TENSOR.CCA_K_SCALE,
         MODEL_TENSOR.CCA_VAL_PROJ1,
         MODEL_TENSOR.CCA_VAL_PROJ2,
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index f3031dc32fc..e0ac2a625dc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -419,7 +419,6 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = {
     { LLM_TENSOR_SSM_G_A,                                "blk.%d.ssm_g_a" },
     { LLM_TENSOR_SSM_G_B,                                "blk.%d.ssm_g_b" },
     { LLM_TENSOR_CCA_CONV_GRP,                           "blk.%d.cca_conv_grp" },
-    { LLM_TENSOR_CCA_QK_NORM,                            "blk.%d.cca_qk_norm" },
     { LLM_TENSOR_CCA_K_SCALE,                            "blk.%d.cca_k_scale" },
     { LLM_TENSOR_CCA_VAL_PROJ1,                          "blk.%d.cca_val_proj1" },
     { LLM_TENSOR_CCA_VAL_PROJ2,                          "blk.%d.cca_val_proj2" },
@@ -675,7 +674,6 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = {
     {LLM_TENSOR_SSM_G_B,                    {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     // ZAYA CCA
     {LLM_TENSOR_CCA_CONV_GRP,               {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
-    {LLM_TENSOR_CCA_QK_NORM,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CCA_K_SCALE,                {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}},
     {LLM_TENSOR_CCA_VAL_PROJ1,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
     {LLM_TENSOR_CCA_VAL_PROJ2,              {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}},
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 4b2c8f83314..3809afc124c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -447,7 +447,6 @@ enum llm_tensor {
     LLM_TENSOR_SSM_G_B,             // kimi: output gate projection B
     // ZAYA CCA (Compressed Convolutional Attention)
     LLM_TENSOR_CCA_CONV_GRP,        // zaya: grouped conv1d  (conv_qk.1)
-    LLM_TENSOR_CCA_QK_NORM,         // zaya: RMSNorm on concat(Q,K)
     LLM_TENSOR_CCA_K_SCALE,         // zaya: learned K temperature
     LLM_TENSOR_CCA_VAL_PROJ1,       // zaya: V projection 1
     LLM_TENSOR_CCA_VAL_PROJ2,       // zaya: V projection 2
diff --git a/src/llama-model.h b/src/llama-model.h
index 01ce976fe3e..1a61503f3b0 100644
--- a/src/llama-model.h
+++ b/src/llama-model.h
@@ -482,7 +482,6 @@ struct llama_layer {
     struct ggml_tensor * cca_conv_dw_b  = nullptr;  // depthwise conv bias
     struct ggml_tensor * cca_conv_grp   = nullptr;  // grouped conv   (conv_qk.1)
     struct ggml_tensor * cca_conv_grp_b = nullptr;  // grouped conv bias
-    struct ggml_tensor * cca_qk_norm    = nullptr;  // RMSNorm on concat(Q,K)
     struct ggml_tensor * cca_k_scale    = nullptr;  // learned K temperature
     struct ggml_tensor * cca_val_proj1  = nullptr;  // V projection stream 1
     struct ggml_tensor * cca_val_proj2  = nullptr;  // V projection stream 2

From 2069583f8e4094b15345b092bd3771b159607a86 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 12:28:54 +0200
Subject: [PATCH 14/33] zaya: remove dead ZAYA_ROUTER_MLP2 mapping from
 non-block config

---
 gguf-py/gguf/tensor_mapping.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py
index 5d235e46f58..41cd9262434 100644
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -171,9 +171,6 @@ class TensorNameMap:
         MODEL_TENSOR.A_QF_PROJ_LINEAR: (
             "projector.linear",
         ),
-        MODEL_TENSOR.ZAYA_ROUTER_MLP2: (
-            "model.layers.{bid}.self_attn.router_mlp.2",      # zaya
-        ),
     }
 
     block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {

From 356e9620462235827eff0b9c060013ad2870b41c Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 12:42:14 +0200
Subject: [PATCH 15/33] zaya: revert unrelated debug.cpp changes

---
 common/debug.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/common/debug.cpp b/common/debug.cpp
index 60cb5fd9b4a..102c6924dc9 100644
--- a/common/debug.cpp
+++ b/common/debug.cpp
@@ -144,6 +144,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
     auto * cb_data = (common_debug_cb_user_data *) user_data;
     auto * pimpl = cb_data->pimpl.get();
 
+    const struct ggml_tensor * src0 = t->src[0];
+    const struct ggml_tensor * src1 = t->src[1];
+
+    if (ask) {
+        return true;  // Always retrieve data
+    }
+
     bool matches_filter = pimpl->tensor_filters.empty();
 
     if (!matches_filter) {
@@ -155,13 +162,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
         }
     }
 
-    if (ask) {
-        return matches_filter;
-    }
-
-    const struct ggml_tensor * src0 = t->src[0];
-    const struct ggml_tensor * src1 = t->src[1];
-
     char src1_str[128] = { 0 };
     if (src1) {
         snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());

From 81d727f0af3894029575b74e9444a2065ac84edd Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 17:01:43 +0200
Subject: [PATCH 16/33] zaya: replace hardcoded n_ff_exp with GGUF metadata

Remove the hardcoded value of 256 for the router MLP hidden size and read
it from the GGUF expert_feed_forward_length metadata key instead.
The converter now writes that key from zaya_mlp_expansion in config.json,
falling back to 256 when the field is absent.
---
 convert_hf_to_gguf.py | 4 ++++
 src/models/zaya.cpp   | 5 +++--
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index dda858537f9..2054515da19 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6494,6 +6494,10 @@ def set_gguf_parameters(self):
         n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1
         self.gguf_writer.add_expert_used_count(n_expert_used)
 
+        # Router MLP hidden size (zaya_mlp_expansion)
+        n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256)
+        self.gguf_writer.add_expert_feed_forward_length(n_ff_exp)
+
     def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
         if "linear_q" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 9b240884eed..63d6e197975 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -8,6 +8,7 @@
 void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
+    ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false);
 
     const uint32_t n_qk = (hparams.n_head() + hparams.n_head_kv()) * hparams.n_embd_head_k();
     hparams.ssm_d_inner = 2*n_qk + hparams.n_embd; // CCA conv state + delayed value stream state
@@ -45,8 +46,8 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
     const int64_t n_embd_head = hparams.n_embd_head_k();
     const int64_t d_conv      = hparams.ssm_d_conv;
-    // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B)
-    const int64_t n_ff_exp    = 256;
+    // Router MLP hidden size (zaya_mlp_expansion)
+    const int64_t n_ff_exp    = hparams.n_ff_exp;
 
     for (int i = 0; i < n_layer; ++i) {
         auto & layer = layers[i];

From 45d78817343110a8f4018e977b58dc6471f624db Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 15 May 2026 17:04:23 +0200
Subject: [PATCH 17/33] zaya: fix val_proj dimensions to use n_embd_k / 2
 instead of n_embd_head

val_proj1 and val_proj2 output dimension should be latent_k_dim / 2
(n_embd_k / 2) as per vLLM reference, not n_embd_head. Currently
both are equal for ZAYA1-8B (n_head_kv=2), but this would break
for any other n_head_kv configuration.
---
 src/models/zaya.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 63d6e197975..a7561bd8c90 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -69,9 +69,9 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
             layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
 
             layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i),
-                {n_embd, n_embd_head}, 0);
+                {n_embd, n_embd_k / 2}, 0);
             layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i),
-                {n_embd, n_embd_head}, 0);
+                {n_embd, n_embd_k / 2}, 0);
 
             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
 

From 800fbe8ffdd4964690fe656dc1f18e82a2fd1913 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Sat, 16 May 2026 11:08:03 +0200
Subject: [PATCH 18/33] quant: exclude Zaya cca_conv_grp tensors from
 quantization

Follows the same pattern as Mamba ssm_conv1d, Kimi shortconv,
and RWKV time_mix tensors. These small conv weights (d_conv=2)
are not divisible by quant block sizes (32), causing Q8_0 failures.
---
 src/llama-quant.cpp | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index 43e05c3d56f..bec2f15eb45 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -322,6 +322,9 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param
     quantize &= name.find("ssm_conv1d") == std::string::npos;
     quantize &= name.find("shortconv.conv.weight") == std::string::npos;
 
+    // do not quantize Zaya's small grouped conv1d weights (d_conv=2)
+    quantize &= name.find("cca_conv_grp") == std::string::npos;
+
     // do not quantize RWKV's small yet 2D weights
     quantize &= name.find("time_mix_first.weight") == std::string::npos;
     quantize &= name.find("time_mix_w0.weight") == std::string::npos;

From f2efd8c70f7f6857519d7c424fcf62ca113246c5 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Sat, 16 May 2026 20:56:01 +0200
Subject: [PATCH 19/33] zaya: cast conv kernels to F16 for CPU backend
 compatibility

ggml_im2col on CPU requires F16 kernel weights. Cast cca_conv_dw
and cca_conv_grp to F16 before convolution to support quantized
models (Q4, Q8). CUDA/SYCL backends are unaffected since their
im2col implementation only reads kernel dimensions, not data.
---
 src/models/zaya.cpp | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index a7561bd8c90..cda5abeea4c 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -288,8 +288,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target));
 
             ggml_tensor * conv_dw = layer.cca_conv_dw;
-            if (conv_dw->type != GGML_TYPE_F32) {
-                conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F32);
+            if (conv_dw->type != GGML_TYPE_F16) {
+                conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F16);
             }
             conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk);
             ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1);
@@ -298,7 +298,11 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             }
             cb(QK, "QK_dw", il);
 
-            QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK, 1, 0, 1, n_groups);
+            ggml_tensor * conv_grp = layer.cca_conv_grp;
+            if (conv_grp->type != GGML_TYPE_F16) {
+                conv_grp = ggml_cast(ctx0, conv_grp, GGML_TYPE_F16);
+            }
+            QK = ggml_conv_1d_grouped(ctx0, conv_grp, QK, 1, 0, 1, n_groups);
             QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1));
             cb(QK, "QK_grp", il);
 

From 3aaab7f7bb3be1b13cb9cec71182f642986a38bd Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Sat, 16 May 2026 22:35:00 +0200
Subject: [PATCH 20/33] zaya: add ggml_cont for ROCm/Vulkan compatibility with
 non-contiguous tensors

ROCm and Vulkan backends require contiguous tensors for im2col and
mul_mat operations. Add ggml_cont after ggml_cast for conv kernels
and after ggml_concat for hs_d to ensure compatibility across all
backends. CUDA was unaffected since it handles non-contiguous
tensors more permissively.
---
 src/models/zaya.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index cda5abeea4c..f5fc16b8899 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -229,7 +229,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                         0);
                 hs_d = ggml_concat(ctx0, hs_d, cur_shift, 1);
             }
-            hs_d = ggml_reshape_2d(ctx0, hs_d, n_embd, n_tokens);
+            hs_d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, hs_d), n_embd, n_tokens);
             cb(hs_d, "cca_hs_d", il);
 
             // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens]
@@ -289,7 +289,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
             ggml_tensor * conv_dw = layer.cca_conv_dw;
             if (conv_dw->type != GGML_TYPE_F16) {
-                conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F16);
+                conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F16));
             }
             conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk);
             ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1);
@@ -300,7 +300,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
             ggml_tensor * conv_grp = layer.cca_conv_grp;
             if (conv_grp->type != GGML_TYPE_F16) {
-                conv_grp = ggml_cast(ctx0, conv_grp, GGML_TYPE_F16);
+                conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16));
             }
             QK = ggml_conv_1d_grouped(ctx0, conv_grp, QK, 1, 0, 1, n_groups);
             QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1));

From c8d3a6c93685b2ffa2daf3a91ab524b5e02d25f6 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Sun, 17 May 2026 18:33:18 +0200
Subject: [PATCH 21/33] zaya: fix server compatibility with batched inference

- Add ggml_cont(prev_hs) for non-contiguous tensor view (n_seqs > 1)
- Replace ggml_conv_1d_dw with ggml_ssm_conv for proper batch support
- Cast conv kernel to F32 and permute output shape
- Add kernel size 2 to the CUDA ssm_conv dispatch (ssm-conv.cu)

ggml_conv_1d_dw does not support n_seqs > 1 (assert b->ne[3] == 1).
Use ggml_ssm_conv which is designed for SSM models with batching.
---
 ggml/src/ggml-cuda/ssm-conv.cu |  3 ++-
 src/models/zaya.cpp            | 14 +++++++++-----
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
index 4841389fbc8..f983869c215 100644
--- a/ggml/src/ggml-cuda/ssm-conv.cu
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
@@ -140,11 +140,12 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
     };
 
     switch (nc) {
+        case 2: launch_kernel(std::integral_constant<int, 2>{}); break;
         case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
         case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
         case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
         case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
-        default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
+        default: GGML_ABORT("Only support kernel sizes 2, 3, 4, 5, 9 right now.");
     }
 }
 
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index f5fc16b8899..ce65c2281fa 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -221,7 +221,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * cur_state_src = ggml_cont(ctx0, cur);
             ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs);
 
-            ggml_tensor * hs_d = ggml_reshape_3d(ctx0, prev_hs, n_embd, 1, n_seqs);
+            ggml_tensor * hs_d = ggml_reshape_3d(ctx0, ggml_cont(ctx0, prev_hs), n_embd, 1, n_seqs);
             if (n_seq_tokens > 1) {
                 ggml_tensor * cur_shift = ggml_view_3d(ctx0, cur_seq, n_embd, n_seq_tokens - 1, n_seqs,
                         cur_seq->nb[1],
@@ -288,11 +288,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target));
 
             ggml_tensor * conv_dw = layer.cca_conv_dw;
-            if (conv_dw->type != GGML_TYPE_F16) {
-                conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F16));
+            if (conv_dw->type != GGML_TYPE_F32) {
+                conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F32));
             }
-            conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk);
-            ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1);
+            // conv_input is [L, n_qk, n_seqs], ssm_conv outputs [n_qk, n_tokens, n_seqs]
+            ggml_tensor * QK = ggml_ssm_conv(ctx0, conv_input, conv_dw);
+            // permute from [n_qk, n_tokens, n_seqs] to [n_tokens, n_qk, n_seqs]
+            QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
             if (layer.cca_conv_dw_b) {
                 QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1));
             }
@@ -307,6 +309,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(QK, "QK_grp", il);
 
             QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
+            // QK is now [n_qk, n_seq_tokens, n_seqs]
+            // Flatten to 2D: [n_qk, n_tokens] where n_tokens = n_seq_tokens * n_seqs
             QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
 
             ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0);

From 7c5cc5305288b07afde5c7505b529342fcd559c1 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Mon, 18 May 2026 20:19:33 +0200
Subject: [PATCH 22/33] fix(zaya): use actual tokenizer vocab size instead of
 config vocab_size

The model's config.json reports vocab_size=262272 but the actual tokenizer
only has 262147 tokens. The 125 extra entries are padding in PyTorch's
embed_tokens.weight matrix that don't correspond to any real tokens.

Use the pre-computed _tokenizer_vocab_size to write the correct vocab size
in the GGUF metadata, matching llama.cpp's actual tokenizer vocabulary.
---
 convert_hf_to_gguf.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 2054515da19..5ce42b465c8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6473,7 +6473,9 @@ def __init__(self, *args, **kwargs):
 
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
-        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+        # Use actual tokenizer vocab size if available, fallback to config vocab_size
+        vocab_size = self._tokenizer_vocab_size if self._tokenizer_vocab_size is not None else self.hparams["vocab_size"]
+        self.gguf_writer.add_vocab_size(vocab_size)
 
         # n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate)
         n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2

From f1bd772a37f3407d8fd2809edef1f0e0d245b760 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Thu, 21 May 2026 22:50:15 +0200
Subject: [PATCH 23/33] docs(zaya): add Python reference comments to C++
 implementation

Add detailed inline comments mapping each C++ code section to the
corresponding zaya.py and cca.py Python lines, including code snippets
for direct comparison.
---
 src/models/zaya.cpp | 488 +++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 461 insertions(+), 27 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index ce65c2281fa..e2dd6e52bcf 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -5,6 +5,25 @@
 
 #include <cmath>
 
+/*
+ * zaya.py ref: L52-81 (ResidualScaling class)
+ *
+ * class ResidualScaling(nn.Module):
+ *     def __init__(self, config, layer_n, ...):
+ *         self.not_first_layer = (layer_n != 0)
+ *         self.hidden_states_scale = torch.nn.Parameter(torch.ones(config.hidden_size))
+ *         self.hidden_states_bias  = torch.nn.Parameter(torch.zeros(config.hidden_size))
+ *         if self.not_first_layer:
+ *             self.residual_scale = torch.nn.Parameter(torch.ones(config.hidden_size))
+ *             self.residual_bias  = torch.nn.Parameter(torch.zeros(config.hidden_size))
+ *
+ *     def forward(self, residual, hidden_states):
+ *         hidden_states = (hidden_states.float() + hs_bias) * hs_scale
+ *         if self.not_first_layer and residual is not None:
+ *             residual = (residual.float() + res_bias) * res_scale
+ *         return residual, hidden_states
+ */
+
 void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -15,6 +34,15 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
     hparams.ssm_d_state = 1;
     hparams.ssm_n_group = 0;
 
+    /*
+     * zaya.py ref: L575-602 (layer alternation)
+     *
+     * for layer_n in range(config.num_hidden_layers):
+     *     if layer_n % 2 == 1:
+     *         self.layers.append(ZayaDecoderMLPLayer(...))   # MoE layer
+     *     else:
+     *         self.layers.append(ZayaDecoderATTLayer(...))   # Attention layer
+     */
     for (uint32_t i = 0; i < hparams.n_layer; ++i) {
         hparams.recurrent_layer_arr[i] = (i % 2) == 0;
     }
@@ -28,17 +56,42 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
     LLAMA_LOAD_LOCALS;
 
+    /*
+     * zaya.py ref: L569-573
+     *
+     * self.embed_tokens = VocabParallelEmbedding(
+     *     self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size)
+     */
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-    // output norm
+    /*
+     * zaya.py ref: L608-613
+     *
+     * if (config.normalization == "RMSNorm"):
+     *     self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon)
+     * elif (config.normalization == "LayerNorm"):
+     *     self.final_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon)
+     */
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
 
-    // output (tied with tok_embd if not present)
+    /*
+     * zaya.py ref: L729-743
+     *
+     * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...)
+     * if self.config.tie_word_embeddings:
+     *     self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
+     */
     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
     if (output == nullptr) {
-        output = tok_embd;
+        output = tok_embd;  // tied weights
     }
 
+    /*
+     * zaya.py ref: L605-606 (final ResidualScaling after all layers)
+     *
+     * if self.config.scale_residual_merge:
+     *     self.res_scale = ResidualScaling(config, config.num_hidden_layers)
+     */
     zaya_res_scale_hs    = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
     zaya_res_scale_hs_b  = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
     zaya_res_scale_res   = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL,   "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -46,7 +99,6 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
     const int64_t n_embd_head = hparams.n_embd_head_k();
     const int64_t d_conv      = hparams.ssm_d_conv;
-    // Router MLP hidden size (zaya_mlp_expansion)
     const int64_t n_ff_exp    = hparams.n_ff_exp;
 
     for (int i = 0; i < n_layer; ++i) {
@@ -61,31 +113,97 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
         const int64_t n_ff      = hparams.n_ff(i);
         const int64_t n_expert  = hparams.n_expert;
 
+        /*
+         * zaya.py ref: L212-217 (ZayaDecoderATTLayer input_norm)
+         * zaya.py ref: L508-513 (ZayaDecoderMLPLayer input_norm)
+         *
+         * if (config.normalization == "RMSNorm"):
+         *     self.input_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon)
+         * elif (config.normalization == "LayerNorm"):
+         *     self.input_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon)
+         */
         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
         // CCA attention layers (even indices only)
         if (i % 2 == 0) {
+            /*
+             * zaya.py ref: L98-184 (ZayaAttention class)
+             *
+             * self.q_dim = cca_num_q_heads * head_dim
+             * self.k_dim = cca_num_k_heads * head_dim
+             * self.v_dim = cca_num_k_heads * head_dim
+             *
+             * self.qkv = CCA(config, cca_num_k_heads, cca_num_q_heads, cca_num_heads, ...)
+             * self.o_proj = ReplicatedLinear(cca_num_q_heads * head_dim, hidden_size, ...)
+             * self.attn = Attention(cca_num_q_heads, head_dim, scale, cca_num_k_heads, ...)
+             * self.rotary_emb = get_rope(head_size=head_dim, ..., partial_rotary_factor=0.5)
+             */
+
+            /*
+             * zaya.py ref: L125-138 (CCA layer for Q, K projections)
+             *
+             * self.qkv = CCA(...)
+             * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...)
+             * self.qkv(hidden_states, output_qkv)
+             * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1)
+             */
             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0);
             layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
 
+            /*
+             * zaya.py ref: CCA.py - value projections (val_proj1, val_proj2)
+             *
+             * V1 = val_proj1(x)
+             * V2 = val_proj2(x_delayed)
+             * V = concat(V1, V2)
+             */
             layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i),
                 {n_embd, n_embd_k / 2}, 0);
             layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i),
                 {n_embd, n_embd_k / 2}, 0);
 
+            /*
+             * zaya.py ref: L139-144
+             *
+             * self.o_proj = ReplicatedLinear(self.cca_num_q_heads * self.head_dim,
+             *                                self.hidden_size, bias=self.config.attention_bias, ...)
+             */
             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
 
+            /*
+             * zaya.py ref: CCA.py - depthwise conv on QK
+             *
+             * conv_dw applied to [Q, K] concatenated
+             */
             layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, n_qk}, 0);
             layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
 
+            /*
+             * zaya.py ref: CCA.py - grouped conv on QK
+             *
+             * conv_grp applied after dw conv, with n_groups = n_head + n_head_kv
+             */
             layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i),
                 {d_conv, n_qk / n_groups, n_qk}, 0);
             layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0);
 
+            /*
+             * zaya.py ref: CCA.py - K scaling after L2 norm
+             *
+             * Kcur = Kcur * cca_k_scale
+             */
             layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0);
         }
 
-        // Residual scaling
+        /*
+         * zaya.py ref: L52-81, L219-220, L515-516 (per-layer ResidualScaling)
+         *
+         * if self.config.scale_residual_merge:
+         *     self.res_scale = ResidualScaling(config, layer_n)
+         *
+         * hidden_states = (hidden_states.float() + hs_bias) * hs_scale
+         * residual = (residual.float() + res_bias) * res_scale
+         */
         layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
         layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res  = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -93,13 +211,51 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
         // MoE layers (odd indices)
         if (i % 2 == 1) {
-            // Router network
+            /*
+             * zaya.py ref: L251-380 (ZayaRouter class)
+             *
+             * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...)
+             * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps)
+             * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion))  // EDA scale
+             * self.router_mlp = nn.Sequential(
+             *     ReplicatedLinear(D, D, bias=True, ...),
+             *     nn.GELU(),
+             *     ReplicatedLinear(D, D, bias=True, ...),
+             *     nn.GELU(),
+             *     ReplicatedLinear(D, E, bias=False, ...),
+             * )
+             * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32))
+             */
+
+            /*
+             * zaya.py ref: L291
+             *
+             * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...)
+             */
             layer.zaya_router_down   = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),
                 {n_embd, n_ff_exp}, 0);
             layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i),
                 {n_ff_exp}, TENSOR_NOT_REQUIRED);
+
+            /*
+             * zaya.py ref: L298-299
+             *
+             * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps)
+             */
             layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i),
                 {n_ff_exp}, 0);
+
+            /*
+             * zaya.py ref: L305-314 (router MLP layers 0, 2, 4)
+             *
+             * self.router_mlp = nn.Sequential(
+             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp0
+             *     self.non_linearity,                        // GELU
+             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp2
+             *     self.non_linearity,                        // GELU
+             *     ReplicatedLinear(D, E, bias=False, ...),  // mlp4
+             * )
+             */
             layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i),
                 {n_ff_exp, n_ff_exp}, 0);
             layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i),
@@ -110,12 +266,40 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
                 {n_ff_exp}, TENSOR_NOT_REQUIRED);
             layer.zaya_router_mlp4   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i),
                 {n_ff_exp, n_expert + 1}, 0);
+
+            /*
+             * zaya.py ref: L317-319
+             *
+             * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32))
+             * if self.use_mod:
+             *     self.balancing_biases[-1] = -1.0
+             */
             layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i),
                 {n_expert + 1}, TENSOR_NOT_REQUIRED);
+
+            /*
+             * zaya.py ref: L302-303
+             *
+             * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion))
+             */
             layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i),
                 {n_ff_exp}, TENSOR_NOT_REQUIRED);
 
-            // MoE experts (fused gate_up and down)
+            /*
+             * zaya.py ref: L435-446 (FusedMoE experts)
+             *
+             * self.experts = FusedMoE(
+             *     num_experts=self.num_moe_experts,
+             *     top_k=self.topk,
+             *     hidden_size=config.hidden_size,
+             *     intermediate_size=ffn_hidden_size // 2,
+             *     reduce_results=False,
+             *     renormalize=False,
+             *     custom_routing_function=_custom_routing_fn,
+             *     activation="silu",
+             *     ...
+             * )
+             */
             create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0);
             layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i),
                 {n_ff, n_embd, n_expert}, 0);
@@ -143,6 +327,15 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
+    /*
+     * zaya.py ref: L638-641 (ZayaModel.forward)
+     *
+     * if inputs_embeds is None:
+     *     inputs_embeds = self.embed_tokens(input_ids)
+     * residual = None
+     * hidden_states = inputs_embeds
+     * prev_router_hidden_states = None
+     */
     inpL = build_inp_embd(model.tok_embd);
 
     auto * inp = build_inp_mem_hybrid();
@@ -153,6 +346,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     ggml_tensor * residual    = nullptr;
     ggml_tensor * prev_router = nullptr;
 
+    /*
+     * zaya.py ref: L71-81 (ResidualScaling.forward)
+     *
+     * hidden_states = (hidden_states.float() + hs_bias) * hs_scale
+     * if self.not_first_layer and residual is not None:
+     *     residual = (residual.float() + res_bias) * res_scale
+     * return residual, hidden_states
+     */
     const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) {
         if (scale == nullptr) {
             return x;
@@ -165,6 +366,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         return x;
     };
 
+    /*
+     * zaya.py ref: L644-651 (ZayaModel.forward layer loop)
+     *
+     * for layer_n, decoder_layer in enumerate(self.layers):
+     *     hidden_states, residual, prev_router_hidden_states = decoder_layer(
+     *         hidden_states, residual, positions, layer_n, prev_router_hidden_states)
+     */
     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers[il];
 
@@ -176,6 +384,18 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         const int64_t n_groups  = n_head + n_head_kv;
         const int64_t n_gqa     = n_head / n_head_kv;
 
+        /*
+         * zaya.py ref: L234-241 (ZayaDecoderATTLayer.forward)
+         * zaya.py ref: L530-537 (ZayaDecoderMLPLayer.forward)
+         *
+         * if self.config.scale_residual_merge:
+         *     residual, hidden_states = self.res_scale(residual, hidden_states)
+         * if residual is not None:
+         *     residual = residual.float() + hidden_states.float()
+         * else:
+         *     residual = hidden_states.float()
+         * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype)
+         */
         ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
         if (residual != nullptr) {
             residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il);
@@ -185,16 +405,45 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         }
         cb(residual, "residual", il);
 
-        // Pre-norm
+        /*
+         * zaya.py ref: L84-95 (_apply_norm_with_fp32_residual)
+         * zaya.py ref: L240-241, L536-537
+         *
+         * if isinstance(norm, RMSNorm):
+         *     if residual.dtype != norm.weight.dtype:
+         *         hidden_states = norm.forward_native(residual)
+         *     else:
+         *         hidden_states = norm(residual)
+         *     return hidden_states.to(target_dtype)
+         */
         cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "input_norm", il);
 
         if (il % 2 == 0) {
             // ===== CCA Attention =====
+            /*
+             * zaya.py ref: L98-184 (ZayaAttention)
+             * zaya.py ref: L171-184 (ZayaAttention.forward)
+             *
+             * def forward(self, hidden_states, position_ids):
+             *     output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...)
+             *     self.qkv(hidden_states, output_qkv)
+             *     q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1)
+             *     q, k = self.rotary_emb(position_ids, q, k)
+             *     attn_output = self.attn(q, k, v)
+             *     attn_output = self.o_proj(attn_output)
+             *     return attn_output
+             */
+
             const int64_t conv_state_size = 2*n_qk;
             const int64_t cca_state_size  = conv_state_size + n_embd;
             GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size);
 
+            /*
+             * zaya.py ref: CCA.py - recurrent state management
+             *
+             * CCA maintains conv_state and prev_hs in recurrent memory
+             */
             ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il);
             ggml_tensor * cca_state     = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs);
             cb(cca_state, "cca_state", il);
@@ -210,14 +459,26 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     conv_state_size*ggml_element_size(cca_state));
             cb(prev_hs, "cca_prev_hs", il);
 
-            // Q, K projections
+            /*
+             * zaya.py ref: L177-179
+             *
+             * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...)
+             * self.qkv(hidden_states, output_qkv)
+             * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1)
+             */
             ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur);
             cb(Qraw, "Qraw", il);
             ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur);
             cb(Kraw, "Kraw", il);
 
-            // HF uses a delayed hidden-state stream for val_proj2. During decode this
-            // comes from the recurrent state; during prefill it is a one-token shift.
+            /*
+             * zaya.py ref: CCA.py - delayed hidden state stream for val_proj2
+             *
+             * During decode: comes from recurrent state
+             * During prefill: one-token shift of current sequence
+             *
+             * hs_d = concat(prev_hs_last, cur[:-1])  along seq dimension
+             */
             ggml_tensor * cur_state_src = ggml_cont(ctx0, cur);
             ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs);
 
@@ -232,7 +493,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             hs_d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, hs_d), n_embd, n_tokens);
             cb(hs_d, "cca_hs_d", il);
 
-            // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens]
+            /*
+             * zaya.py ref: CCA.py - V projection
+             *
+             * V1 = val_proj1(cur)
+             * V2 = val_proj2(hs_d)
+             * Vcur = concat(V1, V2, dim=0)
+             */
             ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur);
             cb(V1, "V1", il);
             ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d);
@@ -240,10 +507,25 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0);
             cb(Vcur, "Vcur", il);
 
-            // Concat Q+K for conv: [n_qk, n_tokens]
+            /*
+             * zaya.py ref: CCA.py - QK concatenation for conv
+             *
+             * QKraw = concat(Qraw, Kraw, dim=0)
+             */
             ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0);
             cb(QKraw, "QKraw", il);
 
+            /*
+             * zaya.py ref: CCA.py - qk_mean computation
+             *
+             * Qpre: [n_embd_head, n_head, n_tokens]
+             * Kpre: [n_embd_head, n_head_kv, n_tokens]
+             * Kpre_grouped = repeat(Kpre, n_gqa times along head dim)
+             * qk_mean_q = (Qpre + Kpre_rep) * 0.5
+             *
+             * Qgroup = group Q by GQA, mean across group
+             * qk_mean_k = (Qmean + Kpre) * 0.5
+             */
             ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens);
             ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens);
 
@@ -261,6 +543,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f);
             cb(qk_mean_k, "qk_mean_k", il);
 
+            /*
+             * zaya.py ref: CCA.py - conv state update
+             *
+             * conv_input = concat(conv_state, QKraw_reshaped, dim=0)
+             * last_conv_states = conv_input[-2:]  (last 2 positions for state update)
+             */
             ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw));
             QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs);
 
@@ -273,6 +561,11 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     n_seq_tokens*conv_input->nb[0]);
             cb(last_conv_states, "cca_last_conv_states", il);
 
+            /*
+             * zaya.py ref: CCA.py - recurrent state write-back
+             *
+             * Update conv_state and prev_hs in recurrent memory for next step
+             */
             const auto kv_head = inp_recr->mctx->get_head();
             ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs,
                     cca_state_all->nb[1],
@@ -287,19 +580,27 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     (kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all));
             ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target));
 
+            /*
+             * zaya.py ref: CCA.py - depthwise conv
+             *
+             * QK = ssm_conv(conv_input, conv_dw) + conv_dw_b
+             */
             ggml_tensor * conv_dw = layer.cca_conv_dw;
             if (conv_dw->type != GGML_TYPE_F32) {
                 conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F32));
             }
-            // conv_input is [L, n_qk, n_seqs], ssm_conv outputs [n_qk, n_tokens, n_seqs]
             ggml_tensor * QK = ggml_ssm_conv(ctx0, conv_input, conv_dw);
-            // permute from [n_qk, n_tokens, n_seqs] to [n_tokens, n_qk, n_seqs]
             QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
             if (layer.cca_conv_dw_b) {
                 QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1));
             }
             cb(QK, "QK_dw", il);
 
+            /*
+             * zaya.py ref: CCA.py - grouped conv
+             *
+             * QK = conv_1d_grouped(QK, conv_grp, n_groups) + conv_grp_b
+             */
             ggml_tensor * conv_grp = layer.cca_conv_grp;
             if (conv_grp->type != GGML_TYPE_F16) {
                 conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16));
@@ -309,8 +610,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(QK, "QK_grp", il);
 
             QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
-            // QK is now [n_qk, n_seq_tokens, n_seqs]
-            // Flatten to 2D: [n_qk, n_tokens] where n_tokens = n_seq_tokens * n_seqs
             QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
 
             ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0);
@@ -319,15 +618,38 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens);
             ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens);
 
+            /*
+             * zaya.py ref: CCA.py - add qk_mean back to Q, K
+             *
+             * Qcur = Qcur + qk_mean_q
+             * Kcur = Kcur + qk_mean_k
+             */
             Qcur = ggml_add(ctx0, Qcur, qk_mean_q);
             Kcur = ggml_add(ctx0, Kcur, qk_mean_k);
 
+            /*
+             * zaya.py ref: CCA.py - L2 normalization and scaling
+             *
+             * Qcur = l2_norm(Qcur) * sqrt(n_embd_head)
+             * Kcur = l2_norm(Kcur) * sqrt(n_embd_head) * cca_k_scale
+             */
             Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head));
             Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head));
             Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1));
             cb(Qcur, "Qcur_pre_rope", il);
             cb(Kcur, "Kcur_pre_rope", il);
 
+            /*
+             * zaya.py ref: L155-164 (rotary embedding)
+             *
+             * self.rotary_emb = get_rope(
+             *     head_size=self.head_dim,
+             *     max_position=config.max_position_embeddings,
+             *     is_neox_style=True,
+             *     rope_parameters={"rope_theta": config.rope_theta, "rope_type": "default", "partial_rotary_factor": 0.5},
+             * )
+             * q, k = self.rotary_emb(position_ids, q, k)
+             */
             ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
             Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -340,7 +662,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
             Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens);
 
-            // GQA attention
+            /*
+             * zaya.py ref: L146-153, L181-182 (Attention + output projection)
+             *
+             * self.attn = Attention(self.cca_num_q_heads, self.head_dim, self.scale, self.cca_num_k_heads, ...)
+             * attn_output = self.attn(q, k, v)
+             * attn_output = self.o_proj(attn_output)
+             */
             cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr,
                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
                 1.0f / sqrtf((float) n_embd_head), il);
@@ -348,24 +676,78 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
         } else {
             // ===== MoE Layer =====
-
-            // Build Zaya router network:
-            // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits.
-
+            /*
+             * zaya.py ref: L481-541 (ZayaDecoderMLPLayer)
+             * zaya.py ref: L382-479 (ZayaBlock)
+             * zaya.py ref: L251-380 (ZayaRouter)
+             *
+             * def forward(self, hidden_states, residual, position_ids, layer_n, prev_router_hidden_states):
+             *     if self.config.scale_residual_merge:
+             *         residual, hidden_states = self.res_scale(residual, hidden_states)
+             *     residual = residual.float() + hidden_states.float()
+             *     hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype)
+             *     hidden_states, prev_router_hidden_states = self.zaya_block(hidden_states, prev_router_hidden_states)
+             *     return hidden_states, residual, prev_router_hidden_states
+             */
+
+            /*
+             * zaya.py ref: L321-380 (ZayaRouter.forward)
+             *
+             * hs = self.down_proj(hidden_states)
+             * if self.use_eda and (prev_router_hidden_states is not None):
+             *     hs = hs + prev_router_hidden_states * self.router_states_scale
+             * router_hidden_states_next = hs[-S:].clone()
+             * hs_norm = self.rmsnorm_eda(hs)
+             * logits = self.router_mlp(hs_norm)  // Linear->GELU->Linear->GELU->Linear
+             * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32)
+             * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases
+             * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1)
+             * route_prob = torch.gather(expert_prob, dim=1, index=expert_choice_t)
+             * return route_prob_flat, expert_choice_flat, router_hidden_states_next
+             */
+
+            /*
+             * zaya.py ref: L343
+             *
+             * hs = self.down_proj(hidden_states)
+             */
             ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b);
             cb(router_h, "router_down", il);
 
+            /*
+             * zaya.py ref: L344-345
+             *
+             * if self.use_eda and (prev_router_hidden_states is not None):
+             *     hs = hs + prev_router_hidden_states * self.router_states_scale
+             */
             if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
                 router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale));
                 cb(router_h, "router_eda", il);
             }
 
-            prev_router = router_h;
+            prev_router = router_h;  // zaya.py ref: L348 (router_hidden_states_next)
 
+            /*
+             * zaya.py ref: L351
+             *
+             * hs_norm = self.rmsnorm_eda(hs)
+             */
             router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il);
             cb(router_h, "router_norm", il);
 
+            /*
+             * zaya.py ref: L305-314, L354
+             *
+             * logits = self.router_mlp(hs_norm)
+             * self.router_mlp = nn.Sequential(
+             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp0
+             *     nn.GELU(),
+             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp2
+             *     nn.GELU(),
+             *     ReplicatedLinear(D, E, bias=False, ...),  // mlp4
+             * )
+             */
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b);
             router_h = ggml_gelu(ctx0, router_h);
@@ -379,20 +761,51 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h);
             cb(router_h, "router_logits", il);
 
+            /*
+             * zaya.py ref: L355-359
+             *
+             * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32)
+             * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases
+             * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1)
+             */
             ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h);
             cb(router_probs, "router_probs", il);
 
-            // Keep the MOD skip expert in the softmax denominator, then route
-            // over real experts only. The checkpoint's skip bias keeps MOD unused.
+            /*
+             * zaya.py ref: L387-389 (MOD skip expert handling)
+             *
+             * gate_probs = router_probs[:, :n_expert]  // exclude skip expert from routing
+             */
             ggml_tensor * gate_probs = ggml_cont(ctx0,
                     ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0));
             cb(gate_probs, "gate_probs", il);
 
+            /*
+             * zaya.py ref: L317-319, L362-363
+             *
+             * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32))
+             * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases
+             */
             ggml_tensor * expert_biases = nullptr;
             if (layer.zaya_router_biases != nullptr) {
                 expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0);
             }
 
+            /*
+             * zaya.py ref: L448-479 (ZayaBlock.forward - MoE execution)
+             *
+             * probs, indices, router_hidden_states_out = self.router(hidden_states, prev_router_hidden_states)
+             * if self.config.zaya_use_mod:
+             *     clamped_indices = torch.clamp(indices, min=0, max=self.num_moe_experts - 1)
+             *     packed_logits = torch.cat([probs, clamped_indices.to(probs.dtype)], dim=-1)
+             *     hidden_states_experts = self.experts(hidden_states, packed_logits)
+             *     hidden_states_mod = hidden_states * probs
+             *     mod_mask = (indices != self.num_moe_experts)
+             *     hidden_states = (mod_mask * hidden_states_experts) + ((~mod_mask) * hidden_states_mod)
+             * else:
+             *     packed_logits = torch.cat([probs, indices.to(probs.dtype)], dim=-1)
+             *     hidden_states = self.experts(hidden_states, packed_logits)
+             */
             cur = build_moe_ffn(cur,
                 /* gate_inp */        nullptr,
                 /* up_exps */         nullptr,
@@ -414,6 +827,17 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         inpL = cur;
     }
 
+    /*
+     * zaya.py ref: L653-664 (ZayaModel.forward - final residual + norm)
+     *
+     * if self.config.scale_residual_merge:
+     *     residual, hidden_states = self.res_scale(residual, hidden_states)
+     * if residual is not None:
+     *     hidden_states = hidden_states.float() + residual.float()
+     * else:
+     *     hidden_states = hidden_states.float()
+     * hidden_states = _apply_norm_with_fp32_residual(self.final_norm, hidden_states, final_input_dtype)
+     */
     ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1);
     if (residual != nullptr) {
         residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1);
@@ -427,12 +851,22 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
     }
 
-    // final norm
+    /*
+     * zaya.py ref: L608-613 (final norm)
+     *
+     * if (config.normalization == "RMSNorm"):
+     *     self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon)
+     */
     cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
-    // output
+    /*
+     * zaya.py ref: L729-746, L769-782 (lm_head + logits_processor)
+     *
+     * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...)
+     * logits = self.logits_processor(self.lm_head, hidden_states)
+     */
     cur = ggml_mul_mat(ctx0, model.output, cur);
     cb(cur, "result_output", -1);
     res->t_logits = cur;

From 2234dab56972888aeee8625905c13a5190c2f9fa Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Thu, 21 May 2026 23:01:34 +0200
Subject: [PATCH 24/33] fix(zaya): gate EDA with layer check matching Python
 use_eda logic

zaya.py L294-296: EDA is disabled for layer 1 (first MoE layer) via
(self.layer_number != zaya_first_layer). Add il != 1 guard to match.
---
 src/models/zaya.cpp | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index e2dd6e52bcf..516bd8b81db 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -716,12 +716,19 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(router_h, "router_down", il);
 
             /*
-             * zaya.py ref: L344-345
+             * zaya.py ref: L294-296, L344-345
+             *
+             * zaya_first_layer = 1
+             * use_eda_cfg = bool(getattr(config, "zaya_use_eda", False))
+             * self.use_eda = use_eda_cfg and (zaya_first_layer is not None) and (self.layer_number != zaya_first_layer)
              *
              * if self.use_eda and (prev_router_hidden_states is not None):
              *     hs = hs + prev_router_hidden_states * self.router_states_scale
+             *
+             * EDA is disabled for layer 1 (first MoE layer) via (self.layer_number != zaya_first_layer).
+             * When zaya_use_eda is False globally, the parameter is never created (tensor stays nullptr).
              */
-            if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
+            if (il != 1 && prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
                 router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale));
                 cb(router_h, "router_eda", il);
             }

From 1fc45810144b1ab8b77a04ea6451c049bf37742d Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 22 May 2026 00:34:48 +0200
Subject: [PATCH 25/33] feat(zaya): add zaya_high_prec for FP32 output logits
 matching Python _FP32EmbeddingMethod

---
 convert_hf_to_gguf.py       |  4 ++++
 gguf-py/gguf/constants.py   |  3 +++
 gguf-py/gguf/gguf_writer.py |  3 +++
 src/llama-arch.cpp          |  3 +++
 src/llama-arch.h            |  3 +++
 src/llama-hparams.h         |  3 +++
 src/llama-model-saver.cpp   |  3 +++
 src/models/zaya.cpp         | 14 +++++++++++++-
 8 files changed, 35 insertions(+), 1 deletion(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5ce42b465c8..0fa85283db2 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6500,6 +6500,10 @@ def set_gguf_parameters(self):
         n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256)
         self.gguf_writer.add_expert_feed_forward_length(n_ff_exp)
 
+        # FP32 output logits for numerical stability (zaya_high_prec)
+        zaya_high_prec = self.hparams.get("zaya_high_prec", True)
+        self.gguf_writer.add_zaya_high_prec(zaya_high_prec)
+
     def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
         if "linear_q" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index eeb14f6aa76..1d759ac54f7 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -153,6 +153,9 @@ class LLM:
         DENSE_FEAT_IN_SIZE                = "{arch}.{dense}_feat_in"
         DENSE_FEAT_OUT_SIZE               = "{arch}.{dense}_feat_out"
 
+        # Zaya-specific
+        ZAYA_HIGH_PREC                    = "zaya.high_prec"
+
     class Attention:
         HEAD_COUNT                   = "{arch}.attention.head_count"
         HEAD_COUNT_KV                = "{arch}.attention.head_count_kv"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index 35fb01470c4..d45a529bd90 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1293,6 +1293,9 @@ def add_xielu_beta(self, values: Sequence[float]):
     def add_xielu_eps(self, values: Sequence[float]):
         self.add_array(Keys.xIELU.EPS, values)
 
+    def add_zaya_high_prec(self, value: bool) -> None:
+        self.add_bool(Keys.LLM.ZAYA_HIGH_PREC, value)
+
     # diffusion models
 
     def add_diffusion_shift_logits(self, value: bool) -> None:
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index e0ac2a625dc..43fdb881cbd 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -293,6 +293,9 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"   },
     { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out"  },
 
+    // Zaya-specific
+    { LLM_KV_ZAYA_HIGH_PREC,         "%s.zaya.high_prec"    },
+
     { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
     { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
     { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index 3809afc124c..a0a18843356 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -345,6 +345,9 @@ enum llm_kv {
     LLM_KV_DENSE_2_FEAT_OUT,
     LLM_KV_DENSE_3_FEAT_IN,
     LLM_KV_DENSE_3_FEAT_OUT,
+
+    // Zaya-specific
+    LLM_KV_ZAYA_HIGH_PREC,
 };
 
 enum llm_tensor {
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 0160a89caa2..7982dba8ab2 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -212,6 +212,9 @@ struct llama_hparams {
     // qwen3vl deepstack
     uint32_t n_deepstack_layers = 0;
 
+    // zaya: FP32 output logits for numerical stability
+    bool zaya_high_prec = true;
+
     // gemma4 per-layer embedding
     uint32_t n_embd_per_layer = 0;
 
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index e83056557bf..07bf6bc812d 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -377,6 +377,9 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_DENSE_2_FEAT_OUT,                  hparams.dense_2_feat_out);
     add_kv(LLM_KV_DENSE_3_FEAT_IN,                   hparams.dense_3_feat_in);
     add_kv(LLM_KV_DENSE_3_FEAT_OUT,                  hparams.dense_3_feat_out);
+
+    // Zaya-specific
+    add_kv(LLM_KV_ZAYA_HIGH_PREC,                    hparams.zaya_high_prec);
 }
 
 void llama_model_saver::add_tensors_from_model() {
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 516bd8b81db..241a6ad8f32 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -204,7 +204,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
          * hidden_states = (hidden_states.float() + hs_bias) * hs_scale
          * residual = (residual.float() + res_bias) * res_scale
          */
-        layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
+        layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res  = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -876,6 +876,18 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
      */
     cur = ggml_mul_mat(ctx0, model.output, cur);
     cb(cur, "result_output", -1);
+
+    /*
+     * zaya.py ref: L748-749 (_FP32EmbeddingMethod)
+     *
+     * if self.zaya_high_prec:
+     *     out = out.to(dtype=torch.float32)
+     */
+    if (hparams.zaya_high_prec) {
+        cur = ggml_cont(ctx0, ggml_cast(ctx0, cur, GGML_TYPE_F32));
+        cb(cur, "result_output_fp32", -1);
+    }
+
     res->t_logits = cur;
 
     ggml_build_forward_expand(gf, cur);

From 0f37acecd23ae9e95e70929ee8f534a40652cfe5 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 22 May 2026 01:56:05 +0200
Subject: [PATCH 26/33] zaya.cpp: fix comment reference to MOD skip expert
 handling

Correct line reference from zaya.py L387-389 to L459-469, and add
note explaining why excluding the skip expert from gate_probs is
correct (bias=-1.0 makes it effectively never selected at inference
with topk=1).
---
 src/models/zaya.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 241a6ad8f32..5e86085fb7a 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -779,9 +779,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(router_probs, "router_probs", il);
 
             /*
-             * zaya.py ref: L387-389 (MOD skip expert handling)
+             * zaya.py ref: L459-469 (MOD skip expert handling)
              *
              * gate_probs = router_probs[:, :n_expert]  // exclude skip expert from routing
+             *
+             * Note: the skip expert (index n_expert) has a -1.0 bias in
+             * balancing_biases, making it practically never selected during
+             * inference with topk=1.
              */
             ggml_tensor * gate_probs = ggml_cont(ctx0,
                     ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0));

From 6b6700a1123eacd8d2bdf04a6964aff24be732b7 Mon Sep 17 00:00:00 2001
From: leo <leo@example.com>
Date: Fri, 22 May 2026 16:30:29 +0200
Subject: [PATCH 27/33] zaya: add cca_mask input tensor for CCA padding masking

- New llm_graph_input_cca_mask class + build_inp_cca_mask() in graph infra
- cca_mask tensor [1, n_tokens] F32 binary mask applied to hidden_states
  before CCA convolutions (modeling_zaya.py ref: CCA.forward L325-328)
- Applied only during prefill (n_seq_tokens > 1), matching Python logic
- Mask filled with 1.0f for all positions (no padding info in ubatch)
---
 src/llama-graph.cpp | 32 ++++++++++++++++++++++++++++++++
 src/llama-graph.h   | 16 ++++++++++++++++
 src/models/zaya.cpp | 27 +++++++++++++++++++++++++++
 3 files changed, 75 insertions(+)

diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index e4f0ff98ef4..80db7b66e25 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -132,6 +132,23 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
+void llm_graph_input_cca_mask::set_input(const llama_ubatch * ubatch) {
+    if (cca_mask) {
+        const int64_t n_tokens = ubatch->n_tokens;
+        // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward)
+        //
+        // if attention_mask is not None:
+        //     cca_mask = attention_mask.clone()
+        // else:
+        //     cca_mask = None
+        //
+        // In llama.cpp, all tokens are valid (no padding tokens in the ubatch),
+        // so the mask is set to 1.0 for every token position.
+        std::vector<float> mask_data(n_tokens, 1.0f);
+        ggml_backend_tensor_set(cca_mask, mask_data.data(), 0, n_tokens * ggml_element_size(cca_mask));
+    }
+}
+
 void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -1822,6 +1839,21 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
     return cur;
 }
 
+ggml_tensor * llm_graph_context::build_inp_cca_mask() const {
+    auto inp = std::make_unique<llm_graph_input_cca_mask>();
+
+    auto & cur = inp->cca_mask;
+
+    // shape: [1, n_tokens] for broadcasting with [n_embd, n_tokens] hidden states
+    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_tokens);
+    ggml_set_input(cur);
+    ggml_set_name(cur, "cca_mask");
+
+    res->add_input(std::move(inp));
+
+    return cur;
+}
+
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
     //       but this would make the graph topology depend on the number of output tokens, which can interfere with
diff --git a/src/llama-graph.h b/src/llama-graph.h
index 5cb1756c6a9..d6c0d0458c3 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -150,6 +150,21 @@ class llm_graph_input_attn_temp : public llm_graph_input_i {
     const float    f_attn_temp_offset;
 };
 
+// cca_mask for CCA (Channel-wise Cross Attention), used by zaya
+// Binary mask applied to hidden_states before CCA convolutions,
+// matching modeling_zaya.py ref: CCA.forward L325-328
+//   if cca_mask is not None and hidden_states.shape[1] > 1:
+//       hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype)
+class llm_graph_input_cca_mask : public llm_graph_input_i {
+public:
+    llm_graph_input_cca_mask() = default;
+    virtual ~llm_graph_input_cca_mask() = default;
+
+    void set_input(const llama_ubatch * ubatch) override;
+
+    ggml_tensor * cca_mask = nullptr; // F32 [1, n_tokens]
+};
+
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
 public:
     llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
@@ -880,6 +895,7 @@ struct llm_graph_context {
     ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
     ggml_tensor * build_inp_pos() const;
     ggml_tensor * build_inp_attn_scale() const;
+    ggml_tensor * build_inp_cca_mask() const;
     ggml_tensor * build_inp_out_ids() const;
     ggml_tensor * build_inp_mean() const;
     ggml_tensor * build_inp_cls() const;
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 5e86085fb7a..5d403c65d19 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -343,6 +343,17 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
+    ggml_tensor * inp_cca_mask = nullptr;
+    // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward)
+    //
+    // if attention_mask is not None:
+    //     cca_mask = attention_mask.clone()
+    // else:
+    //     cca_mask = None
+    //
+    // Built unconditionally; set_input fills with 1.0 for all positions
+    // (padding mask is not available in llama.cpp ubatch).
+    inp_cca_mask = build_inp_cca_mask();
     ggml_tensor * residual    = nullptr;
     ggml_tensor * prev_router = nullptr;
 
@@ -459,6 +470,22 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     conv_state_size*ggml_element_size(cca_state));
             cb(prev_hs, "cca_prev_hs", il);
 
+            /*
+             * modeling_zaya.py ref: L325-328 (CCA.forward)
+             *
+             * if cca_mask is not None and hidden_states.shape[1] > 1:
+             *     # Only applying in prefill
+             *     dtype = hidden_states.dtype
+             *     hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype)
+             *
+             * In ggml: cur is [n_embd, n_tokens], cca_mask is [1, n_tokens].
+             * Broadcasting along dim 0 zeros out hidden states of masked positions.
+             */
+            if (inp_cca_mask != nullptr && n_seq_tokens > 1) {
+                cur = ggml_mul(ctx0, cur, inp_cca_mask);
+                cb(cur, "cca_masked", il);
+            }
+
             /*
              * zaya.py ref: L177-179
              *

From 9aaef944393aa0791467fb2717212ec50dd39754 Mon Sep 17 00:00:00 2001
From: leo <leo@example.com>
Date: Fri, 22 May 2026 18:36:26 +0200
Subject: [PATCH 28/33] zaya: cast residual to F32 before addition
 (residual_in_fp32)

Match the Python reference, which casts hidden_states and residual to
float32 before adding them: cast both operands to F32 before ggml_add
in the per-layer and final residual paths.

zaya.py ref: L900, L1387, L1701
---
 src/models/zaya.cpp | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 5d403c65d19..efaedebac8e 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -407,12 +407,17 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
          *     residual = hidden_states.float()
          * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype)
          */
-        ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
+         ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
+        /*
+         * zaya.py ref: L900, L1387, L1701
+         * if self.config.residual_in_fp32:
+         *     residual = hidden_states.to(torch.float32)
+         */
         if (residual != nullptr) {
             residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il);
-            residual = ggml_add(ctx0, hidden_states, residual);
+            residual = ggml_add(ctx0, ggml_cast(ctx0, hidden_states, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32));
         } else {
-            residual = hidden_states;
+            residual = ggml_cast(ctx0, hidden_states, GGML_TYPE_F32);
         }
         cb(residual, "residual", il);
 
@@ -879,9 +884,15 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1);
     if (residual != nullptr) {
         residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1);
-        cur = ggml_add(ctx0, final_hidden, residual);
+        /*
+         * zaya.py ref: L1701
+         * if self.config.residual_in_fp32:
+         *     hidden_states = hidden_states.float()
+         *     residual = residual.float()
+         */
+        cur = ggml_add(ctx0, ggml_cast(ctx0, final_hidden, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32));
     } else {
-        cur = final_hidden;
+        cur = ggml_cast(ctx0, final_hidden, GGML_TYPE_F32);
     }
     cb(cur, "final_residual", -1);
 

From abe9e401080d147a0a2a4976471025799f828e8c Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 22 May 2026 22:31:20 +0200
Subject: [PATCH 29/33] cleanup: revert debug commits

---
 convert_hf_to_gguf.py       |  4 --
 gguf-py/gguf/constants.py   |  3 --
 gguf-py/gguf/gguf_writer.py |  3 --
 src/llama-arch.cpp          |  3 --
 src/llama-arch.h            |  3 --
 src/llama-graph.cpp         | 32 ---------------
 src/llama-graph.h           | 16 --------
 src/llama-hparams.h         |  3 --
 src/llama-model-saver.cpp   |  3 --
 src/models/zaya.cpp         | 79 +++++--------------------------------
 10 files changed, 9 insertions(+), 140 deletions(-)

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 0fa85283db2..5ce42b465c8 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6500,10 +6500,6 @@ def set_gguf_parameters(self):
         n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256)
         self.gguf_writer.add_expert_feed_forward_length(n_ff_exp)
 
-        # FP32 output logits for numerical stability (zaya_high_prec)
-        zaya_high_prec = self.hparams.get("zaya_high_prec", True)
-        self.gguf_writer.add_zaya_high_prec(zaya_high_prec)
-
     def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
         if "linear_q" in name:
             yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 1d759ac54f7..eeb14f6aa76 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -153,9 +153,6 @@ class LLM:
         DENSE_FEAT_IN_SIZE                = "{arch}.{dense}_feat_in"
         DENSE_FEAT_OUT_SIZE               = "{arch}.{dense}_feat_out"
 
-        # Zaya-specific
-        ZAYA_HIGH_PREC                    = "zaya.high_prec"
-
     class Attention:
         HEAD_COUNT                   = "{arch}.attention.head_count"
         HEAD_COUNT_KV                = "{arch}.attention.head_count_kv"
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py
index d45a529bd90..35fb01470c4 100644
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -1293,9 +1293,6 @@ def add_xielu_beta(self, values: Sequence[float]):
     def add_xielu_eps(self, values: Sequence[float]):
         self.add_array(Keys.xIELU.EPS, values)
 
-    def add_zaya_high_prec(self, value: bool) -> None:
-        self.add_bool(Keys.LLM.ZAYA_HIGH_PREC, value)
-
     # diffusion models
 
     def add_diffusion_shift_logits(self, value: bool) -> None:
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp
index 43fdb881cbd..e0ac2a625dc 100644
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -293,9 +293,6 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_DENSE_3_FEAT_IN,        "%s.dense_3_feat_in"   },
     { LLM_KV_DENSE_3_FEAT_OUT,       "%s.dense_3_feat_out"  },
 
-    // Zaya-specific
-    { LLM_KV_ZAYA_HIGH_PREC,         "%s.zaya.high_prec"    },
-
     { LLM_KV_TOKENIZER_MODEL,                "tokenizer.ggml.model"                    },
     { LLM_KV_TOKENIZER_PRE,                  "tokenizer.ggml.pre"                      },
     { LLM_KV_TOKENIZER_LIST,                 "tokenizer.ggml.tokens"                   },
diff --git a/src/llama-arch.h b/src/llama-arch.h
index a0a18843356..3809afc124c 100644
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -345,9 +345,6 @@ enum llm_kv {
     LLM_KV_DENSE_2_FEAT_OUT,
     LLM_KV_DENSE_3_FEAT_IN,
     LLM_KV_DENSE_3_FEAT_OUT,
-
-    // Zaya-specific
-    LLM_KV_ZAYA_HIGH_PREC,
 };
 
 enum llm_tensor {
diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp
index 80db7b66e25..e4f0ff98ef4 100644
--- a/src/llama-graph.cpp
+++ b/src/llama-graph.cpp
@@ -132,23 +132,6 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) {
     return res;
 }
 
-void llm_graph_input_cca_mask::set_input(const llama_ubatch * ubatch) {
-    if (cca_mask) {
-        const int64_t n_tokens = ubatch->n_tokens;
-        // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward)
-        //
-        // if attention_mask is not None:
-        //     cca_mask = attention_mask.clone()
-        // else:
-        //     cca_mask = None
-        //
-        // In llama.cpp, all tokens are valid (no padding tokens in the ubatch),
-        // so the mask is set to 1.0 for every token position.
-        std::vector<float> mask_data(n_tokens, 1.0f);
-        ggml_backend_tensor_set(cca_mask, mask_data.data(), 0, n_tokens * ggml_element_size(cca_mask));
-    }
-}
-
 void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) {
     if (ubatch->pos && attn_scale) {
         const int64_t n_tokens = ubatch->n_tokens;
@@ -1839,21 +1822,6 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const {
     return cur;
 }
 
-ggml_tensor * llm_graph_context::build_inp_cca_mask() const {
-    auto inp = std::make_unique<llm_graph_input_cca_mask>();
-
-    auto & cur = inp->cca_mask;
-
-    // shape: [1, n_tokens] for broadcasting with [n_embd, n_tokens] hidden states
-    cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_tokens);
-    ggml_set_input(cur);
-    ggml_set_name(cur, "cca_mask");
-
-    res->add_input(std::move(inp));
-
-    return cur;
-}
-
 ggml_tensor * llm_graph_context::build_inp_out_ids() const {
     // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls,
     //       but this would make the graph topology depend on the number of output tokens, which can interfere with
diff --git a/src/llama-graph.h b/src/llama-graph.h
index d6c0d0458c3..5cb1756c6a9 100644
--- a/src/llama-graph.h
+++ b/src/llama-graph.h
@@ -150,21 +150,6 @@ class llm_graph_input_attn_temp : public llm_graph_input_i {
     const float    f_attn_temp_offset;
 };
 
-// cca_mask for CCA (Channel-wise Cross Attention), used by zaya
-// Binary mask applied to hidden_states before CCA convolutions,
-// matching modeling_zaya.py ref: CCA.forward L325-328
-//   if cca_mask is not None and hidden_states.shape[1] > 1:
-//       hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype)
-class llm_graph_input_cca_mask : public llm_graph_input_i {
-public:
-    llm_graph_input_cca_mask() = default;
-    virtual ~llm_graph_input_cca_mask() = default;
-
-    void set_input(const llama_ubatch * ubatch) override;
-
-    ggml_tensor * cca_mask = nullptr; // F32 [1, n_tokens]
-};
-
 class llm_graph_input_pos_bucket : public llm_graph_input_i {
 public:
     llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {}
@@ -895,7 +880,6 @@ struct llm_graph_context {
     ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const;
     ggml_tensor * build_inp_pos() const;
     ggml_tensor * build_inp_attn_scale() const;
-    ggml_tensor * build_inp_cca_mask() const;
     ggml_tensor * build_inp_out_ids() const;
     ggml_tensor * build_inp_mean() const;
     ggml_tensor * build_inp_cls() const;
diff --git a/src/llama-hparams.h b/src/llama-hparams.h
index 7982dba8ab2..0160a89caa2 100644
--- a/src/llama-hparams.h
+++ b/src/llama-hparams.h
@@ -212,9 +212,6 @@ struct llama_hparams {
     // qwen3vl deepstack
     uint32_t n_deepstack_layers = 0;
 
-    // zaya: FP32 output logits for numerical stability
-    bool zaya_high_prec = true;
-
     // gemma4 per-layer embedding
     uint32_t n_embd_per_layer = 0;
 
diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp
index 07bf6bc812d..e83056557bf 100644
--- a/src/llama-model-saver.cpp
+++ b/src/llama-model-saver.cpp
@@ -377,9 +377,6 @@ void llama_model_saver::add_kv_from_model() {
     add_kv(LLM_KV_DENSE_2_FEAT_OUT,                  hparams.dense_2_feat_out);
     add_kv(LLM_KV_DENSE_3_FEAT_IN,                   hparams.dense_3_feat_in);
     add_kv(LLM_KV_DENSE_3_FEAT_OUT,                  hparams.dense_3_feat_out);
-
-    // Zaya-specific
-    add_kv(LLM_KV_ZAYA_HIGH_PREC,                    hparams.zaya_high_prec);
 }
 
 void llama_model_saver::add_tensors_from_model() {
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index efaedebac8e..e2dd6e52bcf 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -204,7 +204,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
          * hidden_states = (hidden_states.float() + hs_bias) * hs_scale
          * residual = (residual.float() + res_bias) * res_scale
          */
-        layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
+        layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
         layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res  = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -343,17 +343,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
     ggml_tensor * inp_pos     = build_inp_pos();
     ggml_tensor * inp_out_ids = build_inp_out_ids();
-    ggml_tensor * inp_cca_mask = nullptr;
-    // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward)
-    //
-    // if attention_mask is not None:
-    //     cca_mask = attention_mask.clone()
-    // else:
-    //     cca_mask = None
-    //
-    // Built unconditionally; set_input fills with 1.0 for all positions
-    // (padding mask is not available in llama.cpp ubatch).
-    inp_cca_mask = build_inp_cca_mask();
     ggml_tensor * residual    = nullptr;
     ggml_tensor * prev_router = nullptr;
 
@@ -407,17 +396,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
          *     residual = hidden_states.float()
          * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype)
          */
-         ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
-        /*
-         * zaya.py ref: L900, L1387, L1701
-         * if self.config.residual_in_fp32:
-         *     residual = hidden_states.to(torch.float32)
-         */
+        ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
         if (residual != nullptr) {
             residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il);
-            residual = ggml_add(ctx0, ggml_cast(ctx0, hidden_states, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32));
+            residual = ggml_add(ctx0, hidden_states, residual);
         } else {
-            residual = ggml_cast(ctx0, hidden_states, GGML_TYPE_F32);
+            residual = hidden_states;
         }
         cb(residual, "residual", il);
 
@@ -475,22 +459,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     conv_state_size*ggml_element_size(cca_state));
             cb(prev_hs, "cca_prev_hs", il);
 
-            /*
-             * modeling_zaya.py ref: L325-328 (CCA.forward)
-             *
-             * if cca_mask is not None and hidden_states.shape[1] > 1:
-             *     # Only applying in prefill
-             *     dtype = hidden_states.dtype
-             *     hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype)
-             *
-             * In ggml: cur is [n_embd, n_tokens], cca_mask is [1, n_tokens].
-             * Broadcasting along dim 0 zeros out hidden states of masked positions.
-             */
-            if (inp_cca_mask != nullptr && n_seq_tokens > 1) {
-                cur = ggml_mul(ctx0, cur, inp_cca_mask);
-                cb(cur, "cca_masked", il);
-            }
-
             /*
              * zaya.py ref: L177-179
              *
@@ -748,19 +716,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(router_h, "router_down", il);
 
             /*
-             * zaya.py ref: L294-296, L344-345
-             *
-             * zaya_first_layer = 1
-             * use_eda_cfg = bool(getattr(config, "zaya_use_eda", False))
-             * self.use_eda = use_eda_cfg and (zaya_first_layer is not None) and (self.layer_number != zaya_first_layer)
+             * zaya.py ref: L344-345
              *
              * if self.use_eda and (prev_router_hidden_states is not None):
              *     hs = hs + prev_router_hidden_states * self.router_states_scale
-             *
-             * EDA is disabled for layer 1 (first MoE layer) via (self.layer_number != zaya_first_layer).
-             * When zaya_use_eda is False globally, the parameter is never created (tensor stays nullptr).
              */
-            if (il != 1 && prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
+            if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
                 router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale));
                 cb(router_h, "router_eda", il);
             }
@@ -811,13 +772,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(router_probs, "router_probs", il);
 
             /*
-             * zaya.py ref: L459-469 (MOD skip expert handling)
+             * zaya.py ref: L387-389 (MOD skip expert handling)
              *
              * gate_probs = router_probs[:, :n_expert]  // exclude skip expert from routing
-             *
-             * Note: the skip expert (index n_expert) has a -1.0 bias in
-             * balancing_biases, making it practically never selected during
-             * inference with topk=1.
              */
             ggml_tensor * gate_probs = ggml_cont(ctx0,
                     ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0));
@@ -884,15 +841,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1);
     if (residual != nullptr) {
         residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1);
-        /*
-         * zaya.py ref: L1701
-         * if self.config.residual_in_fp32:
-         *     hidden_states = hidden_states.float()
-         *     residual = residual.float()
-         */
-        cur = ggml_add(ctx0, ggml_cast(ctx0, final_hidden, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32));
+        cur = ggml_add(ctx0, final_hidden, residual);
     } else {
-        cur = ggml_cast(ctx0, final_hidden, GGML_TYPE_F32);
+        cur = final_hidden;
     }
     cb(cur, "final_residual", -1);
 
@@ -918,18 +869,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
      */
     cur = ggml_mul_mat(ctx0, model.output, cur);
     cb(cur, "result_output", -1);
-
-    /*
-     * zaya.py ref: L748-749 (_FP32EmbeddingMethod)
-     *
-     * if self.zaya_high_prec:
-     *     out = out.to(dtype=torch.float32)
-     */
-    if (hparams.zaya_high_prec) {
-        cur = ggml_cont(ctx0, ggml_cast(ctx0, cur, GGML_TYPE_F32));
-        cb(cur, "result_output_fp32", -1);
-    }
-
     res->t_logits = cur;
 
     ggml_build_forward_expand(gf, cur);

From 6fad5d867ef87efd8998cdde57a164c72a522ea4 Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 22 May 2026 22:43:47 +0200
Subject: [PATCH 30/33] Revert "docs(zaya): add Python reference comments to
 C++ implementation"

This reverts commit f1bd772a37f3407d8fd2809edef1f0e0d245b760.
---
 src/models/zaya.cpp | 488 +++-----------------------------------------
 1 file changed, 27 insertions(+), 461 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index e2dd6e52bcf..ce65c2281fa 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -5,25 +5,6 @@
 
 #include <cmath>
 
-/*
- * zaya.py ref: L52-81 (ResidualScaling class)
- *
- * class ResidualScaling(nn.Module):
- *     def __init__(self, config, layer_n, ...):
- *         self.not_first_layer = (layer_n != 0)
- *         self.hidden_states_scale = torch.nn.Parameter(torch.ones(config.hidden_size))
- *         self.hidden_states_bias  = torch.nn.Parameter(torch.zeros(config.hidden_size))
- *         if self.not_first_layer:
- *             self.residual_scale = torch.nn.Parameter(torch.ones(config.hidden_size))
- *             self.residual_bias  = torch.nn.Parameter(torch.zeros(config.hidden_size))
- *
- *     def forward(self, residual, hidden_states):
- *         hidden_states = (hidden_states.float() + hs_bias) * hs_scale
- *         if self.not_first_layer and residual is not None:
- *             residual = (residual.float() + res_bias) * res_scale
- *         return residual, hidden_states
- */
-
 void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
     ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
     ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv);
@@ -34,15 +15,6 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
     hparams.ssm_d_state = 1;
     hparams.ssm_n_group = 0;
 
-    /*
-     * zaya.py ref: L575-602 (layer alternation)
-     *
-     * for layer_n in range(config.num_hidden_layers):
-     *     if layer_n % 2 == 1:
-     *         self.layers.append(ZayaDecoderMLPLayer(...))   # MoE layer
-     *     else:
-     *         self.layers.append(ZayaDecoderATTLayer(...))   # Attention layer
-     */
     for (uint32_t i = 0; i < hparams.n_layer; ++i) {
         hparams.recurrent_layer_arr[i] = (i % 2) == 0;
     }
@@ -56,42 +28,17 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) {
 void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
     LLAMA_LOAD_LOCALS;
 
-    /*
-     * zaya.py ref: L569-573
-     *
-     * self.embed_tokens = VocabParallelEmbedding(
-     *     self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size)
-     */
     tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0);
 
-    /*
-     * zaya.py ref: L608-613
-     *
-     * if (config.normalization == "RMSNorm"):
-     *     self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon)
-     * elif (config.normalization == "LayerNorm"):
-     *     self.final_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon)
-     */
+    // output norm
     output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0);
 
-    /*
-     * zaya.py ref: L729-743
-     *
-     * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...)
-     * if self.config.tie_word_embeddings:
-     *     self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens)
-     */
+    // output (tied with tok_embd if not present)
     output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED);
     if (output == nullptr) {
-        output = tok_embd;  // tied weights
+        output = tok_embd;
     }
 
-    /*
-     * zaya.py ref: L605-606 (final ResidualScaling after all layers)
-     *
-     * if self.config.scale_residual_merge:
-     *     self.res_scale = ResidualScaling(config, config.num_hidden_layers)
-     */
     zaya_res_scale_hs    = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
     zaya_res_scale_hs_b  = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL,    "bias"),   {n_embd}, TENSOR_NOT_REQUIRED);
     zaya_res_scale_res   = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL,   "weight"), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -99,6 +46,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
     const int64_t n_embd_head = hparams.n_embd_head_k();
     const int64_t d_conv      = hparams.ssm_d_conv;
+    // Router MLP hidden size (zaya_mlp_expansion)
     const int64_t n_ff_exp    = hparams.n_ff_exp;
 
     for (int i = 0; i < n_layer; ++i) {
@@ -113,97 +61,31 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
         const int64_t n_ff      = hparams.n_ff(i);
         const int64_t n_expert  = hparams.n_expert;
 
-        /*
-         * zaya.py ref: L212-217 (ZayaDecoderATTLayer input_norm)
-         * zaya.py ref: L508-513 (ZayaDecoderMLPLayer input_norm)
-         *
-         * if (config.normalization == "RMSNorm"):
-         *     self.input_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon)
-         * elif (config.normalization == "LayerNorm"):
-         *     self.input_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon)
-         */
         layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0);
 
         // CCA attention layers (even indices only)
         if (i % 2 == 0) {
-            /*
-             * zaya.py ref: L98-184 (ZayaAttention class)
-             *
-             * self.q_dim = cca_num_q_heads * head_dim
-             * self.k_dim = cca_num_k_heads * head_dim
-             * self.v_dim = cca_num_k_heads * head_dim
-             *
-             * self.qkv = CCA(config, cca_num_k_heads, cca_num_q_heads, cca_num_heads, ...)
-             * self.o_proj = ReplicatedLinear(cca_num_q_heads * head_dim, hidden_size, ...)
-             * self.attn = Attention(cca_num_q_heads, head_dim, scale, cca_num_k_heads, ...)
-             * self.rotary_emb = get_rope(head_size=head_dim, ..., partial_rotary_factor=0.5)
-             */
-
-            /*
-             * zaya.py ref: L125-138 (CCA layer for Q, K projections)
-             *
-             * self.qkv = CCA(...)
-             * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...)
-             * self.qkv(hidden_states, output_qkv)
-             * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1)
-             */
             layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0);
             layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0);
 
-            /*
-             * zaya.py ref: CCA.py - value projections (val_proj1, val_proj2)
-             *
-             * V1 = val_proj1(x)
-             * V2 = val_proj2(x_delayed)
-             * V = concat(V1, V2)
-             */
             layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i),
                 {n_embd, n_embd_k / 2}, 0);
             layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i),
                 {n_embd, n_embd_k / 2}, 0);
 
-            /*
-             * zaya.py ref: L139-144
-             *
-             * self.o_proj = ReplicatedLinear(self.cca_num_q_heads * self.head_dim,
-             *                                self.hidden_size, bias=self.config.attention_bias, ...)
-             */
             layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0);
 
-            /*
-             * zaya.py ref: CCA.py - depthwise conv on QK
-             *
-             * conv_dw applied to [Q, K] concatenated
-             */
             layer.cca_conv_dw   = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, n_qk}, 0);
             layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED);
 
-            /*
-             * zaya.py ref: CCA.py - grouped conv on QK
-             *
-             * conv_grp applied after dw conv, with n_groups = n_head + n_head_kv
-             */
             layer.cca_conv_grp   = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i),
                 {d_conv, n_qk / n_groups, n_qk}, 0);
             layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0);
 
-            /*
-             * zaya.py ref: CCA.py - K scaling after L2 norm
-             *
-             * Kcur = Kcur * cca_k_scale
-             */
             layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0);
         }
 
-        /*
-         * zaya.py ref: L52-81, L219-220, L515-516 (per-layer ResidualScaling)
-         *
-         * if self.config.scale_residual_merge:
-         *     self.res_scale = ResidualScaling(config, layer_n)
-         *
-         * hidden_states = (hidden_states.float() + hs_bias) * hs_scale
-         * residual = (residual.float() + res_bias) * res_scale
-         */
+        // Residual scaling
         layer.res_scale_hs   = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0);
         layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
         layer.res_scale_res  = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED);
@@ -211,51 +93,13 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
 
         // MoE layers (odd indices)
         if (i % 2 == 1) {
-            /*
-             * zaya.py ref: L251-380 (ZayaRouter class)
-             *
-             * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...)
-             * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps)
-             * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion))  // EDA scale
-             * self.router_mlp = nn.Sequential(
-             *     ReplicatedLinear(D, D, bias=True, ...),
-             *     nn.GELU(),
-             *     ReplicatedLinear(D, D, bias=True, ...),
-             *     nn.GELU(),
-             *     ReplicatedLinear(D, E, bias=False, ...),
-             * )
-             * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32))
-             */
-
-            /*
-             * zaya.py ref: L291
-             *
-             * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...)
-             */
+            // Router network
             layer.zaya_router_down   = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i),
                 {n_embd, n_ff_exp}, 0);
             layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i),
                 {n_ff_exp}, TENSOR_NOT_REQUIRED);
-
-            /*
-             * zaya.py ref: L298-299
-             *
-             * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps)
-             */
             layer.zaya_router_norm   = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i),
                 {n_ff_exp}, 0);
-
-            /*
-             * zaya.py ref: L305-314 (router MLP layers 0, 2, 4)
-             *
-             * self.router_mlp = nn.Sequential(
-             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp0
-             *     self.non_linearity,                        // GELU
-             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp2
-             *     self.non_linearity,                        // GELU
-             *     ReplicatedLinear(D, E, bias=False, ...),  // mlp4
-             * )
-             */
             layer.zaya_router_mlp0   = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i),
                 {n_ff_exp, n_ff_exp}, 0);
             layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i),
@@ -266,40 +110,12 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) {
                 {n_ff_exp}, TENSOR_NOT_REQUIRED);
             layer.zaya_router_mlp4   = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i),
                 {n_ff_exp, n_expert + 1}, 0);
-
-            /*
-             * zaya.py ref: L317-319
-             *
-             * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32))
-             * if self.use_mod:
-             *     self.balancing_biases[-1] = -1.0
-             */
             layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i),
                 {n_expert + 1}, TENSOR_NOT_REQUIRED);
-
-            /*
-             * zaya.py ref: L302-303
-             *
-             * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion))
-             */
             layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i),
                 {n_ff_exp}, TENSOR_NOT_REQUIRED);
 
-            /*
-             * zaya.py ref: L435-446 (FusedMoE experts)
-             *
-             * self.experts = FusedMoE(
-             *     num_experts=self.num_moe_experts,
-             *     top_k=self.topk,
-             *     hidden_size=config.hidden_size,
-             *     intermediate_size=ffn_hidden_size // 2,
-             *     reduce_results=False,
-             *     renormalize=False,
-             *     custom_routing_function=_custom_routing_fn,
-             *     activation="silu",
-             *     ...
-             * )
-             */
+            // MoE experts (fused gate_up and down)
             create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0);
             layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i),
                 {n_ff, n_embd, n_expert}, 0);
@@ -327,15 +143,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     ggml_tensor * cur;
     ggml_tensor * inpL;
 
-    /*
-     * zaya.py ref: L638-641 (ZayaModel.forward)
-     *
-     * if inputs_embeds is None:
-     *     inputs_embeds = self.embed_tokens(input_ids)
-     * residual = None
-     * hidden_states = inputs_embeds
-     * prev_router_hidden_states = None
-     */
     inpL = build_inp_embd(model.tok_embd);
 
     auto * inp = build_inp_mem_hybrid();
@@ -346,14 +153,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     ggml_tensor * residual    = nullptr;
     ggml_tensor * prev_router = nullptr;
 
-    /*
-     * zaya.py ref: L71-81 (ResidualScaling.forward)
-     *
-     * hidden_states = (hidden_states.float() + hs_bias) * hs_scale
-     * if self.not_first_layer and residual is not None:
-     *     residual = (residual.float() + res_bias) * res_scale
-     * return residual, hidden_states
-     */
     const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) {
         if (scale == nullptr) {
             return x;
@@ -366,13 +165,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         return x;
     };
 
-    /*
-     * zaya.py ref: L644-651 (ZayaModel.forward layer loop)
-     *
-     * for layer_n, decoder_layer in enumerate(self.layers):
-     *     hidden_states, residual, prev_router_hidden_states = decoder_layer(
-     *         hidden_states, residual, positions, layer_n, prev_router_hidden_states)
-     */
     for (int il = 0; il < n_layer; ++il) {
         const auto & layer = model.layers[il];
 
@@ -384,18 +176,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         const int64_t n_groups  = n_head + n_head_kv;
         const int64_t n_gqa     = n_head / n_head_kv;
 
-        /*
-         * zaya.py ref: L234-241 (ZayaDecoderATTLayer.forward)
-         * zaya.py ref: L530-537 (ZayaDecoderMLPLayer.forward)
-         *
-         * if self.config.scale_residual_merge:
-         *     residual, hidden_states = self.res_scale(residual, hidden_states)
-         * if residual is not None:
-         *     residual = residual.float() + hidden_states.float()
-         * else:
-         *     residual = hidden_states.float()
-         * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype)
-         */
         ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
         if (residual != nullptr) {
             residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il);
@@ -405,45 +185,16 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         }
         cb(residual, "residual", il);
 
-        /*
-         * zaya.py ref: L84-95 (_apply_norm_with_fp32_residual)
-         * zaya.py ref: L240-241, L536-537
-         *
-         * if isinstance(norm, RMSNorm):
-         *     if residual.dtype != norm.weight.dtype:
-         *         hidden_states = norm.forward_native(residual)
-         *     else:
-         *         hidden_states = norm(residual)
-         *     return hidden_states.to(target_dtype)
-         */
+        // Pre-norm
         cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il);
         cb(cur, "input_norm", il);
 
         if (il % 2 == 0) {
             // ===== CCA Attention =====
-            /*
-             * zaya.py ref: L98-184 (ZayaAttention)
-             * zaya.py ref: L171-184 (ZayaAttention.forward)
-             *
-             * def forward(self, hidden_states, position_ids):
-             *     output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...)
-             *     self.qkv(hidden_states, output_qkv)
-             *     q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1)
-             *     q, k = self.rotary_emb(position_ids, q, k)
-             *     attn_output = self.attn(q, k, v)
-             *     attn_output = self.o_proj(attn_output)
-             *     return attn_output
-             */
-
             const int64_t conv_state_size = 2*n_qk;
             const int64_t cca_state_size  = conv_state_size + n_embd;
             GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size);
 
-            /*
-             * zaya.py ref: CCA.py - recurrent state management
-             *
-             * CCA maintains conv_state and prev_hs in recurrent memory
-             */
             ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il);
             ggml_tensor * cca_state     = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs);
             cb(cca_state, "cca_state", il);
@@ -459,26 +210,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     conv_state_size*ggml_element_size(cca_state));
             cb(prev_hs, "cca_prev_hs", il);
 
-            /*
-             * zaya.py ref: L177-179
-             *
-             * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...)
-             * self.qkv(hidden_states, output_qkv)
-             * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1)
-             */
+            // Q, K projections
             ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur);
             cb(Qraw, "Qraw", il);
             ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur);
             cb(Kraw, "Kraw", il);
 
-            /*
-             * zaya.py ref: CCA.py - delayed hidden state stream for val_proj2
-             *
-             * During decode: comes from recurrent state
-             * During prefill: one-token shift of current sequence
-             *
-             * hs_d = concat(prev_hs_last, cur[:-1])  along seq dimension
-             */
+            // HF uses a delayed hidden-state stream for val_proj2. During decode this
+            // comes from the recurrent state; during prefill it is a one-token shift.
             ggml_tensor * cur_state_src = ggml_cont(ctx0, cur);
             ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs);
 
@@ -493,13 +232,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             hs_d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, hs_d), n_embd, n_tokens);
             cb(hs_d, "cca_hs_d", il);
 
-            /*
-             * zaya.py ref: CCA.py - V projection
-             *
-             * V1 = val_proj1(cur)
-             * V2 = val_proj2(hs_d)
-             * Vcur = concat(V1, V2, dim=0)
-             */
+            // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens]
             ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur);
             cb(V1, "V1", il);
             ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d);
@@ -507,25 +240,10 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0);
             cb(Vcur, "Vcur", il);
 
-            /*
-             * zaya.py ref: CCA.py - QK concatenation for conv
-             *
-             * QKraw = concat(Qraw, Kraw, dim=0)
-             */
+            // Concat Q+K for conv: [n_qk, n_tokens]
             ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0);
             cb(QKraw, "QKraw", il);
 
-            /*
-             * zaya.py ref: CCA.py - qk_mean computation
-             *
-             * Qpre: [n_embd_head, n_head, n_tokens]
-             * Kpre: [n_embd_head, n_head_kv, n_tokens]
-             * Kpre_grouped = repeat(Kpre, n_gqa times along head dim)
-             * qk_mean_q = (Qpre + Kpre_rep) * 0.5
-             *
-             * Qgroup = group Q by GQA, mean across group
-             * qk_mean_k = (Qmean + Kpre) * 0.5
-             */
             ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens);
             ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens);
 
@@ -543,12 +261,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f);
             cb(qk_mean_k, "qk_mean_k", il);
 
-            /*
-             * zaya.py ref: CCA.py - conv state update
-             *
-             * conv_input = concat(conv_state, QKraw_reshaped, dim=0)
-             * last_conv_states = conv_input[-2:]  (last 2 positions for state update)
-             */
             ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw));
             QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs);
 
@@ -561,11 +273,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     n_seq_tokens*conv_input->nb[0]);
             cb(last_conv_states, "cca_last_conv_states", il);
 
-            /*
-             * zaya.py ref: CCA.py - recurrent state write-back
-             *
-             * Update conv_state and prev_hs in recurrent memory for next step
-             */
             const auto kv_head = inp_recr->mctx->get_head();
             ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs,
                     cca_state_all->nb[1],
@@ -580,27 +287,19 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
                     (kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all));
             ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target));
 
-            /*
-             * zaya.py ref: CCA.py - depthwise conv
-             *
-             * QK = ssm_conv(conv_input, conv_dw) + conv_dw_b
-             */
             ggml_tensor * conv_dw = layer.cca_conv_dw;
             if (conv_dw->type != GGML_TYPE_F32) {
                 conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F32));
             }
+            // conv_input is [L, n_qk, n_seqs], ssm_conv outputs [n_qk, n_seq_tokens, n_seqs]
             ggml_tensor * QK = ggml_ssm_conv(ctx0, conv_input, conv_dw);
+            // permute from [n_qk, n_seq_tokens, n_seqs] to [n_seq_tokens, n_qk, n_seqs]
             QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
             if (layer.cca_conv_dw_b) {
                 QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1));
             }
             cb(QK, "QK_dw", il);
 
-            /*
-             * zaya.py ref: CCA.py - grouped conv
-             *
-             * QK = conv_1d_grouped(QK, conv_grp, n_groups) + conv_grp_b
-             */
             ggml_tensor * conv_grp = layer.cca_conv_grp;
             if (conv_grp->type != GGML_TYPE_F16) {
                 conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16));
@@ -610,6 +309,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(QK, "QK_grp", il);
 
             QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3));
+            // QK is now [n_qk, n_seq_tokens, n_seqs]
+            // Flatten to 2D: [n_qk, n_tokens] where n_tokens = n_seq_tokens * n_seqs
             QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens);
 
             ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0);
@@ -618,38 +319,15 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens);
             ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens);
 
-            /*
-             * zaya.py ref: CCA.py - add qk_mean back to Q, K
-             *
-             * Qcur = Qcur + qk_mean_q
-             * Kcur = Kcur + qk_mean_k
-             */
             Qcur = ggml_add(ctx0, Qcur, qk_mean_q);
             Kcur = ggml_add(ctx0, Kcur, qk_mean_k);
 
-            /*
-             * zaya.py ref: CCA.py - L2 normalization and scaling
-             *
-             * Qcur = l2_norm(Qcur) * sqrt(n_embd_head)
-             * Kcur = l2_norm(Kcur) * sqrt(n_embd_head) * cca_k_scale
-             */
             Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head));
             Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head));
             Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1));
             cb(Qcur, "Qcur_pre_rope", il);
             cb(Kcur, "Kcur_pre_rope", il);
 
-            /*
-             * zaya.py ref: L155-164 (rotary embedding)
-             *
-             * self.rotary_emb = get_rope(
-             *     head_size=self.head_dim,
-             *     max_position=config.max_position_embeddings,
-             *     is_neox_style=True,
-             *     rope_parameters={"rope_theta": config.rope_theta, "rope_type": "default", "partial_rotary_factor": 0.5},
-             * )
-             * q, k = self.rotary_emb(position_ids, q, k)
-             */
             ggml_tensor * rope_factors = model.get_rope_factors(cparams, il);
             Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors,
                     n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
@@ -662,13 +340,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
             Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens);
 
-            /*
-             * zaya.py ref: L146-153, L181-182 (Attention + output projection)
-             *
-             * self.attn = Attention(self.cca_num_q_heads, self.head_dim, self.scale, self.cca_num_k_heads, ...)
-             * attn_output = self.attn(q, k, v)
-             * attn_output = self.o_proj(attn_output)
-             */
+            // GQA attention
             cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr,
                 Qcur, Kcur, Vcur, nullptr, nullptr, nullptr,
                 1.0f / sqrtf((float) n_embd_head), il);
@@ -676,78 +348,24 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
 
         } else {
             // ===== MoE Layer =====
-            /*
-             * zaya.py ref: L481-541 (ZayaDecoderMLPLayer)
-             * zaya.py ref: L382-479 (ZayaBlock)
-             * zaya.py ref: L251-380 (ZayaRouter)
-             *
-             * def forward(self, hidden_states, residual, position_ids, layer_n, prev_router_hidden_states):
-             *     if self.config.scale_residual_merge:
-             *         residual, hidden_states = self.res_scale(residual, hidden_states)
-             *     residual = residual.float() + hidden_states.float()
-             *     hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype)
-             *     hidden_states, prev_router_hidden_states = self.zaya_block(hidden_states, prev_router_hidden_states)
-             *     return hidden_states, residual, prev_router_hidden_states
-             */
-
-            /*
-             * zaya.py ref: L321-380 (ZayaRouter.forward)
-             *
-             * hs = self.down_proj(hidden_states)
-             * if self.use_eda and (prev_router_hidden_states is not None):
-             *     hs = hs + prev_router_hidden_states * self.router_states_scale
-             * router_hidden_states_next = hs[-S:].clone()
-             * hs_norm = self.rmsnorm_eda(hs)
-             * logits = self.router_mlp(hs_norm)  // Linear->GELU->Linear->GELU->Linear
-             * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32)
-             * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases
-             * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1)
-             * route_prob = torch.gather(expert_prob, dim=1, index=expert_choice_t)
-             * return route_prob_flat, expert_choice_flat, router_hidden_states_next
-             */
-
-            /*
-             * zaya.py ref: L343
-             *
-             * hs = self.down_proj(hidden_states)
-             */
+
+            // Build Zaya router network:
+            // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits.
+
             ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b);
             cb(router_h, "router_down", il);
 
-            /*
-             * zaya.py ref: L344-345
-             *
-             * if self.use_eda and (prev_router_hidden_states is not None):
-             *     hs = hs + prev_router_hidden_states * self.router_states_scale
-             */
             if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
                 router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale));
                 cb(router_h, "router_eda", il);
             }
 
-            prev_router = router_h;  // zaya.py ref: L348 (router_hidden_states_next)
+            prev_router = router_h;
 
-            /*
-             * zaya.py ref: L351
-             *
-             * hs_norm = self.rmsnorm_eda(hs)
-             */
             router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il);
             cb(router_h, "router_norm", il);
 
-            /*
-             * zaya.py ref: L305-314, L354
-             *
-             * logits = self.router_mlp(hs_norm)
-             * self.router_mlp = nn.Sequential(
-             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp0
-             *     nn.GELU(),
-             *     ReplicatedLinear(D, D, bias=True, ...),   // mlp2
-             *     nn.GELU(),
-             *     ReplicatedLinear(D, E, bias=False, ...),  // mlp4
-             * )
-             */
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h);
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b);
             router_h = ggml_gelu(ctx0, router_h);
@@ -761,51 +379,20 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h);
             cb(router_h, "router_logits", il);
 
-            /*
-             * zaya.py ref: L355-359
-             *
-             * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32)
-             * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases
-             * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1)
-             */
             ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h);
             cb(router_probs, "router_probs", il);
 
-            /*
-             * zaya.py ref: L387-389 (MOD skip expert handling)
-             *
-             * gate_probs = router_probs[:, :n_expert]  // exclude skip expert from routing
-             */
+            // Keep the MOD skip expert in the softmax denominator, then route
+            // over real experts only. The checkpoint's skip bias keeps MOD unused.
             ggml_tensor * gate_probs = ggml_cont(ctx0,
                     ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0));
             cb(gate_probs, "gate_probs", il);
 
-            /*
-             * zaya.py ref: L317-319, L362-363
-             *
-             * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32))
-             * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases
-             */
             ggml_tensor * expert_biases = nullptr;
             if (layer.zaya_router_biases != nullptr) {
                 expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0);
             }
 
-            /*
-             * zaya.py ref: L448-479 (ZayaBlock.forward - MoE execution)
-             *
-             * probs, indices, router_hidden_states_out = self.router(hidden_states, prev_router_hidden_states)
-             * if self.config.zaya_use_mod:
-             *     clamped_indices = torch.clamp(indices, min=0, max=self.num_moe_experts - 1)
-             *     packed_logits = torch.cat([probs, clamped_indices.to(probs.dtype)], dim=-1)
-             *     hidden_states_experts = self.experts(hidden_states, packed_logits)
-             *     hidden_states_mod = hidden_states * probs
-             *     mod_mask = (indices != self.num_moe_experts)
-             *     hidden_states = (mod_mask * hidden_states_experts) + ((~mod_mask) * hidden_states_mod)
-             * else:
-             *     packed_logits = torch.cat([probs, indices.to(probs.dtype)], dim=-1)
-             *     hidden_states = self.experts(hidden_states, packed_logits)
-             */
             cur = build_moe_ffn(cur,
                 /* gate_inp */        nullptr,
                 /* up_exps */         nullptr,
@@ -827,17 +414,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         inpL = cur;
     }
 
-    /*
-     * zaya.py ref: L653-664 (ZayaModel.forward - final residual + norm)
-     *
-     * if self.config.scale_residual_merge:
-     *     residual, hidden_states = self.res_scale(residual, hidden_states)
-     * if residual is not None:
-     *     hidden_states = hidden_states.float() + residual.float()
-     * else:
-     *     hidden_states = hidden_states.float()
-     * hidden_states = _apply_norm_with_fp32_residual(self.final_norm, hidden_states, final_input_dtype)
-     */
     ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1);
     if (residual != nullptr) {
         residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1);
@@ -851,22 +427,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         cur = ggml_get_rows(ctx0, cur, inp_out_ids);
     }
 
-    /*
-     * zaya.py ref: L608-613 (final norm)
-     *
-     * if (config.normalization == "RMSNorm"):
-     *     self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon)
-     */
+    // final norm
     cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);
     cb(cur, "result_norm", -1);
     res->t_embd = cur;
 
-    /*
-     * zaya.py ref: L729-746, L769-782 (lm_head + logits_processor)
-     *
-     * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...)
-     * logits = self.logits_processor(self.lm_head, hidden_states)
-     */
+    // output
     cur = ggml_mul_mat(ctx0, model.output, cur);
     cb(cur, "result_output", -1);
     res->t_logits = cur;

From 82f8f1575e76209c862f1d02a93565269709d46b Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Fri, 22 May 2026 23:22:05 +0200
Subject: [PATCH 31/33] ggml/zaya: fix precision loss in conv_1d and support
 BF16

- ggml: Update `ggml_conv_1d` (and variants) to use a conditional type for `im2col` activation (`a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32`) instead of hardcoding `GGML_TYPE_F16`. This aligns with `ggml_conv_2d`, preserving F32/BF16 precision while still safely protecting against quantized weight crashes (e.g., Q4_0).
- zaya: Replace the forced F16 downcast for grouped convolutions with a dynamic promotion to F32 for unsupported types (like BF16 or quantized types). This ensures `im2col` properly allocates an F32 matrix and computes an F32xF32 mul_mat, avoiding CUDA/CPU backend crashes while fully restoring model accuracy and NMSE metrics.
---
 ggml/src/ggml.c     | 6 +++---
 src/models/zaya.cpp | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index ae1fb2fa031..276c11cb68c 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4487,7 +4487,7 @@ struct ggml_tensor * ggml_conv_1d(
         int                   s0,
         int                   p0,
         int                   d0) {
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32); // [N, OL, IC * K]
 
     struct ggml_tensor * result =
         ggml_mul_mat(ctx,
@@ -4521,7 +4521,7 @@ struct ggml_tensor * ggml_conv_1d_dw(
         int                   d0) {
     struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);
 
-    struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
+    struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32);
 
     struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);
 
@@ -4781,7 +4781,7 @@ struct ggml_tensor * ggml_conv_2d_dw(
     struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
     struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
                                         ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
-                                        s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
+                                        s0, s1, p0, p1, d0, d1, true, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32); // [N * IC, OH, OW, KH * KW]
     struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]
 
     new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2],  new_a->ne[3], 1);                       // [OC，1, KH, KW] => [1, OC, 1, KH * KW]
diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index ce65c2281fa..1db69c313b6 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -301,8 +301,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             cb(QK, "QK_dw", il);
 
             ggml_tensor * conv_grp = layer.cca_conv_grp;
-            if (conv_grp->type != GGML_TYPE_F16) {
-                conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16));
+            if (conv_grp->type != GGML_TYPE_F16 && conv_grp->type != GGML_TYPE_F32) {
+                conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F32));
             }
             QK = ggml_conv_1d_grouped(ctx0, conv_grp, QK, 1, 0, 1, n_groups);
             QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1));

From 894ffd4274cc54e3c0d08c33a71a0df3eecf2f5f Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Mon, 25 May 2026 11:30:31 +0200
Subject: [PATCH 32/33] zaya: add il != 1 check for EDA to match python
 reference

This is a safety guard matching self.layer_number != zaya_first_layer
in the original implementation. No behavioral change for correctly
converted models since the tensor is already nullptr for layer 1.
---
 src/models/zaya.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index 1db69c313b6..d952cc74007 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -356,7 +356,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
             router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b);
             cb(router_h, "router_down", il);
 
-            if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
+            if (il != 1 && prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) {
                 router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale));
                 cb(router_h, "router_eda", il);
             }

From 1a7582b911a53b691e9b7d926a92bd504bcdf26e Mon Sep 17 00:00:00 2001
From: Juste-Leo2 <leonard.adamo66@gmail.com>
Date: Mon, 25 May 2026 11:47:51 +0200
Subject: [PATCH 33/33] zaya: compute residual in fp32 to match config

The model config has residual_in_fp32=true. Cast both residual
branches to float32 to align with the python reference.
---
 src/models/zaya.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp
index d952cc74007..2ee2b676c7e 100644
--- a/src/models/zaya.cpp
+++ b/src/models/zaya.cpp
@@ -177,11 +177,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
         const int64_t n_gqa     = n_head / n_head_kv;
 
         ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il);
+        // residual_in_fp32 = true in config
         if (residual != nullptr) {
             residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il);
-            residual = ggml_add(ctx0, hidden_states, residual);
+            residual = ggml_add(ctx0, ggml_cast(ctx0, hidden_states, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32));
         } else {
-            residual = hidden_states;
+            residual = ggml_cast(ctx0, hidden_states, GGML_TYPE_F32);
         }
         cb(residual, "residual", il);
 
@@ -415,11 +416,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params
     }
 
     ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1);
+    // residual_in_fp32 = true in config
     if (residual != nullptr) {
         residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1);
-        cur = ggml_add(ctx0, final_hidden, residual);
+        cur = ggml_add(ctx0, ggml_cast(ctx0, final_hidden, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32));
     } else {
-        cur = final_hidden;
+        cur = ggml_cast(ctx0, final_hidden, GGML_TYPE_F32);
     }
     cb(cur, "final_residual", -1);