From 99e5d03e2188623f76509f853e79bc60574b5da6 Mon Sep 17 00:00:00 2001
From: Juste-Leo
Date: Fri, 8 May 2026 11:08:36 +0200
Subject: [PATCH 01/33] ops: add Conv1dGrouped operation

---
 ggml/include/ggml.h            |  15 ++++
 ggml/src/ggml.c                |  68 ++++++++++++++
 tests/test-conv-1d-grouped.cpp | 154 +++++++++++++++++++++++++++++++++
 3 files changed, 237 insertions(+)
 create mode 100644 tests/test-conv-1d-grouped.cpp

diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h
index 3357a0d9985..fec0287ae00 100644
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -2041,6 +2041,21 @@ extern "C" {
             int                   s0,  // stride
             int                   d0); // dilation
 
+    // grouped 1D convolution
+    // a: [K, IC/G, OC] convolution kernel
+    // b: [L, IC, N] data
+    // groups must divide both IC and OC evenly
+    // when groups == 1, equivalent to ggml_conv_1d
+    // when groups == IC, equivalent to ggml_conv_1d_dw
+    GGML_API struct ggml_tensor * ggml_conv_1d_grouped(
+            struct ggml_context * ctx,
+            struct ggml_tensor  * a,      // convolution kernel
+            struct ggml_tensor  * b,      // data
+            int                   s0,     // stride
+            int                   p0,     // padding
+            int                   d0,     // dilation
+            int                   groups); // number of groups
+
     GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,   // convolution kernel
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c
index 191cf2fa106..049f4952047 100644
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -4541,6 +4541,74 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
     return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
 }
 
+// ggml_conv_1d_grouped
+
+// Grouped 1D convolution composed from existing ops (no dedicated kernel).
+//   a: convolution kernel [K, IC/groups, OC]
+//   b: data               [L, IC, N]
+// Group g convolves input channels [g*IC_G, (g+1)*IC_G) with the kernels of
+// output channels [g*OC_G, (g+1)*OC_G); the per-group results are concatenated
+// along dim 1 in group order.
+struct ggml_tensor * ggml_conv_1d_grouped(
+        struct ggml_context * ctx,
+        struct ggml_tensor  * a,
+        struct ggml_tensor  * b,
+        int                   s0,
+        int                   p0,
+        int                   d0,
+        int                   groups) {
+    GGML_ASSERT(groups > 0);
+
+    const int64_t OC   = a->ne[2]; // total output channels
+    const int64_t IC_G = a->ne[1]; // input channels per group (kernel dim)
+    const int64_t IC   = b->ne[1]; // total input channels
+
+    GGML_ASSERT(IC % groups == 0);
+    GGML_ASSERT(OC % groups == 0);
+    GGML_ASSERT(IC_G == IC / groups);
+
+    // groups == 1: plain convolution, reuse the existing implementation
+    if (groups == 1) {
+        return ggml_conv_1d(ctx, a, b, s0, p0, d0);
+    }
+
+    // depthwise shortcut, single batch only
+    // NOTE(review): ggml_conv_1d_dw appears to reshape its result to [OL, C, 1],
+    // so batched input (N > 1) takes the generic per-group path below - confirm
+    if (groups == IC && groups == OC && b->ne[2] == 1) {
+        return ggml_conv_1d_dw(ctx, a, b, s0, p0, d0);
+    }
+
+    const int64_t OC_G = OC / groups;
+
+    struct ggml_tensor * result = NULL;
+
+    for (int g = 0; g < groups; g++) {
+        // slice kernel for group g: [K, IC_G, OC_G]
+        struct ggml_tensor * a_g = ggml_view_3d(ctx, a,
+                a->ne[0], IC_G, OC_G,
+                a->nb[1], a->nb[2],
+                g * OC_G * a->nb[2]);
+
+        // slice input for group g: [L, IC_G, N]
+        struct ggml_tensor * b_g = ggml_view_3d(ctx, b,
+                b->ne[0], IC_G, b->ne[2],
+                b->nb[1], b->nb[2],
+                g * IC_G * b->nb[1]);
+
+        struct ggml_tensor * out_g = ggml_conv_1d(ctx, a_g, b_g, s0, p0, d0);
+
+        // append this group's output channels after the previous groups
+        if (result == NULL) {
+            result = out_g;
+        } else {
+            result = ggml_concat(ctx, result, out_g, 1);
+        }
+    }
+
+    return result;
+}
+
 // ggml_conv_transpose_1d
 
 static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
diff --git a/tests/test-conv-1d-grouped.cpp b/tests/test-conv-1d-grouped.cpp
new file mode 100644
index 00000000000..80b884804ec
--- /dev/null
+++ b/tests/test-conv-1d-grouped.cpp
@@ -0,0 +1,154 @@
+// Test for ggml_conv_1d_grouped
+//
+// Verifies grouped 1D convolution by comparing against manual per-group computation.
+
+#include "ggml.h"
+#include "ggml-backend.h"
+#include "ggml-cpu.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+// Fill data[0..n) with rand()-based values in [-1, 1], converted to F16
+static void fill_random_f16(ggml_fp16_t * data, int n) {
+    for (int i = 0; i < n; i++) {
+        float v = ((float)rand() / RAND_MAX) * 2.0f - 1.0f;
+        data[i] = ggml_fp32_to_fp16(v);
+    }
+}
+
+// Fill data[0..n) with rand()-based values in [-1, 1]
+static void fill_random_f32(float * data, int n) {
+    for (int i = 0; i < n; i++) {
+        data[i] = ((float)rand() / RAND_MAX) * 2.0f - 1.0f;
+    }
+}
+
+// Return true when every |a[i] - b[i]| <= eps; the first mismatch is reported on stderr
+static bool all_close(const float * a, const float * b, int n, float eps = 5e-3f) {
+    for (int i = 0; i < n; i++) {
+        if (fabsf(a[i] - b[i]) > eps) {
+            fprintf(stderr, " mismatch at [%d]: %.6f vs %.6f (diff=%.6f)\n", i, a[i], b[i], fabsf(a[i] - b[i]));
+            return false;
+        }
+    }
+    return true;
+}
+
+// Compute grouped conv1d on CPU naively for reference
+// kernel (F16): [K, IC_G, OC], input (F32): [L, IC, N], output: [OL, OC, N]
+static void conv1d_grouped_ref(
+        const ggml_fp16_t * kernel, const float * input, float * output,
+        int K, int IC, int OC, int L, int N, int groups, int stride, int padding) {
+    int IC_G = IC / groups;
+    int OC_G = OC / groups;
+    int OL   = (L + 2 * padding - K) / stride + 1;
+
+    memset(output, 0, (size_t)OL * OC * N * sizeof(float));
+
+    for (int n = 0; n < N; n++) {
+        for (int g = 0; g < groups; g++) {
+            for (int oc = 0; oc < OC_G; oc++) {
+                int oc_global = g * OC_G + oc;
+                for (int ol = 0; ol < OL; ol++) {
+                    float sum = 0.0f;
+                    for (int ic = 0; ic < IC_G; ic++) {
+                        for (int k = 0; k < K; k++) {
+                            int il = ol * stride + k - padding;
+                            if (il >= 0 && il < L) {
+                                int ic_global = g * IC_G + ic;
+                                // kernel: [K, IC_G, OC] -> k + ic * K + oc_global * (IC_G * K)
+                                float w = ggml_fp16_to_fp32(kernel[k + ic * K + oc_global * (IC_G * K)]);
+                                // input: [L, IC, N] -> il + ic_global * L + n * (IC * L)
+                                float x = input[il + ic_global * L + n * (IC * L)];
+                                sum += w * x;
+                            }
+                        }
+                    }
+                    // output: [OL, OC, N] -> ol + oc_global * OL + n * (OC * OL)
+                    output[ol + oc_global * OL + n * (OC * OL)] = sum;
+                }
+            }
+        }
+    }
+}
+
+// Build and run one ggml_conv_1d_grouped graph (dilation 1, single batch) and compare it with the reference
+static bool run_test(const char * label, int IC, int OC, int K, int L, int groups, int stride, int padding) {
+    printf(" TEST: %s (IC=%d OC=%d K=%d L=%d G=%d s=%d p=%d)\n", label, IC, OC, K, L, groups, stride, padding);
+
+    int IC_G = IC / groups;
+    int OL   = (L + 2 * padding - K) / stride + 1;
+
+    size_t ctx_size = 256 * 1024 * 1024;
+    struct ggml_init_params params = {
+        /*.mem_size   =*/ ctx_size,
+        /*.mem_buffer =*/ NULL,
+        /*.no_alloc   =*/ false,
+    };
+    struct ggml_context * ctx = ggml_init(params);
+
+    // kernel: [K, IC_G, OC] in F16 (like real models)
+    struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, K, IC_G, OC);
+    // input: [L, IC] in F32
+    struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, L, IC);
+
+    fill_random_f16((ggml_fp16_t *)a->data, K * IC_G * OC);
+    fill_random_f32((float *)b->data, L * IC);
+
+    // reference
+    std::vector<float> ref(OL * OC);
+    conv1d_grouped_ref((ggml_fp16_t *)a->data, (float *)b->data, ref.data(), K, IC, OC, L, 1, groups, stride, padding);
+
+    // ggml
+    struct ggml_tensor * result = ggml_conv_1d_grouped(ctx, a, b, stride, padding, 1, groups);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+    ggml_build_forward_expand(gf, result);
+
+    ggml_backend_t backend = ggml_backend_cpu_init();
+    const enum ggml_status status = ggml_backend_graph_compute(backend, gf);
+
+    bool ok = status == GGML_STATUS_SUCCESS; // a failed compute must not be compared against the reference
+
+    if (result->ne[0] != OL || result->ne[1] != OC) {
+        fprintf(stderr, " FAIL: shape [%lld, %lld], expected [%d, %d]\n",
+                (long long)result->ne[0], (long long)result->ne[1], OL, OC);
+        ok = false;
+    }
+
+    ok = ok && all_close((float *)result->data, ref.data(), OL * OC); // values are only compared when shapes match
+
+    printf(" %s\n", ok ? "PASS" : "FAIL");
+
+    ggml_backend_free(backend);
+    ggml_free(ctx);
+    return ok;
+}
+
+// Run all configurations with a fixed seed; the exit code is non-zero when any of them fails
+int main(void) {
+    srand(42);
+
+    printf("Testing ggml_conv_1d_grouped\n\n");
+
+    int n_pass = 0, n_fail = 0;
+
+    auto check = [&](const char * label, int IC, int OC, int K, int L, int G, int s, int p) {
+        if (run_test(label, IC, OC, K, L, G, s, p)) { n_pass++; } else { n_fail++; }
+    };
+
+    check("groups=1 (standard conv1d)", 128,  256,  3, 32,  1,  1, 0);
+    check("ZAYA1-8B exact params",      1280, 1280, 2, 16,  10, 1, 0);
+    check("small 2 groups",             4,    4,    2, 8,   2,  1, 0);
+    check("with padding",               8,    8,    2, 16,  4,  1, 1);
+    check("IC != OC",                   12,   6,    3, 10,  3,  1, 0);
+    check("stride=2",                   8,    8,    2, 16,  4,  2, 0);
+    check("longer sequence",            1280, 1280, 2, 128, 10, 1, 0);
+
+    printf("\nResult: %d passed, %d failed\n", n_pass, n_fail);
+    return n_fail > 0 ? 1 : 0;
+}
From e0ac753e404962ace6c6e0535d38657cae7b0283 Mon Sep 17 00:00:00 2001
From: Juste-Leo
Date: Fri, 8 May 2026 15:07:17 +0200
Subject: [PATCH 02/33] initial implementation

---
 convert_hf_to_gguf.py          |  39 ++++++
 gguf-py/gguf/constants.py      |  28 +++++
 gguf-py/gguf/tensor_mapping.py |  23 ++++
 src/llama-arch.cpp             |  12 ++
 src/llama-arch.h               |   6 +
 src/llama-model.cpp            |   3 +
 src/llama-model.h              |   7 ++
 src/models/models.h            |  13 ++
 src/models/zaya.cpp            | 223 +++++++++++++++++++++++++++++++++
 9 files changed, 354 insertions(+)
 create mode 100644 src/models/zaya.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index fb1f5dd4473..33c74013fb3 100755
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -6454,6 +6454,45 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
         yield from super().modify_tensors(data_torch, name, bid)
 
 
+@ModelBase.register("ZayaModel", "ZayaForCausalLM")
+class ZayaModel(TextModel):
+    """Zaya-1 model with Compressed Convolutional Attention"""
+    model_arch = gguf.MODEL_ARCH.ZAYA
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])
+
+        # ZAYA-specific params if any from config.json (e.g. ssm_d_conv)
+        if "ssm_d_conv" in self.hparams:
+            self.gguf_writer.add_ssm_conv_kernel(self.hparams["ssm_d_conv"])
+        else:
+            # Fallback if config is different
+            self.gguf_writer.add_ssm_conv_kernel(2) # Default for ZAYA1-8B
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        # Tensors will be automatically mapped based on tensor_mapping.py if they match
+
+        # We skip MoE FFN weights, unused biases, etc. temporarily since we are using dense FFN
+        skip_keywords = [
+            "zaya_block.experts",
+            "res_scale.",
+            "val_proj2"
+        ]
+
+        if any(kw in name for kw in skip_keywords):
+            logger.info(f"Skipping tensor (dense FFN test): {name}")
+            return
+
+        try:
+            yield from super().modify_tensors(data_torch, name, bid)
+        except ValueError as e:
+            if "Can not map tensor" in str(e):
+                logger.warning(f"Skipping unmapped tensor: {name}")
+            else:
+                raise
+
+
 @ModelBase.register("InternLM2ForCausalLM")
 class InternLM2Model(TextModel):
     model_arch = gguf.MODEL_ARCH.INTERNLM2
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py
index 308ebe1f4a1..13bd3d1c8f0 100644
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -503,6 +503,7 @@ class MODEL_ARCH(IntEnum):
     LLAMA_EMBED = auto()
     MAINCODER = auto()
     KIMI_LINEAR = auto()
+    ZAYA = auto()
 
 
 class VISION_PROJECTOR_TYPE(IntEnum):
@@ -610,6 +611,10 @@ class MODEL_TENSOR(IntEnum):
     SSM_BETA = auto() # Kimi Linear qwen3.5
     SSM_G_A = auto() # Kimi Linear
     SSM_G_B = auto() # Kimi Linear
+    CCA_CONV_DW = auto() # Zaya
+    CCA_CONV_GRP = auto() # Zaya
+    CCA_QK_NORM = auto() # Zaya
+    CCA_K_SCALE = auto() # Zaya
     TIME_MIX_W0 = auto()
     TIME_MIX_W1 = auto()
     TIME_MIX_W2 = auto()
@@ -1018,6 +1023,7 @@ class MODEL_TENSOR(IntEnum):
     MODEL_ARCH.LLAMA_EMBED: "llama-embed",
     MODEL_ARCH.MAINCODER: "maincoder",
     MODEL_ARCH.KIMI_LINEAR: "kimi-linear",
+    MODEL_ARCH.ZAYA: "zaya",
 }
 
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1123,6 +1129,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear qwen3.5 MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear + MODEL_TENSOR.CCA_CONV_DW: "blk.{bid}.cca_conv_dw", # Zaya + MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya + MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya + MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3992,6 +4002,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.ZAYA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.CCA_CONV_DW, + MODEL_TENSOR.CCA_CONV_GRP, + MODEL_TENSOR.CCA_QK_NORM, + MODEL_TENSOR.CCA_K_SCALE, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f27f0e4c997..db99afd4cbb 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -259,6 +259,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.q_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.linear_q", # Zaya ), # Attention key @@ -279,6 +280,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.k_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.linear_k", # Zaya ), # Attention value @@ -298,6 +300,7 @@ class 
TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.val_proj1", # Zaya ), # Attention output @@ -336,6 +339,7 @@ class TensorNameMap: "layers.{bid}.self_attn.o_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.o_proj", # nemotron-h "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm + "model.layers.{bid}.self_attn.o_proj", # Zaya ), # Attention output norm @@ -854,6 +858,12 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.norm", # mamba2 "model.layers.{bid}.self_attn.o_norm", # kimi ), + MODEL_TENSOR.ATTN_NORM: ( + "model.layers.{bid}.input_layernorm", + "model.layers.{bid}.ln_1", + "model.layers.{bid}.norm1", + "model.layers.{bid}.input_norm", # Zaya + ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", # mamba-hf @@ -891,6 +901,19 @@ class TensorNameMap: "model.layers.{bid}.linear_attn.in_proj_b", # qwen3.5 "model.layers.{bid}.self_attn.b_proj", # Kimi Linear ), + # ZAYA CCA + MODEL_TENSOR.CCA_CONV_DW: ( + "model.layers.{bid}.self_attn.qkv.conv_qk.0", # Zaya + ), + MODEL_TENSOR.CCA_CONV_GRP: ( + "model.layers.{bid}.self_attn.qkv.conv_qk.1", # Zaya + ), + MODEL_TENSOR.CCA_QK_NORM: ( + "model.layers.{bid}.self_attn.qk_norm", # Zaya + ), + MODEL_TENSOR.CCA_K_SCALE: ( + "model.layers.{bid}.self_attn.qkv.temp", # Zaya + ), MODEL_TENSOR.SSM_G_A: ( "model.layers.{bid}.self_attn.g_a_proj", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 59dde99e362..df91d973a3e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -133,6 +133,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, + { LLM_ARCH_ZAYA, "zaya" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -417,6 +418,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, { 
LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, + { LLM_TENSOR_CCA_CONV_DW, "blk.%d.cca_conv_dw" }, + { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, + { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, + { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, @@ -659,6 +664,11 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + // ZAYA CCA + {LLM_TENSOR_CCA_CONV_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, + {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -857,6 +867,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: case LLM_ARCH_QWEN35: case LLM_ARCH_QWEN35MOE: return true; @@ -902,6 +913,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) { case LLM_ARCH_MINIMAX_M2: case LLM_ARCH_MISTRAL4: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: return false; default: return true; diff --git a/src/llama-arch.h b/src/llama-arch.h index e37d548c98e..b11fa50c05f 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -137,6 +137,7 @@ enum llm_arch { LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, + LLM_ARCH_ZAYA, LLM_ARCH_UNKNOWN, }; @@ -444,6 +445,11 @@ enum llm_tensor { 
LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient and qwen3.5 LLM_TENSOR_SSM_G_A, // kimi: output gate projection A LLM_TENSOR_SSM_G_B, // kimi: output gate projection B + // ZAYA CCA (Compressed Convolutional Attention) + LLM_TENSOR_CCA_CONV_DW, // zaya: depthwise conv1d (conv_qk.0) + LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) + LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) + LLM_TENSOR_CCA_K_SCALE, // zaya: learned K temperature LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9d011ff3464..656767318f2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -282,6 +282,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: return new llama_model_kimi_linear(params); + case LLM_ARCH_ZAYA: + return new llama_model_zaya(params); case LLM_ARCH_STEP35: return new llama_model_step35(params); default: @@ -2206,6 +2208,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values diff --git a/src/llama-model.h b/src/llama-model.h index d63c689185a..8e919e15159 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -477,6 +477,13 @@ struct llama_layer { struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; + // ZAYA CCA (Compressed Convolutional Attention) + struct ggml_tensor * cca_conv_dw = nullptr; // depthwise conv (conv_qk.0) + struct ggml_tensor * cca_conv_grp = nullptr; // grouped conv (conv_qk.1) + struct ggml_tensor * cca_conv_grp_b = nullptr; // grouped conv bias + struct ggml_tensor * cca_qk_norm = nullptr; // RMSNorm on concat(Q,K) + struct ggml_tensor * cca_k_scale = nullptr; // learned K 
temperature + // DSA (deepseek sparse attention) struct ggml_tensor * indexer_k_norm = nullptr; struct ggml_tensor * indexer_k_norm_b = nullptr; diff --git a/src/models/models.h b/src/models/models.h index 6d5f18a8e20..507f903104b 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1858,3 +1858,16 @@ struct llama_model_step35 : public llama_model_base { std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; + + +struct llama_model_zaya : public llama_model_base { + llama_model_zaya(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp new file mode 100644 index 00000000000..0815fc1d449 --- /dev/null +++ b/src/models/zaya.cpp @@ -0,0 +1,223 @@ +#include "models.h" + +#include "ggml.h" + +void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + + switch (hparams.n_layer) { + case 80: type = LLM_TYPE_8B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_zaya::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + const int64_t d_conv = hparams.ssm_d_conv; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_kv_i = 
hparams.n_head_kv(i); + const int64_t n_embd_q = n_head_i * n_embd_head_k; + const int64_t n_embd_k = n_head_kv_i * n_embd_head_k; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head_i + n_head_kv_i; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // CCA projections (standard Q, K, V, O) + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_k}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); + + // CCA conv_qk.0 (depthwise, groups = n_qk, kernel = d_conv) + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + + // CCA conv_qk.1 (grouped, groups = n_groups, kernel = d_conv) + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + + // CCA normalization and scale + layer.cca_qk_norm = create_tensor(tn(LLM_TENSOR_CCA_QK_NORM, "weight", i), {n_qk}, 0); + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_embd_k}, 0); + + // FFN (dense SwiGLU for now; MoE can be added later) + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } +} + +std::unique_ptr llama_model_zaya::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_zaya::graph::graph(const llama_model & model, const 
llm_graph_params & params) + : llm_graph_context(params) { + + const int64_t n_embd_head = hparams.n_embd_head_k(); + const int64_t d_conv = hparams.ssm_d_conv; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + auto * inp = build_inp_mem_hybrid(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_q = n_head * n_embd_head; + const int64_t n_embd_k = n_head_kv * n_embd_head; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head + n_head_kv; + + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // --- CCA: Q, K, V projections --- + ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); + cb(Qraw, "Qraw", il); + ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); + cb(Kraw, "Kraw", il); + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.wv, cur); + cb(Vcur, "Vcur", il); + + // --- CCA: concat Q+K for conv --- + // QK: [n_qk, n_tokens] + ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QK, "QK_cat", il); + + // --- CCA: conv_qk.0 (depthwise, causal) --- + // Reshape for ssm_conv: [n_tokens, n_qk] -> [n_tokens, n_qk, 1] + // ssm_conv expects [seq_len, channels, batch] with state already concatenated + // For prompt processing, we left-pad with (d_conv-1) zeros for causality + { + // Left-pad QK with zeros for causal convolution + ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); // [n_tokens, n_qk] + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); // [d_conv-1 + n_tokens, n_qk] + + QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); + // ssm_conv output: 
[n_tokens, n_qk] + cb(QK, "QK_dw", il); + } + + // --- CCA: conv_qk.1 (grouped, causal) --- + { + // Left-pad for second causal conv + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); // [d_conv-1 + n_tokens, n_qk] + + // ggml_conv_1d_grouped expects kernel [K, IC/G, OC] and input [L, IC] + // QK_padded is [d_conv-1 + n_tokens, n_qk] which matches [L, IC] + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); + cb(QK, "QK_grp", il); + } + + // QK is now [n_tokens, n_qk] from conv output, transpose back to [n_qk, n_tokens] + QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + + // --- CCA: split Q_conv, K_conv --- + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, + QK->nb[1], 0); + ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, + QK->nb[1], n_embd_q * ggml_element_size(QK)); + + // --- CCA: QK mean (skip connection) --- + ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); + ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + // --- CCA: RMSNorm on concat(Q, K) --- + ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); // [n_qk, n_tokens] + QK_for_norm = build_norm(QK_for_norm, layer.cca_qk_norm, NULL, LLM_NORM_RMS, il); + cb(QK_for_norm, "QK_normed", il); + + // Split back + Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, + QK_for_norm->nb[1], 0); + Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, + QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); + + // --- CCA: K temperature scaling --- + Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); + cb(Kcur, "Kcur_scaled", il); + + // Reshape for attention: [head_dim, n_heads, n_tokens] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); 
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // --- GQA attention --- + cur = build_attn(inp->get_attn(), layer.wo, NULL, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf((float) n_embd_head), il); + cb(cur, "attn_out", il); + + // select output tokens on last layer + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // residual + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // --- FFN (dense SwiGLU) --- + cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + // residual + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + + // final norm + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // output + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} From 7cc554aab3435a800d673ea588d92034958bd3c7 Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Fri, 8 May 2026 18:39:38 +0200 Subject: [PATCH 03/33] implementation checkpoint --- convert_hf_to_gguf.py | 165 ++++++++++++++-- gguf-py/gguf/constants.py | 60 +++++- gguf-py/gguf/tensor_mapping.py | 15 +- src/llama-arch.cpp | 34 ++++ src/llama-arch.h | 19 ++ src/llama-model.h | 21 ++ src/models/zaya.cpp | 350 +++++++++++++++++++++------------ 7 files changed, 497 insertions(+), 167 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 33c74013fb3..97a5889cce9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ 
-6456,34 +6456,150 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("ZayaModel", "ZayaForCausalLM") class ZayaModel(TextModel): - """Zaya-1 model with Compressed Convolutional Attention""" + """Zaya-1 model with Compressed Convolutional Attention and MoE""" model_arch = gguf.MODEL_ARCH.ZAYA + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Buffer for accumulating expert weights per layer + self._experts: dict[int, dict[str, Tensor]] | None = {} + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - # ZAYA-specific params if any from config.json (e.g. ssm_d_conv) - if "ssm_d_conv" in self.hparams: - self.gguf_writer.add_ssm_conv_kernel(self.hparams["ssm_d_conv"]) - else: - # Fallback if config is different - self.gguf_writer.add_ssm_conv_kernel(2) # Default for ZAYA1-8B - + + # n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate) + n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2 + self.gguf_writer.add_feed_forward_length(n_ff) + + # ssm_d_conv = conv_qk kernel size + self.gguf_writer.add_ssm_conv_kernel(5) + + # partial_rotary_factor -> n_rot + head_dim = self.hparams.get("head_dim", 128) + partial_rotary = self.hparams.get("partial_rotary_factor", 0.5) + self.gguf_writer.add_rope_dimension_count(int(partial_rotary * head_dim)) + + # MoE params + n_expert = self.find_hparam(["num_experts"]) + self.gguf_writer.add_expert_count(n_expert) + n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1 + self.gguf_writer.add_expert_used_count(n_expert_used) + + def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "linear_q" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch + elif "linear_k" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch + elif "val_proj1" 
in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ1, bid), data_torch + elif "val_proj2" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ2, bid), data_torch + elif "o_proj" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch + elif "conv_qk.0" in name and name.endswith(".weight"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch + elif "conv_qk.0" in name and name.endswith(".bias"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch + elif "conv_qk.1" in name and name.endswith(".weight"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch + elif "conv_qk.1" in name and name.endswith(".bias"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch + elif "temp" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_K_SCALE, bid), data_torch + + def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "down_proj.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN, bid), data_torch + elif "down_proj.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch + elif "rmsnorm_eda" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_NORM, bid), data_torch + elif "router_mlp.0.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch + elif "router_mlp.0.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch + elif "router_mlp.2.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch + elif "router_mlp.2.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, bid, suffix=".bias"), data_torch + elif 
"router_mlp.4.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch + elif "balancing_biases" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_BIASES, bid), data_torch + elif "router_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, bid), data_torch + + def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "hidden_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch + elif "hidden_states_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B, bid, suffix=".bias"), data_torch + elif "residual_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch + elif "residual_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Tensors will be automatically mapped based on tensor_mapping.py if they match - - # We skip MoE FFN weights, unused biases, etc. 
temporarily since we are using dense FFN - skip_keywords = [ - "zaya_block.experts", - "res_scale.", - "val_proj2" - ] - - if any(kw in name for kw in skip_keywords): - logger.info(f"Skipping tensor (dense FFN test): {name}") + # Common tensors + if name == "model.embed_tokens.weight": + yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch return - + if name == "model.final_norm.weight": + yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch + return + + # Block-level tensors + if bid is not None: + # CCA attention tensors + if "self_attn" in name: + yield from self._map_cca(name, data_torch, bid) + return + + # Router tensors + if "router" in name: + yield from self._map_router(name, data_torch, bid) + return + + # Input norm + if "input_norm" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, bid), data_torch + return + + # Residual scaling + if "res_scale" in name: + yield from self._map_res_scale(name, data_torch, bid) + return + + # Expert stacking + if "zaya_block.experts" in name: + assert bid is not None + if self._experts is None: + self._experts = {} + if bid not in self._experts: + self._experts[bid] = {} + self._experts[bid][name] = data_torch + + n_expert = self.find_hparam(["num_experts"]) + # Each layer has 2 expert weights per expert (fc1, fc2) = 2 * n_expert tensors + if len(self._experts[bid]) >= n_expert * 2: + for w_name, gguf_tensor, permute_dims in [ + ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None), + ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, (0, 2, 1)), + ]: + datas: list[Tensor] = [] + for xid in range(n_expert): + ename = f"model.layers.{bid}.zaya_block.experts.local_experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + data_torch_stacked = torch.stack(datas, dim=0) + if permute_dims is not None: + data_torch_stacked = data_torch_stacked.permute(*permute_dims) + yield self.format_tensor_name(gguf_tensor, bid), 
data_torch_stacked + del self._experts[bid] + return + + # Fallback for any remaining tensors: use tensor_mapping try: yield from super().modify_tensors(data_torch, name, bid) except ValueError as e: @@ -6492,6 +6608,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: raise + def prepare_tensors(self): + super().prepare_tensors() + if self._experts: + unprocessed = [k for d in self._experts.values() for k in d.keys()] + if unprocessed: + raise ValueError(f"Unprocessed expert tensors: {unprocessed}") + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 13bd3d1c8f0..de599da4a0b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -613,8 +613,25 @@ class MODEL_TENSOR(IntEnum): SSM_G_B = auto() # Kimi Linear CCA_CONV_DW = auto() # Zaya CCA_CONV_GRP = auto() # Zaya - CCA_QK_NORM = auto() # Zaya + CCA_CONV_DW_B = auto() # Zaya: conv_qk.0.bias + CCA_QK_NORM = auto() # Zaya (weightless - unit RMSNorm) CCA_K_SCALE = auto() # Zaya + CCA_VAL_PROJ1 = auto() # Zaya: CCA value projection stream 1 + CCA_VAL_PROJ2 = auto() # Zaya: CCA value projection stream 2 + RES_SCALE_HS = auto() # Zaya: hidden_states_scale + RES_SCALE_HS_B = auto() # Zaya: hidden_states_bias + RES_SCALE_RES = auto() # Zaya: residual_scale + RES_SCALE_RES_B = auto() # Zaya: residual_bias + ZAYA_ROUTER_DOWN = auto() # Zaya + ZAYA_ROUTER_DOWN_B = auto() # Zaya + ZAYA_ROUTER_NORM = auto() # Zaya + ZAYA_ROUTER_MLP0 = auto() # Zaya + ZAYA_ROUTER_MLP0_B = auto() # Zaya + ZAYA_ROUTER_MLP2 = auto() # Zaya + ZAYA_ROUTER_MLP2_B = auto() # Zaya + ZAYA_ROUTER_MLP4 = auto() # Zaya + ZAYA_ROUTER_BIASES = auto() # Zaya + ZAYA_ROUTER_EDA_SCALE = auto() # Zaya TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -1130,9 +1147,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: 
"blk.{bid}.ssm_g_b", # Kimi Linear MODEL_TENSOR.CCA_CONV_DW: "blk.{bid}.cca_conv_dw", # Zaya + MODEL_TENSOR.CCA_CONV_DW_B: "blk.{bid}.cca_conv_dw_b", # Zaya MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya + MODEL_TENSOR.CCA_VAL_PROJ1: "blk.{bid}.cca_val_proj1", # Zaya + MODEL_TENSOR.CCA_VAL_PROJ2: "blk.{bid}.cca_val_proj2", # Zaya + MODEL_TENSOR.RES_SCALE_HS: "blk.{bid}.res_scale_hs", # Zaya + MODEL_TENSOR.RES_SCALE_HS_B: "blk.{bid}.res_scale_hs_b", # Zaya + MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya + MODEL_TENSOR.RES_SCALE_RES_B: "blk.{bid}.res_scale_res_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_NORM: "blk.{bid}.zaya_router_norm", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP0: "blk.{bid}.zaya_router_mlp0", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP2_B: "blk.{bid}.zaya_router_mlp2_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP4: "blk.{bid}.zaya_router_mlp4", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_BIASES: "blk.{bid}.zaya_router_biases", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE: "blk.{bid}.zaya_router_eda", # Zaya MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -4009,16 +4043,30 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.CCA_CONV_DW, + MODEL_TENSOR.CCA_CONV_DW_B, MODEL_TENSOR.CCA_CONV_GRP, MODEL_TENSOR.CCA_QK_NORM, MODEL_TENSOR.CCA_K_SCALE, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, + 
MODEL_TENSOR.CCA_VAL_PROJ1, + MODEL_TENSOR.CCA_VAL_PROJ2, + MODEL_TENSOR.RES_SCALE_HS, + MODEL_TENSOR.RES_SCALE_HS_B, + MODEL_TENSOR.RES_SCALE_RES, + MODEL_TENSOR.RES_SCALE_RES_B, + MODEL_TENSOR.ZAYA_ROUTER_DOWN, + MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, + MODEL_TENSOR.ZAYA_ROUTER_NORM, + MODEL_TENSOR.ZAYA_ROUTER_MLP0, + MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, + MODEL_TENSOR.ZAYA_ROUTER_MLP2, + MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, + MODEL_TENSOR.ZAYA_ROUTER_MLP4, + MODEL_TENSOR.ZAYA_ROUTER_BIASES, + MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, + MODEL_TENSOR.FFN_GATE_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index db99afd4cbb..fbd22ccb6a3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -107,6 +107,7 @@ class TensorNameMap: "model.transformer.ln_f", # llada "final_norm", # modern-bert "model.norm", # cogvlm + "model.final_norm", # Zaya ), # Rope frequencies @@ -300,7 +301,6 @@ class TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h - "model.layers.{bid}.self_attn.qkv.val_proj1", # Zaya ), # Attention output @@ -901,19 +901,6 @@ class TensorNameMap: "model.layers.{bid}.linear_attn.in_proj_b", # qwen3.5 "model.layers.{bid}.self_attn.b_proj", # Kimi Linear ), - # ZAYA CCA - MODEL_TENSOR.CCA_CONV_DW: ( - "model.layers.{bid}.self_attn.qkv.conv_qk.0", # Zaya - ), - MODEL_TENSOR.CCA_CONV_GRP: ( - "model.layers.{bid}.self_attn.qkv.conv_qk.1", # Zaya - ), - MODEL_TENSOR.CCA_QK_NORM: ( - "model.layers.{bid}.self_attn.qk_norm", # Zaya - ), - MODEL_TENSOR.CCA_K_SCALE: ( - "model.layers.{bid}.self_attn.qkv.temp", # Zaya - ), MODEL_TENSOR.SSM_G_A: ( "model.layers.{bid}.self_attn.g_a_proj", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index df91d973a3e..3bebc529300 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -419,9 +419,26 @@ static const std::map 
LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, { LLM_TENSOR_CCA_CONV_DW, "blk.%d.cca_conv_dw" }, + { LLM_TENSOR_CCA_CONV_DW_B, "blk.%d.cca_conv_dw_b" }, { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, + { LLM_TENSOR_CCA_VAL_PROJ1, "blk.%d.cca_val_proj1" }, + { LLM_TENSOR_CCA_VAL_PROJ2, "blk.%d.cca_val_proj2" }, + { LLM_TENSOR_RES_SCALE_HS, "blk.%d.res_scale_hs" }, + { LLM_TENSOR_RES_SCALE_HS_B, "blk.%d.res_scale_hs_b" }, + { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, + { LLM_TENSOR_RES_SCALE_RES_B, "blk.%d.res_scale_res_b" }, + { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, + { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, + { LLM_TENSOR_ZAYA_ROUTER_NORM, "blk.%d.zaya_router_norm" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP0, "blk.%d.zaya_router_mlp0" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "blk.%d.zaya_router_mlp2_b" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP4, "blk.%d.zaya_router_mlp4" }, + { LLM_TENSOR_ZAYA_ROUTER_BIASES, "blk.%d.zaya_router_biases" }, + { LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "blk.%d.zaya_router_eda" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, @@ -666,9 +683,26 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // ZAYA CCA {LLM_TENSOR_CCA_CONV_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, + {LLM_TENSOR_CCA_CONV_DW_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + 
{LLM_TENSOR_CCA_VAL_PROJ1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CCA_VAL_PROJ2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_RES_SCALE_HS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_HS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_RES_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP2_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP4, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_BIASES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index b11fa50c05f..72c5abddac1 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -447,9 +447,28 @@ enum llm_tensor { LLM_TENSOR_SSM_G_B, // kimi: output gate projection B // ZAYA CCA (Compressed Convolutional Attention) LLM_TENSOR_CCA_CONV_DW, // zaya: depthwise conv1d (conv_qk.0) + LLM_TENSOR_CCA_CONV_DW_B, // zaya: depthwise conv1d bias LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) LLM_TENSOR_CCA_K_SCALE, // zaya: 
learned K temperature + LLM_TENSOR_CCA_VAL_PROJ1, // zaya: V projection 1 + LLM_TENSOR_CCA_VAL_PROJ2, // zaya: V projection 2 + // ZAYA residual scaling + LLM_TENSOR_RES_SCALE_HS, // zaya: hidden_states_scale + LLM_TENSOR_RES_SCALE_HS_B, // zaya: hidden_states_bias + LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale + LLM_TENSOR_RES_SCALE_RES_B, // zaya: residual_bias + // ZAYA Router (MoE gating) + LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight + LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias + LLM_TENSOR_ZAYA_ROUTER_NORM, // zaya: router rmsnorm_eda weight + LLM_TENSOR_ZAYA_ROUTER_MLP0, // zaya: router MLP layer 0 weight + LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias + LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 weight + LLM_TENSOR_ZAYA_ROUTER_MLP2_B, // zaya: router MLP layer 2 bias + LLM_TENSOR_ZAYA_ROUTER_MLP4, // zaya: router MLP layer 4 weight + LLM_TENSOR_ZAYA_ROUTER_BIASES, // zaya: router balancing_biases + LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, // zaya: router router_states_scale LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-model.h b/src/llama-model.h index 8e919e15159..d9da4b318bd 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -479,10 +479,31 @@ struct llama_layer { // ZAYA CCA (Compressed Convolutional Attention) struct ggml_tensor * cca_conv_dw = nullptr; // depthwise conv (conv_qk.0) + struct ggml_tensor * cca_conv_dw_b = nullptr; // depthwise conv bias struct ggml_tensor * cca_conv_grp = nullptr; // grouped conv (conv_qk.1) struct ggml_tensor * cca_conv_grp_b = nullptr; // grouped conv bias struct ggml_tensor * cca_qk_norm = nullptr; // RMSNorm on concat(Q,K) struct ggml_tensor * cca_k_scale = nullptr; // learned K temperature + struct ggml_tensor * cca_val_proj1 = nullptr; // V projection stream 1 + struct ggml_tensor * cca_val_proj2 = nullptr; // V projection stream 2 + + // ZAYA residual scaling + struct ggml_tensor * res_scale_hs 
= nullptr; // hidden_states_scale + struct ggml_tensor * res_scale_hs_b = nullptr; // hidden_states_bias + struct ggml_tensor * res_scale_res = nullptr; // residual_scale + struct ggml_tensor * res_scale_res_b = nullptr; // residual_bias + + // ZAYA Router (MoE gating) + struct ggml_tensor * zaya_router_down = nullptr; // router down_proj + struct ggml_tensor * zaya_router_down_b = nullptr; // router down_proj bias + struct ggml_tensor * zaya_router_norm = nullptr; // router rmsnorm_eda + struct ggml_tensor * zaya_router_mlp0 = nullptr; // router MLP 0 + struct ggml_tensor * zaya_router_mlp0_b = nullptr; // router MLP 0 bias + struct ggml_tensor * zaya_router_mlp2 = nullptr; // router MLP 2 + struct ggml_tensor * zaya_router_mlp2_b = nullptr; // router MLP 2 bias + struct ggml_tensor * zaya_router_mlp4 = nullptr; // router MLP 4 + struct ggml_tensor * zaya_router_biases = nullptr; // balancing_biases + struct ggml_tensor * zaya_router_eda_scale = nullptr; // router_states_scale // DSA (deepseek sparse attention) struct ggml_tensor * indexer_k_norm = nullptr; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 0815fc1d449..a6e77bbc198 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -17,46 +17,93 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output + // output norm output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const int64_t d_conv = hparams.ssm_d_conv; + // output (tied with tok_embd if not present) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (output == nullptr) { + output = tok_embd; + } + + const int64_t n_embd_head = hparams.n_embd_head_k(); + const int64_t d_conv = hparams.ssm_d_conv; + // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B) + 
const int64_t n_ff_exp = 256; for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - const int64_t n_head_i = hparams.n_head(i); - const int64_t n_head_kv_i = hparams.n_head_kv(i); - const int64_t n_embd_q = n_head_i * n_embd_head_k; - const int64_t n_embd_k = n_head_kv_i * n_embd_head_k; - const int64_t n_qk = n_embd_q + n_embd_k; - const int64_t n_groups = n_head_i + n_head_kv_i; + const int64_t n_head = hparams.n_head(i); + const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_q = n_head * n_embd_head; + const int64_t n_embd_k = n_head_kv * n_embd_head; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head + n_head_kv; + const int64_t n_ff = hparams.n_ff(i); + const int64_t n_expert = hparams.n_expert; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - // CCA projections (standard Q, K, V, O) + // CCA projections (present on all layers) layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_k}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - - // CCA conv_qk.0 (depthwise, groups = n_qk, kernel = d_conv) - layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); - // CCA conv_qk.1 (grouped, groups = n_groups, kernel = d_conv) - layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); - layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + // CCA: V = concat(val_proj1(x), val_proj2(x)) → {n_embd_k} + layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), + {n_embd, n_embd_head}, 0); + layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), + {n_embd, n_embd_head}, 0); 
- // CCA normalization and scale - layer.cca_qk_norm = create_tensor(tn(LLM_TENSOR_CCA_QK_NORM, "weight", i), {n_qk}, 0); - layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_embd_k}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - // FFN (dense SwiGLU for now; MoE can be added later) - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + // CCA conv_qk.0 (depthwise, causal) + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + + // CCA conv_qk.1 (grouped, groups = n_groups) + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), + {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + + // CCA per-KV-head temperature + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + + // Residual scaling + layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); + layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B, "bias", i), {n_embd}, 0); + layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + // MoE layers (odd indices) + if (i % 2 == 1) { + // Router network + layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN, "weight", i), + {n_embd, n_ff_exp}, 0); + layer.zaya_router_down_b 
= create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_NORM, "weight", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i), + {n_ff_exp, n_ff_exp}, 0); + layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp2 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "weight", i), + {n_ff_exp, n_ff_exp}, 0); + layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp4 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i), + {n_ff_exp, n_expert + 1}, 0); + layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i), + {n_expert + 1}, TENSOR_NOT_REQUIRED); + layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i), + {n_ff_exp}, TENSOR_NOT_REQUIRED); + + // MoE experts (fused gate_up and down) + create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i), + {n_ff, n_embd, n_expert}, 0); + } } } @@ -69,6 +116,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; + const int64_t n_expert = hparams.n_expert; ggml_tensor * cur; ggml_tensor * inpL; @@ -91,117 +139,167 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * inpSA = inpL; - // norm - cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + // Pre-norm + cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - // --- CCA: Q, K, V projections --- - ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); - cb(Qraw, "Qraw", il); - ggml_tensor * Kraw = 
ggml_mul_mat(ctx0, layer.wk, cur); - cb(Kraw, "Kraw", il); - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.wv, cur); - cb(Vcur, "Vcur", il); - - // --- CCA: concat Q+K for conv --- - // QK: [n_qk, n_tokens] - ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); - cb(QK, "QK_cat", il); - - // --- CCA: conv_qk.0 (depthwise, causal) --- - // Reshape for ssm_conv: [n_tokens, n_qk] -> [n_tokens, n_qk, 1] - // ssm_conv expects [seq_len, channels, batch] with state already concatenated - // For prompt processing, we left-pad with (d_conv-1) zeros for causality - { - // Left-pad QK with zeros for causal convolution - ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); // [n_tokens, n_qk] - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); // [d_conv-1 + n_tokens, n_qk] - - QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); - // ssm_conv output: [n_tokens, n_qk] - cb(QK, "QK_dw", il); + if (il % 2 == 0) { + // ===== CCA Attention ===== + + // Q, K projections + ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); + cb(Qraw, "Qraw", il); + ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); + cb(Kraw, "Kraw", il); + + // V = concat(val_proj1(x), val_proj2(x)) → [n_embd_k, n_tokens] + ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); + cb(V1, "V1", il); + ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, cur); + cb(V2, "V2", il); + ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); + cb(Vcur, "Vcur", il); + + // Concat Q+K for conv: [n_qk, n_tokens] + ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QK, "QK_cat", il); + + // conv_qk.0 (depthwise, causal) + { + ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); + 
+ QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); + if (layer.cca_conv_dw_b) { + QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b); + } + cb(QK, "QK_dw", il); + } + + // conv_qk.1 (grouped, causal) + { + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); + + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); + cb(QK, "QK_grp", il); + } + + // Transpose back to [n_qk, n_tokens] + QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + + // Split Q_conv, K_conv + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, + QK->nb[1], 0); + ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, + QK->nb[1], n_embd_q * ggml_element_size(QK)); + + // QK mean skip connection + ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); + ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + // RMSNorm on concat(Q, K) — weightless (unit RMSNorm) + ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); + QK_for_norm = build_norm(QK_for_norm, nullptr, nullptr, LLM_NORM_RMS, il); + cb(QK_for_norm, "QK_normed", il); + + // Split back + Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, + QK_for_norm->nb[1], 0); + Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, + QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); + + // Per-KV-head temperature scaling on K + // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens] + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // cca_k_scale: [n_head_kv] → broadcast + Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); + cb(Kcur, "Kcur_scaled", il); + + // Reshape for attention + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Vcur = 
ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // GQA attention + cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf((float) n_embd_head), il); + cb(cur, "attn_out", il); + + } else { + // ===== MoE Layer ===== + + // Build Zaya router network: + // down_proj → RMSNorm → SiLU(MLP0) → MLP2 → MLP4 → 17 logits → take first 16 + + ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); + cb(router_h, "router_down", il); + + router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); + cb(router_h, "router_norm", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); + router_h = ggml_silu(ctx0, router_h); + cb(router_h, "router_mlp0", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b); + cb(router_h, "router_mlp2", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); + // router_h now has shape [17, n_tokens] — 16 expert logits + 1 MOD skip + cb(router_h, "router_logits", il); + + // Take only the first 16 logits (expert routing), ignore MOD skip (index 16) + ggml_tensor * gate_inp = ggml_view_2d(ctx0, router_h, n_expert, n_tokens, + router_h->nb[1], 0); + cb(gate_inp, "gate_inp", il); + + // MoE FFN with topk=1 (pass router logits as probs_in) + cur = build_moe_ffn(cur, + /* gate_inp */ nullptr, + /* up_exps */ nullptr, + /* gate_exps */ nullptr, + /* down_exps */ layer.ffn_down_exps, + /* exp_probs_b */ nullptr, + /* n_expert */ n_expert, + /* n_expert_used */ hparams.n_expert_used, + /* type_op */ LLM_FFN_SILU, + /* norm_w */ false, + /* w_scale */ 1.0f, + /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + /* il */ il, + /* probs_in */ gate_inp, + /* gate_up_exps */ 
layer.ffn_gate_up_exps); + cb(cur, "moe_out", il); } - // --- CCA: conv_qk.1 (grouped, causal) --- - { - // Left-pad for second causal conv - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); // [d_conv-1 + n_tokens, n_qk] - - // ggml_conv_1d_grouped expects kernel [K, IC/G, OC] and input [L, IC] - // QK_padded is [d_conv-1 + n_tokens, n_qk] which matches [L, IC] - QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); - QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); - cb(QK, "QK_grp", il); - } - - // QK is now [n_tokens, n_qk] from conv output, transpose back to [n_qk, n_tokens] - QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - - // --- CCA: split Q_conv, K_conv --- - ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, - QK->nb[1], 0); - ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, - QK->nb[1], n_embd_q * ggml_element_size(QK)); - - // --- CCA: QK mean (skip connection) --- - ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); - ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - // --- CCA: RMSNorm on concat(Q, K) --- - ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); // [n_qk, n_tokens] - QK_for_norm = build_norm(QK_for_norm, layer.cca_qk_norm, NULL, LLM_NORM_RMS, il); - cb(QK_for_norm, "QK_normed", il); - - // Split back - Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, - QK_for_norm->nb[1], 0); - Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, - QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); - - // --- CCA: K temperature scaling --- - Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); - cb(Kcur, "Kcur_scaled", il); - - // Reshape for attention: [head_dim, n_heads, n_tokens] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 
n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - // --- GQA attention --- - cur = build_attn(inp->get_attn(), layer.wo, NULL, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, - 1.0f / sqrtf((float) n_embd_head), il); - cb(cur, "attn_out", il); - // select output tokens on last layer if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - // residual - ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // Residual scaling: cur = hs_scale * cur + hs_bias + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.res_scale_hs), layer.res_scale_hs_b); + cb(cur, "scaled_out", il); - // --- FFN (dense SwiGLU) --- - cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - layer.ffn_up, NULL, NULL, - layer.ffn_gate, NULL, NULL, - layer.ffn_down, NULL, NULL, - NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); + // Residual scaling: inpSA = res_scale * inpSA + res_bias (if present) + if (layer.res_scale_res) { + inpSA = ggml_add(ctx0, ggml_mul(ctx0, inpSA, layer.res_scale_res), layer.res_scale_res_b); + cb(inpSA, "scaled_residual", il); + } - // residual - cur = ggml_add(ctx0, cur, ffn_inp); + // Residual add + cur = ggml_add(ctx0, cur, inpSA); cb(cur, "l_out", il); inpL = cur; @@ -210,7 +308,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cur = inpL; // final norm - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; From 02a9843498a8bfe3296fd2522b0ce372bb9e2e6d Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Fri, 8 May 2026 19:19:41 +0200 Subject: [PATCH 04/33] update --- convert_hf_to_gguf.py | 24 
++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 97a5889cce9..52bddd7665e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6608,6 +6608,30 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: raise + def set_vocab(self): + from gguf.vocab import LlamaHfVocab + + vocab = LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("gemma4") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + self.gguf_writer.add_add_bos_token(True) + def prepare_tensors(self): super().prepare_tensors() if self._experts: From 8362c10d438261e04bb66f3c37b3631507589a8f Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Tue, 12 May 2026 00:30:59 +0200 Subject: [PATCH 05/33] add corrections --- convert_hf_to_gguf.py | 20 +++++++++++--- ggml/src/ggml.c | 6 ++--- src/models/zaya.cpp | 63 ++++++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 52bddd7665e..41d150e30ac 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1183,7 +1183,7 @@ def set_gguf_parameters(self): if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) logger.info(f"gguf: rope theta swa = {local_rope_theta}") - if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: + if (f_rms_eps := 
self.find_hparam(["rms_norm_eps", "norm_eps", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: @@ -6463,6 +6463,13 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Buffer for accumulating expert weights per layer self._experts: dict[int, dict[str, Tensor]] | None = {} + # Pre-load tokenizer to know the vocab count for embedding trimming + self._tokenizer_vocab_size: int | None = None + try: + from gguf.vocab import LlamaHfVocab + self._tokenizer_vocab_size = LlamaHfVocab(self.dir_model).vocab_size + except Exception: + pass def set_gguf_parameters(self): super().set_gguf_parameters() @@ -6472,8 +6479,9 @@ def set_gguf_parameters(self): n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2 self.gguf_writer.add_feed_forward_length(n_ff) - # ssm_d_conv = conv_qk kernel size - self.gguf_writer.add_ssm_conv_kernel(5) + # ssm_d_conv = conv_qk kernel size (cca_time0 = first depthwise conv kernel) + cca_time0 = self.hparams.get("cca_time0", 2) + self.gguf_writer.add_ssm_conv_kernel(cca_time0) # partial_rotary_factor -> n_rot head_dim = self.hparams.get("head_dim", 128) @@ -6498,10 +6506,13 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st elif "o_proj" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch elif "conv_qk.0" in name and name.endswith(".weight"): + # PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk} + data_torch = data_torch.squeeze(1).contiguous() yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch elif "conv_qk.0" in name and name.endswith(".bias"): yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch elif "conv_qk.1" in name and name.endswith(".weight"): + # PyTorch: 
[n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk} yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch elif "conv_qk.1" in name and name.endswith(".bias"): yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch @@ -6543,6 +6554,9 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Common tensors if name == "model.embed_tokens.weight": + # Trim embedding to match tokenizer vocab size if needed + if self._tokenizer_vocab_size is not None and data_torch.shape[0] > self._tokenizer_vocab_size: + data_torch = data_torch[:self._tokenizer_vocab_size] yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch return if name == "model.final_norm.weight": diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 049f4952047..ae1fb2fa031 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2018,9 +2018,9 @@ struct ggml_tensor * ggml_dup_inplace( static struct ggml_tensor * ggml_add_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index a6e77bbc198..434fa31585b 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -45,29 +45,27 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - // CCA projections (present on all layers) - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + // CCA attention layers (even indices only) + if (i % 2 == 0) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); - // CCA: V = concat(val_proj1(x), val_proj2(x)) → {n_embd_k} - layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), - {n_embd, n_embd_head}, 0); - layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), - {n_embd, n_embd_head}, 0); + layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), + {n_embd, n_embd_head}, 0); + layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), + {n_embd, n_embd_head}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - // CCA conv_qk.0 (depthwise, causal) - layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); - layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); - // CCA conv_qk.1 (grouped, groups = 
n_groups) - layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), - {d_conv, n_qk / n_groups, n_qk}, 0); - layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), + {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); - // CCA per-KV-head temperature - layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + } // Residual scaling layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); @@ -101,7 +99,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // MoE experts (fused gate_up and down) create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i), + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); } } @@ -167,30 +165,37 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params // conv_qk.0 (depthwise, causal) { ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); + // ggml_ssm_conv requires 3D input: {1 + n_tokens, n_qk, 1} + // Use view_3d on the contiguous 2D tensor to add a batch dimension + QK_t = ggml_view_3d(ctx0, QK_t, n_tokens, n_qk, 1, QK_t->nb[1], QK_t->nb[1] * n_qk, 0); + ggml_tensor * pad = ggml_new_tensor_3d(ctx0, QK_t->type, d_conv - 1, n_qk, 1); pad = ggml_scale(ctx0, pad, 0.0f); ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); + // Reshape to 2D first, then apply bias to avoid 3D broadcasting + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); 
if (layer.cca_conv_dw_b) { QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b); } cb(QK, "QK_dw", il); } - // conv_qk.1 (grouped, causal) + // conv_qk.1 (grouped, causal) — operate on {n_tokens, n_qk} format { - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); + ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); + // conv output is {OL, OC, N} -> reshape to {OC, OL}, then add bias + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); cb(QK, "QK_grp", il); } - // Transpose back to [n_qk, n_tokens] - QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + // QK is now [n_qk, n_tokens] // Split Q_conv, K_conv ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, @@ -217,13 +222,16 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params // Per-KV-head temperature scaling on K // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens] + Kcur = ggml_cont(ctx0, Kcur); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // cca_k_scale: [n_head_kv] → broadcast Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); cb(Kcur, "Kcur_scaled", il); // Reshape for attention + Qcur = ggml_cont(ctx0, Qcur); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Vcur = ggml_cont(ctx0, Vcur); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // GQA attention @@ -259,8 +267,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(router_h, "router_logits", il); // Take only the first 16 logits (expert routing), ignore MOD skip (index 16) - ggml_tensor * gate_inp = 
ggml_view_2d(ctx0, router_h, n_expert, n_tokens, - router_h->nb[1], 0); + ggml_tensor * gate_inp = ggml_cont(ctx0, + ggml_view_2d(ctx0, router_h, n_expert, n_tokens, + router_h->nb[1], 0)); cb(gate_inp, "gate_inp", il); // MoE FFN with topk=1 (pass router logits as probs_in) From 109856e8fa688e9bf4453db98c687e2de85051b0 Mon Sep 17 00:00:00 2001 From: Ganesh Nanduru Date: Mon, 11 May 2026 21:42:49 -0600 Subject: [PATCH 06/33] zaya generation running --- common/debug.cpp | 14 +- convert_hf_to_gguf.py | 15 +- gguf-py/gguf/constants.py | 12 ++ src/llama-arch.cpp | 8 + src/llama-arch.h | 4 + src/llama-graph.cpp | 4 + src/llama-model.cpp | 9 +- src/llama-model.h | 6 + src/models/zaya.cpp | 312 +++++++++++++++++++++++++------------- 9 files changed, 270 insertions(+), 114 deletions(-) diff --git a/common/debug.cpp b/common/debug.cpp index 102c6924dc9..60cb5fd9b4a 100644 --- a/common/debug.cpp +++ b/common/debug.cpp @@ -144,13 +144,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (common_debug_cb_user_data *) user_data; auto * pimpl = cb_data->pimpl.get(); - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - bool matches_filter = pimpl->tensor_filters.empty(); if (!matches_filter) { @@ -162,6 +155,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } } + if (ask) { + return matches_filter; + } + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + char src1_str[128] = { 0 }; if (src1) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str()); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 41d150e30ac..1e1adb10fe4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6551,6 +6551,16 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu elif 
"residual_bias" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch + def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]: + if "hidden_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch + elif "hidden_states_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch + elif "residual_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch + elif "residual_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Common tensors if name == "model.embed_tokens.weight": @@ -6562,6 +6572,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "model.final_norm.weight": yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch return + if name.startswith("model.res_scale."): + yield from self._map_final_res_scale(name, data_torch) + return # Block-level tensors if bid is not None: @@ -6599,7 +6612,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._experts[bid]) >= n_expert * 2: for w_name, gguf_tensor, permute_dims in [ ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None), - ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, (0, 2, 1)), + ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, None), ]: datas: list[Tensor] = [] for xid in range(n_expert): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index de599da4a0b..57a67cb559f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -622,6 +622,10 @@ class MODEL_TENSOR(IntEnum): RES_SCALE_HS_B = auto() # Zaya: hidden_states_bias RES_SCALE_RES = auto() # Zaya: residual_scale 
RES_SCALE_RES_B = auto() # Zaya: residual_bias + RES_SCALE_HS_FINAL = auto() # Zaya: final hidden_states_scale + RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias + RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale + RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias ZAYA_ROUTER_DOWN = auto() # Zaya ZAYA_ROUTER_DOWN_B = auto() # Zaya ZAYA_ROUTER_NORM = auto() # Zaya @@ -1157,6 +1161,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_B: "blk.{bid}.res_scale_hs_b", # Zaya MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya MODEL_TENSOR.RES_SCALE_RES_B: "blk.{bid}.res_scale_res_b", # Zaya + MODEL_TENSOR.RES_SCALE_HS_FINAL: "res_scale_hs", # Zaya + MODEL_TENSOR.RES_SCALE_HS_B_FINAL: "res_scale_hs_b", # Zaya + MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya + MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_NORM: "blk.{bid}.zaya_router_norm", # Zaya @@ -4055,6 +4063,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_B, MODEL_TENSOR.RES_SCALE_RES, MODEL_TENSOR.RES_SCALE_RES_B, + MODEL_TENSOR.RES_SCALE_HS_FINAL, + MODEL_TENSOR.RES_SCALE_HS_B_FINAL, + MODEL_TENSOR.RES_SCALE_RES_FINAL, + MODEL_TENSOR.RES_SCALE_RES_B_FINAL, MODEL_TENSOR.ZAYA_ROUTER_DOWN, MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, MODEL_TENSOR.ZAYA_ROUTER_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 3bebc529300..9bdd0023028 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -429,6 +429,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_RES_SCALE_HS_B, "blk.%d.res_scale_hs_b" }, { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, { LLM_TENSOR_RES_SCALE_RES_B, "blk.%d.res_scale_res_b" }, + { LLM_TENSOR_RES_SCALE_HS_FINAL, "res_scale_hs" }, + { LLM_TENSOR_RES_SCALE_HS_B_FINAL, "res_scale_hs_b" }, + { LLM_TENSOR_RES_SCALE_RES_FINAL, 
"res_scale_res" }, + { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, { LLM_TENSOR_ZAYA_ROUTER_NORM, "blk.%d.zaya_router_norm" }, @@ -693,6 +697,10 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_RES_SCALE_HS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_RES_SCALE_RES_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_HS_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_HS_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_RES_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 72c5abddac1..30a3f9a444a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -458,6 +458,10 @@ enum llm_tensor { LLM_TENSOR_RES_SCALE_HS_B, // zaya: hidden_states_bias LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale LLM_TENSOR_RES_SCALE_RES_B, // zaya: residual_bias + LLM_TENSOR_RES_SCALE_HS_FINAL, // zaya: final hidden_states_scale + LLM_TENSOR_RES_SCALE_HS_B_FINAL,// zaya: final hidden_states_bias + LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale + LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias // ZAYA Router (MoE gating) LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index fe155c92dea..e4f0ff98ef4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1405,6 +1405,10 @@ ggml_tensor 
* llm_graph_context::build_moe_ffn( { probs = logits; // [n_expert, n_tokens] } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_NONE: + { + probs = logits; // already-normalized expert probabilities + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 656767318f2..3de55045f5c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1957,6 +1957,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, if (arch == LLM_ARCH_FALCON_H1) { filter_attn = [&](int32_t) { return true; }; filter_recr = [&](int32_t) { return true; }; + } else if (arch == LLM_ARCH_ZAYA) { + filter_attn = [&](int32_t il) { + return il % 2 == 0; + }; + filter_recr = [&](int32_t il) { + return il % 2 == 0; + }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { filter_attn = [&](int32_t il) { return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0; @@ -2208,7 +2215,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_KIMI_LINEAR: - case LLM_ARCH_ZAYA: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -2311,6 +2317,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: + case LLM_ARCH_ZAYA: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/src/llama-model.h b/src/llama-model.h index d9da4b318bd..01ce976fe3e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -561,6 +561,12 @@ struct llama_model { struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + // Zaya final residual scaling + struct ggml_tensor * zaya_res_scale_hs = nullptr; + struct ggml_tensor * zaya_res_scale_hs_b = nullptr; + struct ggml_tensor * zaya_res_scale_res = nullptr; + struct ggml_tensor * zaya_res_scale_res_b = nullptr; + // 
classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 434fa31585b..89e354450bb 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -1,11 +1,23 @@ #include "models.h" #include "ggml.h" +#include "llama-memory-recurrent.h" + +#include void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + const uint32_t n_qk = (hparams.n_head() + hparams.n_head_kv()) * hparams.n_embd_head_k(); + hparams.ssm_d_inner = 2*n_qk + hparams.n_embd; // CCA conv state + delayed value stream state + hparams.ssm_d_state = 1; + hparams.ssm_n_group = 0; + + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = (i % 2) == 0; + } + switch (hparams.n_layer) { case 80: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; @@ -26,6 +38,11 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { output = tok_embd; } + zaya_res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B) @@ -113,8 +130,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k(); - const int64_t d_conv = hparams.ssm_d_conv; const int64_t n_expert = hparams.n_expert; + const int64_t 
n_seqs = ubatch.n_seqs; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(n_tokens % n_seqs == 0); + + const int64_t n_seq_tokens = n_tokens / n_seqs; ggml_tensor * cur; ggml_tensor * inpL; @@ -122,8 +145,24 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params inpL = build_inp_embd(model.tok_embd); auto * inp = build_inp_mem_hybrid(); + auto * inp_recr = inp->get_recr(); + ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * residual = nullptr; + ggml_tensor * prev_router = nullptr; + + const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) { + if (scale == nullptr) { + return x; + } + if (bias != nullptr) { + x = ggml_add(ctx0, x, bias); + } + x = ggml_mul(ctx0, x, scale); + cb(x, name, il); + return x; + }; for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers[il]; @@ -134,15 +173,41 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_embd_k = n_head_kv * n_embd_head; const int64_t n_qk = n_embd_q + n_embd_k; const int64_t n_groups = n_head + n_head_kv; + const int64_t n_gqa = n_head / n_head_kv; - ggml_tensor * inpSA = inpL; + ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); + if (residual != nullptr) { + residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); + residual = ggml_add(ctx0, hidden_states, residual); + } else { + residual = hidden_states; + } + cb(residual, "residual", il); // Pre-norm - cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); + cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "input_norm", il); if (il % 2 == 0) { // ===== CCA Attention ===== + const int64_t conv_state_size = 2*n_qk; + const int64_t 
cca_state_size = conv_state_size + n_embd; + GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size); + + ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il); + ggml_tensor * cca_state = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs); + cb(cca_state, "cca_state", il); + + ggml_tensor * conv_state = ggml_view_3d(ctx0, cca_state, 2, n_qk, n_seqs, + 2*ggml_element_size(cca_state), + cca_state->nb[1], + 0); + cb(conv_state, "cca_conv_state", il); + + ggml_tensor * prev_hs = ggml_view_2d(ctx0, cca_state, n_embd, n_seqs, + cca_state->nb[1], + conv_state_size*ggml_element_size(cca_state)); + cb(prev_hs, "cca_prev_hs", il); // Q, K projections ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); @@ -150,89 +215,121 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); cb(Kraw, "Kraw", il); - // V = concat(val_proj1(x), val_proj2(x)) → [n_embd_k, n_tokens] + // HF uses a delayed hidden-state stream for val_proj2. During decode this + // comes from the recurrent state; during prefill it is a one-token shift. 
+ ggml_tensor * cur_state_src = ggml_cont(ctx0, cur); + ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs); + + ggml_tensor * hs_d = ggml_reshape_3d(ctx0, prev_hs, n_embd, 1, n_seqs); + if (n_seq_tokens > 1) { + ggml_tensor * cur_shift = ggml_view_3d(ctx0, cur_seq, n_embd, n_seq_tokens - 1, n_seqs, + cur_seq->nb[1], + cur_seq->nb[2], + 0); + hs_d = ggml_concat(ctx0, hs_d, cur_shift, 1); + } + hs_d = ggml_reshape_2d(ctx0, hs_d, n_embd, n_tokens); + cb(hs_d, "cca_hs_d", il); + + // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens] ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); cb(V1, "V1", il); - ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, cur); + ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d); cb(V2, "V2", il); ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); cb(Vcur, "Vcur", il); // Concat Q+K for conv: [n_qk, n_tokens] - ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); - cb(QK, "QK_cat", il); - - // conv_qk.0 (depthwise, causal) - { - ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - // ggml_ssm_conv requires 3D input: {1 + n_tokens, n_qk, 1} - // Use view_3d on the contiguous 2D tensor to add a batch dimension - QK_t = ggml_view_3d(ctx0, QK_t, n_tokens, n_qk, 1, QK_t->nb[1], QK_t->nb[1] * n_qk, 0); - ggml_tensor * pad = ggml_new_tensor_3d(ctx0, QK_t->type, d_conv - 1, n_qk, 1); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); - - QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); - // Reshape to 2D first, then apply bias to avoid 3D broadcasting - QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); - if (layer.cca_conv_dw_b) { - QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b); - } - cb(QK, "QK_dw", il); + ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QKraw, "QKraw", il); + + ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens); 
+ ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens); + + ggml_tensor * Kpre_grouped = ggml_reshape_4d(ctx0, Kpre, n_embd_head, 1, n_head_kv, n_tokens); + Kpre_grouped = ggml_repeat_4d(ctx0, Kpre_grouped, n_embd_head, n_gqa, n_head_kv, n_tokens); + ggml_tensor * Kpre_rep = ggml_reshape_3d(ctx0, Kpre_grouped, n_embd_head, n_head, n_tokens); + ggml_tensor * qk_mean_q = ggml_scale(ctx0, ggml_add(ctx0, Qpre, Kpre_rep), 0.5f); + cb(qk_mean_q, "qk_mean_q", il); + + ggml_tensor * Qgroup = ggml_reshape_4d(ctx0, Qpre, n_embd_head, n_gqa, n_head_kv, n_tokens); + Qgroup = ggml_permute(ctx0, Qgroup, 1, 0, 2, 3); + Qgroup = ggml_cont(ctx0, Qgroup); + ggml_tensor * Qmean = ggml_mean(ctx0, Qgroup); + Qmean = ggml_reshape_3d(ctx0, Qmean, n_embd_head, n_head_kv, n_tokens); + ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f); + cb(qk_mean_k, "qk_mean_k", il); + + ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw)); + QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs); + + ggml_tensor * conv_input = ggml_concat(ctx0, conv_state, QKraw_t, 0); + cb(conv_input, "cca_conv_input", il); + + ggml_tensor * last_conv_states = ggml_view_3d(ctx0, conv_input, 2, n_qk, n_seqs, + conv_input->nb[1], + conv_input->nb[2], + n_seq_tokens*conv_input->nb[0]); + cb(last_conv_states, "cca_last_conv_states", il); + + const auto kv_head = inp_recr->mctx->get_head(); + ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs, + cca_state_all->nb[1], + kv_head*cca_state_size*ggml_element_size(cca_state_all)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, conv_state_update_target)); + + ggml_tensor * last_hs = ggml_view_2d(ctx0, cur_seq, n_embd, n_seqs, + cur_seq->nb[2], + (n_seq_tokens - 1)*cur_seq->nb[1]); + ggml_tensor * prev_hs_update_target = ggml_view_2d(ctx0, cca_state_all, n_embd, n_seqs, + cca_state_all->nb[1], + 
(kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); + + ggml_tensor * conv_dw = layer.cca_conv_dw; + if (conv_dw->type != GGML_TYPE_F32) { + conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F32); } - - // conv_qk.1 (grouped, causal) — operate on {n_tokens, n_qk} format - { - ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); - - QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); - // conv output is {OL, OC, N} -> reshape to {OC, OL}, then add bias - QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); - QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); - cb(QK, "QK_grp", il); + conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk); + ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1); + if (layer.cca_conv_dw_b) { + QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1)); } + cb(QK, "QK_dw", il); - // QK is now [n_qk, n_tokens] + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1)); + cb(QK, "QK_grp", il); - // Split Q_conv, K_conv - ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, - QK->nb[1], 0); - ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, - QK->nb[1], n_embd_q * ggml_element_size(QK)); + QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); - // QK mean skip connection - ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); - ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0); 
+ ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, QK->nb[1], n_embd_q*ggml_element_size(QK)); + + ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens); + ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_add(ctx0, Qcur, qk_mean_q); + Kcur = ggml_add(ctx0, Kcur, qk_mean_k); + + Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head)); + Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head)); + Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1)); + cb(Qcur, "Qcur_pre_rope", il); + cb(Kcur, "Kcur_pre_rope", il); + + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - // RMSNorm on concat(Q, K) — weightless (unit RMSNorm) - ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); - QK_for_norm = build_norm(QK_for_norm, nullptr, nullptr, LLM_NORM_RMS, il); - cb(QK_for_norm, "QK_normed", il); - - // Split back - Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, - QK_for_norm->nb[1], 0); - Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, - QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); - - // Per-KV-head temperature scaling on K - // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens] - Kcur = ggml_cont(ctx0, Kcur); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - // cca_k_scale: [n_head_kv] → broadcast - Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); - cb(Kcur, "Kcur_scaled", 
il); - - // Reshape for attention - Qcur = ggml_cont(ctx0, Qcur); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Vcur = ggml_cont(ctx0, Vcur); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens); // GQA attention cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, @@ -244,77 +341,82 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params // ===== MoE Layer ===== // Build Zaya router network: - // down_proj → RMSNorm → SiLU(MLP0) → MLP2 → MLP4 → 17 logits → take first 16 + // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits. ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); cb(router_h, "router_down", il); + if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { + router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); + cb(router_h, "router_eda", il); + } + + prev_router = router_h; + router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); cb(router_h, "router_norm", il); router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); - router_h = ggml_silu(ctx0, router_h); + router_h = ggml_gelu(ctx0, router_h); cb(router_h, "router_mlp0", il); router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h); router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b); + router_h = ggml_gelu(ctx0, router_h); cb(router_h, "router_mlp2", il); router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); - // router_h now has shape [17, n_tokens] — 16 expert logits + 1 MOD skip cb(router_h, "router_logits", il); - // Take only the first 16 logits (expert routing), ignore MOD skip (index 16) - ggml_tensor * gate_inp = ggml_cont(ctx0, - ggml_view_2d(ctx0, 
router_h, n_expert, n_tokens, - router_h->nb[1], 0)); - cb(gate_inp, "gate_inp", il); + ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h); + cb(router_probs, "router_probs", il); + + // Keep the MOD skip expert in the softmax denominator, then route + // over real experts only. The checkpoint's skip bias keeps MOD unused. + ggml_tensor * gate_probs = ggml_cont(ctx0, + ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); + cb(gate_probs, "gate_probs", il); + + ggml_tensor * expert_biases = nullptr; + if (layer.zaya_router_biases != nullptr) { + expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0); + } - // MoE FFN with topk=1 (pass router logits as probs_in) cur = build_moe_ffn(cur, /* gate_inp */ nullptr, /* up_exps */ nullptr, /* gate_exps */ nullptr, /* down_exps */ layer.ffn_down_exps, - /* exp_probs_b */ nullptr, + /* exp_probs_b */ expert_biases, /* n_expert */ n_expert, /* n_expert_used */ hparams.n_expert_used, /* type_op */ LLM_FFN_SILU, /* norm_w */ false, /* w_scale */ 1.0f, - /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_NONE, /* il */ il, - /* probs_in */ gate_inp, + /* probs_in */ gate_probs, /* gate_up_exps */ layer.ffn_gate_up_exps); cb(cur, "moe_out", il); } - // select output tokens on last layer - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // Residual scaling: cur = hs_scale * cur + hs_bias - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.res_scale_hs), layer.res_scale_hs_b); - cb(cur, "scaled_out", il); - - // Residual scaling: inpSA = res_scale * inpSA + res_bias (if present) - if (layer.res_scale_res) { - inpSA = ggml_add(ctx0, ggml_mul(ctx0, inpSA, layer.res_scale_res), layer.res_scale_res_b); - cb(inpSA, "scaled_residual", il); - } - - // Residual add - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "l_out", il); - inpL = cur; } - 
cur = inpL; + ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); + if (residual != nullptr) { + residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); + cur = ggml_add(ctx0, final_hidden, residual); + } else { + cur = final_hidden; + } + cb(cur, "final_residual", -1); + + if (inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } // final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); From c3ff41c0263b0ee73cec52ffe12776111f196db1 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 10:31:16 +0200 Subject: [PATCH 07/33] refactor: replace CCA_CONV_DW with generic SSM_CONV1D constant - Remove LLM_TENSOR_CCA_CONV_DW and LLM_TENSOR_CCA_CONV_DW_B from llama-arch.h - Update tensor name mappings in llama-arch.cpp to use SSM_CONV1D - Remove CCA_CONV_DW and CCA_CONV_DW_B from gguf constants.py - Update MODEL_ARCH.ZAYA1 tensor list to use SSM_CONV1D - Update zaya.cpp to create tensors using LLM_TENSOR_SSM_CONV1D - Update convert_hf_to_gguf.py to map conv_qk.0 to SSM_CONV1D - Add HuggingFace tensor mapping for zaya conv_qk.0 to SSM_CONV1D This improves consistency by reusing the existing SSM_CONV1D constant that's already used by other SSM-based architectures (mamba, jamba, etc.) 
--- convert_hf_to_gguf.py | 4 ++-- gguf-py/gguf/constants.py | 7 +------ gguf-py/gguf/tensor_mapping.py | 1 + src/llama-arch.cpp | 4 ---- src/llama-arch.h | 2 -- src/models/zaya.cpp | 4 ++-- 6 files changed, 6 insertions(+), 16 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1e1adb10fe4..fc39900623e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6508,9 +6508,9 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st elif "conv_qk.0" in name and name.endswith(".weight"): # PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk} data_torch = data_torch.squeeze(1).contiguous() - yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.SSM_CONV1D, bid), data_torch elif "conv_qk.0" in name and name.endswith(".bias"): - yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.SSM_CONV1D, bid, suffix=".bias"), data_torch elif "conv_qk.1" in name and name.endswith(".weight"): # PyTorch: [n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk} yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 57a67cb559f..f3cba8fd7d0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -611,9 +611,7 @@ class MODEL_TENSOR(IntEnum): SSM_BETA = auto() # Kimi Linear qwen3.5 SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear - CCA_CONV_DW = auto() # Zaya CCA_CONV_GRP = auto() # Zaya - CCA_CONV_DW_B = auto() # Zaya: conv_qk.0.bias CCA_QK_NORM = auto() # Zaya (weightless - unit RMSNorm) CCA_K_SCALE = auto() # Zaya CCA_VAL_PROJ1 = auto() # Zaya: CCA value projection stream 1 @@ -1150,8 +1148,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear qwen3.5 MODEL_TENSOR.SSM_G_A: 
"blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear - MODEL_TENSOR.CCA_CONV_DW: "blk.{bid}.cca_conv_dw", # Zaya - MODEL_TENSOR.CCA_CONV_DW_B: "blk.{bid}.cca_conv_dw_b", # Zaya MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya @@ -4052,8 +4048,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, MODEL_TENSOR.ATTN_OUT, - MODEL_TENSOR.CCA_CONV_DW, - MODEL_TENSOR.CCA_CONV_DW_B, + MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.CCA_CONV_GRP, MODEL_TENSOR.CCA_QK_NORM, MODEL_TENSOR.CCA_K_SCALE, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index fbd22ccb6a3..f89f483635c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -800,6 +800,7 @@ class TensorNameMap: "model.layers.{bid}.mamba.conv1d", # jamba falcon-h1 granite-hybrid "model.layers.layers.{bid}.mixer.conv1d", # plamo2 "model.layers.{bid}.linear_attn.conv1d", # qwen3next + "model.layers.{bid}.self_attn.conv_qk.0", # zaya ), MODEL_TENSOR.SSM_X: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 9bdd0023028..fa10603eb6a 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -418,8 +418,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, - { LLM_TENSOR_CCA_CONV_DW, "blk.%d.cca_conv_dw" }, - { LLM_TENSOR_CCA_CONV_DW_B, "blk.%d.cca_conv_dw_b" }, { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, @@ -686,8 +684,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // ZAYA CCA - {LLM_TENSOR_CCA_CONV_DW, 
{LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, - {LLM_TENSOR_CCA_CONV_DW_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 30a3f9a444a..07078d15e60 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -446,8 +446,6 @@ enum llm_tensor { LLM_TENSOR_SSM_G_A, // kimi: output gate projection A LLM_TENSOR_SSM_G_B, // kimi: output gate projection B // ZAYA CCA (Compressed Convolutional Attention) - LLM_TENSOR_CCA_CONV_DW, // zaya: depthwise conv1d (conv_qk.0) - LLM_TENSOR_CCA_CONV_DW_B, // zaya: depthwise conv1d bias LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) LLM_TENSOR_CCA_K_SCALE, // zaya: learned K temperature diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 89e354450bb..da35514ce62 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -74,8 +74,8 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); - layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, n_qk}, 0); + layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); From a06da04c0f30d80d9734b36c0bbae8ada2c4674f Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 10:40:16 +0200 Subject: [PATCH 08/33] refactor: replace ZAYA_ROUTER_NORM with generic FFN_NORM 
constant - Remove LLM_TENSOR_ZAYA_ROUTER_NORM from llama-arch.h - Update tensor mappings in llama-arch.cpp to use FFN_NORM - Remove ZAYA_ROUTER_NORM from gguf constants.py - Update MODEL_ARCH.ZAYA1 tensor list to use FFN_NORM - Update zaya.cpp to create router norm tensor using LLM_TENSOR_FFN_NORM - Update convert_hf_to_gguf.py to map rmsnorm_eda to FFN_NORM - Add HuggingFace tensor mapping for zaya rmsnorm_eda to FFN_NORM Router normalization is a standard FFN norm (RMSNorm), making this a semantically correct replacement that reduces custom constants. --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/constants.py | 4 +--- gguf-py/gguf/tensor_mapping.py | 1 + src/llama-arch.cpp | 2 -- src/llama-arch.h | 1 - src/models/zaya.cpp | 2 +- 6 files changed, 4 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fc39900623e..bdc6bd24df5 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6525,7 +6525,7 @@ def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple elif "down_proj.bias" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch elif "rmsnorm_eda" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_NORM, bid), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch elif "router_mlp.0.weight" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch elif "router_mlp.0.bias" in name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index f3cba8fd7d0..993a676cd11 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -626,7 +626,6 @@ class MODEL_TENSOR(IntEnum): RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias ZAYA_ROUTER_DOWN = auto() # Zaya ZAYA_ROUTER_DOWN_B = auto() # Zaya - ZAYA_ROUTER_NORM = auto() # Zaya ZAYA_ROUTER_MLP0 = auto() # Zaya ZAYA_ROUTER_MLP0_B = auto() # Zaya ZAYA_ROUTER_MLP2 = auto() # 
Zaya @@ -1163,7 +1162,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya - MODEL_TENSOR.ZAYA_ROUTER_NORM: "blk.{bid}.zaya_router_norm", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP0: "blk.{bid}.zaya_router_mlp0", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya @@ -4064,7 +4062,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_RES_B_FINAL, MODEL_TENSOR.ZAYA_ROUTER_DOWN, MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, - MODEL_TENSOR.ZAYA_ROUTER_NORM, + MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.ZAYA_ROUTER_MLP0, MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, MODEL_TENSOR.ZAYA_ROUTER_MLP2, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f89f483635c..a2467f57132 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -403,6 +403,7 @@ class TensorNameMap: "model.layers.{bid}.feedforward_layernorm", # apertus "model.layers.{bid}.pre_mlp_layernorm", # kormo - "layers.{bid}.mlp_norm" # modern-bert + "layers.{bid}.mlp_norm", # modern-bert + "model.layers.{bid}.self_attn.rmsnorm_eda", # zaya ), # Pre feed-forward norm diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index fa10603eb6a..7a06904d17e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -433,7 +433,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, - { LLM_TENSOR_ZAYA_ROUTER_NORM, "blk.%d.zaya_router_norm" }, { LLM_TENSOR_ZAYA_ROUTER_MLP0, "blk.%d.zaya_router_mlp0" }, { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, @@ -699,7 +698,6 @@ static const std::map LLM_TENSOR_INFOS = {
{LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, - {LLM_TENSOR_ZAYA_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_ZAYA_ROUTER_MLP0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 07078d15e60..a186a39c4b0 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -463,7 +463,6 @@ enum llm_tensor { // ZAYA Router (MoE gating) LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias - LLM_TENSOR_ZAYA_ROUTER_NORM, // zaya: router rmsnorm_eda weight LLM_TENSOR_ZAYA_ROUTER_MLP0, // zaya: router MLP layer 0 weight LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 weight diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index da35514ce62..239da53ae99 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -97,7 +97,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { {n_embd, n_ff_exp}, 0); layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i), {n_ff_exp}, 0); - layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_NORM, "weight", i), + layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_ff_exp}, 0); layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i), {n_ff_exp, n_ff_exp}, 0); From ed3820b43ef9234dfca5e30c8a919a71ceb6d3ee Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 10:48:46 +0200 Subject: [PATCH 09/33] refactor: replace ZAYA_ROUTER_DOWN with generic FFN_GATE_INP constant - Remove 
LLM_TENSOR_ZAYA_ROUTER_DOWN from llama-arch.h - Update tensor mappings in llama-arch.cpp to use FFN_GATE_INP - Remove ZAYA_ROUTER_DOWN from gguf constants.py - Update MODEL_ARCH.ZAYA1 tensor list to use FFN_GATE_INP - Update zaya.cpp to create router down tensor using LLM_TENSOR_FFN_GATE_INP - Update convert_hf_to_gguf.py to map down_proj.weight to FFN_GATE_INP - Add HuggingFace tensor mapping for zaya router down_proj to FFN_GATE_INP Router down projection is a linear projection similar to MoE gate input, making this a semantically reasonable replacement. --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/constants.py | 4 +--- gguf-py/gguf/tensor_mapping.py | 1 + src/llama-arch.cpp | 2 -- src/llama-arch.h | 1 - src/models/zaya.cpp | 2 +- 6 files changed, 4 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index bdc6bd24df5..1ca26918b06 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6521,7 +6521,7 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: if "down_proj.weight" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN, bid), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid), data_torch elif "down_proj.bias" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch elif "rmsnorm_eda" in name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 993a676cd11..1d511fa6fbd 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -624,7 +624,6 @@ class MODEL_TENSOR(IntEnum): RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias - ZAYA_ROUTER_DOWN = auto() # Zaya ZAYA_ROUTER_DOWN_B = auto() # Zaya ZAYA_ROUTER_MLP0 = auto() # 
Zaya ZAYA_ROUTER_MLP0_B = auto() # Zaya @@ -1160,7 +1159,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_B_FINAL: "res_scale_hs_b", # Zaya MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya - MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP0: "blk.{bid}.zaya_router_mlp0", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya @@ -4060,7 +4058,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_B_FINAL, MODEL_TENSOR.RES_SCALE_RES_FINAL, MODEL_TENSOR.RES_SCALE_RES_B_FINAL, - MODEL_TENSOR.ZAYA_ROUTER_DOWN, + MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.ZAYA_ROUTER_MLP0, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a2467f57132..a3667c444dc 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -454,6 +454,7 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.gate", # nemotron-h-moe "model.layers.{bid}.moe.gate", # step3.5 "model.layers.{bid}.router.proj", # gemma4 + "model.layers.{bid}.self_attn.router_mlp.down_proj", # zaya ), MODEL_TENSOR.FFN_GATE_INP_SHEXP: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 7a06904d17e..b2777fb15c7 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -431,7 +431,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_RES_SCALE_HS_B_FINAL, "res_scale_hs_b" }, { LLM_TENSOR_RES_SCALE_RES_FINAL, "res_scale_res" }, { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, - { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP0, "blk.%d.zaya_router_mlp0" }, { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, @@ -696,7 +695,6 @@ static const std::map 
LLM_TENSOR_INFOS = { {LLM_TENSOR_RES_SCALE_HS_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_RES_SCALE_RES_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, - {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index a186a39c4b0..d0fb4c67cc7 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -461,7 +461,6 @@ enum llm_tensor { LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias // ZAYA Router (MoE gating) - LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias LLM_TENSOR_ZAYA_ROUTER_MLP0, // zaya: router MLP layer 0 weight LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 239da53ae99..bf188000ad2 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -93,7 +93,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // MoE layers (odd indices) if (i % 2 == 1) { // Router network - layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN, "weight", i), + layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_ff_exp}, 0); layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i), {n_ff_exp}, 0); From 7de270f12c0534a855542e23402c4ed74be53210 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 10:55:23 +0200 Subject: [PATCH 10/33] refactor: replace ZAYA_ROUTER_MLP0 with generic FFN_GATE constant - Remove LLM_TENSOR_ZAYA_ROUTER_MLP0 from llama-arch.h - 
Update tensor mappings in llama-arch.cpp to use FFN_GATE - Remove ZAYA_ROUTER_MLP0 from gguf constants.py - Update MODEL_ARCH.ZAYA1 tensor list to use FFN_GATE - Update zaya.cpp to create router mlp0 tensor using LLM_TENSOR_FFN_GATE - Update convert_hf_to_gguf.py to map router_mlp.0.weight to FFN_GATE - Add HuggingFace tensor mapping for zaya router_mlp.0 to FFN_GATE Router MLP hidden layer is a linear projection similar to FFN gate, making this a reasonable replacement for reducing custom constants. --- convert_hf_to_gguf.py | 2 +- gguf-py/gguf/constants.py | 4 +--- gguf-py/gguf/tensor_mapping.py | 1 + src/llama-arch.cpp | 2 -- src/llama-arch.h | 1 - src/models/zaya.cpp | 2 +- 6 files changed, 4 insertions(+), 8 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1ca26918b06..1f8fb5c1280 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6527,7 +6527,7 @@ def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple elif "rmsnorm_eda" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch elif "router_mlp.0.weight" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch elif "router_mlp.0.bias" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch elif "router_mlp.2.weight" in name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1d511fa6fbd..494ca5fe0fe 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -625,7 +625,6 @@ class MODEL_TENSOR(IntEnum): RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias ZAYA_ROUTER_DOWN_B = auto() # Zaya - ZAYA_ROUTER_MLP0 = auto() # Zaya ZAYA_ROUTER_MLP0_B = auto() # Zaya ZAYA_ROUTER_MLP2 = auto() # Zaya ZAYA_ROUTER_MLP2_B = auto() # Zaya @@ -1160,7 +1159,6 @@ class 
MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya - MODEL_TENSOR.ZAYA_ROUTER_MLP0: "blk.{bid}.zaya_router_mlp0", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP2_B: "blk.{bid}.zaya_router_mlp2_b", # Zaya @@ -4061,7 +4059,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.ZAYA_ROUTER_MLP0, + MODEL_TENSOR.FFN_GATE, MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, MODEL_TENSOR.ZAYA_ROUTER_MLP2, MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a3667c444dc..41cd9262434 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -567,6 +567,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.ff_proj", # llada "layers.{bid}.mlp.gate_proj", # qwen3-embedding "model.layers.{bid}.mlp.language_mlp.gate_proj", # cogvlm + "model.layers.{bid}.self_attn.router_mlp.0", # zaya ), MODEL_TENSOR.FFN_GATE_EXP: ( diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index b2777fb15c7..f8c3f57cb69 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -432,7 +432,6 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_RES_SCALE_RES_FINAL, "res_scale_res" }, { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, - { LLM_TENSOR_ZAYA_ROUTER_MLP0, "blk.%d.zaya_router_mlp0" }, { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, { LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "blk.%d.zaya_router_mlp2_b" }, @@ -696,7 +695,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_RES_SCALE_RES_FINAL, 
{LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, - {LLM_TENSOR_ZAYA_ROUTER_MLP0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_MLP2_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index d0fb4c67cc7..20ee10a7402 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -462,7 +462,6 @@ enum llm_tensor { LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias // ZAYA Router (MoE gating) LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias - LLM_TENSOR_ZAYA_ROUTER_MLP0, // zaya: router MLP layer 0 weight LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 weight LLM_TENSOR_ZAYA_ROUTER_MLP2_B, // zaya: router MLP layer 2 bias diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index bf188000ad2..0f55d6570f7 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -99,7 +99,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { {n_ff_exp}, 0); layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_ff_exp}, 0); - layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i), + layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_ff_exp, n_ff_exp}, 0); layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i), {n_ff_exp}, 0); From a5c885bcc578663a0b0a3ec8579b453bfbc77774 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 11:08:26 +0200 Subject: [PATCH 11/33] refactor: merge RES_SCALE_*_B bias constants into RES_SCALE_* constants - Remove LLM_TENSOR_RES_SCALE_HS_B, RES_SCALE_RES_B, 
RES_SCALE_HS_B_FINAL, RES_SCALE_RES_B_FINAL - Use single RES_SCALE_HS for both weight and bias (same for RES_SCALE_RES) - Update tensor mappings in llama-arch.cpp - Remove bias constants from gguf constants.py - Update MODEL_ARCH.ZAYA1 tensor list - Update zaya.cpp to create bias tensors using same constant with 'bias' suffix - Update convert_hf_to_gguf.py to map bias tensors with .bias suffix This reduces 8 custom ZAYA constants to 4 by reusing the same constant for both weight and bias tensors, differentiated by suffix. --- convert_hf_to_gguf.py | 8 ++++---- gguf-py/gguf/constants.py | 20 ++++---------------- src/llama-arch.cpp | 8 -------- src/llama-arch.h | 12 ++++-------- src/models/zaya.cpp | 8 ++++---- 5 files changed, 16 insertions(+), 40 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 1f8fb5c1280..382a3abcb6a 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6545,21 +6545,21 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu if "hidden_states_scale" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch elif "hidden_states_bias" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B, bid, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid, suffix=".bias"), data_torch elif "residual_scale" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch elif "residual_bias" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid, suffix=".bias"), data_torch def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]: if "hidden_states_scale" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch elif "hidden_states_bias" in name: - yield 
self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL, suffix=".bias"), data_torch elif "residual_scale" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch elif "residual_bias" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL, suffix=".bias"), data_torch def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Common tensors diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 494ca5fe0fe..b42c58f1b2f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -616,14 +616,10 @@ class MODEL_TENSOR(IntEnum): CCA_K_SCALE = auto() # Zaya CCA_VAL_PROJ1 = auto() # Zaya: CCA value projection stream 1 CCA_VAL_PROJ2 = auto() # Zaya: CCA value projection stream 2 - RES_SCALE_HS = auto() # Zaya: hidden_states_scale - RES_SCALE_HS_B = auto() # Zaya: hidden_states_bias - RES_SCALE_RES = auto() # Zaya: residual_scale - RES_SCALE_RES_B = auto() # Zaya: residual_bias - RES_SCALE_HS_FINAL = auto() # Zaya: final hidden_states_scale - RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias - RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale - RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias + RES_SCALE_HS = auto() # Zaya: hidden_states_scale (+ bias) + RES_SCALE_RES = auto() # Zaya: residual_scale (+ bias) + RES_SCALE_HS_FINAL = auto() # Zaya: final hidden_states_scale (+ bias) + RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale (+ bias) ZAYA_ROUTER_DOWN_B = auto() # Zaya ZAYA_ROUTER_MLP0_B = auto() # Zaya ZAYA_ROUTER_MLP2 = auto() # Zaya @@ -1151,13 +1147,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CCA_VAL_PROJ1: "blk.{bid}.cca_val_proj1", # Zaya MODEL_TENSOR.CCA_VAL_PROJ2: 
"blk.{bid}.cca_val_proj2", # Zaya MODEL_TENSOR.RES_SCALE_HS: "blk.{bid}.res_scale_hs", # Zaya - MODEL_TENSOR.RES_SCALE_HS_B: "blk.{bid}.res_scale_hs_b", # Zaya MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya - MODEL_TENSOR.RES_SCALE_RES_B: "blk.{bid}.res_scale_res_b", # Zaya MODEL_TENSOR.RES_SCALE_HS_FINAL: "res_scale_hs", # Zaya - MODEL_TENSOR.RES_SCALE_HS_B_FINAL: "res_scale_hs_b", # Zaya MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya - MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya @@ -4049,13 +4041,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.CCA_VAL_PROJ1, MODEL_TENSOR.CCA_VAL_PROJ2, MODEL_TENSOR.RES_SCALE_HS, - MODEL_TENSOR.RES_SCALE_HS_B, MODEL_TENSOR.RES_SCALE_RES, - MODEL_TENSOR.RES_SCALE_RES_B, MODEL_TENSOR.RES_SCALE_HS_FINAL, - MODEL_TENSOR.RES_SCALE_HS_B_FINAL, MODEL_TENSOR.RES_SCALE_RES_FINAL, - MODEL_TENSOR.RES_SCALE_RES_B_FINAL, MODEL_TENSOR.FFN_GATE_INP, MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, MODEL_TENSOR.FFN_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f8c3f57cb69..5af26e8e107 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -424,13 +424,9 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_CCA_VAL_PROJ1, "blk.%d.cca_val_proj1" }, { LLM_TENSOR_CCA_VAL_PROJ2, "blk.%d.cca_val_proj2" }, { LLM_TENSOR_RES_SCALE_HS, "blk.%d.res_scale_hs" }, - { LLM_TENSOR_RES_SCALE_HS_B, "blk.%d.res_scale_hs_b" }, { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, - { LLM_TENSOR_RES_SCALE_RES_B, "blk.%d.res_scale_res_b" }, { LLM_TENSOR_RES_SCALE_HS_FINAL, "res_scale_hs" }, - { LLM_TENSOR_RES_SCALE_HS_B_FINAL, "res_scale_hs_b" }, { LLM_TENSOR_RES_SCALE_RES_FINAL, "res_scale_res" }, - { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, 
"blk.%d.zaya_router_down_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, @@ -687,13 +683,9 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_CCA_VAL_PROJ1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CCA_VAL_PROJ2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_RES_SCALE_HS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_RES_SCALE_HS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_RES_SCALE_RES_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_RES_SCALE_HS_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, - {LLM_TENSOR_RES_SCALE_HS_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_RES_SCALE_RES_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, - {LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 20ee10a7402..1e69f62ed53 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -452,14 +452,10 @@ enum llm_tensor { LLM_TENSOR_CCA_VAL_PROJ1, // zaya: V projection 1 LLM_TENSOR_CCA_VAL_PROJ2, // zaya: V projection 2 // ZAYA residual scaling - LLM_TENSOR_RES_SCALE_HS, // zaya: hidden_states_scale - LLM_TENSOR_RES_SCALE_HS_B, // zaya: hidden_states_bias - LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale - LLM_TENSOR_RES_SCALE_RES_B, // zaya: residual_bias - LLM_TENSOR_RES_SCALE_HS_FINAL, // zaya: final hidden_states_scale - LLM_TENSOR_RES_SCALE_HS_B_FINAL,// zaya: final hidden_states_bias - LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale - LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias + LLM_TENSOR_RES_SCALE_HS, 
// zaya: hidden_states_scale (+ bias) + LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale (+ bias) + LLM_TENSOR_RES_SCALE_HS_FINAL, // zaya: final hidden_states_scale (+ bias) + LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale (+ bias) // ZAYA Router (MoE gating) LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 0f55d6570f7..8ca33296407 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -39,9 +39,9 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { } zaya_res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); - zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); zaya_res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); - zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; @@ -86,9 +86,9 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // Residual scaling layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); - layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B, "bias", i), {n_embd}, 0); + layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); - layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B, 
"bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); // MoE layers (odd indices) if (i % 2 == 1) { From 45bf02136822992127fa8ee722c788f1590827bd Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 11:20:05 +0200 Subject: [PATCH 12/33] refactor: merge router bias constants into parent constants - Remove ZAYA_ROUTER_DOWN_B, ZAYA_ROUTER_MLP0_B, ZAYA_ROUTER_MLP2_B - Use FFN_GATE_INP for both router down weight and bias - Use FFN_GATE for both router mlp0 weight and bias - Use ZAYA_ROUTER_MLP2 for both router mlp2 weight and bias - Update tensor mappings in llama-arch.cpp - Remove bias constants from gguf constants.py - Update MODEL_ARCH.ZAYA1 tensor list - Update zaya.cpp to create bias tensors using same constant with 'bias' suffix - Update convert_hf_to_gguf.py to map bias tensors with .bias suffix - Add ZAYA_ROUTER_MLP2 tensor mapping for HuggingFace auto-detection This reduces 3 more custom constants by reusing the same constant for both weight and bias tensors, differentiated by suffix. 
--- convert_hf_to_gguf.py | 6 +++--- gguf-py/gguf/constants.py | 11 +---------- gguf-py/gguf/tensor_mapping.py | 3 +++ src/llama-arch.cpp | 6 ------ src/llama-arch.h | 5 +---- src/models/zaya.cpp | 12 ++++++------ 6 files changed, 14 insertions(+), 29 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 382a3abcb6a..dda858537f9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6523,17 +6523,17 @@ def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple if "down_proj.weight" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid), data_torch elif "down_proj.bias" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, suffix=".bias"), data_torch elif "rmsnorm_eda" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch elif "router_mlp.0.weight" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch elif "router_mlp.0.bias" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".bias"), data_torch elif "router_mlp.2.weight" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch elif "router_mlp.2.bias" in name: - yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, bid, suffix=".bias"), data_torch + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid, suffix=".bias"), data_torch elif "router_mlp.4.weight" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch elif "balancing_biases" in name: diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index b42c58f1b2f..a979d89c577 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -620,10 
+620,7 @@ class MODEL_TENSOR(IntEnum): RES_SCALE_RES = auto() # Zaya: residual_scale (+ bias) RES_SCALE_HS_FINAL = auto() # Zaya: final hidden_states_scale (+ bias) RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale (+ bias) - ZAYA_ROUTER_DOWN_B = auto() # Zaya - ZAYA_ROUTER_MLP0_B = auto() # Zaya - ZAYA_ROUTER_MLP2 = auto() # Zaya - ZAYA_ROUTER_MLP2_B = auto() # Zaya + ZAYA_ROUTER_MLP2 = auto() # Zaya: router MLP layer 2 (+ bias) ZAYA_ROUTER_MLP4 = auto() # Zaya ZAYA_ROUTER_BIASES = auto() # Zaya ZAYA_ROUTER_EDA_SCALE = auto() # Zaya @@ -1150,10 +1147,7 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya MODEL_TENSOR.RES_SCALE_HS_FINAL: "res_scale_hs", # Zaya MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya - MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya - MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya - MODEL_TENSOR.ZAYA_ROUTER_MLP2_B: "blk.{bid}.zaya_router_mlp2_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_MLP4: "blk.{bid}.zaya_router_mlp4", # Zaya MODEL_TENSOR.ZAYA_ROUTER_BIASES: "blk.{bid}.zaya_router_biases", # Zaya MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE: "blk.{bid}.zaya_router_eda", # Zaya @@ -4045,12 +4039,9 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_FINAL, MODEL_TENSOR.RES_SCALE_RES_FINAL, MODEL_TENSOR.FFN_GATE_INP, - MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, MODEL_TENSOR.FFN_NORM, MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, MODEL_TENSOR.ZAYA_ROUTER_MLP2, - MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, MODEL_TENSOR.ZAYA_ROUTER_MLP4, MODEL_TENSOR.ZAYA_ROUTER_BIASES, MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 41cd9262434..5d235e46f58 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -171,6 +171,9 @@ class TensorNameMap: MODEL_TENSOR.A_QF_PROJ_LINEAR: ( "projector.linear", ), + 
MODEL_TENSOR.ZAYA_ROUTER_MLP2: ( + "model.layers.{bid}.self_attn.router_mlp.2", # zaya + ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 5af26e8e107..f3031dc32fc 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -427,10 +427,7 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, { LLM_TENSOR_RES_SCALE_HS_FINAL, "res_scale_hs" }, { LLM_TENSOR_RES_SCALE_RES_FINAL, "res_scale_res" }, - { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, - { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, - { LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "blk.%d.zaya_router_mlp2_b" }, { LLM_TENSOR_ZAYA_ROUTER_MLP4, "blk.%d.zaya_router_mlp4" }, { LLM_TENSOR_ZAYA_ROUTER_BIASES, "blk.%d.zaya_router_biases" }, { LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "blk.%d.zaya_router_eda" }, @@ -686,10 +683,7 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_RES_SCALE_HS_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, {LLM_TENSOR_RES_SCALE_RES_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, - {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, - {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, - {LLM_TENSOR_ZAYA_ROUTER_MLP2_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_MLP4, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_BIASES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 1e69f62ed53..4b2c8f83314 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -457,10 +457,7 @@ enum llm_tensor { LLM_TENSOR_RES_SCALE_HS_FINAL, // 
zaya: final hidden_states_scale (+ bias) LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale (+ bias) // ZAYA Router (MoE gating) - LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias - LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias - LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 weight - LLM_TENSOR_ZAYA_ROUTER_MLP2_B, // zaya: router MLP layer 2 bias + LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 weight (+ bias) LLM_TENSOR_ZAYA_ROUTER_MLP4, // zaya: router MLP layer 4 weight LLM_TENSOR_ZAYA_ROUTER_BIASES, // zaya: router balancing_biases LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, // zaya: router router_states_scale diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 8ca33296407..9b240884eed 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -95,18 +95,18 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // Router network layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_ff_exp}, 0); - layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i), - {n_ff_exp}, 0); + layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), + {n_ff_exp}, TENSOR_NOT_REQUIRED); layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_ff_exp}, 0); layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_ff_exp, n_ff_exp}, 0); - layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i), - {n_ff_exp}, 0); + layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), + {n_ff_exp}, TENSOR_NOT_REQUIRED); layer.zaya_router_mlp2 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "weight", i), {n_ff_exp, n_ff_exp}, 0); - layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "bias", i), - {n_ff_exp}, 0); + layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "bias", i), + {n_ff_exp}, 
TENSOR_NOT_REQUIRED); layer.zaya_router_mlp4 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i), {n_ff_exp, n_expert + 1}, 0); layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i), From fede4c6774f2d9d3bee77bc70c16fca051917f33 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 12:24:01 +0200 Subject: [PATCH 13/33] zaya: remove unused CCA_QK_NORM tensor constant --- gguf-py/gguf/constants.py | 3 --- src/llama-arch.cpp | 2 -- src/llama-arch.h | 1 - src/llama-model.h | 1 - 4 files changed, 7 deletions(-) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a979d89c577..eeb14f6aa76 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -612,7 +612,6 @@ class MODEL_TENSOR(IntEnum): SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear CCA_CONV_GRP = auto() # Zaya - CCA_QK_NORM = auto() # Zaya (weightless - unit RMSNorm) CCA_K_SCALE = auto() # Zaya CCA_VAL_PROJ1 = auto() # Zaya: CCA value projection stream 1 CCA_VAL_PROJ2 = auto() # Zaya: CCA value projection stream 2 @@ -1139,7 +1138,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya - MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya MODEL_TENSOR.CCA_VAL_PROJ1: "blk.{bid}.cca_val_proj1", # Zaya MODEL_TENSOR.CCA_VAL_PROJ2: "blk.{bid}.cca_val_proj2", # Zaya @@ -4030,7 +4028,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.SSM_CONV1D, MODEL_TENSOR.CCA_CONV_GRP, - MODEL_TENSOR.CCA_QK_NORM, MODEL_TENSOR.CCA_K_SCALE, MODEL_TENSOR.CCA_VAL_PROJ1, MODEL_TENSOR.CCA_VAL_PROJ2, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index f3031dc32fc..e0ac2a625dc 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -419,7 +419,6 @@ static const std::map LLM_TENSOR_NAMES = { { 
LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, - { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, { LLM_TENSOR_CCA_VAL_PROJ1, "blk.%d.cca_val_proj1" }, { LLM_TENSOR_CCA_VAL_PROJ2, "blk.%d.cca_val_proj2" }, @@ -675,7 +674,6 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // ZAYA CCA {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, - {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_VAL_PROJ1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_CCA_VAL_PROJ2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 4b2c8f83314..3809afc124c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -447,7 +447,6 @@ enum llm_tensor { LLM_TENSOR_SSM_G_B, // kimi: output gate projection B // ZAYA CCA (Compressed Convolutional Attention) LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) - LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) LLM_TENSOR_CCA_K_SCALE, // zaya: learned K temperature LLM_TENSOR_CCA_VAL_PROJ1, // zaya: V projection 1 LLM_TENSOR_CCA_VAL_PROJ2, // zaya: V projection 2 diff --git a/src/llama-model.h b/src/llama-model.h index 01ce976fe3e..1a61503f3b0 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -482,7 +482,6 @@ struct llama_layer { struct ggml_tensor * cca_conv_dw_b = nullptr; // depthwise conv bias struct ggml_tensor * cca_conv_grp = nullptr; // grouped conv (conv_qk.1) struct ggml_tensor * cca_conv_grp_b = nullptr; // grouped conv bias - struct ggml_tensor * cca_qk_norm = nullptr; // RMSNorm on concat(Q,K) struct ggml_tensor * cca_k_scale = nullptr; // learned K temperature struct ggml_tensor * cca_val_proj1 = nullptr; // V projection stream 1 
struct ggml_tensor * cca_val_proj2 = nullptr; // V projection stream 2 From 2069583f8e4094b15345b092bd3771b159607a86 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 12:28:54 +0200 Subject: [PATCH 14/33] zaya: remove dead ZAYA_ROUTER_MLP2 mapping from non-block config --- gguf-py/gguf/tensor_mapping.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 5d235e46f58..41cd9262434 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -171,9 +171,6 @@ class TensorNameMap: MODEL_TENSOR.A_QF_PROJ_LINEAR: ( "projector.linear", ), - MODEL_TENSOR.ZAYA_ROUTER_MLP2: ( - "model.layers.{bid}.self_attn.router_mlp.2", # zaya - ), } block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = { From 356e9620462235827eff0b9c060013ad2870b41c Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 12:42:14 +0200 Subject: [PATCH 15/33] zaya: revert unrelated debug.cpp changes --- common/debug.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/common/debug.cpp b/common/debug.cpp index 60cb5fd9b4a..102c6924dc9 100644 --- a/common/debug.cpp +++ b/common/debug.cpp @@ -144,6 +144,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (common_debug_cb_user_data *) user_data; auto * pimpl = cb_data->pimpl.get(); + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + + if (ask) { + return true; // Always retrieve data + } + bool matches_filter = pimpl->tensor_filters.empty(); if (!matches_filter) { @@ -155,13 +162,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } } - if (ask) { - return matches_filter; - } - - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - char src1_str[128] = { 0 }; if (src1) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, 
common_ggml_ne_string(src1).c_str()); From 81d727f0af3894029575b74e9444a2065ac84edd Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 17:01:43 +0200 Subject: [PATCH 16/33] zaya: replace hardcoded n_ff_exp with GGUF metadata Remove hardcoded 256 value for router MLP hidden size and read it from the GGUF expert_feed_forward_length metadata key instead. The converter now writes zaya_mlp_expansion from config.json. --- convert_hf_to_gguf.py | 4 ++++ src/models/zaya.cpp | 5 +++-- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index dda858537f9..2054515da19 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6494,6 +6494,10 @@ def set_gguf_parameters(self): n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1 self.gguf_writer.add_expert_used_count(n_expert_used) + # Router MLP hidden size (zaya_mlp_expansion) + n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256) + self.gguf_writer.add_expert_feed_forward_length(n_ff_exp) + def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: if "linear_q" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 9b240884eed..63d6e197975 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -8,6 +8,7 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_EXPERT_FEED_FORWARD_LENGTH, hparams.n_ff_exp, false); const uint32_t n_qk = (hparams.n_head() + hparams.n_head_kv()) * hparams.n_embd_head_k(); hparams.ssm_d_inner = 2*n_qk + hparams.n_embd; // CCA conv state + delayed value stream state @@ -45,8 +46,8 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { const int64_t n_embd_head = 
hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; - // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B) - const int64_t n_ff_exp = 256; + // Router MLP hidden size (zaya_mlp_expansion) + const int64_t n_ff_exp = hparams.n_ff_exp; for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; From 45d78817343110a8f4018e977b58dc6471f624db Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 15 May 2026 17:04:23 +0200 Subject: [PATCH 17/33] zaya: fix val_proj dimensions to use n_embd_k / 2 instead of n_embd_head val_proj1 and val_proj2 output dimension should be latent_k_dim / 2 (n_embd_k / 2) as per vLLM reference, not n_embd_head. Currently both are equal for ZAYA1-8B (n_head_kv=2), but this would break for any other n_head_kv configuration. --- src/models/zaya.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 63d6e197975..a7561bd8c90 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -69,9 +69,9 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), - {n_embd, n_embd_head}, 0); + {n_embd, n_embd_k / 2}, 0); layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), - {n_embd, n_embd_head}, 0); + {n_embd, n_embd_k / 2}, 0); layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); From 800fbe8ffdd4964690fe656dc1f18e82a2fd1913 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Sat, 16 May 2026 11:08:03 +0200 Subject: [PATCH 18/33] quant: exclude Zaya cca_conv_grp tensors from quantization Follows the same pattern as Mamba ssm_conv1d, Kimi shortconv, and RWKV time_mix tensors. These small conv weights (d_conv=2) are not divisible by quant block sizes (32), causing Q8_0 failures. 
--- src/llama-quant.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp index 43e05c3d56f..bec2f15eb45 100644 --- a/src/llama-quant.cpp +++ b/src/llama-quant.cpp @@ -322,6 +322,9 @@ static bool tensor_allows_quantization(const llama_model_quantize_params * param quantize &= name.find("ssm_conv1d") == std::string::npos; quantize &= name.find("shortconv.conv.weight") == std::string::npos; + // do not quantize Zaya's small grouped conv1d weights (d_conv=2) + quantize &= name.find("cca_conv_grp") == std::string::npos; + // do not quantize RWKV's small yet 2D weights quantize &= name.find("time_mix_first.weight") == std::string::npos; quantize &= name.find("time_mix_w0.weight") == std::string::npos; From f2efd8c70f7f6857519d7c424fcf62ca113246c5 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Sat, 16 May 2026 20:56:01 +0200 Subject: [PATCH 19/33] zaya: cast conv kernels to F16 for CPU backend compatibility ggml_im2col on CPU requires F16 kernel weights. Cast cca_conv_dw and cca_conv_grp to F16 before convolution to support quantized models (Q4, Q8). CUDA/SYCL backends are unaffected since their im2col implementation only reads kernel dimensions, not data. 
--- src/models/zaya.cpp | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index a7561bd8c90..cda5abeea4c 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -288,8 +288,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); ggml_tensor * conv_dw = layer.cca_conv_dw; - if (conv_dw->type != GGML_TYPE_F32) { - conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F32); + if (conv_dw->type != GGML_TYPE_F16) { + conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F16); } conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk); ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1); @@ -298,7 +298,11 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params } cb(QK, "QK_dw", il); - QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK, 1, 0, 1, n_groups); + ggml_tensor * conv_grp = layer.cca_conv_grp; + if (conv_grp->type != GGML_TYPE_F16) { + conv_grp = ggml_cast(ctx0, conv_grp, GGML_TYPE_F16); + } + QK = ggml_conv_1d_grouped(ctx0, conv_grp, QK, 1, 0, 1, n_groups); QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1)); cb(QK, "QK_grp", il); From 3aaab7f7bb3be1b13cb9cec71182f642986a38bd Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Sat, 16 May 2026 22:35:00 +0200 Subject: [PATCH 20/33] zaya: add ggml_cont for ROCm/compatibility with non-contiguous tensors ROCm and Vulkan backends require contiguous tensors for im2col and mul_mat operations. Add ggml_cont after ggml_cast for conv kernels and after ggml_concat for hs_d to ensure compatibility across all backends. CUDA was unaffected since it handles non-contiguous tensors more permissively. 
--- src/models/zaya.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index cda5abeea4c..f5fc16b8899 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -229,7 +229,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params 0); hs_d = ggml_concat(ctx0, hs_d, cur_shift, 1); } - hs_d = ggml_reshape_2d(ctx0, hs_d, n_embd, n_tokens); + hs_d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, hs_d), n_embd, n_tokens); cb(hs_d, "cca_hs_d", il); // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens] @@ -289,7 +289,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * conv_dw = layer.cca_conv_dw; if (conv_dw->type != GGML_TYPE_F16) { - conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F16); + conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F16)); } conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk); ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1); @@ -300,7 +300,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * conv_grp = layer.cca_conv_grp; if (conv_grp->type != GGML_TYPE_F16) { - conv_grp = ggml_cast(ctx0, conv_grp, GGML_TYPE_F16); + conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16)); } QK = ggml_conv_1d_grouped(ctx0, conv_grp, QK, 1, 0, 1, n_groups); QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1)); From c8d3a6c93685b2ffa2daf3a91ab524b5e02d25f6 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Sun, 17 May 2026 18:33:18 +0200 Subject: [PATCH 21/33] zaya: fix server compatibility with batched inference - Add ggml_cont(prev_hs) for non-contiguous tensor view (n_seqs > 1) - Replace ggml_conv_1d_dw with ggml_ssm_conv for proper batch support - Cast conv kernel to F32 and permute output shape ggml_conv_1d_dw does not support n_seqs > 1 (assert b->ne[3] == 1). 
Use ggml_ssm_conv which is designed for SSM models with batching. --- ggml/src/ggml-cuda/ssm-conv.cu | 3 ++- src/models/zaya.cpp | 14 +++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 4841389fbc8..f983869c215 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -140,11 +140,12 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa }; switch (nc) { + case 2: launch_kernel(std::integral_constant{}); break; case 3: launch_kernel(std::integral_constant{}); break; case 4: launch_kernel(std::integral_constant{}); break; case 5: launch_kernel(std::integral_constant{}); break; case 9: launch_kernel(std::integral_constant{}); break; - default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now."); + default: GGML_ABORT("Only support kernel sizes 2, 3, 4, 5, 9 right now."); } } diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index f5fc16b8899..ce65c2281fa 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -221,7 +221,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * cur_state_src = ggml_cont(ctx0, cur); ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs); - ggml_tensor * hs_d = ggml_reshape_3d(ctx0, prev_hs, n_embd, 1, n_seqs); + ggml_tensor * hs_d = ggml_reshape_3d(ctx0, ggml_cont(ctx0, prev_hs), n_embd, 1, n_seqs); if (n_seq_tokens > 1) { ggml_tensor * cur_shift = ggml_view_3d(ctx0, cur_seq, n_embd, n_seq_tokens - 1, n_seqs, cur_seq->nb[1], @@ -288,11 +288,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); ggml_tensor * conv_dw = layer.cca_conv_dw; - if (conv_dw->type != GGML_TYPE_F16) { - conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F16)); + if (conv_dw->type != 
GGML_TYPE_F32) { + conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F32)); } - conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk); - ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1); + // conv_input is [L, n_qk, n_seqs], ssm_conv outputs [n_qk, n_tokens, n_seqs] + ggml_tensor * QK = ggml_ssm_conv(ctx0, conv_input, conv_dw); + // permute from [n_qk, n_tokens, n_seqs] to [n_tokens, n_qk, n_seqs] + QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); if (layer.cca_conv_dw_b) { QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1)); } @@ -307,6 +309,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(QK, "QK_grp", il); QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); + // QK is now [n_qk, n_seq_tokens, n_seqs] + // Flatten to 2D: [n_qk, n_tokens] where n_tokens = n_seq_tokens * n_seqs QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0); From 7c5cc5305288b07afde5c7505b529342fcd559c1 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Mon, 18 May 2026 20:19:33 +0200 Subject: [PATCH 22/33] fix(zaya): use actual tokenizer vocab size instead of config vocab_size The model's config.json reports vocab_size=262272 but the actual tokenizer only has 262147 tokens. The 125 extra entries are padding in PyTorch's embed_tokens.weight matrix that don't correspond to any real tokens. Use the pre-computed _tokenizer_vocab_size to write the correct vocab size in the GGUF metadata, matching llama.cpp's actual tokenizer vocabulary. 
--- convert_hf_to_gguf.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 2054515da19..5ce42b465c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6473,7 +6473,9 @@ def __init__(self, *args, **kwargs): def set_gguf_parameters(self): super().set_gguf_parameters() - self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + # Use actual tokenizer vocab size if available, fallback to config vocab_size + vocab_size = self._tokenizer_vocab_size if self._tokenizer_vocab_size is not None else self.hparams["vocab_size"] + self.gguf_writer.add_vocab_size(vocab_size) # n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate) n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2 From f1bd772a37f3407d8fd2809edef1f0e0d245b760 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Thu, 21 May 2026 22:50:15 +0200 Subject: [PATCH 23/33] docs(zaya): add Python reference comments to C++ implementation Add detailed inline comments mapping each C++ code section to the corresponding zaya.py and cca.py Python lines, including code snippets for direct comparison. 
--- src/models/zaya.cpp | 488 +++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 461 insertions(+), 27 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index ce65c2281fa..e2dd6e52bcf 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -5,6 +5,25 @@ #include +/* + * zaya.py ref: L52-81 (ResidualScaling class) + * + * class ResidualScaling(nn.Module): + * def __init__(self, config, layer_n, ...): + * self.not_first_layer = (layer_n != 0) + * self.hidden_states_scale = torch.nn.Parameter(torch.ones(config.hidden_size)) + * self.hidden_states_bias = torch.nn.Parameter(torch.zeros(config.hidden_size)) + * if self.not_first_layer: + * self.residual_scale = torch.nn.Parameter(torch.ones(config.hidden_size)) + * self.residual_bias = torch.nn.Parameter(torch.zeros(config.hidden_size)) + * + * def forward(self, residual, hidden_states): + * hidden_states = (hidden_states.float() + hs_bias) * hs_scale + * if self.not_first_layer and residual is not None: + * residual = (residual.float() + res_bias) * res_scale + * return residual, hidden_states + */ + void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); @@ -15,6 +34,15 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { hparams.ssm_d_state = 1; hparams.ssm_n_group = 0; + /* + * zaya.py ref: L575-602 (layer alternation) + * + * for layer_n in range(config.num_hidden_layers): + * if layer_n % 2 == 1: + * self.layers.append(ZayaDecoderMLPLayer(...)) # MoE layer + * else: + * self.layers.append(ZayaDecoderATTLayer(...)) # Attention layer + */ for (uint32_t i = 0; i < hparams.n_layer; ++i) { hparams.recurrent_layer_arr[i] = (i % 2) == 0; } @@ -28,17 +56,42 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { void llama_model_zaya::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; + /* + * zaya.py ref: 
L569-573 + * + * self.embed_tokens = VocabParallelEmbedding( + * self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size) + */ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output norm + /* + * zaya.py ref: L608-613 + * + * if (config.normalization == "RMSNorm"): + * self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon) + * elif (config.normalization == "LayerNorm"): + * self.final_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon) + */ output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - // output (tied with tok_embd if not present) + /* + * zaya.py ref: L729-743 + * + * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...) + * if self.config.tie_word_embeddings: + * self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) + */ output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); if (output == nullptr) { - output = tok_embd; + output = tok_embd; // tied weights } + /* + * zaya.py ref: L605-606 (final ResidualScaling after all layers) + * + * if self.config.scale_residual_merge: + * self.res_scale = ResidualScaling(config, config.num_hidden_layers) + */ zaya_res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); zaya_res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); @@ -46,7 +99,6 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; - // Router MLP hidden size (zaya_mlp_expansion) const int64_t n_ff_exp = hparams.n_ff_exp; for (int i = 0; i < n_layer; ++i) { @@ -61,31 +113,97 @@ void 
llama_model_zaya::load_arch_tensors(llama_model_loader &) { const int64_t n_ff = hparams.n_ff(i); const int64_t n_expert = hparams.n_expert; + /* + * zaya.py ref: L212-217 (ZayaDecoderATTLayer input_norm) + * zaya.py ref: L508-513 (ZayaDecoderMLPLayer input_norm) + * + * if (config.normalization == "RMSNorm"): + * self.input_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon) + * elif (config.normalization == "LayerNorm"): + * self.input_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon) + */ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); // CCA attention layers (even indices only) if (i % 2 == 0) { + /* + * zaya.py ref: L98-184 (ZayaAttention class) + * + * self.q_dim = cca_num_q_heads * head_dim + * self.k_dim = cca_num_k_heads * head_dim + * self.v_dim = cca_num_k_heads * head_dim + * + * self.qkv = CCA(config, cca_num_k_heads, cca_num_q_heads, cca_num_heads, ...) + * self.o_proj = ReplicatedLinear(cca_num_q_heads * head_dim, hidden_size, ...) + * self.attn = Attention(cca_num_q_heads, head_dim, scale, cca_num_k_heads, ...) + * self.rotary_emb = get_rope(head_size=head_dim, ..., partial_rotary_factor=0.5) + */ + + /* + * zaya.py ref: L125-138 (CCA layer for Q, K projections) + * + * self.qkv = CCA(...) + * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...) 
+ * self.qkv(hidden_states, output_qkv) + * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1) + */ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + /* + * zaya.py ref: CCA.py - value projections (val_proj1, val_proj2) + * + * V1 = val_proj1(x) + * V2 = val_proj2(x_delayed) + * V = concat(V1, V2) + */ layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), {n_embd, n_embd_k / 2}, 0); layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), {n_embd, n_embd_k / 2}, 0); + /* + * zaya.py ref: L139-144 + * + * self.o_proj = ReplicatedLinear(self.cca_num_q_heads * self.head_dim, + * self.hidden_size, bias=self.config.attention_bias, ...) + */ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); + /* + * zaya.py ref: CCA.py - depthwise conv on QK + * + * conv_dw applied to [Q, K] concatenated + */ layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, n_qk}, 0); layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + /* + * zaya.py ref: CCA.py - grouped conv on QK + * + * conv_grp applied after dw conv, with n_groups = n_head + n_head_kv + */ layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + /* + * zaya.py ref: CCA.py - K scaling after L2 norm + * + * Kcur = Kcur * cca_k_scale + */ layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); } - // Residual scaling + /* + * zaya.py ref: L52-81, L219-220, L515-516 (per-layer ResidualScaling) + * + * if self.config.scale_residual_merge: + * self.res_scale = ResidualScaling(config, layer_n) + * + * hidden_states = (hidden_states.float() + 
hs_bias) * hs_scale + * residual = (residual.float() + res_bias) * res_scale + */ layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); @@ -93,13 +211,51 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // MoE layers (odd indices) if (i % 2 == 1) { - // Router network + /* + * zaya.py ref: L251-380 (ZayaRouter class) + * + * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...) + * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps) + * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion)) // EDA scale + * self.router_mlp = nn.Sequential( + * ReplicatedLinear(D, D, bias=True, ...), + * nn.GELU(), + * ReplicatedLinear(D, D, bias=True, ...), + * nn.GELU(), + * ReplicatedLinear(D, E, bias=False, ...), + * ) + * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32)) + */ + + /* + * zaya.py ref: L291 + * + * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...) 
+ */ layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_ff_exp}, 0); layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_ff_exp}, TENSOR_NOT_REQUIRED); + + /* + * zaya.py ref: L298-299 + * + * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps) + */ layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_ff_exp}, 0); + + /* + * zaya.py ref: L305-314 (router MLP layers 0, 2, 4) + * + * self.router_mlp = nn.Sequential( + * ReplicatedLinear(D, D, bias=True, ...), // mlp0 + * self.non_linearity, // GELU + * ReplicatedLinear(D, D, bias=True, ...), // mlp2 + * self.non_linearity, // GELU + * ReplicatedLinear(D, E, bias=False, ...), // mlp4 + * ) + */ layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_ff_exp, n_ff_exp}, 0); layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), @@ -110,12 +266,40 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { {n_ff_exp}, TENSOR_NOT_REQUIRED); layer.zaya_router_mlp4 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i), {n_ff_exp, n_expert + 1}, 0); + + /* + * zaya.py ref: L317-319 + * + * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32)) + * if self.use_mod: + * self.balancing_biases[-1] = -1.0 + */ layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i), {n_expert + 1}, TENSOR_NOT_REQUIRED); + + /* + * zaya.py ref: L302-303 + * + * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion)) + */ layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i), {n_ff_exp}, TENSOR_NOT_REQUIRED); - // MoE experts (fused gate_up and down) + /* + * zaya.py ref: L435-446 (FusedMoE experts) + * + * self.experts = FusedMoE( + * num_experts=self.num_moe_experts, + * top_k=self.topk, + * hidden_size=config.hidden_size, + * 
intermediate_size=ffn_hidden_size // 2, + * reduce_results=False, + * renormalize=False, + * custom_routing_function=_custom_routing_fn, + * activation="silu", + * ... + * ) + */ create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); @@ -143,6 +327,15 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * cur; ggml_tensor * inpL; + /* + * zaya.py ref: L638-641 (ZayaModel.forward) + * + * if inputs_embeds is None: + * inputs_embeds = self.embed_tokens(input_ids) + * residual = None + * hidden_states = inputs_embeds + * prev_router_hidden_states = None + */ inpL = build_inp_embd(model.tok_embd); auto * inp = build_inp_mem_hybrid(); @@ -153,6 +346,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * residual = nullptr; ggml_tensor * prev_router = nullptr; + /* + * zaya.py ref: L71-81 (ResidualScaling.forward) + * + * hidden_states = (hidden_states.float() + hs_bias) * hs_scale + * if self.not_first_layer and residual is not None: + * residual = (residual.float() + res_bias) * res_scale + * return residual, hidden_states + */ const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) { if (scale == nullptr) { return x; @@ -165,6 +366,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params return x; }; + /* + * zaya.py ref: L644-651 (ZayaModel.forward layer loop) + * + * for layer_n, decoder_layer in enumerate(self.layers): + * hidden_states, residual, prev_router_hidden_states = decoder_layer( + * hidden_states, residual, positions, layer_n, prev_router_hidden_states) + */ for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers[il]; @@ -176,6 +384,18 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_groups = 
n_head + n_head_kv; const int64_t n_gqa = n_head / n_head_kv; + /* + * zaya.py ref: L234-241 (ZayaDecoderATTLayer.forward) + * zaya.py ref: L530-537 (ZayaDecoderMLPLayer.forward) + * + * if self.config.scale_residual_merge: + * residual, hidden_states = self.res_scale(residual, hidden_states) + * if residual is not None: + * residual = residual.float() + hidden_states.float() + * else: + * residual = hidden_states.float() + * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype) + */ ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); if (residual != nullptr) { residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); @@ -185,16 +405,45 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params } cb(residual, "residual", il); - // Pre-norm + /* + * zaya.py ref: L84-95 (_apply_norm_with_fp32_residual) + * zaya.py ref: L240-241, L536-537 + * + * if isinstance(norm, RMSNorm): + * if residual.dtype != norm.weight.dtype: + * hidden_states = norm.forward_native(residual) + * else: + * hidden_states = norm(residual) + * return hidden_states.to(target_dtype) + */ cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "input_norm", il); if (il % 2 == 0) { // ===== CCA Attention ===== + /* + * zaya.py ref: L98-184 (ZayaAttention) + * zaya.py ref: L171-184 (ZayaAttention.forward) + * + * def forward(self, hidden_states, position_ids): + * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...) 
+ * self.qkv(hidden_states, output_qkv) + * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1) + * q, k = self.rotary_emb(position_ids, q, k) + * attn_output = self.attn(q, k, v) + * attn_output = self.o_proj(attn_output) + * return attn_output + */ + const int64_t conv_state_size = 2*n_qk; const int64_t cca_state_size = conv_state_size + n_embd; GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size); + /* + * zaya.py ref: CCA.py - recurrent state management + * + * CCA maintains conv_state and prev_hs in recurrent memory + */ ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il); ggml_tensor * cca_state = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs); cb(cca_state, "cca_state", il); @@ -210,14 +459,26 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params conv_state_size*ggml_element_size(cca_state)); cb(prev_hs, "cca_prev_hs", il); - // Q, K projections + /* + * zaya.py ref: L177-179 + * + * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...) + * self.qkv(hidden_states, output_qkv) + * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1) + */ ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); cb(Qraw, "Qraw", il); ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); cb(Kraw, "Kraw", il); - // HF uses a delayed hidden-state stream for val_proj2. During decode this - // comes from the recurrent state; during prefill it is a one-token shift. 
+ /* + * zaya.py ref: CCA.py - delayed hidden state stream for val_proj2 + * + * During decode: comes from recurrent state + * During prefill: one-token shift of current sequence + * + * hs_d = concat(prev_hs_last, cur[:-1]) along seq dimension + */ ggml_tensor * cur_state_src = ggml_cont(ctx0, cur); ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs); @@ -232,7 +493,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params hs_d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, hs_d), n_embd, n_tokens); cb(hs_d, "cca_hs_d", il); - // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens] + /* + * zaya.py ref: CCA.py - V projection + * + * V1 = val_proj1(cur) + * V2 = val_proj2(hs_d) + * Vcur = concat(V1, V2, dim=0) + */ ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); cb(V1, "V1", il); ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d); @@ -240,10 +507,25 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); cb(Vcur, "Vcur", il); - // Concat Q+K for conv: [n_qk, n_tokens] + /* + * zaya.py ref: CCA.py - QK concatenation for conv + * + * QKraw = concat(Qraw, Kraw, dim=0) + */ ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0); cb(QKraw, "QKraw", il); + /* + * zaya.py ref: CCA.py - qk_mean computation + * + * Qpre: [n_embd_head, n_head, n_tokens] + * Kpre: [n_embd_head, n_head_kv, n_tokens] + * Kpre_grouped = repeat(Kpre, n_gqa times along head dim) + * qk_mean_q = (Qpre + Kpre_rep) * 0.5 + * + * Qgroup = group Q by GQA, mean across group + * qk_mean_k = (Qmean + Kpre) * 0.5 + */ ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens); ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens); @@ -261,6 +543,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params 
ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f); cb(qk_mean_k, "qk_mean_k", il); + /* + * zaya.py ref: CCA.py - conv state update + * + * conv_input = concat(conv_state, QKraw_reshaped, dim=0) + * last_conv_states = conv_input[-2:] (last 2 positions for state update) + */ ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw)); QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs); @@ -273,6 +561,11 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params n_seq_tokens*conv_input->nb[0]); cb(last_conv_states, "cca_last_conv_states", il); + /* + * zaya.py ref: CCA.py - recurrent state write-back + * + * Update conv_state and prev_hs in recurrent memory for next step + */ const auto kv_head = inp_recr->mctx->get_head(); ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs, cca_state_all->nb[1], @@ -287,19 +580,27 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params (kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); + /* + * zaya.py ref: CCA.py - depthwise conv + * + * QK = ssm_conv(conv_input, conv_dw) + conv_dw_b + */ ggml_tensor * conv_dw = layer.cca_conv_dw; if (conv_dw->type != GGML_TYPE_F32) { conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F32)); } - // conv_input is [L, n_qk, n_seqs], ssm_conv outputs [n_qk, n_tokens, n_seqs] ggml_tensor * QK = ggml_ssm_conv(ctx0, conv_input, conv_dw); - // permute from [n_qk, n_tokens, n_seqs] to [n_tokens, n_qk, n_seqs] QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); if (layer.cca_conv_dw_b) { QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1)); } cb(QK, "QK_dw", il); + /* + * zaya.py ref: CCA.py - grouped conv + * + * QK = conv_1d_grouped(QK, conv_grp, n_groups) + conv_grp_b + */ ggml_tensor * conv_grp = 
layer.cca_conv_grp; if (conv_grp->type != GGML_TYPE_F16) { conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16)); @@ -309,8 +610,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(QK, "QK_grp", il); QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); - // QK is now [n_qk, n_seq_tokens, n_seqs] - // Flatten to 2D: [n_qk, n_tokens] where n_tokens = n_seq_tokens * n_seqs QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0); @@ -319,15 +618,38 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens); ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens); + /* + * zaya.py ref: CCA.py - add qk_mean back to Q, K + * + * Qcur = Qcur + qk_mean_q + * Kcur = Kcur + qk_mean_k + */ Qcur = ggml_add(ctx0, Qcur, qk_mean_q); Kcur = ggml_add(ctx0, Kcur, qk_mean_k); + /* + * zaya.py ref: CCA.py - L2 normalization and scaling + * + * Qcur = l2_norm(Qcur) * sqrt(n_embd_head) + * Kcur = l2_norm(Kcur) * sqrt(n_embd_head) * cca_k_scale + */ Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head)); Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head)); Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1)); cb(Qcur, "Qcur_pre_rope", il); cb(Kcur, "Kcur_pre_rope", il); + /* + * zaya.py ref: L155-164 (rotary embedding) + * + * self.rotary_emb = get_rope( + * head_size=self.head_dim, + * max_position=config.max_position_embeddings, + * is_neox_style=True, + * rope_parameters={"rope_theta": config.rope_theta, "rope_type": "default", "partial_rotary_factor": 0.5}, + * ) + * q, k = self.rotary_emb(position_ids, q, k) + */ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); Qcur = 
ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -340,7 +662,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens); - // GQA attention + /* + * zaya.py ref: L146-153, L181-182 (Attention + output projection) + * + * self.attn = Attention(self.cca_num_q_heads, self.head_dim, self.scale, self.cca_num_k_heads, ...) + * attn_output = self.attn(q, k, v) + * attn_output = self.o_proj(attn_output) + */ cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf((float) n_embd_head), il); @@ -348,24 +676,78 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params } else { // ===== MoE Layer ===== - - // Build Zaya router network: - // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits. - + /* + * zaya.py ref: L481-541 (ZayaDecoderMLPLayer) + * zaya.py ref: L382-479 (ZayaBlock) + * zaya.py ref: L251-380 (ZayaRouter) + * + * def forward(self, hidden_states, residual, position_ids, layer_n, prev_router_hidden_states): + * if self.config.scale_residual_merge: + * residual, hidden_states = self.res_scale(residual, hidden_states) + * residual = residual.float() + hidden_states.float() + * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype) + * hidden_states, prev_router_hidden_states = self.zaya_block(hidden_states, prev_router_hidden_states) + * return hidden_states, residual, prev_router_hidden_states + */ + + /* + * zaya.py ref: L321-380 (ZayaRouter.forward) + * + * hs = self.down_proj(hidden_states) + * if self.use_eda and (prev_router_hidden_states is not None): + * hs = hs + prev_router_hidden_states * self.router_states_scale + * router_hidden_states_next = hs[-S:].clone() + * hs_norm = self.rmsnorm_eda(hs) + * logits = self.router_mlp(hs_norm) // 
Linear->GELU->Linear->GELU->Linear + * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32) + * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases + * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1) + * route_prob = torch.gather(expert_prob, dim=1, index=expert_choice_t) + * return route_prob_flat, expert_choice_flat, router_hidden_states_next + */ + + /* + * zaya.py ref: L343 + * + * hs = self.down_proj(hidden_states) + */ ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); cb(router_h, "router_down", il); + /* + * zaya.py ref: L344-345 + * + * if self.use_eda and (prev_router_hidden_states is not None): + * hs = hs + prev_router_hidden_states * self.router_states_scale + */ if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); cb(router_h, "router_eda", il); } - prev_router = router_h; + prev_router = router_h; // zaya.py ref: L348 (router_hidden_states_next) + /* + * zaya.py ref: L351 + * + * hs_norm = self.rmsnorm_eda(hs) + */ router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); cb(router_h, "router_norm", il); + /* + * zaya.py ref: L305-314, L354 + * + * logits = self.router_mlp(hs_norm) + * self.router_mlp = nn.Sequential( + * ReplicatedLinear(D, D, bias=True, ...), // mlp0 + * nn.GELU(), + * ReplicatedLinear(D, D, bias=True, ...), // mlp2 + * nn.GELU(), + * ReplicatedLinear(D, E, bias=False, ...), // mlp4 + * ) + */ router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); router_h = ggml_gelu(ctx0, router_h); @@ -379,20 +761,51 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); cb(router_h, "router_logits", il); + /* + * 
zaya.py ref: L355-359 + * + * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32) + * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases + * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1) + */ ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h); cb(router_probs, "router_probs", il); - // Keep the MOD skip expert in the softmax denominator, then route - // over real experts only. The checkpoint's skip bias keeps MOD unused. + /* + * zaya.py ref: L387-389 (MOD skip expert handling) + * + * gate_probs = router_probs[:, :n_expert] // exclude skip expert from routing + */ ggml_tensor * gate_probs = ggml_cont(ctx0, ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); cb(gate_probs, "gate_probs", il); + /* + * zaya.py ref: L317-319, L362-363 + * + * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32)) + * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases + */ ggml_tensor * expert_biases = nullptr; if (layer.zaya_router_biases != nullptr) { expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0); } + /* + * zaya.py ref: L448-479 (ZayaBlock.forward - MoE execution) + * + * probs, indices, router_hidden_states_out = self.router(hidden_states, prev_router_hidden_states) + * if self.config.zaya_use_mod: + * clamped_indices = torch.clamp(indices, min=0, max=self.num_moe_experts - 1) + * packed_logits = torch.cat([probs, clamped_indices.to(probs.dtype)], dim=-1) + * hidden_states_experts = self.experts(hidden_states, packed_logits) + * hidden_states_mod = hidden_states * probs + * mod_mask = (indices != self.num_moe_experts) + * hidden_states = (mod_mask * hidden_states_experts) + ((~mod_mask) * hidden_states_mod) + * else: + * packed_logits = torch.cat([probs, indices.to(probs.dtype)], dim=-1) + * hidden_states = self.experts(hidden_states, packed_logits) + */ cur = build_moe_ffn(cur, /* gate_inp */ nullptr, /* up_exps */ 
nullptr, @@ -414,6 +827,17 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params inpL = cur; } + /* + * zaya.py ref: L653-664 (ZayaModel.forward - final residual + norm) + * + * if self.config.scale_residual_merge: + * residual, hidden_states = self.res_scale(residual, hidden_states) + * if residual is not None: + * hidden_states = hidden_states.float() + residual.float() + * else: + * hidden_states = hidden_states.float() + * hidden_states = _apply_norm_with_fp32_residual(self.final_norm, hidden_states, final_input_dtype) + */ ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); if (residual != nullptr) { residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); @@ -427,12 +851,22 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cur = ggml_get_rows(ctx0, cur, inp_out_ids); } - // final norm + /* + * zaya.py ref: L608-613 (final norm) + * + * if (config.normalization == "RMSNorm"): + * self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon) + */ cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; - // output + /* + * zaya.py ref: L729-746, L769-782 (lm_head + logits_processor) + * + * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...) + * logits = self.logits_processor(self.lm_head, hidden_states) + */ cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); res->t_logits = cur; From 2234dab56972888aeee8625905c13a5190c2f9fa Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Thu, 21 May 2026 23:01:34 +0200 Subject: [PATCH 24/33] fix(zaya): gate EDA with layer check matching Python use_eda logic zaya.py L294-296: EDA is disabled for layer 1 (first MoE layer) via (self.layer_number != zaya_first_layer). Add il != 1 guard to match. 
--- src/models/zaya.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index e2dd6e52bcf..516bd8b81db 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -716,12 +716,19 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(router_h, "router_down", il); /* - * zaya.py ref: L344-345 + * zaya.py ref: L294-296, L344-345 + * + * zaya_first_layer = 1 + * use_eda_cfg = bool(getattr(config, "zaya_use_eda", False)) + * self.use_eda = use_eda_cfg and (zaya_first_layer is not None) and (self.layer_number != zaya_first_layer) * * if self.use_eda and (prev_router_hidden_states is not None): * hs = hs + prev_router_hidden_states * self.router_states_scale + * + * EDA is disabled for layer 1 (first MoE layer) via (self.layer_number != zaya_first_layer). + * When zaya_use_eda is False globally, the parameter is never created (tensor stays nullptr). */ - if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { + if (il != 1 && prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); cb(router_h, "router_eda", il); } From 1fc45810144b1ab8b77a04ea6451c049bf37742d Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 22 May 2026 00:34:48 +0200 Subject: [PATCH 25/33] feat(zaya): add zaya_high_prec for FP32 output logits matching Python _FP32EmbeddingMethod --- convert_hf_to_gguf.py | 4 ++++ gguf-py/gguf/constants.py | 3 +++ gguf-py/gguf/gguf_writer.py | 3 +++ src/llama-arch.cpp | 3 +++ src/llama-arch.h | 3 +++ src/llama-hparams.h | 3 +++ src/llama-model-saver.cpp | 3 +++ src/models/zaya.cpp | 14 +++++++++++++- 8 files changed, 35 insertions(+), 1 deletion(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 5ce42b465c8..0fa85283db2 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6500,6 +6500,10 @@ def 
set_gguf_parameters(self): n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256) self.gguf_writer.add_expert_feed_forward_length(n_ff_exp) + # FP32 output logits for numerical stability (zaya_high_prec) + zaya_high_prec = self.hparams.get("zaya_high_prec", True) + self.gguf_writer.add_zaya_high_prec(zaya_high_prec) + def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: if "linear_q" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index eeb14f6aa76..1d759ac54f7 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -153,6 +153,9 @@ class LLM: DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" + # Zaya-specific + ZAYA_HIGH_PREC = "zaya.high_prec" + class Attention: HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT_KV = "{arch}.attention.head_count_kv" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 35fb01470c4..d45a529bd90 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1293,6 +1293,9 @@ def add_xielu_beta(self, values: Sequence[float]): def add_xielu_eps(self, values: Sequence[float]): self.add_array(Keys.xIELU.EPS, values) + def add_zaya_high_prec(self, value: bool) -> None: + self.add_bool(Keys.LLM.ZAYA_HIGH_PREC, value) + # diffusion models def add_diffusion_shift_logits(self, value: bool) -> None: diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index e0ac2a625dc..43fdb881cbd 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -293,6 +293,9 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, + // Zaya-specific + { LLM_KV_ZAYA_HIGH_PREC, "%s.zaya.high_prec" }, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, 
"tokenizer.ggml.tokens" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index 3809afc124c..a0a18843356 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -345,6 +345,9 @@ enum llm_kv { LLM_KV_DENSE_2_FEAT_OUT, LLM_KV_DENSE_3_FEAT_IN, LLM_KV_DENSE_3_FEAT_OUT, + + // Zaya-specific + LLM_KV_ZAYA_HIGH_PREC, }; enum llm_tensor { diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 0160a89caa2..7982dba8ab2 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -212,6 +212,9 @@ struct llama_hparams { // qwen3vl deepstack uint32_t n_deepstack_layers = 0; + // zaya: FP32 output logits for numerical stability + bool zaya_high_prec = true; + // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index e83056557bf..07bf6bc812d 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -377,6 +377,9 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out); add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in); add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out); + + // Zaya-specific + add_kv(LLM_KV_ZAYA_HIGH_PREC, hparams.zaya_high_prec); } void llama_model_saver::add_tensors_from_model() { diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 516bd8b81db..241a6ad8f32 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -204,7 +204,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { * hidden_states = (hidden_states.float() + hs_bias) * hs_scale * residual = (residual.float() + res_bias) * res_scale */ - layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); + layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, 
"weight", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); @@ -876,6 +876,18 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params */ cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); + + /* + * zaya.py ref: L748-749 (_FP32EmbeddingMethod) + * + * if self.zaya_high_prec: + * out = out.to(dtype=torch.float32) + */ + if (hparams.zaya_high_prec) { + cur = ggml_cont(ctx0, ggml_cast(ctx0, cur, GGML_TYPE_F32)); + cb(cur, "result_output_fp32", -1); + } + res->t_logits = cur; ggml_build_forward_expand(gf, cur); From 0f37acecd23ae9e95e70929ee8f534a40652cfe5 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 22 May 2026 01:56:05 +0200 Subject: [PATCH 26/33] zaya.cpp: fix comment reference to MOD skip expert handling Correct line reference from zaya.py L387-389 to L459-469, and add note explaining why excluding the skip expert from gate_probs is correct (bias=-1.0 makes it effectively never selected at inference with topk=1). --- src/models/zaya.cpp | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 241a6ad8f32..5e86085fb7a 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -779,9 +779,13 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(router_probs, "router_probs", il); /* - * zaya.py ref: L387-389 (MOD skip expert handling) + * zaya.py ref: L459-469 (MOD skip expert handling) * * gate_probs = router_probs[:, :n_expert] // exclude skip expert from routing + * + * Note: the skip expert (index n_expert) has a -1.0 bias in + * balancing_biases, making it practically never selected during + * inference with topk=1. 
*/ ggml_tensor * gate_probs = ggml_cont(ctx0, ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); From 6b6700a1123eacd8d2bdf04a6964aff24be732b7 Mon Sep 17 00:00:00 2001 From: leo Date: Fri, 22 May 2026 16:30:29 +0200 Subject: [PATCH 27/33] zaya: add cca_mask input tensor for CCA padding masking - New llm_graph_input_cca_mask class + build_inp_cca_mask() in graph infra - cca_mask tensor [1, n_tokens] F32 binary mask applied to hidden_states before CCA convolutions (modeling_zaya.py ref: CCA.forward L325-328) - Applied only during prefill (n_seq_tokens > 1), matching Python logic - Mask filled with 1.0f for all positions (no padding info in ubatch) --- src/llama-graph.cpp | 32 ++++++++++++++++++++++++++++++++ src/llama-graph.h | 16 ++++++++++++++++ src/models/zaya.cpp | 27 +++++++++++++++++++++++++++ 3 files changed, 75 insertions(+) diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index e4f0ff98ef4..80db7b66e25 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -132,6 +132,23 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { return res; } +void llm_graph_input_cca_mask::set_input(const llama_ubatch * ubatch) { + if (cca_mask) { + const int64_t n_tokens = ubatch->n_tokens; + // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward) + // + // if attention_mask is not None: + // cca_mask = attention_mask.clone() + // else: + // cca_mask = None + // + // In llama.cpp, all tokens are valid (no padding tokens in the ubatch), + // so the mask is set to 1.0 for every token position. 
+ std::vector mask_data(n_tokens, 1.0f); + ggml_backend_tensor_set(cca_mask, mask_data.data(), 0, n_tokens * ggml_element_size(cca_mask)); + } +} + void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -1822,6 +1839,21 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const { return cur; } +ggml_tensor * llm_graph_context::build_inp_cca_mask() const { + auto inp = std::make_unique(); + + auto & cur = inp->cca_mask; + + // shape: [1, n_tokens] for broadcasting with [n_embd, n_tokens] hidden states + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_tokens); + ggml_set_input(cur); + ggml_set_name(cur, "cca_mask"); + + res->add_input(std::move(inp)); + + return cur; +} + ggml_tensor * llm_graph_context::build_inp_out_ids() const { // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls, // but this would make the graph topology depend on the number of output tokens, which can interfere with diff --git a/src/llama-graph.h b/src/llama-graph.h index 5cb1756c6a9..d6c0d0458c3 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -150,6 +150,21 @@ class llm_graph_input_attn_temp : public llm_graph_input_i { const float f_attn_temp_offset; }; +// cca_mask for CCA (Channel-wise Cross Attention), used by zaya +// Binary mask applied to hidden_states before CCA convolutions, +// matching modeling_zaya.py ref: CCA.forward L325-328 +// if cca_mask is not None and hidden_states.shape[1] > 1: +// hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype) +class llm_graph_input_cca_mask : public llm_graph_input_i { +public: + llm_graph_input_cca_mask() = default; + virtual ~llm_graph_input_cca_mask() = default; + + void set_input(const llama_ubatch * ubatch) override; + + ggml_tensor * cca_mask = nullptr; // F32 [1, n_tokens] +}; + class llm_graph_input_pos_bucket : public llm_graph_input_i { public: 
llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} @@ -880,6 +895,7 @@ struct llm_graph_context { ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const; ggml_tensor * build_inp_pos() const; ggml_tensor * build_inp_attn_scale() const; + ggml_tensor * build_inp_cca_mask() const; ggml_tensor * build_inp_out_ids() const; ggml_tensor * build_inp_mean() const; ggml_tensor * build_inp_cls() const; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 5e86085fb7a..5d403c65d19 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -343,6 +343,17 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * inp_cca_mask = nullptr; + // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward) + // + // if attention_mask is not None: + // cca_mask = attention_mask.clone() + // else: + // cca_mask = None + // + // Built unconditionally; set_input fills with 1.0 for all positions + // (padding mask is not available in llama.cpp ubatch). + inp_cca_mask = build_inp_cca_mask(); ggml_tensor * residual = nullptr; ggml_tensor * prev_router = nullptr; @@ -459,6 +470,22 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params conv_state_size*ggml_element_size(cca_state)); cb(prev_hs, "cca_prev_hs", il); + /* + * modeling_zaya.py ref: L325-328 (CCA.forward) + * + * if cca_mask is not None and hidden_states.shape[1] > 1: + * # Only applying in prefill + * dtype = hidden_states.dtype + * hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype) + * + * In ggml: cur is [n_embd, n_tokens], cca_mask is [1, n_tokens]. + * Broadcasting along dim 0 zeros out hidden states of masked positions. 
+ */ + if (inp_cca_mask != nullptr && n_seq_tokens > 1) { + cur = ggml_mul(ctx0, cur, inp_cca_mask); + cb(cur, "cca_masked", il); + } + /* * zaya.py ref: L177-179 * From 9aaef944393aa0791467fb2717212ec50dd39754 Mon Sep 17 00:00:00 2001 From: leo Date: Fri, 22 May 2026 18:36:26 +0200 Subject: [PATCH 28/33] zaya: cast residual to F32 before addition (residual_in_fp32) Match Python reference which casts hidden_states and residual to float32 before ggml_add in both per-layer and final residual paths. zaya.py ref: L900, L1387, L1701 --- src/models/zaya.cpp | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 5d403c65d19..efaedebac8e 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -407,12 +407,17 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params * residual = hidden_states.float() * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype) */ - ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); + ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); + /* + * zaya.py ref: L900, L1387, L1701 + * if self.config.residual_in_fp32: + * residual = hidden_states.to(torch.float32) + */ if (residual != nullptr) { residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); - residual = ggml_add(ctx0, hidden_states, residual); + residual = ggml_add(ctx0, ggml_cast(ctx0, hidden_states, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32)); } else { - residual = hidden_states; + residual = ggml_cast(ctx0, hidden_states, GGML_TYPE_F32); } cb(residual, "residual", il); @@ -879,9 +884,15 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, 
model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); if (residual != nullptr) { residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); - cur = ggml_add(ctx0, final_hidden, residual); + /* + * zaya.py ref: L1701 + * if self.config.residual_in_fp32: + * hidden_states = hidden_states.float() + * residual = residual.float() + */ + cur = ggml_add(ctx0, ggml_cast(ctx0, final_hidden, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32)); } else { - cur = final_hidden; + cur = ggml_cast(ctx0, final_hidden, GGML_TYPE_F32); } cb(cur, "final_residual", -1); From abe9e401080d147a0a2a4976471025799f828e8c Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 22 May 2026 22:31:20 +0200 Subject: [PATCH 29/33] cleanup: revert debugs commits --- convert_hf_to_gguf.py | 4 -- gguf-py/gguf/constants.py | 3 -- gguf-py/gguf/gguf_writer.py | 3 -- src/llama-arch.cpp | 3 -- src/llama-arch.h | 3 -- src/llama-graph.cpp | 32 --------------- src/llama-graph.h | 16 -------- src/llama-hparams.h | 3 -- src/llama-model-saver.cpp | 3 -- src/models/zaya.cpp | 79 +++++-------------------------------- 10 files changed, 9 insertions(+), 140 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 0fa85283db2..5ce42b465c8 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6500,10 +6500,6 @@ def set_gguf_parameters(self): n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256) self.gguf_writer.add_expert_feed_forward_length(n_ff_exp) - # FP32 output logits for numerical stability (zaya_high_prec) - zaya_high_prec = self.hparams.get("zaya_high_prec", True) - self.gguf_writer.add_zaya_high_prec(zaya_high_prec) - def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: if "linear_q" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 1d759ac54f7..eeb14f6aa76 100644 --- 
a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -153,9 +153,6 @@ class LLM: DENSE_FEAT_IN_SIZE = "{arch}.{dense}_feat_in" DENSE_FEAT_OUT_SIZE = "{arch}.{dense}_feat_out" - # Zaya-specific - ZAYA_HIGH_PREC = "zaya.high_prec" - class Attention: HEAD_COUNT = "{arch}.attention.head_count" HEAD_COUNT_KV = "{arch}.attention.head_count_kv" diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index d45a529bd90..35fb01470c4 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -1293,9 +1293,6 @@ def add_xielu_beta(self, values: Sequence[float]): def add_xielu_eps(self, values: Sequence[float]): self.add_array(Keys.xIELU.EPS, values) - def add_zaya_high_prec(self, value: bool) -> None: - self.add_bool(Keys.LLM.ZAYA_HIGH_PREC, value) - # diffusion models def add_diffusion_shift_logits(self, value: bool) -> None: diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 43fdb881cbd..e0ac2a625dc 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -293,9 +293,6 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_DENSE_3_FEAT_IN, "%s.dense_3_feat_in" }, { LLM_KV_DENSE_3_FEAT_OUT, "%s.dense_3_feat_out" }, - // Zaya-specific - { LLM_KV_ZAYA_HIGH_PREC, "%s.zaya.high_prec" }, - { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_PRE, "tokenizer.ggml.pre" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, diff --git a/src/llama-arch.h b/src/llama-arch.h index a0a18843356..3809afc124c 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -345,9 +345,6 @@ enum llm_kv { LLM_KV_DENSE_2_FEAT_OUT, LLM_KV_DENSE_3_FEAT_IN, LLM_KV_DENSE_3_FEAT_OUT, - - // Zaya-specific - LLM_KV_ZAYA_HIGH_PREC, }; enum llm_tensor { diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index 80db7b66e25..e4f0ff98ef4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -132,23 +132,6 @@ bool llm_graph_input_pos::can_reuse(const llm_graph_params & params) { return res; } -void 
llm_graph_input_cca_mask::set_input(const llama_ubatch * ubatch) { - if (cca_mask) { - const int64_t n_tokens = ubatch->n_tokens; - // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward) - // - // if attention_mask is not None: - // cca_mask = attention_mask.clone() - // else: - // cca_mask = None - // - // In llama.cpp, all tokens are valid (no padding tokens in the ubatch), - // so the mask is set to 1.0 for every token position. - std::vector mask_data(n_tokens, 1.0f); - ggml_backend_tensor_set(cca_mask, mask_data.data(), 0, n_tokens * ggml_element_size(cca_mask)); - } -} - void llm_graph_input_attn_temp::set_input(const llama_ubatch * ubatch) { if (ubatch->pos && attn_scale) { const int64_t n_tokens = ubatch->n_tokens; @@ -1839,21 +1822,6 @@ ggml_tensor * llm_graph_context::build_inp_attn_scale() const { return cur; } -ggml_tensor * llm_graph_context::build_inp_cca_mask() const { - auto inp = std::make_unique(); - - auto & cur = inp->cca_mask; - - // shape: [1, n_tokens] for broadcasting with [n_embd, n_tokens] hidden states - cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 1, n_tokens); - ggml_set_input(cur); - ggml_set_name(cur, "cca_mask"); - - res->add_input(std::move(inp)); - - return cur; -} - ggml_tensor * llm_graph_context::build_inp_out_ids() const { // note: when all tokens are output, we could skip this optimization to spare the ggml_get_rows() calls, // but this would make the graph topology depend on the number of output tokens, which can interfere with diff --git a/src/llama-graph.h b/src/llama-graph.h index d6c0d0458c3..5cb1756c6a9 100644 --- a/src/llama-graph.h +++ b/src/llama-graph.h @@ -150,21 +150,6 @@ class llm_graph_input_attn_temp : public llm_graph_input_i { const float f_attn_temp_offset; }; -// cca_mask for CCA (Channel-wise Cross Attention), used by zaya -// Binary mask applied to hidden_states before CCA convolutions, -// matching modeling_zaya.py ref: CCA.forward L325-328 -// if cca_mask is not None and hidden_states.shape[1] > 1: -// 
hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype) -class llm_graph_input_cca_mask : public llm_graph_input_i { -public: - llm_graph_input_cca_mask() = default; - virtual ~llm_graph_input_cca_mask() = default; - - void set_input(const llama_ubatch * ubatch) override; - - ggml_tensor * cca_mask = nullptr; // F32 [1, n_tokens] -}; - class llm_graph_input_pos_bucket : public llm_graph_input_i { public: llm_graph_input_pos_bucket(const llama_hparams & hparams) : hparams(hparams) {} @@ -895,7 +880,6 @@ struct llm_graph_context { ggml_tensor * build_inp_embd(ggml_tensor * tok_embd) const; ggml_tensor * build_inp_pos() const; ggml_tensor * build_inp_attn_scale() const; - ggml_tensor * build_inp_cca_mask() const; ggml_tensor * build_inp_out_ids() const; ggml_tensor * build_inp_mean() const; ggml_tensor * build_inp_cls() const; diff --git a/src/llama-hparams.h b/src/llama-hparams.h index 7982dba8ab2..0160a89caa2 100644 --- a/src/llama-hparams.h +++ b/src/llama-hparams.h @@ -212,9 +212,6 @@ struct llama_hparams { // qwen3vl deepstack uint32_t n_deepstack_layers = 0; - // zaya: FP32 output logits for numerical stability - bool zaya_high_prec = true; - // gemma4 per-layer embedding uint32_t n_embd_per_layer = 0; diff --git a/src/llama-model-saver.cpp b/src/llama-model-saver.cpp index 07bf6bc812d..e83056557bf 100644 --- a/src/llama-model-saver.cpp +++ b/src/llama-model-saver.cpp @@ -377,9 +377,6 @@ void llama_model_saver::add_kv_from_model() { add_kv(LLM_KV_DENSE_2_FEAT_OUT, hparams.dense_2_feat_out); add_kv(LLM_KV_DENSE_3_FEAT_IN, hparams.dense_3_feat_in); add_kv(LLM_KV_DENSE_3_FEAT_OUT, hparams.dense_3_feat_out); - - // Zaya-specific - add_kv(LLM_KV_ZAYA_HIGH_PREC, hparams.zaya_high_prec); } void llama_model_saver::add_tensors_from_model() { diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index efaedebac8e..e2dd6e52bcf 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -204,7 +204,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader 
&) { * hidden_states = (hidden_states.float() + hs_bias) * hs_scale * residual = (residual.float() + res_bias) * res_scale */ - layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); @@ -343,17 +343,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); - ggml_tensor * inp_cca_mask = nullptr; - // modeling_zaya.py ref: L1555-1558 (ZayaModel.forward) - // - // if attention_mask is not None: - // cca_mask = attention_mask.clone() - // else: - // cca_mask = None - // - // Built unconditionally; set_input fills with 1.0 for all positions - // (padding mask is not available in llama.cpp ubatch). 
- inp_cca_mask = build_inp_cca_mask(); ggml_tensor * residual = nullptr; ggml_tensor * prev_router = nullptr; @@ -407,17 +396,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params * residual = hidden_states.float() * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype) */ - ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); - /* - * zaya.py ref: L900, L1387, L1701 - * if self.config.residual_in_fp32: - * residual = hidden_states.to(torch.float32) - */ + ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); if (residual != nullptr) { residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); - residual = ggml_add(ctx0, ggml_cast(ctx0, hidden_states, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32)); + residual = ggml_add(ctx0, hidden_states, residual); } else { - residual = ggml_cast(ctx0, hidden_states, GGML_TYPE_F32); + residual = hidden_states; } cb(residual, "residual", il); @@ -475,22 +459,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params conv_state_size*ggml_element_size(cca_state)); cb(prev_hs, "cca_prev_hs", il); - /* - * modeling_zaya.py ref: L325-328 (CCA.forward) - * - * if cca_mask is not None and hidden_states.shape[1] > 1: - * # Only applying in prefill - * dtype = hidden_states.dtype - * hidden_states = (hidden_states * cca_mask[:, :, None]).to(dtype) - * - * In ggml: cur is [n_embd, n_tokens], cca_mask is [1, n_tokens]. - * Broadcasting along dim 0 zeros out hidden states of masked positions. 
- */ - if (inp_cca_mask != nullptr && n_seq_tokens > 1) { - cur = ggml_mul(ctx0, cur, inp_cca_mask); - cb(cur, "cca_masked", il); - } - /* * zaya.py ref: L177-179 * @@ -748,19 +716,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(router_h, "router_down", il); /* - * zaya.py ref: L294-296, L344-345 - * - * zaya_first_layer = 1 - * use_eda_cfg = bool(getattr(config, "zaya_use_eda", False)) - * self.use_eda = use_eda_cfg and (zaya_first_layer is not None) and (self.layer_number != zaya_first_layer) + * zaya.py ref: L344-345 * * if self.use_eda and (prev_router_hidden_states is not None): * hs = hs + prev_router_hidden_states * self.router_states_scale - * - * EDA is disabled for layer 1 (first MoE layer) via (self.layer_number != zaya_first_layer). - * When zaya_use_eda is False globally, the parameter is never created (tensor stays nullptr). */ - if (il != 1 && prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { + if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); cb(router_h, "router_eda", il); } @@ -811,13 +772,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(router_probs, "router_probs", il); /* - * zaya.py ref: L459-469 (MOD skip expert handling) + * zaya.py ref: L387-389 (MOD skip expert handling) * * gate_probs = router_probs[:, :n_expert] // exclude skip expert from routing - * - * Note: the skip expert (index n_expert) has a -1.0 bias in - * balancing_biases, making it practically never selected during - * inference with topk=1. 
*/ ggml_tensor * gate_probs = ggml_cont(ctx0, ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); @@ -884,15 +841,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); if (residual != nullptr) { residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); - /* - * zaya.py ref: L1701 - * if self.config.residual_in_fp32: - * hidden_states = hidden_states.float() - * residual = residual.float() - */ - cur = ggml_add(ctx0, ggml_cast(ctx0, final_hidden, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32)); + cur = ggml_add(ctx0, final_hidden, residual); } else { - cur = ggml_cast(ctx0, final_hidden, GGML_TYPE_F32); + cur = final_hidden; } cb(cur, "final_residual", -1); @@ -918,18 +869,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params */ cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); - - /* - * zaya.py ref: L748-749 (_FP32EmbeddingMethod) - * - * if self.zaya_high_prec: - * out = out.to(dtype=torch.float32) - */ - if (hparams.zaya_high_prec) { - cur = ggml_cont(ctx0, ggml_cast(ctx0, cur, GGML_TYPE_F32)); - cb(cur, "result_output_fp32", -1); - } - res->t_logits = cur; ggml_build_forward_expand(gf, cur); From 6fad5d867ef87efd8998cdde57a164c72a522ea4 Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 22 May 2026 22:43:47 +0200 Subject: [PATCH 30/33] Revert "docs(zaya): add Python reference comments to C++ implementation" This reverts commit f1bd772a37f3407d8fd2809edef1f0e0d245b760. 
--- src/models/zaya.cpp | 488 +++----------------------------------------- 1 file changed, 27 insertions(+), 461 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index e2dd6e52bcf..ce65c2281fa 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -5,25 +5,6 @@ #include -/* - * zaya.py ref: L52-81 (ResidualScaling class) - * - * class ResidualScaling(nn.Module): - * def __init__(self, config, layer_n, ...): - * self.not_first_layer = (layer_n != 0) - * self.hidden_states_scale = torch.nn.Parameter(torch.ones(config.hidden_size)) - * self.hidden_states_bias = torch.nn.Parameter(torch.zeros(config.hidden_size)) - * if self.not_first_layer: - * self.residual_scale = torch.nn.Parameter(torch.ones(config.hidden_size)) - * self.residual_bias = torch.nn.Parameter(torch.zeros(config.hidden_size)) - * - * def forward(self, residual, hidden_states): - * hidden_states = (hidden_states.float() + hs_bias) * hs_scale - * if self.not_first_layer and residual is not None: - * residual = (residual.float() + res_bias) * res_scale - * return residual, hidden_states - */ - void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); @@ -34,15 +15,6 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { hparams.ssm_d_state = 1; hparams.ssm_n_group = 0; - /* - * zaya.py ref: L575-602 (layer alternation) - * - * for layer_n in range(config.num_hidden_layers): - * if layer_n % 2 == 1: - * self.layers.append(ZayaDecoderMLPLayer(...)) # MoE layer - * else: - * self.layers.append(ZayaDecoderATTLayer(...)) # Attention layer - */ for (uint32_t i = 0; i < hparams.n_layer; ++i) { hparams.recurrent_layer_arr[i] = (i % 2) == 0; } @@ -56,42 +28,17 @@ void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { void llama_model_zaya::load_arch_tensors(llama_model_loader &) { LLAMA_LOAD_LOCALS; - /* - * zaya.py ref: 
L569-573 - * - * self.embed_tokens = VocabParallelEmbedding( - * self.vocab_size, config.hidden_size, org_num_embeddings=config.vocab_size) - */ tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - /* - * zaya.py ref: L608-613 - * - * if (config.normalization == "RMSNorm"): - * self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon) - * elif (config.normalization == "LayerNorm"): - * self.final_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon) - */ + // output norm output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - /* - * zaya.py ref: L729-743 - * - * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...) - * if self.config.tie_word_embeddings: - * self.lm_head = self.lm_head.tie_weights(self.model.embed_tokens) - */ + // output (tied with tok_embd if not present) output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); if (output == nullptr) { - output = tok_embd; // tied weights + output = tok_embd; } - /* - * zaya.py ref: L605-606 (final ResidualScaling after all layers) - * - * if self.config.scale_residual_merge: - * self.res_scale = ResidualScaling(config, config.num_hidden_layers) - */ zaya_res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); zaya_res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); @@ -99,6 +46,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; + // Router MLP hidden size (zaya_mlp_expansion) const int64_t n_ff_exp = hparams.n_ff_exp; for (int i = 0; i < n_layer; ++i) { @@ -113,97 +61,31 @@ void 
llama_model_zaya::load_arch_tensors(llama_model_loader &) { const int64_t n_ff = hparams.n_ff(i); const int64_t n_expert = hparams.n_expert; - /* - * zaya.py ref: L212-217 (ZayaDecoderATTLayer input_norm) - * zaya.py ref: L508-513 (ZayaDecoderMLPLayer input_norm) - * - * if (config.normalization == "RMSNorm"): - * self.input_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon) - * elif (config.normalization == "LayerNorm"): - * self.input_norm = nn.LayerNorm(self.config.hidden_size, eps=config.norm_epsilon) - */ layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); // CCA attention layers (even indices only) if (i % 2 == 0) { - /* - * zaya.py ref: L98-184 (ZayaAttention class) - * - * self.q_dim = cca_num_q_heads * head_dim - * self.k_dim = cca_num_k_heads * head_dim - * self.v_dim = cca_num_k_heads * head_dim - * - * self.qkv = CCA(config, cca_num_k_heads, cca_num_q_heads, cca_num_heads, ...) - * self.o_proj = ReplicatedLinear(cca_num_q_heads * head_dim, hidden_size, ...) - * self.attn = Attention(cca_num_q_heads, head_dim, scale, cca_num_k_heads, ...) - * self.rotary_emb = get_rope(head_size=head_dim, ..., partial_rotary_factor=0.5) - */ - - /* - * zaya.py ref: L125-138 (CCA layer for Q, K projections) - * - * self.qkv = CCA(...) - * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...) 
- * self.qkv(hidden_states, output_qkv) - * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1) - */ layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); - /* - * zaya.py ref: CCA.py - value projections (val_proj1, val_proj2) - * - * V1 = val_proj1(x) - * V2 = val_proj2(x_delayed) - * V = concat(V1, V2) - */ layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), {n_embd, n_embd_k / 2}, 0); layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), {n_embd, n_embd_k / 2}, 0); - /* - * zaya.py ref: L139-144 - * - * self.o_proj = ReplicatedLinear(self.cca_num_q_heads * self.head_dim, - * self.hidden_size, bias=self.config.attention_bias, ...) - */ layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - /* - * zaya.py ref: CCA.py - depthwise conv on QK - * - * conv_dw applied to [Q, K] concatenated - */ layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, n_qk}, 0); layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); - /* - * zaya.py ref: CCA.py - grouped conv on QK - * - * conv_grp applied after dw conv, with n_groups = n_head + n_head_kv - */ layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); - /* - * zaya.py ref: CCA.py - K scaling after L2 norm - * - * Kcur = Kcur * cca_k_scale - */ layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); } - /* - * zaya.py ref: L52-81, L219-220, L515-516 (per-layer ResidualScaling) - * - * if self.config.scale_residual_merge: - * self.res_scale = ResidualScaling(config, layer_n) - * - * hidden_states = (hidden_states.float() + hs_bias) * hs_scale - * 
residual = (residual.float() + res_bias) * res_scale - */ + // Residual scaling layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); @@ -211,51 +93,13 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // MoE layers (odd indices) if (i % 2 == 1) { - /* - * zaya.py ref: L251-380 (ZayaRouter class) - * - * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...) - * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps) - * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion)) // EDA scale - * self.router_mlp = nn.Sequential( - * ReplicatedLinear(D, D, bias=True, ...), - * nn.GELU(), - * ReplicatedLinear(D, D, bias=True, ...), - * nn.GELU(), - * ReplicatedLinear(D, E, bias=False, ...), - * ) - * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32)) - */ - - /* - * zaya.py ref: L291 - * - * self.down_proj = ReplicatedLinear(self.hidden_size, self.mlp_expansion, bias=True, ...) 
- */ + // Router network layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd, n_ff_exp}, 0); layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_FFN_GATE_INP, "bias", i), {n_ff_exp}, TENSOR_NOT_REQUIRED); - - /* - * zaya.py ref: L298-299 - * - * self.rmsnorm_eda = RMSNorm(self.mlp_expansion, eps=ln_eps) - */ layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_ff_exp}, 0); - - /* - * zaya.py ref: L305-314 (router MLP layers 0, 2, 4) - * - * self.router_mlp = nn.Sequential( - * ReplicatedLinear(D, D, bias=True, ...), // mlp0 - * self.non_linearity, // GELU - * ReplicatedLinear(D, D, bias=True, ...), // mlp2 - * self.non_linearity, // GELU - * ReplicatedLinear(D, E, bias=False, ...), // mlp4 - * ) - */ layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_ff_exp, n_ff_exp}, 0); layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_FFN_GATE, "bias", i), @@ -266,40 +110,12 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { {n_ff_exp}, TENSOR_NOT_REQUIRED); layer.zaya_router_mlp4 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i), {n_ff_exp, n_expert + 1}, 0); - - /* - * zaya.py ref: L317-319 - * - * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32)) - * if self.use_mod: - * self.balancing_biases[-1] = -1.0 - */ layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i), {n_expert + 1}, TENSOR_NOT_REQUIRED); - - /* - * zaya.py ref: L302-303 - * - * self.router_states_scale = nn.Parameter(torch.ones(self.mlp_expansion)) - */ layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i), {n_ff_exp}, TENSOR_NOT_REQUIRED); - /* - * zaya.py ref: L435-446 (FusedMoE experts) - * - * self.experts = FusedMoE( - * num_experts=self.num_moe_experts, - * top_k=self.topk, - * hidden_size=config.hidden_size, - * intermediate_size=ffn_hidden_size // 2, - * 
reduce_results=False, - * renormalize=False, - * custom_routing_function=_custom_routing_fn, - * activation="silu", - * ... - * ) - */ + // MoE experts (fused gate_up and down) create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); @@ -327,15 +143,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * cur; ggml_tensor * inpL; - /* - * zaya.py ref: L638-641 (ZayaModel.forward) - * - * if inputs_embeds is None: - * inputs_embeds = self.embed_tokens(input_ids) - * residual = None - * hidden_states = inputs_embeds - * prev_router_hidden_states = None - */ inpL = build_inp_embd(model.tok_embd); auto * inp = build_inp_mem_hybrid(); @@ -346,14 +153,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * residual = nullptr; ggml_tensor * prev_router = nullptr; - /* - * zaya.py ref: L71-81 (ResidualScaling.forward) - * - * hidden_states = (hidden_states.float() + hs_bias) * hs_scale - * if self.not_first_layer and residual is not None: - * residual = (residual.float() + res_bias) * res_scale - * return residual, hidden_states - */ const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) { if (scale == nullptr) { return x; @@ -366,13 +165,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params return x; }; - /* - * zaya.py ref: L644-651 (ZayaModel.forward layer loop) - * - * for layer_n, decoder_layer in enumerate(self.layers): - * hidden_states, residual, prev_router_hidden_states = decoder_layer( - * hidden_states, residual, positions, layer_n, prev_router_hidden_states) - */ for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers[il]; @@ -384,18 +176,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_groups = 
n_head + n_head_kv; const int64_t n_gqa = n_head / n_head_kv; - /* - * zaya.py ref: L234-241 (ZayaDecoderATTLayer.forward) - * zaya.py ref: L530-537 (ZayaDecoderMLPLayer.forward) - * - * if self.config.scale_residual_merge: - * residual, hidden_states = self.res_scale(residual, hidden_states) - * if residual is not None: - * residual = residual.float() + hidden_states.float() - * else: - * residual = hidden_states.float() - * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype) - */ ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); if (residual != nullptr) { residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); @@ -405,45 +185,16 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params } cb(residual, "residual", il); - /* - * zaya.py ref: L84-95 (_apply_norm_with_fp32_residual) - * zaya.py ref: L240-241, L536-537 - * - * if isinstance(norm, RMSNorm): - * if residual.dtype != norm.weight.dtype: - * hidden_states = norm.forward_native(residual) - * else: - * hidden_states = norm(residual) - * return hidden_states.to(target_dtype) - */ + // Pre-norm cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "input_norm", il); if (il % 2 == 0) { // ===== CCA Attention ===== - /* - * zaya.py ref: L98-184 (ZayaAttention) - * zaya.py ref: L171-184 (ZayaAttention.forward) - * - * def forward(self, hidden_states, position_ids): - * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...) 
- * self.qkv(hidden_states, output_qkv) - * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1) - * q, k = self.rotary_emb(position_ids, q, k) - * attn_output = self.attn(q, k, v) - * attn_output = self.o_proj(attn_output) - * return attn_output - */ - const int64_t conv_state_size = 2*n_qk; const int64_t cca_state_size = conv_state_size + n_embd; GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size); - /* - * zaya.py ref: CCA.py - recurrent state management - * - * CCA maintains conv_state and prev_hs in recurrent memory - */ ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il); ggml_tensor * cca_state = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs); cb(cca_state, "cca_state", il); @@ -459,26 +210,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params conv_state_size*ggml_element_size(cca_state)); cb(prev_hs, "cca_prev_hs", il); - /* - * zaya.py ref: L177-179 - * - * output_qkv = torch.zeros((hidden_states.shape[0], self.qkv_dim), ...) - * self.qkv(hidden_states, output_qkv) - * q, k, v = output_qkv.split([self.q_dim, self.k_dim, self.v_dim], dim=-1) - */ + // Q, K projections ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); cb(Qraw, "Qraw", il); ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); cb(Kraw, "Kraw", il); - /* - * zaya.py ref: CCA.py - delayed hidden state stream for val_proj2 - * - * During decode: comes from recurrent state - * During prefill: one-token shift of current sequence - * - * hs_d = concat(prev_hs_last, cur[:-1]) along seq dimension - */ + // HF uses a delayed hidden-state stream for val_proj2. During decode this + // comes from the recurrent state; during prefill it is a one-token shift. 
ggml_tensor * cur_state_src = ggml_cont(ctx0, cur); ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs); @@ -493,13 +232,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params hs_d = ggml_reshape_2d(ctx0, ggml_cont(ctx0, hs_d), n_embd, n_tokens); cb(hs_d, "cca_hs_d", il); - /* - * zaya.py ref: CCA.py - V projection - * - * V1 = val_proj1(cur) - * V2 = val_proj2(hs_d) - * Vcur = concat(V1, V2, dim=0) - */ + // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens] ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); cb(V1, "V1", il); ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d); @@ -507,25 +240,10 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); cb(Vcur, "Vcur", il); - /* - * zaya.py ref: CCA.py - QK concatenation for conv - * - * QKraw = concat(Qraw, Kraw, dim=0) - */ + // Concat Q+K for conv: [n_qk, n_tokens] ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0); cb(QKraw, "QKraw", il); - /* - * zaya.py ref: CCA.py - qk_mean computation - * - * Qpre: [n_embd_head, n_head, n_tokens] - * Kpre: [n_embd_head, n_head_kv, n_tokens] - * Kpre_grouped = repeat(Kpre, n_gqa times along head dim) - * qk_mean_q = (Qpre + Kpre_rep) * 0.5 - * - * Qgroup = group Q by GQA, mean across group - * qk_mean_k = (Qmean + Kpre) * 0.5 - */ ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens); ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens); @@ -543,12 +261,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f); cb(qk_mean_k, "qk_mean_k", il); - /* - * zaya.py ref: CCA.py - conv state update - * - * conv_input = concat(conv_state, QKraw_reshaped, dim=0) - * last_conv_states = 
conv_input[-2:] (last 2 positions for state update) - */ ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw)); QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs); @@ -561,11 +273,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params n_seq_tokens*conv_input->nb[0]); cb(last_conv_states, "cca_last_conv_states", il); - /* - * zaya.py ref: CCA.py - recurrent state write-back - * - * Update conv_state and prev_hs in recurrent memory for next step - */ const auto kv_head = inp_recr->mctx->get_head(); ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs, cca_state_all->nb[1], @@ -580,27 +287,19 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params (kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); - /* - * zaya.py ref: CCA.py - depthwise conv - * - * QK = ssm_conv(conv_input, conv_dw) + conv_dw_b - */ ggml_tensor * conv_dw = layer.cca_conv_dw; if (conv_dw->type != GGML_TYPE_F32) { conv_dw = ggml_cont(ctx0, ggml_cast(ctx0, conv_dw, GGML_TYPE_F32)); } + // conv_input is [L, n_qk, n_seqs], ssm_conv outputs [n_qk, n_tokens, n_seqs] ggml_tensor * QK = ggml_ssm_conv(ctx0, conv_input, conv_dw); + // permute from [n_qk, n_tokens, n_seqs] to [n_tokens, n_qk, n_seqs] QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); if (layer.cca_conv_dw_b) { QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1)); } cb(QK, "QK_dw", il); - /* - * zaya.py ref: CCA.py - grouped conv - * - * QK = conv_1d_grouped(QK, conv_grp, n_groups) + conv_grp_b - */ ggml_tensor * conv_grp = layer.cca_conv_grp; if (conv_grp->type != GGML_TYPE_F16) { conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16)); @@ -610,6 +309,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(QK, "QK_grp", il); 
QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); + // QK is now [n_qk, n_seq_tokens, n_seqs] + // Flatten to 2D: [n_qk, n_tokens] where n_tokens = n_seq_tokens * n_seqs QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0); @@ -618,38 +319,15 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens); ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens); - /* - * zaya.py ref: CCA.py - add qk_mean back to Q, K - * - * Qcur = Qcur + qk_mean_q - * Kcur = Kcur + qk_mean_k - */ Qcur = ggml_add(ctx0, Qcur, qk_mean_q); Kcur = ggml_add(ctx0, Kcur, qk_mean_k); - /* - * zaya.py ref: CCA.py - L2 normalization and scaling - * - * Qcur = l2_norm(Qcur) * sqrt(n_embd_head) - * Kcur = l2_norm(Kcur) * sqrt(n_embd_head) * cca_k_scale - */ Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head)); Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head)); Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1)); cb(Qcur, "Qcur_pre_rope", il); cb(Kcur, "Kcur_pre_rope", il); - /* - * zaya.py ref: L155-164 (rotary embedding) - * - * self.rotary_emb = get_rope( - * head_size=self.head_dim, - * max_position=config.max_position_embeddings, - * is_neox_style=True, - * rope_parameters={"rope_theta": config.rope_theta, "rope_type": "default", "partial_rotary_factor": 0.5}, - * ) - * q, k = self.rotary_emb(position_ids, q, k) - */ ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, @@ -662,13 +340,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), 
n_embd_head, n_head_kv, n_tokens); - /* - * zaya.py ref: L146-153, L181-182 (Attention + output projection) - * - * self.attn = Attention(self.cca_num_q_heads, self.head_dim, self.scale, self.cca_num_k_heads, ...) - * attn_output = self.attn(q, k, v) - * attn_output = self.o_proj(attn_output) - */ + // GQA attention cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, 1.0f / sqrtf((float) n_embd_head), il); @@ -676,78 +348,24 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params } else { // ===== MoE Layer ===== - /* - * zaya.py ref: L481-541 (ZayaDecoderMLPLayer) - * zaya.py ref: L382-479 (ZayaBlock) - * zaya.py ref: L251-380 (ZayaRouter) - * - * def forward(self, hidden_states, residual, position_ids, layer_n, prev_router_hidden_states): - * if self.config.scale_residual_merge: - * residual, hidden_states = self.res_scale(residual, hidden_states) - * residual = residual.float() + hidden_states.float() - * hidden_states = _apply_norm_with_fp32_residual(self.input_norm, residual, layer_input_dtype) - * hidden_states, prev_router_hidden_states = self.zaya_block(hidden_states, prev_router_hidden_states) - * return hidden_states, residual, prev_router_hidden_states - */ - - /* - * zaya.py ref: L321-380 (ZayaRouter.forward) - * - * hs = self.down_proj(hidden_states) - * if self.use_eda and (prev_router_hidden_states is not None): - * hs = hs + prev_router_hidden_states * self.router_states_scale - * router_hidden_states_next = hs[-S:].clone() - * hs_norm = self.rmsnorm_eda(hs) - * logits = self.router_mlp(hs_norm) // Linear->GELU->Linear->GELU->Linear - * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32) - * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases - * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1) - * route_prob = torch.gather(expert_prob, dim=1, index=expert_choice_t) - * return route_prob_flat, expert_choice_flat, 
router_hidden_states_next - */ - - /* - * zaya.py ref: L343 - * - * hs = self.down_proj(hidden_states) - */ + + // Build Zaya router network: + // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits. + ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); cb(router_h, "router_down", il); - /* - * zaya.py ref: L344-345 - * - * if self.use_eda and (prev_router_hidden_states is not None): - * hs = hs + prev_router_hidden_states * self.router_states_scale - */ if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); cb(router_h, "router_eda", il); } - prev_router = router_h; // zaya.py ref: L348 (router_hidden_states_next) + prev_router = router_h; - /* - * zaya.py ref: L351 - * - * hs_norm = self.rmsnorm_eda(hs) - */ router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); cb(router_h, "router_norm", il); - /* - * zaya.py ref: L305-314, L354 - * - * logits = self.router_mlp(hs_norm) - * self.router_mlp = nn.Sequential( - * ReplicatedLinear(D, D, bias=True, ...), // mlp0 - * nn.GELU(), - * ReplicatedLinear(D, D, bias=True, ...), // mlp2 - * nn.GELU(), - * ReplicatedLinear(D, E, bias=False, ...), // mlp4 - * ) - */ router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); router_h = ggml_gelu(ctx0, router_h); @@ -761,51 +379,20 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); cb(router_h, "router_logits", il); - /* - * zaya.py ref: L355-359 - * - * expert_prob = torch.softmax(logits, dim=-1, dtype=torch.float32) - * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases - * _, expert_choice_t = torch.topk(biased, self.topk, dim=-1) - */ ggml_tensor * 
router_probs = ggml_soft_max(ctx0, router_h); cb(router_probs, "router_probs", il); - /* - * zaya.py ref: L387-389 (MOD skip expert handling) - * - * gate_probs = router_probs[:, :n_expert] // exclude skip expert from routing - */ + // Keep the MOD skip expert in the softmax denominator, then route + // over real experts only. The checkpoint's skip bias keeps MOD unused. ggml_tensor * gate_probs = ggml_cont(ctx0, ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); cb(gate_probs, "gate_probs", il); - /* - * zaya.py ref: L317-319, L362-363 - * - * self.register_buffer("balancing_biases", torch.zeros(self.num_experts, dtype=torch.float32)) - * biased = expert_prob.detach().to(torch.float32) + self.balancing_biases - */ ggml_tensor * expert_biases = nullptr; if (layer.zaya_router_biases != nullptr) { expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0); } - /* - * zaya.py ref: L448-479 (ZayaBlock.forward - MoE execution) - * - * probs, indices, router_hidden_states_out = self.router(hidden_states, prev_router_hidden_states) - * if self.config.zaya_use_mod: - * clamped_indices = torch.clamp(indices, min=0, max=self.num_moe_experts - 1) - * packed_logits = torch.cat([probs, clamped_indices.to(probs.dtype)], dim=-1) - * hidden_states_experts = self.experts(hidden_states, packed_logits) - * hidden_states_mod = hidden_states * probs - * mod_mask = (indices != self.num_moe_experts) - * hidden_states = (mod_mask * hidden_states_experts) + ((~mod_mask) * hidden_states_mod) - * else: - * packed_logits = torch.cat([probs, indices.to(probs.dtype)], dim=-1) - * hidden_states = self.experts(hidden_states, packed_logits) - */ cur = build_moe_ffn(cur, /* gate_inp */ nullptr, /* up_exps */ nullptr, @@ -827,17 +414,6 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params inpL = cur; } - /* - * zaya.py ref: L653-664 (ZayaModel.forward - final residual + norm) - * - * if self.config.scale_residual_merge: - * 
residual, hidden_states = self.res_scale(residual, hidden_states) - * if residual is not None: - * hidden_states = hidden_states.float() + residual.float() - * else: - * hidden_states = hidden_states.float() - * hidden_states = _apply_norm_with_fp32_residual(self.final_norm, hidden_states, final_input_dtype) - */ ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); if (residual != nullptr) { residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); @@ -851,22 +427,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cur = ggml_get_rows(ctx0, cur, inp_out_ids); } - /* - * zaya.py ref: L608-613 (final norm) - * - * if (config.normalization == "RMSNorm"): - * self.final_norm = RMSNorm(self.config.hidden_size, eps=config.norm_epsilon) - */ + // final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; - /* - * zaya.py ref: L729-746, L769-782 (lm_head + logits_processor) - * - * self.lm_head = ParallelLMHead(self.unpadded_vocab_size, config.hidden_size, ...) - * logits = self.logits_processor(self.lm_head, hidden_states) - */ + // output cur = ggml_mul_mat(ctx0, model.output, cur); cb(cur, "result_output", -1); res->t_logits = cur; From 82f8f1575e76209c862f1d02a93565269709d46b Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Fri, 22 May 2026 23:22:05 +0200 Subject: [PATCH 31/33] ggml/zaya: fix precision loss in conv_1d and support BF16 - ggml: Update `ggml_conv_1d` (and variants) to use a conditional type for `im2col` activation (`a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32`) instead of hardcoding `GGML_TYPE_F16`. This aligns with `ggml_conv_2d`, preserving F32/BF16 precision while still safely protecting against quantized weight crashes (e.g., Q4_0). 
- zaya: Replace the forced F16 downcast for grouped convolutions with a dynamic promotion to F32 for unsupported types (like BF16 or quantized types). This ensures `im2col` properly allocates an F32 matrix and computes an F32xF32 mul_mat, avoiding CUDA/CPU backend crashes while fully restoring model accuracy and NMSE metrics. --- ggml/src/ggml.c | 6 +++--- src/models/zaya.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index ae1fb2fa031..276c11cb68c 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4487,7 +4487,7 @@ struct ggml_tensor * ggml_conv_1d( int s0, int p0, int d0) { - struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K] + struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32); // [N, OL, IC * K] struct ggml_tensor * result = ggml_mul_mat(ctx, @@ -4521,7 +4521,7 @@ struct ggml_tensor * ggml_conv_1d_dw( int d0) { struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]); - struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); + struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32); struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a); @@ -4781,7 +4781,7 @@ struct ggml_tensor * ggml_conv_2d_dw( struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]); struct ggml_tensor * im2col = ggml_im2col(ctx, new_a, ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]), - s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW] + s0, s1, p0, p1, d0, d1, true, a->type == GGML_TYPE_F16 ? 
GGML_TYPE_F16 : GGML_TYPE_F32); // [N * IC, OH, OW, KH * KW] struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW] new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW] diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index ce65c2281fa..1db69c313b6 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -301,8 +301,8 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(QK, "QK_dw", il); ggml_tensor * conv_grp = layer.cca_conv_grp; - if (conv_grp->type != GGML_TYPE_F16) { - conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F16)); + if (conv_grp->type != GGML_TYPE_F16 && conv_grp->type != GGML_TYPE_F32) { + conv_grp = ggml_cont(ctx0, ggml_cast(ctx0, conv_grp, GGML_TYPE_F32)); } QK = ggml_conv_1d_grouped(ctx0, conv_grp, QK, 1, 0, 1, n_groups); QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1)); From 894ffd4274cc54e3c0d08c33a71a0df3eecf2f5f Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Mon, 25 May 2026 11:30:31 +0200 Subject: [PATCH 32/33] zaya: add il != 1 check for EDA to match python reference This is a safety guard matching self.layer_number != zaya_first_layer in the original implementation. No behavioral change for correctly converted models since the tensor is already nullptr for layer 1. 
--- src/models/zaya.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 1db69c313b6..d952cc74007 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -356,7 +356,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); cb(router_h, "router_down", il); - if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { + if (il != 1 && prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); cb(router_h, "router_eda", il); } From 1a7582b911a53b691e9b7d926a92bd504bcdf26e Mon Sep 17 00:00:00 2001 From: Juste-Leo2 Date: Mon, 25 May 2026 11:47:51 +0200 Subject: [PATCH 33/33] zaya: compute residual in fp32 to match config The model config has residual_in_fp32=true. Cast both residual branches to float32 to align with the Python reference.
--- src/models/zaya.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index d952cc74007..2ee2b676c7e 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -177,11 +177,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_gqa = n_head / n_head_kv; ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); + // residual_in_fp32 = true in config if (residual != nullptr) { residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); - residual = ggml_add(ctx0, hidden_states, residual); + residual = ggml_add(ctx0, ggml_cast(ctx0, hidden_states, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32)); } else { - residual = hidden_states; + residual = ggml_cast(ctx0, hidden_states, GGML_TYPE_F32); } cb(residual, "residual", il); @@ -415,11 +416,12 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params } ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); + // residual_in_fp32 = true in config if (residual != nullptr) { residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); - cur = ggml_add(ctx0, final_hidden, residual); + cur = ggml_add(ctx0, ggml_cast(ctx0, final_hidden, GGML_TYPE_F32), ggml_cast(ctx0, residual, GGML_TYPE_F32)); } else { - cur = final_hidden; + cur = ggml_cast(ctx0, final_hidden, GGML_TYPE_F32); } cb(cur, "final_residual", -1);