From 99e5d03e2188623f76509f853e79bc60574b5da6 Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Fri, 8 May 2026 11:08:36 +0200 Subject: [PATCH 1/6] ops: add Conv1dGrouped operation --- ggml/include/ggml.h | 15 ++++ ggml/src/ggml.c | 57 ++++++++++++ tests/test-conv-1d-grouped.cpp | 154 +++++++++++++++++++++++++++++++++ 3 files changed, 226 insertions(+) create mode 100644 tests/test-conv-1d-grouped.cpp diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3357a0d9985..fec0287ae00 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2041,6 +2041,21 @@ extern "C" { int s0, // stride int d0); // dilation + // grouped 1D convolution + // a: [K, IC/G, OC] convolution kernel + // b: [L, IC, N] data + // groups must divide both IC and OC evenly + // when groups == 1, equivalent to ggml_conv_1d + // when groups == IC == OC, equivalent to ggml_conv_1d_dw + GGML_API struct ggml_tensor * ggml_conv_1d_grouped( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0, // dilation + int groups); // number of groups + GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 191cf2fa106..049f4952047 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -4541,6 +4541,63 @@ struct ggml_tensor * ggml_conv_1d_dw_ph( return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); } +// ggml_conv_1d_grouped + +struct ggml_tensor * ggml_conv_1d_grouped( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0, + int groups) { + GGML_ASSERT(groups > 0); + + const int64_t OC = a->ne[2]; // total output channels + const int64_t IC_G = a->ne[1]; // input channels per group (kernel dim) + const int64_t IC = b->ne[1]; // total input channels + + GGML_ASSERT(IC % groups == 0); + GGML_ASSERT(OC % groups == 0); +
GGML_ASSERT(IC_G == IC / groups); + + // degenerate cases: fall back to existing implementations + if (groups == 1) { + return ggml_conv_1d(ctx, a, b, s0, p0, d0); + } + if (groups == IC && groups == OC) { + return ggml_conv_1d_dw(ctx, a, b, s0, p0, d0); + } + + const int64_t OC_G = OC / groups; + + struct ggml_tensor * result = NULL; + + for (int g = 0; g < groups; g++) { + // slice kernel for group g: [K, IC_G, OC_G] + struct ggml_tensor * a_g = ggml_view_3d(ctx, a, + a->ne[0], IC_G, OC_G, + a->nb[1], a->nb[2], + g * OC_G * a->nb[2]); + + // slice input for group g: [L, IC_G, N] + struct ggml_tensor * b_g = ggml_view_3d(ctx, b, + b->ne[0], IC_G, b->ne[2], + b->nb[1], b->nb[2], + g * IC_G * b->nb[1]); + + struct ggml_tensor * out_g = ggml_conv_1d(ctx, a_g, b_g, s0, p0, d0); + + if (result == NULL) { + result = out_g; + } else { + result = ggml_concat(ctx, result, out_g, 1); + } + } + + return result; +} + // ggml_conv_transpose_1d static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { diff --git a/tests/test-conv-1d-grouped.cpp b/tests/test-conv-1d-grouped.cpp new file mode 100644 index 00000000000..80b884804ec --- /dev/null +++ b/tests/test-conv-1d-grouped.cpp @@ -0,0 +1,154 @@ +// Test for ggml_conv_1d_grouped +// +// Verifies grouped 1D convolution by comparing against manual per-group computation. 
+ +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" + +#include +#include +#include +#include +#include + +static void fill_random_f16(ggml_fp16_t * data, int n) { + for (int i = 0; i < n; i++) { + float v = ((float)rand() / RAND_MAX) * 2.0f - 1.0f; + data[i] = ggml_fp32_to_fp16(v); + } +} + +static void fill_random_f32(float * data, int n) { + for (int i = 0; i < n; i++) { + data[i] = ((float)rand() / RAND_MAX) * 2.0f - 1.0f; + } +} + +static bool all_close(const float * a, const float * b, int n, float eps = 5e-3f) { + for (int i = 0; i < n; i++) { + if (fabsf(a[i] - b[i]) > eps) { + fprintf(stderr, " mismatch at [%d]: %.6f vs %.6f (diff=%.6f)\n", + i, a[i], b[i], fabsf(a[i] - b[i])); + return false; + } + } + return true; +} + +// Compute grouped conv1d on CPU naively for reference (no dilation parameter: dilation is always 1) +// kernel (F16): [K, IC_G, OC], input (F32): [L, IC, N], output: [OL, OC, N] +static void conv1d_grouped_ref( + const ggml_fp16_t * kernel, const float * input, float * output, + int K, int IC, int OC, int L, int N, int groups, int stride, int padding) { + int IC_G = IC / groups; + int OC_G = OC / groups; + int OL = (L + 2 * padding - K) / stride + 1; + + memset(output, 0, (size_t)OL * OC * N * sizeof(float)); + + for (int n = 0; n < N; n++) { + for (int g = 0; g < groups; g++) { + for (int oc = 0; oc < OC_G; oc++) { + int oc_global = g * OC_G + oc; + for (int ol = 0; ol < OL; ol++) { + float sum = 0.0f; + for (int ic = 0; ic < IC_G; ic++) { + for (int k = 0; k < K; k++) { + int il = ol * stride + k - padding; + if (il >= 0 && il < L) { + int ic_global = g * IC_G + ic; + // kernel: [K, IC_G, OC] -> k + ic * K + oc_global * (IC_G * K) + float w = ggml_fp16_to_fp32(kernel[k + ic * K + oc_global * (IC_G * K)]); + // input: [L, IC, N] -> il + ic_global * L + n * (IC * L) + float x = input[il + ic_global * L + n * (IC * L)]; + sum += w * x; + } + } + } + // output: [OL, OC, N] -> ol + oc_global * OL + n * (OC * OL) + output[ol + oc_global * OL + n * (OC * OL)] = sum; +
} + } + } + } +} + +static bool run_test(const char * label, int IC, int OC, int K, int L, int groups, int stride, int padding) { + printf(" TEST: %s (IC=%d OC=%d K=%d L=%d G=%d s=%d p=%d)\n", + label, IC, OC, K, L, groups, stride, padding); + + int IC_G = IC / groups; + int OL = (L + 2 * padding - K) / stride + 1; + + size_t ctx_size = 256 * 1024 * 1024; + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx = ggml_init(params); + + // kernel: [K, IC_G, OC] in F16 (like real models) + struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, K, IC_G, OC); + // input: [L, IC] in F32 + struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, L, IC); + + fill_random_f16((ggml_fp16_t *)a->data, K * IC_G * OC); + fill_random_f32((float *)b->data, L * IC); + + // reference + std::vector ref(OL * OC); + conv1d_grouped_ref((ggml_fp16_t *)a->data, (float *)b->data, ref.data(), + K, IC, OC, L, 1, groups, stride, padding); + + // ggml + struct ggml_tensor * result = ggml_conv_1d_grouped(ctx, a, b, stride, padding, 1, groups); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, result); + + ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_graph_compute(backend, gf); + + bool ok = true; + + if (result->ne[0] != OL || result->ne[1] != OC) { + fprintf(stderr, " FAIL: shape [%lld, %lld], expected [%d, %d]\n", + (long long)result->ne[0], (long long)result->ne[1], OL, OC); + ok = false; + } + + if (ok) { + ok = all_close((float *)result->data, ref.data(), OL * OC); + } + + printf(" %s\n", ok ? 
"PASS" : "FAIL"); + + ggml_backend_free(backend); + ggml_free(ctx); + return ok; +} + +int main(void) { + srand(42); + + printf("Testing ggml_conv_1d_grouped\n\n"); + + int n_pass = 0, n_fail = 0; + + auto check = [&](const char * label, int IC, int OC, int K, int L, int G, int s, int p) { + if (run_test(label, IC, OC, K, L, G, s, p)) { n_pass++; } else { n_fail++; } + }; + + check("groups=1 (standard conv1d)", 128, 256, 3, 32, 1, 1, 0); + check("ZAYA1-8B exact params", 1280, 1280, 2, 16, 10, 1, 0); + check("small 2 groups", 4, 4, 2, 8, 2, 1, 0); + check("with padding", 8, 8, 2, 16, 4, 1, 1); + check("IC != OC", 12, 6, 3, 10, 3, 1, 0); + check("stride=2", 8, 8, 2, 16, 4, 2, 0); + check("longer sequence", 1280, 1280, 2, 128, 10, 1, 0); + + printf("\nResult: %d passed, %d failed\n", n_pass, n_fail); + return n_fail > 0 ? 1 : 0; +} From e0ac753e404962ace6c6e0535d38657cae7b0283 Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Fri, 8 May 2026 15:07:17 +0200 Subject: [PATCH 2/6] initial implementation --- convert_hf_to_gguf.py | 39 ++++++ gguf-py/gguf/constants.py | 28 +++++ gguf-py/gguf/tensor_mapping.py | 23 ++++ src/llama-arch.cpp | 12 ++ src/llama-arch.h | 6 + src/llama-model.cpp | 3 + src/llama-model.h | 7 ++ src/models/models.h | 13 ++ src/models/zaya.cpp | 223 +++++++++++++++++++++++++++++++++ 9 files changed, 354 insertions(+) create mode 100644 src/models/zaya.cpp diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fb1f5dd4473..33c74013fb3 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6454,6 +6454,45 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("ZayaModel", "ZayaForCausalLM") +class ZayaModel(TextModel): + """Zaya-1 model with Compressed Convolutional Attention""" + model_arch = gguf.MODEL_ARCH.ZAYA + + def set_gguf_parameters(self): + super().set_gguf_parameters() + 
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # ZAYA-specific params if any from config.json (e.g. ssm_d_conv) + if "ssm_d_conv" in self.hparams: + self.gguf_writer.add_ssm_conv_kernel(self.hparams["ssm_d_conv"]) + else: + # Fallback if config is different + self.gguf_writer.add_ssm_conv_kernel(2) # Default for ZAYA1-8B + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Tensors will be automatically mapped based on tensor_mapping.py if they match + + # We skip MoE FFN weights, unused biases, etc. temporarily since we are using dense FFN + skip_keywords = [ + "zaya_block.experts", + "res_scale.", + "val_proj2" + ] + + if any(kw in name for kw in skip_keywords): + logger.info(f"Skipping tensor (dense FFN test): {name}") + return + + try: + yield from super().modify_tensors(data_torch, name, bid) + except ValueError as e: + if "Can not map tensor" in str(e): + logger.warning(f"Skipping unmapped tensor: {name}") + else: + raise + + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 308ebe1f4a1..13bd3d1c8f0 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -503,6 +503,7 @@ class MODEL_ARCH(IntEnum): LLAMA_EMBED = auto() MAINCODER = auto() KIMI_LINEAR = auto() + ZAYA = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -610,6 +611,10 @@ class MODEL_TENSOR(IntEnum): SSM_BETA = auto() # Kimi Linear qwen3.5 SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear + CCA_CONV_DW = auto() # Zaya + CCA_CONV_GRP = auto() # Zaya + CCA_QK_NORM = auto() # Zaya + CCA_K_SCALE = auto() # Zaya TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -1018,6 +1023,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", MODEL_ARCH.KIMI_LINEAR: "kimi-linear", + MODEL_ARCH.ZAYA: "zaya", } 
VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1123,6 +1129,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear qwen3.5 MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear + MODEL_TENSOR.CCA_CONV_DW: "blk.{bid}.cca_conv_dw", # Zaya + MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya + MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya + MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3992,6 +4002,24 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.ZAYA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_V, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.CCA_CONV_DW, + MODEL_TENSOR.CCA_CONV_GRP, + MODEL_TENSOR.CCA_QK_NORM, + MODEL_TENSOR.CCA_K_SCALE, + MODEL_TENSOR.FFN_NORM, + MODEL_TENSOR.FFN_GATE, + MODEL_TENSOR.FFN_DOWN, + MODEL_TENSOR.FFN_UP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f27f0e4c997..db99afd4cbb 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -259,6 +259,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.q_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.linear_q", # Zaya ), # Attention key @@ -279,6 +280,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.k_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.linear_k", # Zaya ), # Attention value @@ -298,6 +300,7 @@ class 
TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.val_proj1", # Zaya ), # Attention output @@ -336,6 +339,7 @@ class TensorNameMap: "layers.{bid}.self_attn.o_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.o_proj", # nemotron-h "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm + "model.layers.{bid}.self_attn.o_proj", # Zaya ), # Attention output norm @@ -854,6 +858,12 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.norm", # mamba2 "model.layers.{bid}.self_attn.o_norm", # kimi ), + MODEL_TENSOR.ATTN_NORM: ( + "model.layers.{bid}.input_layernorm", + "model.layers.{bid}.ln_1", + "model.layers.{bid}.norm1", + "model.layers.{bid}.input_norm", # Zaya + ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", # mamba-hf @@ -891,6 +901,19 @@ class TensorNameMap: "model.layers.{bid}.linear_attn.in_proj_b", # qwen3.5 "model.layers.{bid}.self_attn.b_proj", # Kimi Linear ), + # ZAYA CCA + MODEL_TENSOR.CCA_CONV_DW: ( + "model.layers.{bid}.self_attn.qkv.conv_qk.0", # Zaya + ), + MODEL_TENSOR.CCA_CONV_GRP: ( + "model.layers.{bid}.self_attn.qkv.conv_qk.1", # Zaya + ), + MODEL_TENSOR.CCA_QK_NORM: ( + "model.layers.{bid}.self_attn.qk_norm", # Zaya + ), + MODEL_TENSOR.CCA_K_SCALE: ( + "model.layers.{bid}.self_attn.qkv.temp", # Zaya + ), MODEL_TENSOR.SSM_G_A: ( "model.layers.{bid}.self_attn.g_a_proj", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 59dde99e362..df91d973a3e 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -133,6 +133,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, + { LLM_ARCH_ZAYA, "zaya" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -417,6 +418,10 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, { 
LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, + { LLM_TENSOR_CCA_CONV_DW, "blk.%d.cca_conv_dw" }, + { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, + { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, + { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, @@ -659,6 +664,11 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + // ZAYA CCA + {LLM_TENSOR_CCA_CONV_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, + {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -857,6 +867,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: case LLM_ARCH_QWEN35: case LLM_ARCH_QWEN35MOE: return true; @@ -902,6 +913,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) { case LLM_ARCH_MINIMAX_M2: case LLM_ARCH_MISTRAL4: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: return false; default: return true; diff --git a/src/llama-arch.h b/src/llama-arch.h index e37d548c98e..b11fa50c05f 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -137,6 +137,7 @@ enum llm_arch { LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, + LLM_ARCH_ZAYA, LLM_ARCH_UNKNOWN, }; @@ -444,6 +445,11 @@ enum llm_tensor { 
LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient and qwen3.5 LLM_TENSOR_SSM_G_A, // kimi: output gate projection A LLM_TENSOR_SSM_G_B, // kimi: output gate projection B + // ZAYA CCA (Compressed Convolutional Attention) + LLM_TENSOR_CCA_CONV_DW, // zaya: depthwise conv1d (conv_qk.0) + LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) + LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) + LLM_TENSOR_CCA_K_SCALE, // zaya: learned K temperature LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9d011ff3464..656767318f2 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -282,6 +282,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: return new llama_model_kimi_linear(params); + case LLM_ARCH_ZAYA: + return new llama_model_zaya(params); case LLM_ARCH_STEP35: return new llama_model_step35(params); default: @@ -2206,6 +2208,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values diff --git a/src/llama-model.h b/src/llama-model.h index d63c689185a..8e919e15159 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -477,6 +477,13 @@ struct llama_layer { struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; + // ZAYA CCA (Compressed Convolutional Attention) + struct ggml_tensor * cca_conv_dw = nullptr; // depthwise conv (conv_qk.0) + struct ggml_tensor * cca_conv_grp = nullptr; // grouped conv (conv_qk.1) + struct ggml_tensor * cca_conv_grp_b = nullptr; // grouped conv bias + struct ggml_tensor * cca_qk_norm = nullptr; // RMSNorm on concat(Q,K) + struct ggml_tensor * cca_k_scale = nullptr; // learned K 
temperature + // DSA (deepseek sparse attention) struct ggml_tensor * indexer_k_norm = nullptr; struct ggml_tensor * indexer_k_norm_b = nullptr; diff --git a/src/models/models.h b/src/models/models.h index 6d5f18a8e20..507f903104b 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1858,3 +1858,16 @@ struct llama_model_step35 : public llama_model_base { std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; + + +struct llama_model_zaya : public llama_model_base { + llama_model_zaya(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp new file mode 100644 index 00000000000..0815fc1d449 --- /dev/null +++ b/src/models/zaya.cpp @@ -0,0 +1,223 @@ +#include "models.h" + +#include "ggml.h" + +void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + + switch (hparams.n_layer) { + case 80: type = LLM_TYPE_8B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_zaya::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); + + const int64_t d_conv = hparams.ssm_d_conv; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t n_head_i = hparams.n_head(i); + const int64_t n_head_kv_i = 
hparams.n_head_kv(i); + const int64_t n_embd_q = n_head_i * n_embd_head_k; + const int64_t n_embd_k = n_head_kv_i * n_embd_head_k; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head_i + n_head_kv_i; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // CCA projections (standard Q, K, V, O) + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_k}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); + + // CCA conv_qk.0 (depthwise, groups = n_qk, kernel = d_conv) + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + + // CCA conv_qk.1 (grouped, groups = n_groups, kernel = d_conv) + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + + // CCA normalization and scale + layer.cca_qk_norm = create_tensor(tn(LLM_TENSOR_CCA_QK_NORM, "weight", i), {n_qk}, 0); + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_embd_k}, 0); + + // FFN (dense SwiGLU for now; MoE can be added later) + layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); + layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); + layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); + layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + } +} + +std::unique_ptr llama_model_zaya::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_zaya::graph::graph(const llama_model & model, const 
llm_graph_params & params) + : llm_graph_context(params) { + + const int64_t n_embd_head = hparams.n_embd_head_k(); + const int64_t d_conv = hparams.ssm_d_conv; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + auto * inp = build_inp_mem_hybrid(); + + ggml_tensor * inp_out_ids = build_inp_out_ids(); + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_q = n_head * n_embd_head; + const int64_t n_embd_k = n_head_kv * n_embd_head; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head + n_head_kv; + + ggml_tensor * inpSA = inpL; + + // norm + cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "attn_norm", il); + + // --- CCA: Q, K, V projections --- + ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); + cb(Qraw, "Qraw", il); + ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); + cb(Kraw, "Kraw", il); + ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.wv, cur); + cb(Vcur, "Vcur", il); + + // --- CCA: concat Q+K for conv --- + // QK: [n_qk, n_tokens] + ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QK, "QK_cat", il); + + // --- CCA: conv_qk.0 (depthwise, causal) --- + // Reshape for ssm_conv: [n_tokens, n_qk] -> [n_tokens, n_qk, 1] + // ssm_conv expects [seq_len, channels, batch] with state already concatenated + // For prompt processing, we left-pad with (d_conv-1) zeros for causality + { + // Left-pad QK with zeros for causal convolution. NOTE(review): pad comes straight from ggml_new_tensor_2d and is never written before ggml_scale(pad, 0.0f); the result is all zeros only if its buffer holds finite values (NaN/Inf * 0 is NaN) - confirm + ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); // [n_tokens, n_qk] + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); // [d_conv-1 + n_tokens, n_qk] + + QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); + // ssm_conv output:
[n_tokens, n_qk] + cb(QK, "QK_dw", il); + } + + // --- CCA: conv_qk.1 (grouped, causal) --- + { + // Left-pad for second causal conv + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); // [d_conv-1 + n_tokens, n_qk] + + // ggml_conv_1d_grouped expects kernel [K, IC/G, OC] and input [L, IC] + // QK_padded is [d_conv-1 + n_tokens, n_qk] which matches [L, IC] + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); + cb(QK, "QK_grp", il); + } + + // QK is now [n_tokens, n_qk] from conv output, transpose back to [n_qk, n_tokens] + QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + + // --- CCA: split Q_conv, K_conv --- + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, + QK->nb[1], 0); + ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, + QK->nb[1], n_embd_q * ggml_element_size(QK)); + + // --- CCA: QK mean (skip connection) --- + ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); + ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + // --- CCA: RMSNorm on concat(Q, K) --- + ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); // [n_qk, n_tokens] + QK_for_norm = build_norm(QK_for_norm, layer.cca_qk_norm, NULL, LLM_NORM_RMS, il); + cb(QK_for_norm, "QK_normed", il); + + // Split back + Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, + QK_for_norm->nb[1], 0); + Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, + QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); + + // --- CCA: K temperature scaling --- + Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); + cb(Kcur, "Kcur_scaled", il); + + // Reshape for attention: [head_dim, n_heads, n_tokens] + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); 
+ Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // --- GQA attention --- + cur = build_attn(inp->get_attn(), layer.wo, NULL, NULL, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf((float) n_embd_head), il); + cb(cur, "attn_out", il); + + // select output tokens on last layer + if (il == n_layer - 1 && inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); + } + + // residual + ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); + cb(ffn_inp, "ffn_inp", il); + + // --- FFN (dense SwiGLU) --- + cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); + cb(cur, "ffn_norm", il); + + cur = build_ffn(cur, + layer.ffn_up, NULL, NULL, + layer.ffn_gate, NULL, NULL, + layer.ffn_down, NULL, NULL, + NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); + cb(cur, "ffn_out", il); + + // residual + cur = ggml_add(ctx0, cur, ffn_inp); + cb(cur, "l_out", il); + + inpL = cur; + } + + cur = inpL; + + // final norm + cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // output + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} From 7cc554aab3435a800d673ea588d92034958bd3c7 Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Fri, 8 May 2026 18:39:38 +0200 Subject: [PATCH 3/6] implementation checkpoint --- convert_hf_to_gguf.py | 165 ++++++++++++++-- gguf-py/gguf/constants.py | 60 +++++- gguf-py/gguf/tensor_mapping.py | 15 +- src/llama-arch.cpp | 34 ++++ src/llama-arch.h | 19 ++ src/llama-model.h | 21 ++ src/models/zaya.cpp | 350 +++++++++++++++++++++------------ 7 files changed, 497 insertions(+), 167 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 33c74013fb3..97a5889cce9 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ 
-6456,34 +6456,150 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter @ModelBase.register("ZayaModel", "ZayaForCausalLM") class ZayaModel(TextModel): - """Zaya-1 model with Compressed Convolutional Attention""" + """Zaya-1 model with Compressed Convolutional Attention and MoE""" model_arch = gguf.MODEL_ARCH.ZAYA + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Buffer for accumulating expert weights per layer + self._experts: dict[int, dict[str, Tensor]] | None = {} + def set_gguf_parameters(self): super().set_gguf_parameters() self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) - - # ZAYA-specific params if any from config.json (e.g. ssm_d_conv) - if "ssm_d_conv" in self.hparams: - self.gguf_writer.add_ssm_conv_kernel(self.hparams["ssm_d_conv"]) - else: - # Fallback if config is different - self.gguf_writer.add_ssm_conv_kernel(2) # Default for ZAYA1-8B - + + # n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate) + n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2 + self.gguf_writer.add_feed_forward_length(n_ff) + + # ssm_d_conv = conv_qk kernel size + self.gguf_writer.add_ssm_conv_kernel(5) + + # partial_rotary_factor -> n_rot + head_dim = self.hparams.get("head_dim", 128) + partial_rotary = self.hparams.get("partial_rotary_factor", 0.5) + self.gguf_writer.add_rope_dimension_count(int(partial_rotary * head_dim)) + + # MoE params + n_expert = self.find_hparam(["num_experts"]) + self.gguf_writer.add_expert_count(n_expert) + n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1 + self.gguf_writer.add_expert_used_count(n_expert_used) + + def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "linear_q" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch + elif "linear_k" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch + elif "val_proj1" 
in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ1, bid), data_torch + elif "val_proj2" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ2, bid), data_torch + elif "o_proj" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch + elif "conv_qk.0" in name and name.endswith(".weight"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch + elif "conv_qk.0" in name and name.endswith(".bias"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch + elif "conv_qk.1" in name and name.endswith(".weight"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch + elif "conv_qk.1" in name and name.endswith(".bias"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch + elif "temp" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_K_SCALE, bid), data_torch + + def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "down_proj.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN, bid), data_torch + elif "down_proj.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch + elif "rmsnorm_eda" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_NORM, bid), data_torch + elif "router_mlp.0.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch + elif "router_mlp.0.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch + elif "router_mlp.2.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch + elif "router_mlp.2.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, bid, suffix=".bias"), data_torch + elif 
"router_mlp.4.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch + elif "balancing_biases" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_BIASES, bid), data_torch + elif "router_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, bid), data_torch + + def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "hidden_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch + elif "hidden_states_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B, bid, suffix=".bias"), data_torch + elif "residual_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch + elif "residual_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: - # Tensors will be automatically mapped based on tensor_mapping.py if they match - - # We skip MoE FFN weights, unused biases, etc. 
temporarily since we are using dense FFN - skip_keywords = [ - "zaya_block.experts", - "res_scale.", - "val_proj2" - ] - - if any(kw in name for kw in skip_keywords): - logger.info(f"Skipping tensor (dense FFN test): {name}") + # Common tensors + if name == "model.embed_tokens.weight": + yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch return - + if name == "model.final_norm.weight": + yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch + return + + # Block-level tensors + if bid is not None: + # CCA attention tensors + if "self_attn" in name: + yield from self._map_cca(name, data_torch, bid) + return + + # Router tensors + if "router" in name: + yield from self._map_router(name, data_torch, bid) + return + + # Input norm + if "input_norm" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, bid), data_torch + return + + # Residual scaling + if "res_scale" in name: + yield from self._map_res_scale(name, data_torch, bid) + return + + # Expert stacking + if "zaya_block.experts" in name: + assert bid is not None + if self._experts is None: + self._experts = {} + if bid not in self._experts: + self._experts[bid] = {} + self._experts[bid][name] = data_torch + + n_expert = self.find_hparam(["num_experts"]) + # Each layer has 2 expert weights per expert (fc1, fc2) = 2 * n_expert tensors + if len(self._experts[bid]) >= n_expert * 2: + for w_name, gguf_tensor, permute_dims in [ + ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None), + ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, (0, 2, 1)), + ]: + datas: list[Tensor] = [] + for xid in range(n_expert): + ename = f"model.layers.{bid}.zaya_block.experts.local_experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + data_torch_stacked = torch.stack(datas, dim=0) + if permute_dims is not None: + data_torch_stacked = data_torch_stacked.permute(*permute_dims) + yield self.format_tensor_name(gguf_tensor, bid), 
data_torch_stacked + del self._experts[bid] + return + + # Fallback for any remaining tensors: use tensor_mapping try: yield from super().modify_tensors(data_torch, name, bid) except ValueError as e: @@ -6492,6 +6608,13 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: raise + def prepare_tensors(self): + super().prepare_tensors() + if self._experts: + unprocessed = [k for d in self._experts.values() for k in d.keys()] + if unprocessed: + raise ValueError(f"Unprocessed expert tensors: {unprocessed}") + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 13bd3d1c8f0..de599da4a0b 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -613,8 +613,25 @@ class MODEL_TENSOR(IntEnum): SSM_G_B = auto() # Kimi Linear CCA_CONV_DW = auto() # Zaya CCA_CONV_GRP = auto() # Zaya - CCA_QK_NORM = auto() # Zaya + CCA_CONV_DW_B = auto() # Zaya: conv_qk.0.bias + CCA_QK_NORM = auto() # Zaya (weightless - unit RMSNorm) CCA_K_SCALE = auto() # Zaya + CCA_VAL_PROJ1 = auto() # Zaya: CCA value projection stream 1 + CCA_VAL_PROJ2 = auto() # Zaya: CCA value projection stream 2 + RES_SCALE_HS = auto() # Zaya: hidden_states_scale + RES_SCALE_HS_B = auto() # Zaya: hidden_states_bias + RES_SCALE_RES = auto() # Zaya: residual_scale + RES_SCALE_RES_B = auto() # Zaya: residual_bias + ZAYA_ROUTER_DOWN = auto() # Zaya + ZAYA_ROUTER_DOWN_B = auto() # Zaya + ZAYA_ROUTER_NORM = auto() # Zaya + ZAYA_ROUTER_MLP0 = auto() # Zaya + ZAYA_ROUTER_MLP0_B = auto() # Zaya + ZAYA_ROUTER_MLP2 = auto() # Zaya + ZAYA_ROUTER_MLP2_B = auto() # Zaya + ZAYA_ROUTER_MLP4 = auto() # Zaya + ZAYA_ROUTER_BIASES = auto() # Zaya + ZAYA_ROUTER_EDA_SCALE = auto() # Zaya TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ -1130,9 +1147,26 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: 
"blk.{bid}.ssm_g_b", # Kimi Linear MODEL_TENSOR.CCA_CONV_DW: "blk.{bid}.cca_conv_dw", # Zaya + MODEL_TENSOR.CCA_CONV_DW_B: "blk.{bid}.cca_conv_dw_b", # Zaya MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya + MODEL_TENSOR.CCA_VAL_PROJ1: "blk.{bid}.cca_val_proj1", # Zaya + MODEL_TENSOR.CCA_VAL_PROJ2: "blk.{bid}.cca_val_proj2", # Zaya + MODEL_TENSOR.RES_SCALE_HS: "blk.{bid}.res_scale_hs", # Zaya + MODEL_TENSOR.RES_SCALE_HS_B: "blk.{bid}.res_scale_hs_b", # Zaya + MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya + MODEL_TENSOR.RES_SCALE_RES_B: "blk.{bid}.res_scale_res_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_NORM: "blk.{bid}.zaya_router_norm", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP0: "blk.{bid}.zaya_router_mlp0", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP2_B: "blk.{bid}.zaya_router_mlp2_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP4: "blk.{bid}.zaya_router_mlp4", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_BIASES: "blk.{bid}.zaya_router_biases", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE: "blk.{bid}.zaya_router_eda", # Zaya MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -4009,16 +4043,30 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.ATTN_NORM, MODEL_TENSOR.ATTN_Q, MODEL_TENSOR.ATTN_K, - MODEL_TENSOR.ATTN_V, MODEL_TENSOR.ATTN_OUT, MODEL_TENSOR.CCA_CONV_DW, + MODEL_TENSOR.CCA_CONV_DW_B, MODEL_TENSOR.CCA_CONV_GRP, MODEL_TENSOR.CCA_QK_NORM, MODEL_TENSOR.CCA_K_SCALE, - MODEL_TENSOR.FFN_NORM, - MODEL_TENSOR.FFN_GATE, - MODEL_TENSOR.FFN_DOWN, - MODEL_TENSOR.FFN_UP, + 
MODEL_TENSOR.CCA_VAL_PROJ1, + MODEL_TENSOR.CCA_VAL_PROJ2, + MODEL_TENSOR.RES_SCALE_HS, + MODEL_TENSOR.RES_SCALE_HS_B, + MODEL_TENSOR.RES_SCALE_RES, + MODEL_TENSOR.RES_SCALE_RES_B, + MODEL_TENSOR.ZAYA_ROUTER_DOWN, + MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, + MODEL_TENSOR.ZAYA_ROUTER_NORM, + MODEL_TENSOR.ZAYA_ROUTER_MLP0, + MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, + MODEL_TENSOR.ZAYA_ROUTER_MLP2, + MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, + MODEL_TENSOR.ZAYA_ROUTER_MLP4, + MODEL_TENSOR.ZAYA_ROUTER_BIASES, + MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, + MODEL_TENSOR.FFN_GATE_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index db99afd4cbb..fbd22ccb6a3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -107,6 +107,7 @@ class TensorNameMap: "model.transformer.ln_f", # llada "final_norm", # modern-bert "model.norm", # cogvlm + "model.final_norm", # Zaya ), # Rope frequencies @@ -300,7 +301,6 @@ class TensorNameMap: "model.transformer.blocks.{bid}.v_proj", # llada "layers.{bid}.self_attn.v_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.v_proj", # nemotron-h - "model.layers.{bid}.self_attn.qkv.val_proj1", # Zaya ), # Attention output @@ -901,19 +901,6 @@ class TensorNameMap: "model.layers.{bid}.linear_attn.in_proj_b", # qwen3.5 "model.layers.{bid}.self_attn.b_proj", # Kimi Linear ), - # ZAYA CCA - MODEL_TENSOR.CCA_CONV_DW: ( - "model.layers.{bid}.self_attn.qkv.conv_qk.0", # Zaya - ), - MODEL_TENSOR.CCA_CONV_GRP: ( - "model.layers.{bid}.self_attn.qkv.conv_qk.1", # Zaya - ), - MODEL_TENSOR.CCA_QK_NORM: ( - "model.layers.{bid}.self_attn.qk_norm", # Zaya - ), - MODEL_TENSOR.CCA_K_SCALE: ( - "model.layers.{bid}.self_attn.qkv.temp", # Zaya - ), MODEL_TENSOR.SSM_G_A: ( "model.layers.{bid}.self_attn.g_a_proj", ), diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index df91d973a3e..3bebc529300 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -419,9 +419,26 @@ static const std::map 
LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, { LLM_TENSOR_CCA_CONV_DW, "blk.%d.cca_conv_dw" }, + { LLM_TENSOR_CCA_CONV_DW_B, "blk.%d.cca_conv_dw_b" }, { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, + { LLM_TENSOR_CCA_VAL_PROJ1, "blk.%d.cca_val_proj1" }, + { LLM_TENSOR_CCA_VAL_PROJ2, "blk.%d.cca_val_proj2" }, + { LLM_TENSOR_RES_SCALE_HS, "blk.%d.res_scale_hs" }, + { LLM_TENSOR_RES_SCALE_HS_B, "blk.%d.res_scale_hs_b" }, + { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, + { LLM_TENSOR_RES_SCALE_RES_B, "blk.%d.res_scale_res_b" }, + { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, + { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, + { LLM_TENSOR_ZAYA_ROUTER_NORM, "blk.%d.zaya_router_norm" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP0, "blk.%d.zaya_router_mlp0" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "blk.%d.zaya_router_mlp2_b" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP4, "blk.%d.zaya_router_mlp4" }, + { LLM_TENSOR_ZAYA_ROUTER_BIASES, "blk.%d.zaya_router_biases" }, + { LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "blk.%d.zaya_router_eda" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, @@ -666,9 +683,26 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, // ZAYA CCA {LLM_TENSOR_CCA_CONV_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, + {LLM_TENSOR_CCA_CONV_DW_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + 
{LLM_TENSOR_CCA_VAL_PROJ1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_CCA_VAL_PROJ2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_RES_SCALE_HS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_HS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_RES_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP2_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP4, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_BIASES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index b11fa50c05f..72c5abddac1 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -447,9 +447,28 @@ enum llm_tensor { LLM_TENSOR_SSM_G_B, // kimi: output gate projection B // ZAYA CCA (Compressed Convolutional Attention) LLM_TENSOR_CCA_CONV_DW, // zaya: depthwise conv1d (conv_qk.0) + LLM_TENSOR_CCA_CONV_DW_B, // zaya: depthwise conv1d bias LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) LLM_TENSOR_CCA_K_SCALE, // zaya: 
learned K temperature + LLM_TENSOR_CCA_VAL_PROJ1, // zaya: V projection 1 + LLM_TENSOR_CCA_VAL_PROJ2, // zaya: V projection 2 + // ZAYA residual scaling + LLM_TENSOR_RES_SCALE_HS, // zaya: hidden_states_scale + LLM_TENSOR_RES_SCALE_HS_B, // zaya: hidden_states_bias + LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale + LLM_TENSOR_RES_SCALE_RES_B, // zaya: residual_bias + // ZAYA Router (MoE gating) + LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight + LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias + LLM_TENSOR_ZAYA_ROUTER_NORM, // zaya: router rmsnorm_eda weight + LLM_TENSOR_ZAYA_ROUTER_MLP0, // zaya: router MLP layer 0 weight + LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias + LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 weight + LLM_TENSOR_ZAYA_ROUTER_MLP2_B, // zaya: router MLP layer 2 bias + LLM_TENSOR_ZAYA_ROUTER_MLP4, // zaya: router MLP layer 4 weight + LLM_TENSOR_ZAYA_ROUTER_BIASES, // zaya: router balancing_biases + LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, // zaya: router router_states_scale LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-model.h b/src/llama-model.h index 8e919e15159..d9da4b318bd 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -479,10 +479,31 @@ struct llama_layer { // ZAYA CCA (Compressed Convolutional Attention) struct ggml_tensor * cca_conv_dw = nullptr; // depthwise conv (conv_qk.0) + struct ggml_tensor * cca_conv_dw_b = nullptr; // depthwise conv bias struct ggml_tensor * cca_conv_grp = nullptr; // grouped conv (conv_qk.1) struct ggml_tensor * cca_conv_grp_b = nullptr; // grouped conv bias struct ggml_tensor * cca_qk_norm = nullptr; // RMSNorm on concat(Q,K) struct ggml_tensor * cca_k_scale = nullptr; // learned K temperature + struct ggml_tensor * cca_val_proj1 = nullptr; // V projection stream 1 + struct ggml_tensor * cca_val_proj2 = nullptr; // V projection stream 2 + + // ZAYA residual scaling + struct ggml_tensor * res_scale_hs 
= nullptr; // hidden_states_scale + struct ggml_tensor * res_scale_hs_b = nullptr; // hidden_states_bias + struct ggml_tensor * res_scale_res = nullptr; // residual_scale + struct ggml_tensor * res_scale_res_b = nullptr; // residual_bias + + // ZAYA Router (MoE gating) + struct ggml_tensor * zaya_router_down = nullptr; // router down_proj + struct ggml_tensor * zaya_router_down_b = nullptr; // router down_proj bias + struct ggml_tensor * zaya_router_norm = nullptr; // router rmsnorm_eda + struct ggml_tensor * zaya_router_mlp0 = nullptr; // router MLP 0 + struct ggml_tensor * zaya_router_mlp0_b = nullptr; // router MLP 0 bias + struct ggml_tensor * zaya_router_mlp2 = nullptr; // router MLP 2 + struct ggml_tensor * zaya_router_mlp2_b = nullptr; // router MLP 2 bias + struct ggml_tensor * zaya_router_mlp4 = nullptr; // router MLP 4 + struct ggml_tensor * zaya_router_biases = nullptr; // balancing_biases + struct ggml_tensor * zaya_router_eda_scale = nullptr; // router_states_scale // DSA (deepseek sparse attention) struct ggml_tensor * indexer_k_norm = nullptr; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 0815fc1d449..a6e77bbc198 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -17,46 +17,93 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); - // output + // output norm output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); - output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, 0); - const int64_t d_conv = hparams.ssm_d_conv; + // output (tied with tok_embd if not present) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (output == nullptr) { + output = tok_embd; + } + + const int64_t n_embd_head = hparams.n_embd_head_k(); + const int64_t d_conv = hparams.ssm_d_conv; + // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B) + 
const int64_t n_ff_exp = 256; for (int i = 0; i < n_layer; ++i) { auto & layer = layers[i]; - const int64_t n_head_i = hparams.n_head(i); - const int64_t n_head_kv_i = hparams.n_head_kv(i); - const int64_t n_embd_q = n_head_i * n_embd_head_k; - const int64_t n_embd_k = n_head_kv_i * n_embd_head_k; - const int64_t n_qk = n_embd_q + n_embd_k; - const int64_t n_groups = n_head_i + n_head_kv_i; + const int64_t n_head = hparams.n_head(i); + const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_q = n_head * n_embd_head; + const int64_t n_embd_k = n_head_kv * n_embd_head; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head + n_head_kv; + const int64_t n_ff = hparams.n_ff(i); + const int64_t n_expert = hparams.n_expert; layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - // CCA projections (standard Q, K, V, O) + // CCA projections (present on all layers) layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); - layer.wv = create_tensor(tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_k}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - - // CCA conv_qk.0 (depthwise, groups = n_qk, kernel = d_conv) - layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); - // CCA conv_qk.1 (grouped, groups = n_groups, kernel = d_conv) - layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), {d_conv, n_qk / n_groups, n_qk}, 0); - layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + // CCA: V = concat(val_proj1(x), val_proj2(x)) → {n_embd_k} + layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), + {n_embd, n_embd_head}, 0); + layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), + {n_embd, n_embd_head}, 0); 
- // CCA normalization and scale - layer.cca_qk_norm = create_tensor(tn(LLM_TENSOR_CCA_QK_NORM, "weight", i), {n_qk}, 0); - layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_embd_k}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - // FFN (dense SwiGLU for now; MoE can be added later) - layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0); - layer.ffn_gate = create_tensor(tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, 0); - layer.ffn_down = create_tensor(tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, 0); - layer.ffn_up = create_tensor(tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, 0); + // CCA conv_qk.0 (depthwise, causal) + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + + // CCA conv_qk.1 (grouped, groups = n_groups) + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), + {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + + // CCA per-KV-head temperature + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + + // Residual scaling + layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); + layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B, "bias", i), {n_embd}, 0); + layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + // MoE layers (odd indices) + if (i % 2 == 1) { + // Router network + layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN, "weight", i), + {n_embd, n_ff_exp}, 0); + layer.zaya_router_down_b 
= create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_NORM, "weight", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i), + {n_ff_exp, n_ff_exp}, 0); + layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp2 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "weight", i), + {n_ff_exp, n_ff_exp}, 0); + layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp4 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i), + {n_ff_exp, n_expert + 1}, 0); + layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i), + {n_expert + 1}, TENSOR_NOT_REQUIRED); + layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i), + {n_ff_exp}, TENSOR_NOT_REQUIRED); + + // MoE experts (fused gate_up and down) + create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i), + {n_ff, n_embd, n_expert}, 0); + } } } @@ -69,6 +116,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; + const int64_t n_expert = hparams.n_expert; ggml_tensor * cur; ggml_tensor * inpL; @@ -91,117 +139,167 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * inpSA = inpL; - // norm - cur = build_norm(inpL, layer.attn_norm, NULL, LLM_NORM_RMS, il); + // Pre-norm + cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il); cb(cur, "attn_norm", il); - // --- CCA: Q, K, V projections --- - ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); - cb(Qraw, "Qraw", il); - ggml_tensor * Kraw = 
ggml_mul_mat(ctx0, layer.wk, cur); - cb(Kraw, "Kraw", il); - ggml_tensor * Vcur = ggml_mul_mat(ctx0, layer.wv, cur); - cb(Vcur, "Vcur", il); - - // --- CCA: concat Q+K for conv --- - // QK: [n_qk, n_tokens] - ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); - cb(QK, "QK_cat", il); - - // --- CCA: conv_qk.0 (depthwise, causal) --- - // Reshape for ssm_conv: [n_tokens, n_qk] -> [n_tokens, n_qk, 1] - // ssm_conv expects [seq_len, channels, batch] with state already concatenated - // For prompt processing, we left-pad with (d_conv-1) zeros for causality - { - // Left-pad QK with zeros for causal convolution - ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); // [n_tokens, n_qk] - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); // [d_conv-1 + n_tokens, n_qk] - - QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); - // ssm_conv output: [n_tokens, n_qk] - cb(QK, "QK_dw", il); + if (il % 2 == 0) { + // ===== CCA Attention ===== + + // Q, K projections + ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); + cb(Qraw, "Qraw", il); + ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); + cb(Kraw, "Kraw", il); + + // V = concat(val_proj1(x), val_proj2(x)) → [n_embd_k, n_tokens] + ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); + cb(V1, "V1", il); + ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, cur); + cb(V2, "V2", il); + ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); + cb(Vcur, "Vcur", il); + + // Concat Q+K for conv: [n_qk, n_tokens] + ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QK, "QK_cat", il); + + // conv_qk.0 (depthwise, causal) + { + ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); + 
+ QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); + if (layer.cca_conv_dw_b) { + QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b); + } + cb(QK, "QK_dw", il); + } + + // conv_qk.1 (grouped, causal) + { + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); + pad = ggml_scale(ctx0, pad, 0.0f); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); + + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); + cb(QK, "QK_grp", il); + } + + // Transpose back to [n_qk, n_tokens] + QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + + // Split Q_conv, K_conv + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, + QK->nb[1], 0); + ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, + QK->nb[1], n_embd_q * ggml_element_size(QK)); + + // QK mean skip connection + ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); + ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + // RMSNorm on concat(Q, K) — weightless (unit RMSNorm) + ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); + QK_for_norm = build_norm(QK_for_norm, nullptr, nullptr, LLM_NORM_RMS, il); + cb(QK_for_norm, "QK_normed", il); + + // Split back + Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, + QK_for_norm->nb[1], 0); + Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, + QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); + + // Per-KV-head temperature scaling on K + // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens] + Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); + // cca_k_scale: [n_head_kv] → broadcast + Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); + cb(Kcur, "Kcur_scaled", il); + + // Reshape for attention + Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Vcur = 
ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + + // GQA attention + cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf((float) n_embd_head), il); + cb(cur, "attn_out", il); + + } else { + // ===== MoE Layer ===== + + // Build Zaya router network: + // down_proj → RMSNorm → SiLU(MLP0) → MLP2 → MLP4 → 17 logits → take first 16 + + ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); + cb(router_h, "router_down", il); + + router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); + cb(router_h, "router_norm", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); + router_h = ggml_silu(ctx0, router_h); + cb(router_h, "router_mlp0", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b); + cb(router_h, "router_mlp2", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); + // router_h now has shape [17, n_tokens] — 16 expert logits + 1 MOD skip + cb(router_h, "router_logits", il); + + // Take only the first 16 logits (expert routing), ignore MOD skip (index 16) + ggml_tensor * gate_inp = ggml_view_2d(ctx0, router_h, n_expert, n_tokens, + router_h->nb[1], 0); + cb(gate_inp, "gate_inp", il); + + // MoE FFN with topk=1 (pass router logits as probs_in) + cur = build_moe_ffn(cur, + /* gate_inp */ nullptr, + /* up_exps */ nullptr, + /* gate_exps */ nullptr, + /* down_exps */ layer.ffn_down_exps, + /* exp_probs_b */ nullptr, + /* n_expert */ n_expert, + /* n_expert_used */ hparams.n_expert_used, + /* type_op */ LLM_FFN_SILU, + /* norm_w */ false, + /* w_scale */ 1.0f, + /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + /* il */ il, + /* probs_in */ gate_inp, + /* gate_up_exps */ 
layer.ffn_gate_up_exps); + cb(cur, "moe_out", il); } - // --- CCA: conv_qk.1 (grouped, causal) --- - { - // Left-pad for second causal conv - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); // [d_conv-1 + n_tokens, n_qk] - - // ggml_conv_1d_grouped expects kernel [K, IC/G, OC] and input [L, IC] - // QK_padded is [d_conv-1 + n_tokens, n_qk] which matches [L, IC] - QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); - QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); - cb(QK, "QK_grp", il); - } - - // QK is now [n_tokens, n_qk] from conv output, transpose back to [n_qk, n_tokens] - QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - - // --- CCA: split Q_conv, K_conv --- - ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, - QK->nb[1], 0); - ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, - QK->nb[1], n_embd_q * ggml_element_size(QK)); - - // --- CCA: QK mean (skip connection) --- - ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); - ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); - cb(Qcur, "Qcur", il); - cb(Kcur, "Kcur", il); - - // --- CCA: RMSNorm on concat(Q, K) --- - ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); // [n_qk, n_tokens] - QK_for_norm = build_norm(QK_for_norm, layer.cca_qk_norm, NULL, LLM_NORM_RMS, il); - cb(QK_for_norm, "QK_normed", il); - - // Split back - Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, - QK_for_norm->nb[1], 0); - Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, - QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); - - // --- CCA: K temperature scaling --- - Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); - cb(Kcur, "Kcur_scaled", il); - - // Reshape for attention: [head_dim, n_heads, n_tokens] - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, 
n_tokens); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); - - // --- GQA attention --- - cur = build_attn(inp->get_attn(), layer.wo, NULL, NULL, - Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, - 1.0f / sqrtf((float) n_embd_head), il); - cb(cur, "attn_out", il); - // select output tokens on last layer if (il == n_layer - 1 && inp_out_ids) { cur = ggml_get_rows(ctx0, cur, inp_out_ids); inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); } - // residual - ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA); - cb(ffn_inp, "ffn_inp", il); + // Residual scaling: cur = hs_scale * cur + hs_bias + cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.res_scale_hs), layer.res_scale_hs_b); + cb(cur, "scaled_out", il); - // --- FFN (dense SwiGLU) --- - cur = build_norm(ffn_inp, layer.ffn_norm, NULL, LLM_NORM_RMS, il); - cb(cur, "ffn_norm", il); - - cur = build_ffn(cur, - layer.ffn_up, NULL, NULL, - layer.ffn_gate, NULL, NULL, - layer.ffn_down, NULL, NULL, - NULL, LLM_FFN_SILU, LLM_FFN_PAR, il); - cb(cur, "ffn_out", il); + // Residual scaling: inpSA = res_scale * inpSA + res_bias (if present) + if (layer.res_scale_res) { + inpSA = ggml_add(ctx0, ggml_mul(ctx0, inpSA, layer.res_scale_res), layer.res_scale_res_b); + cb(inpSA, "scaled_residual", il); + } - // residual - cur = ggml_add(ctx0, cur, ffn_inp); + // Residual add + cur = ggml_add(ctx0, cur, inpSA); cb(cur, "l_out", il); inpL = cur; @@ -210,7 +308,7 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cur = inpL; // final norm - cur = build_norm(cur, model.output_norm, NULL, LLM_NORM_RMS, -1); + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); cb(cur, "result_norm", -1); res->t_embd = cur; From 02a9843498a8bfe3296fd2522b0ce372bb9e2e6d Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Fri, 8 May 2026 19:19:41 +0200 Subject: [PATCH 4/6] update --- convert_hf_to_gguf.py | 24 
++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 97a5889cce9..52bddd7665e 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6608,6 +6608,30 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter else: raise + def set_vocab(self): + from gguf.vocab import LlamaHfVocab + + vocab = LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("gemma4") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + self.gguf_writer.add_add_bos_token(True) + def prepare_tensors(self): super().prepare_tensors() if self._experts: From 8362c10d438261e04bb66f3c37b3631507589a8f Mon Sep 17 00:00:00 2001 From: Juste-Leo Date: Tue, 12 May 2026 00:30:59 +0200 Subject: [PATCH 5/6] add corrections --- convert_hf_to_gguf.py | 20 +++++++++++--- ggml/src/ggml.c | 6 ++--- src/models/zaya.cpp | 63 ++++++++++++++++++++++++------------------- 3 files changed, 56 insertions(+), 33 deletions(-) diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 52bddd7665e..41d150e30ac 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1183,7 +1183,7 @@ def set_gguf_parameters(self): if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) logger.info(f"gguf: rope theta swa = {local_rope_theta}") - if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: + if (f_rms_eps := 
self.find_hparam(["rms_norm_eps", "norm_eps", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: @@ -6463,6 +6463,13 @@ def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) # Buffer for accumulating expert weights per layer self._experts: dict[int, dict[str, Tensor]] | None = {} + # Pre-load tokenizer to know the vocab count for embedding trimming + self._tokenizer_vocab_size: int | None = None + try: + from gguf.vocab import LlamaHfVocab + self._tokenizer_vocab_size = LlamaHfVocab(self.dir_model).vocab_size + except Exception: + pass def set_gguf_parameters(self): super().set_gguf_parameters() @@ -6472,8 +6479,9 @@ def set_gguf_parameters(self): n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2 self.gguf_writer.add_feed_forward_length(n_ff) - # ssm_d_conv = conv_qk kernel size - self.gguf_writer.add_ssm_conv_kernel(5) + # ssm_d_conv = conv_qk kernel size (cca_time0 = first depthwise conv kernel) + cca_time0 = self.hparams.get("cca_time0", 2) + self.gguf_writer.add_ssm_conv_kernel(cca_time0) # partial_rotary_factor -> n_rot head_dim = self.hparams.get("head_dim", 128) @@ -6498,10 +6506,13 @@ def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[st elif "o_proj" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch elif "conv_qk.0" in name and name.endswith(".weight"): + # PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk} + data_torch = data_torch.squeeze(1).contiguous() yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch elif "conv_qk.0" in name and name.endswith(".bias"): yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch elif "conv_qk.1" in name and name.endswith(".weight"): + # PyTorch: 
[n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk} yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch elif "conv_qk.1" in name and name.endswith(".bias"): yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch @@ -6543,6 +6554,9 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Common tensors if name == "model.embed_tokens.weight": + # Trim embedding to match tokenizer vocab size if needed + if self._tokenizer_vocab_size is not None and data_torch.shape[0] > self._tokenizer_vocab_size: + data_torch = data_torch[:self._tokenizer_vocab_size] yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch return if name == "model.final_norm.weight": diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 049f4952047..ae1fb2fa031 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2018,9 +2018,9 @@ struct ggml_tensor * ggml_dup_inplace( static struct ggml_tensor * ggml_add_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); struct ggml_tensor * result = inplace ? 
ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index a6e77bbc198..434fa31585b 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -45,29 +45,27 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); - // CCA projections (present on all layers) - layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); - layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + // CCA attention layers (even indices only) + if (i % 2 == 0) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); - // CCA: V = concat(val_proj1(x), val_proj2(x)) → {n_embd_k} - layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), - {n_embd, n_embd_head}, 0); - layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), - {n_embd, n_embd_head}, 0); + layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), + {n_embd, n_embd_head}, 0); + layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), + {n_embd, n_embd_head}, 0); - layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); - // CCA conv_qk.0 (depthwise, causal) - layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); - layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); - // CCA conv_qk.1 (grouped, groups = 
n_groups) - layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), - {d_conv, n_qk / n_groups, n_qk}, 0); - layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), + {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); - // CCA per-KV-head temperature - layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + } // Residual scaling layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); @@ -101,7 +99,7 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { // MoE experts (fused gate_up and down) create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); - layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i), + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), {n_ff, n_embd, n_expert}, 0); } } @@ -167,30 +165,37 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params // conv_qk.0 (depthwise, causal) { ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); + // ggml_ssm_conv requires 3D input: {1 + n_tokens, n_qk, 1} + // Use view_3d on the contiguous 2D tensor to add a batch dimension + QK_t = ggml_view_3d(ctx0, QK_t, n_tokens, n_qk, 1, QK_t->nb[1], QK_t->nb[1] * n_qk, 0); + ggml_tensor * pad = ggml_new_tensor_3d(ctx0, QK_t->type, d_conv - 1, n_qk, 1); pad = ggml_scale(ctx0, pad, 0.0f); ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); + // Reshape to 2D first, then apply bias to avoid 3D broadcasting + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); 
if (layer.cca_conv_dw_b) { QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b); } cb(QK, "QK_dw", il); } - // conv_qk.1 (grouped, causal) + // conv_qk.1 (grouped, causal) — operate on {n_tokens, n_qk} format { - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK->type, d_conv - 1, n_qk); + ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK, 0); + ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); + // conv output is {OL, OC, N} -> reshape to {OC, OL}, then add bias + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); cb(QK, "QK_grp", il); } - // Transpose back to [n_qk, n_tokens] - QK = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); + // QK is now [n_qk, n_tokens] // Split Q_conv, K_conv ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, @@ -217,13 +222,16 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params // Per-KV-head temperature scaling on K // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens] + Kcur = ggml_cont(ctx0, Kcur); Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); // cca_k_scale: [n_head_kv] → broadcast Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); cb(Kcur, "Kcur_scaled", il); // Reshape for attention + Qcur = ggml_cont(ctx0, Qcur); Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); + Vcur = ggml_cont(ctx0, Vcur); Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); // GQA attention @@ -259,8 +267,9 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params cb(router_h, "router_logits", il); // Take only the first 16 logits (expert routing), ignore MOD skip (index 16) - ggml_tensor * gate_inp = 
ggml_view_2d(ctx0, router_h, n_expert, n_tokens, - router_h->nb[1], 0); + ggml_tensor * gate_inp = ggml_cont(ctx0, + ggml_view_2d(ctx0, router_h, n_expert, n_tokens, + router_h->nb[1], 0)); cb(gate_inp, "gate_inp", il); // MoE FFN with topk=1 (pass router logits as probs_in) From 109856e8fa688e9bf4453db98c687e2de85051b0 Mon Sep 17 00:00:00 2001 From: Ganesh Nanduru Date: Mon, 11 May 2026 21:42:49 -0600 Subject: [PATCH 6/6] zaya generation running --- common/debug.cpp | 14 +- convert_hf_to_gguf.py | 15 +- gguf-py/gguf/constants.py | 12 ++ src/llama-arch.cpp | 8 + src/llama-arch.h | 4 + src/llama-graph.cpp | 4 + src/llama-model.cpp | 9 +- src/llama-model.h | 6 + src/models/zaya.cpp | 312 +++++++++++++++++++++++++------------- 9 files changed, 270 insertions(+), 114 deletions(-) diff --git a/common/debug.cpp b/common/debug.cpp index 102c6924dc9..60cb5fd9b4a 100644 --- a/common/debug.cpp +++ b/common/debug.cpp @@ -144,13 +144,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (common_debug_cb_user_data *) user_data; auto * pimpl = cb_data->pimpl.get(); - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - bool matches_filter = pimpl->tensor_filters.empty(); if (!matches_filter) { @@ -162,6 +155,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } } + if (ask) { + return matches_filter; + } + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + char src1_str[128] = { 0 }; if (src1) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str()); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index 41d150e30ac..1e1adb10fe4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -6551,6 +6551,16 @@ def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tu elif 
"residual_bias" in name: yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch + def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]: + if "hidden_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch + elif "hidden_states_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch + elif "residual_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch + elif "residual_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: # Common tensors if name == "model.embed_tokens.weight": @@ -6562,6 +6572,9 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if name == "model.final_norm.weight": yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch return + if name.startswith("model.res_scale."): + yield from self._map_final_res_scale(name, data_torch) + return # Block-level tensors if bid is not None: @@ -6599,7 +6612,7 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter if len(self._experts[bid]) >= n_expert * 2: for w_name, gguf_tensor, permute_dims in [ ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None), - ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, (0, 2, 1)), + ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, None), ]: datas: list[Tensor] = [] for xid in range(n_expert): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index de599da4a0b..57a67cb559f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -622,6 +622,10 @@ class MODEL_TENSOR(IntEnum): RES_SCALE_HS_B = auto() # Zaya: hidden_states_bias RES_SCALE_RES = auto() # Zaya: residual_scale 
RES_SCALE_RES_B = auto() # Zaya: residual_bias + RES_SCALE_HS_FINAL = auto() # Zaya: final hidden_states_scale + RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias + RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale + RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias ZAYA_ROUTER_DOWN = auto() # Zaya ZAYA_ROUTER_DOWN_B = auto() # Zaya ZAYA_ROUTER_NORM = auto() # Zaya @@ -1157,6 +1161,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_B: "blk.{bid}.res_scale_hs_b", # Zaya MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya MODEL_TENSOR.RES_SCALE_RES_B: "blk.{bid}.res_scale_res_b", # Zaya + MODEL_TENSOR.RES_SCALE_HS_FINAL: "res_scale_hs", # Zaya + MODEL_TENSOR.RES_SCALE_HS_B_FINAL: "res_scale_hs_b", # Zaya + MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya + MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya MODEL_TENSOR.ZAYA_ROUTER_NORM: "blk.{bid}.zaya_router_norm", # Zaya @@ -4055,6 +4063,10 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.RES_SCALE_HS_B, MODEL_TENSOR.RES_SCALE_RES, MODEL_TENSOR.RES_SCALE_RES_B, + MODEL_TENSOR.RES_SCALE_HS_FINAL, + MODEL_TENSOR.RES_SCALE_HS_B_FINAL, + MODEL_TENSOR.RES_SCALE_RES_FINAL, + MODEL_TENSOR.RES_SCALE_RES_B_FINAL, MODEL_TENSOR.ZAYA_ROUTER_DOWN, MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, MODEL_TENSOR.ZAYA_ROUTER_NORM, diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 3bebc529300..9bdd0023028 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -429,6 +429,10 @@ static const std::map<llm_tensor, const char *> LLM_TENSOR_NAMES = { { LLM_TENSOR_RES_SCALE_HS_B, "blk.%d.res_scale_hs_b" }, { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, { LLM_TENSOR_RES_SCALE_RES_B, "blk.%d.res_scale_res_b" }, + { LLM_TENSOR_RES_SCALE_HS_FINAL, "res_scale_hs" }, + { LLM_TENSOR_RES_SCALE_HS_B_FINAL, "res_scale_hs_b" }, + { LLM_TENSOR_RES_SCALE_RES_FINAL, 
"res_scale_res" }, + { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, { LLM_TENSOR_ZAYA_ROUTER_NORM, "blk.%d.zaya_router_norm" }, @@ -693,6 +697,10 @@ static const std::map<llm_tensor, llm_tensor_info> LLM_TENSOR_INFOS = { {LLM_TENSOR_RES_SCALE_HS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_RES_SCALE_RES_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_HS_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_HS_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_RES_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, {LLM_TENSOR_ZAYA_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, diff --git a/src/llama-arch.h b/src/llama-arch.h index 72c5abddac1..30a3f9a444a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -458,6 +458,10 @@ enum llm_tensor { LLM_TENSOR_RES_SCALE_HS_B, // zaya: hidden_states_bias LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale LLM_TENSOR_RES_SCALE_RES_B, // zaya: residual_bias + LLM_TENSOR_RES_SCALE_HS_FINAL, // zaya: final hidden_states_scale + LLM_TENSOR_RES_SCALE_HS_B_FINAL,// zaya: final hidden_states_bias + LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale + LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias // ZAYA Router (MoE gating) LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index fe155c92dea..e4f0ff98ef4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1405,6 +1405,10 @@ ggml_tensor 
* llm_graph_context::build_moe_ffn( { probs = logits; // [n_expert, n_tokens] } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_NONE: + { + probs = logits; // already-normalized expert probabilities + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 656767318f2..3de55045f5c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1957,6 +1957,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, if (arch == LLM_ARCH_FALCON_H1) { filter_attn = [&](int32_t) { return true; }; filter_recr = [&](int32_t) { return true; }; + } else if (arch == LLM_ARCH_ZAYA) { + filter_attn = [&](int32_t il) { + return il % 2 == 0; + }; + filter_recr = [&](int32_t il) { + return il % 2 == 0; + }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { filter_attn = [&](int32_t il) { return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0; @@ -2208,7 +2215,6 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_NEMOTRON_H: case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_KIMI_LINEAR: - case LLM_ARCH_ZAYA: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values @@ -2311,6 +2317,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: + case LLM_ARCH_ZAYA: return LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/src/llama-model.h b/src/llama-model.h index d9da4b318bd..01ce976fe3e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -561,6 +561,12 @@ struct llama_model { struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + // Zaya final residual scaling + struct ggml_tensor * zaya_res_scale_hs = nullptr; + struct ggml_tensor * zaya_res_scale_hs_b = nullptr; + struct ggml_tensor * zaya_res_scale_res = nullptr; + struct ggml_tensor * zaya_res_scale_res_b = nullptr; + // 
classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp index 434fa31585b..89e354450bb 100644 --- a/src/models/zaya.cpp +++ b/src/models/zaya.cpp @@ -1,11 +1,23 @@ #include "models.h" #include "ggml.h" +#include "llama-memory-recurrent.h" + +#include void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + const uint32_t n_qk = (hparams.n_head() + hparams.n_head_kv()) * hparams.n_embd_head_k(); + hparams.ssm_d_inner = 2*n_qk + hparams.n_embd; // CCA conv state + delayed value stream state + hparams.ssm_d_state = 1; + hparams.ssm_n_group = 0; + + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = (i % 2) == 0; + } + switch (hparams.n_layer) { case 80: type = LLM_TYPE_8B; break; default: type = LLM_TYPE_UNKNOWN; @@ -26,6 +38,11 @@ void llama_model_zaya::load_arch_tensors(llama_model_loader &) { output = tok_embd; } + zaya_res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + const int64_t n_embd_head = hparams.n_embd_head_k(); const int64_t d_conv = hparams.ssm_d_conv; // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B) @@ -113,8 +130,14 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params : llm_graph_context(params) { const int64_t n_embd_head = hparams.n_embd_head_k(); - const int64_t d_conv = hparams.ssm_d_conv; const int64_t n_expert = hparams.n_expert; + const int64_t 
n_seqs = ubatch.n_seqs; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(n_tokens % n_seqs == 0); + + const int64_t n_seq_tokens = n_tokens / n_seqs; ggml_tensor * cur; ggml_tensor * inpL; @@ -122,8 +145,24 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params inpL = build_inp_embd(model.tok_embd); auto * inp = build_inp_mem_hybrid(); + auto * inp_recr = inp->get_recr(); + ggml_tensor * inp_pos = build_inp_pos(); ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * residual = nullptr; + ggml_tensor * prev_router = nullptr; + + const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) { + if (scale == nullptr) { + return x; + } + if (bias != nullptr) { + x = ggml_add(ctx0, x, bias); + } + x = ggml_mul(ctx0, x, scale); + cb(x, name, il); + return x; + }; for (int il = 0; il < n_layer; ++il) { const auto & layer = model.layers[il]; @@ -134,15 +173,41 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params const int64_t n_embd_k = n_head_kv * n_embd_head; const int64_t n_qk = n_embd_q + n_embd_k; const int64_t n_groups = n_head + n_head_kv; + const int64_t n_gqa = n_head / n_head_kv; - ggml_tensor * inpSA = inpL; + ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); + if (residual != nullptr) { + residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); + residual = ggml_add(ctx0, hidden_states, residual); + } else { + residual = hidden_states; + } + cb(residual, "residual", il); // Pre-norm - cur = build_norm(inpL, layer.attn_norm, nullptr, LLM_NORM_RMS, il); - cb(cur, "attn_norm", il); + cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "input_norm", il); if (il % 2 == 0) { // ===== CCA Attention ===== + const int64_t conv_state_size = 2*n_qk; + const int64_t 
cca_state_size = conv_state_size + n_embd; + GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size); + + ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il); + ggml_tensor * cca_state = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs); + cb(cca_state, "cca_state", il); + + ggml_tensor * conv_state = ggml_view_3d(ctx0, cca_state, 2, n_qk, n_seqs, + 2*ggml_element_size(cca_state), + cca_state->nb[1], + 0); + cb(conv_state, "cca_conv_state", il); + + ggml_tensor * prev_hs = ggml_view_2d(ctx0, cca_state, n_embd, n_seqs, + cca_state->nb[1], + conv_state_size*ggml_element_size(cca_state)); + cb(prev_hs, "cca_prev_hs", il); // Q, K projections ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); @@ -150,89 +215,121 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, cur); cb(Kraw, "Kraw", il); - // V = concat(val_proj1(x), val_proj2(x)) → [n_embd_k, n_tokens] + // HF uses a delayed hidden-state stream for val_proj2. During decode this + // comes from the recurrent state; during prefill it is a one-token shift. 
+ ggml_tensor * cur_state_src = ggml_cont(ctx0, cur); + ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs); + + ggml_tensor * hs_d = ggml_reshape_3d(ctx0, prev_hs, n_embd, 1, n_seqs); + if (n_seq_tokens > 1) { + ggml_tensor * cur_shift = ggml_view_3d(ctx0, cur_seq, n_embd, n_seq_tokens - 1, n_seqs, + cur_seq->nb[1], + cur_seq->nb[2], + 0); + hs_d = ggml_concat(ctx0, hs_d, cur_shift, 1); + } + hs_d = ggml_reshape_2d(ctx0, hs_d, n_embd, n_tokens); + cb(hs_d, "cca_hs_d", il); + + // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens] ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); cb(V1, "V1", il); - ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, cur); + ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d); cb(V2, "V2", il); ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); cb(Vcur, "Vcur", il); // Concat Q+K for conv: [n_qk, n_tokens] - ggml_tensor * QK = ggml_concat(ctx0, Qraw, Kraw, 0); - cb(QK, "QK_cat", il); - - // conv_qk.0 (depthwise, causal) - { - ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - // ggml_ssm_conv requires 3D input: {1 + n_tokens, n_qk, 1} - // Use view_3d on the contiguous 2D tensor to add a batch dimension - QK_t = ggml_view_3d(ctx0, QK_t, n_tokens, n_qk, 1, QK_t->nb[1], QK_t->nb[1] * n_qk, 0); - ggml_tensor * pad = ggml_new_tensor_3d(ctx0, QK_t->type, d_conv - 1, n_qk, 1); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); - - QK = ggml_ssm_conv(ctx0, QK_padded, layer.cca_conv_dw); - // Reshape to 2D first, then apply bias to avoid 3D broadcasting - QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); - if (layer.cca_conv_dw_b) { - QK = ggml_add(ctx0, QK, layer.cca_conv_dw_b); - } - cb(QK, "QK_dw", il); + ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QKraw, "QKraw", il); + + ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens); 
+ ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens); + + ggml_tensor * Kpre_grouped = ggml_reshape_4d(ctx0, Kpre, n_embd_head, 1, n_head_kv, n_tokens); + Kpre_grouped = ggml_repeat_4d(ctx0, Kpre_grouped, n_embd_head, n_gqa, n_head_kv, n_tokens); + ggml_tensor * Kpre_rep = ggml_reshape_3d(ctx0, Kpre_grouped, n_embd_head, n_head, n_tokens); + ggml_tensor * qk_mean_q = ggml_scale(ctx0, ggml_add(ctx0, Qpre, Kpre_rep), 0.5f); + cb(qk_mean_q, "qk_mean_q", il); + + ggml_tensor * Qgroup = ggml_reshape_4d(ctx0, Qpre, n_embd_head, n_gqa, n_head_kv, n_tokens); + Qgroup = ggml_permute(ctx0, Qgroup, 1, 0, 2, 3); + Qgroup = ggml_cont(ctx0, Qgroup); + ggml_tensor * Qmean = ggml_mean(ctx0, Qgroup); + Qmean = ggml_reshape_3d(ctx0, Qmean, n_embd_head, n_head_kv, n_tokens); + ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f); + cb(qk_mean_k, "qk_mean_k", il); + + ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw)); + QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs); + + ggml_tensor * conv_input = ggml_concat(ctx0, conv_state, QKraw_t, 0); + cb(conv_input, "cca_conv_input", il); + + ggml_tensor * last_conv_states = ggml_view_3d(ctx0, conv_input, 2, n_qk, n_seqs, + conv_input->nb[1], + conv_input->nb[2], + n_seq_tokens*conv_input->nb[0]); + cb(last_conv_states, "cca_last_conv_states", il); + + const auto kv_head = inp_recr->mctx->get_head(); + ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs, + cca_state_all->nb[1], + kv_head*cca_state_size*ggml_element_size(cca_state_all)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, conv_state_update_target)); + + ggml_tensor * last_hs = ggml_view_2d(ctx0, cur_seq, n_embd, n_seqs, + cur_seq->nb[2], + (n_seq_tokens - 1)*cur_seq->nb[1]); + ggml_tensor * prev_hs_update_target = ggml_view_2d(ctx0, cca_state_all, n_embd, n_seqs, + cca_state_all->nb[1], + 
(kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); + + ggml_tensor * conv_dw = layer.cca_conv_dw; + if (conv_dw->type != GGML_TYPE_F32) { + conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F32); } - - // conv_qk.1 (grouped, causal) — operate on {n_tokens, n_qk} format - { - ggml_tensor * QK_t = ggml_cont(ctx0, ggml_transpose(ctx0, QK)); - ggml_tensor * pad = ggml_new_tensor_2d(ctx0, QK_t->type, d_conv - 1, n_qk); - pad = ggml_scale(ctx0, pad, 0.0f); - ggml_tensor * QK_padded = ggml_concat(ctx0, pad, QK_t, 0); - - QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK_padded, 1, 0, 1, n_groups); - // conv output is {OL, OC, N} -> reshape to {OC, OL}, then add bias - QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); - QK = ggml_add(ctx0, QK, layer.cca_conv_grp_b); - cb(QK, "QK_grp", il); + conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk); + ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1); + if (layer.cca_conv_dw_b) { + QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1)); } + cb(QK, "QK_dw", il); - // QK is now [n_qk, n_tokens] + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1)); + cb(QK, "QK_grp", il); - // Split Q_conv, K_conv - ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, - QK->nb[1], 0); - ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, - QK->nb[1], n_embd_q * ggml_element_size(QK)); + QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); - // QK mean skip connection - ggml_tensor * Qcur = ggml_scale(ctx0, ggml_add(ctx0, Q_conv, Qraw), 0.5f); - ggml_tensor * Kcur = ggml_scale(ctx0, ggml_add(ctx0, K_conv, Kraw), 0.5f); + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0); 
+ ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, QK->nb[1], n_embd_q*ggml_element_size(QK)); + + ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens); + ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_add(ctx0, Qcur, qk_mean_q); + Kcur = ggml_add(ctx0, Kcur, qk_mean_k); + + Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head)); + Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head)); + Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1)); + cb(Qcur, "Qcur_pre_rope", il); + cb(Kcur, "Kcur_pre_rope", il); + + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); cb(Qcur, "Qcur", il); cb(Kcur, "Kcur", il); - // RMSNorm on concat(Q, K) — weightless (unit RMSNorm) - ggml_tensor * QK_for_norm = ggml_concat(ctx0, Qcur, Kcur, 0); - QK_for_norm = build_norm(QK_for_norm, nullptr, nullptr, LLM_NORM_RMS, il); - cb(QK_for_norm, "QK_normed", il); - - // Split back - Qcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_q, n_tokens, - QK_for_norm->nb[1], 0); - Kcur = ggml_view_2d(ctx0, QK_for_norm, n_embd_k, n_tokens, - QK_for_norm->nb[1], n_embd_q * ggml_element_size(QK_for_norm)); - - // Per-KV-head temperature scaling on K - // Kcur: [n_embd_k=256, n_tokens], reshape to [n_embd_head, n_head_kv, n_tokens] - Kcur = ggml_cont(ctx0, Kcur); - Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens); - // cca_k_scale: [n_head_kv] → broadcast - Kcur = ggml_mul(ctx0, Kcur, layer.cca_k_scale); - cb(Kcur, "Kcur_scaled", 
il); - - // Reshape for attention - Qcur = ggml_cont(ctx0, Qcur); - Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens); - Vcur = ggml_cont(ctx0, Vcur); - Vcur = ggml_reshape_3d(ctx0, Vcur, n_embd_head, n_head_kv, n_tokens); + Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens); // GQA attention cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, @@ -244,77 +341,82 @@ llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params // ===== MoE Layer ===== // Build Zaya router network: - // down_proj → RMSNorm → SiLU(MLP0) → MLP2 → MLP4 → 17 logits → take first 16 + // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits. ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); cb(router_h, "router_down", il); + if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { + router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); + cb(router_h, "router_eda", il); + } + + prev_router = router_h; + router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); cb(router_h, "router_norm", il); router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); - router_h = ggml_silu(ctx0, router_h); + router_h = ggml_gelu(ctx0, router_h); cb(router_h, "router_mlp0", il); router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h); router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b); + router_h = ggml_gelu(ctx0, router_h); cb(router_h, "router_mlp2", il); router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); - // router_h now has shape [17, n_tokens] — 16 expert logits + 1 MOD skip cb(router_h, "router_logits", il); - // Take only the first 16 logits (expert routing), ignore MOD skip (index 16) - ggml_tensor * gate_inp = ggml_cont(ctx0, - ggml_view_2d(ctx0, 
router_h, n_expert, n_tokens, - router_h->nb[1], 0)); - cb(gate_inp, "gate_inp", il); + ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h); + cb(router_probs, "router_probs", il); + + // Keep the MOD skip expert in the softmax denominator, then route + // over real experts only. The checkpoint's skip bias keeps MOD unused. + ggml_tensor * gate_probs = ggml_cont(ctx0, + ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); + cb(gate_probs, "gate_probs", il); + + ggml_tensor * expert_biases = nullptr; + if (layer.zaya_router_biases != nullptr) { + expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0); + } - // MoE FFN with topk=1 (pass router logits as probs_in) cur = build_moe_ffn(cur, /* gate_inp */ nullptr, /* up_exps */ nullptr, /* gate_exps */ nullptr, /* down_exps */ layer.ffn_down_exps, - /* exp_probs_b */ nullptr, + /* exp_probs_b */ expert_biases, /* n_expert */ n_expert, /* n_expert_used */ hparams.n_expert_used, /* type_op */ LLM_FFN_SILU, /* norm_w */ false, /* w_scale */ 1.0f, - /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX, + /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_NONE, /* il */ il, - /* probs_in */ gate_inp, + /* probs_in */ gate_probs, /* gate_up_exps */ layer.ffn_gate_up_exps); cb(cur, "moe_out", il); } - // select output tokens on last layer - if (il == n_layer - 1 && inp_out_ids) { - cur = ggml_get_rows(ctx0, cur, inp_out_ids); - inpSA = ggml_get_rows(ctx0, inpSA, inp_out_ids); - } - - // Residual scaling: cur = hs_scale * cur + hs_bias - cur = ggml_add(ctx0, ggml_mul(ctx0, cur, layer.res_scale_hs), layer.res_scale_hs_b); - cb(cur, "scaled_out", il); - - // Residual scaling: inpSA = res_scale * inpSA + res_bias (if present) - if (layer.res_scale_res) { - inpSA = ggml_add(ctx0, ggml_mul(ctx0, inpSA, layer.res_scale_res), layer.res_scale_res_b); - cb(inpSA, "scaled_residual", il); - } - - // Residual add - cur = ggml_add(ctx0, cur, inpSA); - cb(cur, "l_out", il); - inpL = cur; } - 
cur = inpL; + ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); + if (residual != nullptr) { + residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); + cur = ggml_add(ctx0, final_hidden, residual); + } else { + cur = final_hidden; + } + cb(cur, "final_residual", -1); + + if (inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } // final norm cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1);