diff --git a/common/debug.cpp b/common/debug.cpp index 102c6924dc9..60cb5fd9b4a 100644 --- a/common/debug.cpp +++ b/common/debug.cpp @@ -144,13 +144,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { auto * cb_data = (common_debug_cb_user_data *) user_data; auto * pimpl = cb_data->pimpl.get(); - const struct ggml_tensor * src0 = t->src[0]; - const struct ggml_tensor * src1 = t->src[1]; - - if (ask) { - return true; // Always retrieve data - } - bool matches_filter = pimpl->tensor_filters.empty(); if (!matches_filter) { @@ -162,6 +155,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) { } } + if (ask) { + return matches_filter; + } + + const struct ggml_tensor * src0 = t->src[0]; + const struct ggml_tensor * src1 = t->src[1]; + char src1_str[128] = { 0 }; if (src1) { snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str()); diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py index fb1f5dd4473..1e1adb10fe4 100755 --- a/convert_hf_to_gguf.py +++ b/convert_hf_to_gguf.py @@ -1183,7 +1183,7 @@ def set_gguf_parameters(self): if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None: self.gguf_writer.add_rope_freq_base_swa(local_rope_theta) logger.info(f"gguf: rope theta swa = {local_rope_theta}") - if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None: + if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps", "norm_epsilon"], optional=True)) is not None: self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps) logger.info(f"gguf: rms norm epsilon = {f_rms_eps}") if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None: @@ -6454,6 +6454,219 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter yield from super().modify_tensors(data_torch, name, bid) +@ModelBase.register("ZayaModel", 
"ZayaForCausalLM") +class ZayaModel(TextModel): + """Zaya-1 model with Compressed Convolutional Attention and MoE""" + model_arch = gguf.MODEL_ARCH.ZAYA + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # Buffer for accumulating expert weights per layer + self._experts: dict[int, dict[str, Tensor]] | None = {} + # Pre-load tokenizer to know the vocab count for embedding trimming + self._tokenizer_vocab_size: int | None = None + try: + from gguf.vocab import LlamaHfVocab + self._tokenizer_vocab_size = LlamaHfVocab(self.dir_model).vocab_size + except Exception: + pass + + def set_gguf_parameters(self): + super().set_gguf_parameters() + self.gguf_writer.add_vocab_size(self.hparams["vocab_size"]) + + # n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate) + n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2 + self.gguf_writer.add_feed_forward_length(n_ff) + + # ssm_d_conv = conv_qk kernel size (cca_time0 = first depthwise conv kernel) + cca_time0 = self.hparams.get("cca_time0", 2) + self.gguf_writer.add_ssm_conv_kernel(cca_time0) + + # partial_rotary_factor -> n_rot + head_dim = self.hparams.get("head_dim", 128) + partial_rotary = self.hparams.get("partial_rotary_factor", 0.5) + self.gguf_writer.add_rope_dimension_count(int(partial_rotary * head_dim)) + + # MoE params + n_expert = self.find_hparam(["num_experts"]) + self.gguf_writer.add_expert_count(n_expert) + n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1 + self.gguf_writer.add_expert_used_count(n_expert_used) + + def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "linear_q" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch + elif "linear_k" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch + elif "val_proj1" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ1, bid), data_torch + elif "val_proj2" 
in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ2, bid), data_torch + elif "o_proj" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch + elif "conv_qk.0" in name and name.endswith(".weight"): + # PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk} + data_torch = data_torch.squeeze(1).contiguous() + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch + elif "conv_qk.0" in name and name.endswith(".bias"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch + elif "conv_qk.1" in name and name.endswith(".weight"): + # PyTorch: [n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk} + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch + elif "conv_qk.1" in name and name.endswith(".bias"): + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch + elif "temp" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_K_SCALE, bid), data_torch + + def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "down_proj.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN, bid), data_torch + elif "down_proj.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch + elif "rmsnorm_eda" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_NORM, bid), data_torch + elif "router_mlp.0.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch + elif "router_mlp.0.bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch + elif "router_mlp.2.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch + elif "router_mlp.2.bias" in name: + yield 
self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, bid, suffix=".bias"), data_torch + elif "router_mlp.4.weight" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch + elif "balancing_biases" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_BIASES, bid), data_torch + elif "router_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, bid), data_torch + + def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]: + if "hidden_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch + elif "hidden_states_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B, bid, suffix=".bias"), data_torch + elif "residual_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch + elif "residual_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch + + def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]: + if "hidden_states_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch + elif "hidden_states_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch + elif "residual_scale" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch + elif "residual_bias" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch + + def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]: + # Common tensors + if name == "model.embed_tokens.weight": + # Trim embedding to match tokenizer vocab size if needed + if self._tokenizer_vocab_size is not None and data_torch.shape[0] > 
self._tokenizer_vocab_size: + data_torch = data_torch[:self._tokenizer_vocab_size] + yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch + return + if name == "model.final_norm.weight": + yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch + return + if name.startswith("model.res_scale."): + yield from self._map_final_res_scale(name, data_torch) + return + + # Block-level tensors + if bid is not None: + # CCA attention tensors + if "self_attn" in name: + yield from self._map_cca(name, data_torch, bid) + return + + # Router tensors + if "router" in name: + yield from self._map_router(name, data_torch, bid) + return + + # Input norm + if "input_norm" in name: + yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, bid), data_torch + return + + # Residual scaling + if "res_scale" in name: + yield from self._map_res_scale(name, data_torch, bid) + return + + # Expert stacking + if "zaya_block.experts" in name: + assert bid is not None + if self._experts is None: + self._experts = {} + if bid not in self._experts: + self._experts[bid] = {} + self._experts[bid][name] = data_torch + + n_expert = self.find_hparam(["num_experts"]) + # Each layer has 2 expert weights per expert (fc1, fc2) = 2 * n_expert tensors + if len(self._experts[bid]) >= n_expert * 2: + for w_name, gguf_tensor, permute_dims in [ + ("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None), + ("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, None), + ]: + datas: list[Tensor] = [] + for xid in range(n_expert): + ename = f"model.layers.{bid}.zaya_block.experts.local_experts.{xid}.{w_name}.weight" + datas.append(self._experts[bid][ename]) + del self._experts[bid][ename] + data_torch_stacked = torch.stack(datas, dim=0) + if permute_dims is not None: + data_torch_stacked = data_torch_stacked.permute(*permute_dims) + yield self.format_tensor_name(gguf_tensor, bid), data_torch_stacked + del self._experts[bid] + return + + # Fallback for any remaining tensors: use 
tensor_mapping + try: + yield from super().modify_tensors(data_torch, name, bid) + except ValueError as e: + if "Can not map tensor" in str(e): + logger.warning(f"Skipping unmapped tensor: {name}") + else: + raise + + def set_vocab(self): + from gguf.vocab import LlamaHfVocab + + vocab = LlamaHfVocab(self.dir_model) + tokens = [] + scores = [] + toktypes = [] + for text, score, toktype in vocab.all_tokens(): + tokens.append(text) + scores.append(score) + toktypes.append(toktype) + + assert len(tokens) == vocab.vocab_size + + self.gguf_writer.add_tokenizer_model("gemma4") + self.gguf_writer.add_token_list(tokens) + self.gguf_writer.add_token_scores(scores) + self.gguf_writer.add_token_types(toktypes) + + special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True) + special_vocab.add_to_gguf(self.gguf_writer) + self.gguf_writer.add_add_space_prefix(False) + self.gguf_writer.add_add_bos_token(True) + + def prepare_tensors(self): + super().prepare_tensors() + if self._experts: + unprocessed = [k for d in self._experts.values() for k in d.keys()] + if unprocessed: + raise ValueError(f"Unprocessed expert tensors: {unprocessed}") + + @ModelBase.register("InternLM2ForCausalLM") class InternLM2Model(TextModel): model_arch = gguf.MODEL_ARCH.INTERNLM2 diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h index 3357a0d9985..fec0287ae00 100644 --- a/ggml/include/ggml.h +++ b/ggml/include/ggml.h @@ -2041,6 +2041,21 @@ extern "C" { int s0, // stride int d0); // dilation + // grouped 1D convolution + // a: [K, IC/G, OC] convolution kernel + // b: [L, IC, N] data + // groups must divide both IC and OC evenly + // when groups == 1, equivalent to ggml_conv_1d + // when groups == IC == OC, equivalent to ggml_conv_1d_dw + GGML_API struct ggml_tensor * ggml_conv_1d_grouped( + struct ggml_context * ctx, + struct ggml_tensor * a, // convolution kernel + struct ggml_tensor * b, // data + int s0, // stride + int p0, // padding + int d0, // dilation + int groups); // number of groups + 
GGML_API struct ggml_tensor * ggml_conv_transpose_1d( struct ggml_context * ctx, struct ggml_tensor * a, // convolution kernel diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c index 191cf2fa106..ae1fb2fa031 100644 --- a/ggml/src/ggml.c +++ b/ggml/src/ggml.c @@ -2018,9 +2018,9 @@ struct ggml_tensor * ggml_dup_inplace( static struct ggml_tensor * ggml_add_impl( struct ggml_context * ctx, - struct ggml_tensor * a, - struct ggml_tensor * b, - bool inplace) { + struct ggml_tensor * a, + struct ggml_tensor * b, + bool inplace) { GGML_ASSERT(ggml_can_repeat(b, a)); struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); @@ -4541,6 +4541,63 @@ struct ggml_tensor * ggml_conv_1d_dw_ph( return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0); } +// ggml_conv_1d_grouped + +struct ggml_tensor * ggml_conv_1d_grouped( + struct ggml_context * ctx, + struct ggml_tensor * a, + struct ggml_tensor * b, + int s0, + int p0, + int d0, + int groups) { + GGML_ASSERT(groups > 0); + + const int64_t OC = a->ne[2]; // total output channels + const int64_t IC_G = a->ne[1]; // input channels per group (kernel dim) + const int64_t IC = b->ne[1]; // total input channels + + GGML_ASSERT(IC % groups == 0); + GGML_ASSERT(OC % groups == 0); + GGML_ASSERT(IC_G == IC / groups); + + // degenerate cases: fall back to existing implementations + if (groups == 1) { + return ggml_conv_1d(ctx, a, b, s0, p0, d0); + } + if (groups == IC && groups == OC) { + return ggml_conv_1d_dw(ctx, a, b, s0, p0, d0); + } + + const int64_t OC_G = OC / groups; + + struct ggml_tensor * result = NULL; + + for (int g = 0; g < groups; g++) { + // slice kernel for group g: [K, IC_G, OC_G] + struct ggml_tensor * a_g = ggml_view_3d(ctx, a, + a->ne[0], IC_G, OC_G, + a->nb[1], a->nb[2], + g * OC_G * a->nb[2]); + + // slice input for group g: [L, IC_G, N] + struct ggml_tensor * b_g = ggml_view_3d(ctx, b, + b->ne[0], IC_G, b->ne[2], + b->nb[1], b->nb[2], + g * IC_G * b->nb[1]); + + struct ggml_tensor * 
out_g = ggml_conv_1d(ctx, a_g, b_g, s0, p0, d0); + + if (result == NULL) { + result = out_g; + } else { + result = ggml_concat(ctx, result, out_g, 1); + } + } + + return result; +} + // ggml_conv_transpose_1d static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) { diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 308ebe1f4a1..57a67cb559f 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -503,6 +503,7 @@ class MODEL_ARCH(IntEnum): LLAMA_EMBED = auto() MAINCODER = auto() KIMI_LINEAR = auto() + ZAYA = auto() class VISION_PROJECTOR_TYPE(IntEnum): @@ -610,6 +611,31 @@ class MODEL_TENSOR(IntEnum): SSM_BETA = auto() # Kimi Linear qwen3.5 SSM_G_A = auto() # Kimi Linear SSM_G_B = auto() # Kimi Linear + CCA_CONV_DW = auto() # Zaya + CCA_CONV_GRP = auto() # Zaya + CCA_CONV_DW_B = auto() # Zaya: conv_qk.0.bias + CCA_QK_NORM = auto() # Zaya (weightless - unit RMSNorm) + CCA_K_SCALE = auto() # Zaya + CCA_VAL_PROJ1 = auto() # Zaya: CCA value projection stream 1 + CCA_VAL_PROJ2 = auto() # Zaya: CCA value projection stream 2 + RES_SCALE_HS = auto() # Zaya: hidden_states_scale + RES_SCALE_HS_B = auto() # Zaya: hidden_states_bias + RES_SCALE_RES = auto() # Zaya: residual_scale + RES_SCALE_RES_B = auto() # Zaya: residual_bias + RES_SCALE_HS_FINAL = auto() # Zaya: final hidden_states_scale + RES_SCALE_HS_B_FINAL = auto() # Zaya: final hidden_states_bias + RES_SCALE_RES_FINAL = auto() # Zaya: final residual_scale + RES_SCALE_RES_B_FINAL = auto() # Zaya: final residual_bias + ZAYA_ROUTER_DOWN = auto() # Zaya + ZAYA_ROUTER_DOWN_B = auto() # Zaya + ZAYA_ROUTER_NORM = auto() # Zaya + ZAYA_ROUTER_MLP0 = auto() # Zaya + ZAYA_ROUTER_MLP0_B = auto() # Zaya + ZAYA_ROUTER_MLP2 = auto() # Zaya + ZAYA_ROUTER_MLP2_B = auto() # Zaya + ZAYA_ROUTER_MLP4 = auto() # Zaya + ZAYA_ROUTER_BIASES = auto() # Zaya + ZAYA_ROUTER_EDA_SCALE = auto() # Zaya TIME_MIX_W0 = auto() TIME_MIX_W1 = auto() TIME_MIX_W2 = auto() @@ 
-1018,6 +1044,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.LLAMA_EMBED: "llama-embed", MODEL_ARCH.MAINCODER: "maincoder", MODEL_ARCH.KIMI_LINEAR: "kimi-linear", + MODEL_ARCH.ZAYA: "zaya", } VISION_PROJECTOR_TYPE_NAMES: dict[VISION_PROJECTOR_TYPE, str] = { @@ -1123,6 +1150,31 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.SSM_BETA: "blk.{bid}.ssm_beta", # Kimi Linear qwen3.5 MODEL_TENSOR.SSM_G_A: "blk.{bid}.ssm_g_a", # Kimi Linear MODEL_TENSOR.SSM_G_B: "blk.{bid}.ssm_g_b", # Kimi Linear + MODEL_TENSOR.CCA_CONV_DW: "blk.{bid}.cca_conv_dw", # Zaya + MODEL_TENSOR.CCA_CONV_DW_B: "blk.{bid}.cca_conv_dw_b", # Zaya + MODEL_TENSOR.CCA_CONV_GRP: "blk.{bid}.cca_conv_grp", # Zaya + MODEL_TENSOR.CCA_QK_NORM: "blk.{bid}.cca_qk_norm", # Zaya + MODEL_TENSOR.CCA_K_SCALE: "blk.{bid}.cca_k_scale", # Zaya + MODEL_TENSOR.CCA_VAL_PROJ1: "blk.{bid}.cca_val_proj1", # Zaya + MODEL_TENSOR.CCA_VAL_PROJ2: "blk.{bid}.cca_val_proj2", # Zaya + MODEL_TENSOR.RES_SCALE_HS: "blk.{bid}.res_scale_hs", # Zaya + MODEL_TENSOR.RES_SCALE_HS_B: "blk.{bid}.res_scale_hs_b", # Zaya + MODEL_TENSOR.RES_SCALE_RES: "blk.{bid}.res_scale_res", # Zaya + MODEL_TENSOR.RES_SCALE_RES_B: "blk.{bid}.res_scale_res_b", # Zaya + MODEL_TENSOR.RES_SCALE_HS_FINAL: "res_scale_hs", # Zaya + MODEL_TENSOR.RES_SCALE_HS_B_FINAL: "res_scale_hs_b", # Zaya + MODEL_TENSOR.RES_SCALE_RES_FINAL: "res_scale_res", # Zaya + MODEL_TENSOR.RES_SCALE_RES_B_FINAL: "res_scale_res_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_DOWN: "blk.{bid}.zaya_router_down", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_DOWN_B: "blk.{bid}.zaya_router_down_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_NORM: "blk.{bid}.zaya_router_norm", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP0: "blk.{bid}.zaya_router_mlp0", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP0_B: "blk.{bid}.zaya_router_mlp0_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP2: "blk.{bid}.zaya_router_mlp2", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP2_B: "blk.{bid}.zaya_router_mlp2_b", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_MLP4: "blk.{bid}.zaya_router_mlp4", # Zaya 
+ MODEL_TENSOR.ZAYA_ROUTER_BIASES: "blk.{bid}.zaya_router_biases", # Zaya + MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE: "blk.{bid}.zaya_router_eda", # Zaya MODEL_TENSOR.TIME_MIX_W0: "blk.{bid}.time_mix_w0", MODEL_TENSOR.TIME_MIX_W1: "blk.{bid}.time_mix_w1", MODEL_TENSOR.TIME_MIX_W2: "blk.{bid}.time_mix_w2", @@ -3992,6 +4044,42 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_SHEXP, MODEL_TENSOR.FFN_UP_SHEXP, ], + MODEL_ARCH.ZAYA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.ATTN_Q, + MODEL_TENSOR.ATTN_K, + MODEL_TENSOR.ATTN_OUT, + MODEL_TENSOR.CCA_CONV_DW, + MODEL_TENSOR.CCA_CONV_DW_B, + MODEL_TENSOR.CCA_CONV_GRP, + MODEL_TENSOR.CCA_QK_NORM, + MODEL_TENSOR.CCA_K_SCALE, + MODEL_TENSOR.CCA_VAL_PROJ1, + MODEL_TENSOR.CCA_VAL_PROJ2, + MODEL_TENSOR.RES_SCALE_HS, + MODEL_TENSOR.RES_SCALE_HS_B, + MODEL_TENSOR.RES_SCALE_RES, + MODEL_TENSOR.RES_SCALE_RES_B, + MODEL_TENSOR.RES_SCALE_HS_FINAL, + MODEL_TENSOR.RES_SCALE_HS_B_FINAL, + MODEL_TENSOR.RES_SCALE_RES_FINAL, + MODEL_TENSOR.RES_SCALE_RES_B_FINAL, + MODEL_TENSOR.ZAYA_ROUTER_DOWN, + MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, + MODEL_TENSOR.ZAYA_ROUTER_NORM, + MODEL_TENSOR.ZAYA_ROUTER_MLP0, + MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, + MODEL_TENSOR.ZAYA_ROUTER_MLP2, + MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, + MODEL_TENSOR.ZAYA_ROUTER_MLP4, + MODEL_TENSOR.ZAYA_ROUTER_BIASES, + MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, + MODEL_TENSOR.FFN_GATE_UP_EXP, + MODEL_TENSOR.FFN_DOWN_EXP, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index f27f0e4c997..fbd22ccb6a3 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -107,6 +107,7 @@ class TensorNameMap: "model.transformer.ln_f", # llada "final_norm", # modern-bert "model.norm", # cogvlm + "model.final_norm", # Zaya ), # Rope frequencies @@ -259,6 +260,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.q_proj", # llada "layers.{bid}.self_attn.q_proj", # 
qwen3-embedding "backbone.layers.{bid}.mixer.q_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.linear_q", # Zaya ), # Attention key @@ -279,6 +281,7 @@ class TensorNameMap: "model.transformer.blocks.{bid}.k_proj", # llada "layers.{bid}.self_attn.k_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.k_proj", # nemotron-h + "model.layers.{bid}.self_attn.qkv.linear_k", # Zaya ), # Attention value @@ -336,6 +339,7 @@ class TensorNameMap: "layers.{bid}.self_attn.o_proj", # qwen3-embedding "backbone.layers.{bid}.mixer.o_proj", # nemotron-h "model.layers.{bid}.self_attn.language_expert_dense", # cogvlm + "model.layers.{bid}.self_attn.o_proj", # Zaya ), # Attention output norm @@ -854,6 +858,12 @@ class TensorNameMap: "backbone.layers.{bid}.mixer.norm", # mamba2 "model.layers.{bid}.self_attn.o_norm", # kimi ), + MODEL_TENSOR.ATTN_NORM: ( + "model.layers.{bid}.input_layernorm", + "model.layers.{bid}.ln_1", + "model.layers.{bid}.norm1", + "model.layers.{bid}.input_norm", # Zaya + ), MODEL_TENSOR.SSM_OUT: ( "model.layers.{bid}.out_proj", # mamba-hf diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp index 59dde99e362..9bdd0023028 100644 --- a/src/llama-arch.cpp +++ b/src/llama-arch.cpp @@ -133,6 +133,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_LLAMA_EMBED, "llama-embed" }, { LLM_ARCH_MAINCODER, "maincoder" }, { LLM_ARCH_KIMI_LINEAR, "kimi-linear" }, + { LLM_ARCH_ZAYA, "zaya" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -417,6 +418,31 @@ static const std::map LLM_TENSOR_NAMES = { { LLM_TENSOR_SSM_BETA, "blk.%d.ssm_beta" }, { LLM_TENSOR_SSM_G_A, "blk.%d.ssm_g_a" }, { LLM_TENSOR_SSM_G_B, "blk.%d.ssm_g_b" }, + { LLM_TENSOR_CCA_CONV_DW, "blk.%d.cca_conv_dw" }, + { LLM_TENSOR_CCA_CONV_DW_B, "blk.%d.cca_conv_dw_b" }, + { LLM_TENSOR_CCA_CONV_GRP, "blk.%d.cca_conv_grp" }, + { LLM_TENSOR_CCA_QK_NORM, "blk.%d.cca_qk_norm" }, + { LLM_TENSOR_CCA_K_SCALE, "blk.%d.cca_k_scale" }, + { LLM_TENSOR_CCA_VAL_PROJ1, "blk.%d.cca_val_proj1" }, + { LLM_TENSOR_CCA_VAL_PROJ2, 
"blk.%d.cca_val_proj2" }, + { LLM_TENSOR_RES_SCALE_HS, "blk.%d.res_scale_hs" }, + { LLM_TENSOR_RES_SCALE_HS_B, "blk.%d.res_scale_hs_b" }, + { LLM_TENSOR_RES_SCALE_RES, "blk.%d.res_scale_res" }, + { LLM_TENSOR_RES_SCALE_RES_B, "blk.%d.res_scale_res_b" }, + { LLM_TENSOR_RES_SCALE_HS_FINAL, "res_scale_hs" }, + { LLM_TENSOR_RES_SCALE_HS_B_FINAL, "res_scale_hs_b" }, + { LLM_TENSOR_RES_SCALE_RES_FINAL, "res_scale_res" }, + { LLM_TENSOR_RES_SCALE_RES_B_FINAL, "res_scale_res_b" }, + { LLM_TENSOR_ZAYA_ROUTER_DOWN, "blk.%d.zaya_router_down" }, + { LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "blk.%d.zaya_router_down_b" }, + { LLM_TENSOR_ZAYA_ROUTER_NORM, "blk.%d.zaya_router_norm" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP0, "blk.%d.zaya_router_mlp0" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "blk.%d.zaya_router_mlp0_b" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP2, "blk.%d.zaya_router_mlp2" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "blk.%d.zaya_router_mlp2_b" }, + { LLM_TENSOR_ZAYA_ROUTER_MLP4, "blk.%d.zaya_router_mlp4" }, + { LLM_TENSOR_ZAYA_ROUTER_BIASES, "blk.%d.zaya_router_biases" }, + { LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "blk.%d.zaya_router_eda" }, { LLM_TENSOR_SSM_NORM, "blk.%d.ssm_norm" }, { LLM_TENSOR_ATTN_Q_A_NORM, "blk.%d.attn_q_a_norm" }, { LLM_TENSOR_ATTN_KV_A_NORM, "blk.%d.attn_kv_a_norm" }, @@ -659,6 +685,32 @@ static const std::map LLM_TENSOR_INFOS = { {LLM_TENSOR_SSM_BETA, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_A, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, {LLM_TENSOR_SSM_G_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + // ZAYA CCA + {LLM_TENSOR_CCA_CONV_DW, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_SSM_CONV}}, + {LLM_TENSOR_CCA_CONV_DW_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_CCA_CONV_GRP, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_QK_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_K_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_CCA_VAL_PROJ1, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + 
{LLM_TENSOR_CCA_VAL_PROJ2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_RES_SCALE_HS, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_HS_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_RES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_RES_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_HS_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_HS_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, + {LLM_TENSOR_RES_SCALE_RES_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_MUL}}, + {LLM_TENSOR_RES_SCALE_RES_B_FINAL, {LLM_TENSOR_LAYER_OUTPUT, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_DOWN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_DOWN_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_NORM, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP0, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP0_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP2, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP2_B, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_MLP4, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL_MAT}}, + {LLM_TENSOR_ZAYA_ROUTER_BIASES, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_ADD}}, + {LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LERP_X, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_TIME_MIX_LN, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, {LLM_TENSOR_CHANNEL_MIX_LERP_K, {LLM_TENSOR_LAYER_REPEATING, GGML_OP_MUL}}, @@ -857,6 +909,7 @@ bool llm_arch_is_hybrid(const llm_arch & arch) { case LLM_ARCH_NEMOTRON_H_MOE: case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: case LLM_ARCH_QWEN35: case LLM_ARCH_QWEN35MOE: return true; @@ -902,6 +955,7 @@ bool llm_arch_supports_sm_tensor(const llm_arch & arch) { case 
LLM_ARCH_MINIMAX_M2: case LLM_ARCH_MISTRAL4: case LLM_ARCH_KIMI_LINEAR: + case LLM_ARCH_ZAYA: return false; default: return true; diff --git a/src/llama-arch.h b/src/llama-arch.h index e37d548c98e..30a3f9a444a 100644 --- a/src/llama-arch.h +++ b/src/llama-arch.h @@ -137,6 +137,7 @@ enum llm_arch { LLM_ARCH_LLAMA_EMBED, LLM_ARCH_MAINCODER, LLM_ARCH_KIMI_LINEAR, + LLM_ARCH_ZAYA, LLM_ARCH_UNKNOWN, }; @@ -444,6 +445,34 @@ enum llm_tensor { LLM_TENSOR_SSM_BETA, // kimi: beta mixing coefficient and qwen3.5 LLM_TENSOR_SSM_G_A, // kimi: output gate projection A LLM_TENSOR_SSM_G_B, // kimi: output gate projection B + // ZAYA CCA (Compressed Convolutional Attention) + LLM_TENSOR_CCA_CONV_DW, // zaya: depthwise conv1d (conv_qk.0) + LLM_TENSOR_CCA_CONV_DW_B, // zaya: depthwise conv1d bias + LLM_TENSOR_CCA_CONV_GRP, // zaya: grouped conv1d (conv_qk.1) + LLM_TENSOR_CCA_QK_NORM, // zaya: RMSNorm on concat(Q,K) + LLM_TENSOR_CCA_K_SCALE, // zaya: learned K temperature + LLM_TENSOR_CCA_VAL_PROJ1, // zaya: V projection 1 + LLM_TENSOR_CCA_VAL_PROJ2, // zaya: V projection 2 + // ZAYA residual scaling + LLM_TENSOR_RES_SCALE_HS, // zaya: hidden_states_scale + LLM_TENSOR_RES_SCALE_HS_B, // zaya: hidden_states_bias + LLM_TENSOR_RES_SCALE_RES, // zaya: residual_scale + LLM_TENSOR_RES_SCALE_RES_B, // zaya: residual_bias + LLM_TENSOR_RES_SCALE_HS_FINAL, // zaya: final hidden_states_scale + LLM_TENSOR_RES_SCALE_HS_B_FINAL,// zaya: final hidden_states_bias + LLM_TENSOR_RES_SCALE_RES_FINAL, // zaya: final residual_scale + LLM_TENSOR_RES_SCALE_RES_B_FINAL,// zaya: final residual_bias + // ZAYA Router (MoE gating) + LLM_TENSOR_ZAYA_ROUTER_DOWN, // zaya: router down_proj weight + LLM_TENSOR_ZAYA_ROUTER_DOWN_B, // zaya: router down_proj bias + LLM_TENSOR_ZAYA_ROUTER_NORM, // zaya: router rmsnorm_eda weight + LLM_TENSOR_ZAYA_ROUTER_MLP0, // zaya: router MLP layer 0 weight + LLM_TENSOR_ZAYA_ROUTER_MLP0_B, // zaya: router MLP layer 0 bias + LLM_TENSOR_ZAYA_ROUTER_MLP2, // zaya: router MLP layer 2 
weight + LLM_TENSOR_ZAYA_ROUTER_MLP2_B, // zaya: router MLP layer 2 bias + LLM_TENSOR_ZAYA_ROUTER_MLP4, // zaya: router MLP layer 4 weight + LLM_TENSOR_ZAYA_ROUTER_BIASES, // zaya: router balancing_biases + LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, // zaya: router router_states_scale LLM_TENSOR_TIME_MIX_W0, LLM_TENSOR_TIME_MIX_W1, LLM_TENSOR_TIME_MIX_W2, diff --git a/src/llama-graph.cpp b/src/llama-graph.cpp index fe155c92dea..e4f0ff98ef4 100644 --- a/src/llama-graph.cpp +++ b/src/llama-graph.cpp @@ -1405,6 +1405,10 @@ ggml_tensor * llm_graph_context::build_moe_ffn( { probs = logits; // [n_expert, n_tokens] } break; + case LLAMA_EXPERT_GATING_FUNC_TYPE_NONE: + { + probs = logits; // already-normalized expert probabilities + } break; default: GGML_ABORT("fatal error"); } diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 9d011ff3464..3de55045f5c 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -282,6 +282,8 @@ static llama_model * llama_model_mapping(llm_arch arch, const llama_model_params return new llama_model_mimo2(params); case LLM_ARCH_KIMI_LINEAR: return new llama_model_kimi_linear(params); + case LLM_ARCH_ZAYA: + return new llama_model_zaya(params); case LLM_ARCH_STEP35: return new llama_model_step35(params); default: @@ -1955,6 +1957,13 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params, if (arch == LLM_ARCH_FALCON_H1) { filter_attn = [&](int32_t) { return true; }; filter_recr = [&](int32_t) { return true; }; + } else if (arch == LLM_ARCH_ZAYA) { + filter_attn = [&](int32_t il) { + return il % 2 == 0; + }; + filter_recr = [&](int32_t il) { + return il % 2 == 0; + }; } else if (arch == LLM_ARCH_NEMOTRON_H || arch == LLM_ARCH_NEMOTRON_H_MOE) { filter_attn = [&](int32_t il) { return !hparams.is_recurrent(il) && hparams.n_ff(il) == 0; @@ -2308,6 +2317,7 @@ llama_rope_type llama_model_rope_type(const llama_model * model) { case LLM_ARCH_QWEN3NEXT: case LLM_ARCH_MIMO2: case LLM_ARCH_STEP35: + case LLM_ARCH_ZAYA: return 
LLAMA_ROPE_TYPE_NEOX; case LLM_ARCH_QWEN2VL: diff --git a/src/llama-model.h b/src/llama-model.h index d63c689185a..01ce976fe3e 100644 --- a/src/llama-model.h +++ b/src/llama-model.h @@ -477,6 +477,34 @@ struct llama_layer { struct ggml_tensor * ssm_g_b = nullptr; struct ggml_tensor * ssm_o_norm = nullptr; + // ZAYA CCA (Compressed Convolutional Attention) + struct ggml_tensor * cca_conv_dw = nullptr; // depthwise conv (conv_qk.0) + struct ggml_tensor * cca_conv_dw_b = nullptr; // depthwise conv bias + struct ggml_tensor * cca_conv_grp = nullptr; // grouped conv (conv_qk.1) + struct ggml_tensor * cca_conv_grp_b = nullptr; // grouped conv bias + struct ggml_tensor * cca_qk_norm = nullptr; // RMSNorm on concat(Q,K) + struct ggml_tensor * cca_k_scale = nullptr; // learned K temperature + struct ggml_tensor * cca_val_proj1 = nullptr; // V projection stream 1 + struct ggml_tensor * cca_val_proj2 = nullptr; // V projection stream 2 + + // ZAYA residual scaling + struct ggml_tensor * res_scale_hs = nullptr; // hidden_states_scale + struct ggml_tensor * res_scale_hs_b = nullptr; // hidden_states_bias + struct ggml_tensor * res_scale_res = nullptr; // residual_scale + struct ggml_tensor * res_scale_res_b = nullptr; // residual_bias + + // ZAYA Router (MoE gating) + struct ggml_tensor * zaya_router_down = nullptr; // router down_proj + struct ggml_tensor * zaya_router_down_b = nullptr; // router down_proj bias + struct ggml_tensor * zaya_router_norm = nullptr; // router rmsnorm_eda + struct ggml_tensor * zaya_router_mlp0 = nullptr; // router MLP 0 + struct ggml_tensor * zaya_router_mlp0_b = nullptr; // router MLP 0 bias + struct ggml_tensor * zaya_router_mlp2 = nullptr; // router MLP 2 + struct ggml_tensor * zaya_router_mlp2_b = nullptr; // router MLP 2 bias + struct ggml_tensor * zaya_router_mlp4 = nullptr; // router MLP 4 + struct ggml_tensor * zaya_router_biases = nullptr; // balancing_biases + struct ggml_tensor * zaya_router_eda_scale = nullptr; // router_states_scale + 
// DSA (deepseek sparse attention) struct ggml_tensor * indexer_k_norm = nullptr; struct ggml_tensor * indexer_k_norm_b = nullptr; @@ -533,6 +561,12 @@ struct llama_model { struct ggml_tensor * output_b = nullptr; struct ggml_tensor * output_norm_enc = nullptr; + // Zaya final residual scaling + struct ggml_tensor * zaya_res_scale_hs = nullptr; + struct ggml_tensor * zaya_res_scale_hs_b = nullptr; + struct ggml_tensor * zaya_res_scale_res = nullptr; + struct ggml_tensor * zaya_res_scale_res_b = nullptr; + // classifier struct ggml_tensor * cls = nullptr; struct ggml_tensor * cls_b = nullptr; diff --git a/src/models/models.h b/src/models/models.h index 6d5f18a8e20..507f903104b 100644 --- a/src/models/models.h +++ b/src/models/models.h @@ -1858,3 +1858,16 @@ struct llama_model_step35 : public llama_model_base { std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; }; + + +struct llama_model_zaya : public llama_model_base { + llama_model_zaya(const struct llama_model_params & params) : llama_model_base(params) {} + void load_arch_hparams(llama_model_loader & ml) override; + void load_arch_tensors(llama_model_loader & ml) override; + + struct graph : public llm_graph_context { + graph(const llama_model & model, const llm_graph_params & params); + }; + + std::unique_ptr build_arch_graph(const llm_graph_params & params) const override; +}; diff --git a/src/models/zaya.cpp b/src/models/zaya.cpp new file mode 100644 index 00000000000..89e354450bb --- /dev/null +++ b/src/models/zaya.cpp @@ -0,0 +1,432 @@ +#include "models.h" + +#include "ggml.h" +#include "llama-memory-recurrent.h" + +#include + +void llama_model_zaya::load_arch_hparams(llama_model_loader & ml) { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + + const uint32_t n_qk = (hparams.n_head() + hparams.n_head_kv()) * hparams.n_embd_head_k(); + hparams.ssm_d_inner = 2*n_qk + hparams.n_embd; // CCA conv 
state + delayed value stream state + hparams.ssm_d_state = 1; + hparams.ssm_n_group = 0; + + for (uint32_t i = 0; i < hparams.n_layer; ++i) { + hparams.recurrent_layer_arr[i] = (i % 2) == 0; + } + + switch (hparams.n_layer) { + case 80: type = LLM_TYPE_8B; break; + default: type = LLM_TYPE_UNKNOWN; + } +} + +void llama_model_zaya::load_arch_tensors(llama_model_loader &) { + LLAMA_LOAD_LOCALS; + + tok_embd = create_tensor(tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, 0); + + // output norm + output_norm = create_tensor(tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, 0); + + // output (tied with tok_embd if not present) + output = create_tensor(tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, TENSOR_NOT_REQUIRED); + if (output == nullptr) { + output = tok_embd; + } + + zaya_res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_FINAL, "weight"), {n_embd}, TENSOR_NOT_REQUIRED); + zaya_res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B_FINAL, "bias"), {n_embd}, TENSOR_NOT_REQUIRED); + + const int64_t n_embd_head = hparams.n_embd_head_k(); + const int64_t d_conv = hparams.ssm_d_conv; + // Router MLP hidden size (zaya_mlp_expansion = 256 for ZAYA1-8B) + const int64_t n_ff_exp = 256; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = layers[i]; + + const int64_t n_head = hparams.n_head(i); + const int64_t n_head_kv = hparams.n_head_kv(i); + const int64_t n_embd_q = n_head * n_embd_head; + const int64_t n_embd_k = n_head_kv * n_embd_head; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head + n_head_kv; + const int64_t n_ff = hparams.n_ff(i); + const int64_t n_expert = hparams.n_expert; + + layer.attn_norm = create_tensor(tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, 0); + + // CCA 
attention layers (even indices only) + if (i % 2 == 0) { + layer.wq = create_tensor(tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd_q}, 0); + layer.wk = create_tensor(tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_k}, 0); + + layer.cca_val_proj1 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ1, "weight", i), + {n_embd, n_embd_head}, 0); + layer.cca_val_proj2 = create_tensor(tn(LLM_TENSOR_CCA_VAL_PROJ2, "weight", i), + {n_embd, n_embd_head}, 0); + + layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd_q, n_embd}, 0); + + layer.cca_conv_dw = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW, "weight", i), {d_conv, n_qk}, 0); + layer.cca_conv_dw_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_DW_B, "bias", i), {n_qk}, TENSOR_NOT_REQUIRED); + + layer.cca_conv_grp = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "weight", i), + {d_conv, n_qk / n_groups, n_qk}, 0); + layer.cca_conv_grp_b = create_tensor(tn(LLM_TENSOR_CCA_CONV_GRP, "bias", i), {n_qk}, 0); + + layer.cca_k_scale = create_tensor(tn(LLM_TENSOR_CCA_K_SCALE, "weight", i), {n_head_kv}, 0); + } + + // Residual scaling + layer.res_scale_hs = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS, "weight", i), {n_embd}, 0); + layer.res_scale_hs_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_HS_B, "bias", i), {n_embd}, 0); + layer.res_scale_res = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES, "weight", i), {n_embd}, TENSOR_NOT_REQUIRED); + layer.res_scale_res_b = create_tensor(tn(LLM_TENSOR_RES_SCALE_RES_B, "bias", i), {n_embd}, TENSOR_NOT_REQUIRED); + + // MoE layers (odd indices) + if (i % 2 == 1) { + // Router network + layer.zaya_router_down = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN, "weight", i), + {n_embd, n_ff_exp}, 0); + layer.zaya_router_down_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_DOWN_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_norm = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_NORM, "weight", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp0 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0, "weight", i), + {n_ff_exp, 
n_ff_exp}, 0); + layer.zaya_router_mlp0_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP0_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp2 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2, "weight", i), + {n_ff_exp, n_ff_exp}, 0); + layer.zaya_router_mlp2_b = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP2_B, "bias", i), + {n_ff_exp}, 0); + layer.zaya_router_mlp4 = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_MLP4, "weight", i), + {n_ff_exp, n_expert + 1}, 0); + layer.zaya_router_biases = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_BIASES, "weight", i), + {n_expert + 1}, TENSOR_NOT_REQUIRED); + layer.zaya_router_eda_scale = create_tensor(tn(LLM_TENSOR_ZAYA_ROUTER_EDA_SCALE, "weight", i), + {n_ff_exp}, TENSOR_NOT_REQUIRED); + + // MoE experts (fused gate_up and down) + create_tensor_gate_up_exps(layer, i, n_embd, n_ff, n_expert, 0); + layer.ffn_down_exps = create_tensor(tn(LLM_TENSOR_FFN_DOWN_EXPS, "weight", i), + {n_ff, n_embd, n_expert}, 0); + } + } +} + +std::unique_ptr llama_model_zaya::build_arch_graph(const llm_graph_params & params) const { + return std::make_unique(*this, params); +} + +llama_model_zaya::graph::graph(const llama_model & model, const llm_graph_params & params) + : llm_graph_context(params) { + + const int64_t n_embd_head = hparams.n_embd_head_k(); + const int64_t n_expert = hparams.n_expert; + const int64_t n_seqs = ubatch.n_seqs; + + GGML_ASSERT(n_seqs != 0); + GGML_ASSERT(ubatch.equal_seqs()); + GGML_ASSERT(n_tokens % n_seqs == 0); + + const int64_t n_seq_tokens = n_tokens / n_seqs; + + ggml_tensor * cur; + ggml_tensor * inpL; + + inpL = build_inp_embd(model.tok_embd); + + auto * inp = build_inp_mem_hybrid(); + auto * inp_recr = inp->get_recr(); + + ggml_tensor * inp_pos = build_inp_pos(); + ggml_tensor * inp_out_ids = build_inp_out_ids(); + ggml_tensor * residual = nullptr; + ggml_tensor * prev_router = nullptr; + + const auto apply_res_scale = [&](ggml_tensor * x, ggml_tensor * scale, ggml_tensor * bias, const char * name, int il) { + if (scale 
== nullptr) { + return x; + } + if (bias != nullptr) { + x = ggml_add(ctx0, x, bias); + } + x = ggml_mul(ctx0, x, scale); + cb(x, name, il); + return x; + }; + + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers[il]; + + const int64_t n_head = hparams.n_head(il); + const int64_t n_head_kv = hparams.n_head_kv(il); + const int64_t n_embd_q = n_head * n_embd_head; + const int64_t n_embd_k = n_head_kv * n_embd_head; + const int64_t n_qk = n_embd_q + n_embd_k; + const int64_t n_groups = n_head + n_head_kv; + const int64_t n_gqa = n_head / n_head_kv; + + ggml_tensor * hidden_states = apply_res_scale(inpL, layer.res_scale_hs, layer.res_scale_hs_b, "res_scale_hs", il); + if (residual != nullptr) { + residual = apply_res_scale(residual, layer.res_scale_res, layer.res_scale_res_b, "res_scale_res", il); + residual = ggml_add(ctx0, hidden_states, residual); + } else { + residual = hidden_states; + } + cb(residual, "residual", il); + + // Pre-norm + cur = build_norm(residual, layer.attn_norm, nullptr, LLM_NORM_RMS, il); + cb(cur, "input_norm", il); + + if (il % 2 == 0) { + // ===== CCA Attention ===== + const int64_t conv_state_size = 2*n_qk; + const int64_t cca_state_size = conv_state_size + n_embd; + GGML_ASSERT((int64_t) hparams.n_embd_s() == cca_state_size); + + ggml_tensor * cca_state_all = inp_recr->mctx->get_s_l(il); + ggml_tensor * cca_state = build_rs(inp_recr, cca_state_all, hparams.n_embd_s(), n_seqs); + cb(cca_state, "cca_state", il); + + ggml_tensor * conv_state = ggml_view_3d(ctx0, cca_state, 2, n_qk, n_seqs, + 2*ggml_element_size(cca_state), + cca_state->nb[1], + 0); + cb(conv_state, "cca_conv_state", il); + + ggml_tensor * prev_hs = ggml_view_2d(ctx0, cca_state, n_embd, n_seqs, + cca_state->nb[1], + conv_state_size*ggml_element_size(cca_state)); + cb(prev_hs, "cca_prev_hs", il); + + // Q, K projections + ggml_tensor * Qraw = ggml_mul_mat(ctx0, layer.wq, cur); + cb(Qraw, "Qraw", il); + ggml_tensor * Kraw = ggml_mul_mat(ctx0, layer.wk, 
cur); + cb(Kraw, "Kraw", il); + + // HF uses a delayed hidden-state stream for val_proj2. During decode this + // comes from the recurrent state; during prefill it is a one-token shift. + ggml_tensor * cur_state_src = ggml_cont(ctx0, cur); + ggml_tensor * cur_seq = ggml_reshape_3d(ctx0, cur_state_src, n_embd, n_seq_tokens, n_seqs); + + ggml_tensor * hs_d = ggml_reshape_3d(ctx0, prev_hs, n_embd, 1, n_seqs); + if (n_seq_tokens > 1) { + ggml_tensor * cur_shift = ggml_view_3d(ctx0, cur_seq, n_embd, n_seq_tokens - 1, n_seqs, + cur_seq->nb[1], + cur_seq->nb[2], + 0); + hs_d = ggml_concat(ctx0, hs_d, cur_shift, 1); + } + hs_d = ggml_reshape_2d(ctx0, hs_d, n_embd, n_tokens); + cb(hs_d, "cca_hs_d", il); + + // V = concat(val_proj1(x), val_proj2(x delayed)) -> [n_embd_k, n_tokens] + ggml_tensor * V1 = ggml_mul_mat(ctx0, layer.cca_val_proj1, cur); + cb(V1, "V1", il); + ggml_tensor * V2 = ggml_mul_mat(ctx0, layer.cca_val_proj2, hs_d); + cb(V2, "V2", il); + ggml_tensor * Vcur = ggml_concat(ctx0, V1, V2, 0); + cb(Vcur, "Vcur", il); + + // Concat Q+K for conv: [n_qk, n_tokens] + ggml_tensor * QKraw = ggml_concat(ctx0, Qraw, Kraw, 0); + cb(QKraw, "QKraw", il); + + ggml_tensor * Qpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Qraw), n_embd_head, n_head, n_tokens); + ggml_tensor * Kpre = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Kraw), n_embd_head, n_head_kv, n_tokens); + + ggml_tensor * Kpre_grouped = ggml_reshape_4d(ctx0, Kpre, n_embd_head, 1, n_head_kv, n_tokens); + Kpre_grouped = ggml_repeat_4d(ctx0, Kpre_grouped, n_embd_head, n_gqa, n_head_kv, n_tokens); + ggml_tensor * Kpre_rep = ggml_reshape_3d(ctx0, Kpre_grouped, n_embd_head, n_head, n_tokens); + ggml_tensor * qk_mean_q = ggml_scale(ctx0, ggml_add(ctx0, Qpre, Kpre_rep), 0.5f); + cb(qk_mean_q, "qk_mean_q", il); + + ggml_tensor * Qgroup = ggml_reshape_4d(ctx0, Qpre, n_embd_head, n_gqa, n_head_kv, n_tokens); + Qgroup = ggml_permute(ctx0, Qgroup, 1, 0, 2, 3); + Qgroup = ggml_cont(ctx0, Qgroup); + ggml_tensor * Qmean = ggml_mean(ctx0, 
Qgroup); + Qmean = ggml_reshape_3d(ctx0, Qmean, n_embd_head, n_head_kv, n_tokens); + ggml_tensor * qk_mean_k = ggml_scale(ctx0, ggml_add(ctx0, Qmean, Kpre), 0.5f); + cb(qk_mean_k, "qk_mean_k", il); + + ggml_tensor * QKraw_t = ggml_cont(ctx0, ggml_transpose(ctx0, QKraw)); + QKraw_t = ggml_reshape_3d(ctx0, QKraw_t, n_seq_tokens, n_qk, n_seqs); + + ggml_tensor * conv_input = ggml_concat(ctx0, conv_state, QKraw_t, 0); + cb(conv_input, "cca_conv_input", il); + + ggml_tensor * last_conv_states = ggml_view_3d(ctx0, conv_input, 2, n_qk, n_seqs, + conv_input->nb[1], + conv_input->nb[2], + n_seq_tokens*conv_input->nb[0]); + cb(last_conv_states, "cca_last_conv_states", il); + + const auto kv_head = inp_recr->mctx->get_head(); + ggml_tensor * conv_state_update_target = ggml_view_2d(ctx0, cca_state_all, conv_state_size, n_seqs, + cca_state_all->nb[1], + kv_head*cca_state_size*ggml_element_size(cca_state_all)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_conv_states, conv_state_update_target)); + + ggml_tensor * last_hs = ggml_view_2d(ctx0, cur_seq, n_embd, n_seqs, + cur_seq->nb[2], + (n_seq_tokens - 1)*cur_seq->nb[1]); + ggml_tensor * prev_hs_update_target = ggml_view_2d(ctx0, cca_state_all, n_embd, n_seqs, + cca_state_all->nb[1], + (kv_head*cca_state_size + conv_state_size)*ggml_element_size(cca_state_all)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, last_hs, prev_hs_update_target)); + + ggml_tensor * conv_dw = layer.cca_conv_dw; + if (conv_dw->type != GGML_TYPE_F32) { + conv_dw = ggml_cast(ctx0, conv_dw, GGML_TYPE_F32); + } + conv_dw = ggml_reshape_3d(ctx0, conv_dw, conv_dw->ne[0], 1, n_qk); + ggml_tensor * QK = ggml_conv_1d_dw(ctx0, conv_dw, conv_input, 1, 0, 1); + if (layer.cca_conv_dw_b) { + QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_dw_b, 1, n_qk, 1)); + } + cb(QK, "QK_dw", il); + + QK = ggml_conv_1d_grouped(ctx0, layer.cca_conv_grp, QK, 1, 0, 1, n_groups); + QK = ggml_add(ctx0, QK, ggml_reshape_3d(ctx0, layer.cca_conv_grp_b, 1, n_qk, 1)); + 
cb(QK, "QK_grp", il); + + QK = ggml_cont(ctx0, ggml_permute(ctx0, QK, 1, 0, 2, 3)); + QK = ggml_reshape_2d(ctx0, QK, n_qk, n_tokens); + + ggml_tensor * Q_conv = ggml_view_2d(ctx0, QK, n_embd_q, n_tokens, QK->nb[1], 0); + ggml_tensor * K_conv = ggml_view_2d(ctx0, QK, n_embd_k, n_tokens, QK->nb[1], n_embd_q*ggml_element_size(QK)); + + ggml_tensor * Qcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Q_conv), n_embd_head, n_head, n_tokens); + ggml_tensor * Kcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, K_conv), n_embd_head, n_head_kv, n_tokens); + + Qcur = ggml_add(ctx0, Qcur, qk_mean_q); + Kcur = ggml_add(ctx0, Kcur, qk_mean_k); + + Qcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Qcur, 1e-12f), sqrtf((float) n_embd_head)); + Kcur = ggml_scale(ctx0, ggml_l2_norm(ctx0, Kcur, 1e-12f), sqrtf((float) n_embd_head)); + Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.cca_k_scale, 1, n_head_kv, 1)); + cb(Qcur, "Qcur_pre_rope", il); + cb(Kcur, "Kcur_pre_rope", il); + + ggml_tensor * rope_factors = model.get_rope_factors(cparams, il); + Qcur = ggml_rope_ext(ctx0, Qcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + Kcur = ggml_rope_ext(ctx0, Kcur, inp_pos, rope_factors, + n_rot, rope_type, n_ctx_orig, freq_base, freq_scale, + ext_factor, attn_factor, beta_fast, beta_slow); + cb(Qcur, "Qcur", il); + cb(Kcur, "Kcur", il); + + Vcur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, Vcur), n_embd_head, n_head_kv, n_tokens); + + // GQA attention + cur = build_attn(inp->get_attn(), layer.wo, nullptr, nullptr, + Qcur, Kcur, Vcur, nullptr, nullptr, nullptr, + 1.0f / sqrtf((float) n_embd_head), il); + cb(cur, "attn_out", il); + + } else { + // ===== MoE Layer ===== + + // Build Zaya router network: + // down_proj -> optional EDA -> RMSNorm -> GELU MLP -> 17 logits. 
+ + ggml_tensor * router_h = ggml_mul_mat(ctx0, layer.zaya_router_down, cur); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_down_b); + cb(router_h, "router_down", il); + + if (prev_router != nullptr && layer.zaya_router_eda_scale != nullptr) { + router_h = ggml_add(ctx0, router_h, ggml_mul(ctx0, prev_router, layer.zaya_router_eda_scale)); + cb(router_h, "router_eda", il); + } + + prev_router = router_h; + + router_h = build_norm(router_h, layer.zaya_router_norm, nullptr, LLM_NORM_RMS, il); + cb(router_h, "router_norm", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp0, router_h); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp0_b); + router_h = ggml_gelu(ctx0, router_h); + cb(router_h, "router_mlp0", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp2, router_h); + router_h = ggml_add(ctx0, router_h, layer.zaya_router_mlp2_b); + router_h = ggml_gelu(ctx0, router_h); + cb(router_h, "router_mlp2", il); + + router_h = ggml_mul_mat(ctx0, layer.zaya_router_mlp4, router_h); + cb(router_h, "router_logits", il); + + ggml_tensor * router_probs = ggml_soft_max(ctx0, router_h); + cb(router_probs, "router_probs", il); + + // Keep the MOD skip expert in the softmax denominator, then route + // over real experts only. The checkpoint's skip bias keeps MOD unused. 
+ ggml_tensor * gate_probs = ggml_cont(ctx0, + ggml_view_2d(ctx0, router_probs, n_expert, n_tokens, router_probs->nb[1], 0)); + cb(gate_probs, "gate_probs", il); + + ggml_tensor * expert_biases = nullptr; + if (layer.zaya_router_biases != nullptr) { + expert_biases = ggml_view_1d(ctx0, layer.zaya_router_biases, n_expert, 0); + } + + cur = build_moe_ffn(cur, + /* gate_inp */ nullptr, + /* up_exps */ nullptr, + /* gate_exps */ nullptr, + /* down_exps */ layer.ffn_down_exps, + /* exp_probs_b */ expert_biases, + /* n_expert */ n_expert, + /* n_expert_used */ hparams.n_expert_used, + /* type_op */ LLM_FFN_SILU, + /* norm_w */ false, + /* w_scale */ 1.0f, + /* gating_op */ LLAMA_EXPERT_GATING_FUNC_TYPE_NONE, + /* il */ il, + /* probs_in */ gate_probs, + /* gate_up_exps */ layer.ffn_gate_up_exps); + cb(cur, "moe_out", il); + } + + inpL = cur; + } + + ggml_tensor * final_hidden = apply_res_scale(inpL, model.zaya_res_scale_hs, model.zaya_res_scale_hs_b, "final_res_scale_hs", -1); + if (residual != nullptr) { + residual = apply_res_scale(residual, model.zaya_res_scale_res, model.zaya_res_scale_res_b, "final_res_scale_res", -1); + cur = ggml_add(ctx0, final_hidden, residual); + } else { + cur = final_hidden; + } + cb(cur, "final_residual", -1); + + if (inp_out_ids) { + cur = ggml_get_rows(ctx0, cur, inp_out_ids); + } + + // final norm + cur = build_norm(cur, model.output_norm, nullptr, LLM_NORM_RMS, -1); + cb(cur, "result_norm", -1); + res->t_embd = cur; + + // output + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + res->t_logits = cur; + + ggml_build_forward_expand(gf, cur); +} diff --git a/tests/test-conv-1d-grouped.cpp b/tests/test-conv-1d-grouped.cpp new file mode 100644 index 00000000000..80b884804ec --- /dev/null +++ b/tests/test-conv-1d-grouped.cpp @@ -0,0 +1,154 @@ +// Test for ggml_conv_1d_grouped +// +// Verifies grouped 1D convolution by comparing against manual per-group computation. 
+ +#include "ggml.h" +#include "ggml-backend.h" +#include "ggml-cpu.h" + +#include +#include +#include +#include +#include + +static void fill_random_f16(ggml_fp16_t * data, int n) { + for (int i = 0; i < n; i++) { + float v = ((float)rand() / RAND_MAX) * 2.0f - 1.0f; + data[i] = ggml_fp32_to_fp16(v); + } +} + +static void fill_random_f32(float * data, int n) { + for (int i = 0; i < n; i++) { + data[i] = ((float)rand() / RAND_MAX) * 2.0f - 1.0f; + } +} + +static bool all_close(const float * a, const float * b, int n, float eps = 5e-3f) { + for (int i = 0; i < n; i++) { + if (fabsf(a[i] - b[i]) > eps) { + fprintf(stderr, " mismatch at [%d]: %.6f vs %.6f (diff=%.6f)\n", + i, a[i], b[i], fabsf(a[i] - b[i])); + return false; + } + } + return true; +} + +// Compute grouped conv1d on CPU naively for reference +// kernel (F16): [K, IC_G, OC], input (F32): [L, IC, N], output: [OL, OC, N] +static void conv1d_grouped_ref( + const ggml_fp16_t * kernel, const float * input, float * output, + int K, int IC, int OC, int L, int N, int groups, int stride, int padding) { + int IC_G = IC / groups; + int OC_G = OC / groups; + int OL = (L + 2 * padding - K) / stride + 1; + + memset(output, 0, (size_t)OL * OC * N * sizeof(float)); + + for (int n = 0; n < N; n++) { + for (int g = 0; g < groups; g++) { + for (int oc = 0; oc < OC_G; oc++) { + int oc_global = g * OC_G + oc; + for (int ol = 0; ol < OL; ol++) { + float sum = 0.0f; + for (int ic = 0; ic < IC_G; ic++) { + for (int k = 0; k < K; k++) { + int il = ol * stride + k - padding; + if (il >= 0 && il < L) { + int ic_global = g * IC_G + ic; + // kernel: [K, IC_G, OC] -> k + ic * K + oc_global * (IC_G * K) + float w = ggml_fp16_to_fp32(kernel[k + ic * K + oc_global * (IC_G * K)]); + // input: [L, IC, N] -> il + ic_global * L + n * (IC * L) + float x = input[il + ic_global * L + n * (IC * L)]; + sum += w * x; + } + } + } + // output: [OL, OC, N] -> ol + oc_global * OL + n * (OC * OL) + output[ol + oc_global * OL + n * (OC * OL)] = sum; + 
} + } + } + } +} + +static bool run_test(const char * label, int IC, int OC, int K, int L, int groups, int stride, int padding) { + printf(" TEST: %s (IC=%d OC=%d K=%d L=%d G=%d s=%d p=%d)\n", + label, IC, OC, K, L, groups, stride, padding); + + int IC_G = IC / groups; + int OL = (L + 2 * padding - K) / stride + 1; + + size_t ctx_size = 256 * 1024 * 1024; + struct ggml_init_params params = { + /*.mem_size =*/ ctx_size, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ false, + }; + struct ggml_context * ctx = ggml_init(params); + + // kernel: [K, IC_G, OC] in F16 (like real models) + struct ggml_tensor * a = ggml_new_tensor_3d(ctx, GGML_TYPE_F16, K, IC_G, OC); + // input: [L, IC] in F32 + struct ggml_tensor * b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, L, IC); + + fill_random_f16((ggml_fp16_t *)a->data, K * IC_G * OC); + fill_random_f32((float *)b->data, L * IC); + + // reference + std::vector ref(OL * OC); + conv1d_grouped_ref((ggml_fp16_t *)a->data, (float *)b->data, ref.data(), + K, IC, OC, L, 1, groups, stride, padding); + + // ggml + struct ggml_tensor * result = ggml_conv_1d_grouped(ctx, a, b, stride, padding, 1, groups); + + struct ggml_cgraph * gf = ggml_new_graph(ctx); + ggml_build_forward_expand(gf, result); + + ggml_backend_t backend = ggml_backend_cpu_init(); + ggml_backend_graph_compute(backend, gf); + + bool ok = true; + + if (result->ne[0] != OL || result->ne[1] != OC) { + fprintf(stderr, " FAIL: shape [%lld, %lld], expected [%d, %d]\n", + (long long)result->ne[0], (long long)result->ne[1], OL, OC); + ok = false; + } + + if (ok) { + ok = all_close((float *)result->data, ref.data(), OL * OC); + } + + printf(" %s\n", ok ? 
"PASS" : "FAIL"); + + ggml_backend_free(backend); + ggml_free(ctx); + return ok; +} + +int main(void) { + srand(42); + + printf("Testing ggml_conv_1d_grouped\n\n"); + + int n_pass = 0, n_fail = 0; + + auto check = [&](const char * label, int IC, int OC, int K, int L, int G, int s, int p) { + if (run_test(label, IC, OC, K, L, G, s, p)) { n_pass++; } else { n_fail++; } + }; + + check("groups=1 (standard conv1d)", 128, 256, 3, 32, 1, 1, 0); + check("ZAYA1-8B exact params", 1280, 1280, 2, 16, 10, 1, 0); + check("small 2 groups", 4, 4, 2, 8, 2, 1, 0); + check("with padding", 8, 8, 2, 16, 4, 1, 1); + check("IC != OC", 12, 6, 3, 10, 3, 1, 0); + check("stride=2", 8, 8, 2, 16, 4, 2, 0); + check("longer sequence", 1280, 1280, 2, 128, 10, 1, 0); + + printf("\nResult: %d passed, %d failed\n", n_pass, n_fail); + return n_fail > 0 ? 1 : 0; +}