Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
99e5d03
ops: add Conv1dGrouped operation
Juste-Leo2 May 8, 2026
e0ac753
initial implementation
Juste-Leo2 May 8, 2026
7cc554a
implementation checkpoint
Juste-Leo2 May 8, 2026
02a9843
update
Juste-Leo2 May 8, 2026
8362c10
add corrections
Juste-Leo2 May 11, 2026
109856e
zaya generation running
May 12, 2026
c3ff41c
refactor: replace CCA_CONV_DW with generic SSM_CONV1D constant
Juste-Leo2 May 15, 2026
a06da04
refactor: replace ZAYA_ROUTER_NORM with generic FFN_NORM constant
Juste-Leo2 May 15, 2026
ed3820b
refactor: replace ZAYA_ROUTER_DOWN with generic FFN_GATE_INP constant
Juste-Leo2 May 15, 2026
7de270f
refactor: replace ZAYA_ROUTER_MLP0 with generic FFN_GATE constant
Juste-Leo2 May 15, 2026
a5c885b
refactor: merge RES_SCALE_*_B bias constants into RES_SCALE_* constants
Juste-Leo2 May 15, 2026
45bf021
refactor: merge router bias constants into parent constants
Juste-Leo2 May 15, 2026
fede4c6
zaya: remove unused CCA_QK_NORM tensor constant
Juste-Leo2 May 15, 2026
2069583
zaya: remove dead ZAYA_ROUTER_MLP2 mapping from non-block config
Juste-Leo2 May 15, 2026
356e962
zaya: revert unrelated debug.cpp changes
Juste-Leo2 May 15, 2026
81d727f
zaya: replace hardcoded n_ff_exp with GGUF metadata
Juste-Leo2 May 15, 2026
45d7881
zaya: fix val_proj dimensions to use n_embd_k / 2 instead of n_embd_head
Juste-Leo2 May 15, 2026
800fbe8
quant: exclude Zaya cca_conv_grp tensors from quantization
Juste-Leo2 May 16, 2026
f2efd8c
zaya: cast conv kernels to F16 for CPU backend compatibility
Juste-Leo2 May 16, 2026
3aaab7f
zaya: add ggml_cont for ROCm/compatibility with non-contiguous tensors
Juste-Leo2 May 16, 2026
c8d3a6c
zaya: fix server compatibility with batched inference
Juste-Leo2 May 17, 2026
7c5cc53
fix(zaya): use actual tokenizer vocab size instead of config vocab_size
Juste-Leo2 May 18, 2026
f1bd772
docs(zaya): add Python reference comments to C++ implementation
Juste-Leo2 May 21, 2026
2234dab
fix(zaya): gate EDA with layer check matching Python use_eda logic
Juste-Leo2 May 21, 2026
1fc4581
feat(zaya): add zaya_high_prec for FP32 output logits matching Python…
Juste-Leo2 May 21, 2026
0f37ace
zaya.cpp: fix comment reference to MOD skip expert handling
Juste-Leo2 May 21, 2026
6b6700a
zaya: add cca_mask input tensor for CCA padding masking
May 22, 2026
9aaef94
zaya: cast residual to F32 before addition (residual_in_fp32)
May 22, 2026
abe9e40
cleanup: revert debugs commits
Juste-Leo2 May 22, 2026
6fad5d8
Revert "docs(zaya): add Python reference comments to C++ implementation"
Juste-Leo2 May 22, 2026
82f8f15
ggml/zaya: fix precision loss in conv_1d and support BF16
Juste-Leo2 May 22, 2026
894ffd4
zaya: add il != 1 check for EDA to match python reference
Juste-Leo2 May 25, 2026
1a7582b
zaya: compute residual in fp32 to match config
Juste-Leo2 May 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
221 changes: 220 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1183,7 +1183,7 @@ def set_gguf_parameters(self):
if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
logger.info(f"gguf: rope theta swa = {local_rope_theta}")
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
Expand Down Expand Up @@ -6454,6 +6454,225 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("ZayaModel", "ZayaForCausalLM")
class ZayaModel(TextModel):
"""Zaya-1 model with Compressed Convolutional Attention and MoE"""
# Converter for checkpoints registered as "ZayaModel" / "ZayaForCausalLM".
# It writes Zaya-specific GGUF metadata, renames CCA / router / residual-scale
# tensors by substring match on the source tensor name, and stacks the
# per-expert weights of each layer into single tensors.
model_arch = gguf.MODEL_ARCH.ZAYA

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Buffer for accumulating expert weights per layer
# (layer index -> {source tensor name -> tensor}); drained in modify_tensors.
self._experts: dict[int, dict[str, Tensor]] | None = {}
# Pre-load tokenizer to know the vocab count for embedding trimming
self._tokenizer_vocab_size: int | None = None
# NOTE(review): every exception raised while loading the tokenizer is swallowed
# here. In that case _tokenizer_vocab_size stays None, set_gguf_parameters falls
# back to hparams["vocab_size"], and modify_tensors does not trim the embedding.
try:
from gguf.vocab import LlamaHfVocab
self._tokenizer_vocab_size = LlamaHfVocab(self.dir_model).vocab_size
except Exception:
pass

def set_gguf_parameters(self):
# Writes the base-class parameters first, then the Zaya-specific keys below.
# The literal defaults passed to hparams.get() (4096, 2, 128, 0.5, 256) are used
# only when the corresponding key is absent from the model config.
super().set_gguf_parameters()
# Use actual tokenizer vocab size if available, fallback to config vocab_size
vocab_size = self._tokenizer_vocab_size if self._tokenizer_vocab_size is not None else self.hparams["vocab_size"]
self.gguf_writer.add_vocab_size(vocab_size)

# n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate)
n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2
self.gguf_writer.add_feed_forward_length(n_ff)

# ssm_d_conv = conv_qk kernel size (cca_time0 = first depthwise conv kernel)
cca_time0 = self.hparams.get("cca_time0", 2)
self.gguf_writer.add_ssm_conv_kernel(cca_time0)

# partial_rotary_factor -> n_rot
# The product is truncated toward zero by int().
head_dim = self.hparams.get("head_dim", 128)
partial_rotary = self.hparams.get("partial_rotary_factor", 0.5)
self.gguf_writer.add_rope_dimension_count(int(partial_rotary * head_dim))

# MoE params
# "num_experts" is looked up without optional=True; the used-expert count
# falls back to 1 when neither key is found (or the value found is falsy).
n_expert = self.find_hparam(["num_experts"])
self.gguf_writer.add_expert_count(n_expert)
n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1
self.gguf_writer.add_expert_used_count(n_expert_used)

# Router MLP hidden size (zaya_mlp_expansion)
# Stored under the expert feed-forward-length key.
n_ff_exp = self.hparams.get("zaya_mlp_expansion", 256)
self.gguf_writer.add_expert_feed_forward_length(n_ff_exp)

def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
# Maps one attention (CCA) tensor of block `bid` to its GGUF tensor name.
# The branch is chosen by substring match on `name`, first match wins.
# NOTE(review): when no branch matches, nothing is yielded and the caller
# returns, so such a tensor is left out of the output without any message.
if "linear_q" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch
elif "linear_k" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch
elif "val_proj1" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ1, bid), data_torch
elif "val_proj2" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ2, bid), data_torch
elif "o_proj" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch
elif "conv_qk.0" in name and name.endswith(".weight"):
# PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk}
# Only the singleton middle axis is removed; no transpose is applied here.
data_torch = data_torch.squeeze(1).contiguous()
yield self.format_tensor_name(gguf.MODEL_TENSOR.SSM_CONV1D, bid), data_torch
elif "conv_qk.0" in name and name.endswith(".bias"):
yield self.format_tensor_name(gguf.MODEL_TENSOR.SSM_CONV1D, bid, suffix=".bias"), data_torch
elif "conv_qk.1" in name and name.endswith(".weight"):
# PyTorch: [n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk}
# The tensor is passed through unchanged.
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch
elif "conv_qk.1" in name and name.endswith(".bias"):
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch
# NOTE(review): "temp" is a short substring; confirm no other attention tensor
# name that reaches this branch contains it.
elif "temp" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_K_SCALE, bid), data_torch

def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
# Maps one router tensor of block `bid` to its GGUF tensor name by substring
# match on `name`. Bias tensors reuse the weight's tensor constant with a
# ".bias" suffix. Yields nothing when no branch matches.
if "down_proj.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid), data_torch
elif "down_proj.bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE_INP, bid, suffix=".bias"), data_torch
elif "rmsnorm_eda" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_NORM, bid), data_torch
elif "router_mlp.0.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid), data_torch
elif "router_mlp.0.bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.FFN_GATE, bid, suffix=".bias"), data_torch
elif "router_mlp.2.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch
elif "router_mlp.2.bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid, suffix=".bias"), data_torch
# Only a ".weight" branch exists for router_mlp.4; a "router_mlp.4.bias"
# tensor, if present, would match no branch.
elif "router_mlp.4.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch
elif "balancing_biases" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_BIASES, bid), data_torch
elif "router_states_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, bid), data_torch

def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
# Maps a per-block residual-scaling tensor: the scale goes to the base tensor
# name and the matching bias to the same name with a ".bias" suffix.
# Yields nothing when no branch matches.
if "hidden_states_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch
elif "hidden_states_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid, suffix=".bias"), data_torch
elif "residual_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch
elif "residual_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid, suffix=".bias"), data_torch

def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]:
# Same mapping as _map_res_scale, but for the model-level "model.res_scale.*"
# tensors: it targets the *_FINAL tensor constants and takes no block index.
if "hidden_states_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch
elif "hidden_states_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL, suffix=".bias"), data_torch
elif "residual_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch
elif "residual_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL, suffix=".bias"), data_torch

def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Routes one source tensor to zero or more (GGUF name, tensor) pairs.
# Exact-name model-level tensors are handled first, then per-block tensors by
# substring, then expert weights (buffered and stacked), then the base-class
# mapping as a fallback.
# Common tensors
if name == "model.embed_tokens.weight":
# Trim embedding to match tokenizer vocab size if needed
# Only rows beyond the tokenizer vocab size are dropped; nothing is padded.
if self._tokenizer_vocab_size is not None and data_torch.shape[0] > self._tokenizer_vocab_size:
data_torch = data_torch[:self._tokenizer_vocab_size]
yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch
return
if name == "model.final_norm.weight":
yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch
return
if name.startswith("model.res_scale."):
yield from self._map_final_res_scale(name, data_torch)
return

# Block-level tensors
# The substring checks below run in this order, so a name containing
# "self_attn" is never tested against "router", and so on.
if bid is not None:
# CCA attention tensors
if "self_attn" in name:
yield from self._map_cca(name, data_torch, bid)
return

# Router tensors
if "router" in name:
yield from self._map_router(name, data_torch, bid)
return

# Input norm
if "input_norm" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, bid), data_torch
return

# Residual scaling
if "res_scale" in name:
yield from self._map_res_scale(name, data_torch, bid)
return

# Expert stacking
# Each expert tensor is buffered under its layer index. Once the layer holds
# n_expert * 2 tensors, linear_fc1 and linear_fc2 are each stacked across
# experts along a new leading dimension and emitted as one tensor apiece.
if "zaya_block.experts" in name:
assert bid is not None
if self._experts is None:
self._experts = {}
if bid not in self._experts:
self._experts[bid] = {}
self._experts[bid][name] = data_torch

n_expert = self.find_hparam(["num_experts"])
# Each layer has 2 expert weights per expert (fc1, fc2) = 2 * n_expert tensors
if len(self._experts[bid]) >= n_expert * 2:
# permute_dims is None for both entries, so the permute branch below
# is never taken with the current table.
for w_name, gguf_tensor, permute_dims in [
("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None),
("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, None),
]:
datas: list[Tensor] = []
for xid in range(n_expert):
# The expected source name is rebuilt from the layer and expert index;
# a missing key raises KeyError here.
ename = f"model.layers.{bid}.zaya_block.experts.local_experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
data_torch_stacked = torch.stack(datas, dim=0)
if permute_dims is not None:
data_torch_stacked = data_torch_stacked.permute(*permute_dims)
yield self.format_tensor_name(gguf_tensor, bid), data_torch_stacked
del self._experts[bid]
return

# Fallback for any remaining tensors: use tensor_mapping
# A ValueError whose message contains "Can not map tensor" is downgraded to a
# warning and the tensor is skipped; any other ValueError is re-raised.
try:
yield from super().modify_tensors(data_torch, name, bid)
except ValueError as e:
if "Can not map tensor" in str(e):
logger.warning(f"Skipping unmapped tensor: {name}")
else:
raise

def set_vocab(self):
# Writes the tokenizer section from LlamaHfVocab: token texts, scores and
# types, followed by the special-token data collected by gguf.SpecialVocab.
from gguf.vocab import LlamaHfVocab

vocab = LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)

assert len(tokens) == vocab.vocab_size

# NOTE(review): the tokenizer model id is hard-coded to "gemma4" - confirm this
# is the intended tokenizer implementation for Zaya checkpoints.
self.gguf_writer.add_tokenizer_model("gemma4")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
self.gguf_writer.add_add_bos_token(True)

def prepare_tensors(self):
# Runs the base-class tensor preparation, then fails if any expert tensor is
# still sitting in the stacking buffer (i.e. a layer never reached the
# n_expert * 2 tensors needed to be emitted).
super().prepare_tensors()
if self._experts:
unprocessed = [k for d in self._experts.values() for k in d.keys()]
if unprocessed:
raise ValueError(f"Unprocessed expert tensors: {unprocessed}")


@ModelBase.register("InternLM2ForCausalLM")
class InternLM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.INTERNLM2
Expand Down
15 changes: 15 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -2041,6 +2041,21 @@ extern "C" {
int s0, // stride
int d0); // dilation

// grouped 1D convolution
// a: [K, IC/G, OC] convolution kernel
// b: [L, IC, N] data
// groups must be > 0 and must divide both IC and OC evenly;
// a->ne[1] must equal IC / groups (all three are asserted)
// when groups == 1, equivalent to ggml_conv_1d
// when groups == IC and groups == OC, equivalent to ggml_conv_1d_dw
// otherwise the kernel and the data are sliced per group, each pair is passed
// to ggml_conv_1d, and the per-group results are concatenated along dim 1
GGML_API struct ggml_tensor * ggml_conv_1d_grouped(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0, // dilation
int groups); // number of groups

GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
Expand Down
3 changes: 2 additions & 1 deletion ggml/src/ggml-cuda/ssm-conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,12 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const floa
};

switch (nc) {
case 2: launch_kernel(std::integral_constant<int, 2>{}); break;
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
default: GGML_ABORT("Only support kernel sizes 2, 3, 4, 5, 9 right now.");
}
}

Expand Down
69 changes: 63 additions & 6 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -2018,9 +2018,9 @@ struct ggml_tensor * ggml_dup_inplace(

static struct ggml_tensor * ggml_add_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_can_repeat(b, a));

struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
Expand Down Expand Up @@ -4487,7 +4487,7 @@ struct ggml_tensor * ggml_conv_1d(
int s0,
int p0,
int d0) {
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16); // [N, OL, IC * K]
struct ggml_tensor * im2col = ggml_im2col(ctx, a, b, s0, 0, p0, 0, d0, 0, false, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32); // [N, OL, IC * K]

struct ggml_tensor * result =
ggml_mul_mat(ctx,
Expand Down Expand Up @@ -4521,7 +4521,7 @@ struct ggml_tensor * ggml_conv_1d_dw(
int d0) {
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, b, b->ne[0], 1, b->ne[1], b->ne[2]);

struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, GGML_TYPE_F16);
struct ggml_tensor * im2col = ggml_im2col(ctx, a, new_b, s0, 0, p0, 0, d0, 0, false, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32);

struct ggml_tensor * result = ggml_mul_mat(ctx, im2col, a);

Expand All @@ -4541,6 +4541,63 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
}

// ggml_conv_1d_grouped

// Grouped 1D convolution composed from existing ggml operations.
//
//   a      : convolution kernel, read as [K, IC/groups, OC]
//   b      : input data, read as [L, IC, N]
//   s0     : stride   (forwarded unchanged to the underlying convolution)
//   p0     : padding  (forwarded unchanged)
//   d0     : dilation (forwarded unchanged)
//   groups : number of channel groups; must be > 0 and divide both IC and OC,
//            and a->ne[1] must equal IC / groups
//
// groups == 1 delegates to ggml_conv_1d; groups == IC == OC delegates to
// ggml_conv_1d_dw. In every other case the kernel is sliced along its output
// channel dim and the data along its channel dim, each slice pair goes through
// ggml_conv_1d, and the per-group outputs are concatenated along dim 1.
//
// Returns the tensor produced by the delegated call or by the concatenation.
struct ggml_tensor * ggml_conv_1d_grouped(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   s0,
        int                   p0,
        int                   d0,
        int                   groups) {
    GGML_ASSERT(groups > 0);

    const int64_t OC   = a->ne[2]; // total output channels
    const int64_t IC_G = a->ne[1]; // input channels per group (kernel dim)
    const int64_t IC   = b->ne[1]; // total input channels

    GGML_ASSERT(IC % groups == 0);
    GGML_ASSERT(OC % groups == 0);
    GGML_ASSERT(IC_G == IC / groups);

    // degenerate cases: fall back to existing implementations
    if (groups == 1) {
        return ggml_conv_1d(ctx, a, b, s0, p0, d0);
    }
    if (groups == IC && groups == OC) {
        return ggml_conv_1d_dw(ctx, a, b, s0, p0, d0);
    }

    // The per-group kernel views below are 3D and only step along a->nb[2],
    // so anything stored in a 4th kernel dimension would be silently left out
    // of the result. Reject that explicitly instead.
    GGML_ASSERT(a->ne[3] == 1);

    const int64_t OC_G = OC / groups; // output channels per group

    struct ggml_tensor * result = NULL;

    for (int g = 0; g < groups; g++) {
        // kernel slice for group g: [K, IC_G, OC_G], starting OC_G * g output channels in
        struct ggml_tensor * a_g = ggml_view_3d(ctx, a,
                a->ne[0], IC_G, OC_G,
                a->nb[1], a->nb[2],
                g * OC_G * a->nb[2]);

        // data slice for group g: [L, IC_G, N], starting IC_G * g input channels in;
        // the batch stride stays b->nb[2], so the view keeps the parent's layout
        struct ggml_tensor * b_g = ggml_view_3d(ctx, b,
                b->ne[0], IC_G, b->ne[2],
                b->nb[1], b->nb[2],
                g * IC_G * b->nb[1]);

        struct ggml_tensor * out_g = ggml_conv_1d(ctx, a_g, b_g, s0, p0, d0);

        // append this group's output channels after those of the previous groups
        result = (result == NULL) ? out_g : ggml_concat(ctx, result, out_g, 1);
    }

    return result;
}

// ggml_conv_transpose_1d

static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
Expand Down Expand Up @@ -4724,7 +4781,7 @@ struct ggml_tensor * ggml_conv_2d_dw(
struct ggml_tensor * new_a = ggml_reshape_4d(ctx, a, a->ne[0], a->ne[1], 1, a->ne[2] * a->ne[3]);
struct ggml_tensor * im2col = ggml_im2col(ctx, new_a,
ggml_reshape_4d(ctx, b, b->ne[0], b->ne[1], 1, b->ne[2] * b->ne[3]),
s0, s1, p0, p1, d0, d1, true, GGML_TYPE_F16); // [N * IC, OH, OW, KH * KW]
s0, s1, p0, p1, d0, d1, true, a->type == GGML_TYPE_F16 ? GGML_TYPE_F16 : GGML_TYPE_F32); // [N * IC, OH, OW, KH * KW]
struct ggml_tensor * new_b = ggml_reshape_4d(ctx, im2col, im2col->ne[0], im2col->ne[2] * im2col->ne[1], b->ne[2], b->ne[3]); // [N * IC, OH, OW, KH * KW] => [N, IC, OH * OW, KH * KW]

new_a = ggml_reshape_4d(ctx, new_a, (new_a->ne[0] * new_a->ne[1]), new_a->ne[2], new_a->ne[3], 1); // [OC,1, KH, KW] => [1, OC, 1, KH * KW]
Expand Down
Loading