Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions common/debug.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -144,13 +144,6 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
auto * cb_data = (common_debug_cb_user_data *) user_data;
auto * pimpl = cb_data->pimpl.get();

const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];

if (ask) {
return true; // Always retrieve data
}

bool matches_filter = pimpl->tensor_filters.empty();

if (!matches_filter) {
Expand All @@ -162,6 +155,13 @@ bool common_debug_cb_eval(struct ggml_tensor * t, bool ask, void * user_data) {
}
}

if (ask) {
return matches_filter;
}

const struct ggml_tensor * src0 = t->src[0];
const struct ggml_tensor * src1 = t->src[1];

Comment on lines 146 to +164
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These changes to `common_debug_cb_eval` will need to be removed from this PR.

char src1_str[128] = { 0 };
if (src1) {
snprintf(src1_str, sizeof(src1_str), "%s{%s}", src1->name, common_ggml_ne_string(src1).c_str());
Expand Down
215 changes: 214 additions & 1 deletion convert_hf_to_gguf.py
Original file line number Diff line number Diff line change
Expand Up @@ -1183,7 +1183,7 @@ def set_gguf_parameters(self):
if (local_rope_theta := self.rope_parameters.get("sliding_attention", {}).get("rope_theta")) is not None:
self.gguf_writer.add_rope_freq_base_swa(local_rope_theta)
logger.info(f"gguf: rope theta swa = {local_rope_theta}")
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps"], optional=True)) is not None:
if (f_rms_eps := self.find_hparam(["rms_norm_eps", "norm_eps", "norm_epsilon"], optional=True)) is not None:
self.gguf_writer.add_layer_norm_rms_eps(f_rms_eps)
logger.info(f"gguf: rms norm epsilon = {f_rms_eps}")
if (f_norm_eps := self.find_hparam(["layer_norm_eps", "layer_norm_epsilon", "norm_epsilon"], optional=True)) is not None:
Expand Down Expand Up @@ -6454,6 +6454,219 @@ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iter
yield from super().modify_tensors(data_torch, name, bid)


@ModelBase.register("ZayaModel", "ZayaForCausalLM")
class ZayaModel(TextModel):
"""Zaya-1 model with Compressed Convolutional Attention and MoE"""
# Converter registered for the "ZayaModel" / "ZayaForCausalLM" architectures.
# Checkpoint tensor names are mapped by hand in modify_tensors() and its
# _map_* helpers; per-expert FFN weights are buffered per block and written
# out as stacked tensors once a block is complete.
model_arch = gguf.MODEL_ARCH.ZAYA

# Sets up the per-block expert buffer and probes the tokenizer vocab size.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
# Buffer for accumulating expert weights per layer
# (block id -> {checkpoint tensor name -> tensor}, filled in modify_tensors)
self._experts: dict[int, dict[str, Tensor]] | None = {}
# Pre-load tokenizer to know the vocab count for embedding trimming
self._tokenizer_vocab_size: int | None = None
try:
from gguf.vocab import LlamaHfVocab
self._tokenizer_vocab_size = LlamaHfVocab(self.dir_model).vocab_size
except Exception:
# NOTE(review): every failure is swallowed without a log message; the
# size then stays None and modify_tensors() leaves the embedding untrimmed.
pass

# Writes the Zaya-specific metadata after the parent class has written its own.
def set_gguf_parameters(self):
super().set_gguf_parameters()
# NOTE(review): this is the hparams value, while modify_tensors() may trim the
# embedding to the tokenizer vocab size - confirm the two are meant to differ.
self.gguf_writer.add_vocab_size(self.hparams["vocab_size"])

# n_ff = ffn_hidden_size / 2 (SwiGLU halves the intermediate)
# (ffn_hidden_size defaults to 4096 when the key is absent)
n_ff = self.hparams.get("ffn_hidden_size", 4096) // 2
self.gguf_writer.add_feed_forward_length(n_ff)

# ssm_d_conv = conv_qk kernel size (cca_time0 = first depthwise conv kernel)
# (defaults to 2 when the key is absent; stored via the writer's SSM conv-kernel setter)
cca_time0 = self.hparams.get("cca_time0", 2)
self.gguf_writer.add_ssm_conv_kernel(cca_time0)

# partial_rotary_factor -> n_rot
# (defaults: head_dim = 128, partial_rotary_factor = 0.5; the product is truncated to int)
head_dim = self.hparams.get("head_dim", 128)
partial_rotary = self.hparams.get("partial_rotary_factor", 0.5)
self.gguf_writer.add_rope_dimension_count(int(partial_rotary * head_dim))

# MoE params
n_expert = self.find_hparam(["num_experts"])
self.gguf_writer.add_expert_count(n_expert)
# `or 1` also replaces a falsy value (None when neither key exists, or 0) with 1
n_expert_used = self.find_hparam(["moe_router_topk", "num_experts_per_tok"], optional=True) or 1
self.gguf_writer.add_expert_used_count(n_expert_used)

# Maps one attention ("self_attn") tensor of block `bid` to its GGUF name by
# substring match on the checkpoint name. Branches are tested in order and at
# most one (name, tensor) pair is yielded.
# NOTE(review): a name that matches no branch yields nothing, so that tensor is
# left out of the output without any warning.
def _map_cca(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
if "linear_q" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_Q, bid), data_torch
elif "linear_k" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_K, bid), data_torch
elif "val_proj1" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ1, bid), data_torch
elif "val_proj2" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_VAL_PROJ2, bid), data_torch
elif "o_proj" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_OUT, bid), data_torch
elif "conv_qk.0" in name and name.endswith(".weight"):
# PyTorch: [n_qk, 1, kernel] (depthwise) -> ggml: {kernel, n_qk}
# (only the singleton axis 1 is dropped here)
data_torch = data_torch.squeeze(1).contiguous()
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW, bid), data_torch
elif "conv_qk.0" in name and name.endswith(".bias"):
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_DW_B, bid, suffix=".bias"), data_torch
elif "conv_qk.1" in name and name.endswith(".weight"):
# PyTorch: [n_qk, in_ch_per_group, kernel] -> ggml: {kernel, in_ch_per_group, n_qk}
# (the tensor itself is passed through unchanged)
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid), data_torch
elif "conv_qk.1" in name and name.endswith(".bias"):
# NOTE(review): reuses CCA_CONV_GRP with a ".bias" suffix, whereas the depthwise
# bias above has its own CCA_CONV_DW_B entry - confirm this is intentional.
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_CONV_GRP, bid, suffix=".bias"), data_torch
elif "temp" in name:
# broad substring, deliberately tested last
yield self.format_tensor_name(gguf.MODEL_TENSOR.CCA_K_SCALE, bid), data_torch

# Maps one router tensor of block `bid` to its GGUF name by substring match.
# Yields at most one pair; a name matching no branch yields nothing.
# NOTE(review): router_mlp.4 has a weight branch only - a "router_mlp.4.bias"
# tensor, if the checkpoint has one, would be dropped. Confirm.
def _map_router(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
if "down_proj.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN, bid), data_torch
elif "down_proj.bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_DOWN_B, bid, suffix=".bias"), data_torch
elif "rmsnorm_eda" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_NORM, bid), data_torch
elif "router_mlp.0.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0, bid), data_torch
elif "router_mlp.0.bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP0_B, bid, suffix=".bias"), data_torch
elif "router_mlp.2.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2, bid), data_torch
elif "router_mlp.2.bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP2_B, bid, suffix=".bias"), data_torch
elif "router_mlp.4.weight" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_MLP4, bid), data_torch
elif "balancing_biases" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_BIASES, bid), data_torch
elif "router_states_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ZAYA_ROUTER_EDA_SCALE, bid), data_torch

# Maps one per-block residual-scaling tensor to its GGUF name by substring match.
# Yields at most one pair; a name matching no branch yields nothing.
def _map_res_scale(self, name: str, data_torch: Tensor, bid: int) -> Iterable[tuple[str, Tensor]]:
if "hidden_states_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS, bid), data_torch
elif "hidden_states_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B, bid, suffix=".bias"), data_torch
elif "residual_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES, bid), data_torch
elif "residual_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B, bid, suffix=".bias"), data_torch

# Same substring mapping as _map_res_scale, but for the model-level
# "model.res_scale.*" tensors: no block id, and the *_FINAL tensor entries.
def _map_final_res_scale(self, name: str, data_torch: Tensor) -> Iterable[tuple[str, Tensor]]:
if "hidden_states_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_FINAL), data_torch
elif "hidden_states_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_HS_B_FINAL, suffix=".bias"), data_torch
elif "residual_scale" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_FINAL), data_torch
elif "residual_bias" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.RES_SCALE_RES_B_FINAL, suffix=".bias"), data_torch

# Routes one checkpoint tensor to its GGUF name(s). Checks run in the order
# written: exact model-level names first, then per-block substring matches,
# then expert buffering, and finally the parent class' generic mapping.
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
# Common tensors
if name == "model.embed_tokens.weight":
# Trim embedding to match tokenizer vocab size if needed
# (only when the size was obtained in __init__ and the embedding has more rows)
if self._tokenizer_vocab_size is not None and data_torch.shape[0] > self._tokenizer_vocab_size:
data_torch = data_torch[:self._tokenizer_vocab_size]
yield self.format_tensor_name(gguf.MODEL_TENSOR.TOKEN_EMBD), data_torch
return
if name == "model.final_norm.weight":
yield self.format_tensor_name(gguf.MODEL_TENSOR.OUTPUT_NORM), data_torch
return
if name.startswith("model.res_scale."):
yield from self._map_final_res_scale(name, data_torch)
return

# Block-level tensors
if bid is not None:
# CCA attention tensors
if "self_attn" in name:
yield from self._map_cca(name, data_torch, bid)
return

# Router tensors
if "router" in name:
yield from self._map_router(name, data_torch, bid)
return

# Input norm
if "input_norm" in name:
yield self.format_tensor_name(gguf.MODEL_TENSOR.ATTN_NORM, bid), data_torch
return

# Residual scaling
if "res_scale" in name:
yield from self._map_res_scale(name, data_torch, bid)
return

# Expert stacking
# Each expert tensor is parked in self._experts[bid] under its checkpoint name
# until the block has collected 2 * n_expert of them.
if "zaya_block.experts" in name:
assert bid is not None
if self._experts is None:
self._experts = {}
if bid not in self._experts:
self._experts[bid] = {}
self._experts[bid][name] = data_torch

n_expert = self.find_hparam(["num_experts"])
# Each layer has 2 expert weights per expert (fc1, fc2) = 2 * n_expert tensors
if len(self._experts[bid]) >= n_expert * 2:
# permute_dims is None in both entries, so the permute step below never runs
for w_name, gguf_tensor, permute_dims in [
("linear_fc1", gguf.MODEL_TENSOR.FFN_GATE_UP_EXP, None),
("linear_fc2", gguf.MODEL_TENSOR.FFN_DOWN_EXP, None),
]:
datas: list[Tensor] = []
for xid in range(n_expert):
# The check above only counts entries; this exact-name lookup raises
# KeyError if the buffered tensors are named differently.
ename = f"model.layers.{bid}.zaya_block.experts.local_experts.{xid}.{w_name}.weight"
datas.append(self._experts[bid][ename])
del self._experts[bid][ename]
# experts are stacked in index order along a new leading axis
data_torch_stacked = torch.stack(datas, dim=0)
if permute_dims is not None:
data_torch_stacked = data_torch_stacked.permute(*permute_dims)
yield self.format_tensor_name(gguf_tensor, bid), data_torch_stacked
del self._experts[bid]
return

# Fallback for any remaining tensors: use tensor_mapping
# A ValueError whose message contains "Can not map tensor" is downgraded to a
# warning, so that tensor is omitted from the output; any other ValueError propagates.
try:
yield from super().modify_tensors(data_torch, name, bid)
except ValueError as e:
if "Can not map tensor" in str(e):
logger.warning(f"Skipping unmapped tensor: {name}")
else:
raise

# Writes the tokenizer: tokens, scores and token types read from LlamaHfVocab,
# the special-token data (with merges), and the prefix/BOS flags.
def set_vocab(self):
from gguf.vocab import LlamaHfVocab

vocab = LlamaHfVocab(self.dir_model)
tokens = []
scores = []
toktypes = []
for text, score, toktype in vocab.all_tokens():
tokens.append(text)
scores.append(score)
toktypes.append(toktype)

assert len(tokens) == vocab.vocab_size

# NOTE(review): the tokenizer model is recorded as the literal "gemma4" -
# confirm this is the intended tokenizer for Zaya.
self.gguf_writer.add_tokenizer_model("gemma4")
self.gguf_writer.add_token_list(tokens)
self.gguf_writer.add_token_scores(scores)
self.gguf_writer.add_token_types(toktypes)

special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True)
special_vocab.add_to_gguf(self.gguf_writer)
self.gguf_writer.add_add_space_prefix(False)
self.gguf_writer.add_add_bos_token(True)

# Runs the parent's tensor preparation, then raises ValueError if any expert
# tensors are still sitting in the buffer (i.e. some block never filled up).
def prepare_tensors(self):
super().prepare_tensors()
if self._experts:
unprocessed = [k for d in self._experts.values() for k in d.keys()]
if unprocessed:
raise ValueError(f"Unprocessed expert tensors: {unprocessed}")


@ModelBase.register("InternLM2ForCausalLM")
class InternLM2Model(TextModel):
model_arch = gguf.MODEL_ARCH.INTERNLM2
Expand Down
15 changes: 15 additions & 0 deletions ggml/include/ggml.h
Original file line number Diff line number Diff line change
Expand Up @@ -2041,6 +2041,21 @@ extern "C" {
int s0, // stride
int d0); // dilation

// grouped 1D convolution
// a: [K, IC/G, OC] convolution kernel (a->ne[1] must equal IC / groups)
// b: [L, IC, N] data
// groups must be > 0 and must divide both IC and OC evenly (asserted)
// when groups == 1, dispatches to ggml_conv_1d
// when groups == IC and groups == OC, dispatches to ggml_conv_1d_dw
// otherwise the graph is built from existing ops: for each group, a view of
// the kernel (sliced along dim 2) and a view of the data (sliced along dim 1)
// are passed to ggml_conv_1d, and the per-group results are joined with
// ggml_concat along dim 1 - the number of graph nodes grows with groups
GGML_API struct ggml_tensor * ggml_conv_1d_grouped(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
struct ggml_tensor * b, // data
int s0, // stride
int p0, // padding
int d0, // dilation
int groups); // number of groups
Comment on lines +2050 to +2057
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So, this should be split out into a separate PR.


GGML_API struct ggml_tensor * ggml_conv_transpose_1d(
struct ggml_context * ctx,
struct ggml_tensor * a, // convolution kernel
Expand Down
63 changes: 60 additions & 3 deletions ggml/src/ggml.c
Original file line number Diff line number Diff line change
Expand Up @@ -2018,9 +2018,9 @@ struct ggml_tensor * ggml_dup_inplace(

static struct ggml_tensor * ggml_add_impl(
struct ggml_context * ctx,
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
struct ggml_tensor * a,
struct ggml_tensor * b,
bool inplace) {
GGML_ASSERT(ggml_can_repeat(b, a));

struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
Expand Down Expand Up @@ -4541,6 +4541,63 @@ struct ggml_tensor * ggml_conv_1d_dw_ph(
return ggml_conv_1d_dw(ctx, a, b, s0, a->ne[0] / 2, d0);
}

// ggml_conv_1d_grouped

struct ggml_tensor * ggml_conv_1d_grouped(
        struct ggml_context * ctx,
        struct ggml_tensor  * a,
        struct ggml_tensor  * b,
        int                   s0,
        int                   p0,
        int                   d0,
        int                   groups) {
    // Grouped 1D convolution assembled from existing graph ops.
    //   a: kernel, a->ne[1] = input channels per group, a->ne[2] = output channels
    //   b: data,   b->ne[1] = input channels
    // The three channel names below are kept as-is because they appear inside
    // the GGML_ASSERT expressions.
    GGML_ASSERT(groups > 0);

    const int64_t IC   = b->ne[1]; // input channels of the data tensor
    const int64_t IC_G = a->ne[1]; // input channels seen by each kernel
    const int64_t OC   = a->ne[2]; // output channels (number of kernels)

    GGML_ASSERT(IC % groups == 0);
    GGML_ASSERT(OC % groups == 0);
    GGML_ASSERT(IC_G == IC / groups);

    // trivial groupings are handed to the existing builders
    if (groups == 1) {
        // a single group is an ordinary convolution
        return ggml_conv_1d(ctx, a, b, s0, p0, d0);
    } else if (groups == IC && groups == OC) {
        // one input and one output channel per group: depthwise
        return ggml_conv_1d_dw(ctx, a, b, s0, p0, d0);
    }

    const int64_t n_out_per_group = OC / groups;

    // running concatenation of the per-group outputs; NULL until group 0 is done
    struct ggml_tensor * joined = NULL;

    for (int ig = 0; ig < groups; ++ig) {
        // kernels belonging to this group: a contiguous range along dim 2 of a
        struct ggml_tensor * kernel_view = ggml_view_3d(ctx, a,
                a->ne[0], IC_G, n_out_per_group,
                a->nb[1], a->nb[2],
                ig * n_out_per_group * a->nb[2]);

        // input channels belonging to this group: a range along dim 1 of b
        struct ggml_tensor * data_view = ggml_view_3d(ctx, b,
                b->ne[0], IC_G, b->ne[2],
                b->nb[1], b->nb[2],
                ig * IC_G * b->nb[1]);

        struct ggml_tensor * group_out = ggml_conv_1d(ctx, kernel_view, data_view, s0, p0, d0);

        // the first group seeds the result, later groups are appended along dim 1
        joined = (joined == NULL) ? group_out : ggml_concat(ctx, joined, group_out, 1);
    }

    return joined;
}
Comment on lines +4578 to +4599
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We're reusing the ggml operations; we'll need to see later whether it's necessary to create a specific operation for each backend.


// ggml_conv_transpose_1d

static int64_t ggml_calc_conv_transpose_1d_output_size(int64_t ins, int64_t ks, int s, int p, int d) {
Expand Down
Loading