Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion ggml/src/ggml-cuda/ssm-conv.cu
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int
switch (nc) {
case 3: launch_kernel(std::integral_constant<int, 3>{}); break;
case 4: launch_kernel(std::integral_constant<int, 4>{}); break;
case 5: launch_kernel(std::integral_constant<int, 5>{}); break;
case 9: launch_kernel(std::integral_constant<int, 9>{}); break;
default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now.");
default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now.");
}
}

Expand Down
6 changes: 4 additions & 2 deletions src/models/gemma4-iswa.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
inpL = build_inp_embd(model.tok_embd);

// important: do not normalize weights for raw embeddings input (i.e. encoded image embeddings)
inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f);
// use BF16-rounded scale to match PyTorch's native BF16 training precision
inpL = ggml_scale(ctx0, inpL, ubatch.token ? ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf(n_embd))) : 1.0f);
cb(inpL, "inp_scaled", -1);

// inp_pos - contains the positions
Expand Down Expand Up @@ -149,8 +150,9 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll
cb(cur_moe, "ffn_norm_2", il);

// custom MoE logits calculation (router operates on attn_out, not cur)
// use BF16-rounded scale to match PyTorch's native BF16 training precision
ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps);
tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd));
tmp = ggml_scale(ctx0, tmp, 1.0f / ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd))));
tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s);
ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens]
cb(logits, "ffn_moe_logits", il);
Expand Down
19 changes: 16 additions & 3 deletions tests/test-llama-archs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,11 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
uint32_t n_layer = 2;
if (arch == LLM_ARCH_LLAMA4) {
n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4
} else if (arch == LLM_ARCH_GEMMA4) {
n_embd = 128;
n_head = 2;
n_ff = 192;
n_layer = 5; // need at least 5 for swa_pattern (every 5th is full_attention)
} else if (arch == LLM_ARCH_GEMMA3N) {
n_embd = 64;
n_head = 1;
Expand Down Expand Up @@ -167,7 +172,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) {
ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8));
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx/8);

if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
if (arch == LLM_ARCH_GEMMA4) {
ms.add_kv(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, n_embd/2);
ms.add_kv(LLM_KV_ATTENTION_SHARED_KV_LAYERS, uint32_t(0));
ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, n_embd_head);
ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, n_embd_head);
ms.add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, 10000.0f);
// SWA pattern: every 5th layer is full attention (matches E2B layer_types)
ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(5));
} else if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) {
std::vector<uint32_t> pattern;
pattern.reserve(n_layer);
for (uint32_t il = 0; il < n_layer; il++) {
Expand Down Expand Up @@ -386,7 +399,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml
continue; // Only half-implemented and to be removed in the future.
}
if (arch == LLM_ARCH_GEMMA4) {
continue; // FIXME @ngxson
continue; // FIXME: ISWA KV cache initialization needs more fixture params
}
if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) {
continue; // FIXME
Expand Down Expand Up @@ -455,7 +468,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg
continue; // Only half-implemented and to be removed in the future.
}
if (arch == LLM_ARCH_GEMMA4) {
continue; // FIXME @ngxson
continue; // FIXME: ISWA KV cache initialization needs more fixture params
}
if (arch == LLM_ARCH_WAVTOKENIZER_DEC) {
continue; // FIXME CUDA backend crashes.
Expand Down
1 change: 1 addition & 0 deletions tools/mtmd/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ add_library(mtmd
models/models.h
models/cogvlm.cpp
models/conformer.cpp
models/gemma4a.cpp
models/gemma4v.cpp
models/glm4v.cpp
models/hunyuanocr.cpp
Expand Down
15 changes: 15 additions & 0 deletions tools/mtmd/clip-impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,21 @@
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"

// gemma4 audio conformer
// printf-style templates for the tensor names used by the gemma4 audio encoder.
// NOTE(review): the trailing %s presumably selects the tensor suffix (e.g. "weight" / "bias") — confirm against the loader.
// Names starting with "mm.a." or "a." are not per-block; TN_A_CONV1D* additionally take an integer index (%d).
#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s"
#define TN_A_MM_SOFT_EMB_N "mm.a.soft_emb_norm.%s"
#define TN_A_INP_PROJ "a.input_projection.%s"
#define TN_A_CONV1D "a.conv1d.%d.%s"
#define TN_A_CONV1D_NORM "a.conv1d.%d.norm.%s"
#define TN_A_OUT_PROJ "a.pre_encode.out.%s"
// Per-block templates: three arguments in order (%s, %d, %s).
// NOTE(review): presumably (prefix, block index, suffix), matching the other "%s.blk.%d.*.%s" names in this header — confirm.
#define TN_A_ATTN_PRE_NORM "%s.blk.%d.attn_pre_norm.%s"
#define TN_A_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s"
#define TN_A_ATTN_K_REL "%s.blk.%d.attn_k_rel.%s"
#define TN_A_PER_DIM_SCALE "%s.blk.%d.per_dim_scale.%s"
#define TN_A_PER_DIM_K_SCALE "%s.blk.%d.per_dim_k_scale.%s"
#define TN_A_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s"
#define TN_A_FFN_POST_NORM_1 "%s.blk.%d.ffn_post_norm_1.%s"

// mobilenetv5 (gemma3n) definitions
#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
Expand Down
16 changes: 16 additions & 0 deletions tools/mtmd/clip-model.h
Original file line number Diff line number Diff line change
Expand Up @@ -217,6 +217,13 @@ struct clip_layer {
ggml_tensor * conv_pw2_w = nullptr;
ggml_tensor * conv_pw2_b = nullptr;

// gemma4 audio conformer per-layer
ggml_tensor * attn_pre_norm_w = nullptr;
ggml_tensor * attn_k_rel_w = nullptr;
ggml_tensor * per_dim_scale_w = nullptr;
ggml_tensor * per_dim_k_scale_w = nullptr;
ggml_tensor * ff_post_norm_1_w = nullptr;

bool has_deepstack() const {
return deepstack_fc1_w != nullptr;
}
Expand Down Expand Up @@ -459,6 +466,15 @@ struct clip_model {
};
std::map<std::string, clamp_info> clamp_info_map;

// gemma4 audio conformer
std::array<ggml_tensor *, 2> sscp_conv_w = {nullptr};
std::array<ggml_tensor *, 2> sscp_conv_b = {nullptr};
std::array<ggml_tensor *, 2> sscp_norm_w = {nullptr};
ggml_tensor * sscp_inp_proj_w = nullptr;
ggml_tensor * sscp_inp_proj_b = nullptr;
ggml_tensor * audio_out_proj_w = nullptr;
ggml_tensor * audio_out_proj_b = nullptr;

bool audio_has_avgpool() const {
return proj_type == PROJECTOR_TYPE_QWEN2A
|| proj_type == PROJECTOR_TYPE_VOXTRAL
Expand Down
Loading
Loading