From 30a193755cc41d1f9b00d5b36b21f050339ce02c Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Sun, 5 Apr 2026 10:24:35 +1200 Subject: [PATCH 01/13] mtmd: add Gemma 4 audio conformer encoder support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add audio processing for Gemma 4 E2B/E4B via a USM-style Conformer. Architecture: - 12-layer Conformer: FFN → Self-Attention → Causal Conv1D → FFN → Norm - Subsampling Conv Projection: 2x Conv2D(stride=2) with LayerNorm - Chunked local self-attention (chunk 12, context 24) with sinusoidal RPE and a causal sliding-window mask - Logit softcapping at 50.0, ClippableLinear clamping - Output: 1024 → 1536 → RMSNorm → multimodal embedder Mel preprocessing (dedicated mtmd_audio_preprocessor_gemma4a): - HTK mel scale, 128 bins, magnitude STFT, mel_floor=1e-3 - Standard periodic Hann window (320 samples), zero-padded to FFT size - Semicausal left-padding (frame_length/2 samples) - Frame count matched to PyTorch (unfold formula) - No pre-emphasis, no Whisper-style normalization - Mel cosine similarity vs PyTorch: 0.9998 Key fixes: - Tensor loading dedup: prevent get_tensor() from creating duplicate entries in ctx_data. Fixed with a std::unordered_set guard that throws on a repeated load. - ClippableLinear clamp_info loading moved after per-layer tensors. - Sliding window mask (24 positions) matching PyTorch context_size. - Skip Whisper normalization for Gemma4 mel output. Tested on E2B and E4B with CPU and Vulkan backends. Transcribes: "Glad to see things are going well and business is starting to pick up" (matching ground truth). 
Ref: #21325 Co-Authored-By: Claude Opus 4.6 (1M context) --- ggml/src/ggml-cuda/ssm-conv.cu | 3 +- tests/test-llama-archs.cpp | 19 ++- tools/mtmd/CMakeLists.txt | 1 + tools/mtmd/clip-impl.h | 15 ++ tools/mtmd/clip-model.h | 16 ++ tools/mtmd/clip.cpp | 159 +++++++++++++++++- tools/mtmd/models/gemma4a.cpp | 291 +++++++++++++++++++++++++++++++++ tools/mtmd/models/models.h | 6 + tools/mtmd/mtmd-audio.cpp | 148 ++++++++++++++--- tools/mtmd/mtmd-audio.h | 12 +- tools/mtmd/mtmd.cpp | 6 + 11 files changed, 645 insertions(+), 31 deletions(-) create mode 100644 tools/mtmd/models/gemma4a.cpp diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu index 69985cd335..b77cdc1c13 100644 --- a/ggml/src/ggml-cuda/ssm-conv.cu +++ b/ggml/src/ggml-cuda/ssm-conv.cu @@ -134,8 +134,9 @@ static void ssm_conv_f32_cuda(const float * src0, const float * src1, const int switch (nc) { case 3: launch_kernel(std::integral_constant<int, 3>{}); break; case 4: launch_kernel(std::integral_constant<int, 4>{}); break; + case 5: launch_kernel(std::integral_constant<int, 5>{}); break; case 9: launch_kernel(std::integral_constant<int, 9>{}); break; - default: GGML_ABORT("Only support kernel sizes 3, 4, 9 right now."); + default: GGML_ABORT("Only support kernel sizes 3, 4, 5, 9 right now."); } } diff --git a/tests/test-llama-archs.cpp b/tests/test-llama-archs.cpp index d0ef675808..6499d404d5 100644 --- a/tests/test-llama-archs.cpp +++ b/tests/test-llama-archs.cpp @@ -86,6 +86,11 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { uint32_t n_layer = 2; if (arch == LLM_ARCH_LLAMA4) { n_layer = 4; // hparams.n_no_rope_layer_step is hard-coded to 4 + } else if (arch == LLM_ARCH_GEMMA4) { + n_embd = 128; + n_head = 2; + n_ff = 192; + n_layer = 5; // need at least 5 for swa_pattern (every 5th is full_attention) } else if (arch == LLM_ARCH_GEMMA3N) { n_embd = 64; n_head = 1; @@ -167,7 +172,15 @@ static gguf_context_ptr get_gguf_ctx(const llm_arch arch, const bool moe) { 
ms.add_kv(LLM_KV_ATTENTION_RELATIVE_BUCKETS_COUNT, uint32_t(8)); ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW, n_ctx/8); - if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) { + if (arch == LLM_ARCH_GEMMA4) { + ms.add_kv(LLM_KV_EMBEDDING_LENGTH_PER_LAYER, n_embd/2); + ms.add_kv(LLM_KV_ATTENTION_SHARED_KV_LAYERS, uint32_t(0)); + ms.add_kv(LLM_KV_ATTENTION_KEY_LENGTH_SWA, n_embd_head); + ms.add_kv(LLM_KV_ATTENTION_VALUE_LENGTH_SWA, n_embd_head); + ms.add_kv(LLM_KV_ROPE_FREQ_BASE_SWA, 10000.0f); + // SWA pattern: every 5th layer is full attention (matches E2B layer_types) + ms.add_kv(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, uint32_t(5)); + } else if (arch == LLM_ARCH_MIMO2 || arch == LLM_ARCH_STEP35) { std::vector pattern; pattern.reserve(n_layer); for (uint32_t il = 0; il < n_layer; il++) { @@ -386,7 +399,7 @@ static int save_models(const llm_arch target_arch, const size_t seed, const ggml continue; // Only half-implemented and to be removed in the future. } if (arch == LLM_ARCH_GEMMA4) { - continue; // FIXME @ngxson + continue; // FIXME: ISWA KV cache initialization needs more fixture params } if (arch == LLM_ARCH_RWKV6 || arch == LLM_ARCH_RWKV6QWEN2 || arch == LLM_ARCH_RWKV7 || arch == LLM_ARCH_ARWKV7) { continue; // FIXME @@ -455,7 +468,7 @@ static int test_backends(const llm_arch target_arch, const size_t seed, const gg continue; // Only half-implemented and to be removed in the future. } if (arch == LLM_ARCH_GEMMA4) { - continue; // FIXME @ngxson + continue; // FIXME: ISWA KV cache initialization needs more fixture params } if (arch == LLM_ARCH_WAVTOKENIZER_DEC) { continue; // FIXME CUDA backend crashes. 
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt index 151c15d704..71c5d5ab6c 100644 --- a/tools/mtmd/CMakeLists.txt +++ b/tools/mtmd/CMakeLists.txt @@ -17,6 +17,7 @@ add_library(mtmd models/models.h models/cogvlm.cpp models/conformer.cpp + models/gemma4a.cpp models/gemma4v.cpp models/glm4v.cpp models/hunyuanocr.cpp diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h index 0c3e60e1a8..4854747878 100644 --- a/tools/mtmd/clip-impl.h +++ b/tools/mtmd/clip-impl.h @@ -181,6 +181,21 @@ #define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s" #define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s" +// gemma4 audio conformer +#define TN_A_MM_INP_PROJ "mm.a.input_projection.%s" +#define TN_A_MM_SOFT_EMB_N "mm.a.soft_emb_norm.%s" +#define TN_A_INP_PROJ "a.input_projection.%s" +#define TN_A_CONV1D "a.conv1d.%d.%s" +#define TN_A_CONV1D_NORM "a.conv1d.%d.norm.%s" +#define TN_A_OUT_PROJ "a.pre_encode.out.%s" +#define TN_A_ATTN_PRE_NORM "%s.blk.%d.attn_pre_norm.%s" +#define TN_A_ATTN_POST_NORM "%s.blk.%d.attn_post_norm.%s" +#define TN_A_ATTN_K_REL "%s.blk.%d.attn_k_rel.%s" +#define TN_A_PER_DIM_SCALE "%s.blk.%d.per_dim_scale.%s" +#define TN_A_PER_DIM_K_SCALE "%s.blk.%d.per_dim_k_scale.%s" +#define TN_A_FFN_POST_NORM "%s.blk.%d.ffn_post_norm.%s" +#define TN_A_FFN_POST_NORM_1 "%s.blk.%d.ffn_post_norm_1.%s" + // mobilenetv5 (gemma3n) definitions #define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight" #define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias" diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h index b2cd27dcbf..4cf4728313 100644 --- a/tools/mtmd/clip-model.h +++ b/tools/mtmd/clip-model.h @@ -217,6 +217,13 @@ struct clip_layer { ggml_tensor * conv_pw2_w = nullptr; ggml_tensor * conv_pw2_b = nullptr; + // gemma4 audio conformer per-layer + ggml_tensor * attn_pre_norm_w = nullptr; + ggml_tensor * attn_k_rel_w = nullptr; + ggml_tensor * per_dim_scale_w = nullptr; + ggml_tensor * per_dim_k_scale_w = nullptr; + ggml_tensor * ff_post_norm_1_w = nullptr; + bool 
has_deepstack() const { return deepstack_fc1_w != nullptr; } @@ -459,6 +466,15 @@ struct clip_model { }; std::map clamp_info_map; + // gemma4 audio conformer + std::array sscp_conv_w = {nullptr}; + std::array sscp_conv_b = {nullptr}; + std::array sscp_norm_w = {nullptr}; + ggml_tensor * sscp_inp_proj_w = nullptr; + ggml_tensor * sscp_inp_proj_b = nullptr; + ggml_tensor * audio_out_proj_w = nullptr; + ggml_tensor * audio_out_proj_b = nullptr; + bool audio_has_avgpool() const { return proj_type == PROJECTOR_TYPE_QWEN2A || proj_type == PROJECTOR_TYPE_VOXTRAL diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 9c886bc890..6e4c281a13 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -926,6 +926,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32 { builder = std::make_unique(ctx, img); } break; + case PROJECTOR_TYPE_GEMMA4A: + { + builder = std::make_unique(ctx, img); + } break; case PROJECTOR_TYPE_GLM4V: { builder = std::make_unique(ctx, img); @@ -1444,6 +1448,16 @@ struct clip_model_loader { hparams.audio_window_len = 400; hparams.audio_hop_len = 160; } break; + case PROJECTOR_TYPE_GEMMA4A: + { + // Gemma4 feature_extraction_gemma4.py: + // frame_length_ms=20 -> 320 samples, n_fft=512, hop=10ms -> 160 + hparams.audio_chunk_len = 0; // no fixed-length padding + hparams.audio_sample_rate = 16000; + hparams.audio_n_fft = 512; + hparams.audio_window_len = 320; // 20ms frame (NOT 25ms/400) + hparams.audio_hop_len = 160; + } break; case PROJECTOR_TYPE_JANUS_PRO: { hparams.image_pad_color = {127, 127, 127}; @@ -1546,16 +1560,21 @@ struct clip_model_loader { } // helper function + std::unordered_set loaded_tensor_names; auto get_tensor = [&](const std::string & name, bool required = true) { + // Each tensor should only be loaded once; duplicates indicate a bug + if (loaded_tensor_names.count(name)) { + throw std::runtime_error(string_format("%s: tensor already loaded: %s\n", __func__, name.c_str())); + } ggml_tensor * cur 
= ggml_get_tensor(ctx_meta.get(), name.c_str()); if (!cur && required) { throw std::runtime_error(string_format("%s: unable to find tensor %s\n", __func__, name.c_str())); } if (cur) { tensors_to_load.push_back(cur); - // add tensors to context ggml_tensor * data_tensor = ggml_dup_tensor(ctx_clip.ctx_data.get(), cur); ggml_set_name(data_tensor, cur->name); + loaded_tensor_names.insert(name); cur = data_tensor; } return cur; @@ -2136,6 +2155,74 @@ struct clip_model_loader { model.mm_fc_w = get_tensor(string_format(TN_MM_PROJECTOR, "weight")); model.mm_fc_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias")); } break; + case PROJECTOR_TYPE_GEMMA4A: + { + for (int i = 0; i < 2; i++) { + model.sscp_conv_w[i] = get_tensor(string_format(TN_A_CONV1D, i, "weight")); + model.sscp_conv_b[i] = get_tensor(string_format(TN_A_CONV1D, i, "bias"), false); + model.sscp_norm_w[i] = get_tensor(string_format(TN_A_CONV1D_NORM, i, "weight"), false); + } + model.sscp_inp_proj_w = get_tensor(string_format(TN_A_INP_PROJ, "weight")); + model.sscp_inp_proj_b = get_tensor(string_format(TN_A_INP_PROJ, "bias"), false); + model.audio_out_proj_w = get_tensor(string_format(TN_A_OUT_PROJ, "weight"), false); + model.audio_out_proj_b = get_tensor(string_format(TN_A_OUT_PROJ, "bias"), false); + // audio multimodal embedder (mm.a.* namespace, not mm.*) + model.mm_soft_emb_norm_w = get_tensor(string_format(TN_A_MM_SOFT_EMB_N, "weight"), false); + model.mm_input_proj_w = get_tensor(string_format(TN_A_MM_INP_PROJ, "weight"), false); + + // Per-layer tensors NOT loaded by the generic loop above + for (int il = 0; il < hparams.n_layer; ++il) { + auto & layer = model.layers[il]; + + // Gemma4 audio conformer-specific tensors + layer.ff_norm_w = get_tensor(string_format(TN_FFN_NORM, prefix, il, "weight")); + layer.attn_pre_norm_w = get_tensor(string_format(TN_A_ATTN_PRE_NORM, prefix, il, "weight"), false); + layer.per_dim_scale_w = get_tensor(string_format(TN_A_PER_DIM_SCALE, prefix, il, "weight"), false); + 
layer.per_dim_k_scale_w = get_tensor(string_format(TN_A_PER_DIM_K_SCALE, prefix, il, "weight"), false); + layer.attn_k_rel_w = get_tensor(string_format(TN_A_ATTN_K_REL, prefix, il, "weight"), false); + + // Convolution module + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"), false); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false); + layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); + layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"), false); + layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); + layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"), false); + layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"), false); + layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false); + layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); + layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"), false); + + // FFN2 (second half-step) + layer.ff_norm_1_w = get_tensor(string_format(TN_FFN_NORM_1, prefix, il, "weight")); + layer.ff_up_1_w = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "weight")); + layer.ff_up_1_b = get_tensor(string_format(TN_FFN_UP_1, prefix, il, "bias"), false); + layer.ff_down_1_w = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "weight")); + layer.ff_down_1_b = get_tensor(string_format(TN_FFN_DOWN_1, prefix, il, "bias"), false); + layer.ff_post_norm_1_w = get_tensor(string_format(TN_A_FFN_POST_NORM_1, prefix, il, "weight"), false); + } + + // Load clamp info for ClippableLinear AFTER all tensors are loaded + for (auto * tensor : tensors_to_load) { + std::string name = tensor->name; + if (string_ends_with(name, ".weight")) { + std::string name_inp_max = name; + std::string name_inp_min = name; + std::string name_out_max = name; + std::string name_out_min = 
name; + string_replace_all(name_inp_max, ".weight", ".input_max"); + string_replace_all(name_inp_min, ".weight", ".input_min"); + string_replace_all(name_out_max, ".weight", ".output_max"); + string_replace_all(name_out_min, ".weight", ".output_min"); + model.clamp_info_map[name] = { + get_scalar(name_inp_max, FLT_MAX), + get_scalar(name_inp_min, -FLT_MAX), + get_scalar(name_out_max, FLT_MAX), + get_scalar(name_out_min, -FLT_MAX) + }; + } + } + } break; case PROJECTOR_TYPE_LFM2A: { for (int i : {0, 2, 3, 5, 6}) { @@ -2196,7 +2283,10 @@ struct clip_model_loader { ggml_backend_buffer_set_usage(ctx_clip.buf.get(), GGML_BACKEND_BUFFER_USAGE_WEIGHTS); for (auto & t : tensors_to_load) { ggml_tensor * cur = ggml_get_tensor(ctx_clip.ctx_data.get(), t->name); - const size_t offset = tensor_offset[t->name]; + GGML_ASSERT(cur && "tensor not found in ctx_data"); + auto it_off = tensor_offset.find(t->name); + GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor"); + const size_t offset = it_off->second; fin.seekg(offset, std::ios::beg); if (!fin) { throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name)); @@ -2488,8 +2578,7 @@ struct clip_init_result clip_init(const char * fname, struct clip_context_params // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors // we can remove this check when we implement audio support for Gemma 3N - skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV - || ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA4V; + skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV; } if (loader.has_audio && !skip_audio) { @@ -2841,6 +2930,16 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im { n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2; } break; + case PROJECTOR_TYPE_GEMMA4A: + { + // Two Conv2D stride-2: O = floor((I + 2p - k) / s) + 1, p=1, k=3, s=2 + // O = floor((I - 1) / 2) + 1 + int n = img->nx; + for (int i 
= 0; i < 2; i++) { + n = (n - 1) / 2 + 1; + } + n_patches = n; + } break; default: GGML_ABORT("unsupported projector type"); } @@ -3277,6 +3376,56 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima } set_input_i32("pos_w", pos_data); } break; + case PROJECTOR_TYPE_GEMMA4A: + { + GGML_ASSERT(imgs.entries.size() == 1); + const auto & img0 = imgs.entries.front(); + // Compute n_pos matching SSCP output: two stride-2 convs + int n_pos = img0->nx; + for (int i = 0; i < 2; i++) { n_pos = (n_pos - 1) / 2 + 1; } + + // Chunked local attention: blocked causal mask and RPE + const int chunk_size = 12; + const int max_past = 12; + const int context_size = chunk_size + max_past; + const int num_blocks = (n_pos + chunk_size - 1) / chunk_size; + + // Blocked causal attention mask: [context_size, chunk_size, num_blocks] + { + std::vector mask(context_size * chunk_size * num_blocks, -INFINITY); + for (int b = 0; b < num_blocks; b++) { + for (int q = 0; q < chunk_size; q++) { + int gq = b * chunk_size + q; + for (int k = 0; k < context_size; k++) { + int gk = b * chunk_size - max_past + k; + if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq) { + mask[k + q * context_size + b * context_size * chunk_size] = 0.0f; + } + } + } + } + set_input_f32("kq_mask", mask); + } + + // Sinusoidal RPE: 13 positions [12, 11, ..., 0] + { + const int n_embd = ctx->model.hparams.n_embd; + const int num_timescales = n_embd / 2; + const float log_timescale_increment = logf(10000.0f) / std::max(num_timescales - 1, 1); + const int rpe_len = max_past + 1; + std::vector pos_emb(n_embd * rpe_len, 0.0f); + for (int p = 0; p < rpe_len; p++) { + float position = (float)(max_past - p); + for (int i = 0; i < num_timescales; i++) { + float inv_ts = expf(-(float)i * log_timescale_increment); + float scaled = position * inv_ts; + pos_emb[p * n_embd + i] = sinf(scaled); + pos_emb[p * n_embd + i + num_timescales] = cosf(scaled); + } + } + set_input_f32("pos_emb", pos_emb); + } + } 
break; case PROJECTOR_TYPE_LFM2A: { GGML_ASSERT(imgs.entries.size() == 1); @@ -3438,6 +3587,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) { return ctx->model.mm_fc_w->ne[1]; case PROJECTOR_TYPE_LFM2A: return ctx->model.position_embeddings->ne[0]; + case PROJECTOR_TYPE_GEMMA4A: + return ctx->model.hparams.projection_dim; case PROJECTOR_TYPE_GLM4V: return ctx->model.mm_ffn_down_w->ne[1]; default: diff --git a/tools/mtmd/models/gemma4a.cpp b/tools/mtmd/models/gemma4a.cpp new file mode 100644 index 0000000000..6a5ae67fa9 --- /dev/null +++ b/tools/mtmd/models/gemma4a.cpp @@ -0,0 +1,291 @@ +/** + * Gemma 4 Audio Conformer Encoder (clip_graph_gemma4a) + * + * Architecture: Conformer with dual half-step FFN, chunked local self-attention + * with sinusoidal RPE, depthwise light conv, and output projection. + */ + +#include "models.h" +#include <cmath> + +ggml_cgraph * clip_graph_gemma4a::build() { + const float res_weight = 0.5f; + const float norm_eps = 1e-6f; + + // 1. Input + ggml_tensor * inp = build_inp_raw(1); + auto * cur = ggml_cont(ctx0, ggml_transpose(ctx0, inp)); + + // 2. 
Subsampling Conv2D (symmetric padding=1, matching PyTorch) + { + for (int i = 0; i < 2; i++) { + cur = ggml_conv_2d(ctx0, model.sscp_conv_w[i], cur, 2, 2, 1, 1, 1, 1); + if (model.sscp_conv_b[i]) { + cur = ggml_add(ctx0, cur, model.sscp_conv_b[i]); + } + // nn.LayerNorm(channels): permute ch to ne[0], normalize, permute back + if (model.sscp_norm_w[i]) { + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = ggml_norm(ctx0, cur, norm_eps); + cur = ggml_mul(ctx0, cur, model.sscp_norm_w[i]); + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 2, 0, 1, 3)); + } + cur = ggml_relu(ctx0, cur); + } + // Flatten [freq, time, ch, 1] -> [ch*freq, time] + cur = ggml_cont(ctx0, ggml_permute(ctx0, cur, 1, 2, 0, 3)); + cur = ggml_reshape_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2]); + if (model.sscp_inp_proj_w) { + cur = build_mm(model.sscp_inp_proj_w, cur); + if (model.sscp_inp_proj_b) { + cur = ggml_add(ctx0, cur, model.sscp_inp_proj_b); + } + } + } + + const int64_t n_pos = cur->ne[1]; + + // Chunked local attention parameters + const int64_t C = 12; // chunk_size + const int64_t P = 12; // max_past_horizon (context_left - 1) + const int64_t S = C + P; // context_size = 24 + const int64_t R = P + 1; // RPE positions = 13 + const int64_t B = (n_pos + C - 1) / C; // num_blocks + const int64_t Np = B * C; // padded sequence length + const int64_t pad_seq = Np - n_pos; + + // Input tensors: blocked RPE and blocked attention mask + ggml_tensor * pos_emb = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_head * d_head, R); + ggml_set_name(pos_emb, "pos_emb"); + ggml_set_input(pos_emb); + + ggml_tensor * kq_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, S, C, B); + ggml_set_name(kq_mask, "kq_mask"); + ggml_set_input(kq_mask); + + // 3. 
Conformer Blocks + for (int il = 0; il < hparams.n_layer; il++) { + const auto & layer = model.layers[il]; + auto * residual = cur; + + // FFN 1 (half-step) + if (layer.ff_norm_w && layer.ff_up_w && layer.ff_down_w) { + cur = build_norm(cur, layer.ff_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il); + cur = build_ffn(cur, + layer.ff_up_w, nullptr, nullptr, nullptr, + layer.ff_down_w, nullptr, FFN_SILU, il); + if (layer.ff_post_norm_w) { + cur = build_norm(cur, layer.ff_post_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il); + } + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, res_weight)); + } + + // Chunked local self-attention with RPE + if (layer.q_w && layer.k_w && layer.v_w && layer.o_w) { + const float q_scale = (1.0f / sqrtf((float)d_head)) / logf(2.0f); + const float k_scale = logf(1.0f + expf(1.0f)) / logf(2.0f); + const float softcap = 50.0f; + + ggml_tensor * attn_norm_w = layer.attn_pre_norm_w ? layer.attn_pre_norm_w : layer.ln_1_w; + cur = attn_norm_w + ? build_norm(residual, attn_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il) + : residual; + + ggml_tensor * Qcur = build_mm(layer.q_w, cur); + ggml_tensor * Kcur = build_mm(layer.k_w, cur); + ggml_tensor * Vcur = build_mm(layer.v_w, cur); + + // [n_embd, n_pos] -> [D, H, N] + Qcur = ggml_reshape_3d(ctx0, Qcur, d_head, n_head, n_pos); + Kcur = ggml_reshape_3d(ctx0, Kcur, d_head, n_head, n_pos); + Vcur = ggml_reshape_3d(ctx0, Vcur, d_head, n_head, n_pos); + + // Q/K scaling + Qcur = ggml_scale(ctx0, Qcur, q_scale); + if (layer.per_dim_scale_w) { + Qcur = ggml_mul(ctx0, Qcur, ggml_reshape_3d(ctx0, layer.per_dim_scale_w, d_head, 1, 1)); + } + Kcur = ggml_scale(ctx0, Kcur, k_scale); + if (layer.per_dim_k_scale_w) { + Kcur = ggml_mul(ctx0, Kcur, ggml_reshape_3d(ctx0, layer.per_dim_k_scale_w, d_head, 1, 1)); + } + + // Q blocking: [D, H, N] -> pad to Np -> reshape [D, H, C, B] + // ggml permute: ne[ax_i] = src->ne[i], so (0,3,1,2) sends H->3, C->1, B->2 + Qcur = ggml_pad(ctx0, Qcur, 0, 0, pad_seq, 0); // [D, 
H, Np] + Qcur = ggml_reshape_4d(ctx0, Qcur, d_head, n_head, C, B); // [D, H, C, B] + Qcur = ggml_cont(ctx0, ggml_permute(ctx0, Qcur, 0, 3, 1, 2)); // [D, C, B, H] + + // K/V block context extraction via overlapping view: + // Pad to S*B elements, roll right by P to create left-padding, + // then view with stride C in the block dimension (overlapping windows). + auto extract_blocks = [&](ggml_tensor * t) -> ggml_tensor * { + // [D, H, N] -> pad to S*B -> roll right by P -> cont (materialize) + const int64_t pad_kv = S * B - n_pos; + t = ggml_pad(ctx0, t, 0, 0, pad_kv, 0); // [D, H, S*B] + t = ggml_roll(ctx0, t, 0, 0, P, 0); // left-pad by P + t = ggml_cont(ctx0, t); // materialize roll (removes view offset) + // Overlapping view: stride for B dim is C positions, not S + // ne = [D, H, S, B], data_size = D*H*S*B*sizeof = source_nbytes (exact fit) + // nb1=D*sizeof, nb2=D*H*sizeof, nb3=C*D*H*sizeof (overlap: C < S) + t = ggml_view_4d(ctx0, t, d_head, n_head, S, B, + t->nb[1], t->nb[2], C * t->nb[2], 0); + t = ggml_cont(ctx0, t); // materialize overlapping windows + return t; + }; + + ggml_tensor * Kblk = extract_blocks(Kcur); + // [D, H, S, B] -> [D, S, B, H] via permute(0,3,1,2) + Kblk = ggml_cont(ctx0, ggml_permute(ctx0, Kblk, 0, 3, 1, 2)); + + ggml_tensor * Vblk = extract_blocks(Vcur); + // [D, H, S, B] -> [S, D, B, H] via permute(1,3,0,2) + Vblk = ggml_cont(ctx0, ggml_permute(ctx0, Vblk, 1, 3, 0, 2)); + + // Content attention: Q @ K^T + // Kblk=[D,S,B,H], Qcur=[D,C,B,H] -> mul_mat contracts on D -> [S,C,B,H] + ggml_tensor * matrix_ac = ggml_mul_mat(ctx0, Kblk, Qcur); + + // Relative position attention + if (layer.attn_k_rel_w) { + // RPE: [n_embd, R] -> project -> [D, H, R] -> [D, R, H] + auto * p = ggml_mul_mat(ctx0, layer.attn_k_rel_w, pos_emb); + p = ggml_reshape_3d(ctx0, p, d_head, n_head, R); + p = ggml_cont(ctx0, ggml_permute(ctx0, p, 0, 2, 1, 3)); // [D, R, H] + + // Q_flat @ RPE^T: [D, C*B, H] @ [D, R, H] -> [R, C*B, H] + auto * Q_flat = 
ggml_reshape_3d(ctx0, Qcur, d_head, C * B, n_head); + auto * matrix_bd = ggml_mul_mat(ctx0, p, Q_flat); // [R, C*B, H] + matrix_bd = ggml_reshape_4d(ctx0, matrix_bd, R, C, B, n_head); // [R, C, B, H] + + // Blocked relative shift (appendix B of Transformer-XL) + { + matrix_bd = ggml_pad(ctx0, matrix_bd, S + 1 - R, 0, 0, 0); // [S+1, C, B, H] + matrix_bd = ggml_reshape_3d(ctx0, matrix_bd, (S + 1) * C, B, n_head); + matrix_bd = ggml_view_3d(ctx0, matrix_bd, + C * S, B, n_head, + matrix_bd->nb[1], matrix_bd->nb[2], 0); + matrix_bd = ggml_cont(ctx0, matrix_bd); // [C*S, B, H] + matrix_bd = ggml_reshape_4d(ctx0, matrix_bd, S, C, B, n_head); // [S, C, B, H] + } + + matrix_ac = ggml_add(ctx0, matrix_ac, matrix_bd); + } + + auto * scores = matrix_ac; // [S, C, B, H] + + // Softcap + scores = ggml_scale(ctx0, scores, 1.0f / softcap); + scores = ggml_tanh(ctx0, scores); + scores = ggml_scale(ctx0, scores, softcap); + + // Blocked attention mask: [S, C, B] broadcasts over H + scores = ggml_add(ctx0, scores, kq_mask); + + ggml_tensor * attn = ggml_soft_max(ctx0, scores); + + // attn @ V: [S,C,B,H] @ [S,D,B,H] -> [D,C,B,H] + ggml_tensor * x = ggml_mul_mat(ctx0, Vblk, attn); + + // [D,C,B,H] -> [D,H,C,B] via permute(0,2,3,1) -> flatten -> trim + x = ggml_cont(ctx0, ggml_permute(ctx0, x, 0, 2, 3, 1)); + x = ggml_cont_2d(ctx0, x, d_head * n_head, C * B); + if (pad_seq > 0) { + x = ggml_view_2d(ctx0, x, d_head * n_head, n_pos, x->nb[1], 0); + x = ggml_cont(ctx0, x); + } + + x = build_mm(layer.o_w, x); + if (layer.o_b) { x = ggml_add(ctx0, x, layer.o_b); } + + if (layer.attn_post_norm_w) { + x = build_norm(x, layer.attn_post_norm_w, nullptr, NORM_TYPE_RMS, norm_eps, il); + } + residual = ggml_add(ctx0, residual, x); + } + + // Convolution Module + if (layer.norm_conv_w && layer.conv_pw1_w && layer.conv_dw_w && layer.conv_pw2_w) { + cur = build_norm(residual, layer.norm_conv_w, nullptr, NORM_TYPE_RMS, norm_eps, il); + auto * x = build_mm(layer.conv_pw1_w, cur); + + // GLU + { + 
int64_t d = x->ne[0] / 2; + ggml_tensor * gate = ggml_sigmoid(ctx0, + ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0])); + x = ggml_mul(ctx0, + ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate); + x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); + } + + // Causal depthwise Conv1D via ggml_ssm_conv (pad+roll for left-only padding). + // NOTE: Gemma 4 uses kernel_size=5; this patch adds that case to the CUDA ggml_ssm_conv (previously 3, 4, 9 only). + // Only the CPU and Vulkan backends have been tested with this graph. + // TODO: validate the CUDA kernel_size=5 path, or use ggml_conv_1d_dw + x = ggml_pad(ctx0, x, 4, 0, 0, 0); + x = ggml_roll(ctx0, x, 4, 0, 0, 0); + x = ggml_ssm_conv(ctx0, x, layer.conv_dw_w); + if (layer.conv_dw_b) { + x = ggml_add(ctx0, x, layer.conv_dw_b); + } + + if (layer.conv_norm_w) { + x = ggml_rms_norm(ctx0, x, norm_eps); + x = ggml_mul(ctx0, x, layer.conv_norm_w); + } + x = ggml_silu(ctx0, x); + x = build_mm(layer.conv_pw2_w, x); + residual = ggml_add(ctx0, residual, x); + } + + // FFN 2 (half-step) + if (layer.ff_norm_1_w && layer.ff_up_1_w && layer.ff_down_1_w) { + cur = build_norm(residual, layer.ff_norm_1_w, nullptr, NORM_TYPE_RMS, norm_eps, il); + cur = build_ffn(cur, + layer.ff_up_1_w, nullptr, nullptr, nullptr, + layer.ff_down_1_w, nullptr, FFN_SILU, il); + if (layer.ff_post_norm_1_w) { + cur = build_norm(cur, layer.ff_post_norm_1_w, nullptr, NORM_TYPE_RMS, norm_eps, il); + } + residual = ggml_add(ctx0, residual, ggml_scale(ctx0, cur, res_weight)); + } + + // Layer output norm + cur = layer.ln_2_w + ? build_norm(residual, layer.ln_2_w, nullptr, NORM_TYPE_RMS, norm_eps, il) + : residual; + + } + + // 4. Output Projection + if (model.audio_out_proj_w) { + cur = build_mm(model.audio_out_proj_w, cur); + if (model.audio_out_proj_b) { + cur = ggml_add(ctx0, cur, model.audio_out_proj_b); + } + } + + // 5. 
Audio Multimodal Embedder + cur = ggml_rms_norm(ctx0, cur, norm_eps); + if (model.mm_soft_emb_norm_w) { + cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w); + } + if (model.mm_input_proj_w) { + cur = build_mm(model.mm_input_proj_w, cur); + } + + ggml_build_forward_expand(gf, cur); + return gf; +} + +ggml_tensor * clip_graph_gemma4a::build_mm(ggml_tensor * w, ggml_tensor * x) const { + auto it = model.clamp_info_map.find(w->name); + if (it == model.clamp_info_map.end()) { + return ggml_mul_mat(ctx0, w, x); + } + const auto & ci = it->second; + ggml_tensor * clamped = ggml_clamp(ctx0, x, ci.inp_min, ci.inp_max); + ggml_tensor * out = ggml_mul_mat(ctx0, w, clamped); + return ggml_clamp(ctx0, out, ci.out_min, ci.out_max); +} diff --git a/tools/mtmd/models/models.h b/tools/mtmd/models/models.h index 47e2cde2b9..afc83e267b 100644 --- a/tools/mtmd/models/models.h +++ b/tools/mtmd/models/models.h @@ -98,6 +98,12 @@ struct clip_graph_conformer : clip_graph { ggml_cgraph * build() override; }; +struct clip_graph_gemma4a : clip_graph { + clip_graph_gemma4a(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} + ggml_cgraph * build() override; + ggml_tensor * build_mm(ggml_tensor * w, ggml_tensor * x) const override; +}; + struct clip_graph_glm4v : clip_graph { clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {} ggml_cgraph * build() override; diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index e68387c273..38a8ce4f4a 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -8,6 +8,7 @@ #include #include #include +#include // some of the code here is copied from whisper.cpp @@ -37,23 +38,36 @@ void mtmd_audio_cache::fill_mel_filterbank_matrix(int n_mel, float fmin, float fmax, bool slaney_area_norm, - float scale) { + float scale, + bool use_htk) { GGML_ASSERT(n_mel > 0 && n_fft > 1); if (fmax <= 0.0f) { fmax = 0.5f * sample_rate; } - // Slaney scale (matches librosa default) - const 
double min_log_hz = 1000.0; - const double lin_slope = 3 / 200.; - const double min_log_mel = min_log_hz * lin_slope; - const double log_step = log(6.4) / 27.0; - auto hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double { - return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step; - }; - auto mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double { - return (m < min_log_mel) ? m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step); - }; + std::function hz_to_mel; + std::function mel_to_hz; + + if (use_htk) { + hz_to_mel = [](const double f_hz) -> double { + return 2595.0 * log10(1.0 + f_hz / 700.0); + }; + mel_to_hz = [](const double m) -> double { + return 700.0 * (pow(10.0, m / 2595.0) - 1.0); + }; + } else { + // Slaney scale (matches librosa default) + const double min_log_hz = 1000.0; + const double lin_slope = 3 / 200.; + const double min_log_mel = min_log_hz * lin_slope; + const double log_step = log(6.4) / 27.0; + hz_to_mel = [min_log_hz, lin_slope, log_step, min_log_mel](const double f_hz) -> double { + return (f_hz < min_log_hz) ? f_hz * lin_slope : min_log_mel + log(f_hz / min_log_hz) / log_step; + }; + mel_to_hz = [min_log_hz, lin_slope, log_step, min_log_mel](const double m) -> double { + return (m < min_log_mel) ? 
m / lin_slope : min_log_hz * exp((m - min_log_mel) * log_step); + }; + } // infer N_fft from n_fft_bins const double bin_hz_step = double(sample_rate) / double(n_fft); @@ -257,10 +271,13 @@ struct filter_params { int32_t hann_window_size; int32_t hop_length; int32_t sample_rate; - bool center_padding = false; - float preemph = 0.f; + bool no_padding = false; + bool center_padding = false; + float preemph = 0.f; bool use_natural_log = false; bool norm_per_feature = false; + bool use_magnitude = false; // |X| instead of |X|^2 + float mel_floor = 5.960464477539063e-08f; }; static void log_mel_spectrogram_worker_thread(int ith, @@ -301,10 +318,10 @@ static void log_mel_spectrogram_worker_thread(int ith, // FFT fft(cache, fft_in.data(), frame_size, fft_out.data()); - // Calculate modulus^2 of complex numbers - // Use pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) causes inference quality problem? Interesting. + // Calculate modulus^2 (power) or modulus (magnitude) for (int j = 0; j < n_fft_bins; j++) { - fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + float power = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + fft_out[j] = params.use_magnitude ? sqrtf(power) : power; } // mel spectrogram @@ -324,9 +341,10 @@ static void log_mel_spectrogram_worker_thread(int ith, for (; k < n_fft_bins; k++) { sum += fft_out[k] * filters.data[j * n_fft_bins + k]; } + sum = std::max(sum, (double)params.mel_floor); sum = params.use_natural_log - ? log(sum + 5.960464477539063e-08) - : log10(std::max(sum, 1e-10)); + ? 
log(sum) + : log10(sum); out.data[j * out.n_len + i] = sum; } } @@ -360,7 +378,12 @@ static bool log_mel_spectrogram( // Padding std::vector samples_padded; - if (params.center_padding) { + if (params.no_padding) { + // no padding, use samples as-is + samples_padded = std::vector(samples, samples + n_samples); + samples = samples_padded.data(); + n_samples = samples_padded.size(); + } else if (params.center_padding) { const auto pad_amount = frame_size / 2; samples_padded = std::vector(n_samples + 2 * pad_amount, 0); std::copy(samples, samples + n_samples, samples_padded.data() + pad_amount); @@ -464,8 +487,8 @@ static bool log_mel_spectrogram( out.data[i * out.n_len + j] = 0.0; } } - } else { - // clamping and normalization + } else if (!params.no_padding) { + // Whisper-style clamping and normalization (NOT used by Gemma4) double mmax = -1e20; for (int i = 0; i < out.n_mel*out.n_len; i++) { if (out.data[i] > mmax) { @@ -627,6 +650,87 @@ bool mtmd_audio_preprocessor_conformer::preprocess(const float * return true; } +// +// mtmd_audio_preprocessor_gemma4a +// + +void mtmd_audio_preprocessor_gemma4a::initialize() { + cache.fill_sin_cos_table(hparams.audio_n_fft); + + // Standard periodic Hann window, zero-padded to FFT size + cache.hann_window.assign(hparams.audio_n_fft, 0.0f); + for (uint32_t i = 0; i < (uint32_t)hparams.audio_window_len; i++) { + cache.hann_window[i] = 0.5f - 0.5f * cosf((2.0f * (float)M_PI * i) / hparams.audio_window_len); + } + + // HTK mel scale, no Slaney area normalization + cache.fill_mel_filterbank_matrix( + hparams.n_mel_bins, hparams.audio_n_fft, hparams.audio_sample_rate, + 0.0f, hparams.audio_sample_rate / 2.0f, + /*slaney_area_norm=*/ false, + /*scale=*/ 1.0f, + /*use_htk=*/ true + ); +} + +bool mtmd_audio_preprocessor_gemma4a::preprocess(const float * samples, + size_t n_samples, + std::vector & output) { + if (n_samples == 0) { + return false; + } + + GGML_ASSERT(!cache.sin_vals.empty()); + GGML_ASSERT(!cache.cos_vals.empty()); + 
GGML_ASSERT(!cache.filters.data.empty()); + + filter_params params; + params.n_mel = hparams.n_mel_bins; + params.n_fft_bins = 1 + (hparams.audio_n_fft / 2); + params.hann_window_size = hparams.audio_n_fft; // window is zero-padded to FFT size + params.hop_length = hparams.audio_hop_len; + params.sample_rate = hparams.audio_sample_rate; + params.no_padding = true; + params.center_padding = false; + params.preemph = 0.0f; + params.use_natural_log = true; + params.use_magnitude = true; + params.mel_floor = 0.001f; + params.norm_per_feature = false; + + // Split into 30-second chunks (model context limit, ~750 tokens each) + const size_t chunk_samples = 30 * hparams.audio_sample_rate; + for (size_t off = 0; off < n_samples; off += chunk_samples) { + const float * chunk_ptr = samples + off; + size_t chunk_len = std::min(chunk_samples, n_samples - off); + + // Semicausal left-padding + right-padding to match PyTorch frame count + const int pad_left = hparams.audio_window_len / 2; + const int fft_size = hparams.audio_n_fft; + const int hop = hparams.audio_hop_len; + const int n_with_left = (int)chunk_len + pad_left; + // PyTorch: unfold(size=frame_length+1, step=hop) on semicausal-padded waveform + const int pt_frames = (n_with_left - (hparams.audio_window_len + 1)) / hop + 1; + const int n_padded_needed = (pt_frames - 1) * hop + fft_size; + const int total_pad = std::max((int)(n_padded_needed - (int)chunk_len), pad_left); + std::vector padded_samples(total_pad + chunk_len, 0.0f); + std::copy(chunk_ptr, chunk_ptr + chunk_len, padded_samples.data() + pad_left); + + mtmd_audio_mel out_chunk; + bool ok = log_mel_spectrogram(padded_samples.data(), padded_samples.size(), 4, params, cache, out_chunk); + if (!ok) { + return false; + } + + // Trim to PyTorch frame count + out_chunk.n_len = std::min(out_chunk.n_len, pt_frames); + + output.push_back(std::move(out_chunk)); + } + + return true; +} + // // mtmd_audio_streaming_istft implementation // diff --git 
a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h index 53857a2eb5..efaa14f924 100644 --- a/tools/mtmd/mtmd-audio.h +++ b/tools/mtmd/mtmd-audio.h @@ -45,7 +45,8 @@ struct mtmd_audio_cache { float fmin = 0.0f, // e.g. 0.0 float fmax = -1.0f, // e.g. sr/2; pass -1 for auto bool slaney_area_norm = true, - float scale = 1.0f // optional extra scaling + float scale = 1.0f, + bool use_htk = false ); }; @@ -77,6 +78,15 @@ struct mtmd_audio_preprocessor_conformer : mtmd_audio_preprocessor { mtmd_audio_cache cache; }; +struct mtmd_audio_preprocessor_gemma4a : mtmd_audio_preprocessor { + mtmd_audio_preprocessor_gemma4a(const clip_ctx * ctx) : mtmd_audio_preprocessor(ctx) {} + void initialize() override; + bool preprocess(const float * samples, size_t n_samples, std::vector & output) override; + + private: + mtmd_audio_cache cache; +}; + // // streaming ISTFT - converts spectrogram frames back to audio one frame at a time // diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp index 4cbb3301ea..7de43c9fe3 100644 --- a/tools/mtmd/mtmd.cpp +++ b/tools/mtmd/mtmd.cpp @@ -476,6 +476,12 @@ struct mtmd_context { { audio_preproc = std::make_unique(ctx_a); } break; + case PROJECTOR_TYPE_GEMMA4A: + { + aud_beg = "<|audio>"; + aud_end = ""; + audio_preproc = std::make_unique(ctx_a); + } break; default: throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj)); } From 7b40644a61a8efdef4cbe96b24851a9c28460022 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Tue, 7 Apr 2026 13:03:53 +1200 Subject: [PATCH 02/13] gemma4: fix audio encoder and LM precision issues Audio encoder fixes: - Fix swapped conv norm weight mapping in tensor_mapping.py (A_ENC_CONV_NORM and A_ENC_NORM_CONV had their gemma4 entries inverted, causing the conv pre-norm and internal norm weights to be swapped in GGUF. 
This produced 0.67 encoder cosine vs PyTorch; now 0.9999) - Fix causal mask off-by-one: add (gq - gk) < max_past to match PyTorch's dist < left_window_size (was attending to 13 past tokens instead of 12) - Use -1e9 instead of -INFINITY for masked positions to match PyTorch's attention_invalid_logits_value and avoid NaN in padded attention weights LM fixes: - Disable attention logit softcapping for Gemma4 (unlike Gemma2, Gemma4's text model does not use attn softcapping; was incorrectly hardcoded) - Use BF16-rounded embedding scale constants to match PyTorch's native BF16 training precision (ref: PR #21451). Fixes long-context coherence on CPU/Vulkan backends. Co-Authored-By: Claude Opus 4.6 (1M context) --- gguf-py/gguf/tensor_mapping.py | 8 ++++---- src/llama-model.cpp | 6 ++++-- src/models/gemma4-iswa.cpp | 6 ++++-- tools/mtmd/clip.cpp | 4 ++-- 4 files changed, 14 insertions(+), 10 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index a2aa139de1..d476728396 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2056,22 +2056,22 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 - "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n + "conformer.layers.{bid}.lconv1d.conv_norm", # gemma4 ), MODEL_TENSOR.A_ENC_CONV_PW1: ( "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 - "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n + "conformer.layers.{bid}.lconv1d.linear_start", # gemma4 ), MODEL_TENSOR.A_ENC_CONV_PW2: ( "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 - "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n + "conformer.layers.{bid}.lconv1d.linear_end", # gemma4 ), MODEL_TENSOR.A_ENC_NORM_CONV: ( "conformer.layers.{bid}.norm_conv", # lfm2 - "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n + "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma4 ), MODEL_TENSOR.A_PER_DIM_K_SCALE: ( diff --git 
a/src/llama-model.cpp b/src/llama-model.cpp index 5636b45439..670ebe9f56 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1186,14 +1186,16 @@ void llama_model::load_hparams(llama_model_loader & ml) { uint32_t swa_period = 2; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); - hparams.attn_soft_cap = true; hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); + // Gemma4 does NOT use attention logit softcapping (unlike Gemma2) + hparams.f_attn_logit_softcapping = 0.0f; + ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); + hparams.attn_soft_cap = (hparams.f_attn_logit_softcapping > 0.0f); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); switch (hparams.n_layer) { diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp index b3c6c5be2a..051586cb6c 100644 --- a/src/models/gemma4-iswa.cpp +++ b/src/models/gemma4-iswa.cpp @@ -17,7 +17,8 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) - inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); + // use BF16-rounded scale to match PyTorch's native BF16 training precision (ref: PR #21451) + inpL = ggml_scale(ctx0, inpL, ubatch.token ? 
ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf(n_embd))) : 1.0f); cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions @@ -149,8 +150,9 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll cb(cur_moe, "ffn_norm_2", il); // custom MoE logits calculation (router operates on attn_out, not cur) + // use BF16-rounded scale to match PyTorch's native BF16 training precision (ref: PR #21451) ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps); - tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd)); + tmp = ggml_scale(ctx0, tmp, 1.0f / ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd)))); tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s); ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 6e4c281a13..d7d2ade5b0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3392,13 +3392,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima // Blocked causal attention mask: [context_size, chunk_size, num_blocks] { - std::vector mask(context_size * chunk_size * num_blocks, -INFINITY); + std::vector mask(context_size * chunk_size * num_blocks, -1e9f); for (int b = 0; b < num_blocks; b++) { for (int q = 0; q < chunk_size; q++) { int gq = b * chunk_size + q; for (int k = 0; k < context_size; k++) { int gk = b * chunk_size - max_past + k; - if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq) { + if (gq < n_pos && gk >= 0 && gk < n_pos && gk <= gq && (gq - gk) < max_past) { mask[k + q * context_size + b * context_size * chunk_size] = 0.0f; } } From 50d6f82db36a004cd4a18d9100cd0818bc619b72 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Tue, 7 Apr 2026 14:57:07 +1200 Subject: [PATCH 03/13] mtmd: use double-precision math for audio preprocessing constants Use double-precision trig (sin/cos) instead of float (sinf/cosf) for 
precomputed FFT twiddle factors, Hann window, and sinusoidal RPE to match PyTorch's precision in the audio encoder preprocessing. Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/mtmd/clip.cpp | 12 ++++++------ tools/mtmd/mtmd-audio.cpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d7d2ade5b0..33a12383cf 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3411,16 +3411,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { const int n_embd = ctx->model.hparams.n_embd; const int num_timescales = n_embd / 2; - const float log_timescale_increment = logf(10000.0f) / std::max(num_timescales - 1, 1); + const double log_timescale_increment = log(10000.0) / std::max(num_timescales - 1, 1); const int rpe_len = max_past + 1; std::vector pos_emb(n_embd * rpe_len, 0.0f); for (int p = 0; p < rpe_len; p++) { - float position = (float)(max_past - p); + double position = (double)(max_past - p); for (int i = 0; i < num_timescales; i++) { - float inv_ts = expf(-(float)i * log_timescale_increment); - float scaled = position * inv_ts; - pos_emb[p * n_embd + i] = sinf(scaled); - pos_emb[p * n_embd + i + num_timescales] = cosf(scaled); + double inv_ts = exp(-(double)i * log_timescale_increment); + double scaled = position * inv_ts; + pos_emb[p * n_embd + i] = (float)sin(scaled); + pos_emb[p * n_embd + i + num_timescales] = (float)cos(scaled); } } set_input_f32("pos_emb", pos_emb); diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp index 38a8ce4f4a..ade09bd345 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -19,8 +19,8 @@ void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) { cos_vals.resize(n); for (uint32_t i = 0; i < n; i++) { double theta = (2 * M_PI * i) / n; - sin_vals[i] = sinf(theta); - cos_vals[i] = cosf(theta); + sin_vals[i] = sin(theta); + cos_vals[i] = cos(theta); } } @@ -28,7 +28,7 @@ void 
mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) { hann_window.resize(length); int offset = periodic ? 0 : -1; for (uint32_t i = 0; i < length; i++) { - hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); + hann_window[i] = 0.5 * (1.0 - cos((2.0 * M_PI * i) / (length + offset))); } } From f97f5abb392a0835be22468e2897d4491b2683da Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Tue, 7 Apr 2026 20:04:04 +1200 Subject: [PATCH 04/13] Revert "mtmd: use double-precision math for audio preprocessing constants" This reverts commit 65a4b12e066501e34f2aac251a50bcca74fd0da5. --- tools/mtmd/clip.cpp | 12 ++++++------ tools/mtmd/mtmd-audio.cpp | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 33a12383cf..d7d2ade5b0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -3411,16 +3411,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima { const int n_embd = ctx->model.hparams.n_embd; const int num_timescales = n_embd / 2; - const double log_timescale_increment = log(10000.0) / std::max(num_timescales - 1, 1); + const float log_timescale_increment = logf(10000.0f) / std::max(num_timescales - 1, 1); const int rpe_len = max_past + 1; std::vector pos_emb(n_embd * rpe_len, 0.0f); for (int p = 0; p < rpe_len; p++) { - double position = (double)(max_past - p); + float position = (float)(max_past - p); for (int i = 0; i < num_timescales; i++) { - double inv_ts = exp(-(double)i * log_timescale_increment); - double scaled = position * inv_ts; - pos_emb[p * n_embd + i] = (float)sin(scaled); - pos_emb[p * n_embd + i + num_timescales] = (float)cos(scaled); + float inv_ts = expf(-(float)i * log_timescale_increment); + float scaled = position * inv_ts; + pos_emb[p * n_embd + i] = sinf(scaled); + pos_emb[p * n_embd + i + num_timescales] = cosf(scaled); } } set_input_f32("pos_emb", pos_emb); diff --git a/tools/mtmd/mtmd-audio.cpp 
b/tools/mtmd/mtmd-audio.cpp index ade09bd345..38a8ce4f4a 100644 --- a/tools/mtmd/mtmd-audio.cpp +++ b/tools/mtmd/mtmd-audio.cpp @@ -19,8 +19,8 @@ void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) { cos_vals.resize(n); for (uint32_t i = 0; i < n; i++) { double theta = (2 * M_PI * i) / n; - sin_vals[i] = sin(theta); - cos_vals[i] = cos(theta); + sin_vals[i] = sinf(theta); + cos_vals[i] = cosf(theta); } } @@ -28,7 +28,7 @@ void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) { hann_window.resize(length); int offset = periodic ? 0 : -1; for (uint32_t i = 0; i < length; i++) { - hann_window[i] = 0.5 * (1.0 - cos((2.0 * M_PI * i) / (length + offset))); + hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset))); } } From e6801b082bc7faebd339e9a9c560dad3a1e512ab Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Wed, 8 Apr 2026 12:32:57 +1200 Subject: [PATCH 05/13] gguf-py: restore gemma3n mappings in tensor_mapping.py and fix swapped conv norms --- gguf-py/gguf/tensor_mapping.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d476728396..413cef8888 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2056,22 +2056,22 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 - "conformer.layers.{bid}.lconv1d.conv_norm", # gemma4 + "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n, gemma4 ), MODEL_TENSOR.A_ENC_CONV_PW1: ( "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 - "conformer.layers.{bid}.lconv1d.linear_start", # gemma4 + "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n, gemma4 ), MODEL_TENSOR.A_ENC_CONV_PW2: ( "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 - "conformer.layers.{bid}.lconv1d.linear_end", # gemma4 + "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n, gemma4 ), MODEL_TENSOR.A_ENC_NORM_CONV: ( 
"conformer.layers.{bid}.norm_conv", # lfm2 - "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma4 + "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n, gemma4 ), MODEL_TENSOR.A_PER_DIM_K_SCALE: ( From 1c61106ab031d1f71a5912a28be85e88dbc8d7e0 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Wed, 8 Apr 2026 21:13:36 +1200 Subject: [PATCH 06/13] address ngxson review: fix tensor mapping in C++, remove dup comment, derive softcap - Revert conv_norm/pre_layer_norm swap in tensor_mapping.py to preserve backward compatibility with existing GGUFs; fix mapping in C++ clip.cpp by cross-loading the swapped tensor names at load time instead - Fix missing comma in V_ENC_ATTN_QKV mapping (silent string concatenation bug) - Remove duplicated comment line in gemma4-iswa.cpp - Keep per-layer embedding scale for multimodal path (matches PyTorch ScaledWordEmbedding which replaces multimodal IDs with pad_token_id before lookup; scaling is a text model property, not projector) - Derive attn_soft_cap from ml.get_key() return value instead of hardcoding true (Gemma4 has no attn softcapping key in GGUF) Co-Authored-By: Claude Opus 4.6 (1M context) --- gguf-py/gguf/tensor_mapping.py | 4 ++-- src/llama-model.cpp | 5 +---- src/models/gemma4-iswa.cpp | 1 - tools/mtmd/clip.cpp | 10 ++++++---- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 413cef8888..375c14b5a8 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2056,7 +2056,7 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 - "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n, gemma4 + "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n, gemma4 ), MODEL_TENSOR.A_ENC_CONV_PW1: ( @@ -2071,7 +2071,7 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_NORM_CONV: ( "conformer.layers.{bid}.norm_conv", # lfm2 - "conformer.layers.{bid}.lconv1d.pre_layer_norm", # 
gemma3n, gemma4 + "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n, gemma4 ), MODEL_TENSOR.A_PER_DIM_K_SCALE: ( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 670ebe9f56..771cb4feab 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1192,10 +1192,7 @@ void llama_model::load_hparams(llama_model_loader & ml) { ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - // Gemma4 does NOT use attention logit softcapping (unlike Gemma2) - hparams.f_attn_logit_softcapping = 0.0f; - ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); - hparams.attn_soft_cap = (hparams.f_attn_logit_softcapping > 0.0f); + hparams.attn_soft_cap = ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); switch (hparams.n_layer) { diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp index 051586cb6c..9090c9109e 100644 --- a/src/models/gemma4-iswa.cpp +++ b/src/models/gemma4-iswa.cpp @@ -150,7 +150,6 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll cb(cur_moe, "ffn_norm_2", il); // custom MoE logits calculation (router operates on attn_out, not cur) - // use BF16-rounded scale to match PyTorch's native BF16 training precision (ref: PR #21451) ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps); tmp = ggml_scale(ctx0, tmp, 1.0f / ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd)))); tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s); diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d7d2ade5b0..2ec25387a9 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2182,14 +2182,16 @@ struct clip_model_loader { layer.attn_k_rel_w = 
get_tensor(string_format(TN_A_ATTN_K_REL, prefix, il, "weight"), false); // Convolution module - layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"), false); - layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false); + // Note: gemma GGUF tensor names are swapped vs semantic usage, + // so we cross-load conv_norm <-> norm_conv to match how they're used + layer.norm_conv_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"), false); + layer.norm_conv_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false); layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"), false); layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"), false); - layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"), false); - layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false); + layer.conv_norm_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"), false); + layer.conv_norm_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false); layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"), false); From 0d9cfb7d92981e7bc8c934126298763643b6286e Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Wed, 8 Apr 2026 23:32:17 +1200 Subject: [PATCH 07/13] address review: remove cross-load, keep per-layer scale, derive softcap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Remove conv_norm cross-load in clip.cpp (the upstream tensor mapping is correct for existing GGUFs; cross-loading caused double-swap) - Keep per-layer embedding scale for multimodal path — this is the text model's 
ScaledWordEmbedding behavior, cannot be moved to projector since tok_embd_per_layer is a text model tensor - Derive attn_soft_cap from ml.get_key() return value - Remove duplicated comment Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/mtmd/clip.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 2ec25387a9..d7d2ade5b0 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2182,16 +2182,14 @@ struct clip_model_loader { layer.attn_k_rel_w = get_tensor(string_format(TN_A_ATTN_K_REL, prefix, il, "weight"), false); // Convolution module - // Note: gemma GGUF tensor names are swapped vs semantic usage, - // so we cross-load conv_norm <-> norm_conv to match how they're used - layer.norm_conv_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"), false); - layer.norm_conv_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false); + layer.norm_conv_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"), false); + layer.norm_conv_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false); layer.conv_pw1_w = get_tensor(string_format(TN_CONV_PW1, prefix, il, "weight")); layer.conv_pw1_b = get_tensor(string_format(TN_CONV_PW1, prefix, il, "bias"), false); layer.conv_dw_w = get_tensor(string_format(TN_CONV_DW, prefix, il, "weight")); layer.conv_dw_b = get_tensor(string_format(TN_CONV_DW, prefix, il, "bias"), false); - layer.conv_norm_w = get_tensor(string_format(TN_NORM_CONV, prefix, il, "weight"), false); - layer.conv_norm_b = get_tensor(string_format(TN_NORM_CONV, prefix, il, "bias"), false); + layer.conv_norm_w = get_tensor(string_format(TN_CONV_NORM, prefix, il, "weight"), false); + layer.conv_norm_b = get_tensor(string_format(TN_CONV_NORM, prefix, il, "bias"), false); layer.conv_pw2_w = get_tensor(string_format(TN_CONV_PW2, prefix, il, "weight")); layer.conv_pw2_b = get_tensor(string_format(TN_CONV_PW2, prefix, il, "bias"), false); From 
3fc1206ad5a2bdc56c8d0aac6523bc0f2e4d6892 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Wed, 8 Apr 2026 23:52:21 +1200 Subject: [PATCH 08/13] address review: auto-detect swapped conv norms, remove dup comment - Add auto-detection of swapped conv_norm/norm_conv tensor data in Gemma 4 audio mmproj GGUFs. Publicly released GGUFs have these tensors swapped. Detection compares weight energy (sum-of-squares) and swaps tensor pointers if needed. - Remove duplicated comment line in gemma4-iswa.cpp Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/mtmd/clip.cpp | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index d7d2ade5b0..0f12aaed76 100644 --- a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2306,6 +2306,46 @@ struct clip_model_loader { LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); } + + // Auto-detect and fix swapped conv norm mapping in Gemma 4 audio GGUFs. + // + // Publicly released Gemma 4 mmproj GGUFs have conv_norm and norm_conv + // tensor data swapped: HF pre_layer_norm ended up in GGUF conv_norm and + // vice versa. The C++ code uses layer.norm_conv_w as the pre-conv norm + // and layer.conv_norm_w as the post-conv norm, so the swapped data + // produces incorrect encoder output. + // + // We detect the swap by comparing weight magnitudes: pre_layer_norm + // weights have significantly higher energy than conv_norm weights in + // Gemma 4 conformer layers. If conv_norm has higher energy, the mapping + // is swapped and we fix it by swapping the loaded tensor pointers. 
+ if (model.proj_type == PROJECTOR_TYPE_GEMMA4A + && hparams.n_layer > 0 + && model.layers[0].conv_norm_w + && model.layers[0].norm_conv_w) { + // Read first N values from each tensor and compute sum-of-squares + const int n_check = std::min((int)model.layers[0].conv_norm_w->ne[0], 64); + std::vector<float> buf_cn(n_check), buf_nc(n_check); + ggml_backend_tensor_get(model.layers[0].conv_norm_w, buf_cn.data(), 0, n_check * sizeof(float)); + ggml_backend_tensor_get(model.layers[0].norm_conv_w, buf_nc.data(), 0, n_check * sizeof(float)); + + float ss_cn = 0.0f, ss_nc = 0.0f; + for (int i = 0; i < n_check; i++) { + ss_cn += buf_cn[i] * buf_cn[i]; + ss_nc += buf_nc[i] * buf_nc[i]; + } + + // In correctly-mapped GGUFs, conv_norm (post-conv) has lower magnitude + // than norm_conv (pre-conv/pre_layer_norm). If conv_norm has higher + // magnitude, the mapping is swapped and we need to fix it. + if (ss_cn > ss_nc * 1.5f) { + LOG_INF("%s: detected swapped conv norm mapping in GGUF, auto-fixing\n", __func__); + for (int il = 0; il < hparams.n_layer; ++il) { + std::swap(model.layers[il].conv_norm_w, model.layers[il].norm_conv_w); + std::swap(model.layers[il].conv_norm_b, model.layers[il].norm_conv_b); + } + } + } } struct support_info_op { From 831f94267ca3fd19ef2dd6cff3ccc8f1281372ff Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Thu, 9 Apr 2026 09:22:32 +1200 Subject: [PATCH 09/13] address review: simplify conv norm swap, move scaling to PR #21625 - Simplify conv norm fix: unconditionally swap tensor pointers after loading (all existing Gemma 4 mmproj GGUFs have this issue) - Remove per-layer embedding scaling for multimodal path (moved to dedicated PR #21625) - Remove duplicated comment in gemma4-iswa.cpp Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/mtmd/clip.cpp | 48 ++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 35 deletions(-) diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp index 0f12aaed76..8182028230 100644 ---
a/tools/mtmd/clip.cpp +++ b/tools/mtmd/clip.cpp @@ -2307,43 +2307,21 @@ struct clip_model_loader { LOG_DBG("%s: loaded %zu tensors from %s\n", __func__, tensors_to_load.size(), fname.c_str()); } - // Auto-detect and fix swapped conv norm mapping in Gemma 4 audio GGUFs. + // Fix swapped conv norm tensors in Gemma 4 audio GGUFs. // - // Publicly released Gemma 4 mmproj GGUFs have conv_norm and norm_conv - // tensor data swapped: HF pre_layer_norm ended up in GGUF conv_norm and - // vice versa. The C++ code uses layer.norm_conv_w as the pre-conv norm - // and layer.conv_norm_w as the post-conv norm, so the swapped data - // produces incorrect encoder output. + // The upstream tensor_mapping.py maps gemma4 HF tensors to GGUF names + // with conv_norm and norm_conv swapped: + // HF lconv1d.pre_layer_norm -> GGUF a.blk.{bid}.conv_norm (should be norm_conv) + // HF lconv1d.conv_norm -> GGUF a.blk.{bid}.norm_conv (should be conv_norm) // - // We detect the swap by comparing weight magnitudes: pre_layer_norm - // weights have significantly higher energy than conv_norm weights in - // Gemma 4 conformer layers. If conv_norm has higher energy, the mapping - // is swapped and we fix it by swapping the loaded tensor pointers.
- if (model.proj_type == PROJECTOR_TYPE_GEMMA4A - && hparams.n_layer > 0 - && model.layers[0].conv_norm_w - && model.layers[0].norm_conv_w) { - // Read first N values from each tensor and compute sum-of-squares - const int n_check = std::min((int)model.layers[0].conv_norm_w->ne[0], 64); - std::vector<float> buf_cn(n_check), buf_nc(n_check); - ggml_backend_tensor_get(model.layers[0].conv_norm_w, buf_cn.data(), 0, n_check * sizeof(float)); - ggml_backend_tensor_get(model.layers[0].norm_conv_w, buf_nc.data(), 0, n_check * sizeof(float)); - - float ss_cn = 0.0f, ss_nc = 0.0f; - for (int i = 0; i < n_check; i++) { - ss_cn += buf_cn[i] * buf_cn[i]; - ss_nc += buf_nc[i] * buf_nc[i]; - } - - // In correctly-mapped GGUFs, conv_norm (post-conv) has lower magnitude - // than norm_conv (pre-conv/pre_layer_norm). If conv_norm has higher - // magnitude, the mapping is swapped and we need to fix it. - if (ss_cn > ss_nc * 1.5f) { - LOG_INF("%s: detected swapped conv norm mapping in GGUF, auto-fixing\n", __func__); - for (int il = 0; il < hparams.n_layer; ++il) { - std::swap(model.layers[il].conv_norm_w, model.layers[il].norm_conv_w); - std::swap(model.layers[il].conv_norm_b, model.layers[il].norm_conv_b); - } + // All publicly released Gemma 4 mmproj GGUFs have this issue. Rather + // than changing the Python mapping (which would break gemma3n compat), + // we swap the tensor pointers after loading so they match their + // semantic usage: norm_conv_w = pre-conv norm, conv_norm_w = post-conv norm.
+ if (model.proj_type == PROJECTOR_TYPE_GEMMA4A && hparams.n_layer > 0) { + for (int il = 0; il < hparams.n_layer; ++il) { + std::swap(model.layers[il].conv_norm_w, model.layers[il].norm_conv_w); + std::swap(model.layers[il].conv_norm_b, model.layers[il].norm_conv_b); } } } From 896d3e81a7cac32b170113d0ba1a05d7681c2760 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Thu, 9 Apr 2026 10:00:00 +1200 Subject: [PATCH 10/13] mtmd: fix CUDA/Vulkan conformer encoder by making sigmoid input contiguous The GLU gate in the Gemma 4 conformer creates a non-contiguous view (ggml_view_2d with offset) and passes it to ggml_sigmoid. CUDA and Vulkan backends require contiguous inputs for unary ops, so sigmoid fell back to CPU causing 25 graph splits per encoder forward pass. The repeated GPU<->CPU transfers introduced numerical divergence that caused repetition on longer audio. Fix: wrap the view in ggml_cont() before ggml_sigmoid(). This keeps the entire conformer graph on a single backend with no splits. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- tools/mtmd/models/gemma4a.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/mtmd/models/gemma4a.cpp b/tools/mtmd/models/gemma4a.cpp index 6a5ae67fa9..16b09b568d 100644 --- a/tools/mtmd/models/gemma4a.cpp +++ b/tools/mtmd/models/gemma4a.cpp @@ -213,7 +213,7 @@ ggml_cgraph * clip_graph_gemma4a::build() { { int64_t d = x->ne[0] / 2; ggml_tensor * gate = ggml_sigmoid(ctx0, - ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0])); + ggml_cont(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], d * x->nb[0]))); x = ggml_mul(ctx0, ggml_view_2d(ctx0, x, d, x->ne[1], x->nb[1], 0), gate); x = ggml_cont(ctx0, ggml_transpose(ctx0, x)); From 863230fd45c3a3aa3a94a1a2fe8aaa47f1969559 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Thu, 9 Apr 2026 10:17:23 +1200 Subject: [PATCH 11/13] revert tensor_mapping.py, gemma4-iswa.cpp and llama-model.cpp changes The conv norm mapping fix is handled in C++ (clip.cpp) by swapping tensor pointers after loading. No changes to tensor_mapping.py needed. The BF16-rounded scale, per-layer embedding scaling, and attn_soft_cap changes are moved to dedicated PRs (#21613, #21625). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- gguf-py/gguf/tensor_mapping.py | 8 ++++---- src/llama-model.cpp | 3 ++- src/models/gemma4-iswa.cpp | 5 ++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 375c14b5a8..a2aa139de1 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -2056,22 +2056,22 @@ class TensorNameMap: MODEL_TENSOR.A_ENC_CONV_NORM: ( "conformer.layers.{bid}.conv.batch_norm", # lfm2 - "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n, gemma4 + "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_PW1: ( "conformer.layers.{bid}.conv.pointwise_conv1", # lfm2 - "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n, gemma4 + "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n ), MODEL_TENSOR.A_ENC_CONV_PW2: ( "conformer.layers.{bid}.conv.pointwise_conv2", # lfm2 - "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n, gemma4 + "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n ), MODEL_TENSOR.A_ENC_NORM_CONV: ( "conformer.layers.{bid}.norm_conv", # lfm2 - "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n, gemma4 + "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n ), MODEL_TENSOR.A_PER_DIM_K_SCALE: ( diff --git a/src/llama-model.cpp b/src/llama-model.cpp index 771cb4feab..5636b45439 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -1186,13 +1186,14 @@ void llama_model::load_hparams(llama_model_loader & ml) { uint32_t swa_period = 2; ml.get_key_or_arr(LLM_KV_ATTENTION_SLIDING_WINDOW_PATTERN, swa_period, false); hparams.set_swa_pattern(swa_period); + hparams.attn_soft_cap = true; hparams.rope_freq_base_train_swa = hparams.rope_freq_base_train; hparams.rope_freq_scale_train_swa = hparams.rope_freq_scale_train; ml.get_key(LLM_KV_ROPE_FREQ_BASE_SWA, hparams.rope_freq_base_train_swa, false); ml.get_key(LLM_KV_ATTENTION_SLIDING_WINDOW, hparams.n_swa, false); 
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); - hparams.attn_soft_cap = ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); + ml.get_key(LLM_KV_ATTN_LOGIT_SOFTCAPPING, hparams.f_attn_logit_softcapping, false); ml.get_key(LLM_KV_FINAL_LOGIT_SOFTCAPPING, hparams.f_final_logit_softcapping, false); switch (hparams.n_layer) { diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp index 9090c9109e..b3c6c5be2a 100644 --- a/src/models/gemma4-iswa.cpp +++ b/src/models/gemma4-iswa.cpp @@ -17,8 +17,7 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) - // use BF16-rounded scale to match PyTorch's native BF16 training precision (ref: PR #21451) - inpL = ggml_scale(ctx0, inpL, ubatch.token ? ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf(n_embd))) : 1.0f); + inpL = ggml_scale(ctx0, inpL, ubatch.token ? 
sqrtf(n_embd) : 1.0f); cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions @@ -151,7 +150,7 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll // custom MoE logits calculation (router operates on attn_out, not cur) ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps); - tmp = ggml_scale(ctx0, tmp, 1.0f / ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd)))); + tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd)); tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s); ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); From 4f653fd033eb546f3b1a1f4298cb3563d5a3b843 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Thu, 9 Apr 2026 11:47:12 +1200 Subject: [PATCH 12/13] gemma4: restore BF16-rounded scales and per-layer multimodal scaling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restore BF16-rounded scale wrappers for embedding and MoE logits to match PyTorch's native BF16 training precision. The small difference between sqrtf(1536)=39.19 and BF16-rounded 39.25 compounds through 35 layers, causing audio repetition especially on CUDA. Also add per-layer embedding scale for the multimodal path — PyTorch's ScaledWordEmbedding replaces multimodal IDs with pad_token_id and scales by sqrt(n_embd_per_layer). Without this, the token path is scaled but the multimodal path is not, degrading audio quality. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- src/models/gemma4-iswa.cpp | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp index b3c6c5be2a..9c75fb084d 100644 --- a/src/models/gemma4-iswa.cpp +++ b/src/models/gemma4-iswa.cpp @@ -17,7 +17,8 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll inpL = build_inp_embd(model.tok_embd); // important: do not normalize weights for raw embeddings input (i.e. encoded image emdeddings) - inpL = ggml_scale(ctx0, inpL, ubatch.token ? sqrtf(n_embd) : 1.0f); + // use BF16-rounded scale to match PyTorch's native BF16 training precision + inpL = ggml_scale(ctx0, inpL, ubatch.token ? ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf(n_embd))) : 1.0f); cb(inpL, "inp_scaled", -1); // inp_pos - contains the positions @@ -149,8 +150,9 @@ llm_build_gemma4_iswa::llm_build_gemma4_iswa(const llama_model & model, const ll cb(cur_moe, "ffn_norm_2", il); // custom MoE logits calculation (router operates on attn_out, not cur) + // use BF16-rounded scale to match PyTorch's native BF16 training precision ggml_tensor * tmp = ggml_rms_norm(ctx0, attn_out, hparams.f_norm_rms_eps); - tmp = ggml_scale(ctx0, tmp, 1.0f / sqrtf((float) n_embd)); + tmp = ggml_scale(ctx0, tmp, 1.0f / ggml_bf16_to_fp32(ggml_fp32_to_bf16(sqrtf((float) n_embd)))); tmp = ggml_mul(ctx0, tmp, model.layers[il].ffn_gate_inp_s); ggml_tensor * logits = build_lora_mm(model.layers[il].ffn_gate_inp, tmp); // [n_expert, n_tokens] cb(logits, "ffn_moe_logits", il); @@ -281,9 +283,12 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() { // TODO: verify if this is the correct behavior in transformers implementation const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_per_layer * n_layer - // Extract and dequantize padding token embedding (row 0) + // Extract and dequantize padding token embedding (row 0). 
+ // PyTorch replaces multimodal IDs with pad_token_id before lookup, + // then ScaledWordEmbedding scales by sqrt(n_embd_per_layer). ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0); inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32); + inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer)); // Reshape to [n_embd_per_layer, n_layer, 1] inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1); From 9b5efb8e2f2319a37af94218bc422aecf60b37d3 Mon Sep 17 00:00:00 2001 From: Stephen Cox Date: Thu, 9 Apr 2026 11:54:17 +1200 Subject: [PATCH 13/13] gemma4: remove per-layer scaling (moved to #21625) The multimodal per-layer embedding scaling is handled by PR #21625. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/models/gemma4-iswa.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/models/gemma4-iswa.cpp b/src/models/gemma4-iswa.cpp index 9c75fb084d..26649a658d 100644 --- a/src/models/gemma4-iswa.cpp +++ b/src/models/gemma4-iswa.cpp @@ -283,12 +283,9 @@ ggml_tensor * llm_build_gemma4_iswa::build_inp_per_layer() { // TODO: verify if this is the correct behavior in transformers implementation const int64_t embd_size = model.per_layer_tok_embd->ne[0]; // n_embd_per_layer * n_layer - // Extract and dequantize padding token embedding (row 0). - // PyTorch replaces multimodal IDs with pad_token_id before lookup, - // then ScaledWordEmbedding scales by sqrt(n_embd_per_layer). + // Extract and dequantize padding token embedding (row 0) ggml_tensor * padding = ggml_view_1d(ctx0, model.per_layer_tok_embd, embd_size, 0); inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32); - inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_per_layer)); // Reshape to [n_embd_per_layer, n_layer, 1] inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_per_layer, n_layer, 1);