diff --git a/server/CMakeLists.txt b/server/CMakeLists.txt index 8511c4e6d..23e6f2022 100644 --- a/server/CMakeLists.txt +++ b/server/CMakeLists.txt @@ -583,6 +583,11 @@ if(DFLASH27B_TESTS) target_link_libraries(test_adaptive_keep_ratio PRIVATE dflash_common) add_test(NAME adaptive_keep COMMAND test_adaptive_keep_ratio) endif() + if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_derived_scalars.cpp") + add_executable(test_derived_scalars test/test_derived_scalars.cpp) + target_include_directories(test_derived_scalars PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) + add_test(NAME derived_scalars COMMAND test_derived_scalars) + endif() if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/test/test_bandit_integration.cpp") add_executable(test_bandit_integration test/test_bandit_integration.cpp) target_include_directories(test_bandit_integration PRIVATE ${DFLASH27B_SRC_INCLUDE_DIRS}) diff --git a/server/src/common/derived_scalars.h b/server/src/common/derived_scalars.h new file mode 100644 index 000000000..da4668af8 --- /dev/null +++ b/server/src/common/derived_scalars.h @@ -0,0 +1,67 @@ +// Pure helper: verify that tensor-shape-derived scalars match GGUF-declared +// metadata. No IO; safe to call from any loader after weights are loaded. +// +// Returns true when derived == declared for all three dimensions. +// On mismatch fills `err` with a diagnostic and returns false. +// +// Callers must compute the *expected* values from their declared scalars: +// draft loader : expected_q_dim = n_head * head_dim +// expected_kv_dim = n_head_kv * head_dim +// qwen35 target : expected_q_dim = n_head * n_embd_head_k * 2 (Q+gate packed) +// expected_kv_dim = n_head_kv * n_embd_head_k +// Both loaders: expected_n_embd = n_embd (wq->ne[0] = input projection dim). +// +// Equivalent pattern for gemma4 lives inline in gemma4_backend.cpp (~line 1072) +// as a silent override rather than an assertion; kept separate intentionally. 
#pragma once

#include <cstdint>
#include <cstdio>
#include <string>

namespace dflash::common {

namespace detail {

// Builds the diagnostic
//   "GGUF shape mismatch: <layer_tag> <derived_name>=<derived> != <expected_name>=<expected>"
// A null layer_tag is rendered as "(unknown)" instead of being handed to a
// printf-style "%s" (which would be undefined behaviour). The message is
// assembled in a std::string, so a long tag is never truncated.
inline std::string format_shape_mismatch(const char * layer_tag,
                                         const char * derived_name,
                                         int64_t      derived,
                                         const char * expected_name,
                                         int64_t      expected)
{
    std::string msg = "GGUF shape mismatch: ";
    msg += (layer_tag != nullptr) ? layer_tag : "(unknown)";
    msg += ' ';
    msg += derived_name;
    msg += '=';
    msg += std::to_string(derived);
    msg += " != ";
    msg += expected_name;
    msg += '=';
    msg += std::to_string(expected);
    return msg;
}

} // namespace detail

// verify_derived_scalars
//
// Compares three tensor-shape-derived values against the values the caller
// expects, in this order: wq_ne1, wk_ne1, wq_ne0. The first mismatch wins.
//
//   wq_ne1         : weight_q->ne[1] (output dim of Q projection)
//   wk_ne1         : weight_k->ne[1] (output dim of K projection)
//   wq_ne0         : weight_q->ne[0] (input dim of Q projection == n_embd)
//   expected_q_dim : n_head * head_dim [* 2 for packed Q+gate]
//   expected_kv_dim: n_head_kv * head_dim
//   expected_n_embd: n_embd
//   layer_tag      : short string for the error message (e.g. "blk.0" or
//                    "blk.3"); may be null, in which case "(unknown)" is used
//   err            : overwritten with a diagnostic on the first mismatch;
//                    left untouched when every comparison succeeds
//
// Returns true when all three derived values equal their expected values,
// false otherwise.
inline bool verify_derived_scalars(
        int64_t wq_ne1, int64_t wk_ne1, int64_t wq_ne0,
        int64_t expected_q_dim, int64_t expected_kv_dim, int64_t expected_n_embd,
        const char * layer_tag,
        std::string & err)
{
    struct Check {
        const char * derived_name;   // how the derived value is named in the message
        int64_t      derived;
        const char * expected_name;  // how the expected value is named in the message
        int64_t      expected;
    };

    // Table order is the reporting order: Q output dim, K output dim, Q input dim.
    const Check checks[] = {
        {"attn_q.weight->ne[1]", wq_ne1, "expected_q_dim",  expected_q_dim},
        {"attn_k.weight->ne[1]", wk_ne1, "expected_kv_dim", expected_kv_dim},
        {"attn_q.weight->ne[0]", wq_ne0, "n_embd",          expected_n_embd},
    };

    for (const Check & c : checks) {
        if (c.derived != c.expected) {
            err = detail::format_shape_mismatch(
                layer_tag, c.derived_name, c.derived, c.expected_name, c.expected);
            return false;
        }
    }
    return true;
}

} // namespace dflash::common
gguf_free(gctx); + // Structural defense: derive head_dim / n_head / n_head_kv from weight + // tensor shapes and assert against GGUF-declared metadata. + // All draft layers have wq/wk (no deltanet mix), so layer 0 suffices. + // wq: [n_embd, n_head*head_dim], ne[1]=n_head*head_dim, ne[0]=n_embd. + // wk: [n_embd, n_head_kv*head_dim], ne[1]=n_head_kv*head_dim. + { + const DraftLayer & L0 = out.layers[0]; + const int64_t exp_q_dim = (int64_t)out.n_head * out.head_dim; + const int64_t exp_kv_dim = (int64_t)out.n_head_kv * out.head_dim; + const int64_t exp_n_embd = (int64_t)out.n_embd; + std::string err; + if (!dflash::common::verify_derived_scalars( + L0.wq->ne[1], L0.wk->ne[1], L0.wq->ne[0], + exp_q_dim, exp_kv_dim, exp_n_embd, + "blk.0", err)) { + set_last_error(err); + return false; + } + // fc: [n_target_layers*n_embd, n_embd] — ne[0] = n_target_layers*n_embd. + if (out.n_target_layers > 0) { + const int64_t derived_fc_in = out.fc->ne[0]; + const int64_t expected_fc_in = (int64_t)out.n_target_layers * out.n_embd; + if (derived_fc_in != expected_fc_in) { + char buf[256]; + std::snprintf(buf, sizeof(buf), + "GGUF shape mismatch: dflash.fc.weight->ne[0]=%lld " + "!= n_target_layers*n_embd=%d*%d=%lld", + (long long)derived_fc_in, + out.n_target_layers, out.n_embd, (long long)expected_fc_in); + set_last_error(buf); + return false; + } + } + } + char summary[192]; std::snprintf(summary, sizeof(summary), "draft GGUF loaded: %" PRId64 " tensors, %.2f GiB on GPU", diff --git a/server/src/qwen35/gguf_target_loader.cpp b/server/src/qwen35/gguf_target_loader.cpp index 116ddafc0..8b9a27aac 100644 --- a/server/src/qwen35/gguf_target_loader.cpp +++ b/server/src/qwen35/gguf_target_loader.cpp @@ -44,6 +44,7 @@ // tensor's bytes from the mmap'd file. 
#include "internal.h" +#include "common/derived_scalars.h" #include "common/layer_split_utils.h" #include @@ -738,6 +739,31 @@ bool load_target_gguf_partial(const std::string & path, gguf_free(gctx); + // Structural defense: derive head_dim / n_head / n_head_kv from weight + // tensor shapes and assert against GGUF-declared metadata. + // Uses the first full-attention layer; deltanet layers don't carry wq/wk. + // wq packs Q+gate: ne[1] = n_head * n_embd_head_k * 2. + // wk: ne[1] = n_head_kv * n_embd_head_k. wq: ne[0] = n_embd. + { + const int fa_il = out.full_attention_interval - 1; + const TargetLayer & fa = out.layers[(size_t)fa_il]; + if (fa.wq && fa.wk) { + const int64_t exp_q_dim = (int64_t)out.n_head * out.n_embd_head_k * 2; + const int64_t exp_kv_dim = (int64_t)out.n_head_kv * out.n_embd_head_k; + const int64_t exp_n_embd = (int64_t)out.n_embd; + char tag[16]; + std::snprintf(tag, sizeof(tag), "blk.%d", fa_il); + std::string err; + if (!dflash::common::verify_derived_scalars( + fa.wq->ne[1], fa.wk->ne[1], fa.wq->ne[0], + exp_q_dim, exp_kv_dim, exp_n_embd, + tag, err)) { + set_last_error(err); + return false; + } + } + } + if (tok_embd_off == 0 || tok_embd_type == GGML_TYPE_COUNT) { set_last_error("token_embd.weight not found or invalid type"); return false; diff --git a/server/test/test_derived_scalars.cpp b/server/test/test_derived_scalars.cpp new file mode 100644 index 000000000..c0e4ae07a --- /dev/null +++ b/server/test/test_derived_scalars.cpp @@ -0,0 +1,132 @@ +// Unit tests for dflash::common::verify_derived_scalars — no GPU, no model files. 
+// +// Build: cmake --build build --target test_derived_scalars -j +// Run: cd build && ctest -R derived_scalars --output-on-failure + +#include "common/derived_scalars.h" + +#include +#include + +using namespace dflash::common; + +// ─── Minimal test framework ──────────────────────────────────────────────────── + +static int test_failures = 0; +static int test_count = 0; + +#define TEST_ASSERT(expr) do { \ + test_count++; \ + if (!(expr)) { \ + test_failures++; \ + std::fprintf(stderr, " FAIL: %s:%d: %s\n", __FILE__, __LINE__, #expr); \ + } \ +} while (0) + +#define RUN_TEST(fn) do { \ + std::fprintf(stderr, " %s ...", #fn); \ + int before = test_failures; \ + fn(); \ + if (test_failures == before) std::fprintf(stderr, " ok\n"); \ + else std::fprintf(stderr, "\n"); \ +} while (0) + +// ─── Tests ───────────────────────────────────────────────────────────────────── + +// All three dims match: returns true, err untouched. +static void match_returns_true() { + std::string err; + bool ok = verify_derived_scalars( + /*wq_ne1*/ 4096, /*wk_ne1*/ 512, /*wq_ne0*/ 5120, + /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120, + "blk.0", err); + TEST_ASSERT(ok); + TEST_ASSERT(err.empty()); +} + +// wq_ne1 != expected_q_dim: returns false and err non-empty. +static void mismatch_q_dim_returns_false() { + std::string err; + bool ok = verify_derived_scalars( + /*wq_ne1*/ 4096 + 1, /*wk_ne1*/ 512, /*wq_ne0*/ 5120, + /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120, + "blk.0", err); + TEST_ASSERT(!ok); + TEST_ASSERT(!err.empty()); +} + +// wk_ne1 != expected_kv_dim: returns false. +static void mismatch_kv_dim_returns_false() { + std::string err; + bool ok = verify_derived_scalars( + /*wq_ne1*/ 4096, /*wk_ne1*/ 512 + 1, /*wq_ne0*/ 5120, + /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120, + "blk.0", err); + TEST_ASSERT(!ok); + TEST_ASSERT(!err.empty()); +} + +// wq_ne0 != expected_n_embd: returns false. 
+static void mismatch_n_embd_returns_false() { + std::string err; + bool ok = verify_derived_scalars( + /*wq_ne1*/ 4096, /*wk_ne1*/ 512, /*wq_ne0*/ 5120 + 64, + /*exp_q_dim*/ 4096, /*exp_kv_dim*/ 512, /*exp_n_embd*/ 5120, + "blk.0", err); + TEST_ASSERT(!ok); + TEST_ASSERT(!err.empty()); +} + +// Typical draft model dims: n_head=32, head_dim=128, n_head_kv=8, n_embd=5120. +// expected_q_dim=32*128=4096, expected_kv_dim=8*128=1024. +static void draft_dims_match() { + std::string err; + bool ok = verify_derived_scalars( + 4096, 1024, 5120, + (int64_t)32 * 128, (int64_t)8 * 128, 5120, + "blk.0", err); + TEST_ASSERT(ok); + TEST_ASSERT(err.empty()); +} + +// Typical qwen35 target layer: n_head=24, n_embd_head_k=256, n_head_kv=4. +// expected_q_dim = 24*256*2 = 12288 (Q+gate packed). +// expected_kv_dim = 4*256 = 1024. +static void qwen35_target_dims_match() { + std::string err; + bool ok = verify_derived_scalars( + /*wq_ne1*/ 12288, /*wk_ne1*/ 1024, /*wq_ne0*/ 5120, + /*exp_q_dim*/ (int64_t)24 * 256 * 2, + /*exp_kv_dim*/ (int64_t)4 * 256, + /*exp_n_embd*/ 5120, + "blk.3", err); + TEST_ASSERT(ok); + TEST_ASSERT(err.empty()); +} + +// Error message must contain the layer tag for easy diagnosis. +static void err_contains_layer_tag() { + std::string err; + verify_derived_scalars( + 4097, 1024, 5120, + 4096, 1024, 5120, + "blk.15", err); + TEST_ASSERT(err.find("blk.15") != std::string::npos); +} + +// ─── main ────────────────────────────────────────────────────────────────────── + +int main() { + std::fprintf(stderr, "=== test_derived_scalars ===\n"); + + RUN_TEST(match_returns_true); + RUN_TEST(mismatch_q_dim_returns_false); + RUN_TEST(mismatch_kv_dim_returns_false); + RUN_TEST(mismatch_n_embd_returns_false); + RUN_TEST(draft_dims_match); + RUN_TEST(qwen35_target_dims_match); + RUN_TEST(err_contains_layer_tag); + + std::fprintf(stderr, "\n%d tests, %d failures\n", test_count, test_failures); + return (test_failures == 0) ? 0 : 1; +}