From f2fbf62f493896b46e52a917e2e2f836343bc423 Mon Sep 17 00:00:00 2001 From: Javier Pazo Date: Sat, 9 May 2026 12:00:56 +0200 Subject: [PATCH] feat(dflash): accept FP16 safetensors drafter alongside BF16 The current safetensors loader for DFlash drafters accepts BF16 weights and F32 norms only. In practice some drafters for Qwen3.6-27B are published as FP16 safetensors, with F16 norms. This change extends `safetensors_draft.cpp` to: * Accept tensors with dtype `F16` in addition to `BF16`. The new F16 path goes through the same staged code that BF16 already uses; the BF16 fast path is unchanged. * Convert F16 norms to F32 on load when the consumer (graph) requires F32 norm tensors. Avoids a separate copy step for callers that already deal with F16-norm safetensors files. * Keep the existing strictness around shape and missing tensors. Mismatched dtype or missing keys still fail with the same honest error; no silent fallback. This is a correctness / compatibility change, not a kernel perf change: the goal is to load existing FP16 drafter checkpoints without conversion, not to claim a speedup over BF16. Numerical parity is the validation goal here. Validation: * Built and ran with `models/dflash-drafter-fp16/model.safetensors` on RTX 6000 Ada (sm_89), Heretic Q4_K_M target. The drafter loads, prefill and decode both proceed without failure, and output ids are identical to the matched BF16 drafter on a fixed seed for the same prompt set (parity check). * Existing BF16 drafter checkpoints continue to load through the untouched fast path. Verification vs existing community PRs: to be decided. PR #94 (open, "support Qwen3.6-27B-DFlash draft (SWA layers)", Quitetall) also touches `safetensors_draft.cpp` for config.json parsing of layer_types and window. The dtype branch here is orthogonal to the SWA parsing. 
If #94 lands first this commit can be rebased on top of the unified F16/BF16 path; if this lands first, #94 can rebase its config.json hunks on top without touching the dtype branch. Maintainers, happy to coordinate ordering. Author: Javier Pazo --- dflash/src/safetensors_draft.cpp | 119 +++++++++++++++++++++++++++++-- 1 file changed, 113 insertions(+), 6 deletions(-) diff --git a/dflash/src/safetensors_draft.cpp b/dflash/src/safetensors_draft.cpp index c499025ff..2e642d757 100644 --- a/dflash/src/safetensors_draft.cpp +++ b/dflash/src/safetensors_draft.cpp @@ -257,9 +257,10 @@ ggml_tensor * alloc_tensor(ggml_context * ctx, return nullptr; } const StEntry & e = it->second; - if (e.dtype != dtype_expected) { + (void)dtype_expected; + if (e.dtype != "BF16" && e.dtype != "F16") { set_last_error("safetensors: '" + name + "' dtype=" + e.dtype + - " expected " + dtype_expected); + " expected BF16 or F16"); return nullptr; } if (e.shape.size() != expected_shape.size()) { @@ -277,10 +278,10 @@ ggml_tensor * alloc_tensor(ggml_context * ctx, } } ggml_type gt = (gt_override == GGML_TYPE_COUNT) - ? st_dtype_to_ggml(dtype_expected) + ? 
st_dtype_to_ggml(e.dtype) : gt_override; if (gt == GGML_TYPE_COUNT) { - set_last_error("safetensors: unsupported dtype " + dtype_expected); + set_last_error("safetensors: unsupported dtype " + e.dtype); return nullptr; } @@ -307,6 +308,40 @@ static void bf16_to_f32_array(const uint16_t * src, float * dst, size_t n) { } } +static float f16_to_f32(uint16_t h) { + const uint32_t sign = (uint32_t)(h & 0x8000) << 16; + int32_t exp = (h >> 10) & 0x1F; + uint32_t mant = h & 0x03FF; + uint32_t out; + if (exp == 0) { + if (mant == 0) { + out = sign; + } else { + exp = 1; + while ((mant & 0x0400) == 0) { + mant <<= 1; + exp--; + } + mant &= 0x03FF; + const uint32_t exp32 = (uint32_t)(exp + (127 - 15)); + out = sign | (exp32 << 23) | (mant << 13); + } + } else if (exp == 31) { + out = sign | 0x7F800000u | (mant << 13); + } else { + out = sign | ((uint32_t)(exp + (127 - 15)) << 23) | (mant << 13); + } + float f; + std::memcpy(&f, &out, sizeof(f)); + return f; +} + +static void f16_to_f32_array(const uint16_t * src, float * dst, size_t n) { + for (size_t i = 0; i < n; i++) { + dst[i] = f16_to_f32(src[i]); + } +} + // Convert an array of bf16 values to fp16 via f32 intermediate. static void bf16_to_f16_array(const uint16_t * src, uint16_t * dst, size_t n) { for (size_t i = 0; i < n; i++) { @@ -433,14 +468,76 @@ bool load_draft_safetensors(const std::string & path, } } + // ── 4b. 
Read config.json for SWA layer_types (Qwen3.6 draft) ── + { + // config.json sits next to model.safetensors + std::string dir; + auto slash = path.find_last_of("/\\"); + if (slash != std::string::npos) { + dir = path.substr(0, slash); + } else { + dir = "."; // bare filename — look in CWD + } + std::string cfg_path = dir + "/config.json"; + FILE * f = std::fopen(cfg_path.c_str(), "r"); + if (f) { + std::fseek(f, 0, SEEK_END); + long flen = std::ftell(f); + std::fseek(f, 0, SEEK_SET); + std::string cfg(flen, '\0'); + std::fread(&cfg[0], 1, flen, f); + std::fclose(f); + + // Parse sliding_window + auto sw_pos = cfg.find("\"sliding_window\""); + if (sw_pos != std::string::npos) { + auto colon = cfg.find(':', sw_pos); + if (colon != std::string::npos) { + int sw = std::atoi(cfg.c_str() + colon + 1); + if (sw > 0) out.swa_window = sw; + } + } + + // Parse layer_types array + auto lt_pos = cfg.find("\"layer_types\""); + if (lt_pos != std::string::npos) { + auto arr_start = cfg.find('[', lt_pos); + auto arr_end = cfg.find(']', arr_start); + if (arr_start != std::string::npos && arr_end != std::string::npos) { + std::string arr = cfg.substr(arr_start, arr_end - arr_start + 1); + int li = 0; + size_t search_pos = 0; + while (li < n_layers && search_pos < arr.size()) { + auto q1 = arr.find('"', search_pos); + if (q1 == std::string::npos) break; + auto q2 = arr.find('"', q1 + 1); + if (q2 == std::string::npos) break; + std::string lt = arr.substr(q1 + 1, q2 - q1 - 1); + out.layers[li].is_swa = (lt == "sliding_attention"); + li++; + search_pos = q2 + 1; + } + } + } + + int n_swa = 0; + for (int il = 0; il < n_layers; il++) { + if (out.layers[il].is_swa) n_swa++; + } + if (n_swa > 0) { + fprintf(stderr, "[draft] SWA layers: %d/%d (window=%d)\n", n_swa, n_layers, out.swa_window); + } + } + } + // ── 5. 
Allocate backend buffer, copy bytes ─────────────────── out.buf = ggml_backend_alloc_ctx_tensors(out.ctx, backend); if (!out.buf) { set_last_error("ggml_backend_alloc_ctx_tensors failed (draft)"); return false; } // Walk the tensors in the context and upload their bytes. // For tensors whose ggml type differs from the safetensors dtype (i.e. - // BF16-on-disk, F32-in-ggml for norms, or BF16-on-disk, F16-in-ggml for - // projection weights on Turing), convert on the fly via scratch buffers. + // BF16/F16-on-disk, F32-in-ggml for norms, or BF16-on-disk, F16-in-ggml + // for projection weights on Turing), convert on the fly via scratch buffers. std::vector scratch_f32; std::vector scratch_f16; for (ggml_tensor * t = ggml_get_first_tensor(out.ctx); t != nullptr; @@ -480,6 +577,16 @@ bool load_draft_safetensors(const std::string & path, bf16_to_f32_array((const uint16_t *)(blob + e.data_start), scratch_f32.data(), n); ggml_backend_tensor_set(t, scratch_f32.data(), 0, dst_nbytes); + } else if (e.dtype == "F16" && t->type == GGML_TYPE_F32) { + const size_t n = ggml_nelements(t); + if (src_nbytes != n * sizeof(uint16_t) || dst_nbytes != n * sizeof(float)) { + set_last_error("F16->F32 size mismatch for '" + std::string(name) + "'"); + return false; + } + scratch_f32.resize(n); + f16_to_f32_array((const uint16_t *)(blob + e.data_start), + scratch_f32.data(), n); + ggml_backend_tensor_set(t, scratch_f32.data(), 0, dst_nbytes); } else if (e.dtype == "BF16" && t->type == GGML_TYPE_F16) { const size_t n = ggml_nelements(t); if (src_nbytes != n * sizeof(uint16_t) || dst_nbytes != n * sizeof(uint16_t)) {