From fbdbb78ba291a90dc43c0582087dfae3371e4828 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaolin" <xiaolin.zhang@intel.com>
Date: Mon, 1 Jun 2026 14:48:21 +0800
Subject: [PATCH 1/4] GPU plugin: gate MOE3GemmFusedCompressed fusion on oneDNN
 availability

The MOE3GemmFusedCompressed kernel dispatches expert GEMMs through oneDNN,
which requires an in-order OCL queue.  If oneDNN is disabled (e.g. via
OV_GPU_USE_ONEDNN=0 on an IMMAD GPU), the queue stays out-of-order and
the oneDNN stream creation asserts at ocl_stream.cpp:240.

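Extend the condition that gates the MoE fusion block from
'device_info.supports_immad' to
'device_info.supports_immad && config.get_use_onednn()', so the
FuseMOE3GemmCompressed transformation pass (together with the other MoE
passes registered in that block) is not run and the op is never
introduced into the graph when oneDNN is unavailable.  The
'supports_immad' check already prevents this on non-IMMAD GPUs; the
added check handles the case where oneDNN is explicitly disabled by the
user on IMMAD hardware.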

Signed-off-by: Zhang, Xiaolin <xiaolin.zhang@intel.com>
---
 .../intel_gpu/src/plugin/transformations_pipeline.cpp    | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index b6e281a2836831..111f8978df185a 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -567,8 +567,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         const bool disable_moe_opt = GPU_DEBUG_VALUE_OR(config.get_disable_moe_opt(), false);
 
         // MOE: TiledMoeBlock -> GatherMatmuls(compressed) -> MoeOp(compressed) -> MoeOpWithRouting(compressed).
-        // Gated on supports_immad: GatherMatmul backend is systolic-only.
-        if (device_info.supports_immad) {
+        // Gated on supports_immad (systolic-only) and oneDNN (required for expert GEMM dispatch).
+        // Note: supports_immad alone is not sufficient, because oneDNN can still be explicitly disabled by the user on IMMAD hardware.
+        if (device_info.supports_immad && config.get_use_onednn()) {
             manager.register_pass<ov::pass::ConvertTiledMoeBlockToGatherMatmuls>();
 
             // f32 listed because this pass runs before ConvertPrecision (line ~588);
@@ -583,6 +584,10 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                 const bool has_batch_dim = !is_pa;
                 manager.register_pass<ov::pass::MoeOpFusion>(has_batch_dim);
                 manager.register_pass<ov::intel_gpu::FuseMOESharedExpert>();
+                // MOE3GemmFusedCompressed kernel dispatches expert GEMMs through
+                // oneDNN, which requires an in-order OCL queue.  If oneDNN is
+                // disabled (e.g. via OV_GPU_USE_ONEDNN=0 on an IMMAD GPU), the queue stays
+                // out-of-order and oneDNN stream creation may assert; hence the use_onednn gate on the enclosing block.
                 manager.register_pass<ov::intel_gpu::FuseMOE3GemmCompressed>();
             }
         }

From 0d8e0c7a2d5213fc1cbd8e225e3246cbbd29e685 Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaolin" <xiaolin.zhang@intel.com>
Date: Thu, 28 May 2026 16:10:50 +0800
Subject: [PATCH 2/4] GPU: fix std::bad_function_call crash on compiled model
 cache load

When a GPU-compiled model is loaded from the blob cache and the first
inference uses a different input shape, the SHAPE_CHANGED flag triggers
update_dispatch_data_func() for every kernel stage.  Two issues caused
std::bad_function_call:

1. kv_cache: the load() deserializer restored dispatch-data functions
   for scatter_update, concat, beam_table, dq, and scale_concat stages
   but omitted the zp_concat stage.  On compressed KV-cache models with
   zero-point inputs (e.g. kv_int8), the first shape-changed inference
   called the null std::function and crashed.

   Fix: restore zp_concat's update_dispatch_data_func in load() using
   the same kernel-selector lookup pattern as scale_concat.

2. DispatchDataFunc::operator() (ocl_v2 framework) called the inner
   std::function without checking for null, even though the class
   explicitly supports construction from nullptr and KernelData
   defaults to nullptr.

   Fix: add a null guard so stages whose codegen legitimately returns
   no dispatch-data updater silently skip the call instead of crashing.

Reproducible with Qwen3-Omni-30B-A3B-Instruct on GPU with --cache-model:
compile on one prompt length, then infer with a different prompt length.

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 .../src/graph/common_utils/kernel_generator_base.hpp        | 4 +++-
 src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp      | 6 ++++++
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp b/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp
index 9322bec03e3b96..6b0a6b41b074de 100644
--- a/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp
@@ -40,7 +40,9 @@ struct DispatchDataFunc {
     explicit DispatchDataFunc(std::nullptr_t) {}
 
     void operator()(const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params = nullptr) const {
-        m_dispatch_data_func(params, kd, rt_params);
+        if (m_dispatch_data_func) {  // empty when built from nullptr: skip instead of throwing std::bad_function_call
+            m_dispatch_data_func(params, kd, rt_params);
+        }
     }
 };
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
index 0f411c4b96533e..30d4c1961b198d 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -178,6 +178,12 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
                 auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[*scale_concat_stage].kernelName);
                 scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*scale_concat_stage]);
             }
+
+            if (const auto zp_concat_stage = stages.try_get_index(kv_stage::zp_concat)) {
+                auto& zp_concat_kernel_selector = kernel_selector_t::Instance();
+                auto zp_concat_kernel_impl = zp_concat_kernel_selector.GetImplementation(_kernels_data[*zp_concat_stage].kernelName);
+                zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*zp_concat_stage]);
+            }
         }
     }
     void set_arguments_impl(kv_cache_inst& instance) override {}

From 17060d85fb83f174dac58e84a4167be4900adfeb Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaolin" <xiaolin.zhang@intel.com>
Date: Thu, 28 May 2026 16:11:07 +0800
Subject: [PATCH 3/4] GPU: serialize MoE prefill execution flags in compiled
 model cache

moe_3gemm_swiglu_opt_impl selects between three prefill execution paths
(micro_gemm, grouped_gemm, per-expert onednn loop) based on three
boolean flags set in the constructor from environment variables and
hardware capabilities.  These flags also determine the number of
internal buffers (9 vs 15) via get_internal_buffer_descs().

The existing load() override did not serialize these flags.  After
loading from the compiled model cache the flags reverted to their
default-initialized values (all false), which could select a different
execution path and allocate a mismatched number of internal buffers
compared to the cached kernel stages.

Add a save() override that writes use_micro_gemm_prefill,
use_gpu_mask_gen_prefill, and use_grouped_gemm_prefill after the
base-class data.  Update load() to read them back, restoring the
exact execution configuration that was active when the model was
originally compiled.

Signed-off-by: Xiaolin Zhang <xiaolin.zhang@intel.com>
---
 .../graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp  | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
index c78b209885722f..86ddaf15628104 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
@@ -1031,8 +1031,20 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         _shared_down_proj->forward(stream, batch, gate_mem_dnnl, output_dnnl, scalar_gate_dnnl);
     }
 
+    void save(BinaryOutputBuffer& ob) const override {
+        PrimitiveImplOCL::save(ob);  // base-class data first; load() consumes it first as well
+        ob << use_micro_gemm_prefill;  // the three flags are written in the exact order load() reads them
+        ob << use_gpu_mask_gen_prefill;
+        ob << use_grouped_gemm_prefill;
+    }
+
     void load(BinaryInputBuffer& ib) override {
         PrimitiveImplOCL::load(ib);
+        // Read execution-path flags before init() so any future init() logic
+        // that depends on them sees the deserialized (not default) values.
+        ib >> use_micro_gemm_prefill;  // read order must mirror the write order in save()
+        ib >> use_gpu_mask_gen_prefill;
+        ib >> use_grouped_gemm_prefill;
         const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
         init(impl_params->typed_desc<moe_3gemm_fused_compressed>());
     }

From b195f838f55bab5946cd94877831beff405bc11c Mon Sep 17 00:00:00 2001
From: "Zhang, Xiaolin" <xiaolin.zhang@intel.com>
Date: Mon, 1 Jun 2026 09:16:19 +0800
Subject: [PATCH 4/4] Add unit tests for GPU plugin cache serialization and
 stability fixes

1. dispatch_data_func_test.cpp (new):
   - Tests the null-guard added to DispatchDataFunc::operator()
   - Verifies that calling a DispatchDataFunc constructed from nullptr
     does not throw std::bad_function_call
   - Verifies that a valid dispatch function is correctly invoked
   - Verifies that default-constructed KernelData's update_dispatch_data_func
     is safe to call (null-safe)

2. cache_serialization_test.cpp (new):
   - Tests a save/load round-trip of a test-local mirror of the KV-cache
     stage list that includes zp_concat (the stage whose dispatch data
     func restoration was fixed)
   - Tests KV-cache stages without zp_concat (concat, beam_table, dq,
     scale_concat)
   - Tests KV-cache scatter-only path serialization
   - Tests MoE prefill flags (use_micro_gemm_prefill, use_gpu_mask_gen_prefill,
     use_grouped_gemm_prefill) round-trip for:
     * micro_gemm path
     * grouped_gemm path
     * fallback path (all false)
     * all-true path

Signed-off-by: Zhang, Xiaolin <xiaolin.zhang@intel.com>
---
 .../module_tests/dispatch_data_func_test.cpp  |  39 +++
 .../test_cases/cache_serialization_test.cpp   | 304 ++++++++++++++++++
 2 files changed, 343 insertions(+)
 create mode 100644 src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp
 create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp

diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp
new file mode 100644
index 00000000000000..6770e6263f2197
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Regression tests for the null-guard added to DispatchDataFunc::operator().
+// Before the fix, calling a DispatchDataFunc constructed from nullptr would
+// invoke a null std::function and throw std::bad_function_call.
+
+#include "test_utils.h"
+#include "common_utils/kernel_generator_base.hpp"
+
+using namespace ov::intel_gpu;
+
+TEST(dispatch_data_func, null_func_does_not_crash) {
+    DispatchDataFunc func{nullptr};  // goes through the explicit std::nullptr_t constructor
+    KernelData kd;
+    RuntimeParams params;
+
+    ASSERT_NO_THROW(func(params, kd, nullptr));  // expected to be a no-op rather than std::bad_function_call
+}
+
+TEST(dispatch_data_func, valid_func_is_called) {
+    bool called = false;  // flipped by the lambda to prove the wrapped callable actually ran
+    DispatchDataFunc func{[&called](const RuntimeParams&, KernelData&, ImplRuntimeParams*) {
+        called = true;
+    }};
+    KernelData kd;
+    RuntimeParams params;
+
+    func(params, kd, nullptr);  // rt_params is optional; nullptr is also the operator() default
+    ASSERT_TRUE(called);
+}
+
+TEST(dispatch_data_func, default_constructed_is_null_safe) {
+    // Default-constructed KernelData has update_dispatch_data_func{nullptr}.
+    KernelData kd;
+    RuntimeParams params;
+
+    ASSERT_NO_THROW(kd.update_dispatch_data_func(params, kd, nullptr));  // kd owns the functor and is also its KernelData argument
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp
new file mode 100644
index 00000000000000..07495414c25baf
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp
@@ -0,0 +1,304 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Regression tests for cache serialization round-trip of:
+//   1) kv_cache stages including the zp_concat stage (commit: "GPU: fix std::bad_function_call")
+//   2) MoE prefill execution flags (commit: "GPU: serialize MoE prefill execution flags")
+//
+// These tests replicate the serialization logic of internal structures
+// (stages_helper, moe prefill flags) without requiring a full model compilation,
+// verifying that the save/load contract is correct.
+
+#include "test_utils.h"
+#include "intel_gpu/graph/serialization/binary_buffer.hpp"
+#include "intel_gpu/graph/serialization/utils.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+using namespace cldnn;
+using namespace ::tests;
+
+// --------------------------------------------------------------------------
+// kv_cache stages_helper round-trip tests
+// --------------------------------------------------------------------------
+// Mirrors the kv_stage enum and stages_helper from kv_cache.cpp
+namespace {
+enum class kv_stage_test : uint8_t { scatter_update, concat, beam_table, dq, scale_concat, zp_concat };
+
+struct stages_helper_test {
+    std::vector<kv_stage_test> stages;  // insertion order defines the index reported by try_get_index()
+
+    void save(BinaryOutputBuffer& ob) const {
+        ob << stages.size();  // element count first, then one uint8_t per stage
+        for (const auto& stage : stages) {
+            ob << static_cast<uint8_t>(stage);
+        }
+    }
+
+    void load(BinaryInputBuffer& ib) {
+        size_t stages_size = 0;
+        ib >> stages_size;  // count is read back before the per-stage bytes, mirroring save()
+        stages.resize(stages_size);
+        for (auto& stage : stages) {
+            uint8_t stage_ = 0;
+            ib >> stage_;
+            stage = static_cast<kv_stage_test>(stage_);
+        }
+    }
+
+    std::optional<size_t> try_get_index(kv_stage_test stage) const noexcept {  // position of `stage`, or empty if absent
+        if (const auto it = std::find(stages.begin(), stages.end(), stage); it != stages.end()) {
+            return static_cast<size_t>(std::distance(stages.begin(), it));
+        }
+        return {};
+    }
+};
+}  // namespace
+
+// Test: KV-cache stages including zp_concat survive a binary round-trip.
+// This covers the scenario where a compressed KV-cache with zero-point inputs
+// has stages {concat, beam_table, dq, scale_concat, zp_concat}.
+// Before the fix, load() did not restore the zp_concat dispatch function (std::bad_function_call
+// on shape change); this test only checks the mirrored stage list, not that dispatch function.
+TEST(cache_serialization, kv_cache_stages_with_zp_concat_round_trip) {
+    auto& engine = get_test_engine();
+
+    // Simulate a compressed KV-cache with indirect + zero-points
+    stages_helper_test original;
+    original.stages.push_back(kv_stage_test::concat);
+    original.stages.push_back(kv_stage_test::beam_table);
+    original.stages.push_back(kv_stage_test::dq);
+    original.stages.push_back(kv_stage_test::scale_concat);
+    original.stages.push_back(kv_stage_test::zp_concat);
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    stages_helper_test loaded;
+    {  // reader scope: a fresh stream over the same mem_buf
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.stages.size(), original.stages.size());
+    for (size_t i = 0; i < original.stages.size(); ++i) {
+        ASSERT_EQ(static_cast<uint8_t>(loaded.stages[i]),
+                  static_cast<uint8_t>(original.stages[i]))
+            << "Stage mismatch at index " << i;
+    }
+
+    // Verify zp_concat stage is present and at the correct index
+    auto zp_idx = loaded.try_get_index(kv_stage_test::zp_concat);
+    ASSERT_TRUE(zp_idx.has_value()) << "zp_concat stage lost during serialization";
+    ASSERT_EQ(*zp_idx, 4u);
+
+    // Verify the remaining stages are still present (their positions are checked by the loop above)
+    ASSERT_TRUE(loaded.try_get_index(kv_stage_test::concat).has_value());
+    ASSERT_TRUE(loaded.try_get_index(kv_stage_test::beam_table).has_value());
+    ASSERT_TRUE(loaded.try_get_index(kv_stage_test::dq).has_value());
+    ASSERT_TRUE(loaded.try_get_index(kv_stage_test::scale_concat).has_value());
+}
+
+// Test: KV-cache stages without zp_concat (non-ZP compressed cache).
+TEST(cache_serialization, kv_cache_stages_without_zp_concat_round_trip) {
+    auto& engine = get_test_engine();
+
+    stages_helper_test original;
+    original.stages.push_back(kv_stage_test::concat);
+    original.stages.push_back(kv_stage_test::beam_table);
+    original.stages.push_back(kv_stage_test::dq);
+    original.stages.push_back(kv_stage_test::scale_concat);  // last stage saved; zp_concat is deliberately not appended
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    stages_helper_test loaded;
+    {  // reader scope: a fresh stream over the same mem_buf
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.stages.size(), 4u);
+    ASSERT_FALSE(loaded.try_get_index(kv_stage_test::zp_concat).has_value());  // a stage that was never saved must not appear
+    ASSERT_TRUE(loaded.try_get_index(kv_stage_test::scale_concat).has_value());
+}
+
+// Test: scatter_update-only KV-cache stages (non-indirect, non-compressed).
+TEST(cache_serialization, kv_cache_stages_scatter_only_round_trip) {
+    auto& engine = get_test_engine();
+
+    stages_helper_test original;
+    original.stages.push_back(kv_stage_test::scatter_update);  // single-element list: smallest non-empty case
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    stages_helper_test loaded;
+    {  // reader scope: a fresh stream over the same mem_buf
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.stages.size(), 1u);
+    ASSERT_TRUE(loaded.try_get_index(kv_stage_test::scatter_update).has_value());
+}
+
+// --------------------------------------------------------------------------
+// MoE prefill execution flags round-trip tests
+// --------------------------------------------------------------------------
+// Mirrors the save/load logic for the three boolean flags in
+// moe_3gemm_swiglu_opt_impl: use_micro_gemm_prefill,
+// use_gpu_mask_gen_prefill, use_grouped_gemm_prefill.
+// Before the fix, these flags were not serialized, causing mismatched
+// execution paths and buffer counts after cache load.
+
+namespace {
+struct moe_prefill_flags_test {  // test-local stand-in holding only the three flags, not the real impl class
+    bool use_micro_gemm_prefill = false;
+    bool use_gpu_mask_gen_prefill = false;
+    bool use_grouped_gemm_prefill = false;
+
+    void save(BinaryOutputBuffer& ob) const {
+        ob << use_micro_gemm_prefill;  // write order here must match the read order in load()
+        ob << use_gpu_mask_gen_prefill;
+        ob << use_grouped_gemm_prefill;
+    }
+
+    void load(BinaryInputBuffer& ib) {
+        ib >> use_micro_gemm_prefill;  // read order mirrors save()
+        ib >> use_gpu_mask_gen_prefill;
+        ib >> use_grouped_gemm_prefill;
+    }
+};
+}  // namespace
+
+// Test: micro_gemm path — micro_gemm and gpu_mask_gen flags set, grouped_gemm clear
+TEST(cache_serialization, moe_prefill_flags_micro_gemm_round_trip) {
+    auto& engine = get_test_engine();
+
+    moe_prefill_flags_test original;
+    original.use_micro_gemm_prefill = true;
+    original.use_gpu_mask_gen_prefill = true;
+    original.use_grouped_gemm_prefill = false;
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    moe_prefill_flags_test loaded;  // starts with all three flags false
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.use_micro_gemm_prefill, true);
+    ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, true);
+    ASSERT_EQ(loaded.use_grouped_gemm_prefill, false);
+}
+
+// Test: grouped_gemm path — only grouped_gemm flag set
+TEST(cache_serialization, moe_prefill_flags_grouped_gemm_round_trip) {
+    auto& engine = get_test_engine();
+
+    moe_prefill_flags_test original;
+    original.use_micro_gemm_prefill = false;
+    original.use_gpu_mask_gen_prefill = false;
+    original.use_grouped_gemm_prefill = true;
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    moe_prefill_flags_test loaded;  // starts with all three flags false
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.use_micro_gemm_prefill, false);
+    ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, false);
+    ASSERT_EQ(loaded.use_grouped_gemm_prefill, true);  // the last flag written is the one that must flip
+}
+
+// Test: fallback path — all flags false (per-expert onednn loop)
+TEST(cache_serialization, moe_prefill_flags_fallback_round_trip) {
+    auto& engine = get_test_engine();
+
+    moe_prefill_flags_test original;
+    original.use_micro_gemm_prefill = false;
+    original.use_gpu_mask_gen_prefill = false;
+    original.use_grouped_gemm_prefill = false;
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    moe_prefill_flags_test loaded;  // NOTE(review): defaults are also all false, so this case cannot tell a real load from a skipped one
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.use_micro_gemm_prefill, false);
+    ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, false);
+    ASSERT_EQ(loaded.use_grouped_gemm_prefill, false);
+}
+
+// Test: all flags true — should round-trip correctly even if this
+// combination is not used in practice
+TEST(cache_serialization, moe_prefill_flags_all_true_round_trip) {
+    auto& engine = get_test_engine();
+
+    moe_prefill_flags_test original;
+    original.use_micro_gemm_prefill = true;
+    original.use_gpu_mask_gen_prefill = true;
+    original.use_grouped_gemm_prefill = true;
+
+    membuf mem_buf;
+    {  // writer scope: the stream and output buffer are destroyed before the read-back below
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    moe_prefill_flags_test loaded;  // starts with all three flags false, so every assertion below depends on load()
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_EQ(loaded.use_micro_gemm_prefill, true);
+    ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, true);
+    ASSERT_EQ(loaded.use_grouped_gemm_prefill, true);
+}