diff --git a/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp b/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp
index 9322bec03e3b..6b0a6b41b074 100644
--- a/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp
+++ b/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp
@@ -40,7 +40,9 @@ struct DispatchDataFunc {
     explicit DispatchDataFunc(std::nullptr_t) {}
 
     void operator()(const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params = nullptr) const {
-        m_dispatch_data_func(params, kd, rt_params);
+        if (m_dispatch_data_func) {  // unset callable (e.g. built from nullptr): skip instead of throwing std::bad_function_call
+            m_dispatch_data_func(params, kd, rt_params);
+        }
     }
 };
 
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
index 0f411c4b9653..30d4c1961b19 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp
@@ -178,6 +178,12 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
                 auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[*scale_concat_stage].kernelName);
                 scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*scale_concat_stage]);
             }
+
+            if (const auto zp_concat_stage = stages.try_get_index(kv_stage::zp_concat)) {
+                auto& zp_concat_kernel_selector = kernel_selector_t::Instance();
+                auto zp_concat_kernel_impl = zp_concat_kernel_selector.GetImplementation(_kernels_data[*zp_concat_stage].kernelName);
+                zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*zp_concat_stage]);
+            }
         }
     }
     void set_arguments_impl(kv_cache_inst& instance) override {}
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
index c78b20988572..86ddaf156281 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
@@ -1031,8 +1031,20 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         _shared_down_proj->forward(stream, batch, gate_mem_dnnl, output_dnnl, scalar_gate_dnnl);
     }
 
+    void save(BinaryOutputBuffer& ob) const override {  // field order below must match what load() reads
+        PrimitiveImplOCL::save(ob);  // base-class state first; load() restores it first as well
+        ob << use_micro_gemm_prefill;
+        ob << use_gpu_mask_gen_prefill;
+        ob << use_grouped_gemm_prefill;
+    }
+
     void load(BinaryInputBuffer& ib) override {
         PrimitiveImplOCL::load(ib);
+        // Read the execution-path flags (in the order save() wrote them) before init(), so any init() logic
+        // that depends on them sees the deserialized values. NOTE(review): confirm init() does not recompute them.
+        ib >> use_micro_gemm_prefill;
+        ib >> use_gpu_mask_gen_prefill;
+        ib >> use_grouped_gemm_prefill;
         const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
         init(impl_params->typed_desc<moe_3gemm_fused_compressed>());
     }
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index b6e281a28368..111f8978df18 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -567,8 +567,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         const bool disable_moe_opt = GPU_DEBUG_VALUE_OR(config.get_disable_moe_opt(), false);
 
         // MOE: TiledMoeBlock -> GatherMatmuls(compressed) -> MoeOp(compressed) -> MoeOpWithRouting(compressed).
-        // Gated on supports_immad: GatherMatmul backend is systolic-only.
-        if (device_info.supports_immad) {
+        // Gated on supports_immad (systolic-only) and oneDNN (required for expert GEMM dispatch).
+        // Note: even though we are already inside `if (supports_immad)`, oneDNN can still be explicitly disabled by the user.
+        if (device_info.supports_immad && config.get_use_onednn()) {
             manager.register_pass<ov::pass::ConvertTiledMoeBlockToGatherMatmuls>();
 
             // f32 listed because this pass runs before ConvertPrecision (line ~588);
@@ -583,6 +584,10 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                 const bool has_batch_dim = !is_pa;
                 manager.register_pass<ov::pass::MoeOpFusion>(has_batch_dim);
                 manager.register_pass<ov::intel_gpu::FuseMOESharedExpert>();
+                // MOE3GemmFusedCompressed kernel dispatches expert GEMMs through
+                // oneDNN, which requires an in-order OCL queue.  If oneDNN is
+                // disabled (e.g. via OV_GPU_USE_ONEDNN=0 on an IMMAD GPU), the queue stays out-of-order
+                // and oneDNN stream creation may assert; hence the get_use_onednn() check on the enclosing block.
                 manager.register_pass<ov::intel_gpu::FuseMOE3GemmCompressed>();
             }
         }
diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp
new file mode 100644
index 000000000000..6770e6263f21
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp
@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Regression tests for the null-guard added to DispatchDataFunc::operator().
+// Before the fix, calling a DispatchDataFunc constructed from nullptr would
+// invoke a null std::function and throw std::bad_function_call.
+
+#include "test_utils.h"
+#include "common_utils/kernel_generator_base.hpp"
+
+using namespace ov::intel_gpu;
+
+TEST(dispatch_data_func, null_func_does_not_crash) {
+    RuntimeParams runtime_params;
+    KernelData kernel_data;
+    DispatchDataFunc null_dispatch{nullptr};
+    // Invoking a dispatcher built from nullptr must not throw.
+    ASSERT_NO_THROW(null_dispatch(runtime_params, kernel_data, nullptr));
+}
+
+TEST(dispatch_data_func, valid_func_is_called) {
+    KernelData kernel_data;
+    RuntimeParams runtime_params;
+    bool was_invoked = false;
+    DispatchDataFunc dispatch{[&was_invoked](const RuntimeParams&, KernelData&, ImplRuntimeParams*) {
+        was_invoked = true;
+    }};
+    // The wrapped callable must actually run when the dispatcher is invoked.
+    dispatch(runtime_params, kernel_data, nullptr);
+    ASSERT_TRUE(was_invoked);
+}
+
+TEST(dispatch_data_func, default_constructed_is_null_safe) {
+    RuntimeParams runtime_params;
+    KernelData kernel_data;
+    // A fresh KernelData is expected to hold a null update_dispatch_data_func;
+    // calling through it must not throw.
+    ASSERT_NO_THROW(kernel_data.update_dispatch_data_func(runtime_params, kernel_data, nullptr));
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp
new file mode 100644
index 000000000000..07495414c25b
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp
@@ -0,0 +1,304 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Regression tests for cache serialization round-trip of:
+//   1) kv_cache stages including the zp_concat stage (commit: "GPU: fix std::bad_function_call")
+//   2) MoE prefill execution flags (commit: "GPU: serialize MoE prefill execution flags")
+//
+// These tests replicate the serialization logic of internal structures
+// (stages_helper, moe prefill flags) without requiring a full model compilation. They exercise
+// the local mirrors only, so the mirrors must be kept in sync with the production save/load code.
+
+#include "test_utils.h"
+#include "intel_gpu/graph/serialization/binary_buffer.hpp"
+#include "intel_gpu/graph/serialization/utils.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <optional>
+#include <vector>
+
+using namespace cldnn;
+using namespace ::tests;
+
+// --------------------------------------------------------------------------
+// kv_cache stages_helper round-trip tests
+// --------------------------------------------------------------------------
+// Mirrors the kv_stage enum and stages_helper from kv_cache.cpp
+namespace {
+enum class kv_stage_test : uint8_t { scatter_update, concat, beam_table, dq, scale_concat, zp_concat };  // written to the buffer as its uint8_t value by stages_helper_test::save()
+
+struct stages_helper_test {
+    std::vector<kv_stage_test> stages;
+    // Writes the stage count followed by one uint8_t per stage.
+    void save(BinaryOutputBuffer& ob) const {
+        ob << stages.size();
+        for (const kv_stage_test entry : stages) {
+            ob << static_cast<uint8_t>(entry);
+        }
+    }
+
+    // Reads back the layout produced by save(), replacing the current contents.
+    void load(BinaryInputBuffer& ib) {
+        size_t count = 0;
+        ib >> count;
+        stages.resize(count);
+        for (size_t i = 0; i < count; ++i) {
+            uint8_t raw = 0;
+            ib >> raw;
+            stages[i] = static_cast<kv_stage_test>(raw);
+        }
+    }
+
+    // Position of the first occurrence of `stage`, or an empty optional when it is absent.
+    std::optional<size_t> try_get_index(kv_stage_test stage) const noexcept {
+        const auto pos = std::find(stages.begin(), stages.end(), stage);
+        return pos != stages.end() ? std::optional<size_t>{static_cast<size_t>(std::distance(stages.begin(), pos))} : std::nullopt;
+    }
+};
+}  // namespace
+
+// Test: KV-cache stages including zp_concat survive a binary round-trip.
+// This covers the scenario where a compressed KV-cache with zero-point inputs
+// has stages {concat, beam_table, dq, scale_concat, zp_concat}.
+// Before the fix, the load() path did not restore the dispatch function for
+// zp_concat, causing std::bad_function_call on shape change.
+TEST(cache_serialization, kv_cache_stages_with_zp_concat_round_trip) {
+    auto& test_engine = get_test_engine();
+
+    // Stage list of a compressed, indirect KV-cache that also carries zero-points.
+    stages_helper_test saved;
+    saved.stages = {kv_stage_test::concat,
+                    kv_stage_test::beam_table,
+                    kv_stage_test::dq,
+                    kv_stage_test::scale_concat,
+                    kv_stage_test::zp_concat};
+
+    membuf buffer;
+    {
+        std::ostream sink(&buffer);
+        BinaryOutputBuffer writer(sink);
+        saved.save(writer);
+    }
+
+    stages_helper_test restored;
+    {
+        std::istream source(&buffer);
+        BinaryInputBuffer reader(source, test_engine);
+        restored.load(reader);
+    }
+
+    ASSERT_EQ(restored.stages.size(), saved.stages.size());
+    for (size_t idx = 0; idx < saved.stages.size(); ++idx) {
+        const auto got = static_cast<uint8_t>(restored.stages[idx]);
+        const auto want = static_cast<uint8_t>(saved.stages[idx]);
+        ASSERT_EQ(got, want) << "Stage mismatch at index " << idx;
+    }
+
+    // zp_concat must still be found, and at the position it was saved at.
+    const auto zp_pos = restored.try_get_index(kv_stage_test::zp_concat);
+    ASSERT_TRUE(zp_pos.has_value()) << "zp_concat stage lost during serialization";
+    ASSERT_EQ(*zp_pos, 4u);
+
+    // Every other stage must remain discoverable after the round-trip.
+    ASSERT_TRUE(restored.try_get_index(kv_stage_test::concat).has_value());
+    ASSERT_TRUE(restored.try_get_index(kv_stage_test::beam_table).has_value());
+    ASSERT_TRUE(restored.try_get_index(kv_stage_test::dq).has_value());
+    ASSERT_TRUE(restored.try_get_index(kv_stage_test::scale_concat).has_value());
+}
+
+// Test: KV-cache stages without zp_concat (non-ZP compressed cache).
+TEST(cache_serialization, kv_cache_stages_without_zp_concat_round_trip) {
+    auto& test_engine = get_test_engine();
+
+    stages_helper_test saved;
+    saved.stages = {kv_stage_test::concat,
+                    kv_stage_test::beam_table,
+                    kv_stage_test::dq,
+                    kv_stage_test::scale_concat};
+
+    membuf buffer;
+    {
+        std::ostream sink(&buffer);
+        BinaryOutputBuffer writer(sink);
+        saved.save(writer);
+    }
+
+    stages_helper_test restored;
+    {
+        std::istream source(&buffer);
+        BinaryInputBuffer reader(source, test_engine);
+        restored.load(reader);
+    }
+
+    ASSERT_EQ(restored.stages.size(), 4u);
+    ASSERT_FALSE(restored.try_get_index(kv_stage_test::zp_concat).has_value());
+    ASSERT_TRUE(restored.try_get_index(kv_stage_test::scale_concat).has_value());
+}
+
+// Test: scatter_update-only KV-cache stages (non-indirect, non-compressed).
+TEST(cache_serialization, kv_cache_stages_scatter_only_round_trip) {
+    auto& test_engine = get_test_engine();
+
+    stages_helper_test saved;
+    saved.stages = {kv_stage_test::scatter_update};
+
+    membuf buffer;
+    {
+        std::ostream sink(&buffer);
+        BinaryOutputBuffer writer(sink);
+        saved.save(writer);
+    }
+
+    stages_helper_test restored;
+    {
+        std::istream source(&buffer);
+        BinaryInputBuffer reader(source, test_engine);
+        restored.load(reader);
+    }
+
+    ASSERT_EQ(restored.stages.size(), 1u);
+    ASSERT_TRUE(restored.try_get_index(kv_stage_test::scatter_update).has_value());
+}
+
+// --------------------------------------------------------------------------
+// MoE prefill execution flags round-trip tests
+// --------------------------------------------------------------------------
+// Mirrors the save/load logic for the three boolean flags in
+// moe_3gemm_swiglu_opt_impl: use_micro_gemm_prefill,
+// use_gpu_mask_gen_prefill, use_grouped_gemm_prefill.
+// Before the fix, these flags were not serialized, causing mismatched
+// execution paths and buffer counts after cache load.
+
+namespace {
+struct moe_prefill_flags_test {
+    bool use_micro_gemm_prefill{false};
+    bool use_gpu_mask_gen_prefill{false};
+    bool use_grouped_gemm_prefill{false};
+    // Writes the three flags in declaration order.
+    void save(BinaryOutputBuffer& ob) const {
+        ob << use_micro_gemm_prefill;
+        ob << use_gpu_mask_gen_prefill;
+        ob << use_grouped_gemm_prefill;
+    }
+    // Reads the flags back in the order save() wrote them.
+    void load(BinaryInputBuffer& ib) {
+        ib >> use_micro_gemm_prefill;
+        ib >> use_gpu_mask_gen_prefill;
+        ib >> use_grouped_gemm_prefill;
+    }
+};
+}  // namespace
+
+// Test: micro_gemm path — micro_gemm and gpu_mask_gen flags set, grouped_gemm cleared
+TEST(cache_serialization, moe_prefill_flags_micro_gemm_round_trip) {
+    auto& engine = get_test_engine();
+
+    // micro_gemm prefill path: micro_gemm and GPU mask generation on, grouped_gemm off.
+    moe_prefill_flags_test original{true, true, false};
+
+    membuf mem_buf;
+    {
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    // Start from the inverse of every saved flag (member order: micro_gemm, gpu_mask_gen, grouped_gemm):
+    // a default all-false target would satisfy the `false` expectation even if load() restored nothing.
+    moe_prefill_flags_test loaded{false, false, true};
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_TRUE(loaded.use_micro_gemm_prefill);
+    ASSERT_TRUE(loaded.use_gpu_mask_gen_prefill);
+    ASSERT_FALSE(loaded.use_grouped_gemm_prefill);
+}
+
+// Test: grouped_gemm path — only grouped_gemm flag set
+TEST(cache_serialization, moe_prefill_flags_grouped_gemm_round_trip) {
+    auto& engine = get_test_engine();
+
+    // grouped_gemm prefill path: only the grouped_gemm flag is set.
+    moe_prefill_flags_test original{false, false, true};
+
+    membuf mem_buf;
+    {
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    // Start from the inverse of every saved flag (member order: micro_gemm, gpu_mask_gen, grouped_gemm):
+    // a default all-false target would satisfy both `false` expectations even if load() restored nothing.
+    moe_prefill_flags_test loaded{true, true, false};
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_FALSE(loaded.use_micro_gemm_prefill);
+    ASSERT_FALSE(loaded.use_gpu_mask_gen_prefill);
+    ASSERT_TRUE(loaded.use_grouped_gemm_prefill);
+}
+
+// Test: fallback path — all flags false (per-expert onednn loop)
+TEST(cache_serialization, moe_prefill_flags_fallback_round_trip) {
+    auto& engine = get_test_engine();
+
+    // Fallback path: all three flags cleared.
+    moe_prefill_flags_test original{false, false, false};
+
+    membuf mem_buf;
+    {
+        std::ostream out_mem(&mem_buf);
+        BinaryOutputBuffer ob(out_mem);
+        original.save(ob);
+    }
+
+    // Start from all-true: a default-constructed target is already all-false, so the
+    // assertions below would pass even if load() restored nothing.
+    moe_prefill_flags_test loaded{true, true, true};
+    {
+        std::istream in_mem(&mem_buf);
+        BinaryInputBuffer ib(in_mem, engine);
+        loaded.load(ib);
+    }
+
+    ASSERT_FALSE(loaded.use_micro_gemm_prefill);
+    ASSERT_FALSE(loaded.use_gpu_mask_gen_prefill);
+    ASSERT_FALSE(loaded.use_grouped_gemm_prefill);
+}
+
+// Test: all flags true — should round-trip correctly even if this
+// combination is not used in practice
+TEST(cache_serialization, moe_prefill_flags_all_true_round_trip) {
+    auto& test_engine = get_test_engine();
+
+    moe_prefill_flags_test saved;
+    saved.use_micro_gemm_prefill = true;
+    saved.use_gpu_mask_gen_prefill = true;
+    saved.use_grouped_gemm_prefill = true;
+
+    membuf buffer;
+    {
+        std::ostream sink(&buffer);
+        BinaryOutputBuffer writer(sink);
+        saved.save(writer);
+    }
+
+    moe_prefill_flags_test restored;
+    {
+        std::istream source(&buffer);
+        BinaryInputBuffer reader(source, test_engine);
+        restored.load(reader);
+    }
+
+    ASSERT_TRUE(restored.use_micro_gemm_prefill);
+    ASSERT_TRUE(restored.use_gpu_mask_gen_prefill);
+    ASSERT_TRUE(restored.use_grouped_gemm_prefill);
+}