openvinotoolkit · xzhan34 · Jun 1, 2026 · May 28, 2026 · May 28, 2026 · Jun 1, 2026
@@ -40,7 +40,9 @@ struct DispatchDataFunc {
     explicit DispatchDataFunc(std::nullptr_t) {}
 
     void operator()(const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params = nullptr) const {
-        m_dispatch_data_func(params, kd, rt_params);
+        if (m_dispatch_data_func) {  // skip the call when no dispatch function is set (e.g. constructed from nullptr)
+            m_dispatch_data_func(params, kd, rt_params);
+        }
     }
 };
 

@@ -178,6 +178,12 @@ struct kv_cache_impl : multi_stage_primitive<kv_cache> {
                 auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[*scale_concat_stage].kernelName);
                 scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*scale_concat_stage]);
             }
+
+            if (const auto zp_concat_stage = stages.try_get_index(kv_stage::zp_concat)) {
+                auto& zp_concat_kernel_selector = kernel_selector_t::Instance();
+                auto zp_concat_kernel_impl = zp_concat_kernel_selector.GetImplementation(_kernels_data[*zp_concat_stage].kernelName);
+                zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*zp_concat_stage]);
+            }
         }
     }
     void set_arguments_impl(kv_cache_inst& instance) override {}

@@ -1031,8 +1031,20 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         _shared_down_proj->forward(stream, batch, gate_mem_dnnl, output_dnnl, scalar_gate_dnnl);
     }
 
+    void save(BinaryOutputBuffer& ob) const override {
+        PrimitiveImplOCL::save(ob);  // base-class state goes first, mirroring load()
+        ob << use_micro_gemm_prefill;  // execution-path flags; load() reads them back in this exact order
+        ob << use_gpu_mask_gen_prefill;
+        ob << use_grouped_gemm_prefill;
+    }
+
     void load(BinaryInputBuffer& ib) override {
         PrimitiveImplOCL::load(ib);
+        // Read execution-path flags (same order as save() writes them) before init() so any
+        // future init() logic that depends on them sees the deserialized (not default) values.
+        ib >> use_micro_gemm_prefill;
+        ib >> use_gpu_mask_gen_prefill;
+        ib >> use_grouped_gemm_prefill;
         const kernel_impl_params* impl_params = reinterpret_cast<kernel_impl_params*>(ib.getKernelImplParams());
         init(impl_params->typed_desc<moe_3gemm_fused_compressed>());
     }

@@ -567,8 +567,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         const bool disable_moe_opt = GPU_DEBUG_VALUE_OR(config.get_disable_moe_opt(), false);
 
         // MOE: TiledMoeBlock -> GatherMatmuls(compressed) -> MoeOp(compressed) -> MoeOpWithRouting(compressed).
-        // Gated on supports_immad: GatherMatmul backend is systolic-only.
-        if (device_info.supports_immad) {
+        // Gated on supports_immad (GatherMatmul backend is systolic-only) and on oneDNN (required for expert GEMM dispatch).
+        // Note: supports_immad alone does not imply that oneDNN is in use; oneDNN can still be explicitly disabled by the user.
+        if (device_info.supports_immad && config.get_use_onednn()) {
             manager.register_pass<ov::pass::ConvertTiledMoeBlockToGatherMatmuls>();
 
             // f32 listed because this pass runs before ConvertPrecision (line ~588);
@@ -583,6 +584,10 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
                 const bool has_batch_dim = !is_pa;
                 manager.register_pass<ov::pass::MoeOpFusion>(has_batch_dim);
                 manager.register_pass<ov::intel_gpu::FuseMOESharedExpert>();
+                // The MOE3GemmFusedCompressed kernel dispatches expert GEMMs through oneDNN,
+                // which requires an in-order OCL queue. If oneDNN is disabled (e.g. via
+                // OV_GPU_USE_ONEDNN=0 on an IMMAD GPU), the queue stays out-of-order and oneDNN
+                // stream creation may assert, so this fusion must only run when oneDNN is enabled.
                 manager.register_pass<ov::intel_gpu::FuseMOE3GemmCompressed>();
             }
         }

@@ -0,0 +1,39 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+// Regression tests for the null-guard added to DispatchDataFunc::operator().
+// Before the fix, calling a DispatchDataFunc constructed from nullptr would
+// invoke a null std::function and throw std::bad_function_call.
+
+#include "test_utils.h"
+#include "common_utils/kernel_generator_base.hpp"
+
+using namespace ov::intel_gpu;
+
+TEST(dispatch_data_func, null_func_does_not_crash) {
+    DispatchDataFunc func{nullptr};  // no callable attached
+    KernelData kd;
+    RuntimeParams params;
+
+    ASSERT_NO_THROW(func(params, kd, nullptr));  // invoking the empty wrapper must not throw
+}
+
+TEST(dispatch_data_func, valid_func_is_called) {
+    bool called = false;  // set by the wrapped lambda when it runs
+    DispatchDataFunc func{[&called](const RuntimeParams&, KernelData&, ImplRuntimeParams*) {
+        called = true;
+    }};
+    KernelData kd;
+    RuntimeParams params;
+
+    func(params, kd, nullptr);
+    ASSERT_TRUE(called);  // the null guard must still forward calls to a real callable
+}
+
+TEST(dispatch_data_func, default_constructed_is_null_safe) {
+    // Default-constructed KernelData has update_dispatch_data_func{nullptr}.
+    KernelData kd;
+    RuntimeParams params;
+
+    ASSERT_NO_THROW(kd.update_dispatch_data_func(params, kd, nullptr));  // kd is both the functor's owner and its argument
+}