diff --git a/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp b/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp index 9322bec03e3b..6b0a6b41b074 100644 --- a/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp +++ b/src/plugins/intel_gpu/src/graph/common_utils/kernel_generator_base.hpp @@ -40,7 +40,9 @@ struct DispatchDataFunc { explicit DispatchDataFunc(std::nullptr_t) {} void operator()(const RuntimeParams& params, KernelData& kd, ImplRuntimeParams* rt_params = nullptr) const { - m_dispatch_data_func(params, kd, rt_params); + if (m_dispatch_data_func) { + m_dispatch_data_func(params, kd, rt_params); + } } }; diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp index 0f411c4b9653..30d4c1961b19 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp +++ b/src/plugins/intel_gpu/src/graph/impls/ocl/kv_cache.cpp @@ -178,6 +178,12 @@ struct kv_cache_impl : multi_stage_primitive { auto scale_zp_concat_kernel_impl = scale_zp_concat_kernel_selector.GetImplementation(_kernels_data[*scale_concat_stage].kernelName); scale_zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*scale_concat_stage]); } + + if (const auto zp_concat_stage = stages.try_get_index(kv_stage::zp_concat)) { + auto& zp_concat_kernel_selector = kernel_selector_t::Instance(); + auto zp_concat_kernel_impl = zp_concat_kernel_selector.GetImplementation(_kernels_data[*zp_concat_stage].kernelName); + zp_concat_kernel_impl->GetUpdateDispatchDataFunc(_kernels_data[*zp_concat_stage]); + } } } void set_arguments_impl(kv_cache_inst& instance) override {} diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp index c78b20988572..86ddaf156281 100644 --- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp +++ 
b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp @@ -1031,8 +1031,20 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { _shared_down_proj->forward(stream, batch, gate_mem_dnnl, output_dnnl, scalar_gate_dnnl); } + void save(BinaryOutputBuffer& ob) const override { + PrimitiveImplOCL::save(ob); + ob << use_micro_gemm_prefill; + ob << use_gpu_mask_gen_prefill; + ob << use_grouped_gemm_prefill; + } + void load(BinaryInputBuffer& ib) override { PrimitiveImplOCL::load(ib); + // Read execution-path flags before init() so any future init() logic + // that depends on them sees the deserialized (not default) values. + ib >> use_micro_gemm_prefill; + ib >> use_gpu_mask_gen_prefill; + ib >> use_grouped_gemm_prefill; const kernel_impl_params* impl_params = reinterpret_cast(ib.getKernelImplParams()); init(impl_params->typed_desc()); } diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp index b6e281a28368..111f8978df18 100644 --- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp +++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp @@ -567,8 +567,9 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const bool disable_moe_opt = GPU_DEBUG_VALUE_OR(config.get_disable_moe_opt(), false); // MOE: TiledMoeBlock -> GatherMatmuls(compressed) -> MoeOp(compressed) -> MoeOpWithRouting(compressed). - // Gated on supports_immad: GatherMatmul backend is systolic-only. - if (device_info.supports_immad) { + // Gated on supports_immad (systolic-only) and oneDNN (required for expert GEMM dispatch). + // Note: even though we are already inside `if (supports_immad)`, oneDNN can still be explicitly disabled by the user. 
+ if (device_info.supports_immad && config.get_use_onednn()) { manager.register_pass(); // f32 listed because this pass runs before ConvertPrecision (line ~588); @@ -583,6 +584,10 @@ void TransformationsPipeline::apply(std::shared_ptr func) { const bool has_batch_dim = !is_pa; manager.register_pass(has_batch_dim); manager.register_pass(); + // MOE3GemmFusedCompressed kernel dispatches expert GEMMs through + // oneDNN, which requires an in-order OCL queue. If oneDNN is + // disabled (e.g. via OV_GPU_USE_ONEDNN=0 on an IMMAD GPU), the + // queue stays out-of-order and the oneDNN stream creation may assert. manager.register_pass(); } } diff --git a/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp b/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp new file mode 100644 index 000000000000..6770e6263f21 --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/module_tests/dispatch_data_func_test.cpp @@ -0,0 +1,39 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +// Regression tests for the null-guard added to DispatchDataFunc::operator(). +// Before the fix, calling a DispatchDataFunc constructed from nullptr would +// invoke a null std::function and throw std::bad_function_call. 
+ +#include "test_utils.h" +#include "common_utils/kernel_generator_base.hpp" + +using namespace ov::intel_gpu; + +TEST(dispatch_data_func, null_func_does_not_crash) { + DispatchDataFunc func{nullptr}; + KernelData kd; + RuntimeParams params; + + ASSERT_NO_THROW(func(params, kd, nullptr)); +} + +TEST(dispatch_data_func, valid_func_is_called) { + bool called = false; + DispatchDataFunc func{[&called](const RuntimeParams&, KernelData&, ImplRuntimeParams*) { + called = true; + }}; + KernelData kd; + RuntimeParams params; + + func(params, kd, nullptr); + ASSERT_TRUE(called); +} + +TEST(dispatch_data_func, default_constructed_is_null_safe) { + // Default-constructed KernelData has update_dispatch_data_func{nullptr}. + KernelData kd; + RuntimeParams params; + + ASSERT_NO_THROW(kd.update_dispatch_data_func(params, kd, nullptr)); +} diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp new file mode 100644 index 000000000000..07495414c25b --- /dev/null +++ b/src/plugins/intel_gpu/tests/unit/test_cases/cache_serialization_test.cpp @@ -0,0 +1,304 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +// Regression tests for cache serialization round-trip of: +// 1) kv_cache stages including the zp_concat stage (commit: "GPU: fix std::bad_function_call") +// 2) MoE prefill execution flags (commit: "GPU: serialize MoE prefill execution flags") +// +// These tests replicate the serialization logic of internal structures +// (stages_helper, moe prefill flags) without requiring a full model compilation, +// verifying that the save/load contract is correct. 
+ +#include "test_utils.h" +#include "intel_gpu/graph/serialization/binary_buffer.hpp" +#include "intel_gpu/graph/serialization/utils.hpp" + +#include +#include +#include +#include + +using namespace cldnn; +using namespace ::tests; + +// -------------------------------------------------------------------------- +// kv_cache stages_helper round-trip tests +// -------------------------------------------------------------------------- +// Mirrors the kv_stage enum and stages_helper from kv_cache.cpp +namespace { +enum class kv_stage_test : uint8_t { scatter_update, concat, beam_table, dq, scale_concat, zp_concat }; + +struct stages_helper_test { + std::vector stages; + + void save(BinaryOutputBuffer& ob) const { + ob << stages.size(); + for (const auto& stage : stages) { + ob << static_cast(stage); + } + } + + void load(BinaryInputBuffer& ib) { + size_t stages_size = 0; + ib >> stages_size; + stages.resize(stages_size); + for (auto& stage : stages) { + uint8_t stage_ = 0; + ib >> stage_; + stage = static_cast(stage_); + } + } + + std::optional try_get_index(kv_stage_test stage) const noexcept { + if (const auto it = std::find(stages.begin(), stages.end(), stage); it != stages.end()) { + return static_cast(std::distance(stages.begin(), it)); + } + return {}; + } +}; +} // namespace + +// Test: KV-cache stages including zp_concat survive a binary round-trip. +// This covers the scenario where a compressed KV-cache with zero-point inputs +// has stages {concat, beam_table, dq, scale_concat, zp_concat}. +// Before the fix, the load() path did not restore the dispatch function for +// zp_concat, causing std::bad_function_call on shape change. 
+TEST(cache_serialization, kv_cache_stages_with_zp_concat_round_trip) { + auto& engine = get_test_engine(); + + // Simulate a compressed KV-cache with indirect + zero-points + stages_helper_test original; + original.stages.push_back(kv_stage_test::concat); + original.stages.push_back(kv_stage_test::beam_table); + original.stages.push_back(kv_stage_test::dq); + original.stages.push_back(kv_stage_test::scale_concat); + original.stages.push_back(kv_stage_test::zp_concat); + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + stages_helper_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.stages.size(), original.stages.size()); + for (size_t i = 0; i < original.stages.size(); ++i) { + ASSERT_EQ(static_cast(loaded.stages[i]), + static_cast(original.stages[i])) + << "Stage mismatch at index " << i; + } + + // Verify zp_concat stage is present and at the correct index + auto zp_idx = loaded.try_get_index(kv_stage_test::zp_concat); + ASSERT_TRUE(zp_idx.has_value()) << "zp_concat stage lost during serialization"; + ASSERT_EQ(*zp_idx, 4u); + + // Verify the other stages are still present after the round-trip + ASSERT_TRUE(loaded.try_get_index(kv_stage_test::concat).has_value()); + ASSERT_TRUE(loaded.try_get_index(kv_stage_test::beam_table).has_value()); + ASSERT_TRUE(loaded.try_get_index(kv_stage_test::dq).has_value()); + ASSERT_TRUE(loaded.try_get_index(kv_stage_test::scale_concat).has_value()); +} + +// Test: KV-cache stages without zp_concat (non-ZP compressed cache). 
+TEST(cache_serialization, kv_cache_stages_without_zp_concat_round_trip) { + auto& engine = get_test_engine(); + + stages_helper_test original; + original.stages.push_back(kv_stage_test::concat); + original.stages.push_back(kv_stage_test::beam_table); + original.stages.push_back(kv_stage_test::dq); + original.stages.push_back(kv_stage_test::scale_concat); + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + stages_helper_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.stages.size(), 4u); + ASSERT_FALSE(loaded.try_get_index(kv_stage_test::zp_concat).has_value()); + ASSERT_TRUE(loaded.try_get_index(kv_stage_test::scale_concat).has_value()); +} + +// Test: scatter_update-only KV-cache stages (non-indirect, non-compressed). +TEST(cache_serialization, kv_cache_stages_scatter_only_round_trip) { + auto& engine = get_test_engine(); + + stages_helper_test original; + original.stages.push_back(kv_stage_test::scatter_update); + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + stages_helper_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.stages.size(), 1u); + ASSERT_TRUE(loaded.try_get_index(kv_stage_test::scatter_update).has_value()); +} + +// -------------------------------------------------------------------------- +// MoE prefill execution flags round-trip tests +// -------------------------------------------------------------------------- +// Mirrors the save/load logic for the three boolean flags in +// moe_3gemm_swiglu_opt_impl: use_micro_gemm_prefill, +// use_gpu_mask_gen_prefill, use_grouped_gemm_prefill. +// Before the fix, these flags were not serialized, causing mismatched +// execution paths and buffer counts after cache load. 
+ +namespace { +struct moe_prefill_flags_test { + bool use_micro_gemm_prefill = false; + bool use_gpu_mask_gen_prefill = false; + bool use_grouped_gemm_prefill = false; + + void save(BinaryOutputBuffer& ob) const { + ob << use_micro_gemm_prefill; + ob << use_gpu_mask_gen_prefill; + ob << use_grouped_gemm_prefill; + } + + void load(BinaryInputBuffer& ib) { + ib >> use_micro_gemm_prefill; + ib >> use_gpu_mask_gen_prefill; + ib >> use_grouped_gemm_prefill; + } +}; +} // namespace + +// Test: micro_gemm path — micro_gemm and gpu_mask_gen flags set, grouped_gemm clear +TEST(cache_serialization, moe_prefill_flags_micro_gemm_round_trip) { + auto& engine = get_test_engine(); + + moe_prefill_flags_test original; + original.use_micro_gemm_prefill = true; + original.use_gpu_mask_gen_prefill = true; + original.use_grouped_gemm_prefill = false; + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + moe_prefill_flags_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.use_micro_gemm_prefill, true); + ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, true); + ASSERT_EQ(loaded.use_grouped_gemm_prefill, false); +} + +// Test: grouped_gemm path — only grouped_gemm flag set +TEST(cache_serialization, moe_prefill_flags_grouped_gemm_round_trip) { + auto& engine = get_test_engine(); + + moe_prefill_flags_test original; + original.use_micro_gemm_prefill = false; + original.use_gpu_mask_gen_prefill = false; + original.use_grouped_gemm_prefill = true; + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + moe_prefill_flags_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.use_micro_gemm_prefill, false); + ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, false); + ASSERT_EQ(loaded.use_grouped_gemm_prefill, true); +} + +// Test: fallback path — 
all flags false (per-expert onednn loop) +TEST(cache_serialization, moe_prefill_flags_fallback_round_trip) { + auto& engine = get_test_engine(); + + moe_prefill_flags_test original; + original.use_micro_gemm_prefill = false; + original.use_gpu_mask_gen_prefill = false; + original.use_grouped_gemm_prefill = false; + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + moe_prefill_flags_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.use_micro_gemm_prefill, false); + ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, false); + ASSERT_EQ(loaded.use_grouped_gemm_prefill, false); +} + +// Test: all flags true — should round-trip correctly even if this +// combination is not used in practice +TEST(cache_serialization, moe_prefill_flags_all_true_round_trip) { + auto& engine = get_test_engine(); + + moe_prefill_flags_test original; + original.use_micro_gemm_prefill = true; + original.use_gpu_mask_gen_prefill = true; + original.use_grouped_gemm_prefill = true; + + membuf mem_buf; + { + std::ostream out_mem(&mem_buf); + BinaryOutputBuffer ob(out_mem); + original.save(ob); + } + + moe_prefill_flags_test loaded; + { + std::istream in_mem(&mem_buf); + BinaryInputBuffer ib(in_mem, engine); + loaded.load(ib); + } + + ASSERT_EQ(loaded.use_micro_gemm_prefill, true); + ASSERT_EQ(loaded.use_gpu_mask_gen_prefill, true); + ASSERT_EQ(loaded.use_grouped_gemm_prefill, true); +}