From b7b6eb449d89bc28de0891b2e9355900deca75c1 Mon Sep 17 00:00:00 2001 From: zaixing-wang Date: Tue, 2 Jun 2026 08:51:20 +0000 Subject: [PATCH] [GPU] Add MoE expert offload-to-disk (OTD) for large MoE models --- .../intel_gpu/plugin/program_builder.hpp | 3 +- .../primitives/moe_3gemm_fused_compressed.hpp | 58 ++- .../intel_gpu/runtime/internal_properties.hpp | 1 + .../include/intel_gpu/runtime/options.inl | 1 + .../graph_optimizer/prepare_quantization.cpp | 4 + .../src/graph/impls/ocl_v2/moe/LRUCache.cpp | 65 +++ .../src/graph/impls/ocl_v2/moe/LRUCache.hpp | 82 ++++ .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 393 ++++++++++----- .../impls/ocl_v2/moe/moe_otd_runtime.hpp | 341 +++++++++++++ .../src/graph/include/moe_3gemm_fused_inst.h | 2 + src/plugins/intel_gpu/src/graph/network.cpp | 7 +- src/plugins/intel_gpu/src/graph/program.cpp | 10 +- .../intel_gpu/src/plugin/ops/constant.cpp | 82 ++-- src/plugins/intel_gpu/src/plugin/ops/moe.cpp | 351 +++++++++++++- .../src/plugin/ops/moe_offload_constant.hpp | 118 +++++ src/plugins/intel_gpu/src/plugin/plugin.cpp | 2 +- .../test_cases/moe_offload_lru_cache_test.cpp | 455 ++++++++++++++++++ .../test_cases/moe_offload_property_test.cpp | 19 + 18 files changed, 1846 insertions(+), 148 deletions(-) create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.cpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.hpp create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp create mode 100644 src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp index 02848734d88830..393ee0ded69ada 100644 --- 
a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp @@ -139,10 +139,11 @@ class ProgramBuilder final { bool use_new_shape_infer() const { return m_config.get_allow_new_shape_infer(); } bool is_inner_program() const { return m_is_inner_program; } - bool is_query_mode() { return queryMode; } + bool is_query_mode() const { return queryMode; } std::shared_ptr get_task_executor() const { return m_task_executor; } std::shared_ptr get_compilation_context() const { return m_compilation_context; } + std::shared_ptr get_model() const { return m_model; } private: static factories_map_t factories_map; diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp index b1c430d1850214..222938abf10e26 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp @@ -4,19 +4,51 @@ #pragma once #include +#include #include "intel_gpu/op/moe_3gemm_fused_compressed.hpp" #include "intel_gpu/runtime/engine.hpp" +#include "intel_gpu/runtime/memory.hpp" #include "primitive.hpp" namespace cldnn { using MOE3GemmFusedCompressed = ov::intel_gpu::op::MOE3GemmFusedCompressed; +struct moe_weights { + cldnn::memory::ptr gate_w = nullptr; + cldnn::memory::ptr gate_s = nullptr; + cldnn::memory::ptr gate_z = nullptr; + cldnn::memory::ptr up_w = nullptr; + cldnn::memory::ptr up_s = nullptr; + cldnn::memory::ptr up_z = nullptr; + cldnn::memory::ptr down_w = nullptr; + cldnn::memory::ptr down_s = nullptr; + cldnn::memory::ptr down_z = nullptr; +}; + /// @brief moe compressed primitive /// @details Performs moe compressed struct moe_3gemm_fused_compressed : public primitive_base { CLDNN_DECLARE_PRIMITIVE(moe_3gemm_fused_compressed) + static constexpr size_t 
serialized_weight_offset_count = 9; + + enum class input_index : size_t { + hidden_states = 0, + routing_weights, + weight_0, + scale_0, + zp_0, + weight_1, + scale_1, + zp_1, + weight_2, + scale_2, + zp_2, + count + }; + static constexpr size_t input_count = static_cast(input_index::count); + moe_3gemm_fused_compressed() : primitive_base("", {}) {} // @brief Constructs moe primitive / layer. @@ -70,11 +102,22 @@ struct moe_3gemm_fused_compressed : public primitive_base& inputs, const MOE3GemmFusedCompressed::Config& config) + moe_3gemm_fused_compressed(const primitive_id& id, + const std::vector& inputs, + const MOE3GemmFusedCompressed::Config& config, + const std::vector& weight_bin_offsets = {}, + const std::string& weights_path = "", + size_t lru_expert_num = 0) : primitive_base(id, inputs, 1, {optional_data_type()}), - _config(config) {} + _config(config), + _weight_bin_offsets(weight_bin_offsets), + _weights_path(weights_path), + _lru_expert_num(lru_expert_num) {} MOE3GemmFusedCompressed::Config _config; + std::vector _weight_bin_offsets; + std::string _weights_path; + size_t _lru_expert_num = 0; bool operator==(const primitive& rhs) const override { if (!compare_common_params(rhs)) @@ -82,17 +125,26 @@ struct moe_3gemm_fused_compressed : public primitive_base(rhs); - return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0; + return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0 && + _weight_bin_offsets == rhs_casted._weight_bin_offsets && + _weights_path == rhs_casted._weights_path && + _lru_expert_num == rhs_casted._lru_expert_num; } void save(BinaryOutputBuffer& ob) const override { primitive_base::save(ob); ob << make_data(&_config, sizeof(_config)); + ob << _weight_bin_offsets; + ob << _weights_path; + ob << _lru_expert_num; } void load(BinaryInputBuffer& ib) override { primitive_base::load(ib); ib >> make_data(&_config, sizeof(_config)); + ib >> _weight_bin_offsets; + ib >> _weights_path; + ib >> _lru_expert_num; } }; 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp index 2e7151f60ec162..f84e4c17ab43a4 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp @@ -133,6 +133,7 @@ static constexpr Property force_implemen static constexpr Property config_file{"CONFIG_FILE"}; static constexpr Property buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"}; static constexpr Property max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"}; +static constexpr Property moe_offload_ratio{"MOE_OFFLOAD_RATIO"}; static constexpr Property use_onednn{"GPU_USE_ONEDNN"}; static constexpr Property use_cm{"GPU_USE_CM"}; diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl index 5842f039720207..33086747a0231d 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl @@ -39,6 +39,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Mo OV_CONFIG_RELEASE_OPTION(ov::internal, key_cache_quant_mode, ov::internal::CacheQuantMode::BY_CHANNEL, "AUTO or BY_CHANNEL or BY_TOKEN") OV_CONFIG_RELEASE_OPTION(ov::internal, value_cache_quant_mode, ov::internal::CacheQuantMode::BY_TOKEN, "AUTO or BY_CHANNEL or BY_TOKEN") OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, mem_pool_util_threshold, 0.5, "Minimum utilization threshold (0.0~1.0) for reusable memory in the pool") +OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, moe_offload_ratio, 0, "Percentage (0-100) of MoE experts to keep resident on device for offload") OV_CONFIG_RELEASE_OPTION(ov, enable_weightless, false, "Enable/Disable weightless blob") OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings") 
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

// ---------------------------------------------------------------------------
// LRUCache.hpp
// ---------------------------------------------------------------------------
#pragma once
#include <cstddef>
#include <functional>
#include <iterator>
#include <list>
#include <mutex>
#include <stdexcept>
#include <unordered_map>
#include <utility>
#include <vector>

// Nothing in this class uses the engine header; the include is kept (when it is
// reachable) only so that translation units relying on it transitively keep building.
#if defined(__has_include)
#    if __has_include("intel_gpu/runtime/engine.hpp")
#        include "intel_gpu/runtime/engine.hpp"
#    endif
#endif

/// @brief Maps (layer, expert) pairs onto a fixed pool of numbered slots with
///        least-recently-used replacement.
///
/// The cache hands out slot numbers in the range [0, max_total_experts). It does not
/// own or touch the data stored in a slot: the caller loads the data and then reports
/// completion through set_filled(). Every public member function takes the internal
/// mutex; the public flag m_initialized is a plain, unsynchronized bool that this
/// class itself never reads or writes.
class LRUCache {
public:
    // NOTE(review): the callback's template arguments are not recoverable from the
    // reviewed text; `void(size_t layer, size_t expert)` is an assumption - confirm
    // against the callers. The callback is stored but is not invoked by any member
    // function defined here.
    using EvictCallback = std::function<void(size_t, size_t)>;

    enum NodeAction { INSERT, REFRESH };

    /// @param max_total_experts Number of slots in the pool (capacity of the cache).
    /// @param cb                Optional eviction callback (see the note on EvictCallback).
    LRUCache(size_t max_total_experts, EvictCallback cb = nullptr);

    // NOTE(review): declared, but no definition accompanies the member functions
    // defined below - calling it would fail to link unless it is defined elsewhere.
    NodeAction insert_or_refresh(size_t layer, size_t expert, void* addr, void* params = nullptr);

    /// @brief Looks up (layer, expert), inserting it on a miss.
    ///
    /// On a hit the entry becomes the most recently used one. On a miss the entry is
    /// inserted; if the cache is full the least recently used entry is evicted first
    /// and its slot is reused.
    ///
    /// @return {slot, filled}. `filled` is true only when the entry was already present
    ///         and set_filled() has been called for its slot since it was assigned.
    /// @throws std::logic_error if the cache was constructed with capacity 0.
    std::pair<size_t, bool> get_lru_item(size_t layer, size_t expert);

    /// @return Number of entries currently held.
    size_t get_total_experts() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_total_experts;
    }

    /// @brief Drops the least recently used entry, if any, and releases its slot.
    void evict_one();

    /// @return Number of entries currently held (same value as get_total_experts()).
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_total_experts;
    }

    // NOTE(review): declared, but no definition accompanies the member functions
    // defined below; the return type's template arguments are assumed to match
    // get_lru_item() - confirm.
    std::pair<size_t, bool> get_item(size_t layer, size_t expert);

    /// @brief Marks a slot as holding valid data. Out-of-range slot numbers are ignored.
    void set_filled(size_t lru_expert_no) {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (lru_expert_no >= m_filled_list.size()) {
            return;
        }
        m_filled_list[lru_expert_no] = true;
    }

    bool m_initialized = false;

private:
    struct Key {
        size_t layer;
        size_t expert;
        bool operator==(const Key& other) const noexcept {
            return layer == other.layer && expert == other.expert;
        }
    };

    struct KeyHash {
        // Mixes both fields so that distinct (layer, expert) pairs do not collide
        // systematically once expert ids reach the old multiplier (131).
        std::size_t operator()(const Key& k) const noexcept {
            const std::size_t layer_hash = std::hash<size_t>()(k.layer);
            const std::size_t expert_hash = std::hash<size_t>()(k.expert);
            return static_cast<std::size_t>(layer_hash ^ (expert_hash + 0x9e3779b97f4a7c15ULL + (layer_hash << 6) + (layer_hash >> 2)));
        }
    };

    struct Node {
        size_t layer;
        size_t expert;
        size_t lru_expert_no;  // slot assigned to this entry
    };

    size_t m_max_total_experts;
    size_t m_per_expert_size = 0;   // not used by any member function defined here
    size_t m_total_experts;         // number of live entries
    size_t m_next_unused_slot = 0;  // lowest slot number that has never been handed out
    EvictCallback m_on_evict;

    std::list<Node> m_list;             // front = least recently used, back = most recently used
    std::vector<bool> m_filled_list;    // per-slot "data is valid" flag
    std::vector<size_t> m_free_slots;   // slots released by eviction and not yet reassigned
    std::unordered_map<Key, std::list<Node>::iterator, KeyHash> m_map;
    mutable std::mutex m_mutex;

    void move_to_end(std::list<Node>::iterator it);
    void evict_one_unlocked();
};

// ---------------------------------------------------------------------------
// LRUCache.cpp (in the source tree this part includes "LRUCache.hpp")
// ---------------------------------------------------------------------------

LRUCache::LRUCache(size_t max_total_experts, EvictCallback cb)
    : m_max_total_experts(max_total_experts),
      m_total_experts(0),
      m_on_evict(std::move(cb)) {
    m_filled_list.resize(max_total_experts, false);
    m_free_slots.reserve(max_total_experts);
}

// Makes `it` the most recently used entry. splice() relinks the node in place, so the
// iterators stored in m_map stay valid.
void LRUCache::move_to_end(std::list<Node>::iterator it) {
    if (std::next(it) == m_list.end())
        return;
    m_list.splice(m_list.end(), m_list, it);
}

// Removes the least recently used entry and returns its slot to the free pool.
// The caller must hold m_mutex.
void LRUCache::evict_one_unlocked() {
    if (m_list.empty())
        return;

    const Node oldest = m_list.front();

    m_filled_list[oldest.lru_expert_no] = false;
    m_free_slots.push_back(oldest.lru_expert_no);
    m_map.erase(Key{oldest.layer, oldest.expert});
    m_list.pop_front();
    --m_total_experts;
}

void LRUCache::evict_one() {
    std::lock_guard<std::mutex> lock(m_mutex);
    evict_one_unlocked();
}

std::pair<size_t, bool> LRUCache::get_lru_item(size_t layer, size_t expert) {
    std::lock_guard<std::mutex> lock(m_mutex);

    const Key key{layer, expert};
    const auto found = m_map.find(key);
    if (found != m_map.end()) {
        move_to_end(found->second);
        const size_t slot = found->second->lru_expert_no;
        return {slot, static_cast<bool>(m_filled_list[slot])};
    }

    // A cache without slots cannot hand one out; previously this path indexed an
    // empty m_filled_list.
    if (m_max_total_experts == 0) {
        throw std::logic_error("LRUCache::get_lru_item called on a cache with zero capacity");
    }

    if (m_total_experts >= m_max_total_experts) {
        evict_one_unlocked();
    }

    // Prefer a slot released by an eviction. Deriving the slot from the entry count
    // alone would hand out a slot that is still owned by a live entry whenever
    // evict_one() had been called on a cache that was not full.
    size_t slot = 0;
    if (!m_free_slots.empty()) {
        slot = m_free_slots.back();
        m_free_slots.pop_back();
    } else {
        slot = m_next_unused_slot++;
    }

    m_list.push_back(Node{layer, expert, slot});
    m_map[key] = std::prev(m_list.end());
    ++m_total_experts;
    return {slot, false};
}
input_router_topk_idx; + memory::ptr _expert_index_buffer; + bool _index_initialized = false; }; std::vector> _dnnl_weights; @@ -684,6 +698,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { int _shared_intermediate_size; int _gate_up_group_size; int _down_group_size; + size_t _lru_expert_num = 0; + std::shared_ptr _lru_cache; ov::op::internal::MOE::Activation_type _activation_type = ov::op::internal::MOE::Activation_type::SWIGLU; bool _has_shared_expert = false; @@ -766,6 +782,21 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { GPU_DEBUG_TRACE_DETAIL << "MOE_BATCHED_GEMV_THRESHOLD = " << batched_gemv_threshold << std::endl; } + // OTD relies on runtime weight streaming in oneDNN path. + _lru_expert_num = params.typed_desc()->_lru_expert_num; + if (_lru_expert_num > 0) { + _lru_cache = std::make_shared(_lru_expert_num); + } + if (_lru_expert_num > 0 && use_micro_gemm_prefill) { + use_micro_gemm_prefill = false; + GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt_impl(): force disable micro_gemm prefill in OTD mode, lru_expert_num=" << _lru_expert_num + << std::endl; + } + if (_lru_expert_num > 0 && use_grouped_gemm_prefill) { + use_grouped_gemm_prefill = false; + GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt_impl(): force disable grouped_gemm prefill in OTD mode, lru_expert_num=" << _lru_expert_num + << std::endl; + } // Don't change the order of stages auto routing_type = node.as().get_primitive()->_config.routing_type; if (routing_type == ov::op::internal::MOECompressed::RoutingType::SOFTMAX) { @@ -842,67 +873,72 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto& dnnl_weights = _dnnl_weights[j]; dnnl_weights.resize(3); dnnl_weights[0].ic = _hidden_size; - dnnl_weights[0].ic_group_size = ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[0]); + dnnl_weights[0].ic_group_size = + moe_fusion_wei_addr.scale[0] ? 
ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[0]) : _gate_up_group_size; dnnl_weights[0].oc = _intermediate_size; dnnl_weights[1].ic = _hidden_size; - dnnl_weights[1].ic_group_size = ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[1]); + dnnl_weights[1].ic_group_size = + moe_fusion_wei_addr.scale[1] ? ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[1]) : _gate_up_group_size; dnnl_weights[1].oc = _intermediate_size; dnnl_weights[2].ic = _intermediate_size; - dnnl_weights[2].ic_group_size = ic_group_size_from_scale(_intermediate_size, moe_fusion_wei_addr.scale[2]); + dnnl_weights[2].ic_group_size = + moe_fusion_wei_addr.scale[2] ? ic_group_size_from_scale(_intermediate_size, moe_fusion_wei_addr.scale[2]) : _down_group_size; dnnl_weights[2].oc = _hidden_size; - for (int i = 0; i < 3; i++) { - // Cross-check ic/ic_group_size against scale shape (drift caused u8 inf bug). - { - const auto& sshape = moe_fusion_wei_addr.scale[i]->get_layout().get_shape(); - const size_t scale_num_groups = (sshape.size() >= 3) ? 
sshape[2] : 1; - OPENVINO_ASSERT(dnnl_weights[i].ic_group_size > 0, "moe_3gemm GEMM ", i, " ic_group_size must be > 0"); - OPENVINO_ASSERT(dnnl_weights[i].ic % dnnl_weights[i].ic_group_size == 0, - "moe_3gemm GEMM ", - i, - " ic=", - dnnl_weights[i].ic, - " not divisible by ic_group_size=", - dnnl_weights[i].ic_group_size); - const auto expected_groups = dnnl_weights[i].ic / dnnl_weights[i].ic_group_size; - OPENVINO_ASSERT(static_cast(expected_groups) == scale_num_groups, - "moe_3gemm GEMM ", - i, - " ic_group_size=", - dnnl_weights[i].ic_group_size, - " (=> ", - expected_groups, - " groups) disagrees with scale num_groups=", - scale_num_groups, - " (scale shape=", - sshape, - ")"); - if (cur_moe->_config.has_zp && moe_fusion_wei_addr.zp[i]) { - const auto& zshape = moe_fusion_wei_addr.zp[i]->get_layout().get_shape(); - OPENVINO_ASSERT(zshape == sshape, "moe_3gemm GEMM ", i, " scale shape ", sshape, " does not match zp shape ", zshape); + if (!_lru_expert_num) { + for (int i = 0; i < 3; i++) { + // Cross-check ic/ic_group_size against scale shape (drift caused u8 inf bug). + { + const auto& sshape = moe_fusion_wei_addr.scale[i]->get_layout().get_shape(); + const size_t scale_num_groups = (sshape.size() >= 3) ? 
sshape[2] : 1; + OPENVINO_ASSERT(dnnl_weights[i].ic_group_size > 0, "moe_3gemm GEMM ", i, " ic_group_size must be > 0"); + OPENVINO_ASSERT(dnnl_weights[i].ic % dnnl_weights[i].ic_group_size == 0, + "moe_3gemm GEMM ", + i, + " ic=", + dnnl_weights[i].ic, + " not divisible by ic_group_size=", + dnnl_weights[i].ic_group_size); + const auto expected_groups = dnnl_weights[i].ic / dnnl_weights[i].ic_group_size; + OPENVINO_ASSERT(static_cast(expected_groups) == scale_num_groups, + "moe_3gemm GEMM ", + i, + " ic_group_size=", + dnnl_weights[i].ic_group_size, + " (=> ", + expected_groups, + " groups) disagrees with scale num_groups=", + scale_num_groups, + " (scale shape=", + sshape, + ")"); + if (cur_moe->_config.has_zp && moe_fusion_wei_addr.zp[i]) { + const auto& zshape = moe_fusion_wei_addr.zp[i]->get_layout().get_shape(); + OPENVINO_ASSERT(zshape == sshape, "moe_3gemm GEMM ", i, " scale shape ", sshape, " does not match zp shape ", zshape); + } + } + // weight shape: [ic, oc], type: u4/i8 + int64_t wei_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc, moe_fusion_wei_addr.weight[i]->get_layout()); + dnnl_weights[i].weight = + convert2dnnl(moe_fusion_wei_addr.weight[i], {dnnl_weights[i].ic, dnnl_weights[i].oc}, dnnl::memory::format_tag::ba, wei_offset); + + // scale shape: [ic / ic_group_size, oc], type: f16 + int64_t scale_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size, + moe_fusion_wei_addr.scale[i]->get_layout()); + dnnl_weights[i].scale = convert2dnnl(moe_fusion_wei_addr.scale[i], + {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc}, + dnnl::memory::format_tag::ab, + scale_offset); + + // zp shape: [ic / ic_group_size, oc], type: u4/i8 + // Skip ZP memory allocation for symmetric quantization (has_zp=false) to save memory + if (cur_moe->_config.has_zp) { + int64_t zp_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size, + 
moe_fusion_wei_addr.zp[i]->get_layout()); + dnnl_weights[i].zp = convert2dnnl(moe_fusion_wei_addr.zp[i], + {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc}, + dnnl::memory::format_tag::ab, + zp_offset); } - } - // weight shape: [ic, oc], type: u4/i8 - int64_t wei_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc, moe_fusion_wei_addr.weight[i]->get_layout()); - dnnl_weights[i].weight = - convert2dnnl(moe_fusion_wei_addr.weight[i], {dnnl_weights[i].ic, dnnl_weights[i].oc}, dnnl::memory::format_tag::ba, wei_offset); - - // scale shape: [ic / ic_group_size, oc], type: f16 - int64_t scale_offset = - j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size, moe_fusion_wei_addr.scale[i]->get_layout()); - dnnl_weights[i].scale = convert2dnnl(moe_fusion_wei_addr.scale[i], - {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc}, - dnnl::memory::format_tag::ab, - scale_offset); - - // zp shape: [ic / ic_group_size, oc], type: u4/i8 - // Skip ZP memory allocation for symmetric quantization (has_zp=false) to save memory - if (cur_moe->_config.has_zp) { - int64_t zp_offset = - j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size, moe_fusion_wei_addr.zp[i]->get_layout()); - dnnl_weights[i].zp = convert2dnnl(moe_fusion_wei_addr.zp[i], - {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc}, - dnnl::memory::format_tag::ab, - zp_offset); } } } @@ -1044,6 +1080,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cur_moe->_intermediate_size = _intermediate_size; cur_moe->_gate_up_group_size = _gate_up_group_size; cur_moe->_down_group_size = _down_group_size; + cur_moe->_lru_expert_num = _lru_expert_num; + cur_moe->_lru_cache = _lru_cache; // shared across clones within the same network cur_moe->use_micro_gemm_prefill = use_micro_gemm_prefill; cur_moe->use_gpu_mask_gen_prefill = use_gpu_mask_gen_prefill; 
cur_moe->use_grouped_gemm_prefill = use_grouped_gemm_prefill; @@ -1144,56 +1182,58 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { } } - // gate - scratch.moe_fusion_wei_addr.weight[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_0)); - scratch.moe_fusion_wei_addr.scale[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_0)); - scratch.moe_fusion_wei_addr.zp[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_0)); + if (!_lru_expert_num) { + // gate + scratch.moe_fusion_wei_addr.weight[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_0)); + scratch.moe_fusion_wei_addr.scale[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_0)); + scratch.moe_fusion_wei_addr.zp[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_0)); - // up - scratch.moe_fusion_wei_addr.weight[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_1)); - scratch.moe_fusion_wei_addr.scale[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)); - scratch.moe_fusion_wei_addr.zp[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)); + // up + scratch.moe_fusion_wei_addr.weight[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_1)); + scratch.moe_fusion_wei_addr.scale[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)); + scratch.moe_fusion_wei_addr.zp[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)); - // down - scratch.moe_fusion_wei_addr.weight[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_2)); - scratch.moe_fusion_wei_addr.scale[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_2)); - scratch.moe_fusion_wei_addr.zp[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_2)); - - // For symmetric quantization (has_zp=false), ZP inputs are element::dynamic placeholders - // with zero-count layout. 
Use scale memory as a dummy to avoid null pointer issues. - const auto& config = instance.get_typed_desc()->_config; - if (!config.has_zp) { - scratch.moe_fusion_wei_addr.zp[0] = scratch.moe_fusion_wei_addr.scale[0]; - scratch.moe_fusion_wei_addr.zp[1] = scratch.moe_fusion_wei_addr.scale[1]; - scratch.moe_fusion_wei_addr.zp[2] = scratch.moe_fusion_wei_addr.scale[2]; - } - - // shared expert - size_t dep_count = instance.dependencies().size(); - if (dep_count >= static_cast(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT) + 1) { - // Gate - scratch.moe_fusion_wei_addr.shared_weight[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_WEIGHT)); - scratch.moe_fusion_wei_addr.shared_scale[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_SCALE)); - scratch.moe_fusion_wei_addr.shared_zp[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_ZP)); - - // Up - scratch.moe_fusion_wei_addr.shared_weight[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_UP_WEIGHT)); - scratch.moe_fusion_wei_addr.shared_scale[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_UP_SCALE)); - scratch.moe_fusion_wei_addr.shared_zp[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_UP_ZP)); - - // Down - scratch.moe_fusion_wei_addr.shared_weight[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_DOWN_WEIGHT)); - scratch.moe_fusion_wei_addr.shared_scale[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_DOWN_SCALE)); - scratch.moe_fusion_wei_addr.shared_zp[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_DOWN_ZP)); - - // Scalar Gate - f16 - scratch.moe_fusion_wei_addr.shared_weight[3] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT)); - - // For symmetric quantization, shared expert ZPs are also element::dynamic placeholders + // down + scratch.moe_fusion_wei_addr.weight[2] = 
instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_2)); + scratch.moe_fusion_wei_addr.scale[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_2)); + scratch.moe_fusion_wei_addr.zp[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_2)); + + // For symmetric quantization (has_zp=false), ZP inputs are element::dynamic placeholders + // with zero-count layout. Use scale memory as a dummy to avoid null pointer issues. + const auto& config = instance.get_typed_desc()->_config; if (!config.has_zp) { - scratch.moe_fusion_wei_addr.shared_zp[0] = scratch.moe_fusion_wei_addr.shared_scale[0]; - scratch.moe_fusion_wei_addr.shared_zp[1] = scratch.moe_fusion_wei_addr.shared_scale[1]; - scratch.moe_fusion_wei_addr.shared_zp[2] = scratch.moe_fusion_wei_addr.shared_scale[2]; + scratch.moe_fusion_wei_addr.zp[0] = scratch.moe_fusion_wei_addr.scale[0]; + scratch.moe_fusion_wei_addr.zp[1] = scratch.moe_fusion_wei_addr.scale[1]; + scratch.moe_fusion_wei_addr.zp[2] = scratch.moe_fusion_wei_addr.scale[2]; + } + + // shared expert + size_t dep_count = instance.dependencies().size(); + if (dep_count >= static_cast(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT) + 1) { + // Gate + scratch.moe_fusion_wei_addr.shared_weight[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_WEIGHT)); + scratch.moe_fusion_wei_addr.shared_scale[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_SCALE)); + scratch.moe_fusion_wei_addr.shared_zp[0] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_ZP)); + + // Up + scratch.moe_fusion_wei_addr.shared_weight[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_UP_WEIGHT)); + scratch.moe_fusion_wei_addr.shared_scale[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_UP_SCALE)); + scratch.moe_fusion_wei_addr.shared_zp[1] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_UP_ZP)); + + // Down + 
scratch.moe_fusion_wei_addr.shared_weight[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_DOWN_WEIGHT)); + scratch.moe_fusion_wei_addr.shared_scale[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_DOWN_SCALE)); + scratch.moe_fusion_wei_addr.shared_zp[2] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_DOWN_ZP)); + + // Scalar Gate - f16 + scratch.moe_fusion_wei_addr.shared_weight[3] = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT)); + + // For symmetric quantization, shared expert ZPs are also element::dynamic placeholders + if (!config.has_zp) { + scratch.moe_fusion_wei_addr.shared_zp[0] = scratch.moe_fusion_wei_addr.shared_scale[0]; + scratch.moe_fusion_wei_addr.shared_zp[1] = scratch.moe_fusion_wei_addr.shared_scale[1]; + scratch.moe_fusion_wei_addr.shared_zp[2] = scratch.moe_fusion_wei_addr.shared_scale[2]; + } } } } @@ -1334,7 +1374,16 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cldnn::event::ptr exec_batched_gemv(const std::vector& events, typed_primitive_inst& instance, scratch_buffers& scratch, + LRUCache& cache, size_t token_num) { + auto& cur_net = instance.get_network(); + auto& stream = cur_net.get_stream(); + if (_lru_expert_num) { + // Full pipeline sync required: the routing kernel writes expert IDs + // to GPU memory that we read on the CPU below (buffer_ptr()). + // TODO: replace with event-based wait on the routing kernel only. + stream.finish(); + } auto cur_moe = instance.get_typed_desc(); int max_topk = static_cast(cur_moe->_config.top_k); @@ -1348,6 +1397,42 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { const size_t subgroup_size = instance.get_impl_params()->get_device_info().arch >= gpu_arch::xe2 ? 
32 : 16; const size_t max_work_group_size = instance.get_impl_params()->get_device_info().max_work_group_size; + if (_lru_expert_num) { + cldnn::moe_weights shell_params = instance._weights; + auto& engine = instance.get_network().get_engine(); + uint32_t* p_expert = (uint32_t*)batch_mem_ptr->buffer_ptr(); + std::vector experts_list; + for (int i = 0; i < max_topk; i++) { + experts_list.push_back(*p_expert++); + } + if (!scratch._index_initialized) { + size_t experts_index_size = 4 * max_topk; // each expert has 4 bytes + auto layout_expert = cldnn::layout({1, 1, 1, static_cast(experts_index_size)}, ov::element::i8, cldnn::format::bfyx); + // auto alloc_type = engine.get_preferred_memory_allocation_type(false); + scratch._expert_index_buffer = engine.allocate_memory(layout_expert, allocation_type::usm_host, false); + // instance._expert_index_buffer = engine.allocate_memory(layout_expert, alloc_type, false); + scratch._index_initialized = true; + } + uint32_t* p_expert_index = (uint32_t*)scratch._expert_index_buffer->buffer_ptr(); + for (int i = 0; i < max_topk; i++) { + auto expert_no = experts_list[i]; + auto lru_expert_no = moe_otd::get_lru_expert_no(instance, static_cast(expert_no), cache); + *p_expert_index++ = lru_expert_no; // update batch_mem_ptr as re-map + } + batch_mem_ptr = scratch._expert_index_buffer; + scratch.moe_fusion_wei_addr.weight[0] = shell_params.gate_w; + scratch.moe_fusion_wei_addr.scale[0] = shell_params.gate_s; + scratch.moe_fusion_wei_addr.zp[0] = shell_params.gate_z; + + scratch.moe_fusion_wei_addr.weight[1] = shell_params.up_w; + scratch.moe_fusion_wei_addr.scale[1] = shell_params.up_s; + scratch.moe_fusion_wei_addr.zp[1] = shell_params.up_z; + + scratch.moe_fusion_wei_addr.weight[2] = shell_params.down_w; + scratch.moe_fusion_wei_addr.scale[2] = shell_params.down_s; + scratch.moe_fusion_wei_addr.zp[2] = shell_params.down_z; + } + // gate const auto& mlp_gate_wei_mem = scratch.moe_fusion_wei_addr.weight[0]; const auto& 
mlp_gate_scale_mem = scratch.moe_fusion_wei_addr.scale[0]; @@ -1438,6 +1523,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cldnn::event::ptr exec_prefill_micro_gemm(const std::vector& events, typed_primitive_inst& instance, scratch_buffers& scratch, + LRUCache& cache, const bool use_gpu_mask_gen) { auto cur_moe = instance.get_typed_desc(); int max_topk = static_cast(cur_moe->_config.top_k); @@ -1463,6 +1549,36 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto num_total_experts = static_cast(cur_moe->_config.num_expert); int num_actually_used_experts = 0; + if (_lru_expert_num) { + auto& stream = instance.get_network().get_stream(); + auto& engine = instance.get_network().get_engine(); + auto topk_count = token_num * static_cast(max_topk); + auto topk_bytes = topk_count * sizeof(uint32_t); + + std::vector expert_ids(topk_count); + batch_mem_ptr->copy_to(stream, expert_ids.data(), 0, 0, topk_bytes, true); + + std::unordered_map expert_to_lru; + expert_to_lru.reserve(topk_count); + for (size_t i = 0; i < topk_count; i++) { + auto expert_no = expert_ids[i]; + OPENVINO_ASSERT(expert_no < static_cast(num_total_experts), "expert_no ", expert_no, " exceed max_expert_num ", num_total_experts); + auto it = expert_to_lru.find(expert_no); + if (it == expert_to_lru.end()) { + auto lru_expert_no = moe_otd::get_lru_expert_no(instance, expert_no, cache); + it = expert_to_lru.emplace(expert_no, lru_expert_no).first; + } + expert_ids[i] = it->second; + } + + auto remap_layout = batch_mem_ptr->get_layout(); + if (!scratch._expert_index_buffer || scratch._expert_index_buffer->size() < topk_bytes) { + scratch._expert_index_buffer = engine.allocate_memory(remap_layout, allocation_type::usm_host, false); + } + scratch._expert_index_buffer->copy_from(stream, expert_ids.data(), 0, 0, topk_bytes, true); + batch_mem_ptr = scratch._expert_index_buffer; + } + // step 1: generate 4 mask data for following kernel execution // input: topk output, [token_len, 
expert_topk] // output: @@ -1727,6 +1843,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { using lru_cache_hash = LruCache, std::shared_ptr, PairHash>; lru_cache_hash _kernels = lru_cache_hash(1024); + std::shared_ptr _otd_kernel_holder; // --- grouped GEMM kernel cache (one primitive set per total-token count) --- struct grouped_onednn_kernel { @@ -1806,6 +1923,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { dnnl_weights[2].weight, dnnl_weights[2].scale, dnnl_weights[2].zp); + // each time dnnl_weights updated need refresh kernel cache in OTD mode, if not, the stream engine context and memory storage engine context will + // mismatch, dnnl kernel will report invalid_arguments and fail or compute wrong and output wrong tokens. if any perf concerns, need deep dive here. + if (_lru_expert_num) { + _otd_kernel_holder = kernel; + return *_otd_kernel_holder; + } _kernels.add(key, kernel); return *_kernels.get(key); } @@ -1921,7 +2044,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { cldnn::event::ptr exec_prefill_onednn(const std::vector& events, cldnn::stream& stream, typed_primitive_inst& instance, - scratch_buffers& scratch) { + scratch_buffers& scratch, + LRUCache& cache) { auto cur_moe = instance.get_typed_desc(); const auto& config = cur_moe->_config; auto& dnn_stream = stream.get_onednn_stream(); @@ -1958,6 +2082,34 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { if (can_skip_subgraph) { continue; } + + if (_lru_expert_num) { + // Ensure previous oneDNN work is completed before any potential + // LRU slot overwrite in get_lru_expert_no/fill_weights_memory. 
+ dnn_stream.wait(); + + auto& dnnl_weights = _dnnl_weights[expert_no]; + auto lru_expert_no = moe_otd::get_lru_expert_no(instance, static_cast(expert_no), cache); + auto& params = instance._weights; + +# define CONVERT_DNNL(name, i) \ + int64_t wei_offset##i = lru_expert_no * dnnl_weights[i].ic * dnnl_weights[i].oc / 2; \ + int64_t scale_offset##i = lru_expert_no * dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size * 2; \ + int64_t zp_offset##i = lru_expert_no * dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size / 2; \ + dnnl_weights[i].weight = convert2dnnl(params.name##_w, {dnnl_weights[i].ic, dnnl_weights[i].oc}, dnnl::memory::format_tag::ba, wei_offset##i); \ + dnnl_weights[i].scale = convert2dnnl(params.name##_s, \ + {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc}, \ + dnnl::memory::format_tag::ab, \ + scale_offset##i); \ + dnnl_weights[i].zp = convert2dnnl(params.name##_z, \ + {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc}, \ + dnnl::memory::format_tag::ab, \ + zp_offset##i); + CONVERT_DNNL(gate, 0) + CONVERT_DNNL(up, 1) + CONVERT_DNNL(down, 2) +# undef CONVERT_DNNL + } auto& dnnl_weights = _dnnl_weights[expert_no]; // expert_mask @@ -1989,21 +2141,18 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { convert2dnnl(scratch.x, {static_cast(n_token), dnnl_weights[1].ic}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), dnnl::memory()); - // gate kernel.gate.forward(dnn_stream, n_token, convert2dnnl(scratch.x, {static_cast(n_token), dnnl_weights[0].ic}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.gate, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.up, {static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab)); - // down kernel.down.forward(dnn_stream, n_token, convert2dnnl(scratch.gate, 
{static_cast(n_token), _intermediate_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.y, {static_cast(n_token), _hidden_size}, dnnl::memory::format_tag::ab), convert2dnnl(scratch.routing_weights, {static_cast(routing_weights_size)}, dnnl::memory::format_tag::a)); - // index_add result_event = execute_stage({result_event}, instance, @@ -2268,6 +2417,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { const auto& config = cur_moe->_config; auto& cur_net = instance.get_network(); auto& stream = cur_net.get_stream(); + + OPENVINO_ASSERT(!_lru_expert_num || _lru_cache, "LRU cache not initialized for OTD mode"); + // When OTD is disabled (_lru_expert_num == 0) the cache reference is + // never dereferenced — every use site is guarded by `if (_lru_expert_num)`. + // Provide a stack-local dummy so that the reference is always valid. + LRUCache dummy_cache(0); + auto& cache = _lru_cache ? *_lru_cache : dummy_cache; + cldnn::event::ptr ret_env = nullptr; _has_shared_expert = (config.num_shared_expert > 0); @@ -2283,6 +2440,24 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast(MOE3GemmInputIndex::HIDDEN_STATES)); size_t token_num = get_seq_len(hidden_states_layout); + + if (_lru_expert_num) { + if (!cache.m_initialized) { + instance._weights.gate_w = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_0)); + instance._weights.gate_z = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_0)); + instance._weights.gate_s = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_0)); + + instance._weights.up_w = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_1)); + instance._weights.up_z = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_1)); + instance._weights.up_s = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_1)); + + instance._weights.down_w = 
instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::WEIGHT_2)); + instance._weights.down_z = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::ZP_2)); + instance._weights.down_s = instance.input_memory_ptr(static_cast(MOE3GemmInputIndex::SCALE_2)); + cache.m_initialized = true; + } + } + scratch_buffers scratch; prepare_internal_buffers(instance, scratch, token_num); kernel_dump_info.clear_entries(); @@ -2317,7 +2492,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { // Batched GEMV: for small token counts (including single token, MTP/speculative decoding), // use optimized GEMV kernels with batch dimension. Avoids gather/scatter overhead. if (token_num <= batched_gemv_threshold) { - return exec_batched_gemv({topk_event}, instance, scratch, token_num); + return exec_batched_gemv({topk_event}, instance, scratch, cache, token_num); } auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0); @@ -2340,11 +2515,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL { << std::endl; update_rt_params(instance); if (use_micro_gemm_prefill) { - ret_env = exec_prefill_micro_gemm({topk_event}, instance, scratch, use_gpu_mask_gen); + ret_env = exec_prefill_micro_gemm({topk_event}, instance, scratch, cache, use_gpu_mask_gen); } else if (use_grouped_gemm_prefill) { ret_env = exec_prefill_grouped_gemm({topk_event}, stream, instance, scratch); } else { - ret_env = exec_prefill_onednn({topk_event}, stream, instance, scratch); + ret_env = exec_prefill_onednn({topk_event}, stream, instance, scratch, cache); } if (_has_shared_expert) { diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp new file mode 100644 index 00000000000000..f0573e2acce5ba --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp @@ -0,0 +1,341 @@ +// Copyright (C) 2018-2026 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + 
+#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "LRUCache.hpp" +#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "moe_3gemm_fused_inst.h" +#include "openvino/util/parallel_io.hpp" + +namespace ov::intel_gpu::ocl::moe_otd { + +// Lightweight perf counters for OTD profiling. +// Enabled by setting MOE_OTD_PERF_LOG=1 environment variable. +// Counters are printed to stderr on process exit. +struct OtdPerfCounters { + std::atomic gpu_hits{0}; + std::atomic gpu_misses{0}; + std::atomic disk_io_ns{0}; + std::atomic transpose_ns{0}; + std::atomic gpu_copy_ns{0}; + std::atomic tensor_load_count{0}; // number of individual tensor loads (for averaging) + + void dump() const { + const auto hits = gpu_hits.load(std::memory_order_relaxed); + const auto misses = gpu_misses.load(std::memory_order_relaxed); + const auto total = hits + misses; + const auto loads = tensor_load_count.load(std::memory_order_relaxed); + std::cerr << "[OTD_PERF] gpu_hits=" << hits << ", gpu_misses=" << misses << ", gpu_hit_rate=" << (total > 0 ? 100.0 * hits / total : 0.0) << "%" + << ", tensor_loads=" << loads << ", avg_disk_io_us=" << (loads > 0 ? disk_io_ns.load(std::memory_order_relaxed) / 1000 / loads : 0) + << ", avg_transpose_us=" << (loads > 0 ? transpose_ns.load(std::memory_order_relaxed) / 1000 / loads : 0) + << ", avg_gpu_copy_us=" << (loads > 0 ? 
// Extracts the layer index encoded as the decimal suffix after the last '_' of a
// primitive id (e.g. "..._12" -> 12).
//
// @param id  Primitive id string.
// @return    The parsed layer index, or 0 when the id has no '_', when nothing
//            follows the last '_', when the suffix does not start with a digit
//            (this covers "moe:moe_router", whose suffix is "router"), or when
//            the number does not fit into size_t.
//
// The suffix is parsed by hand instead of with atoi(): atoi() has undefined
// behaviour for out-of-range input, and a negative result would be implicitly
// converted into a huge size_t layer index.
inline size_t get_layer_from_id(const std::string& id) {
    const size_t underscore_pos = id.rfind('_');
    if (underscore_pos == std::string::npos) {
        return 0;
    }

    constexpr size_t max_value = static_cast<size_t>(-1);
    size_t layer = 0;
    // Consume leading decimal digits only; parsing stops at the first non-digit,
    // so "12abc" still yields 12.
    for (size_t i = underscore_pos + 1; i < id.size(); ++i) {
        const char c = id[i];
        if (c < '0' || c > '9') {
            break;
        }
        const size_t digit = static_cast<size_t>(c - '0');
        // Reject values that would overflow size_t instead of wrapping around.
        if (layer > (max_value - digit) / 10) {
            return 0;
        }
        layer = layer * 10 + digit;
    }
    return layer;
}
return *reader; +} + +inline void maybe_transpose_scale_zp(const cldnn::moe_3gemm_fused_compressed& desc, + const char* tensor_name, + const cldnn::layout& layout, + std::vector& payload, + size_t per_expert_size) { + const bool transpose_scale_zp = std::getenv("MOE_OTD_DISABLE_SCALE_ZP_TRANSPOSE") == nullptr; + if (!transpose_scale_zp || tensor_name == nullptr) { + return; + } + + const std::string_view name(tensor_name); + const bool is_scale = name.find("_s") != std::string_view::npos; + const bool is_zp = name.find("_z") != std::string_view::npos; + if (!is_scale && !is_zp) { + return; + } + + size_t oc = 0; + size_t ic = 0; + if (name.rfind("down_", 0) == 0) { + oc = static_cast(desc._config.hidden_size); + ic = static_cast(desc._config.inter_size); + } else { + oc = static_cast(desc._config.inter_size); + ic = static_cast(desc._config.hidden_size); + } + + const size_t group_size = static_cast(desc._config.group_size); + size_t group_count = 1; + if (group_size != 0 && group_size != std::numeric_limits::max()) { + OPENVINO_ASSERT(ic % group_size == 0, "Invalid group_size for OTD transpose: tensor=", tensor_name, ", ic=", ic, ", group_size=", group_size); + group_count = ic / group_size; + } + + OPENVINO_ASSERT(oc > 0 && group_count > 0, "Invalid dims for OTD transpose: tensor=", tensor_name, ", oc=", oc, ", group_count=", group_count); + + const size_t elem_count = oc * group_count; + if (is_scale) { + const size_t elem_size = static_cast(data_type_traits::size_of(layout.data_type)); + OPENVINO_ASSERT(elem_size > 0, "Invalid scale element size for tensor=", tensor_name); + OPENVINO_ASSERT(elem_count * elem_size == per_expert_size, + "Unexpected scale payload size for tensor=", + tensor_name, + ", expected=", + elem_count * elem_size, + ", got=", + per_expert_size); + + std::vector transposed(per_expert_size, 0); + for (size_t o = 0; o < oc; o++) { + for (size_t g = 0; g < group_count; g++) { + const size_t src_elem_idx = o * group_count + g; + const size_t 
dst_elem_idx = g * oc + o; + std::memcpy(transposed.data() + dst_elem_idx * elem_size, payload.data() + src_elem_idx * elem_size, elem_size); + } + } + payload.swap(transposed); + return; + } + + OPENVINO_ASSERT(elem_count % 2 == 0, "Unexpected odd element count for packed zp tensor=", tensor_name, ", elem_count=", elem_count); + OPENVINO_ASSERT(elem_count / 2 == per_expert_size, + "Unexpected zp payload size for tensor=", + tensor_name, + ", expected=", + elem_count / 2, + ", got=", + per_expert_size); + + std::vector unpacked(elem_count, 0); + for (size_t i = 0; i < per_expert_size; i++) { + const uint8_t byte = payload[i]; + unpacked[2 * i] = static_cast(byte & 0x0F); + unpacked[2 * i + 1] = static_cast((byte >> 4) & 0x0F); + } + + std::vector transposed_unpacked(elem_count, 0); + for (size_t o = 0; o < oc; o++) { + for (size_t g = 0; g < group_count; g++) { + const size_t src_idx = o * group_count + g; + const size_t dst_idx = g * oc + o; + transposed_unpacked[dst_idx] = unpacked[src_idx]; + } + } + + std::vector repacked(per_expert_size, 0); + for (size_t i = 0; i < per_expert_size; i++) { + repacked[i] = static_cast((transposed_unpacked[2 * i] & 0x0F) | ((transposed_unpacked[2 * i + 1] & 0x0F) << 4)); + } + payload.swap(repacked); +} + +inline void fill_weights_memory(cldnn::stream& exec_stream, + const cldnn::moe_3gemm_fused_compressed& desc, + cldnn::moe_weights& wei_mem, + const std::vector& experts_list, + const std::vector& lru_experts, + size_t layer = 0) { + struct tensor_fill_plan { + size_t per_expert_size = 0; + size_t src_offset = 0; + size_t dst_offset = 0; + }; + + const auto num_expert = static_cast(desc._config.num_expert); + const auto& weight_bin_offsets = desc._weight_bin_offsets; + const auto& weights_path = desc._weights_path; + + OPENVINO_ASSERT(!weights_path.empty(), "weights path is empty for OTD weight loading"); + OPENVINO_ASSERT(weight_bin_offsets.size() == cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count, 
"Unexpected number of MOE weight offsets"); + + static const std::array tensor_names = { + {"gate_w", "up_w", "down_w", "gate_s", "up_s", "down_s", "gate_z", "up_z", "down_z"}}; + const std::array tensors_by_offset = { + {wei_mem.gate_w, wei_mem.up_w, wei_mem.down_w, wei_mem.gate_s, wei_mem.up_s, wei_mem.down_s, wei_mem.gate_z, wei_mem.up_z, wei_mem.down_z}}; + + auto* perf = get_perf_counters(); + + auto& weight_reader = get_thread_local_weight_reader(weights_path); + const size_t weight_file_size = weight_reader.file_size(); + + size_t index = 0; + for (uint32_t expert : experts_list) { + auto make_tensor_fill_plan = [&](size_t base_offset, cldnn::memory_ptr mem, size_t expert_no, size_t lru_expert_no, const char* tensor_name) { + tensor_fill_plan plan; + if (!mem) + return plan; + const auto total_bytes = mem->get_layout().bytes_count(); + OPENVINO_ASSERT(num_expert > 0, "Invalid expert count"); + plan.per_expert_size = total_bytes / num_expert; + plan.src_offset = base_offset + expert_no * plan.per_expert_size; + plan.dst_offset = lru_expert_no * plan.per_expert_size; + OPENVINO_ASSERT(plan.src_offset <= weight_file_size, "Invalid src_offset out of file: ", plan.src_offset, ", file_size=", weight_file_size); + OPENVINO_ASSERT(plan.per_expert_size <= weight_file_size - plan.src_offset, + "Read range out of file for tensor ", + tensor_name, + ": src_offset=", + plan.src_offset, + ", per_expert_size=", + plan.per_expert_size, + ", file_size=", + weight_file_size, + ", base_offset=", + base_offset, + ", expert=", + expert_no); + return plan; + }; + + for (size_t offset_pos = 0; offset_pos < static_cast(cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count); offset_pos++) { + auto plan = + make_tensor_fill_plan(weight_bin_offsets[offset_pos], tensors_by_offset[offset_pos], expert, lru_experts[index], tensor_names[offset_pos]); + std::vector payload; + + if (plan.per_expert_size != 0) { + payload.resize(plan.per_expert_size); + if (perf) { + auto t0 = 
std::chrono::steady_clock::now(); + weight_reader.read(reinterpret_cast(payload.data()), plan.per_expert_size, plan.src_offset); + auto t1 = std::chrono::steady_clock::now(); + perf->disk_io_ns.fetch_add(static_cast(std::chrono::duration_cast(t1 - t0).count()), + std::memory_order_relaxed); + } else { + weight_reader.read(reinterpret_cast(payload.data()), plan.per_expert_size, plan.src_offset); + } + } + + // Transpose + GPU copy + auto mem = tensors_by_offset[offset_pos]; + if (mem && plan.per_expert_size != 0) { + if (perf) { + auto t0 = std::chrono::steady_clock::now(); + maybe_transpose_scale_zp(desc, tensor_names[offset_pos], mem->get_layout(), payload, plan.per_expert_size); + auto t1 = std::chrono::steady_clock::now(); + mem->copy_from(exec_stream, payload.data(), 0, plan.dst_offset, plan.per_expert_size, true); + auto t2 = std::chrono::steady_clock::now(); + perf->transpose_ns.fetch_add(static_cast(std::chrono::duration_cast(t1 - t0).count()), + std::memory_order_relaxed); + perf->gpu_copy_ns.fetch_add(static_cast(std::chrono::duration_cast(t2 - t1).count()), + std::memory_order_relaxed); + perf->tensor_load_count.fetch_add(1, std::memory_order_relaxed); + } else { + maybe_transpose_scale_zp(desc, tensor_names[offset_pos], mem->get_layout(), payload, plan.per_expert_size); + mem->copy_from(exec_stream, payload.data(), 0, plan.dst_offset, plan.per_expert_size, true); + } + } + } + + index++; + } +} + +inline uint32_t get_lru_expert_no(typed_primitive_inst& instance, uint32_t expert, LRUCache& cache) { + auto cur_moe = instance.get_typed_desc(); + auto& stream = instance.get_network().get_stream(); + size_t layer = get_layer_from_id(cur_moe->id); + auto item = cache.get_lru_item(layer, expert); + OPENVINO_ASSERT(item.first <= static_cast(std::numeric_limits::max()), "LRU slot index overflow: ", item.first); + const auto lru_slot = static_cast(item.first); + + auto* perf = get_perf_counters(); + if (item.second) { + if (perf) + perf->gpu_hits.fetch_add(1, 
std::memory_order_relaxed); + } else { + if (perf) + perf->gpu_misses.fetch_add(1, std::memory_order_relaxed); + std::vector experts_list_single; + experts_list_single.push_back(expert); + std::vector lru_experts_list_single; + lru_experts_list_single.push_back(lru_slot); + fill_weights_memory(stream, *cur_moe, instance._weights, experts_list_single, lru_experts_list_single, layer); + cache.set_filled(lru_slot); + } + return lru_slot; +} + +} // namespace ov::intel_gpu::ocl::moe_otd \ No newline at end of file diff --git a/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h b/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h index f3b69f8773001f..3cdbb50cffc82b 100644 --- a/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h +++ b/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h @@ -47,6 +47,8 @@ class typed_primitive_inst : public typed_primitive_ static layout calc_output_layout(const moe_node& /* node */, const kernel_impl_params& impl_param); static std::string to_string(const moe_node& node); typed_primitive_inst(network& network, const moe_node& node); + cldnn::memory::ptr _base; + cldnn::moe_weights _weights; }; using moe_inst = typed_primitive_inst; diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 7503bd7cb12ebd..cc4416b8c723cb 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -18,7 +18,7 @@ #include "intel_gpu/runtime/compilation_context.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" #include "intel_gpu/runtime/itt.hpp" - +#include "openvino/util/env_util.hpp" #include "intel_gpu/graph/kernel_impl_params.hpp" #include "intel_gpu/graph/program.hpp" #include "intel_gpu/graph/network.hpp" @@ -30,6 +30,7 @@ #include "paged_attention_inst.h" #include "convolution_inst.h" #include "deconvolution_inst.h" +#include "moe_3gemm_fused_inst.h" #include "mutable_data_inst.h" #include 
"condition_inst.h" #include "read_value_inst.h" @@ -1169,7 +1170,9 @@ void network::transfer_memory_to_device(std::shared_ptr instance && users.front()->is_type() && users.front()->is_dynamic()) return; - + if (get_config().get_moe_offload_ratio() > 0 && node.have_user_with_type()) { + return; + } // Do not transfer memory if a user requires lockable memory. // If memory is used in both gpu and cpu implementations, primitive itself is responsible for correct allocation type if (node.need_lockable_memory()) diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp index f163eb3cb35f5c..1bf62a34bedcdf 100644 --- a/src/plugins/intel_gpu/src/graph/program.cpp +++ b/src/plugins/intel_gpu/src/graph/program.cpp @@ -1,6 +1,8 @@ // Copyright (C) 2018-2026 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // +#include +#include #include "intel_gpu/graph/fused_primitive_desc.hpp" #include "registry/implementation_manager.hpp" @@ -55,6 +57,7 @@ #include "arg_max_min_inst.h" #include "dft_inst.h" #include "multiclass_nms_inst.h" +#include "moe_3gemm_fused_inst.h" #include "mutable_data_inst.h" #include "pooling_inst.h" #include "border_inst.h" @@ -501,7 +504,6 @@ void program::build_program(bool is_internal) { { pre_optimize_graph(is_internal); } run_graph_compilation(); { post_optimize_graph(is_internal); } - #ifdef GPU_DEBUG_CONFIG if (get_config().get_dry_run_path().empty() || is_internal) { #else @@ -723,7 +725,7 @@ void program::transfer_memory_to_device() { // TODO: Do we need finish call here? Maybe call it in network::execute() ? 
get_stream().finish(); }; - + auto otd = _config.get_moe_offload_ratio(); for (auto& node : processing_order) { if (node->is_shape_infer_dep()) { continue; @@ -731,6 +733,10 @@ void program::transfer_memory_to_device() { if (node->is_type() && !node->need_lockable_memory()) { auto& data_node = node->as(); auto data_node_layout = data_node.get_output_layout(); + auto prim = data_node.get_primitive(); + if (otd && node->have_user_with_type()) { + continue; + } auto& mem = data_node.get_attached_memory(); auto mem_layout = mem.get_layout(); auto alloc_type = mem.get_allocation_type(); diff --git a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp index 626dee0b48634c..293db5ea5a8f19 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp @@ -26,10 +26,21 @@ #include "openvino/op/tensor_iterator.hpp" #include "openvino/op/bucketize.hpp" #include "openvino/op/matmul.hpp" +#include "openvino/op/moe.hpp" #include "openvino/op/util/binary_elementwise_bitwise.hpp" +#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp" +#include "ov_ops/moe_compressed.hpp" + #include "intel_gpu/primitives/data.hpp" #include "intel_gpu/runtime/debug_configuration.hpp" +#include "moe_offload_constant.hpp" + +#include +#include +#include +#include +#include namespace ov::intel_gpu { @@ -100,8 +111,17 @@ static void create_data(ProgramBuilder& p, const ov::Shape& const_shape, const s p.primitive_ids[initialconstPrimID] = constPrimID; p.profiling_ids.push_back(initialconstPrimID); } else { + auto partial_upload = moe_offload::try_prepare_partial_upload(p, op, const_shape, out_dtype, constFormat, constLayout); + cldnn::memory::ptr mem = nullptr; - if (constLayout.bytes_count() > 0) { + size_t upload_bytes = constLayout.bytes_count(); + ov::Shape upload_shape = const_shape; + + if (partial_upload.enabled) { + mem = partial_upload.memory; + upload_bytes = partial_upload.upload_bytes; + 
upload_shape = partial_upload.upload_shape; + } else if (constLayout.bytes_count() > 0) { mem = p.get_engine().allocate_memory(constLayout, false); } else { // In the case of empty const data with {0} shape, it has zero byte. @@ -114,37 +134,41 @@ static void create_data(ProgramBuilder& p, const ov::Shape& const_shape, const s GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant] layout: " << constLayout.to_short_string() << ", mem_ptr(" << mem << ", " << mem->size() << " bytes)"<< std::endl; auto& stream = p.get_engine().get_service_stream(); - cldnn::mem_lock lock{mem, stream}; - auto buf = lock.data(); - auto bufSize = constLayout.bytes_count(); - - // If a constant has element type f64 but contains no elements (empty tensor), - // convert it to f32 because the GPU plugin only supports the f32 data type internally. - if (ov::shape_size(const_shape) == 1 && - out_dtype == cldnn::data_types::f32 && - op->get_output_element_type(0) == ov::element::f64) { - const auto* f64data = op->get_data_ptr(); - auto f32buf = reinterpret_cast(buf); - f32buf[0] = static_cast(f64data[0]); - } else if (out_dtype == cldnn::data_types::f32 && - (op->get_output_element_type(0) == ov::element::u16 || - op->get_output_element_type(0) == ov::element::i16)) { - size_t count = ov::shape_size(const_shape); - auto f32buf = reinterpret_cast(buf); - - if (op->get_output_element_type(0) == ov::element::u16) { - const auto* u16data = op->get_data_ptr(); - for (size_t i = 0; i < count; i++) { - f32buf[i] = static_cast(u16data[i]); + + if (!partial_upload.enabled) { + cldnn::mem_lock lock{mem, stream}; + auto buf = lock.data(); + auto bufSize = upload_bytes; + auto upload_count = ov::shape_size(upload_shape); + + // If a constant has element type f64 but contains no elements (empty tensor), + // convert it to f32 because the GPU plugin only supports the f32 data type internally. 
+ if (upload_count == 1 && + out_dtype == cldnn::data_types::f32 && + op->get_output_element_type(0) == ov::element::f64) { + const auto* f64data = op->get_data_ptr(); + auto f32buf = reinterpret_cast(buf); + f32buf[0] = static_cast(f64data[0]); + } else if (out_dtype == cldnn::data_types::f32 && + (op->get_output_element_type(0) == ov::element::u16 || + op->get_output_element_type(0) == ov::element::i16)) { + size_t count = upload_count; + auto f32buf = reinterpret_cast(buf); + + if (op->get_output_element_type(0) == ov::element::u16) { + const auto* u16data = op->get_data_ptr(); + for (size_t i = 0; i < count; i++) { + f32buf[i] = static_cast(u16data[i]); + } + } else { + const auto* i16data = op->get_data_ptr(); + for (size_t i = 0; i < count; i++) { + f32buf[i] = static_cast(i16data[i]); + } } } else { - const auto* i16data = op->get_data_ptr(); - for (size_t i = 0; i < count; i++) { - f32buf[i] = static_cast(i16data[i]); - } + std::memcpy(&buf[0], &data[0], bufSize); } - } else { - std::memcpy(&buf[0], &data[0], bufSize); } ov::wsh::Extension::hint_evict(*op); p.add_primitive(*op, cldnn::data(initialconstPrimID, mem)); diff --git a/src/plugins/intel_gpu/src/plugin/ops/moe.cpp b/src/plugins/intel_gpu/src/plugin/ops/moe.cpp index b7b4dcd1d2d9b6..d001709ce5af2e 100644 --- a/src/plugins/intel_gpu/src/plugin/ops/moe.cpp +++ b/src/plugins/intel_gpu/src/plugin/ops/moe.cpp @@ -2,13 +2,22 @@ // SPDX-License-Identifier: Apache-2.0 // #include "openvino/op/moe.hpp" +#include "transformations/rt_info/fused_names_attribute.hpp" #include #include #include #include + +#include +#include +#include #include +#include +#include +#include +#include #include "ov_ops/moe_compressed.hpp" #include "intel_gpu/plugin/program_builder.hpp" #include "intel_gpu/op/moe_3gemm_fused_compressed.hpp" @@ -17,6 +26,8 @@ #include "intel_gpu/primitives/moe_gemm.hpp" #include "intel_gpu/primitives/moe_mask_gen.hpp" #include "openvino/op/constant.hpp" +#include "openvino/core/model.hpp" +#include 
"openvino/core/weight_sharing_util.hpp" namespace ov { namespace op { @@ -30,8 +41,346 @@ namespace ov::intel_gpu { using namespace cldnn; static void CreateMOE3GemmFusedCompressedOp(ProgramBuilder& p, const std::shared_ptr& op) { + using input_idx = cldnn::moe_3gemm_fused_compressed::input_index; auto inputs = p.GetInputInfo(op); const auto& config = op->get_config(); + const auto& model = p.get_model(); + std::string weights_path; + const size_t otd_ratio = p.get_config().get_moe_offload_ratio(); + const size_t lru_expert_num = otd_ratio > 0 ? std::max(1, static_cast(config.num_expert) * otd_ratio / 100) : 0; + const bool otd_enabled = lru_expert_num > 0; + if (otd_enabled) { + const auto& rt = model->get_rt_info(); + auto it = rt.find("__weights_path"); + OPENVINO_ASSERT(it != rt.end(), "Model rt_info is missing '__weights_path' required by OTD"); + weights_path = it->second.as(); + } + + struct XmlConstEntry { + size_t offset = 0; + size_t size = 0; + bool used = false; + }; + + std::unordered_map> xml_const_entries_by_name; + bool xml_offsets_ready = false; + + auto load_const_offsets_from_xml = [&]() { + if (xml_offsets_ready || weights_path.empty()) { + return; + } + + std::filesystem::path xml_path(weights_path); + xml_path.replace_extension(".xml"); + OPENVINO_ASSERT(std::filesystem::exists(xml_path), "IR xml file is not found: ", xml_path.string()); + + pugi::xml_document doc; + OPENVINO_ASSERT(doc.load_file(xml_path.string().c_str()), "Failed to parse IR xml file: ", xml_path.string()); + + auto net = doc.child("net"); + auto layers = net.child("layers"); + for (auto layer = layers.child("layer"); layer; layer = layer.next_sibling("layer")) { + const auto type_attr = layer.attribute("type"); + if (!type_attr || std::string(type_attr.value()) != "Const") { + continue; + } + + const auto data = layer.child("data"); + const auto name_attr = layer.attribute("name"); + const auto offset_attr = data.attribute("offset"); + const auto size_attr = 
data.attribute("size");
+            if (!data || !name_attr || !offset_attr || !size_attr) {
+                continue;
+            }
+
+            XmlConstEntry entry;
+            try {
+                entry.offset = static_cast(std::stoull(offset_attr.value()));
+                entry.size = static_cast(std::stoull(size_attr.value()));
+            } catch (const std::exception& e) {
+                OPENVINO_THROW("Failed to parse MOE weight offset/size from XML attribute: ", e.what(),
+                               " (name=", name_attr.value(), ", offset='", offset_attr.value(),
+                               "', size='", size_attr.value(), "')");
+            }
+            xml_const_entries_by_name[name_attr.value()].push_back(entry);
+        }
+
+        xml_offsets_ready = true;
+    };
+
+    // Extract a layer-scoping pattern from the MOE op name (e.g., "layers.0.experts")
+    // to disambiguate same-sized constants across different layers.
+    // Returns an empty string when the name has no "layers." prefix or no ".experts" / "/experts" after it.
+    auto extract_layer_pattern = [](const std::string& moe_name) -> std::string {
+        // Match patterns like "layers.N.experts" or "layers.NN.experts"
+        auto pos = moe_name.find("layers.");
+        if (pos == std::string::npos) return {};  // no "layers." in the name: empty pattern
+        auto end = moe_name.find(".experts", pos);
+        if (end == std::string::npos) {
+            end = moe_name.find("/experts", pos);  // fall back to the '/'-separated spelling
+            if (end == std::string::npos) return {};
+            return moe_name.substr(pos, end - pos + 8);  // include "/experts" (8 characters)
+        }
+        return moe_name.substr(pos, end - pos + 8);  // include ".experts" (8 characters)
+    };
+
+    // Projection-identifying keywords for each offset slot (3 projections × {weight, scale, zp}).
+    // Slot layout: [weight_0, weight_1, weight_2, scale_0, scale_1, scale_2, zp_0, zp_1, zp_2]
+    // Projection 0: gate (VariadicSplit.0), Projection 1: up (VariadicSplit.1), Projection 2: down (down_proj)
+    struct ProjHint {
+        std::vector patterns;  // candidate substrings in the XML name, tried in the order listed
+        std::vector suffixes;  // suffix patterns (e.g., "/scale", "/zero_point"); empty for weight slots
+    };
+    auto get_proj_hint = [](size_t offset_slot) -> ProjHint {
+        // offset_slot 0-2: weights, 3-5: scales, 6-8: zps
+        size_t proj_idx = offset_slot % 3;  // position inside each group of three: 0 = gate, 1 = up, 2 = down
+        ProjHint hint;
+        if (proj_idx == 0) {
+            hint.patterns = {"VariadicSplit.0", "gate_proj", "gate"};
+        } else if (proj_idx == 1) {
+            hint.patterns = {"VariadicSplit.1", "up_proj", "up"};
+        } else {
+            hint.patterns = {"down_proj", "VariadicSplit.2"};
+        }
+        if (offset_slot < 3) {
+            // weight: no suffix or just the base name
+            hint.suffixes = {};
+        } else if (offset_slot < 6) {
+            hint.suffixes = {"/scale"};
+        } else {
+            hint.suffixes = {"/zero_point"};
+        }
+        return hint;
+    };
+
+    auto get_const_offset = [&](size_t index, size_t offset_slot) -> size_t {  // index: MOE input index of the constant; offset_slot: 0-8 position in the slot layout above
+        auto node = op->input_value(index).get_node_shared_ptr();
+        auto const_op = std::dynamic_pointer_cast(node);  // NOTE(review): cast target type is not visible in this copy of the patch (text inside <...> appears stripped)
+        OPENVINO_ASSERT(const_op != nullptr, "Expected constant input for MOE3GemmFusedCompressed");
+        const auto& rt_info = const_op->get_rt_info();
+        auto attr_it = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+
+        if (attr_it != rt_info.end()) {
+            return attr_it->second.as().bin_offset;  // first choice: offset recorded in the constant's rt_info
+        }
+
+        // Try buffer descriptor offset (works when constant data is mmap'd from bin file).
+        auto source_buf = ov::weight_sharing::Extension::get_constant_source_buffer(*const_op);
+        if (source_buf) {
+            return ov::weight_sharing::Extension::get_constant_id(*const_op);  // NOTE(review): the constant id is returned as the offset here - confirm the two are equivalent
+        }
+
+        load_const_offsets_from_xml();
+        OPENVINO_ASSERT(xml_offsets_ready,
+                        "Missing WeightlessCacheAttribute and failed to initialize xml-based offset lookup for "
+                        "MOE3GemmFusedCompressed constant input");
+
+        auto resolve_from_name = [&](const std::string& lookup_name,  // claims the single unused entry stored under lookup_name whose size matches
+                                     const std::string& const_name,  // only used in the error message
+                                     size_t expected_size,
+                                     size_t& resolved_offset) -> bool {  // resolved_offset is written only when true is returned
+            auto by_name_it = xml_const_entries_by_name.find(lookup_name);
+            if (by_name_it == xml_const_entries_by_name.end()) {
+                return false;
+            }
+
+            size_t match_count = 0;
+            XmlConstEntry* matched_entry = nullptr;
+            for (auto& entry : by_name_it->second) {
+                if (!entry.used && entry.size == expected_size) {
+                    match_count++;
+                    if (matched_entry == nullptr) {
+                        matched_entry = &entry;
+                    }
+                }
+            }
+
+            if (match_count == 1 && matched_entry != nullptr) {
+                matched_entry->used = true;  // mark consumed so later lookups skip this entry
+                resolved_offset = matched_entry->offset;
+                return true;
+            }
+
+            if (match_count > 1) {
+                OPENVINO_THROW("Ambiguous xml offset resolution for MOE3GemmFusedCompressed constant input: ",
+                               const_name,
+                               ", lookup_name=", lookup_name,
+                               ", byte_size=", expected_size,
+                               ", candidates=", match_count);
+            }
+
+            return false;
+        };
+
+        const auto& name = const_op->get_friendly_name();
+        const size_t expected_size = const_op->get_byte_size();
+        size_t resolved_offset = 0;
+        if (resolve_from_name(name, name, expected_size, resolved_offset)) {
+            return resolved_offset;
+        }
+
+        // Try original/fused names before using any size-based fallback.
+        std::set fused_names_unique;
+        for (const auto& fused_name : ov::getFusedNamesVector(const_op)) {
+            if (!fused_name.empty() && fused_name != name) {
+                fused_names_unique.insert(fused_name);
+            }
+        }
+        for (const auto& fused_name : fused_names_unique) {
+            if (resolve_from_name(fused_name, name, expected_size, resolved_offset)) {
+                return resolved_offset;
+            }
+        }
+
+        // Fallback: allow by-size only when there is exactly one unused candidate.
+        struct SizeCandidate {
+            std::string name;
+            XmlConstEntry* entry = nullptr;
+        };
+        std::vector size_candidates;
+        for (auto& kv : xml_const_entries_by_name) {
+            for (auto& entry : kv.second) {
+                if (!entry.used && entry.size == expected_size) {
+                    size_candidates.push_back(SizeCandidate{kv.first, &entry});
+                }
+            }
+        }
+
+        if (size_candidates.size() == 1 && size_candidates[0].entry != nullptr) {
+            size_candidates[0].entry->used = true;
+            return size_candidates[0].entry->offset;
+        }
+
+        // Layer+projection scoped resolution: use the MOE node name to identify the layer,
+        // and the offset slot to identify the projection role.
+        if (size_candidates.size() > 1) {
+            const auto& moe_name = op->get_friendly_name();
+            std::string layer_pat = extract_layer_pattern(moe_name);
+            ProjHint hint = get_proj_hint(offset_slot);
+
+            // Filter candidates by layer pattern
+            std::vector layer_filtered;
+            if (!layer_pat.empty()) {
+                for (auto& sc : size_candidates) {
+                    if (sc.name.find(layer_pat) != std::string::npos) {
+                        layer_filtered.push_back(sc);
+                    }
+                }
+            }
+
+            // If layer filtering narrowed it to one, use it
+            if (layer_filtered.size() == 1 && layer_filtered[0].entry != nullptr) {
+                layer_filtered[0].entry->used = true;
+                return layer_filtered[0].entry->offset;
+            }
+
+            // Further filter by projection hint patterns
+            auto& search_pool = layer_filtered.empty() ? size_candidates : layer_filtered;  // use all size matches when the layer filter found nothing
+            for (const auto& pat : hint.patterns) {
+                std::vector proj_filtered;
+                for (auto& sc : search_pool) {
+                    if (sc.name.find(pat) != std::string::npos) {
+                        proj_filtered.push_back(sc);
+                    }
+                }
+                if (proj_filtered.size() == 1 && proj_filtered[0].entry != nullptr) {
+                    proj_filtered[0].entry->used = true;
+                    return proj_filtered[0].entry->offset;
+                }
+                // If pattern+suffix narrows further
+                if (proj_filtered.size() > 1 && !hint.suffixes.empty()) {
+                    for (const auto& suffix : hint.suffixes) {
+                        std::vector suffix_filtered;
+                        for (auto& sc : proj_filtered) {
+                            // Check if name ends with the suffix
+                            if (sc.name.size() >= suffix.size() &&
+                                sc.name.compare(sc.name.size() - suffix.size(), suffix.size(), suffix) == 0) {
+                                suffix_filtered.push_back(sc);
+                            }
+                        }
+                        if (suffix_filtered.size() == 1 && suffix_filtered[0].entry != nullptr) {
+                            suffix_filtered[0].entry->used = true;
+                            return suffix_filtered[0].entry->offset;
+                        }
+                    }
+                }
+            }
+
+            // Also try suffix-only filtering (for weight slots where suffix is empty,
+            // the weight is the entry that does NOT end with /scale or /zero_point)
+            if (offset_slot < 3 && !search_pool.empty()) {
+                std::vector weight_filtered;
+                for (auto& sc : search_pool) {
+                    bool is_scale_or_zp = (sc.name.find("/scale") != std::string::npos) ||
+                                          (sc.name.find("/zero_point") != std::string::npos);  // NOTE(review): matches the substring anywhere in the name, not only at the end as the comment above says
+                    if (!is_scale_or_zp) {
+                        weight_filtered.push_back(sc);
+                    }
+                }
+                if (weight_filtered.size() == 1 && weight_filtered[0].entry != nullptr) {
+                    weight_filtered[0].entry->used = true;
+                    return weight_filtered[0].entry->offset;
+                }
+                // Further filter weights by projection pattern
+                for (const auto& pat : hint.patterns) {
+                    std::vector proj_wt_filtered;
+                    for (auto& sc : weight_filtered) {
+                        if (sc.name.find(pat) != std::string::npos) {
+                            proj_wt_filtered.push_back(sc);
+                        }
+                    }
+                    if (proj_wt_filtered.size() == 1 && proj_wt_filtered[0].entry != nullptr) {
+                        proj_wt_filtered[0].entry->used = true;
+                        return proj_wt_filtered[0].entry->offset;
+                    }
+                }
+            }
+
+            std::ostringstream oss;
+            const size_t max_candidates_to_log = 8;
+            for (size_t i = 0; i < std::min(max_candidates_to_log, size_candidates.size()); i++) {  // list at most 8 candidates in the error text
+                const auto* candidate_entry = size_candidates[i].entry;
+                if (candidate_entry == nullptr) {
+                    continue;
+                }
+                if (i > 0) {
+                    oss << ';';
+                }
+                oss << size_candidates[i].name << '@' << candidate_entry->offset;
+            }
+
+            OPENVINO_THROW("Ambiguous xml offset resolution for MOE3GemmFusedCompressed constant input: ",
+                           name,
+                           ", byte_size=", expected_size,
+                           ", size_candidates=", size_candidates.size(),
+                           ", layer_pat=", layer_pat,
+                           ", offset_slot=", offset_slot,
+                           ", moe_name=", moe_name,
+                           ", sample_candidates=", oss.str());
+        }
+
+        OPENVINO_THROW("Unable to resolve xml offset for MOE3GemmFusedCompressed constant input: ", name,
+                       ", byte_size=", expected_size);  // reached when no unused entry has the expected byte size
+    };
+
+    const std::array const_input_idx_by_offset = {  // MOE input index for each offset slot, in slot-layout order
+        static_cast(input_idx::weight_0),
+        static_cast(input_idx::weight_1),
+        static_cast(input_idx::weight_2),
+        static_cast(input_idx::scale_0),
+        static_cast(input_idx::scale_1),
+        static_cast(input_idx::scale_2),
+        static_cast(input_idx::zp_0),
+        static_cast(input_idx::zp_1),
+        static_cast(input_idx::zp_2)
+    };
+
+    std::vector weight_bin_offsets(cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count, 0);  // zero-initialised, one entry per serialized offset
+    // Serialized offsets are only needed for OTD path (weight-on-demand loading).
+    if (otd_enabled) {
+        for (size_t i = 0; i < const_input_idx_by_offset.size(); i++) {
+            weight_bin_offsets[i] = get_const_offset(const_input_idx_by_offset[i], i);
+        }
+    }
     /// 0: hidden_states - input tensor with hidden representations
     /// 1: routing_weights - [num_seq, num_experts] routing weights for all experts
     /// 2: w0_weight - expert weights for first projection,
@@ -84,7 +433,7 @@ static void CreateMOE3GemmFusedCompressedOp(ProgramBuilder& p, const std::shared
     validate_inputs_count(op, {expected_inputs});
     const std::string layerName = layer_type_name_ID(op);
-    const cldnn::moe_3gemm_fused_compressed moe(layerName, inputs, config);
+    const cldnn::moe_3gemm_fused_compressed moe(layerName, inputs, config, weight_bin_offsets, weights_path, lru_expert_num);
     p.add_primitive(*op, moe);
 }
diff --git a/src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp b/src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp
new file mode 100644
index 00000000000000..717d24e572e843
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp
@@ -0,0 +1,118 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
+#include "intel_gpu/plugin/program_builder.hpp"
+#include "openvino/op/constant.hpp"
+
+namespace ov::intel_gpu::moe_offload {
+
+struct partial_upload_desc {  // result of try_prepare_partial_upload(); the defaults mean "no partial upload"
+    bool enabled = false;  // set to true once a reduced allocation has been prepared
+    cldnn::memory::ptr memory = nullptr;  // buffer allocated with the reduced shape, then reinterpreted with the full constant layout
+    ov::Shape upload_shape;  // constant shape with dimension 0 capped to the number of experts kept
+    size_t upload_bytes = 0;  // byte count of the reduced layout
+};
+
+inline bool is_moe_related_constant(const std::shared_ptr& op) {  // true when any consumer of output 0 passes the ov::is_type check below
+    const auto users = op->get_output_target_inputs(0);
+    for (const auto& input : users) {
+        const auto* node = input.get_node();
+        if (ov::is_type(node)) {  // NOTE(review): checked type is not visible in this copy (text inside <...> appears stripped); presumably MOE3GemmFusedCompressed - confirm
+            return true;
+        }
+    }
+    return false;
+}
+
+class partial_upload_log_state {  // counts partial uploads and prints the first max_detailed_logs of them to std::cout
+public:
+    static constexpr size_t max_detailed_logs = 3;
+
+    void log(const std::string& node_name,  // records one partial upload; prints details only for the first max_detailed_logs calls
+             size_t uploaded_experts,
+             size_t total_experts,
+             size_t upload_bytes,
+             size_t target_bytes) {
+        total_upload_bytes.fetch_add(static_cast(upload_bytes), std::memory_order_relaxed);
+        total_target_bytes.fetch_add(static_cast(target_bytes), std::memory_order_relaxed);
+
+        const size_t total = total_count.fetch_add(1, std::memory_order_relaxed) + 1;  // 1-based index of this call
+        if (total <= max_detailed_logs) {
+            std::cout << "MOE OTD partial constant allocation at compile stage: "
+                      << node_name << ", experts=" << uploaded_experts
+                      << "/" << total_experts << ", upload_bytes=" << upload_bytes
+                      << ", target_bytes=" << target_bytes << std::endl;
+        } else if (total == max_detailed_logs + 1) {  // printed once, on the first suppressed call
+            std::cout << "MOE OTD partial constant allocation: suppressing further per-constant logs, "
+                      << "final summary will be printed at process exit" << std::endl;
+        }
+    }
+
+    ~partial_upload_log_state() {  // prints the summary only when some per-constant logs were suppressed
+        const size_t total = total_count.load(std::memory_order_relaxed);
+        if (total > max_detailed_logs) {
+            const size_t shown = max_detailed_logs;
+            std::cout << "MOE OTD partial constant allocation summary: total=" << total
+                      << ", shown=" << shown << ", suppressed=" << (total - shown)
+                      << ", total_upload_bytes=" << total_upload_bytes.load(std::memory_order_relaxed)
+                      << ", total_target_bytes=" << total_target_bytes.load(std::memory_order_relaxed)
+                      << std::endl;
+        }
+    }
+
+private:
+    std::atomic total_count{0};  // number of log() calls so far
+    std::atomic total_upload_bytes{0};  // sum of upload_bytes over all log() calls
+    std::atomic total_target_bytes{0};  // sum of target_bytes over all log() calls
+};
+
+inline partial_upload_log_state& get_partial_upload_log_state() {  // returns the function-local static instance
+    static partial_upload_log_state state;
+    return state;
+}
+
+inline partial_upload_desc try_prepare_partial_upload(ProgramBuilder& p,  // returns a default (disabled) desc unless the checks below pass
+                                                      const std::shared_ptr& op,
+                                                      const ov::Shape& const_shape,
+                                                      cldnn::data_types out_dtype,
+                                                      const cldnn::format& const_format,
+                                                      const cldnn::layout& const_layout) {
+    partial_upload_desc desc;
+
+    const size_t otd_ratio = p.get_config().get_moe_offload_ratio();
+    const bool partial_moe_const_upload = otd_ratio > 0 && is_moe_related_constant(op);
+    if (!partial_moe_const_upload || const_layout.bytes_count() == 0 || const_shape.empty() || const_shape[0] == 0) {
+        return desc;
+    }
+
+    const size_t otd_expert_num = std::max(1, const_shape[0] * otd_ratio / 100);  // ratio is applied as a percentage of dimension 0, with a floor of 1
+
+    desc.enabled = true;
+    desc.upload_shape = const_shape;
+    desc.upload_shape[0] = std::min(const_shape[0], otd_expert_num);  // never more than dimension 0 of the constant
+
+    auto upload_layout = cldnn::layout(desc.upload_shape, out_dtype, const_format);
+    auto upload_mem = p.get_engine().allocate_memory(upload_layout, false);  // presumably `false` skips clearing the new buffer - confirm against engine::allocate_memory
+    desc.memory = p.get_engine().reinterpret_buffer(*upload_mem, const_layout);  // NOTE(review): the reduced allocation is exposed under the full constant layout; assumes consumers only touch the uploaded part - confirm
+    desc.upload_bytes = upload_layout.bytes_count();
+
+    get_partial_upload_log_state().log(op->get_friendly_name(),
+                                       desc.upload_shape[0],
+                                       const_shape[0],
+                                       desc.upload_bytes,
+                                       const_layout.bytes_count());
+    return desc;
+}
+
+}  // namespace ov::intel_gpu::moe_offload
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 0e1beab36f96a6..7685bbef460990 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -189,7 +189,6 @@ std::shared_ptr Plugin::clone_and_transform_model(const std::shared_p
         if (weight_path.extension() != ".bin" && !is_weightless_cache_attributes_set(cloned_model))
             set_weightless_cache_attributes(cloned_model);
     }
-
     transform_model(cloned_model, config_copy, context);
     // Transformations for some reason may drop output tensor names, so here we copy those from the original model
@@ -757,6 +756,7 @@ std::vector Plugin::get_supported_properties() const {
         ov::PropertyName{ov::cache_encryption_callbacks.name(), PropertyMutability::WO},
         ov::PropertyName{ov::hint::kv_cache_precision.name(), PropertyMutability::RW},
         ov::PropertyName{ov::hint::model.name(), PropertyMutability::WO},
+        ov::PropertyName{ov::intel_gpu::moe_offload_ratio.name(), PropertyMutability::RW},
ov::PropertyName{ov::intel_gpu::config_file.name(), PropertyMutability::RW},
     };
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp
new file mode 100644
index 00000000000000..a3f9e6fb64fc42
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp
@@ -0,0 +1,455 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils/test_utils.h"
+#include "intel_gpu/runtime/internal_properties.hpp"
+#include "ocl_v2/moe/LRUCache.hpp"
+
+#include
+#include
+#include
+#include
+
+using namespace cldnn;
+using namespace tests;
+
+// ──────────────────────────────────────────────────
+// Basic construction and initial state
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, initial_state) {
+    LRUCache cache(4);
+
+    ASSERT_EQ(cache.size(), 0U);
+    ASSERT_EQ(cache.get_total_experts(), 0U);
+    ASSERT_FALSE(cache.m_initialized);
+}
+
+// ──────────────────────────────────────────────────
+// get_lru_item: insert (miss) and hit
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, single_insert_is_miss) {
+    LRUCache cache(4);
+
+    auto [slot, hit] = cache.get_lru_item(/*layer=*/0, /*expert=*/0);
+    ASSERT_FALSE(hit);  // first access is a miss
+    ASSERT_EQ(slot, 0U);  // first slot assigned
+    ASSERT_EQ(cache.size(), 1U);
+}
+
+TEST(moe_lru_cache, second_access_without_fill_is_miss) {
+    // get_lru_item returns is_hit based on m_filled_list, not just presence
+    LRUCache cache(4);
+
+    cache.get_lru_item(0, 0);  // insert, slot 0
+    auto [slot, hit] = cache.get_lru_item(0, 0);  // access again
+
+    ASSERT_EQ(slot, 0U);
+    ASSERT_FALSE(hit);  // filled_list[0] is still false
+}
+
+TEST(moe_lru_cache, access_after_set_filled_is_hit) {
+    LRUCache cache(4);
+
+    auto [slot, miss] = cache.get_lru_item(0, 0);  // second binding is the hit flag; it is not checked here
+    cache.set_filled(slot);
+
+    auto [slot2, hit] = cache.get_lru_item(0, 0);
+    ASSERT_EQ(slot2, slot);
+    ASSERT_TRUE(hit);
+}
+
+// ──────────────────────────────────────────────────
+// Sequential slot allocation
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, slots_assigned_sequentially) {
+    const size_t cap = 4;
+    LRUCache cache(cap);
+
+    for (size_t i = 0; i < cap; ++i) {
+        auto [slot, hit] = cache.get_lru_item(/*layer=*/0, /*expert=*/i);
+        ASSERT_EQ(slot, i);
+        ASSERT_FALSE(hit);
+    }
+    ASSERT_EQ(cache.size(), cap);
+}
+
+// ──────────────────────────────────────────────────
+// Eviction: oldest entry evicted when cache is full
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, eviction_when_full) {
+    LRUCache cache(3);
+
+    // Fill cache: experts 0, 1, 2
+    cache.get_lru_item(0, 0);  // slot 0
+    cache.get_lru_item(0, 1);  // slot 1
+    cache.get_lru_item(0, 2);  // slot 2
+    ASSERT_EQ(cache.size(), 3U);
+
+    // Insert expert 3 → evicts expert 0 (oldest), reuses slot 0
+    auto [slot, hit] = cache.get_lru_item(0, 3);
+    ASSERT_FALSE(hit);
+    ASSERT_EQ(slot, 0U);  // expert 0's slot is recycled
+    ASSERT_EQ(cache.size(), 3U);  // size stays at capacity
+}
+
+TEST(moe_lru_cache, evicted_entry_becomes_miss) {
+    LRUCache cache(2);
+
+    // Fill: expert 0 (slot 0), expert 1 (slot 1)
+    cache.get_lru_item(0, 0);
+    cache.set_filled(0);
+    cache.get_lru_item(0, 1);
+    cache.set_filled(1);
+
+    // Insert expert 2 → evicts expert 0
+    cache.get_lru_item(0, 2);
+
+    // Access expert 0 again → should be a miss (it was evicted)
+    auto [slot, hit] = cache.get_lru_item(0, 0);
+    ASSERT_FALSE(hit);
+}
+
+// ──────────────────────────────────────────────────
+// LRU ordering: recently accessed items survive eviction
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, lru_order_refresh_on_access) {
+    LRUCache cache(3);
+
+    // Insert experts 0, 1, 2 in order
+    cache.get_lru_item(0, 0);  // slot 0 (LRU order: 0)
+    cache.get_lru_item(0, 1);  // slot 1 (LRU order: 0, 1)
+    cache.get_lru_item(0, 2);  // slot 2 (LRU order: 0, 1, 2)
+
+    // Access expert 0 → moves to most recent
+    // (LRU order: 1, 2, 0)
+    cache.get_lru_item(0, 0);
+
+    // Insert expert 3 → should evict expert 1 (now the oldest)
+    auto [slot, hit] = cache.get_lru_item(0, 3);
+    ASSERT_FALSE(hit);
+    ASSERT_EQ(slot, 1U);  // reuses expert 1's slot
+}
+
+TEST(moe_lru_cache, double_refresh_changes_eviction_order) {
+    LRUCache cache(3);
+
+    // Insert 0, 1, 2
+    cache.get_lru_item(0, 0);  // slot 0
+    cache.get_lru_item(0, 1);  // slot 1
+    cache.get_lru_item(0, 2);  // slot 2
+
+    // Refresh 0, then refresh 1 → LRU order: 2, 0, 1
+    cache.get_lru_item(0, 0);
+    cache.get_lru_item(0, 1);
+
+    // Insert 3 → evicts expert 2 (oldest)
+    auto [slot3, hit3] = cache.get_lru_item(0, 3);
+    ASSERT_FALSE(hit3);
+    ASSERT_EQ(slot3, 2U);  // reuses expert 2's slot
+
+    // Insert 4 → evicts expert 0 (now oldest)
+    auto [slot4, hit4] = cache.get_lru_item(0, 4);
+    ASSERT_FALSE(hit4);
+    ASSERT_EQ(slot4, 0U);  // reuses expert 0's slot
+}
+
+// ──────────────────────────────────────────────────
+// Multi-layer: (layer, expert) pairs are independent
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, multi_layer_keys_are_independent) {
+    LRUCache cache(4);
+
+    auto [s0, h0] = cache.get_lru_item(/*layer=*/0, /*expert=*/0);
+    auto [s1, h1] = cache.get_lru_item(/*layer=*/1, /*expert=*/0);
+
+    ASSERT_FALSE(h0);
+    ASSERT_FALSE(h1);
+    ASSERT_NE(s0, s1);  // different slots for different layers
+    ASSERT_EQ(cache.size(), 2U);
+}
+
+TEST(moe_lru_cache, same_expert_different_layers_both_cached) {
+    LRUCache cache(4);
+
+    cache.get_lru_item(0, 5);
+    cache.set_filled(0);  // relies on the first insert landing in slot 0
+    cache.get_lru_item(1, 5);
+    cache.set_filled(1);  // relies on the second insert landing in slot 1
+
+    auto [s0, h0] = cache.get_lru_item(0, 5);
+    auto [s1, h1] = cache.get_lru_item(1, 5);
+
+    ASSERT_TRUE(h0);
+    ASSERT_TRUE(h1);
+    ASSERT_NE(s0, s1);
+}
+
+// ──────────────────────────────────────────────────
+// set_filled / filled tracking
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, filled_cleared_on_eviction) {
+    LRUCache cache(2);
+
+    // Insert expert 0, mark filled
+    auto [s0, miss0] = cache.get_lru_item(0, 0);
+    cache.set_filled(s0);
+
+    // Insert expert 1
+    cache.get_lru_item(0, 1);
+
+    // Insert expert 2 → evicts expert 0, reuses slot s0
+    auto [s2, h2] = cache.get_lru_item(0, 2);
+    ASSERT_EQ(s2, s0);  // recycled slot
+    ASSERT_FALSE(h2);  // filled was cleared during eviction
+
+    // Even after re-inserting to the same slot, accessing it returns not-filled
+    auto [s2b, h2b] = cache.get_lru_item(0, 2);
+    ASSERT_FALSE(h2b);  // not filled until set_filled is called
+}
+
+TEST(moe_lru_cache, set_filled_out_of_range_is_safe) {
+    LRUCache cache(2);
+    // Should not crash
+    cache.set_filled(100);
+    cache.set_filled(std::numeric_limits::max());
+}
+
+// ──────────────────────────────────────────────────
+// evict_one: explicit eviction
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, explicit_evict_reduces_size) {
+    LRUCache cache(4);
+
+    cache.get_lru_item(0, 0);
+    cache.get_lru_item(0, 1);
+    ASSERT_EQ(cache.size(), 2U);
+
+    cache.evict_one();
+    ASSERT_EQ(cache.size(), 1U);
+
+    cache.evict_one();
+    ASSERT_EQ(cache.size(), 0U);
+}
+
+TEST(moe_lru_cache, evict_on_empty_is_safe) {
+    LRUCache cache(4);
+    // Should not crash
+    cache.evict_one();
+    ASSERT_EQ(cache.size(), 0U);
+}
+
+// ──────────────────────────────────────────────────
+// Capacity = 1 edge case
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, capacity_one) {
+    LRUCache cache(1);
+
+    auto [s0, h0] = cache.get_lru_item(0, 0);
+    ASSERT_EQ(s0, 0U);
+    ASSERT_FALSE(h0);
+
+    cache.set_filled(s0);
+
+    // Access same → hit
+    auto [s0b, h0b] = cache.get_lru_item(0, 0);
+    ASSERT_EQ(s0b, 0U);
+    ASSERT_TRUE(h0b);
+
+    // Insert new → evicts the only entry, reuses slot 0
+    auto [s1, h1] = cache.get_lru_item(0, 1);
+    ASSERT_EQ(s1, 0U);
+    ASSERT_FALSE(h1);
+    ASSERT_EQ(cache.size(), 1U);
+}
+
+// ──────────────────────────────────────────────────
+// Stress: many inserts and evictions
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, stress_many_experts) {
+    const size_t cap = 8;
+    LRUCache cache(cap);
+
+    // Insert 100 unique (layer=0, expert=i) entries into a cache with capacity 8
+    for (size_t i = 0; i < 100; ++i) {
+        auto [slot, hit] = cache.get_lru_item(0, i);
+        ASSERT_LT(slot, cap);
+        if (i < cap) {
+            ASSERT_EQ(slot, i);
+        }
+    }
+    ASSERT_EQ(cache.size(), cap);
+
+    // The last `cap` experts should still be in the cache
+    for (size_t i = 100 - cap; i < 100; ++i) {
+        auto [slot, hit] = cache.get_lru_item(0, i);
+        // They are in cache (though not filled)
+        ASSERT_LT(slot, cap);  // NOTE(review): the insert loop above asserts the same bound for new entries, so this alone does not prove the entry was still cached
+    }
+}
+
+// ──────────────────────────────────────────────────
+// Thread safety: concurrent get_lru_item calls
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, concurrent_access) {
+    const size_t cap = 16;
+    LRUCache cache(cap);
+    const int num_threads = 4;
+    const int ops_per_thread = 200;
+
+    std::vector threads;
+    std::atomic start{false};
+
+    for (int t = 0; t < num_threads; ++t) {
+        threads.emplace_back([&, t]() {
+            while (!start.load()) {}  // spin until the main thread has created every worker and set `start`
+            for (int i = 0; i < ops_per_thread; ++i) {
+                size_t layer = t;
+                size_t expert = i % 32;
+                auto [slot, hit] = cache.get_lru_item(layer, expert);
+                ASSERT_LT(slot, cap);  // NOTE(review): a failing ASSERT_* here presumably returns only from this worker lambda - confirm that is acceptable
+                if (!hit) {
+                    cache.set_filled(slot);
+                }
+            }
+        });
+    }
+
+    start.store(true);
+    for (auto& th : threads) {
+        th.join();
+    }
+
+    ASSERT_LE(cache.size(), cap);
+}
+
+// ──────────────────────────────────────────────────
+// Property config: moe_offload_ratio roundtrip
+// ──────────────────────────────────────────────────
+
+TEST(moe_offload_property_test, default_value_is_zero) {
+    auto config = get_test_default_config(get_test_engine());
+    ASSERT_EQ(config.get_moe_offload_ratio(), 0U);
+}
+
+TEST(moe_offload_property_test, set_and_get_various_values) {
+    auto config = get_test_default_config(get_test_engine());
+
+    config.set_property(ov::intel_gpu::moe_offload_ratio(1));
+    ASSERT_EQ(config.get_moe_offload_ratio(), 1U);
+
+    config.set_property(ov::intel_gpu::moe_offload_ratio(50));
+    ASSERT_EQ(config.get_moe_offload_ratio(), 50U);
+
+    config.set_property(ov::intel_gpu::moe_offload_ratio(100));
+    ASSERT_EQ(config.get_moe_offload_ratio(), 100U);
+}
+
+TEST(moe_offload_property_test, set_back_to_zero_disables) {
+    auto config = get_test_default_config(get_test_engine());
+
+    config.set_property(ov::intel_gpu::moe_offload_ratio(37));
+    ASSERT_EQ(config.get_moe_offload_ratio(), 37U);
+
+    config.set_property(ov::intel_gpu::moe_offload_ratio(0));
+    ASSERT_EQ(config.get_moe_offload_ratio(), 0U);
+}
+
+// ──────────────────────────────────────────────────
+// Primitive: moe_3gemm_fused_compressed fields
+// ──────────────────────────────────────────────────
+
+#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp"
+
+TEST(moe_offload_primitive_test, default_offload_fields) {
+    cldnn::moe_3gemm_fused_compressed prim;
+    ASSERT_TRUE(prim._weight_bin_offsets.empty());
+    ASSERT_TRUE(prim._weights_path.empty());
+    ASSERT_EQ(prim._lru_expert_num, 0U);
+}
+
+TEST(moe_offload_primitive_test, construct_with_offload_params) {
+    MOE3GemmFusedCompressed::Config config{};
+    std::vector offsets = {0, 100, 200, 300, 400, 500, 600, 700, 800};
+    std::string path = "/path/to/weights.bin";
+    size_t lru_num = 16;
+
+    cldnn::moe_3gemm_fused_compressed prim(
+        "test_moe",
+        {cldnn::input_info("input0"), cldnn::input_info("input1")},
+        config,
+        offsets,
+        path,
+        lru_num);
+
+    ASSERT_EQ(prim._weight_bin_offsets.size(), 9U);
+    ASSERT_EQ(prim._weight_bin_offsets[0], 0U);
+    ASSERT_EQ(prim._weight_bin_offsets[8], 800U);
+    ASSERT_EQ(prim._weights_path, path);
+    ASSERT_EQ(prim._lru_expert_num, lru_num);
+}
+
+TEST(moe_offload_primitive_test, equality_with_offload_fields) {
+    MOE3GemmFusedCompressed::Config config{};
+    std::vector offsets = {0, 100, 200, 300, 400, 500, 600, 700, 800};
+
+    cldnn::moe_3gemm_fused_compressed prim1(
+        "test_moe",
+        {cldnn::input_info("input0")},
+        config,
+        offsets,
+        "/path/a.bin",
+        16);
+
+    cldnn::moe_3gemm_fused_compressed prim2(
+        "test_moe",
+        {cldnn::input_info("input0")},
+        config,
+        offsets,
+        "/path/a.bin",
+        16);
+
+    ASSERT_TRUE(prim1 == prim2);
+
+    // Different weights_path → not equal
+    cldnn::moe_3gemm_fused_compressed prim3(
+        "test_moe",
+        {cldnn::input_info("input0")},
+        config,
+        offsets,
+        "/path/b.bin",
+        16);
+    ASSERT_FALSE(prim1 == prim3);
+
+    // Different lru_expert_num → not equal
+    cldnn::moe_3gemm_fused_compressed prim4(
+        "test_moe",
+        {cldnn::input_info("input0")},
+        config,
+        offsets,
+        "/path/a.bin",
+        32);
+    ASSERT_FALSE(prim1 == prim4);
+
+    // Different offsets → not equal
+    std::vector offsets2 = {0, 100, 200, 300, 400, 500, 600, 700, 999};
+    cldnn::moe_3gemm_fused_compressed prim5(
+        "test_moe",
+        {cldnn::input_info("input0")},
+        config,
+        offsets2,
+        "/path/a.bin",
+        16);
+    ASSERT_FALSE(prim1 == prim5);
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp
new file mode 100644
index 00000000000000..81c84a80200dda
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp
@@ -0,0 +1,19 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils/test_utils.h"
+#include "intel_gpu/runtime/internal_properties.hpp"
+
+using namespace cldnn;
+using namespace tests;
+
+TEST(moe_offload_property_test, execution_config_roundtrip) {  // NOTE(review): exercises the same default/set(37)/get path as the moe_offload_property_test cases in moe_offload_lru_cache_test.cpp
+    auto config = get_test_default_config(get_test_engine());
+
+    ASSERT_EQ(config.get_moe_offload_ratio(), 0U);
+
+    config.set_property(ov::intel_gpu::moe_offload_ratio(37));
+
+    ASSERT_EQ(config.get_moe_offload_ratio(), 37U);
+}
\ No newline at end of file