openvinotoolkit · zaixing-wang · Jun 2, 2026
@@ -139,10 +139,11 @@ class ProgramBuilder final {
 
     bool use_new_shape_infer() const { return m_config.get_allow_new_shape_infer(); }
     bool is_inner_program() const { return m_is_inner_program; }
-    bool is_query_mode() { return queryMode; }
+    bool is_query_mode() const { return queryMode; }  // const-qualified: reads queryMode only, so it is callable on a const ProgramBuilder
 
     std::shared_ptr<ov::threading::IStreamsExecutor> get_task_executor() const { return m_task_executor; }
     std::shared_ptr<cldnn::ICompilationContext> get_compilation_context() const { return m_compilation_context; }
+    std::shared_ptr<ov::Model> get_model() const { return m_model; }  // hands out m_model by value as a std::shared_ptr<ov::Model>
 
 private:
     static factories_map_t factories_map;

@@ -4,19 +4,51 @@
 
 #pragma once
 #include <vector>
+#include <string>
 
 #include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
 #include "intel_gpu/runtime/engine.hpp"
+#include "intel_gpu/runtime/memory.hpp"
 #include "primitive.hpp"
 
 namespace cldnn {
 using MOE3GemmFusedCompressed = ov::intel_gpu::op::MOE3GemmFusedCompressed;
 
+/// @brief Plain bundle of nine cldnn memory handles, three per projection (gate / up / down).
+/// @details Every handle starts out null. The suffixes presumably stand for
+///          weight (_w), scale (_s) and zero point (_z) - TODO confirm against the code that fills them.
+struct moe_weights {
+    // gate projection
+    memory::ptr gate_w{nullptr};
+    memory::ptr gate_s{nullptr};
+    memory::ptr gate_z{nullptr};
+
+    // up projection
+    memory::ptr up_w{nullptr};
+    memory::ptr up_s{nullptr};
+    memory::ptr up_z{nullptr};
+
+    // down projection
+    memory::ptr down_w{nullptr};
+    memory::ptr down_s{nullptr};
+    memory::ptr down_z{nullptr};
+};
+
 /// @brief moe compressed primitive
 /// @details Performs moe compressed
 struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compressed> {
     CLDNN_DECLARE_PRIMITIVE(moe_3gemm_fused_compressed)
 
+    static constexpr size_t serialized_weight_offset_count = 9;  // NOTE(review): equals the nine weight/scale/zp entries of input_index (values 2..10) - confirm it is meant to track them
+
+    enum class input_index : size_t {  // enumerators auto-increment from 0; resulting values are noted per line
+        hidden_states = 0,
+        routing_weights,  // 1
+        weight_0,  // 2
+        scale_0,  // 3
+        zp_0,  // 4
+        weight_1,  // 5
+        scale_1,  // 6
+        zp_1,  // 7
+        weight_2,  // 8
+        scale_2,  // 9
+        zp_2,  // 10
+        count  // 11: number of enumerators above, not an index of its own
+    };
+    static constexpr size_t input_count = static_cast<size_t>(input_index::count);  // 11
+
     moe_3gemm_fused_compressed() : primitive_base("", {}) {}
 
     // @brief Constructs moe primitive / layer.
@@ -70,29 +102,49 @@ struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compre
     //                   22: shared_gate_gate_weight - shared expert gate weight for gating,
     //                      shape [hidden_size]
     //
-    moe_3gemm_fused_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOE3GemmFusedCompressed::Config& config)
+    // @param id                  Primitive id forwarded to primitive_base.
+    // @param inputs              Input list forwarded to primitive_base.
+    // @param config              Copied into _config.
+    // @param weight_bin_offsets  Copied into _weight_bin_offsets; empty by default.
+    //                            Presumably offsets of the expert weights inside the file named by
+    //                            weights_path - TODO confirm.
+    // @param weights_path        Copied into _weights_path; empty by default.
+    // @param lru_expert_num      Copied into _lru_expert_num; defaults to 0.
+    moe_3gemm_fused_compressed(const primitive_id& id,
+                               const std::vector<input_info>& inputs,
+                               const MOE3GemmFusedCompressed::Config& config,
+                               const std::vector<size_t>& weight_bin_offsets = {},
+                               const std::string& weights_path = "",
+                               size_t lru_expert_num = 0)
         : primitive_base(id, inputs, 1, {optional_data_type()}),
+          _config(config),
+          _weight_bin_offsets(weight_bin_offsets),
+          _weights_path(weights_path),
+          _lru_expert_num(lru_expert_num) {}
 
     MOE3GemmFusedCompressed::Config _config;
+    // The three members below take part in operator== and in save()/load().
+    std::vector<size_t> _weight_bin_offsets;
+    std::string _weights_path;
+    size_t _lru_expert_num = 0;
 
     bool operator==(const primitive& rhs) const override {
         if (!compare_common_params(rhs))
             return false;
 
         auto rhs_casted = downcast<const moe_3gemm_fused_compressed>(rhs);
 
-        return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
+        // NOTE(review): memcmp compares the raw object representation, so any padding bytes
+        // inside Config take part in the comparison - confirm Config is padding-free or zero-filled.
+        const bool same_config = std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
+        return same_config &&
+               _weight_bin_offsets == rhs_casted._weight_bin_offsets &&
+               _weights_path == rhs_casted._weights_path &&
+               _lru_expert_num == rhs_casted._lru_expert_num;
     }
 
     void save(BinaryOutputBuffer& ob) const override {
         primitive_base<moe_3gemm_fused_compressed>::save(ob);
         ob << make_data(&_config, sizeof(_config));
+        ob << _weight_bin_offsets;  // the three fields below are written in the same order load() reads them
+        ob << _weights_path;  // NOTE(review): the path string is stored verbatim - confirm it is still valid wherever the blob is loaded
+        ob << _lru_expert_num;
     }
 
     void load(BinaryInputBuffer& ib) override {
         primitive_base<moe_3gemm_fused_compressed>::load(ib);
         ib >> make_data(&_config, sizeof(_config));
+        ib >> _weight_bin_offsets;  // read back in exactly the order save() wrote them
+        ib >> _weights_path;
+        ib >> _lru_expert_num;
     }
 };
 

@@ -133,6 +133,7 @@ static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implemen
 static constexpr Property<std::string, PropertyMutability::RW> config_file{"CONFIG_FILE"};
 static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
 static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
+static constexpr Property<size_t, PropertyMutability::RW> moe_offload_ratio{"MOE_OFFLOAD_RATIO"};  // NOTE(review): most neighbouring keys here carry a "GPU_" prefix - confirm the unprefixed key is intended
 static constexpr Property<bool, PropertyMutability::RW> use_onednn{"GPU_USE_ONEDNN"};
 static constexpr Property<bool, PropertyMutability::RW> use_cm{"GPU_USE_CM"};
 

@@ -39,6 +39,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Mo
 OV_CONFIG_RELEASE_OPTION(ov::internal, key_cache_quant_mode, ov::internal::CacheQuantMode::BY_CHANNEL, "AUTO or BY_CHANNEL or BY_TOKEN")
 OV_CONFIG_RELEASE_OPTION(ov::internal, value_cache_quant_mode, ov::internal::CacheQuantMode::BY_TOKEN, "AUTO or BY_CHANNEL or BY_TOKEN")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, mem_pool_util_threshold, 0.5, "Minimum utilization threshold (0.0~1.0) for reusable memory in the pool")
+OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, moe_offload_ratio, 0, "Percentage (0-100) of MoE experts to keep resident on device for offload")
 OV_CONFIG_RELEASE_OPTION(ov, enable_weightless, false, "Enable/Disable weightless blob")
 
 OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings")

@@ -635,6 +635,10 @@ static void optimize_moe_gemm_decompression_parameters(moe_gemm_node& node, prog
 
 static void optimize_moe_3gemm_fused_decompression_parameters(moe_node& node, program& p) {
     auto prim = node.get_primitive();
+    if (prim->_lru_expert_num > 0) {
+        // OTD routed weights are backed by resident-size allocations; reorders would materialize full logical tensors.
+        return;
+    }
     const auto& cfg = prim->_config;
     // Routed-expert scales at 3/6/9 (gate/up/down); zp at +1 when has_zp.
     constexpr std::array<size_t, 3> routed_scale_indices{3u, 6u, 9u};

@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "LRUCache.hpp"
+
+#include <cstdlib>
+
+// Builds an empty cache.
+//   max_total_experts  capacity; also the length of the per-slot "filled" flag list
+//   cb                 moved into m_on_evict
+// Every scalar member is given a defined value here, including m_per_expert_size,
+// which no other code in this file assigns.
+LRUCache::LRUCache(size_t max_total_experts, EvictCallback cb)
+    : m_max_total_experts(max_total_experts),
+      m_per_expert_size(0),
+      m_total_experts(0),
+      m_to_filled_lru_expert_no(0),
+      m_on_evict(std::move(cb)),
+      m_filled_list(max_total_experts, false) {}
+
+// Relinks the node referenced by `it` at the tail of m_list (the most recently used end).
+// Does nothing when the node is already the last element.
+void LRUCache::move_to_end(std::list<Node>::iterator it) {
+    const bool already_last = (std::next(it) == m_list.end());
+    if (!already_last) {
+        // splice() relinks the existing node, so the iterators kept in m_map stay valid.
+        m_list.splice(m_list.end(), m_list, it);
+    }
+}
+
+// Removes the entry at the front of m_list (the least recently used one) and records
+// its slot number in m_to_filled_lru_expert_no. No-op on an empty list.
+// Does not lock: both callers in this file take m_mutex before calling.
+// NOTE(review): m_on_evict is stored by the constructor but is not invoked here - confirm
+// whether an eviction notification is expected.
+void LRUCache::evict_one_unlocked() {
+    if (m_list.empty())
+        return;
+
+    // Work on a copy: pop_front() below destroys the list element.
+    const Node victim = m_list.front();
+
+    // Same bounds guard as set_filled(): a slot number outside the flag list
+    // (possible when the cache was built with zero capacity) must not be written.
+    if (victim.lru_expert_no < m_filled_list.size())
+        m_filled_list[victim.lru_expert_no] = false;
+
+    m_to_filled_lru_expert_no = victim.lru_expert_no;
+    m_map.erase(Key{victim.layer, victim.expert});
+    m_list.pop_front();
+    --m_total_experts;
+}
+
+// Locking front end of evict_one_unlocked(): holds m_mutex for the duration of the eviction.
+void LRUCache::evict_one() {
+    const std::lock_guard<std::mutex> guard(m_mutex);
+    evict_one_unlocked();
+}
+
+// Looks up (layer, expert) under m_mutex and returns {slot number, is_hit}.
+//   - Known key: the entry becomes most recently used; is_hit is the slot's "filled" flag.
+//   - Unknown key: a slot is assigned, the entry is recorded, and is_hit is false.
+//     At capacity the least recently used entry is evicted and its slot reused.
+//     Below capacity the lowest slot number not held by any live entry is used, so a slot
+//     freed earlier through evict_one() is reused instead of colliding with a live one.
+//     That search is linear in the capacity and only runs while the cache is not full.
+//   - Zero capacity: there is no slot to hand out; returns {0, false} and records nothing.
+std::pair<size_t, bool> LRUCache::get_lru_item(size_t layer, size_t expert) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+
+    const Key key{layer, expert};
+    const auto found = m_map.find(key);
+    if (found != m_map.end()) {
+        move_to_end(found->second);
+        const size_t slot = found->second->lru_expert_no;
+        const bool is_hit = slot < m_filled_list.size() && m_filled_list[slot];
+        return {slot, is_hit};
+    }
+
+    if (m_max_total_experts == 0)
+        return {0, false};
+
+    size_t slot = 0;
+    if (m_total_experts >= m_max_total_experts) {
+        evict_one_unlocked();
+        slot = m_to_filled_lru_expert_no;
+    } else {
+        // Fewer live entries than slots, so at least one slot number is free.
+        std::vector<bool> in_use(m_max_total_experts, false);
+        for (const Node& node : m_list) {
+            if (node.lru_expert_no < in_use.size())
+                in_use[node.lru_expert_no] = true;
+        }
+        while (slot < in_use.size() && in_use[slot])
+            ++slot;
+    }
+
+    m_list.push_back(Node{layer, expert, slot});
+    m_map[key] = std::prev(m_list.end());
+    ++m_total_experts;
+    return {slot, false};
+}
@@ -0,0 +1,82 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "intel_gpu/runtime/engine.hpp"
+
+// Bookkeeping for a fixed pool of numbered slots keyed by (layer, expert).
+// Entries are kept in least-recently-used order in m_list; m_map gives direct access to
+// the list node of a key; m_filled_list holds one "filled" flag per slot number.
+// Every member function defined in this header takes m_mutex before touching state.
+class LRUCache {
+public:
+    // Callback type accepted by the constructor and stored in m_on_evict.
+    using EvictCallback = std::function<void(size_t layer, size_t expert, void* addr, void* params)>;
+
+    enum NodeAction { INSERT, REFRESH };
+
+    // max_total_experts: capacity of the cache; cb: optional callback, stored as-is.
+    LRUCache(size_t max_total_experts, EvictCallback cb = nullptr);
+    // NOTE(review): no definition of insert_or_refresh is visible alongside this header - confirm one exists.
+    NodeAction insert_or_refresh(size_t layer, size_t expert, void* addr, void* params = nullptr);
+
+    // Returns {slot number, is_hit} for the key.
+    std::pair<size_t, bool> get_lru_item(size_t layer, size_t expert);
+    // Number of entries currently tracked.
+    size_t get_total_experts() const {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return m_total_experts;
+    }
+
+    // Removes the least recently used entry, if any.
+    void evict_one();
+
+    // Same value as get_total_experts().
+    size_t size() const {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return m_total_experts;
+    }
+    // NOTE(review): no definition of get_item is visible alongside this header - confirm one exists.
+    std::pair<size_t, bool> get_item(size_t layer, size_t expert);
+    // Marks slot `lru_expert_no` as filled; out-of-range slot numbers are ignored.
+    void set_filled(size_t lru_expert_no) {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        if (lru_expert_no >= m_filled_list.size()) {
+            return;
+        }
+        m_filled_list[lru_expert_no] = true;
+    }
+
+    // Public flag, false after construction. It is not guarded by m_mutex and no member
+    // function in this header reads or writes it.
+    bool m_initialized = false;
+
+private:
+    struct Key {
+        size_t layer;
+        size_t expert;
+        bool operator==(const Key& other) const noexcept {
+            return layer == other.layer && expert == other.expert;
+        }
+    };
+
+    struct KeyHash {
+        // Mixes both fields so that different (layer, expert) pairs do not collide
+        // systematically once expert ids reach three digits.
+        std::size_t operator()(const Key& k) const noexcept {
+            const std::size_t layer_hash = std::hash<size_t>()(k.layer);
+            const std::size_t expert_hash = std::hash<size_t>()(k.expert);
+            return layer_hash ^ (expert_hash + 0x9e3779b9u + (layer_hash << 6) + (layer_hash >> 2));
+        }
+    };
+
+    struct Node {
+        size_t layer;
+        size_t expert;
+        size_t lru_expert_no;  // slot number assigned to this entry
+    };
+
+    // In-class initializers give every scalar a defined value even if a constructor
+    // does not list it explicitly.
+    size_t m_max_total_experts = 0;
+    size_t m_per_expert_size = 0;
+    size_t m_total_experts = 0;
+    size_t m_to_filled_lru_expert_no = 0;  // slot number released by the most recent eviction
+    EvictCallback m_on_evict;
+
+    std::list<Node> m_list;  // front = least recently used, back = most recently used
+    std::vector<bool> m_filled_list;  // one flag per slot number
+    std::unordered_map<Key, std::list<Node>::iterator, KeyHash> m_map;
+    mutable std::mutex m_mutex;
+
+    void move_to_end(std::list<Node>::iterator it);
+    void evict_one_unlocked();
+};