From b7b6eb449d89bc28de0891b2e9355900deca75c1 Mon Sep 17 00:00:00 2001
From: zaixing-wang <zaixing.wang@intel.com>
Date: Tue, 2 Jun 2026 08:51:20 +0000
Subject: [PATCH] [GPU] Add MoE expert offload-to-disk (OTD) for large MoE
 models

---
 .../intel_gpu/plugin/program_builder.hpp      |   3 +-
 .../primitives/moe_3gemm_fused_compressed.hpp |  58 ++-
 .../intel_gpu/runtime/internal_properties.hpp |   1 +
 .../include/intel_gpu/runtime/options.inl     |   1 +
 .../graph_optimizer/prepare_quantization.cpp  |   4 +
 .../src/graph/impls/ocl_v2/moe/LRUCache.cpp   |  65 +++
 .../src/graph/impls/ocl_v2/moe/LRUCache.hpp   |  82 ++++
 .../impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp | 393 ++++++++++-----
 .../impls/ocl_v2/moe/moe_otd_runtime.hpp      | 341 +++++++++++++
 .../src/graph/include/moe_3gemm_fused_inst.h  |   2 +
 src/plugins/intel_gpu/src/graph/network.cpp   |   7 +-
 src/plugins/intel_gpu/src/graph/program.cpp   |  10 +-
 .../intel_gpu/src/plugin/ops/constant.cpp     |  82 ++--
 src/plugins/intel_gpu/src/plugin/ops/moe.cpp  | 351 +++++++++++++-
 .../src/plugin/ops/moe_offload_constant.hpp   | 118 +++++
 src/plugins/intel_gpu/src/plugin/plugin.cpp   |   2 +-
 .../test_cases/moe_offload_lru_cache_test.cpp | 455 ++++++++++++++++++
 .../test_cases/moe_offload_property_test.cpp  |  19 +
 18 files changed, 1846 insertions(+), 148 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.cpp
 create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.hpp
 create mode 100644 src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp
 create mode 100644 src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp
 create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp
 create mode 100644 src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp
diff --git a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp
index 02848734d88830..393ee0ded69ada 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/plugin/program_builder.hpp
@@ -139,10 +139,11 @@ class ProgramBuilder final {
 
     bool use_new_shape_infer() const { return m_config.get_allow_new_shape_infer(); }
     bool is_inner_program() const { return m_is_inner_program; }
-    bool is_query_mode() { return queryMode; }
+    bool is_query_mode() const { return queryMode; }
 
     std::shared_ptr<ov::threading::IStreamsExecutor> get_task_executor() const { return m_task_executor; }
     std::shared_ptr<cldnn::ICompilationContext> get_compilation_context() const { return m_compilation_context; }
+    std::shared_ptr<ov::Model> get_model() const { return m_model; }
 
 private:
     static factories_map_t factories_map;
diff --git a/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp b/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp
index b1c430d1850214..222938abf10e26 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/primitives/moe_3gemm_fused_compressed.hpp
@@ -4,19 +4,51 @@
 
 #pragma once
 #include <vector>
+#include <string>
 
 #include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
 #include "intel_gpu/runtime/engine.hpp"
+#include "intel_gpu/runtime/memory.hpp"
 #include "primitive.hpp"
 
 namespace cldnn {
 using MOE3GemmFusedCompressed = ov::intel_gpu::op::MOE3GemmFusedCompressed;
 
+struct moe_weights {  // NOTE(review): _w/_s/_z presumably mean weight/scale/zero-point - confirm; all start null
+    cldnn::memory::ptr gate_w{nullptr};
+    cldnn::memory::ptr gate_s{nullptr};
+    cldnn::memory::ptr gate_z{nullptr};
+    cldnn::memory::ptr up_w{nullptr};
+    cldnn::memory::ptr up_s{nullptr};
+    cldnn::memory::ptr up_z{nullptr};
+    cldnn::memory::ptr down_w{nullptr};
+    cldnn::memory::ptr down_s{nullptr};
+    cldnn::memory::ptr down_z{nullptr};
+};
+
 /// @brief moe compressed primitive
 /// @details Performs moe compressed
 struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compressed> {
     CLDNN_DECLARE_PRIMITIVE(moe_3gemm_fused_compressed)
 
+    static constexpr size_t serialized_weight_offset_count = 9;
+
+    enum class input_index : size_t {
+        hidden_states = 0,
+        routing_weights = 1,
+        weight_0 = 2,
+        scale_0 = 3,
+        zp_0 = 4,
+        weight_1 = 5,
+        scale_1 = 6,
+        zp_1 = 7,
+        weight_2 = 8,
+        scale_2 = 9,
+        zp_2 = 10,
+        count = 11  // sentinel: number of enumerators listed above
+    };
+    static constexpr size_t input_count = static_cast<size_t>(input_index::count);
+
     moe_3gemm_fused_compressed() : primitive_base("", {}) {}
 
     // @brief Constructs moe primitive / layer.
@@ -70,11 +102,22 @@ struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compre
     //                   22: shared_gate_gate_weight - shared expert gate weight for gating,
     //                      shape [hidden_size]
     //
-    moe_3gemm_fused_compressed(const primitive_id& id, const std::vector<input_info>& inputs, const MOE3GemmFusedCompressed::Config& config)
+    moe_3gemm_fused_compressed(const primitive_id& id,
+                               const std::vector<input_info>& inputs,
+                               const MOE3GemmFusedCompressed::Config& config,
+                               const std::vector<size_t>& weight_bin_offsets = {},
+                               const std::string& weights_path = "",
+                               size_t lru_expert_num = 0)  // 0 (default) leaves the OTD path disabled
         : primitive_base(id, inputs, 1, {optional_data_type()}),
-          _config(config) {}
+          _config(config),
+          _weight_bin_offsets(weight_bin_offsets),
+          _weights_path(weights_path),
+          _lru_expert_num(lru_expert_num) {}
 
     MOE3GemmFusedCompressed::Config _config;
+    std::vector<size_t> _weight_bin_offsets;
+    std::string _weights_path;
+    size_t _lru_expert_num = 0;  // values > 0 switch on the OTD-specific handling
 
     bool operator==(const primitive& rhs) const override {
         if (!compare_common_params(rhs))
@@ -82,17 +125,26 @@ struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compre
 
         auto rhs_casted = downcast<const moe_3gemm_fused_compressed>(rhs);
 
-        return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0;
+         return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0 &&
+             _weight_bin_offsets == rhs_casted._weight_bin_offsets &&
+             _weights_path == rhs_casted._weights_path &&
+             _lru_expert_num == rhs_casted._lru_expert_num;
     }
 
     void save(BinaryOutputBuffer& ob) const override {
         primitive_base<moe_3gemm_fused_compressed>::save(ob);
         ob << make_data(&_config, sizeof(_config));
+        ob << _weight_bin_offsets;  // OTD fields are written after _config; load() reads them in this same order
+        ob << _weights_path;
+        ob << _lru_expert_num;
     }
 
     void load(BinaryInputBuffer& ib) override {
         primitive_base<moe_3gemm_fused_compressed>::load(ib);
         ib >> make_data(&_config, sizeof(_config));
+        ib >> _weight_bin_offsets;  // OTD fields: read order must stay in sync with the write order in save()
+        ib >> _weights_path;
+        ib >> _lru_expert_num;
     }
 };
 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
index 2e7151f60ec162..f84e4c17ab43a4 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/internal_properties.hpp
@@ -133,6 +133,7 @@ static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implemen
 static constexpr Property<std::string, PropertyMutability::RW> config_file{"CONFIG_FILE"};
 static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
 static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
+static constexpr Property<size_t, PropertyMutability::RW> moe_offload_ratio{"MOE_OFFLOAD_RATIO"};
 static constexpr Property<bool, PropertyMutability::RW> use_onednn{"GPU_USE_ONEDNN"};
 static constexpr Property<bool, PropertyMutability::RW> use_cm{"GPU_USE_CM"};
 
diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
index 5842f039720207..33086747a0231d 100644
--- a/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
+++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/options.inl
@@ -39,6 +39,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Mo
 OV_CONFIG_RELEASE_OPTION(ov::internal, key_cache_quant_mode, ov::internal::CacheQuantMode::BY_CHANNEL, "AUTO or BY_CHANNEL or BY_TOKEN")
 OV_CONFIG_RELEASE_OPTION(ov::internal, value_cache_quant_mode, ov::internal::CacheQuantMode::BY_TOKEN, "AUTO or BY_CHANNEL or BY_TOKEN")
 OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, mem_pool_util_threshold, 0.5, "Minimum utilization threshold (0.0~1.0) for reusable memory in the pool")
+OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, moe_offload_ratio, 0, "Percentage (0-100) of MoE experts to keep resident on device for offload")
 OV_CONFIG_RELEASE_OPTION(ov, enable_weightless, false, "Enable/Disable weightless blob")
 
 OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings")
diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp
index 9872fc63ffcb53..c90796ad003495 100644
--- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp
+++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_quantization.cpp
@@ -635,6 +635,10 @@ static void optimize_moe_gemm_decompression_parameters(moe_gemm_node& node, prog
 
 static void optimize_moe_3gemm_fused_decompression_parameters(moe_node& node, program& p) {
     auto prim = node.get_primitive();
+    if (prim->_lru_expert_num > 0) {
+        // OTD routed weights are backed by resident-size allocations; reorders would materialize full logical tensors.
+        return;
+    }
     const auto& cfg = prim->_config;
     // Routed-expert scales at 3/6/9 (gate/up/down); zp at +1 when has_zp.
     constexpr std::array<size_t, 3> routed_scale_indices{3u, 6u, 9u};
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.cpp
new file mode 100644
index 00000000000000..1705d69053a7f3
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.cpp
@@ -0,0 +1,65 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "LRUCache.hpp"
+
+#include <cstdlib>
+
+LRUCache::LRUCache(size_t max_total_experts, EvictCallback cb)
+    : m_max_total_experts{max_total_experts},
+      m_total_experts{0},
+      m_to_filled_lru_expert_no{0},
+      m_on_evict{std::move(cb)},
+      m_filled_list(max_total_experts, false) {  // one "filled" flag per slot, all clear at start
+}
+
+void LRUCache::move_to_end(std::list<Node>::iterator it) {
+    // Make `it` the most-recently-used entry; splice relinks the node, so iterators held in m_map stay valid.
+    if (std::next(it) != m_list.end())
+        m_list.splice(m_list.end(), m_list, it);
+}
+
+void LRUCache::evict_one_unlocked() {
+    // Drops the least-recently-used entry (list head). Both callers in this file already hold m_mutex.
+    if (m_list.empty()) {
+        return;
+    }
+    // Copy the node out first: pop_front() destroys the list element.
+    const Node victim = m_list.front();
+    m_list.pop_front();
+    m_map.erase(Key{victim.layer, victim.expert});
+    --m_total_experts;
+    m_filled_list[victim.lru_expert_no] = false;
+    m_to_filled_lru_expert_no = victim.lru_expert_no;  // slot that get_lru_item() hands to the next miss
+}
+
+void LRUCache::evict_one() {
+    const std::lock_guard<std::mutex> guard(m_mutex);  // locking wrapper around evict_one_unlocked()
+    evict_one_unlocked();
+}
+
+// Returns {slot, hit}; hit is true only for an already-tracked key whose slot was marked via set_filled().
+std::pair<size_t, bool> LRUCache::get_lru_item(size_t layer, size_t expert) {
+    std::lock_guard<std::mutex> lock(m_mutex);
+    const auto it = m_map.find(Key{layer, expert});
+    if (it != m_map.end()) {
+        move_to_end(it->second);
+        return {it->second->lru_expert_no, m_filled_list[it->second->lru_expert_no]};
+    }
+    size_t slot = 0;
+    if (m_total_experts >= m_max_total_experts) {
+        evict_one_unlocked();
+        slot = m_to_filled_lru_expert_no;  // reuse the slot that was just vacated
+    } else {
+        // Take the lowest unused slot: after a public evict_one() the index m_total_experts can still be live.
+        std::vector<bool> used(m_max_total_experts, false);
+        for (const auto& node : m_list)
+            used[node.lru_expert_no] = true;
+        while (used[slot])  // fewer entries than slots on this branch, so the scan stays in range
+            ++slot;
+    }
+    m_map[Key{layer, expert}] = m_list.insert(m_list.end(), Node{layer, expert, slot});
+    ++m_total_experts;
+    return {slot, false};
+}
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.hpp
new file mode 100644
index 00000000000000..e2f1fefae5a6bc
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.hpp
@@ -0,0 +1,82 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+#include <functional>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+#include <utility>
+#include <vector>
+
+#include "intel_gpu/runtime/engine.hpp"
+
+// LRU index mapping (layer, expert) keys onto numbered slots; get_lru_item/evict_one/size/set_filled lock m_mutex.
+class LRUCache {
+public:
+    using EvictCallback = std::function<void(size_t layer, size_t expert, void* addr, void* params)>;
+
+    enum NodeAction { INSERT, REFRESH };
+
+    LRUCache(size_t max_total_experts, EvictCallback cb = nullptr);
+    // NOTE(review): insert_or_refresh() and get_item() are not defined in LRUCache.cpp - confirm they are needed.
+    NodeAction insert_or_refresh(size_t layer, size_t expert, void* addr, void* params = nullptr);
+    std::pair<size_t, bool> get_item(size_t layer, size_t expert);
+
+    // Returns {slot, hit}; a miss registers the key, evicting the least-recently-used entry when full.
+    std::pair<size_t, bool> get_lru_item(size_t layer, size_t expert);
+    void evict_one();
+    // Number of keys currently tracked; size() is an alias.
+    size_t get_total_experts() const {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        return m_total_experts;
+    }
+    size_t size() const { return get_total_experts(); }
+
+    // Sets the slot's filled flag (get_lru_item() reports it as the hit flag); out-of-range slots are ignored.
+    void set_filled(size_t lru_expert_no) {
+        std::lock_guard<std::mutex> lock(m_mutex);
+        if (lru_expert_no < m_filled_list.size()) {
+            m_filled_list[lru_expert_no] = true;
+        }
+    }
+
+    bool m_initialized = false;
+
+private:
+    struct Key {
+        size_t layer;
+        size_t expert;
+        bool operator==(const Key& other) const noexcept {
+            return layer == other.layer && expert == other.expert;
+        }
+    };
+
+    struct KeyHash {
+        std::size_t operator()(const Key& k) const noexcept {
+            return std::hash<size_t>()(k.layer * 131ULL + k.expert);
+        }
+    };
+
+    struct Node {
+        size_t layer;
+        size_t expert;
+        size_t lru_expert_no;  // slot index assigned to this entry
+    };
+
+    size_t m_max_total_experts = 0;
+    size_t m_per_expert_size = 0;  // not set by the constructor, so it needs a default here
+    size_t m_total_experts = 0;
+    size_t m_to_filled_lru_expert_no = 0;  // slot released by the most recent eviction
+    EvictCallback m_on_evict;              // NOTE(review): stored, but never invoked in LRUCache.cpp
+
+    std::list<Node> m_list;  // front = least recently used, back = most recently used
+    std::vector<bool> m_filled_list;
+    std::unordered_map<Key, std::list<Node>::iterator, KeyHash> m_map;
+    mutable std::mutex m_mutex;
+
+    void move_to_end(std::list<Node>::iterator it);
+    void evict_one_unlocked();
+};
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
index c78b209885722f..9e080032ed7aa1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_3gemm_swiglu_opt.cpp
@@ -4,16 +4,28 @@
 
 // clang-format off
 #include "moe_3gemm_gen_micro.hpp"
+#include "moe_otd_runtime.hpp"
 #include "moe_3gemm_swiglu_opt.hpp"
+#include "openvino/runtime/shared_buffer.hpp"
+#include "LRUCache.hpp"
 // clang-format on
 
 #define DEBUG_MOE_LOG 0
 
 #ifdef ENABLE_ONEDNN_FOR_GPU
 #    include <algorithm>
+#    include <chrono>
+#    include <cstdint>
+#    include <fstream>
 #    include <initializer_list>
+#    include <iostream>
+#    include <limits>
+#    include <mutex>
 #    include <oneapi/dnnl/dnnl.hpp>
+#    include <oneapi/dnnl/dnnl_ocl.hpp>
+#    include <sstream>
 #    include <string_view>
+#    include <thread>
 #    include <tuple>
 #    include <utility>
 
@@ -676,6 +688,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         moe_fusion_weights_base_addr moe_fusion_wei_addr;
         memory::ptr input_routing_weights;
         memory::ptr input_router_topk_idx;
+        memory::ptr _expert_index_buffer;
+        bool _index_initialized = false;
     };
 
     std::vector<std::vector<dnnl_weights>> _dnnl_weights;
@@ -684,6 +698,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
     int _shared_intermediate_size;
     int _gate_up_group_size;
     int _down_group_size;
+    size_t _lru_expert_num = 0;
+    std::shared_ptr<LRUCache> _lru_cache;
     ov::op::internal::MOE::Activation_type _activation_type = ov::op::internal::MOE::Activation_type::SWIGLU;
 
     bool _has_shared_expert = false;
@@ -766,6 +782,21 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
             GPU_DEBUG_TRACE_DETAIL << "MOE_BATCHED_GEMV_THRESHOLD = " << batched_gemv_threshold << std::endl;
         }
 
+        // OTD relies on runtime weight streaming in oneDNN path.
+        _lru_expert_num = params.typed_desc<moe_3gemm_fused_compressed>()->_lru_expert_num;
+        if (_lru_expert_num > 0) {
+            _lru_cache = std::make_shared<LRUCache>(_lru_expert_num);
+        }
+        if (_lru_expert_num > 0 && use_micro_gemm_prefill) {
+            use_micro_gemm_prefill = false;
+            GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt_impl(): force disable micro_gemm prefill in OTD mode, lru_expert_num=" << _lru_expert_num
+                                   << std::endl;
+        }
+        if (_lru_expert_num > 0 && use_grouped_gemm_prefill) {
+            use_grouped_gemm_prefill = false;
+            GPU_DEBUG_TRACE_DETAIL << "[DEBUG] moe_3gemm_swiglu_opt_impl(): force disable grouped_gemm prefill in OTD mode, lru_expert_num=" << _lru_expert_num
+                                   << std::endl;
+        }
         // Don't change the order of stages
         auto routing_type = node.as<moe_3gemm_fused_compressed>().get_primitive()->_config.routing_type;
         if (routing_type == ov::op::internal::MOECompressed::RoutingType::SOFTMAX) {
@@ -842,67 +873,72 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
             auto& dnnl_weights = _dnnl_weights[j];
             dnnl_weights.resize(3);
             dnnl_weights[0].ic = _hidden_size;
-            dnnl_weights[0].ic_group_size = ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[0]);
+            dnnl_weights[0].ic_group_size =
+                moe_fusion_wei_addr.scale[0] ? ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[0]) : _gate_up_group_size;
             dnnl_weights[0].oc = _intermediate_size;
             dnnl_weights[1].ic = _hidden_size;
-            dnnl_weights[1].ic_group_size = ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[1]);
+            dnnl_weights[1].ic_group_size =
+                moe_fusion_wei_addr.scale[1] ? ic_group_size_from_scale(_hidden_size, moe_fusion_wei_addr.scale[1]) : _gate_up_group_size;
             dnnl_weights[1].oc = _intermediate_size;
             dnnl_weights[2].ic = _intermediate_size;
-            dnnl_weights[2].ic_group_size = ic_group_size_from_scale(_intermediate_size, moe_fusion_wei_addr.scale[2]);
+            dnnl_weights[2].ic_group_size =
+                moe_fusion_wei_addr.scale[2] ? ic_group_size_from_scale(_intermediate_size, moe_fusion_wei_addr.scale[2]) : _down_group_size;
             dnnl_weights[2].oc = _hidden_size;
-            for (int i = 0; i < 3; i++) {
-                // Cross-check ic/ic_group_size against scale shape (drift caused u8 inf bug).
-                {
-                    const auto& sshape = moe_fusion_wei_addr.scale[i]->get_layout().get_shape();
-                    const size_t scale_num_groups = (sshape.size() >= 3) ? sshape[2] : 1;
-                    OPENVINO_ASSERT(dnnl_weights[i].ic_group_size > 0, "moe_3gemm GEMM ", i, " ic_group_size must be > 0");
-                    OPENVINO_ASSERT(dnnl_weights[i].ic % dnnl_weights[i].ic_group_size == 0,
-                                    "moe_3gemm GEMM ",
-                                    i,
-                                    " ic=",
-                                    dnnl_weights[i].ic,
-                                    " not divisible by ic_group_size=",
-                                    dnnl_weights[i].ic_group_size);
-                    const auto expected_groups = dnnl_weights[i].ic / dnnl_weights[i].ic_group_size;
-                    OPENVINO_ASSERT(static_cast<size_t>(expected_groups) == scale_num_groups,
-                                    "moe_3gemm GEMM ",
-                                    i,
-                                    " ic_group_size=",
-                                    dnnl_weights[i].ic_group_size,
-                                    " (=> ",
-                                    expected_groups,
-                                    " groups) disagrees with scale num_groups=",
-                                    scale_num_groups,
-                                    " (scale shape=",
-                                    sshape,
-                                    ")");
-                    if (cur_moe->_config.has_zp && moe_fusion_wei_addr.zp[i]) {
-                        const auto& zshape = moe_fusion_wei_addr.zp[i]->get_layout().get_shape();
-                        OPENVINO_ASSERT(zshape == sshape, "moe_3gemm GEMM ", i, " scale shape ", sshape, " does not match zp shape ", zshape);
+            if (!_lru_expert_num) {
+                for (int i = 0; i < 3; i++) {
+                    // Cross-check ic/ic_group_size against scale shape (drift caused u8 inf bug).
+                    {
+                        const auto& sshape = moe_fusion_wei_addr.scale[i]->get_layout().get_shape();
+                        const size_t scale_num_groups = (sshape.size() >= 3) ? sshape[2] : 1;
+                        OPENVINO_ASSERT(dnnl_weights[i].ic_group_size > 0, "moe_3gemm GEMM ", i, " ic_group_size must be > 0");
+                        OPENVINO_ASSERT(dnnl_weights[i].ic % dnnl_weights[i].ic_group_size == 0,
+                                        "moe_3gemm GEMM ",
+                                        i,
+                                        " ic=",
+                                        dnnl_weights[i].ic,
+                                        " not divisible by ic_group_size=",
+                                        dnnl_weights[i].ic_group_size);
+                        const auto expected_groups = dnnl_weights[i].ic / dnnl_weights[i].ic_group_size;
+                        OPENVINO_ASSERT(static_cast<size_t>(expected_groups) == scale_num_groups,
+                                        "moe_3gemm GEMM ",
+                                        i,
+                                        " ic_group_size=",
+                                        dnnl_weights[i].ic_group_size,
+                                        " (=> ",
+                                        expected_groups,
+                                        " groups) disagrees with scale num_groups=",
+                                        scale_num_groups,
+                                        " (scale shape=",
+                                        sshape,
+                                        ")");
+                        if (cur_moe->_config.has_zp && moe_fusion_wei_addr.zp[i]) {
+                            const auto& zshape = moe_fusion_wei_addr.zp[i]->get_layout().get_shape();
+                            OPENVINO_ASSERT(zshape == sshape, "moe_3gemm GEMM ", i, " scale shape ", sshape, " does not match zp shape ", zshape);
+                        }
+                    }
+                    // weight shape: [ic, oc], type: u4/i8
+                    int64_t wei_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc, moe_fusion_wei_addr.weight[i]->get_layout());
+                    dnnl_weights[i].weight =
+                        convert2dnnl(moe_fusion_wei_addr.weight[i], {dnnl_weights[i].ic, dnnl_weights[i].oc}, dnnl::memory::format_tag::ba, wei_offset);
+
+                    // scale shape: [ic / ic_group_size, oc], type: f16
+                    int64_t scale_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size,
+                                                               moe_fusion_wei_addr.scale[i]->get_layout());
+                    dnnl_weights[i].scale = convert2dnnl(moe_fusion_wei_addr.scale[i],
+                                                         {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc},
+                                                         dnnl::memory::format_tag::ab,
+                                                         scale_offset);
+
+                    // zp shape: [ic / ic_group_size, oc], type: u4/i8
+                    // Skip ZP memory allocation for symmetric quantization (has_zp=false) to save memory
+                    if (cur_moe->_config.has_zp) {
+                        int64_t zp_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size,
+                                                                moe_fusion_wei_addr.zp[i]->get_layout());
+                        dnnl_weights[i].zp = convert2dnnl(moe_fusion_wei_addr.zp[i],
+                                                          {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc},
+                                                          dnnl::memory::format_tag::ab,
+                                                          zp_offset);
                     }
-                }
-                // weight shape: [ic, oc], type: u4/i8
-                int64_t wei_offset = j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc, moe_fusion_wei_addr.weight[i]->get_layout());
-                dnnl_weights[i].weight =
-                    convert2dnnl(moe_fusion_wei_addr.weight[i], {dnnl_weights[i].ic, dnnl_weights[i].oc}, dnnl::memory::format_tag::ba, wei_offset);
-
-                // scale shape: [ic / ic_group_size, oc], type: f16
-                int64_t scale_offset =
-                    j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size, moe_fusion_wei_addr.scale[i]->get_layout());
-                dnnl_weights[i].scale = convert2dnnl(moe_fusion_wei_addr.scale[i],
-                                                     {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc},
-                                                     dnnl::memory::format_tag::ab,
-                                                     scale_offset);
-
-                // zp shape: [ic / ic_group_size, oc], type: u4/i8
-                // Skip ZP memory allocation for symmetric quantization (has_zp=false) to save memory
-                if (cur_moe->_config.has_zp) {
-                    int64_t zp_offset =
-                        j * get_bytes_count(dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size, moe_fusion_wei_addr.zp[i]->get_layout());
-                    dnnl_weights[i].zp = convert2dnnl(moe_fusion_wei_addr.zp[i],
-                                                      {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc},
-                                                      dnnl::memory::format_tag::ab,
-                                                      zp_offset);
                 }
             }
         }
@@ -1044,6 +1080,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         cur_moe->_intermediate_size = _intermediate_size;
         cur_moe->_gate_up_group_size = _gate_up_group_size;
         cur_moe->_down_group_size = _down_group_size;
+        cur_moe->_lru_expert_num = _lru_expert_num;
+        cur_moe->_lru_cache = _lru_cache;  // shared across clones within the same network
         cur_moe->use_micro_gemm_prefill = use_micro_gemm_prefill;
         cur_moe->use_gpu_mask_gen_prefill = use_gpu_mask_gen_prefill;
         cur_moe->use_grouped_gemm_prefill = use_grouped_gemm_prefill;
@@ -1144,56 +1182,58 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
             }
         }
 
-        // gate
-        scratch.moe_fusion_wei_addr.weight[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_0));
-        scratch.moe_fusion_wei_addr.scale[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_0));
-        scratch.moe_fusion_wei_addr.zp[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_0));
+        if (!_lru_expert_num) {
+            // gate
+            scratch.moe_fusion_wei_addr.weight[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_0));
+            scratch.moe_fusion_wei_addr.scale[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_0));
+            scratch.moe_fusion_wei_addr.zp[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_0));
 
-        // up
-        scratch.moe_fusion_wei_addr.weight[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_1));
-        scratch.moe_fusion_wei_addr.scale[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_1));
-        scratch.moe_fusion_wei_addr.zp[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_1));
+            // up
+            scratch.moe_fusion_wei_addr.weight[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_1));
+            scratch.moe_fusion_wei_addr.scale[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_1));
+            scratch.moe_fusion_wei_addr.zp[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_1));
 
-        // down
-        scratch.moe_fusion_wei_addr.weight[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_2));
-        scratch.moe_fusion_wei_addr.scale[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_2));
-        scratch.moe_fusion_wei_addr.zp[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_2));
-
-        // For symmetric quantization (has_zp=false), ZP inputs are element::dynamic placeholders
-        // with zero-count layout. Use scale memory as a dummy to avoid null pointer issues.
-        const auto& config = instance.get_typed_desc<moe_3gemm_fused_compressed>()->_config;
-        if (!config.has_zp) {
-            scratch.moe_fusion_wei_addr.zp[0] = scratch.moe_fusion_wei_addr.scale[0];
-            scratch.moe_fusion_wei_addr.zp[1] = scratch.moe_fusion_wei_addr.scale[1];
-            scratch.moe_fusion_wei_addr.zp[2] = scratch.moe_fusion_wei_addr.scale[2];
-        }
-
-        // shared expert
-        size_t dep_count = instance.dependencies().size();
-        if (dep_count >= static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT) + 1) {
-            // Gate
-            scratch.moe_fusion_wei_addr.shared_weight[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_WEIGHT));
-            scratch.moe_fusion_wei_addr.shared_scale[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_SCALE));
-            scratch.moe_fusion_wei_addr.shared_zp[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_ZP));
-
-            // Up
-            scratch.moe_fusion_wei_addr.shared_weight[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_UP_WEIGHT));
-            scratch.moe_fusion_wei_addr.shared_scale[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_UP_SCALE));
-            scratch.moe_fusion_wei_addr.shared_zp[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_UP_ZP));
-
-            // Down
-            scratch.moe_fusion_wei_addr.shared_weight[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_DOWN_WEIGHT));
-            scratch.moe_fusion_wei_addr.shared_scale[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_DOWN_SCALE));
-            scratch.moe_fusion_wei_addr.shared_zp[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_DOWN_ZP));
-
-            // Scalar Gate - f16
-            scratch.moe_fusion_wei_addr.shared_weight[3] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT));
-
-            // For symmetric quantization, shared expert ZPs are also element::dynamic placeholders
+            // down
+            scratch.moe_fusion_wei_addr.weight[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_2));
+            scratch.moe_fusion_wei_addr.scale[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_2));
+            scratch.moe_fusion_wei_addr.zp[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_2));
+
+            // For symmetric quantization (has_zp=false), ZP inputs are element::dynamic placeholders
+            // with zero-count layout. Use scale memory as a dummy to avoid null pointer issues.
+            const auto& config = instance.get_typed_desc<moe_3gemm_fused_compressed>()->_config;
             if (!config.has_zp) {
-                scratch.moe_fusion_wei_addr.shared_zp[0] = scratch.moe_fusion_wei_addr.shared_scale[0];
-                scratch.moe_fusion_wei_addr.shared_zp[1] = scratch.moe_fusion_wei_addr.shared_scale[1];
-                scratch.moe_fusion_wei_addr.shared_zp[2] = scratch.moe_fusion_wei_addr.shared_scale[2];
+                scratch.moe_fusion_wei_addr.zp[0] = scratch.moe_fusion_wei_addr.scale[0];
+                scratch.moe_fusion_wei_addr.zp[1] = scratch.moe_fusion_wei_addr.scale[1];
+                scratch.moe_fusion_wei_addr.zp[2] = scratch.moe_fusion_wei_addr.scale[2];
+            }
+
+            // shared expert
+            size_t dep_count = instance.dependencies().size();
+            if (dep_count >= static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT) + 1) {
+                // Gate
+                scratch.moe_fusion_wei_addr.shared_weight[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_WEIGHT));
+                scratch.moe_fusion_wei_addr.shared_scale[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_SCALE));
+                scratch.moe_fusion_wei_addr.shared_zp[0] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_ZP));
+
+                // Up
+                scratch.moe_fusion_wei_addr.shared_weight[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_UP_WEIGHT));
+                scratch.moe_fusion_wei_addr.shared_scale[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_UP_SCALE));
+                scratch.moe_fusion_wei_addr.shared_zp[1] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_UP_ZP));
+
+                // Down
+                scratch.moe_fusion_wei_addr.shared_weight[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_DOWN_WEIGHT));
+                scratch.moe_fusion_wei_addr.shared_scale[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_DOWN_SCALE));
+                scratch.moe_fusion_wei_addr.shared_zp[2] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_DOWN_ZP));
+
+                // Scalar Gate - f16
+                scratch.moe_fusion_wei_addr.shared_weight[3] = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SHARED_GATE_GATE_WEIGHT));
+
+                // For symmetric quantization, shared expert ZPs are also element::dynamic placeholders
+                if (!config.has_zp) {
+                    scratch.moe_fusion_wei_addr.shared_zp[0] = scratch.moe_fusion_wei_addr.shared_scale[0];
+                    scratch.moe_fusion_wei_addr.shared_zp[1] = scratch.moe_fusion_wei_addr.shared_scale[1];
+                    scratch.moe_fusion_wei_addr.shared_zp[2] = scratch.moe_fusion_wei_addr.shared_scale[2];
+                }
             }
         }
     }
@@ -1334,7 +1374,16 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
     cldnn::event::ptr exec_batched_gemv(const std::vector<cldnn::event::ptr>& events,
                                         typed_primitive_inst<moe_3gemm_fused_compressed>& instance,
                                         scratch_buffers& scratch,
+                                        LRUCache& cache,
                                         size_t token_num) {
+        auto& cur_net = instance.get_network();
+        auto& stream = cur_net.get_stream();
+        if (_lru_expert_num) {
+            // Full pipeline sync required: the routing kernel writes expert IDs
+            // to GPU memory that we read on the CPU below (buffer_ptr()).
+            // TODO: replace with event-based wait on the routing kernel only.
+            stream.finish();
+        }
         auto cur_moe = instance.get_typed_desc<moe_3gemm_fused_compressed>();
         int max_topk = static_cast<int>(cur_moe->_config.top_k);
 
@@ -1348,6 +1397,42 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         const size_t subgroup_size = instance.get_impl_params()->get_device_info().arch >= gpu_arch::xe2 ? 32 : 16;
         const size_t max_work_group_size = instance.get_impl_params()->get_device_info().max_work_group_size;
 
+        if (_lru_expert_num) {
+            // OTD: every routed expert id is translated to its LRU slot and the kernels below consume the slot ids.
+            // All token_num * max_topk routing entries are remapped; remapping only the first max_topk entries
+            // left the ids of any additional token (batched decode) outside the remap buffer.
+            cldnn::moe_weights shell_params = instance._weights;
+            auto& engine = instance.get_network().get_engine();
+            const size_t topk_count = token_num * static_cast<size_t>(max_topk);
+            const size_t topk_bytes = topk_count * sizeof(uint32_t);
+            const auto* p_expert = reinterpret_cast<const uint32_t*>(batch_mem_ptr->buffer_ptr());
+            const std::vector<uint32_t> experts_list(p_expert, p_expert + topk_count);
+            // (Re)allocate the host-visible remap buffer when it is missing or too small for this batch.
+            if (!scratch._index_initialized || !scratch._expert_index_buffer || scratch._expert_index_buffer->size() < topk_bytes) {
+                auto layout_expert = cldnn::layout({1, 1, 1, static_cast<ov::Dimension::value_type>(topk_bytes)}, ov::element::i8, cldnn::format::bfyx);
+                scratch._expert_index_buffer = engine.allocate_memory(layout_expert, allocation_type::usm_host, false);
+                scratch._index_initialized = true;
+            }
+            auto* p_expert_index = reinterpret_cast<uint32_t*>(scratch._expert_index_buffer->buffer_ptr());
+            for (size_t i = 0; i < topk_count; i++) {
+                const auto lru_expert_no = moe_otd::get_lru_expert_no(instance, experts_list[i], cache);
+                p_expert_index[i] = lru_expert_no;  // slot id replaces the original expert id
+            }
+            batch_mem_ptr = scratch._expert_index_buffer;
+            // The kernels read the weight buffers recorded in instance._weights.
+            scratch.moe_fusion_wei_addr.weight[0] = shell_params.gate_w;
+            scratch.moe_fusion_wei_addr.scale[0] = shell_params.gate_s;
+            scratch.moe_fusion_wei_addr.zp[0] = shell_params.gate_z;
+
+            scratch.moe_fusion_wei_addr.weight[1] = shell_params.up_w;
+            scratch.moe_fusion_wei_addr.scale[1] = shell_params.up_s;
+            scratch.moe_fusion_wei_addr.zp[1] = shell_params.up_z;
+
+            scratch.moe_fusion_wei_addr.weight[2] = shell_params.down_w;
+            scratch.moe_fusion_wei_addr.scale[2] = shell_params.down_s;
+            scratch.moe_fusion_wei_addr.zp[2] = shell_params.down_z;
+        }
+
         // gate
         const auto& mlp_gate_wei_mem = scratch.moe_fusion_wei_addr.weight[0];
         const auto& mlp_gate_scale_mem = scratch.moe_fusion_wei_addr.scale[0];
@@ -1438,6 +1523,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
     cldnn::event::ptr exec_prefill_micro_gemm(const std::vector<cldnn::event::ptr>& events,
                                               typed_primitive_inst<moe_3gemm_fused_compressed>& instance,
                                               scratch_buffers& scratch,
+                                              LRUCache& cache,
                                               const bool use_gpu_mask_gen) {
         auto cur_moe = instance.get_typed_desc<moe_3gemm_fused_compressed>();
         int max_topk = static_cast<int>(cur_moe->_config.top_k);
@@ -1463,6 +1549,36 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         auto num_total_experts = static_cast<int>(cur_moe->_config.num_expert);
         int num_actually_used_experts = 0;
 
+        if (_lru_expert_num) {
+            auto& stream = instance.get_network().get_stream();
+            auto& engine = instance.get_network().get_engine();
+            auto topk_count = token_num * static_cast<size_t>(max_topk);
+            auto topk_bytes = topk_count * sizeof(uint32_t);
+
+            std::vector<uint32_t> expert_ids(topk_count);
+            batch_mem_ptr->copy_to(stream, expert_ids.data(), 0, 0, topk_bytes, true);
+
+            std::unordered_map<uint32_t, uint32_t> expert_to_lru;
+            expert_to_lru.reserve(topk_count);
+            for (size_t i = 0; i < topk_count; i++) {
+                auto expert_no = expert_ids[i];
+                OPENVINO_ASSERT(expert_no < static_cast<uint32_t>(num_total_experts), "expert_no ", expert_no, " exceed max_expert_num ", num_total_experts);
+                auto it = expert_to_lru.find(expert_no);
+                if (it == expert_to_lru.end()) {
+                    auto lru_expert_no = moe_otd::get_lru_expert_no(instance, expert_no, cache);
+                    it = expert_to_lru.emplace(expert_no, lru_expert_no).first;
+                }
+                expert_ids[i] = it->second;
+            }
+
+            auto remap_layout = batch_mem_ptr->get_layout();
+            if (!scratch._expert_index_buffer || scratch._expert_index_buffer->size() < topk_bytes) {
+                scratch._expert_index_buffer = engine.allocate_memory(remap_layout, allocation_type::usm_host, false);
+            }
+            scratch._expert_index_buffer->copy_from(stream, expert_ids.data(), 0, 0, topk_bytes, true);
+            batch_mem_ptr = scratch._expert_index_buffer;
+        }
+
         // step 1: generate 4 mask data for following kernel execution
         // input: topk output, [token_len, expert_topk]
         // output:
@@ -1727,6 +1843,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
 
     using lru_cache_hash = LruCache<std::pair<int, int>, std::shared_ptr<onednn_kernel>, PairHash>;
     lru_cache_hash _kernels = lru_cache_hash(1024);
+    std::shared_ptr<onednn_kernel> _otd_kernel_holder;
 
     // --- grouped GEMM kernel cache (one primitive set per total-token count) ---
     struct grouped_onednn_kernel {
@@ -1806,6 +1923,12 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
                                              dnnl_weights[2].weight,
                                              dnnl_weights[2].scale,
                                              dnnl_weights[2].zp);
+        // In OTD mode the kernel must be rebuilt whenever dnnl_weights are updated rather than reused from the cache: otherwise the stream engine context
+        // and memory storage engine context mismatch and the dnnl kernel fails with invalid_arguments or outputs wrong tokens. Deep-dive here if perf suffers.
+        if (_lru_expert_num) {
+            _otd_kernel_holder = kernel;
+            return *_otd_kernel_holder;
+        }
         _kernels.add(key, kernel);
         return *_kernels.get(key);
     }
@@ -1921,7 +2044,8 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
     cldnn::event::ptr exec_prefill_onednn(const std::vector<cldnn::event::ptr>& events,
                                           cldnn::stream& stream,
                                           typed_primitive_inst<moe_3gemm_fused_compressed>& instance,
-                                          scratch_buffers& scratch) {
+                                          scratch_buffers& scratch,
+                                          LRUCache& cache) {
         auto cur_moe = instance.get_typed_desc<moe_3gemm_fused_compressed>();
         const auto& config = cur_moe->_config;
         auto& dnn_stream = stream.get_onednn_stream();
@@ -1958,6 +2082,34 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
             if (can_skip_subgraph) {
                 continue;
             }
+
+            if (_lru_expert_num) {
+                // Ensure previous oneDNN work is completed before any potential
+                // LRU slot overwrite in get_lru_expert_no/fill_weights_memory.
+                dnn_stream.wait();
+
+                auto& dnnl_weights = _dnnl_weights[expert_no];
+                auto lru_expert_no = moe_otd::get_lru_expert_no(instance, static_cast<uint32_t>(expert_no), cache);
+                auto& params = instance._weights;
+
+#    define CONVERT_DNNL(name, i)                                                                                                                      \
+        int64_t wei_offset##i = lru_expert_no * dnnl_weights[i].ic * dnnl_weights[i].oc / 2;                                                           \
+        int64_t scale_offset##i = lru_expert_no * dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size * 2;                         \
+        int64_t zp_offset##i = lru_expert_no * dnnl_weights[i].ic * dnnl_weights[i].oc / dnnl_weights[i].ic_group_size / 2;                            \
+        dnnl_weights[i].weight = convert2dnnl(params.name##_w, {dnnl_weights[i].ic, dnnl_weights[i].oc}, dnnl::memory::format_tag::ba, wei_offset##i); \
+        dnnl_weights[i].scale = convert2dnnl(params.name##_s,                                                                                          \
+                                             {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc},                                 \
+                                             dnnl::memory::format_tag::ab,                                                                             \
+                                             scale_offset##i);                                                                                         \
+        dnnl_weights[i].zp = convert2dnnl(params.name##_z,                                                                                             \
+                                          {dnnl_weights[i].ic / dnnl_weights[i].ic_group_size, dnnl_weights[i].oc},                                    \
+                                          dnnl::memory::format_tag::ab,                                                                                \
+                                          zp_offset##i);
+                CONVERT_DNNL(gate, 0)
+                CONVERT_DNNL(up, 1)
+                CONVERT_DNNL(down, 2)
+#    undef CONVERT_DNNL
+            }
             auto& dnnl_weights = _dnnl_weights[expert_no];
 
             // expert_mask
@@ -1989,21 +2141,18 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
                               convert2dnnl(scratch.x, {static_cast<int64_t>(n_token), dnnl_weights[1].ic}, dnnl::memory::format_tag::ab),
                               convert2dnnl(scratch.up, {static_cast<int64_t>(n_token), _intermediate_size}, dnnl::memory::format_tag::ab),
                               dnnl::memory());
-
             // gate
             kernel.gate.forward(dnn_stream,
                                 n_token,
                                 convert2dnnl(scratch.x, {static_cast<int64_t>(n_token), dnnl_weights[0].ic}, dnnl::memory::format_tag::ab),
                                 convert2dnnl(scratch.gate, {static_cast<int64_t>(n_token), _intermediate_size}, dnnl::memory::format_tag::ab),
                                 convert2dnnl(scratch.up, {static_cast<int64_t>(n_token), _intermediate_size}, dnnl::memory::format_tag::ab));
-
             // down
             kernel.down.forward(dnn_stream,
                                 n_token,
                                 convert2dnnl(scratch.gate, {static_cast<int64_t>(n_token), _intermediate_size}, dnnl::memory::format_tag::ab),
                                 convert2dnnl(scratch.y, {static_cast<int64_t>(n_token), _hidden_size}, dnnl::memory::format_tag::ab),
                                 convert2dnnl(scratch.routing_weights, {static_cast<int64_t>(routing_weights_size)}, dnnl::memory::format_tag::a));
-
             // index_add
             result_event = execute_stage({result_event},
                                          instance,
@@ -2268,6 +2417,14 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         const auto& config = cur_moe->_config;
         auto& cur_net = instance.get_network();
         auto& stream = cur_net.get_stream();
+
+        OPENVINO_ASSERT(!_lru_expert_num || _lru_cache, "LRU cache not initialized for OTD mode");
+        // When OTD is disabled (_lru_expert_num == 0) the cache reference is
+        // never dereferenced — every use site is guarded by `if (_lru_expert_num)`.
+        // Provide a stack-local dummy so that the reference is always valid.
+        LRUCache dummy_cache(0);
+        auto& cache = _lru_cache ? *_lru_cache : dummy_cache;
+
         cldnn::event::ptr ret_env = nullptr;
         _has_shared_expert = (config.num_shared_expert > 0);
 
@@ -2283,6 +2440,24 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
 
         auto [hidden_states_mem_ptr, hidden_states_layout] = get_input_info(instance, static_cast<size_t>(MOE3GemmInputIndex::HIDDEN_STATES));
         size_t token_num = get_seq_len(hidden_states_layout);
+
+        if (_lru_expert_num) {
+            if (!cache.m_initialized) {
+                instance._weights.gate_w = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_0));
+                instance._weights.gate_z = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_0));
+                instance._weights.gate_s = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_0));
+
+                instance._weights.up_w = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_1));
+                instance._weights.up_z = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_1));
+                instance._weights.up_s = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_1));
+
+                instance._weights.down_w = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::WEIGHT_2));
+                instance._weights.down_z = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::ZP_2));
+                instance._weights.down_s = instance.input_memory_ptr(static_cast<size_t>(MOE3GemmInputIndex::SCALE_2));
+                cache.m_initialized = true;
+            }
+        }
+
         scratch_buffers scratch;
         prepare_internal_buffers(instance, scratch, token_num);
         kernel_dump_info.clear_entries();
@@ -2317,7 +2492,7 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
         // Batched GEMV: for small token counts (including single token, MTP/speculative decoding),
         // use optimized GEMV kernels with batch dimension. Avoids gather/scatter overhead.
         if (token_num <= batched_gemv_threshold) {
-            return exec_batched_gemv({topk_event}, instance, scratch, token_num);
+            return exec_batched_gemv({topk_event}, instance, scratch, cache, token_num);
         }
 
         auto final_hidden_states_mem_ptr = instance.output_memory_ptr(0);
@@ -2340,11 +2515,11 @@ class moe_3gemm_swiglu_opt_impl : public PrimitiveImplOCL {
                                << std::endl;
         update_rt_params(instance);
         if (use_micro_gemm_prefill) {
-            ret_env = exec_prefill_micro_gemm({topk_event}, instance, scratch, use_gpu_mask_gen);
+            ret_env = exec_prefill_micro_gemm({topk_event}, instance, scratch, cache, use_gpu_mask_gen);
         } else if (use_grouped_gemm_prefill) {
             ret_env = exec_prefill_grouped_gemm({topk_event}, stream, instance, scratch);
         } else {
-            ret_env = exec_prefill_onednn({topk_event}, stream, instance, scratch);
+            ret_env = exec_prefill_onednn({topk_event}, stream, instance, scratch, cache);
         }
 
         if (_has_shared_expert) {
diff --git a/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp
new file mode 100644
index 00000000000000..f0573e2acce5ba
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/moe_otd_runtime.hpp
@@ -0,0 +1,341 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <array>
+#include <atomic>
+#include <chrono>
+#include <cstdlib>
+#include <cstring>
+#include <filesystem>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <string_view>
+#include <vector>
+
+#include "LRUCache.hpp"
+#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp"
+#include "intel_gpu/runtime/stream.hpp"
+#include "moe_3gemm_fused_inst.h"
+#include "openvino/util/parallel_io.hpp"
+
+namespace ov::intel_gpu::ocl::moe_otd {
+
+// Optional profiling counters for the OTD (offload-to-disk) path.
+// Collection is switched on when the MOE_OTD_PERF_LOG environment variable is present (any value);
+// the totals are written to stderr by dump(), which is registered to run at process exit.
+struct OtdPerfCounters {
+    std::atomic<uint64_t> gpu_hits{0};
+    std::atomic<uint64_t> gpu_misses{0};
+    std::atomic<uint64_t> disk_io_ns{0};
+    std::atomic<uint64_t> transpose_ns{0};
+    std::atomic<uint64_t> gpu_copy_ns{0};
+    std::atomic<uint64_t> tensor_load_count{0};  // count of individual tensor loads; divisor for the avg_* figures
+    // Prints one "[OTD_PERF]" summary line to stderr; averages are integer microseconds per tensor load.
+    void dump() const {
+        const auto read = [](const std::atomic<uint64_t>& counter) { return counter.load(std::memory_order_relaxed); };
+        const auto hits = read(gpu_hits);
+        const auto misses = read(gpu_misses);
+        const auto loads = read(tensor_load_count);
+        const auto total = hits + misses;
+        const auto avg_us = [&](const std::atomic<uint64_t>& ns) { return loads > 0 ? read(ns) / 1000 / loads : 0; };
+        std::cerr << "[OTD_PERF] gpu_hits=" << hits << ", gpu_misses=" << misses << ", gpu_hit_rate=" << (total > 0 ? 100.0 * hits / total : 0.0) << "%"
+                  << ", tensor_loads=" << loads << ", avg_disk_io_us=" << avg_us(disk_io_ns) << ", avg_transpose_us=" << avg_us(transpose_ns)
+                  << ", avg_gpu_copy_us=" << avg_us(gpu_copy_ns) << ", total_disk_io_ms=" << read(disk_io_ns) / 1000000
+                  << ", total_gpu_copy_ms=" << read(gpu_copy_ns) / 1000000 << std::endl;
+    }
+};
+
+inline OtdPerfCounters* get_perf_counters() {
+    // Returns the process-wide counters, or nullptr when profiling is off. The MOE_OTD_PERF_LOG
+    // variable is only tested for presence (its value is ignored) and only on the first call.
+    static const bool profiling_on = (std::getenv("MOE_OTD_PERF_LOG") != nullptr);
+    if (profiling_on) {
+        static OtdPerfCounters instance;
+        // Registered exactly once: dump the accumulated counters when the process exits.
+        static const int hook_status = std::atexit([] {
+            instance.dump();
+        });
+        static_cast<void>(hook_status);
+        return &instance;
+    }
+    return nullptr;
+}
+
+inline size_t get_layer_from_id(const std::string& id) {
+    // Returns the decimal layer index that follows the last '_' of a primitive id, or 0 for "moe:moe_router"
+    // and for ids without a numeric suffix. The suffix has to start with a digit, so a signed suffix can no
+    // longer wrap to a huge index, and std::strtoul replaces atoi, whose result is undefined on overflow.
+    if (id == "moe:moe_router") {
+        return 0;
+    }
+    const size_t pos = id.rfind('_');
+    if (pos == std::string::npos || pos + 1 >= id.size() || id[pos + 1] < '0' || id[pos + 1] > '9') {
+        return 0;
+    }
+    return static_cast<size_t>(std::strtoul(id.c_str() + pos + 1, nullptr, 10));
+}
+
+class parallel_weight_reader {
+public:
+    // Opens weights_path once; the handle and the reported file size are kept for the object's lifetime.
+    explicit parallel_weight_reader(const std::string& weights_path) : _weights_path(weights_path) {
+        std::streamoff file_size = 0;
+        ov::util::get_file_handle_and_size(std::filesystem::path(weights_path), 0, _shared_handle, file_size);
+        _file_size = static_cast<size_t>(file_size);
+    }
+    // Non-copyable: the destructor closes _shared_handle, so a copy would close the same handle twice.
+    parallel_weight_reader(const parallel_weight_reader&) = delete;
+    parallel_weight_reader& operator=(const parallel_weight_reader&) = delete;
+    ~parallel_weight_reader() {
+        ov::util::close_file_handle(_shared_handle);
+    }
+    const std::string& path() const {
+        return _weights_path;
+    }
+    size_t file_size() const {
+        return _file_size;
+    }
+    void read(char* dst, size_t size, size_t file_offset) {  // throws std::runtime_error when positional_read returns false
+        if (!ov::util::positional_read(_shared_handle, dst, size, file_offset)) {
+            throw std::runtime_error("Failed to read enough bytes from OTD weight file");
+        }
+    }
+
+private:
+    std::string _weights_path;
+    ov::FileHandle _shared_handle{};
+    size_t _file_size = 0;
+};
+
+inline parallel_weight_reader& get_thread_local_weight_reader(const std::string& weights_path) {
+    thread_local std::unique_ptr<parallel_weight_reader> tls_reader;  // one per thread, rebuilt only when the requested path changes
+    if (tls_reader == nullptr || weights_path != tls_reader->path()) {
+        tls_reader = std::make_unique<parallel_weight_reader>(weights_path);
+    }
+    return *tls_reader;
+}
+
+inline void maybe_transpose_scale_zp(const cldnn::moe_3gemm_fused_compressed& desc,
+                                     const char* tensor_name,
+                                     const cldnn::layout& layout,
+                                     std::vector<uint8_t>& payload,
+                                     size_t per_expert_size) {
+    // Transposes one expert's scale / zero-point payload in place: the element at [o][g] of an
+    // [oc, group_count] array moves to [g][o] of a [group_count, oc] array.
+    //   desc            - supplies hidden_size, inter_size and group_size (read from desc._config)
+    //   tensor_name     - picks the handling: contains "_s" -> scale, else contains "_z" -> zero point;
+    //                     a "down_" prefix gives oc = hidden_size, ic = inter_size (otherwise swapped)
+    //   layout          - only layout.data_type is read, to size one scale element
+    //   payload         - bytes of a single expert; swapped for the transposed bytes on success
+    //   per_expert_size - payload size in bytes, asserted against the computed dimensions
+    // Returns untouched when MOE_OTD_DISABLE_SCALE_ZP_TRANSPOSE is set, tensor_name is null, or no tag matches.
+    if (tensor_name == nullptr || std::getenv("MOE_OTD_DISABLE_SCALE_ZP_TRANSPOSE") != nullptr) {
+        return;
+    }
+
+    const std::string_view name(tensor_name);
+    const bool is_scale = name.find("_s") != std::string_view::npos;
+    if (!is_scale && name.find("_z") == std::string_view::npos) {
+        return;
+    }
+
+    const bool is_down = name.substr(0, 5) == "down_";
+    const size_t hidden = static_cast<size_t>(desc._config.hidden_size);
+    const size_t inter = static_cast<size_t>(desc._config.inter_size);
+    const size_t oc = is_down ? hidden : inter;
+    const size_t ic = is_down ? inter : hidden;
+
+    // A group_size of 0 or the maximum size_t value keeps a single group; otherwise it must divide ic evenly.
+    const size_t group_size = static_cast<size_t>(desc._config.group_size);
+    const bool grouped = group_size != 0 && group_size != std::numeric_limits<size_t>::max();
+    if (grouped) {
+        OPENVINO_ASSERT(ic % group_size == 0, "Invalid group_size for OTD transpose: tensor=", tensor_name, ", ic=", ic, ", group_size=", group_size);
+    }
+    const size_t group_count = grouped ? ic / group_size : 1;
+
+    OPENVINO_ASSERT(oc > 0 && group_count > 0, "Invalid dims for OTD transpose: tensor=", tensor_name, ", oc=", oc, ", group_count=", group_count);
+    const size_t elem_count = oc * group_count;
+
+    if (is_scale) {
+        const size_t elem_size = static_cast<size_t>(data_type_traits::size_of(layout.data_type));
+        OPENVINO_ASSERT(elem_size > 0, "Invalid scale element size for tensor=", tensor_name);
+        OPENVINO_ASSERT(elem_count * elem_size == per_expert_size,
+                        "Unexpected scale payload size for tensor=",
+                        tensor_name,
+                        ", expected=",
+                        elem_count * elem_size,
+                        ", got=",
+                        per_expert_size);
+
+        // Whole elements are copied byte-wise: source element (o, g) lands at destination (g, o).
+        std::vector<uint8_t> transposed(per_expert_size, 0);
+        const uint8_t* src = payload.data();
+        for (size_t g = 0; g < group_count; g++) {
+            uint8_t* dst_row = transposed.data() + g * oc * elem_size;
+            for (size_t o = 0; o < oc; o++) {
+                std::memcpy(dst_row + o * elem_size, src + (o * group_count + g) * elem_size, elem_size);
+            }
+        }
+        payload.swap(transposed);
+        return;
+    }
+
+    OPENVINO_ASSERT(elem_count % 2 == 0, "Unexpected odd element count for packed zp tensor=", tensor_name, ", elem_count=", elem_count);
+    OPENVINO_ASSERT(elem_count / 2 == per_expert_size,
+                    "Unexpected zp payload size for tensor=",
+                    tensor_name,
+                    ", expected=",
+                    elem_count / 2,
+                    ", got=",
+                    per_expert_size);
+
+    // Zero points are processed as packed 4-bit values, low nibble first; expand them to one value per byte.
+    std::vector<uint8_t> nibbles;
+    nibbles.reserve(elem_count);
+    for (size_t i = 0; i < per_expert_size; i++) {
+        nibbles.push_back(static_cast<uint8_t>(payload[i] & 0x0F));
+        nibbles.push_back(static_cast<uint8_t>((payload[i] >> 4) & 0x0F));
+    }
+
+    // Destination nibble d = g * oc + o takes source nibble o * group_count + g; the nibbles are
+    // repacked two per byte with the even-indexed one in the low half.
+    std::vector<uint8_t> repacked(per_expert_size, 0);
+    for (size_t d = 0; d < elem_count; d++) {
+        const uint8_t value = nibbles[(d % oc) * group_count + d / oc];
+        repacked[d / 2] = static_cast<uint8_t>(repacked[d / 2] | (value << ((d % 2) * 4)));
+    }
+    payload.swap(repacked);
+}
+
+/// Streams per-expert MoE tensors from the weights file into their slots of the destination tensors.
+///
+/// For every pair (experts_list[i], lru_experts[i]) and every non-null tensor of `wei_mem`
+/// (gate/up/down weights, scales and zero points) one expert-sized slice is read from the file at
+/// `desc._weight_bin_offsets[tensor] + expert * per_expert_size`, passed through
+/// maybe_transpose_scale_zp() and copied to byte offset `lru_slot * per_expert_size` of that tensor.
+/// `per_expert_size` is the tensor layout's byte count divided by `desc._config.num_expert`.
+///
+/// @param exec_stream  Stream handed to memory::copy_from() for the host-to-device copies.
+/// @param desc         MoE primitive descriptor providing the config, the weights path and the file offsets.
+/// @param wei_mem      Destination tensors; null entries are skipped.
+/// @param experts_list Expert indices to load (select the source slice in the weights file).
+/// @param lru_experts  Destination slot for each entry of `experts_list`; must have the same length.
+/// @param layer        Currently unused; kept so existing callers keep compiling.
+///
+/// Fails an OPENVINO_ASSERT when the weights path is empty, the offset table has an unexpected size,
+/// the two expert lists differ in length, or a requested slice does not fit inside the weights file.
+inline void fill_weights_memory(cldnn::stream& exec_stream,
+                                const cldnn::moe_3gemm_fused_compressed& desc,
+                                cldnn::moe_weights& wei_mem,
+                                const std::vector<uint32_t>& experts_list,
+                                const std::vector<uint32_t>& lru_experts,
+                                size_t layer = 0) {
+    using clock = std::chrono::steady_clock;
+    (void)layer;  // not needed for loading; silences unused-parameter warnings
+
+    struct tensor_fill_plan {
+        size_t per_expert_size = 0;
+        size_t src_offset = 0;
+        size_t dst_offset = 0;
+    };
+
+    const auto num_expert = static_cast<size_t>(desc._config.num_expert);
+    const auto& weight_bin_offsets = desc._weight_bin_offsets;
+    const auto& weights_path = desc._weights_path;
+
+    OPENVINO_ASSERT(!weights_path.empty(), "weights path is empty for OTD weight loading");
+    OPENVINO_ASSERT(weight_bin_offsets.size() == cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count, "Unexpected number of MOE weight offsets");
+    // Every expert needs a destination slot; without this check lru_experts[index] below could read out of bounds.
+    OPENVINO_ASSERT(experts_list.size() == lru_experts.size(),
+                    "Mismatched expert/LRU slot list sizes for OTD weight loading: experts=",
+                    experts_list.size(),
+                    ", lru_slots=",
+                    lru_experts.size());
+
+    static const std::array<const char*, cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count> tensor_names = {
+        {"gate_w", "up_w", "down_w", "gate_s", "up_s", "down_s", "gate_z", "up_z", "down_z"}};
+    const std::array<cldnn::memory_ptr, cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count> tensors_by_offset = {
+        {wei_mem.gate_w, wei_mem.up_w, wei_mem.down_w, wei_mem.gate_s, wei_mem.up_s, wei_mem.down_s, wei_mem.gate_z, wei_mem.up_z, wei_mem.down_z}};
+
+    auto* perf = get_perf_counters();
+
+    auto& weight_reader = get_thread_local_weight_reader(weights_path);
+    const size_t weight_file_size = weight_reader.file_size();
+
+    // Computes where one expert's slice lives in the file and where it goes in `mem`.
+    // Returns an all-zero plan when the tensor is absent, which callers treat as "nothing to do".
+    const auto make_tensor_fill_plan = [&](size_t base_offset, const cldnn::memory_ptr& mem, size_t expert_no, size_t lru_expert_no, const char* tensor_name) {
+        tensor_fill_plan plan;
+        if (!mem)
+            return plan;
+        const auto total_bytes = mem->get_layout().bytes_count();
+        OPENVINO_ASSERT(num_expert > 0, "Invalid expert count");
+        plan.per_expert_size = total_bytes / num_expert;
+        plan.src_offset = base_offset + expert_no * plan.per_expert_size;
+        plan.dst_offset = lru_expert_no * plan.per_expert_size;
+        OPENVINO_ASSERT(plan.src_offset <= weight_file_size, "Invalid src_offset out of file: ", plan.src_offset, ", file_size=", weight_file_size);
+        OPENVINO_ASSERT(plan.per_expert_size <= weight_file_size - plan.src_offset,
+                        "Read range out of file for tensor ",
+                        tensor_name,
+                        ": src_offset=",
+                        plan.src_offset,
+                        ", per_expert_size=",
+                        plan.per_expert_size,
+                        ", file_size=",
+                        weight_file_size,
+                        ", base_offset=",
+                        base_offset,
+                        ", expert=",
+                        expert_no);
+        return plan;
+    };
+
+    // Timestamps are only taken when perf counters are enabled, as in the unprofiled fast path.
+    const auto now_if_profiling = [perf]() {
+        return perf ? clock::now() : clock::time_point{};
+    };
+    const auto elapsed_ns = [](clock::time_point begin, clock::time_point end) {
+        return static_cast<uint64_t>(std::chrono::duration_cast<std::chrono::nanoseconds>(end - begin).count());
+    };
+
+    for (size_t index = 0; index < experts_list.size(); index++) {
+        const uint32_t expert = experts_list[index];
+        const uint32_t lru_slot = lru_experts[index];
+
+        for (size_t offset_pos = 0; offset_pos < static_cast<size_t>(cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count); offset_pos++) {
+            const auto& mem = tensors_by_offset[offset_pos];
+            const auto plan = make_tensor_fill_plan(weight_bin_offsets[offset_pos], mem, expert, lru_slot, tensor_names[offset_pos]);
+            // A zero size means the tensor is absent (null memory) or empty: nothing to read or copy.
+            if (plan.per_expert_size == 0) {
+                continue;
+            }
+
+            // Disk read
+            std::vector<uint8_t> payload(plan.per_expert_size);
+            const auto read_begin = now_if_profiling();
+            weight_reader.read(reinterpret_cast<char*>(payload.data()), plan.per_expert_size, plan.src_offset);
+            if (perf) {
+                perf->disk_io_ns.fetch_add(elapsed_ns(read_begin, clock::now()), std::memory_order_relaxed);
+            }
+
+            // Transpose + GPU copy
+            const auto transpose_begin = now_if_profiling();
+            maybe_transpose_scale_zp(desc, tensor_names[offset_pos], mem->get_layout(), payload, plan.per_expert_size);
+            const auto transpose_end = now_if_profiling();
+            mem->copy_from(exec_stream, payload.data(), 0, plan.dst_offset, plan.per_expert_size, true);
+            if (perf) {
+                const auto copy_end = clock::now();
+                perf->transpose_ns.fetch_add(elapsed_ns(transpose_begin, transpose_end), std::memory_order_relaxed);
+                perf->gpu_copy_ns.fetch_add(elapsed_ns(transpose_end, copy_end), std::memory_order_relaxed);
+                perf->tensor_load_count.fetch_add(1, std::memory_order_relaxed);
+            }
+        }
+    }
+}
+
+/// Returns the cache slot holding `expert` for the layer of `instance`, loading the expert on a miss.
+///
+/// The slot is requested from `cache` via get_lru_item(layer, expert). When the lookup reports a miss,
+/// the expert's tensors are loaded into that slot with fill_weights_memory() and the slot is then
+/// marked as filled. Hit/miss perf counters are updated when perf counters are available.
+///
+/// @param instance MoE primitive instance owning the destination weights (`_weights`).
+/// @param expert   Index of the expert being requested.
+/// @param cache    LRU cache mapping (layer, expert) to slots.
+/// @return The slot index assigned to `expert`.
+inline uint32_t get_lru_expert_no(typed_primitive_inst<cldnn::moe_3gemm_fused_compressed>& instance, uint32_t expert, LRUCache& cache) {
+    const auto moe_desc = instance.get_typed_desc<cldnn::moe_3gemm_fused_compressed>();
+    auto& exec_stream = instance.get_network().get_stream();
+    const size_t layer_idx = get_layer_from_id(moe_desc->id);
+
+    const auto lookup = cache.get_lru_item(layer_idx, expert);
+    OPENVINO_ASSERT(lookup.first <= static_cast<size_t>(std::numeric_limits<uint32_t>::max()), "LRU slot index overflow: ", lookup.first);
+    const auto slot = static_cast<uint32_t>(lookup.first);
+    const bool cache_hit = static_cast<bool>(lookup.second);
+
+    auto* counters = get_perf_counters();
+    if (cache_hit) {
+        if (counters != nullptr) {
+            counters->gpu_hits.fetch_add(1, std::memory_order_relaxed);
+        }
+        return slot;
+    }
+
+    // Miss: account for it, then bring this single expert into its assigned slot.
+    if (counters != nullptr) {
+        counters->gpu_misses.fetch_add(1, std::memory_order_relaxed);
+    }
+    const std::vector<uint32_t> experts_to_load{expert};
+    const std::vector<uint32_t> target_slots{slot};
+    fill_weights_memory(exec_stream, *moe_desc, instance._weights, experts_to_load, target_slots, layer_idx);
+    cache.set_filled(slot);
+    return slot;
+}
+
+}  // namespace ov::intel_gpu::ocl::moe_otd
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h b/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h
index f3b69f8773001f..3cdbb50cffc82b 100644
--- a/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/moe_3gemm_fused_inst.h
@@ -47,6 +47,8 @@ class typed_primitive_inst<moe_3gemm_fused_compressed> : public typed_primitive_
     static layout calc_output_layout(const moe_node& /* node */, const kernel_impl_params& impl_param);
     static std::string to_string(const moe_node& node);
     typed_primitive_inst(network& network, const moe_node& node);
+    cldnn::memory::ptr _base;
+    cldnn::moe_weights _weights;
 };
 
 using moe_inst = typed_primitive_inst<moe_3gemm_fused_compressed>;
diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp
index 7503bd7cb12ebd..cc4416b8c723cb 100644
--- a/src/plugins/intel_gpu/src/graph/network.cpp
+++ b/src/plugins/intel_gpu/src/graph/network.cpp
@@ -18,7 +18,7 @@
 #include "intel_gpu/runtime/compilation_context.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
 #include "intel_gpu/runtime/itt.hpp"
-
+#include "openvino/util/env_util.hpp"
 #include "intel_gpu/graph/kernel_impl_params.hpp"
 #include "intel_gpu/graph/program.hpp"
 #include "intel_gpu/graph/network.hpp"
@@ -30,6 +30,7 @@
 #include "paged_attention_inst.h"
 #include "convolution_inst.h"
 #include "deconvolution_inst.h"
+#include "moe_3gemm_fused_inst.h"
 #include "mutable_data_inst.h"
 #include "condition_inst.h"
 #include "read_value_inst.h"
@@ -1169,7 +1170,9 @@ void network::transfer_memory_to_device(std::shared_ptr<primitive_inst> instance
         && users.front()->is_type<reshape>()
         && users.front()->is_dynamic())
             return;
-
+    if (get_config().get_moe_offload_ratio() > 0 && node.have_user_with_type<moe_3gemm_fused_compressed>()) {
+        return;
+    }
     // Do not transfer memory if a user requires lockable memory.
     // If memory is used in both gpu and cpu implementations, primitive itself is responsible for correct allocation type
     if (node.need_lockable_memory())
diff --git a/src/plugins/intel_gpu/src/graph/program.cpp b/src/plugins/intel_gpu/src/graph/program.cpp
index f163eb3cb35f5c..1bf62a34bedcdf 100644
--- a/src/plugins/intel_gpu/src/graph/program.cpp
+++ b/src/plugins/intel_gpu/src/graph/program.cpp
@@ -1,6 +1,8 @@
 // Copyright (C) 2018-2026 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
+#include <thread>
+#include <chrono>
 
 #include "intel_gpu/graph/fused_primitive_desc.hpp"
 #include "registry/implementation_manager.hpp"
@@ -55,6 +57,7 @@
 #include "arg_max_min_inst.h"
 #include "dft_inst.h"
 #include "multiclass_nms_inst.h"
+#include "moe_3gemm_fused_inst.h"
 #include "mutable_data_inst.h"
 #include "pooling_inst.h"
 #include "border_inst.h"
@@ -501,7 +504,6 @@ void program::build_program(bool is_internal) {
     { pre_optimize_graph(is_internal); }
     run_graph_compilation();
     { post_optimize_graph(is_internal); }
-
 #ifdef GPU_DEBUG_CONFIG
     if (get_config().get_dry_run_path().empty() || is_internal) {
 #else
@@ -723,7 +725,7 @@ void program::transfer_memory_to_device() {
         // TODO: Do we need finish call here? Maybe call it in network::execute() ?
         get_stream().finish();
     };
-
+    auto otd = _config.get_moe_offload_ratio();
     for (auto& node : processing_order) {
         if (node->is_shape_infer_dep()) {
             continue;
@@ -731,6 +733,10 @@ void program::transfer_memory_to_device() {
         if (node->is_type<data>() && !node->need_lockable_memory()) {
             auto& data_node = node->as<data>();
             auto data_node_layout = data_node.get_output_layout();
+            auto prim = data_node.get_primitive();
+            if (otd && node->have_user_with_type<moe_3gemm_fused_compressed>()) {
+                continue;
+            }
             auto& mem = data_node.get_attached_memory();
             auto mem_layout = mem.get_layout();
             auto alloc_type = mem.get_allocation_type();
diff --git a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp
index 626dee0b48634c..293db5ea5a8f19 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/constant.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/constant.cpp
@@ -26,10 +26,21 @@
 #include "openvino/op/tensor_iterator.hpp"
 #include "openvino/op/bucketize.hpp"
 #include "openvino/op/matmul.hpp"
+#include "openvino/op/moe.hpp"
 #include "openvino/op/util/binary_elementwise_bitwise.hpp"
 
+#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
+#include "ov_ops/moe_compressed.hpp"
+
 #include "intel_gpu/primitives/data.hpp"
 #include "intel_gpu/runtime/debug_configuration.hpp"
+#include "moe_offload_constant.hpp"
+
+#include <algorithm>
+#include <cctype>
+#include <cstdlib>
+#include <cstring>
+#include <string>
 
 namespace ov::intel_gpu {
 
@@ -100,8 +111,17 @@ static void create_data(ProgramBuilder& p, const ov::Shape& const_shape, const s
         p.primitive_ids[initialconstPrimID] = constPrimID;
         p.profiling_ids.push_back(initialconstPrimID);
     } else {
+        auto partial_upload = moe_offload::try_prepare_partial_upload(p, op, const_shape, out_dtype, constFormat, constLayout);
+
         cldnn::memory::ptr mem = nullptr;
-        if (constLayout.bytes_count() > 0) {
+        size_t upload_bytes = constLayout.bytes_count();
+        ov::Shape upload_shape = const_shape;
+
+        if (partial_upload.enabled) {
+            mem = partial_upload.memory;
+            upload_bytes = partial_upload.upload_bytes;
+            upload_shape = partial_upload.upload_shape;
+        } else if (constLayout.bytes_count() > 0) {
             mem = p.get_engine().allocate_memory(constLayout, false);
         } else {
             // In the case of empty const data with {0} shape, it has zero byte.
@@ -114,37 +134,41 @@ static void create_data(ProgramBuilder& p, const ov::Shape& const_shape, const s
         GPU_DEBUG_LOG << "[" << initialconstPrimID << ": constant] layout: "
                         << constLayout.to_short_string() << ", mem_ptr(" << mem << ", " << mem->size() << " bytes)"<< std::endl;
         auto& stream = p.get_engine().get_service_stream();
-        cldnn::mem_lock<char> lock{mem, stream};
-        auto buf = lock.data();
-        auto bufSize = constLayout.bytes_count();
-
-        // If a constant has element type f64 but contains no elements (empty tensor),
-        // convert it to f32 because the GPU plugin only supports the f32 data type internally.
-        if (ov::shape_size(const_shape) == 1 &&
-            out_dtype == cldnn::data_types::f32 &&
-            op->get_output_element_type(0) == ov::element::f64) {
-            const auto* f64data = op->get_data_ptr<double>();
-            auto f32buf = reinterpret_cast<float*>(buf);
-            f32buf[0] = static_cast<float>(f64data[0]);
-        } else if (out_dtype == cldnn::data_types::f32 &&
-                   (op->get_output_element_type(0) == ov::element::u16 ||
-                    op->get_output_element_type(0) == ov::element::i16)) {
-            size_t count = ov::shape_size(const_shape);
-            auto f32buf = reinterpret_cast<float*>(buf);
-
-            if (op->get_output_element_type(0) == ov::element::u16) {
-                const auto* u16data = op->get_data_ptr<uint16_t>();
-                for (size_t i = 0; i < count; i++) {
-                    f32buf[i] = static_cast<float>(u16data[i]);
+
+        if (!partial_upload.enabled) {
+            cldnn::mem_lock<char> lock{mem, stream};
+            auto buf = lock.data();
+            auto bufSize = upload_bytes;
+            auto upload_count = ov::shape_size(upload_shape);
+
+            // If a constant has element type f64 but contains no elements (empty tensor),
+            // convert it to f32 because the GPU plugin only supports the f32 data type internally.
+            if (upload_count == 1 &&
+                out_dtype == cldnn::data_types::f32 &&
+                op->get_output_element_type(0) == ov::element::f64) {
+                const auto* f64data = op->get_data_ptr<double>();
+                auto f32buf = reinterpret_cast<float*>(buf);
+                f32buf[0] = static_cast<float>(f64data[0]);
+            } else if (out_dtype == cldnn::data_types::f32 &&
+                       (op->get_output_element_type(0) == ov::element::u16 ||
+                        op->get_output_element_type(0) == ov::element::i16)) {
+                size_t count = upload_count;
+                auto f32buf = reinterpret_cast<float*>(buf);
+
+                if (op->get_output_element_type(0) == ov::element::u16) {
+                    const auto* u16data = op->get_data_ptr<uint16_t>();
+                    for (size_t i = 0; i < count; i++) {
+                        f32buf[i] = static_cast<float>(u16data[i]);
+                    }
+                } else {
+                    const auto* i16data = op->get_data_ptr<int16_t>();
+                    for (size_t i = 0; i < count; i++) {
+                        f32buf[i] = static_cast<float>(i16data[i]);
+                    }
                 }
             } else {
-                const auto* i16data = op->get_data_ptr<int16_t>();
-                for (size_t i = 0; i < count; i++) {
-                    f32buf[i] = static_cast<float>(i16data[i]);
-                }
+                std::memcpy(&buf[0], &data[0], bufSize);
             }
-        } else {
-            std::memcpy(&buf[0], &data[0], bufSize);
         }
         ov::wsh::Extension::hint_evict(*op);
         p.add_primitive(*op, cldnn::data(initialconstPrimID, mem));
diff --git a/src/plugins/intel_gpu/src/plugin/ops/moe.cpp b/src/plugins/intel_gpu/src/plugin/ops/moe.cpp
index b7b4dcd1d2d9b6..d001709ce5af2e 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/moe.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/moe.cpp
@@ -2,13 +2,22 @@
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "openvino/op/moe.hpp"
+#include "transformations/rt_info/fused_names_attribute.hpp"
 
 #include <intel_gpu/primitives/eltwise.hpp>
 #include <intel_gpu/primitives/moe_gather.hpp>
 #include <intel_gpu/primitives/moe_scatter_reduction.hpp>
 #include <intel_gpu/primitives/swiglu.hpp>
+
+#include <array>
+#include <cstdlib>
+#include <filesystem>
 #include <limits>
+#include <set>
+#include <sstream>
+#include <unordered_map>
 
+#include <pugixml.hpp>
 #include "ov_ops/moe_compressed.hpp"
 #include "intel_gpu/plugin/program_builder.hpp"
 #include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
@@ -17,6 +26,8 @@
 #include "intel_gpu/primitives/moe_gemm.hpp"
 #include "intel_gpu/primitives/moe_mask_gen.hpp"
 #include "openvino/op/constant.hpp"
+#include "openvino/core/model.hpp"
+#include "openvino/core/weight_sharing_util.hpp"
 
 namespace ov {
 namespace op {
@@ -30,8 +41,346 @@ namespace ov::intel_gpu {
 using namespace cldnn;
 
 static void CreateMOE3GemmFusedCompressedOp(ProgramBuilder& p, const std::shared_ptr<ov::intel_gpu::op::MOE3GemmFusedCompressed>& op) {
+    using input_idx = cldnn::moe_3gemm_fused_compressed::input_index;
     auto inputs = p.GetInputInfo(op);
     const auto& config = op->get_config();
+    const auto& model = p.get_model();
+    std::string weights_path;
+    const size_t otd_ratio = p.get_config().get_moe_offload_ratio();
+    const size_t lru_expert_num = otd_ratio > 0 ? std::max<size_t>(1, static_cast<size_t>(config.num_expert) * otd_ratio / 100) : 0;
+    const bool otd_enabled = lru_expert_num > 0;
+    if (otd_enabled) {
+        const auto& rt = model->get_rt_info();
+        auto it = rt.find("__weights_path");
+        OPENVINO_ASSERT(it != rt.end(), "Model rt_info is missing '__weights_path' required by OTD");
+        weights_path = it->second.as<std::string>();
+    }
+
+    struct XmlConstEntry {  // one <layer type="Const"> record read from the IR xml
+        size_t offset = 0;  // value of the layer's data "offset" attribute
+        size_t size = 0;    // value of the layer's data "size" attribute
+        bool used = false;  // set once the entry has been assigned to a MOE constant input
+    };
+    // Lookup table filled lazily by load_const_offsets_from_xml(); several entries may share one layer name.
+    std::unordered_map<std::string, std::vector<XmlConstEntry>> xml_const_entries_by_name;
+    bool xml_offsets_ready = false;  // true after the xml has been parsed once
+    // Parses the IR xml that sits next to the weights file and records offset/size of every Const layer.
+    auto load_const_offsets_from_xml = [&]() {
+        if (xml_offsets_ready || weights_path.empty()) {  // already parsed, or no weights path to derive the xml from
+            return;
+        }
+
+        std::filesystem::path xml_path(weights_path);
+        xml_path.replace_extension(".xml");  // same path as the weights file, with an .xml extension
+        OPENVINO_ASSERT(std::filesystem::exists(xml_path), "IR xml file is not found: ", xml_path.string());
+
+        pugi::xml_document doc;
+        OPENVINO_ASSERT(doc.load_file(xml_path.string().c_str()), "Failed to parse IR xml file: ", xml_path.string());
+
+        auto net = doc.child("net");
+        auto layers = net.child("layers");
+        for (auto layer = layers.child("layer"); layer; layer = layer.next_sibling("layer")) {
+            const auto type_attr = layer.attribute("type");
+            if (!type_attr || std::string(type_attr.value()) != "Const") {  // only Const layers are of interest
+                continue;
+            }
+
+            const auto data = layer.child("data");
+            const auto name_attr = layer.attribute("name");
+            const auto offset_attr = data.attribute("offset");
+            const auto size_attr = data.attribute("size");
+            if (!data || !name_attr || !offset_attr || !size_attr) {  // silently skip Const layers without complete location info
+                continue;
+            }
+
+            XmlConstEntry entry;
+            try {
+                entry.offset = static_cast<size_t>(std::stoull(offset_attr.value()));
+                entry.size = static_cast<size_t>(std::stoull(size_attr.value()));
+            } catch (const std::exception& e) {  // std::stoull failure is rethrown with the offending attribute values
+                OPENVINO_THROW("Failed to parse MOE weight offset/size from XML attribute: ", e.what(),
+                               " (name=", name_attr.value(), ", offset='", offset_attr.value(),
+                               "', size='", size_attr.value(), "')");
+            }
+            xml_const_entries_by_name[name_attr.value()].push_back(entry);
+        }
+
+        xml_offsets_ready = true;  // marked ready even when no Const layer was found
+    };
+
+    // Extract a layer-scoping pattern from the MOE op name (e.g., "layers.0.experts")
+    // to disambiguate same-sized constants across different layers.
+    auto extract_layer_pattern = [](const std::string& moe_name) -> std::string {
+        // Returns the substring running from "layers." through the first following ".experts"
+        // (or, failing that, "/experts"); returns an empty string when either part is missing.
+        constexpr size_t experts_token_len = 8;  // length of both ".experts" and "/experts"
+
+        const auto start = moe_name.find("layers.");
+        if (start == std::string::npos) {
+            return std::string{};
+        }
+
+        auto stop = moe_name.find(".experts", start);
+        if (stop == std::string::npos) {
+            stop = moe_name.find("/experts", start);
+        }
+        if (stop == std::string::npos) {
+            return std::string{};
+        }
+
+        return moe_name.substr(start, stop - start + experts_token_len);
+    };
+
+    // Projection-identifying keywords for each offset slot (3 projections × {weight, scale, zp}).
+    // Slot layout: [weight_0, weight_1, weight_2, scale_0, scale_1, scale_2, zp_0, zp_1, zp_2]
+    // Projection 0: gate (VariadicSplit.0), Projection 1: up (VariadicSplit.1), Projection 2: down (down_proj)
+    // Name fragments used to recognise which XML constant belongs to a given offset slot.
+    struct ProjHint {
+        std::vector<std::string> patterns;   // candidate substrings in the XML name
+        std::vector<std::string> suffixes;   // suffix patterns (e.g., "/scale", "/zero_point")
+    };
+    auto get_proj_hint = [](size_t offset_slot) -> ProjHint {
+        ProjHint hint;
+
+        // The projection repeats every three slots: gate, up, down.
+        switch (offset_slot % 3) {
+        case 0:
+            hint.patterns = {"VariadicSplit.0", "gate_proj", "gate"};
+            break;
+        case 1:
+            hint.patterns = {"VariadicSplit.1", "up_proj", "up"};
+            break;
+        default:
+            hint.patterns = {"down_proj", "VariadicSplit.2"};
+            break;
+        }
+
+        // Slots 0-2 are weights (no suffix), 3-5 are scales, 6 and above are zero points.
+        if (offset_slot >= 6) {
+            hint.suffixes = {"/zero_point"};
+        } else if (offset_slot >= 3) {
+            hint.suffixes = {"/scale"};
+        }
+        return hint;
+    };
+
+    auto get_const_offset = [&](size_t index, size_t offset_slot) -> size_t {  // resolves the weights-file offset of constant input `index`; throws when it cannot be determined unambiguously
+        auto node = op->input_value(index).get_node_shared_ptr();
+        auto const_op = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
+        OPENVINO_ASSERT(const_op != nullptr, "Expected constant input for MOE3GemmFusedCompressed");
+        const auto& rt_info = const_op->get_rt_info();
+        auto attr_it = rt_info.find(ov::WeightlessCacheAttribute::get_type_info_static());
+
+        if (attr_it != rt_info.end()) {  // 1st choice: offset recorded in the WeightlessCacheAttribute
+            return attr_it->second.as<ov::WeightlessCacheAttribute>().bin_offset;
+        }
+
+        // Try buffer descriptor offset (works when constant data is mmap'd from bin file).
+        auto source_buf = ov::weight_sharing::Extension::get_constant_source_buffer(*const_op);
+        if (source_buf) {
+            return ov::weight_sharing::Extension::get_constant_id(*const_op);  // NOTE(review): the constant id is returned as the offset - confirm it is the byte offset in the bin file
+        }
+
+        load_const_offsets_from_xml();  // 3rd choice: look the constant up in the IR xml
+        OPENVINO_ASSERT(xml_offsets_ready,
+                        "Missing WeightlessCacheAttribute and failed to initialize xml-based offset lookup for "
+                        "MOE3GemmFusedCompressed constant input");
+        // Helper: resolves by exact layer name; succeeds only when exactly one unused entry of that name has the expected size.
+        auto resolve_from_name = [&](const std::string& lookup_name,
+                                     const std::string& const_name,
+                                     size_t expected_size,
+                                     size_t& resolved_offset) -> bool {
+            auto by_name_it = xml_const_entries_by_name.find(lookup_name);
+            if (by_name_it == xml_const_entries_by_name.end()) {
+                return false;
+            }
+
+            size_t match_count = 0;
+            XmlConstEntry* matched_entry = nullptr;  // first matching entry; only used when it is the sole match
+            for (auto& entry : by_name_it->second) {
+                if (!entry.used && entry.size == expected_size) {
+                    match_count++;
+                    if (matched_entry == nullptr) {
+                        matched_entry = &entry;
+                    }
+                }
+            }
+
+            if (match_count == 1 && matched_entry != nullptr) {
+                matched_entry->used = true;  // consume the entry so it cannot be assigned to another input
+                resolved_offset = matched_entry->offset;
+                return true;
+            }
+
+            if (match_count > 1) {  // same name and size more than once: refuse to guess
+                OPENVINO_THROW("Ambiguous xml offset resolution for MOE3GemmFusedCompressed constant input: ",
+                               const_name,
+                               ", lookup_name=", lookup_name,
+                               ", byte_size=", expected_size,
+                               ", candidates=", match_count);
+            }
+
+            return false;
+        };
+
+        const auto& name = const_op->get_friendly_name();
+        const size_t expected_size = const_op->get_byte_size();
+        size_t resolved_offset = 0;
+        if (resolve_from_name(name, name, expected_size, resolved_offset)) {  // lookup by the constant's friendly name
+            return resolved_offset;
+        }
+
+        // Try original/fused names before using any size-based fallback.
+        std::set<std::string> fused_names_unique;
+        for (const auto& fused_name : ov::getFusedNamesVector(const_op)) {
+            if (!fused_name.empty() && fused_name != name) {
+                fused_names_unique.insert(fused_name);
+            }
+        }
+        for (const auto& fused_name : fused_names_unique) {
+            if (resolve_from_name(fused_name, name, expected_size, resolved_offset)) {
+                return resolved_offset;
+            }
+        }
+
+        // Fallback: allow by-size only when there is exactly one unused candidate.
+        struct SizeCandidate {
+            std::string name;
+            XmlConstEntry* entry = nullptr;  // points into xml_const_entries_by_name
+        };
+        std::vector<SizeCandidate> size_candidates;
+        for (auto& kv : xml_const_entries_by_name) {
+            for (auto& entry : kv.second) {
+                if (!entry.used && entry.size == expected_size) {
+                    size_candidates.push_back(SizeCandidate{kv.first, &entry});
+                }
+            }
+        }
+
+        if (size_candidates.size() == 1 && size_candidates[0].entry != nullptr) {
+            size_candidates[0].entry->used = true;
+            return size_candidates[0].entry->offset;
+        }
+
+        // Layer+projection scoped resolution: use the MOE node name to identify the layer,
+        // and the offset slot to identify the projection role.
+        if (size_candidates.size() > 1) {
+            const auto& moe_name = op->get_friendly_name();
+            std::string layer_pat = extract_layer_pattern(moe_name);
+            ProjHint hint = get_proj_hint(offset_slot);
+
+            // Filter candidates by layer pattern
+            std::vector<SizeCandidate> layer_filtered;
+            if (!layer_pat.empty()) {
+                for (auto& sc : size_candidates) {
+                    if (sc.name.find(layer_pat) != std::string::npos) {
+                        layer_filtered.push_back(sc);
+                    }
+                }
+            }
+
+            // If layer filtering narrowed it to one, use it
+            if (layer_filtered.size() == 1 && layer_filtered[0].entry != nullptr) {
+                layer_filtered[0].entry->used = true;
+                return layer_filtered[0].entry->offset;
+            }
+
+            // Further filter by projection hint patterns
+            auto& search_pool = layer_filtered.empty() ? size_candidates : layer_filtered;  // falls back to all size matches when the layer filter found nothing
+            for (const auto& pat : hint.patterns) {  // patterns are tried in order; the first one yielding a unique match wins
+                std::vector<SizeCandidate> proj_filtered;
+                for (auto& sc : search_pool) {
+                    if (sc.name.find(pat) != std::string::npos) {
+                        proj_filtered.push_back(sc);
+                    }
+                }
+                if (proj_filtered.size() == 1 && proj_filtered[0].entry != nullptr) {
+                    proj_filtered[0].entry->used = true;
+                    return proj_filtered[0].entry->offset;
+                }
+                // If pattern+suffix narrows further
+                if (proj_filtered.size() > 1 && !hint.suffixes.empty()) {
+                    for (const auto& suffix : hint.suffixes) {
+                        std::vector<SizeCandidate> suffix_filtered;
+                        for (auto& sc : proj_filtered) {
+                            // Check if name ends with the suffix
+                            if (sc.name.size() >= suffix.size() &&
+                                sc.name.compare(sc.name.size() - suffix.size(), suffix.size(), suffix) == 0) {
+                                suffix_filtered.push_back(sc);
+                            }
+                        }
+                        if (suffix_filtered.size() == 1 && suffix_filtered[0].entry != nullptr) {
+                            suffix_filtered[0].entry->used = true;
+                            return suffix_filtered[0].entry->offset;
+                        }
+                    }
+                }
+            }
+
+            // Also try suffix-only filtering (for weight slots where suffix is empty,
+            // the weight is the entry that does NOT end with /scale or /zero_point)
+            if (offset_slot < 3 && !search_pool.empty()) {
+                std::vector<SizeCandidate> weight_filtered;
+                for (auto& sc : search_pool) {
+                    bool is_scale_or_zp = (sc.name.find("/scale") != std::string::npos) ||
+                                          (sc.name.find("/zero_point") != std::string::npos);  // substring match anywhere in the name, not only at the end
+                    if (!is_scale_or_zp) {
+                        weight_filtered.push_back(sc);
+                    }
+                }
+                if (weight_filtered.size() == 1 && weight_filtered[0].entry != nullptr) {
+                    weight_filtered[0].entry->used = true;
+                    return weight_filtered[0].entry->offset;
+                }
+                // Further filter weights by projection pattern
+                for (const auto& pat : hint.patterns) {
+                    std::vector<SizeCandidate> proj_wt_filtered;
+                    for (auto& sc : weight_filtered) {
+                        if (sc.name.find(pat) != std::string::npos) {
+                            proj_wt_filtered.push_back(sc);
+                        }
+                    }
+                    if (proj_wt_filtered.size() == 1 && proj_wt_filtered[0].entry != nullptr) {
+                        proj_wt_filtered[0].entry->used = true;
+                        return proj_wt_filtered[0].entry->offset;
+                    }
+                }
+            }
+            // Nothing narrowed the candidates down to one: report the ambiguity with a few sample candidates.
+            std::ostringstream oss;
+            const size_t max_candidates_to_log = 8;
+            for (size_t i = 0; i < std::min(max_candidates_to_log, size_candidates.size()); i++) {
+                const auto* candidate_entry = size_candidates[i].entry;
+                if (candidate_entry == nullptr) {
+                    continue;
+                }
+                if (i > 0) {
+                    oss << ';';
+                }
+                oss << size_candidates[i].name << '@' << candidate_entry->offset;
+            }
+
+            OPENVINO_THROW("Ambiguous xml offset resolution for MOE3GemmFusedCompressed constant input: ",
+                           name,
+                           ", byte_size=", expected_size,
+                           ", size_candidates=", size_candidates.size(),
+                           ", layer_pat=", layer_pat,
+                           ", offset_slot=", offset_slot,
+                           ", moe_name=", moe_name,
+                           ", sample_candidates=", oss.str());
+        }
+        // Reached only when no unused xml entry has the expected byte size.
+        OPENVINO_THROW("Unable to resolve xml offset for MOE3GemmFusedCompressed constant input: ", name,
+                       ", byte_size=", expected_size);
+    };
+
+    const std::array<size_t, cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count> const_input_idx_by_offset = {
+        static_cast<size_t>(input_idx::weight_0),
+        static_cast<size_t>(input_idx::weight_1),
+        static_cast<size_t>(input_idx::weight_2),
+        static_cast<size_t>(input_idx::scale_0),
+        static_cast<size_t>(input_idx::scale_1),
+        static_cast<size_t>(input_idx::scale_2),
+        static_cast<size_t>(input_idx::zp_0),
+        static_cast<size_t>(input_idx::zp_1),
+        static_cast<size_t>(input_idx::zp_2)
+    };
+
+    std::vector<size_t> weight_bin_offsets(cldnn::moe_3gemm_fused_compressed::serialized_weight_offset_count, 0);
+    // Serialized offsets are only needed for OTD path (weight-on-demand loading).
+    if (otd_enabled) {
+        for (size_t i = 0; i < const_input_idx_by_offset.size(); i++) {
+            weight_bin_offsets[i] = get_const_offset(const_input_idx_by_offset[i], i);
+        }
+    }
     ///   0: hidden_states - input tensor with hidden representations
     ///   1: routing_weights - [num_seq, num_experts] routing weights for all experts
     ///   2: w0_weight - expert weights for first projection,
@@ -84,7 +433,7 @@ static void CreateMOE3GemmFusedCompressedOp(ProgramBuilder& p, const std::shared
     validate_inputs_count(op, {expected_inputs});
 
     const std::string layerName = layer_type_name_ID(op);
-    const cldnn::moe_3gemm_fused_compressed moe(layerName, inputs, config);
+    const cldnn::moe_3gemm_fused_compressed moe(layerName, inputs, config, weight_bin_offsets, weights_path, lru_expert_num);
 
     p.add_primitive(*op, moe);
 }
diff --git a/src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp b/src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp
new file mode 100644
index 00000000000000..717d24e572e843
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/ops/moe_offload_constant.hpp
@@ -0,0 +1,118 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <algorithm>
+#include <atomic>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
+#include "intel_gpu/plugin/program_builder.hpp"
+#include "openvino/op/constant.hpp"
+
+namespace ov::intel_gpu::moe_offload {
+
+struct partial_upload_desc {              // value returned by try_prepare_partial_upload()
+    bool enabled{false};                  // set to true once a reduced upload buffer was prepared
+    cldnn::memory::ptr memory{nullptr};   // reduced allocation, viewed through the full constant layout
+    ov::Shape upload_shape{};             // constant shape with dim 0 clamped to the uploaded count
+    size_t upload_bytes{0};               // byte size of the layout built from upload_shape
+};
+
+// Returns true when at least one consumer of the constant's output 0 is a
+// MOE3GemmFusedCompressed node, and false otherwise (including when there are no consumers).
+inline bool is_moe_related_constant(const std::shared_ptr<ov::op::v0::Constant>& op) {
+    const auto consumers = op->get_output_target_inputs(0);
+    const auto feeds_moe = [](const auto& consumer) {
+        const auto* target = consumer.get_node();
+        return ov::is_type<ov::intel_gpu::op::MOE3GemmFusedCompressed>(target);
+    };
+    return std::any_of(consumers.begin(), consumers.end(), feeds_moe);
+}
+
+// Counts partial MoE constant uploads; prints the first max_detailed_logs events and, on destruction, a summary if any were suppressed.
+class partial_upload_log_state {
+public:
+    static constexpr size_t max_detailed_logs = 3;
+
+    // Adds one event to the running totals and prints it while the detailed-log budget lasts.
+    void log(const std::string& node_name, size_t uploaded_experts, size_t total_experts, size_t upload_bytes, size_t target_bytes) {
+        total_upload_bytes.fetch_add(static_cast<uint64_t>(upload_bytes), std::memory_order_relaxed);
+        total_target_bytes.fetch_add(static_cast<uint64_t>(target_bytes), std::memory_order_relaxed);
+        // fetch_add yields the previous count, so +1 is this event's 1-based ordinal.
+        const size_t ordinal = total_count.fetch_add(1, std::memory_order_relaxed) + 1;
+        if (ordinal > max_detailed_logs + 1) {
+            return;
+        }
+        if (ordinal == max_detailed_logs + 1) {
+            std::cout << "MOE OTD partial constant allocation: suppressing further per-constant logs, "
+                      << "final summary will be printed at process exit" << std::endl;
+            return;
+        }
+        std::cout << "MOE OTD partial constant allocation at compile stage: " << node_name
+                  << ", experts=" << uploaded_experts << "/" << total_experts
+                  << ", upload_bytes=" << upload_bytes << ", target_bytes=" << target_bytes << std::endl;
+    }
+
+    // Prints the aggregate summary only when more events were recorded than were printed in detail.
+    ~partial_upload_log_state() {
+        const size_t events = total_count.load(std::memory_order_relaxed);
+        if (events <= max_detailed_logs) {
+            return;
+        }
+        std::cout << "MOE OTD partial constant allocation summary: total=" << events
+                  << ", shown=" << max_detailed_logs << ", suppressed=" << (events - max_detailed_logs)
+                  << ", total_upload_bytes=" << total_upload_bytes.load(std::memory_order_relaxed)
+                  << ", total_target_bytes=" << total_target_bytes.load(std::memory_order_relaxed) << std::endl;
+    }
+
+private:
+    std::atomic<size_t> total_count{0};
+    std::atomic<uint64_t> total_upload_bytes{0};
+    std::atomic<uint64_t> total_target_bytes{0};
+};
+
+inline partial_upload_log_state& get_partial_upload_log_state() {
+    static partial_upload_log_state instance;  // function-local static: constructed on the first call
+    return instance;
+}
+
+// Allocates memory for a reduced number of dim-0 entries of a MoE constant (offload ratio in percent,
+// at least one entry) and returns it viewed through const_layout; returns a disabled descriptor otherwise.
+inline partial_upload_desc try_prepare_partial_upload(ProgramBuilder& p,
+                                                      const std::shared_ptr<ov::op::v0::Constant>& op,
+                                                      const ov::Shape& const_shape,
+                                                      cldnn::data_types out_dtype,
+                                                      const cldnn::format& const_format,
+                                                      const cldnn::layout& const_layout) {
+    const size_t ratio_percent = p.get_config().get_moe_offload_ratio();
+    if (ratio_percent == 0 || !is_moe_related_constant(op)) {
+        return {};
+    }
+    if (const_layout.bytes_count() == 0 || const_shape.empty() || const_shape[0] == 0) {
+        return {};
+    }
+    // Keep at least one expert and never more than the constant holds.
+    const size_t total_experts = const_shape[0];
+    const size_t kept_experts = std::min<size_t>(total_experts, std::max<size_t>(1, total_experts * ratio_percent / 100));
+    partial_upload_desc result;
+    result.enabled = true;
+    result.upload_shape = const_shape;
+    result.upload_shape[0] = kept_experts;
+
+    // The allocation uses the reduced layout; the handle handed back is reinterpreted as const_layout.
+    const auto reduced_layout = cldnn::layout(result.upload_shape, out_dtype, const_format);
+    const auto reduced_mem = p.get_engine().allocate_memory(reduced_layout, false);
+    result.memory = p.get_engine().reinterpret_buffer(*reduced_mem, const_layout);
+    result.upload_bytes = reduced_layout.bytes_count();
+
+    get_partial_upload_log_state().log(op->get_friendly_name(), kept_experts, total_experts, result.upload_bytes, const_layout.bytes_count());
+    return result;
+}
+
+}  // namespace ov::intel_gpu::moe_offload
\ No newline at end of file
diff --git a/src/plugins/intel_gpu/src/plugin/plugin.cpp b/src/plugins/intel_gpu/src/plugin/plugin.cpp
index 0e1beab36f96a6..7685bbef460990 100644
--- a/src/plugins/intel_gpu/src/plugin/plugin.cpp
+++ b/src/plugins/intel_gpu/src/plugin/plugin.cpp
@@ -189,7 +189,6 @@ std::shared_ptr<ov::Model> Plugin::clone_and_transform_model(const std::shared_p
         if (weight_path.extension() != ".bin" && !is_weightless_cache_attributes_set(cloned_model))
             set_weightless_cache_attributes(cloned_model);
     }
-
     transform_model(cloned_model, config_copy, context);
 
     // Transformations for some reason may drop output tensor names, so here we copy those from the original model
@@ -757,6 +756,7 @@ std::vector<ov::PropertyName> Plugin::get_supported_properties() const {
         ov::PropertyName{ov::cache_encryption_callbacks.name(), PropertyMutability::WO},
         ov::PropertyName{ov::hint::kv_cache_precision.name(), PropertyMutability::RW},
         ov::PropertyName{ov::hint::model.name(), PropertyMutability::WO},
+        ov::PropertyName{ov::intel_gpu::moe_offload_ratio.name(), PropertyMutability::RW},
         ov::PropertyName{ov::intel_gpu::config_file.name(), PropertyMutability::RW},
     };
 
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp
new file mode 100644
index 00000000000000..a3f9e6fb64fc42
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_lru_cache_test.cpp
@@ -0,0 +1,455 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils/test_utils.h"
+#include "intel_gpu/runtime/internal_properties.hpp"
+#include "ocl_v2/moe/LRUCache.hpp"
+
+#include <thread>
+#include <vector>
+#include <set>
+#include <atomic>
+
+using namespace cldnn;
+using namespace tests;
+
+// ──────────────────────────────────────────────────
+// Basic construction and initial state
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, initial_state) {
+    LRUCache fresh_cache(4);
+    // Nothing has been requested yet: no entries, no experts, not initialized.
+    ASSERT_EQ(fresh_cache.size(), 0U);
+    ASSERT_EQ(fresh_cache.get_total_experts(), 0U);
+    ASSERT_FALSE(fresh_cache.m_initialized);
+}
+
+// ──────────────────────────────────────────────────
+// get_lru_item: insert (miss) and hit
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, single_insert_is_miss) {
+    LRUCache lru(4);
+    // The very first request inserts the key: it is reported as a miss and lands in slot 0.
+    const auto [assigned_slot, was_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    ASSERT_FALSE(was_hit);
+    ASSERT_EQ(assigned_slot, 0U);
+    ASSERT_EQ(lru.size(), 1U);
+}
+
+TEST(moe_lru_cache, second_access_without_fill_is_miss) {
+    // Presence alone is not a hit: the slot must also have been marked filled.
+    LRUCache lru(4);
+
+    lru.get_lru_item(/*layer=*/0, /*expert=*/0);  // first request inserts into slot 0
+    const auto [slot_again, hit_again] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+
+    ASSERT_EQ(slot_again, 0U);
+    ASSERT_FALSE(hit_again);  // set_filled() was never called for slot 0
+}
+
+TEST(moe_lru_cache, access_after_set_filled_is_hit) {
+    LRUCache lru(4);
+    // Insert the key, then mark the slot it was given as filled.
+    const auto [first_slot, first_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    lru.set_filled(first_slot);
+    // The same key must now resolve to the same slot and be reported as a hit.
+    const auto [second_slot, second_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    ASSERT_EQ(second_slot, first_slot);
+    ASSERT_TRUE(second_hit);
+}
+
+// ──────────────────────────────────────────────────
+// Sequential slot allocation
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, slots_assigned_sequentially) {
+    constexpr size_t capacity = 4;
+    LRUCache lru(capacity);
+    // While free slots remain, the n-th new key lands in slot n and is reported as a miss.
+    for (size_t expert = 0; expert != capacity; ++expert) {
+        const auto [slot, hit] = lru.get_lru_item(/*layer=*/0, expert);
+        ASSERT_EQ(slot, expert);
+        ASSERT_FALSE(hit);
+    }
+    ASSERT_EQ(lru.size(), capacity);
+}
+
+// ──────────────────────────────────────────────────
+// Eviction: oldest entry evicted when cache is full
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, eviction_when_full) {
+    LRUCache lru(3);
+
+    // Occupy every slot with experts 0..2 of layer 0.
+    for (size_t expert = 0; expert < 3; ++expert) {
+        lru.get_lru_item(/*layer=*/0, expert);
+    }
+    ASSERT_EQ(lru.size(), 3U);
+
+    // A fourth key pushes out the oldest entry (expert 0) and takes over its slot.
+    const auto [reused_slot, was_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/3);
+    ASSERT_FALSE(was_hit);
+    ASSERT_EQ(reused_slot, 0U);
+    ASSERT_EQ(lru.size(), 3U);  // occupancy never exceeds capacity
+}
+
+TEST(moe_lru_cache, evicted_entry_becomes_miss) {
+    LRUCache lru(2);
+
+    // Experts 0 and 1 are inserted in turn; slots 0 and 1 are marked filled right after each insert.
+    for (size_t expert = 0; expert < 2; ++expert) {
+        lru.get_lru_item(/*layer=*/0, expert);
+        lru.set_filled(expert);
+    }
+
+    // Expert 2 displaces expert 0, the oldest entry.
+    lru.get_lru_item(/*layer=*/0, /*expert=*/2);
+
+    // Requesting expert 0 afterwards must not be reported as a hit.
+    const auto [slot, hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    ASSERT_FALSE(hit);
+}
+
+// ──────────────────────────────────────────────────
+// LRU ordering: recently accessed items survive eviction
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, lru_order_refresh_on_access) {
+    constexpr size_t capacity = 3;
+    LRUCache lru(capacity);
+
+    // Fill slots 0..2 with experts 0..2; age order (oldest first) is 0, 1, 2.
+    for (size_t expert = 0; expert < capacity; ++expert) {
+        lru.get_lru_item(/*layer=*/0, expert);
+    }
+
+    // Touching expert 0 makes it the most recent entry; age order becomes 1, 2, 0.
+    lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+
+    // The next new key therefore displaces expert 1 and inherits slot 1.
+    const auto [reused_slot, was_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/3);
+    ASSERT_FALSE(was_hit);
+    ASSERT_EQ(reused_slot, 1U);
+}
+
+TEST(moe_lru_cache, double_refresh_changes_eviction_order) {
+    LRUCache lru(3);
+
+    // Experts 0..2 occupy slots 0..2.
+    for (size_t expert = 0; expert < 3; ++expert) {
+        lru.get_lru_item(/*layer=*/0, expert);
+    }
+
+    // Touch expert 0 and then expert 1, which leaves expert 2 as the oldest entry.
+    lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    lru.get_lru_item(/*layer=*/0, /*expert=*/1);
+
+    // First new key: expert 2 is displaced, so slot 2 is handed out.
+    const auto [slot_for_3, hit_for_3] = lru.get_lru_item(/*layer=*/0, /*expert=*/3);
+    ASSERT_FALSE(hit_for_3);
+    ASSERT_EQ(slot_for_3, 2U);
+
+    // Second new key: expert 0 is now the oldest, so slot 0 is handed out.
+    const auto [slot_for_4, hit_for_4] = lru.get_lru_item(/*layer=*/0, /*expert=*/4);
+    ASSERT_FALSE(hit_for_4);
+    ASSERT_EQ(slot_for_4, 0U);
+}
+
+// ──────────────────────────────────────────────────
+// Multi-layer: (layer, expert) pairs are independent
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, multi_layer_keys_are_independent) {
+    LRUCache lru(4);
+    // Same expert index under two layers: each request is a fresh insert with its own slot.
+    const auto [layer0_slot, layer0_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    const auto [layer1_slot, layer1_hit] = lru.get_lru_item(/*layer=*/1, /*expert=*/0);
+
+    ASSERT_FALSE(layer0_hit);
+    ASSERT_FALSE(layer1_hit);
+    ASSERT_NE(layer0_slot, layer1_slot);
+    ASSERT_EQ(lru.size(), 2U);
+}
+
+TEST(moe_lru_cache, same_expert_different_layers_both_cached) {
+    LRUCache lru(4);
+
+    // Expert 5 of layer 0, then of layer 1; slots 0 and 1 are marked filled right after each insert.
+    for (size_t layer = 0; layer < 2; ++layer) {
+        lru.get_lru_item(layer, /*expert=*/5);
+        lru.set_filled(layer);
+    }
+
+    const auto [layer0_slot, layer0_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/5);
+    const auto [layer1_slot, layer1_hit] = lru.get_lru_item(/*layer=*/1, /*expert=*/5);
+    ASSERT_TRUE(layer0_hit);
+    ASSERT_TRUE(layer1_hit);
+    ASSERT_NE(layer0_slot, layer1_slot);
+}
+
+// ──────────────────────────────────────────────────
+// set_filled / filled tracking
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, filled_cleared_on_eviction) {
+    LRUCache lru(2);
+
+    // Expert 0 gets a slot and that slot is marked filled.
+    const auto [original_slot, original_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    lru.set_filled(original_slot);
+
+    // Expert 1 fills the cache; expert 2 then displaces expert 0.
+    lru.get_lru_item(/*layer=*/0, /*expert=*/1);
+    const auto [recycled_slot, recycled_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/2);
+
+    // The slot is reused, but its filled mark must not carry over to the new owner.
+    ASSERT_EQ(recycled_slot, original_slot);
+    ASSERT_FALSE(recycled_hit);
+
+    // A repeated request stays a miss until set_filled() is called for the new owner.
+    const auto [repeat_slot, repeat_hit] = lru.get_lru_item(/*layer=*/0, /*expert=*/2);
+    ASSERT_FALSE(repeat_hit);
+}
+
+TEST(moe_lru_cache, set_filled_out_of_range_is_safe) {
+    LRUCache cache(2);
+    // Must not crash. ~size_t{0} is the largest size_t; it is spelled this way because this file does not include <limits>.
+    cache.set_filled(100);
+    cache.set_filled(~size_t{0});
+}
+
+// ──────────────────────────────────────────────────
+// evict_one: explicit eviction
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, explicit_evict_reduces_size) {
+    LRUCache lru(4);
+
+    lru.get_lru_item(/*layer=*/0, /*expert=*/0);
+    lru.get_lru_item(/*layer=*/0, /*expert=*/1);
+    ASSERT_EQ(lru.size(), 2U);
+
+    // Every evict_one() call drops exactly one entry: 2 -> 1 -> 0.
+    for (const size_t remaining : {size_t{1}, size_t{0}}) {
+        lru.evict_one();
+        ASSERT_EQ(lru.size(), remaining);
+    }
+}
+
+TEST(moe_lru_cache, evict_on_empty_is_safe) {
+    LRUCache empty_cache(4);
+    // Evicting with nothing cached must not crash and must leave the cache empty.
+    empty_cache.evict_one();
+    ASSERT_EQ(empty_cache.size(), 0U);
+}
+
+// ──────────────────────────────────────────────────
+// Capacity = 1 edge case
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, capacity_one) {
+    LRUCache single_slot(1);
+
+    // First key: inserted into the only slot and reported as a miss.
+    const auto [inserted_slot, inserted_hit] = single_slot.get_lru_item(/*layer=*/0, /*expert=*/0);
+    ASSERT_EQ(inserted_slot, 0U);
+    ASSERT_FALSE(inserted_hit);
+    single_slot.set_filled(inserted_slot);
+
+    // Same key again, now filled: a hit on slot 0.
+    const auto [repeat_slot, repeat_hit] = single_slot.get_lru_item(/*layer=*/0, /*expert=*/0);
+    ASSERT_EQ(repeat_slot, 0U);
+    ASSERT_TRUE(repeat_hit);
+
+    // Different key: the sole entry is displaced and slot 0 is handed out again as a miss.
+    const auto [replaced_slot, replaced_hit] = single_slot.get_lru_item(/*layer=*/0, /*expert=*/1);
+    ASSERT_EQ(replaced_slot, 0U);
+    ASSERT_FALSE(replaced_hit);
+    ASSERT_EQ(single_slot.size(), 1U);
+}
+
+// ──────────────────────────────────────────────────
+// Stress: many inserts and evictions
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, stress_many_experts) {
+    const size_t cap = 8;
+    LRUCache cache(cap);
+    // Insert 100 unique (layer=0, expert=i) keys; mark each slot filled so residency shows up as a hit later.
+    for (size_t i = 0; i < 100; ++i) {
+        auto [slot, hit] = cache.get_lru_item(0, i);
+        ASSERT_LT(slot, cap);
+        if (i < cap) {
+            ASSERT_EQ(slot, i);
+        }
+        cache.set_filled(slot);
+    }
+    ASSERT_EQ(cache.size(), cap);
+
+    // The last `cap` keys must still be resident: each one is a hit on an in-range slot.
+    // (A key that had been evicted would come back as a miss, which slot < cap alone cannot detect.)
+    for (size_t i = 100 - cap; i < 100; ++i) {
+        auto [slot, hit] = cache.get_lru_item(0, i);
+        ASSERT_LT(slot, cap);
+        ASSERT_TRUE(hit);
+    }
+}
+
+// ──────────────────────────────────────────────────
+// Thread safety: concurrent get_lru_item calls
+// ──────────────────────────────────────────────────
+
+TEST(moe_lru_cache, concurrent_access) {
+    const size_t cap = 16;
+    LRUCache cache(cap);
+    const int num_threads = 4;
+    const int ops_per_thread = 200;
+
+    std::vector<std::thread> threads;
+    std::atomic<bool> start{false};
+    // Workers only count violations; gtest assertions are not thread-safe on every platform, so the check runs on the main thread.
+    std::atomic<size_t> out_of_range_slots{0};
+    for (int t = 0; t < num_threads; ++t) {
+        threads.emplace_back([&, t]() {
+            while (!start.load()) {}  // spin until all threads are ready
+            for (int i = 0; i < ops_per_thread; ++i) {
+                auto [slot, hit] = cache.get_lru_item(static_cast<size_t>(t), static_cast<size_t>(i % 32));
+                if (slot >= cap) {
+                    out_of_range_slots.fetch_add(1);
+                } else if (!hit) {
+                    cache.set_filled(slot);
+                }
+            }
+        });
+    }
+
+    start.store(true);
+    for (auto& th : threads) {
+        th.join();
+    }
+    ASSERT_EQ(out_of_range_slots.load(), 0U);
+    ASSERT_LE(cache.size(), cap);
+}
+
+// ──────────────────────────────────────────────────
+// Property config: moe_offload_ratio roundtrip
+// ──────────────────────────────────────────────────
+
+TEST(moe_offload_property_test, default_value_is_zero) {
+    auto untouched_config = get_test_default_config(get_test_engine());
+    ASSERT_EQ(untouched_config.get_moe_offload_ratio(), 0U);  // no ratio was set, so it reads 0
+}
+
+TEST(moe_offload_property_test, set_and_get_various_values) {
+    auto exec_config = get_test_default_config(get_test_engine());
+    // Each ratio written via set_property() must be read back exactly: 1, then 50, then 100.
+    exec_config.set_property(ov::intel_gpu::moe_offload_ratio(1));
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 1U);
+
+    exec_config.set_property(ov::intel_gpu::moe_offload_ratio(50));
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 50U);
+
+    exec_config.set_property(ov::intel_gpu::moe_offload_ratio(100));
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 100U);
+}
+
+TEST(moe_offload_property_test, set_back_to_zero_disables) {
+    auto exec_config = get_test_default_config(get_test_engine());
+    // Write a non-zero ratio first so that the reset below is observable.
+    exec_config.set_property(ov::intel_gpu::moe_offload_ratio(37));
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 37U);
+    // Writing 0 must be accepted and read back as 0.
+    exec_config.set_property(ov::intel_gpu::moe_offload_ratio(0));
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 0U);
+}
+
+// ──────────────────────────────────────────────────
+// Primitive: moe_3gemm_fused_compressed fields
+// ──────────────────────────────────────────────────
+
+#include "intel_gpu/primitives/moe_3gemm_fused_compressed.hpp"
+
+TEST(moe_offload_primitive_test, default_offload_fields) {
+    cldnn::moe_3gemm_fused_compressed default_prim;  // default-constructed: no offload data expected
+    ASSERT_TRUE(default_prim._weight_bin_offsets.empty());
+    ASSERT_TRUE(default_prim._weights_path.empty());
+    ASSERT_EQ(default_prim._lru_expert_num, 0U);
+}
+
+TEST(moe_offload_primitive_test, construct_with_offload_params) {
+    MOE3GemmFusedCompressed::Config moe_config{};
+    std::vector<size_t> bin_offsets = {0, 100, 200, 300, 400, 500, 600, 700, 800};
+    const std::string weights_file = "/path/to/weights.bin";
+    const size_t lru_experts = 16;
+    const std::vector<cldnn::input_info> prim_inputs = {cldnn::input_info("input0"), cldnn::input_info("input1")};
+    // Offsets, weights path and LRU size follow the regular (id, inputs, config) arguments.
+    cldnn::moe_3gemm_fused_compressed moe_prim("test_moe",
+                                               prim_inputs,
+                                               moe_config,
+                                               bin_offsets,
+                                               weights_file,
+                                               lru_experts);
+
+    ASSERT_EQ(moe_prim._weight_bin_offsets.size(), 9U);
+    ASSERT_EQ(moe_prim._weight_bin_offsets[0], 0U);
+    ASSERT_EQ(moe_prim._weight_bin_offsets[8], 800U);
+    ASSERT_EQ(moe_prim._weights_path, weights_file);
+    ASSERT_EQ(moe_prim._lru_expert_num, lru_experts);
+}
+
+TEST(moe_offload_primitive_test, equality_with_offload_fields) {
+    // Two primitives built from identical arguments must compare equal; changing any one of the
+    // weights path, the LRU expert count or the offset table must make them compare unequal.
+    MOE3GemmFusedCompressed::Config moe_config{};
+    const std::vector<cldnn::input_info> prim_inputs = {cldnn::input_info("input0")};
+    std::vector<size_t> base_offsets = {0, 100, 200, 300, 400, 500, 600, 700, 800};
+    std::vector<size_t> changed_offsets = {0, 100, 200, 300, 400, 500, 600, 700, 999};
+    const std::string base_path = "/path/a.bin";
+    const size_t base_lru = 16;
+
+    cldnn::moe_3gemm_fused_compressed reference("test_moe",
+                                                prim_inputs,
+                                                moe_config,
+                                                base_offsets,
+                                                base_path,
+                                                base_lru);
+
+    cldnn::moe_3gemm_fused_compressed twin("test_moe",
+                                           prim_inputs,
+                                           moe_config,
+                                           base_offsets,
+                                           base_path,
+                                           base_lru);
+
+    ASSERT_TRUE(reference == twin);
+
+    // Only the weights path differs.
+    cldnn::moe_3gemm_fused_compressed other_path("test_moe",
+                                                 prim_inputs,
+                                                 moe_config,
+                                                 base_offsets,
+                                                 "/path/b.bin",
+                                                 base_lru);
+    ASSERT_FALSE(reference == other_path);
+
+    // Only the LRU expert count differs.
+    cldnn::moe_3gemm_fused_compressed other_lru("test_moe",
+                                                prim_inputs,
+                                                moe_config,
+                                                base_offsets,
+                                                base_path,
+                                                32);
+    ASSERT_FALSE(reference == other_lru);
+
+    // Only the offset table differs (last entry 999 instead of 800).
+    cldnn::moe_3gemm_fused_compressed other_offsets("test_moe",
+                                                    prim_inputs,
+                                                    moe_config,
+                                                    changed_offsets,
+                                                    base_path,
+                                                    base_lru);
+    ASSERT_FALSE(reference == other_offsets);
+}
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp
new file mode 100644
index 00000000000000..81c84a80200dda
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/moe_offload_property_test.cpp
@@ -0,0 +1,19 @@
+// Copyright (C) 2018-2026 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "test_utils/test_utils.h"
+#include "intel_gpu/runtime/internal_properties.hpp"
+
+using namespace cldnn;
+using namespace tests;
+
+TEST(moe_offload_property_test, execution_config_roundtrip) {
+    auto exec_config = get_test_default_config(get_test_engine());
+    // Untouched configuration: the ratio reads as 0.
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 0U);
+
+    // After writing 37 the same value must be read back.
+    exec_config.set_property(ov::intel_gpu::moe_offload_ratio(37));
+    ASSERT_EQ(exec_config.get_moe_offload_ratio(), 37U);
+}
\ No newline at end of file