Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,11 @@ class ProgramBuilder final {

// Returns the allow_new_shape_infer setting as reported by m_config.
bool use_new_shape_infer() const { return m_config.get_allow_new_shape_infer(); }
// Returns the m_is_inner_program flag.
bool is_inner_program() const { return m_is_inner_program; }
// Returns the queryMode flag.
// A single const-qualified accessor is sufficient: it is callable on both const and
// non-const builders, so the former non-const overload with an identical body is dropped.
bool is_query_mode() const { return queryMode; }

// Returns a copy of the m_task_executor shared pointer.
std::shared_ptr<ov::threading::IStreamsExecutor> get_task_executor() const { return m_task_executor; }
// Returns a copy of the m_compilation_context shared pointer.
std::shared_ptr<cldnn::ICompilationContext> get_compilation_context() const { return m_compilation_context; }
// Returns a copy of the m_model shared pointer.
std::shared_ptr<ov::Model> get_model() const { return m_model; }

private:
static factories_map_t factories_map;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,51 @@

#pragma once
#include <vector>
#include <string>

#include "intel_gpu/op/moe_3gemm_fused_compressed.hpp"
#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "primitive.hpp"

namespace cldnn {
using MOE3GemmFusedCompressed = ov::intel_gpu::op::MOE3GemmFusedCompressed;

/// @brief Bundle of nine memory handles, three per projection (gate / up / down).
/// @details Every handle starts out null. The _w/_s/_z suffixes presumably stand for
/// weight / scale / zero-point, mirroring the weight_N/scale_N/zp_N inputs of the
/// primitive — confirm against the code that fills this struct.
struct moe_weights {
    // gate projection
    cldnn::memory::ptr gate_w{nullptr};
    cldnn::memory::ptr gate_s{nullptr};
    cldnn::memory::ptr gate_z{nullptr};

    // up projection
    cldnn::memory::ptr up_w{nullptr};
    cldnn::memory::ptr up_s{nullptr};
    cldnn::memory::ptr up_z{nullptr};

    // down projection
    cldnn::memory::ptr down_w{nullptr};
    cldnn::memory::ptr down_s{nullptr};
    cldnn::memory::ptr down_z{nullptr};
};

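/// @brief MoE primitive that fuses three GEMMs operating on compressed weights.
/// @details Each of the three weight inputs is accompanied by its own scale and zero-point input (see input_index).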
struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compressed> {
CLDNN_DECLARE_PRIMITIVE(moe_3gemm_fused_compressed)

static constexpr size_t serialized_weight_offset_count = 9;

// Positions of the primitive's inputs. Each weight_N is immediately followed by its
// scale_N and zp_N. `count` is not an input; it is the number of enumerators before it.
enum class input_index : size_t {
    hidden_states = 0,
    routing_weights = 1,

    weight_0 = 2,
    scale_0 = 3,
    zp_0 = 4,

    weight_1 = 5,
    scale_1 = 6,
    zp_1 = 7,

    weight_2 = 8,
    scale_2 = 9,
    zp_2 = 10,

    count
};
static constexpr size_t input_count = static_cast<size_t>(input_index::count);

moe_3gemm_fused_compressed() : primitive_base("", {}) {}

// @brief Constructs moe primitive / layer.
Expand Down Expand Up @@ -70,29 +102,49 @@ struct moe_3gemm_fused_compressed : public primitive_base<moe_3gemm_fused_compre
// 22: shared_gate_gate_weight - shared expert gate weight for gating,
// shape [hidden_size]
//
// @param id                 Primitive id.
// @param inputs             Input list forwarded to primitive_base (layout described above).
// @param config             MoE configuration, copied into _config.
// @param weight_bin_offsets Copied into _weight_bin_offsets; empty by default.
//                           NOTE(review): presumably offsets into the weights file — confirm with the caller.
// @param weights_path       Copied into _weights_path; empty by default.
// @param lru_expert_num     Copied into _lru_expert_num; 0 by default.
//
// The three trailing parameters are defaulted, so existing three-argument call sites keep compiling.
moe_3gemm_fused_compressed(const primitive_id& id,
                           const std::vector<input_info>& inputs,
                           const MOE3GemmFusedCompressed::Config& config,
                           const std::vector<size_t>& weight_bin_offsets = {},
                           const std::string& weights_path = "",
                           size_t lru_expert_num = 0)
    : primitive_base(id, inputs, 1, {optional_data_type()}),
      _config(config),
      _weight_bin_offsets(weight_bin_offsets),
      _weights_path(weights_path),
      _lru_expert_num(lru_expert_num) {}

MOE3GemmFusedCompressed::Config _config;
std::vector<size_t> _weight_bin_offsets;
std::string _weights_path;
size_t _lru_expert_num = 0;

// Two primitives are equal when the common primitive parameters match and
// _config, _weight_bin_offsets, _weights_path and _lru_expert_num are all equal.
bool operator==(const primitive& rhs) const override {
    if (!compare_common_params(rhs))
        return false;

    // Bind by const reference: plain `auto` would copy the whole primitive
    // (including its vector and string members) if downcast yields a reference.
    const auto& rhs_casted = downcast<const moe_3gemm_fused_compressed>(rhs);

    // NOTE(review): memcmp compares the raw bytes of Config, padding included —
    // confirm Config has no padding / non-trivial members, otherwise compare field by field.
    return std::memcmp(&_config, &rhs_casted._config, sizeof(_config)) == 0 &&
           _weight_bin_offsets == rhs_casted._weight_bin_offsets &&
           _weights_path == rhs_casted._weights_path &&
           _lru_expert_num == rhs_casted._lru_expert_num;
}

// Serializes the primitive: base-class state first, then _config as a raw byte blob,
// then _weight_bin_offsets, _weights_path and _lru_expert_num.
// The field order here must stay identical to the read order in load().
void save(BinaryOutputBuffer& ob) const override {
primitive_base<moe_3gemm_fused_compressed>::save(ob);
// _config is written as sizeof(_config) opaque bytes.
// NOTE(review): this ties the blob to the in-memory layout of Config — confirm that is intended.
ob << make_data(&_config, sizeof(_config));
ob << _weight_bin_offsets;
ob << _weights_path;
ob << _lru_expert_num;
}

// Deserializes the primitive: base-class state first, then _config as a raw byte blob,
// then _weight_bin_offsets, _weights_path and _lru_expert_num.
// The field order here must stay identical to the write order in save().
void load(BinaryInputBuffer& ib) override {
primitive_base<moe_3gemm_fused_compressed>::load(ib);
// Reads sizeof(_config) bytes straight into _config.
ib >> make_data(&_config, sizeof(_config));
ib >> _weight_bin_offsets;
ib >> _weights_path;
ib >> _lru_expert_num;
}
};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,7 @@ static constexpr Property<ImplForcingMap, PropertyMutability::RW> force_implemen
static constexpr Property<std::string, PropertyMutability::RW> config_file{"CONFIG_FILE"};
static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
static constexpr Property<size_t, PropertyMutability::RW> moe_offload_ratio{"MOE_OFFLOAD_RATIO"};
static constexpr Property<bool, PropertyMutability::RW> use_onednn{"GPU_USE_ONEDNN"};
static constexpr Property<bool, PropertyMutability::RW> use_cm{"GPU_USE_CM"};

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ OV_CONFIG_RELEASE_OPTION(ov::hint, model, nullptr, "Shared pointer to the ov::Mo
OV_CONFIG_RELEASE_OPTION(ov::internal, key_cache_quant_mode, ov::internal::CacheQuantMode::BY_CHANNEL, "AUTO or BY_CHANNEL or BY_TOKEN")
OV_CONFIG_RELEASE_OPTION(ov::internal, value_cache_quant_mode, ov::internal::CacheQuantMode::BY_TOKEN, "AUTO or BY_CHANNEL or BY_TOKEN")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, mem_pool_util_threshold, 0.5, "Minimum utilization threshold (0.0~1.0) for reusable memory in the pool")
OV_CONFIG_RELEASE_OPTION(ov::intel_gpu, moe_offload_ratio, 0, "Percentage (0-100) of MoE experts to keep resident on device for offload")
OV_CONFIG_RELEASE_OPTION(ov, enable_weightless, false, "Enable/Disable weightless blob")

OV_CONFIG_RELEASE_INTERNAL_OPTION(ov::intel_gpu, shape_predictor_settings, {10, 16 * 1024, 2, 1.1f}, "Preallocation settings")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -635,6 +635,10 @@ static void optimize_moe_gemm_decompression_parameters(moe_gemm_node& node, prog

static void optimize_moe_3gemm_fused_decompression_parameters(moe_node& node, program& p) {
auto prim = node.get_primitive();
if (prim->_lru_expert_num > 0) {
// OTD routed weights are backed by resident-size allocations; reorders would materialize full logical tensors.
return;
}
const auto& cfg = prim->_config;
// Routed-expert scales at 3/6/9 (gate/up/down); zp at +1 when has_zp.
constexpr std::array<size_t, 3> routed_scale_indices{3u, 6u, 9u};
Expand Down
65 changes: 65 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "LRUCache.hpp"

#include <cstdlib>

// Builds an empty cache with `max_total_experts` slots, all marked "not filled".
// `cb` is stored in m_on_evict.
LRUCache::LRUCache(size_t max_total_experts, EvictCallback cb)
    : m_max_total_experts(max_total_experts),
      m_total_experts(0),
      m_to_filled_lru_expert_no(0),
      m_on_evict(std::move(cb)),
      m_filled_list(max_total_experts, false) {}

// Makes `it` the last (most recently used) node of m_list.
// Nothing is done when the node is already last.
void LRUCache::move_to_end(std::list<Node>::iterator it) {
    const bool already_last = (std::next(it) == m_list.end());
    if (!already_last) {
        m_list.splice(m_list.end(), m_list, it);
    }
}

void LRUCache::evict_one_unlocked() {
if (m_list.empty())
return;

auto& oldest = m_list.front();

m_filled_list[oldest.lru_expert_no] = false;
m_to_filled_lru_expert_no = oldest.lru_expert_no;
Key key{oldest.layer, oldest.expert};
m_map.erase(key);
m_list.pop_front();
--m_total_experts;
}

// Thread-safe wrapper: takes m_mutex, then evicts the least recently used entry (if any).
void LRUCache::evict_one() {
    const std::lock_guard<std::mutex> guard(m_mutex);
    evict_one_unlocked();
}

std::pair<size_t, bool> LRUCache::get_lru_item(size_t layer, size_t expert) {
std::lock_guard<std::mutex> lock(m_mutex);

Key key{layer, expert};
auto it = m_map.find(key);
if (it == m_map.end()) {
size_t to_filled_no = 0;
if (m_total_experts >= m_max_total_experts) {
evict_one_unlocked();
to_filled_no = m_to_filled_lru_expert_no;
} else {
to_filled_no = m_total_experts;
}
m_list.push_back(Node{layer, expert, to_filled_no});
auto new_it = std::prev(m_list.end());
m_map[key] = new_it;
++m_total_experts;
return {to_filled_no, false};
} else {
move_to_end(it->second);
const bool is_hit = m_filled_list[it->second->lru_expert_no];
return {it->second->lru_expert_no, is_hit};
}
}
82 changes: 82 additions & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl_v2/moe/LRUCache.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
// Copyright (C) 2018-2026 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once
#include <functional>
#include <list>
#include <memory>
#include <mutex>
#include <unordered_map>
#include <utility>
#include <vector>

#include "intel_gpu/runtime/engine.hpp"

// Thread-safe LRU bookkeeping that maps (layer, expert) keys onto a fixed number of slots.
// The cache only tracks which key owns which slot and whether the slot has been marked
// "filled"; it does not own the slot storage itself.
class LRUCache {
public:
    // Callback type stored at construction time.
    using EvictCallback = std::function<void(size_t layer, size_t expert, void* addr, void* params)>;

    enum NodeAction { INSERT, REFRESH };

    // @param max_total_experts number of slots managed by the cache.
    // @param cb                optional eviction callback.
    LRUCache(size_t max_total_experts, EvictCallback cb = nullptr);

    // NOTE(review): declared here, but no definition is present in the LRUCache.cpp of
    // this change — confirm one exists before calling.
    NodeAction insert_or_refresh(size_t layer, size_t expert, void* addr, void* params = nullptr);

    // Returns {slot, is_hit} for (layer, expert); assigns a slot (evicting the least
    // recently used entry when full) if the key is not tracked yet.
    std::pair<size_t, bool> get_lru_item(size_t layer, size_t expert);

    // Number of entries currently tracked. Same value as size().
    size_t get_total_experts() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_total_experts;
    }

    // Evicts the least recently used entry, if any.
    void evict_one();

    // Number of entries currently tracked.
    size_t size() const {
        std::lock_guard<std::mutex> lock(m_mutex);
        return m_total_experts;
    }

    // NOTE(review): declared here, but no definition is present in the LRUCache.cpp of
    // this change — confirm one exists before calling.
    std::pair<size_t, bool> get_item(size_t layer, size_t expert);

    // Marks slot `lru_expert_no` as filled. Out-of-range slot numbers are ignored.
    void set_filled(size_t lru_expert_no) {
        std::lock_guard<std::mutex> lock(m_mutex);
        if (lru_expert_no >= m_filled_list.size()) {
            return;
        }
        m_filled_list[lru_expert_no] = true;
    }

    // Public flag, not touched by the cache itself and not protected by m_mutex.
    bool m_initialized = false;

private:
    struct Key {
        size_t layer;
        size_t expert;
        bool operator==(const Key& other) const noexcept {
            return layer == other.layer && expert == other.expert;
        }
    };

    struct KeyHash {
        // Distinct keys can share a hash value once expert >= 131; Key::operator==
        // still tells them apart.
        std::size_t operator()(const Key& k) const noexcept {
            return std::hash<size_t>()(k.layer * 131ULL + k.expert);
        }
    };

    struct Node {
        size_t layer;
        size_t expert;
        size_t lru_expert_no;  // slot owned by this entry
    };

    // Scalar members carry default initializers so none is left indeterminate if a
    // constructor omits it (m_per_expert_size is not set by the constructor).
    size_t m_max_total_experts = 0;
    size_t m_per_expert_size = 0;
    size_t m_total_experts = 0;
    size_t m_to_filled_lru_expert_no = 0;  // slot freed by the most recent eviction
    EvictCallback m_on_evict;

    std::list<Node> m_list;            // front = least recently used, back = most recently used
    std::vector<bool> m_filled_list;   // per-slot "filled" flag
    std::unordered_map<Key, std::list<Node>::iterator, KeyHash> m_map;
    mutable std::mutex m_mutex;

    void move_to_end(std::list<Node>::iterator it);
    void evict_one_unlocked();
};
Loading
Loading