Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -890,16 +890,38 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(ov::npuw::s11n::Strea
host_gather.idx_idx & quant_unpack_gather.dst_idx & quant_unpack_gather.src_w_idx &
quant_unpack_gather.src_z_idx & quant_unpack_gather.src_s_idx & quant_unpack_gather.idx_idx & spatial;

ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx);

if (stream.input()) {
if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) {
ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Dynamic);
} else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) {
ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Pyramid);
} else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
// Function calls share pipeline.context with their function body at runtime.
// There is no need to serialize the compiled moe/attn state for each call –
// doing so would re-import NPU blobs for every repeated layer (one per call),
// causing O(N_layers) memory growth on import. Only the function body
// (compiled_model is set, or replaced_by is absent) writes/reads state.
const bool is_fcall = replaced_by.has_value() && !static_cast<bool>(compiled_model);
if (!is_fcall) {
ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx);

if (stream.input()) {
if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) {
ov::npuw::attn::attach_runtime_behavior(pipeline,
pipeline.context,
ov::npuw::attn::BehaviorKind::Dynamic);
} else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) {
ov::npuw::attn::attach_runtime_behavior(pipeline,
pipeline.context,
ov::npuw::attn::BehaviorKind::Pyramid);
} else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
} else if (ov::npuw::moe::get_compiled_experts(pipeline.context) != nullptr) {
ov::npuw::moe::attach_runtime_behavior(pipeline,
pipeline.context,
ov::npuw::moe::BehaviorRole::EXPERTS,
true);
} else if (ov::npuw::moe::get_compiled_downstream(pipeline.context) != nullptr) {
ov::npuw::moe::attach_runtime_behavior(pipeline,
pipeline.context,
ov::npuw::moe::BehaviorRole::DOWNSTREAM,
true);
}
}
}

Expand Down
10 changes: 7 additions & 3 deletions src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,21 +59,25 @@ ov::npuw::v1::subgraphs::RuntimeBehaviorFactory make_runtime_factory() {
};
}

void attach_runtime_behavior(ov::npuw::v1::subgraphs::CompiledPipeline& compiled_pipeline,
ov::npuw::v1::subgraphs::Context& compiled_context,
} // namespace

/// @brief Installs a MoE runtime behavior for the given role on a compiled pipeline.
///
/// Stores `role` in `compiled_context`, fills in the pipeline's registration
/// (group and name), then builds a RuntimeBehaviorSpec from that registration,
/// the context and make_runtime_factory(), and moves the spec into
/// `compiled_pipeline.runtime_behavior`.
///
/// @param compiled_pipeline Pipeline whose `registration.group`, `registration.name`
///                          and `runtime_behavior` are set by this call.
/// @param compiled_context Context that receives `role`; also assigned to `spec.context`.
/// @param role Role stored in the context; behavior_name(role) becomes the registration name.
/// @param handles_function_prologue Forwarded unchanged to `spec.handles_function_prologue`.
void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& compiled_pipeline,
v1::subgraphs::Context& compiled_context,
const BehaviorRole role,
const bool handles_function_prologue) {
// Store the role in the context that is later assigned to the spec.
compiled_context.put<BehaviorRole>(role);
// The registration is identified by the GPTOSSExpert pattern group plus a role-derived name.
compiled_pipeline.registration.group = ov::npuw::patterns::moe::GPTOSSExpert::group_name();
compiled_pipeline.registration.name = behavior_name(role);
// NOTE(review): the next two lines both declare `spec`; they look like the removed and added
// sides of a single diff line (fully qualified vs. shortened namespace) - confirm that only
// one of them exists in the actual file.
ov::npuw::v1::subgraphs::RuntimeBehaviorSpec spec;
v1::subgraphs::RuntimeBehaviorSpec spec;
spec.registration = compiled_pipeline.registration;
spec.context = compiled_context;
spec.factory = make_runtime_factory();
spec.handles_function_prologue = handles_function_prologue;
// Hand the fully populated spec over to the pipeline.
compiled_pipeline.runtime_behavior = std::move(spec);
}

namespace {

template <typename T>
T* get_compiled_state(ov::npuw::v1::subgraphs::Context& context) {
auto* state = context.get_if<std::shared_ptr<T>>();
Expand Down
5 changes: 5 additions & 0 deletions src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,11 @@ void serialize_compiled_state(v1::subgraphs::Context& context,
ov::npuw::s11n::Stream& stream,
const ov::npuw::s11n::SubmodelDeserializeCtx* submodel_ctx);

/// @brief Attaches the MoE runtime behavior for `role` to `pipeline`.
///
/// Per the definition in moe_subgraph.cpp: stores `role` in `context`, sets the
/// pipeline's registration group and name, and installs a RuntimeBehaviorSpec
/// (registration, context, runtime factory, prologue flag) as
/// `pipeline.runtime_behavior`.
///
/// @param pipeline Compiled pipeline that receives the registration and runtime behavior.
/// @param context Context that receives the role and is assigned to the spec.
/// @param role MoE behavior role to attach.
/// @param handles_function_prologue Value copied into the spec's `handles_function_prologue` field.
void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& pipeline,
v1::subgraphs::Context& context,
BehaviorRole role,
bool handles_function_prologue);

/// Declaration only; the definition is not visible in this view.
/// NOTE(review): presumably registers the MoE subgraph patterns in `registry` and returns
/// the resulting scoped registrations, with `moe_chunk_size` configuring them - confirm
/// against the definition before relying on this description.
std::vector<ov::npuw::v1::subgraphs::ScopedPatternRegistration> register_patterns(
ov::npuw::v1::subgraphs::PatternRegistry& registry,
std::size_t moe_chunk_size);
Expand Down
Loading