From 7016176c75ddf693980f471299500224a911a153 Mon Sep 17 00:00:00 2001 From: intelgaoxiong Date: Tue, 2 Jun 2026 14:50:54 -0700 Subject: [PATCH 1/2] Fixed high memory footprint with blob cache. Signed-off-by: intelgaoxiong --- .../src/plugin/npuw/compiled_model.cpp | 32 +++++++++++++------ 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 41d11381b85817..96a56971290f1e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -890,16 +890,28 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(ov::npuw::s11n::Strea host_gather.idx_idx & quant_unpack_gather.dst_idx & quant_unpack_gather.src_w_idx & quant_unpack_gather.src_z_idx & quant_unpack_gather.src_s_idx & quant_unpack_gather.idx_idx & spatial; - ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx); - ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx); - - if (stream.input()) { - if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) { - ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Dynamic); - } else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) { - ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Pyramid); - } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) { - ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA); + // Function calls share pipeline.context with their function body at runtime. + // There is no need to serialize the compiled moe/attn state for each call – + // doing so would re-import NPU blobs for every repeated layer (one per call), + // causing O(N_layers) memory growth on import. 
Only the function body + // (compiled_model is set, or replaced_by is absent) writes/reads state. + const bool is_fcall = replaced_by.has_value() && !static_cast<bool>(compiled_model); + if (!is_fcall) { + ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx); + ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx); + + if (stream.input()) { + if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) { + ov::npuw::attn::attach_runtime_behavior(pipeline, + pipeline.context, + ov::npuw::attn::BehaviorKind::Dynamic); + } else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) { + ov::npuw::attn::attach_runtime_behavior(pipeline, + pipeline.context, + ov::npuw::attn::BehaviorKind::Pyramid); + } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) { + ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA); + } } } From 2168197d4b8cef9a6d3debf4d959584456f12ab4 Mon Sep 17 00:00:00 2001 From: intelgaoxiong Date: Wed, 3 Jun 2026 22:02:32 -0700 Subject: [PATCH 2/2] [NPUW][MoE] Fix missing runtime behavior attachment on cache import When loading a compiled model from blob cache, serialize_compiled_state() restores the MoEExperts/MoEDownstream objects but attach_runtime_behavior() was never called on the import path. This left pipeline.runtime_behavior unset, causing the subgraph to fall back to default (non-MoE) execution and produce incorrect results on every second run. Expose attach_runtime_behavior() in moe_subgraph as a public API and call it directly from compiled_model.cpp after moe::serialize_compiled_state() on the read path, mirroring the existing pattern used by the attention subsystem. 
--- .../intel_npu/src/plugin/npuw/compiled_model.cpp | 10 ++++++++++ .../intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp | 10 +++++++--- .../intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp | 5 +++++ 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 96a56971290f1e..b7ec747605348d 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -911,6 +911,16 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(ov::npuw::s11n::Strea ov::npuw::attn::BehaviorKind::Pyramid); } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) { ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA); + } else if (ov::npuw::moe::get_compiled_experts(pipeline.context) != nullptr) { + ov::npuw::moe::attach_runtime_behavior(pipeline, + pipeline.context, + ov::npuw::moe::BehaviorRole::EXPERTS, + true); + } else if (ov::npuw::moe::get_compiled_downstream(pipeline.context) != nullptr) { + ov::npuw::moe::attach_runtime_behavior(pipeline, + pipeline.context, + ov::npuw::moe::BehaviorRole::DOWNSTREAM, + true); } } } diff --git a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp index 366f6e0856c183..a041bbee05ae1f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp @@ -59,14 +59,16 @@ ov::npuw::v1::subgraphs::RuntimeBehaviorFactory make_runtime_factory() { }; } -void attach_runtime_behavior(ov::npuw::v1::subgraphs::CompiledPipeline& compiled_pipeline, - ov::npuw::v1::subgraphs::Context& compiled_context, +} // namespace + +void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& compiled_pipeline, + v1::subgraphs::Context& compiled_context, const BehaviorRole 
role, const bool handles_function_prologue) { compiled_context.put(role); compiled_pipeline.registration.group = ov::npuw::patterns::moe::GPTOSSExpert::group_name(); compiled_pipeline.registration.name = behavior_name(role); - ov::npuw::v1::subgraphs::RuntimeBehaviorSpec spec; + v1::subgraphs::RuntimeBehaviorSpec spec; spec.registration = compiled_pipeline.registration; spec.context = compiled_context; spec.factory = make_runtime_factory(); @@ -74,6 +76,8 @@ void attach_runtime_behavior(ov::npuw::v1::subgraphs::CompiledPipeline& compiled compiled_pipeline.runtime_behavior = std::move(spec); } +namespace { + template T* get_compiled_state(ov::npuw::v1::subgraphs::Context& context) { auto* state = context.get_if>(); diff --git a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp index 2aa5a8b8b16065..c044c876b24ea2 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp @@ -53,6 +53,11 @@ void serialize_compiled_state(v1::subgraphs::Context& context, ov::npuw::s11n::Stream& stream, const ov::npuw::s11n::SubmodelDeserializeCtx* submodel_ctx); +void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& pipeline, + v1::subgraphs::Context& context, + BehaviorRole role, + bool handles_function_prologue); + std::vector register_patterns( ov::npuw::v1::subgraphs::PatternRegistry& registry, std::size_t moe_chunk_size);