From 7016176c75ddf693980f471299500224a911a153 Mon Sep 17 00:00:00 2001
From: intelgaoxiong <xiong.gao@intel.com>
Date: Tue, 2 Jun 2026 14:50:54 -0700
Subject: [PATCH 1/2] Fixed high memory footprint with blob cache.

Signed-off-by: intelgaoxiong <xiong.gao@intel.com>
---
 .../src/plugin/npuw/compiled_model.cpp        | 32 +++++++++++++------
 1 file changed, 22 insertions(+), 10 deletions(-)
diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 41d11381b85817..96a56971290f1e 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -890,16 +890,28 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(ov::npuw::s11n::Strea
         host_gather.idx_idx & quant_unpack_gather.dst_idx & quant_unpack_gather.src_w_idx &
         quant_unpack_gather.src_z_idx & quant_unpack_gather.src_s_idx & quant_unpack_gather.idx_idx & spatial;
 
-    ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
-    ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
-
-    if (stream.input()) {
-        if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) {
-            ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Dynamic);
-        } else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) {
-            ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Pyramid);
-        } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
-            ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
+    // Function calls share pipeline.context with their function body at runtime.
+    // There is no need to serialize the compiled moe/attn state for each call –
+    // doing so would re-import NPU blobs for every repeated layer (one per call),
+    // causing O(N_layers) memory growth on import.  Only the function body
+    // (compiled_model is set, or replaced_by is absent) writes/reads state.
+    const bool is_fcall = replaced_by.has_value() && !static_cast<bool>(compiled_model);
+    if (!is_fcall) {
+        ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
+        ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
+
+        if (stream.input()) {
+            if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) {
+                ov::npuw::attn::attach_runtime_behavior(pipeline,
+                                                        pipeline.context,
+                                                        ov::npuw::attn::BehaviorKind::Dynamic);
+            } else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) {
+                ov::npuw::attn::attach_runtime_behavior(pipeline,
+                                                        pipeline.context,
+                                                        ov::npuw::attn::BehaviorKind::Pyramid);
+            } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
+                ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
+            }
         }
     }
 

From 2168197d4b8cef9a6d3debf4d959584456f12ab4 Mon Sep 17 00:00:00 2001
From: intelgaoxiong <xiong.gao@intel.com>
Date: Wed, 3 Jun 2026 22:02:32 -0700
Subject: [PATCH 2/2] [NPUW][MoE] Fix missing runtime behavior attachment on
 cache import

When loading a compiled model from blob cache,
serialize_compiled_state() restores the MoEExperts/MoEDownstream
objects, but attach_runtime_behavior() was never called on the import
path. This left pipeline.runtime_behavior unset, causing the subgraph
to fall back to default (non-MoE) execution and produce incorrect
results on the second and every subsequent run, i.e. whenever the
model is imported from the blob cache instead of being compiled.

Expose attach_runtime_behavior() in moe_subgraph as a public API and call
it directly from compiled_model.cpp after moe::serialize_compiled_state()
on the read path, mirroring the existing pattern used by the attention
subsystem.
---
 .../intel_npu/src/plugin/npuw/compiled_model.cpp       | 10 ++++++++++
 .../intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp     | 10 +++++++---
 .../intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp     |  5 +++++
 3 files changed, 22 insertions(+), 3 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
index 96a56971290f1e..b7ec747605348d 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp
@@ -911,6 +911,16 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(ov::npuw::s11n::Strea
                                                         ov::npuw::attn::BehaviorKind::Pyramid);
             } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
                 ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
+            } else if (ov::npuw::moe::get_compiled_experts(pipeline.context) != nullptr) {
+                ov::npuw::moe::attach_runtime_behavior(pipeline,
+                                                       pipeline.context,
+                                                       ov::npuw::moe::BehaviorRole::EXPERTS,
+                                                       true);
+            } else if (ov::npuw::moe::get_compiled_downstream(pipeline.context) != nullptr) {
+                ov::npuw::moe::attach_runtime_behavior(pipeline,
+                                                       pipeline.context,
+                                                       ov::npuw::moe::BehaviorRole::DOWNSTREAM,
+                                                       true);
             }
         }
     }
diff --git a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp
index 366f6e0856c183..a041bbee05ae1f 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.cpp
@@ -59,14 +59,16 @@ ov::npuw::v1::subgraphs::RuntimeBehaviorFactory make_runtime_factory() {
     };
 }
 
-void attach_runtime_behavior(ov::npuw::v1::subgraphs::CompiledPipeline& compiled_pipeline,
-                             ov::npuw::v1::subgraphs::Context& compiled_context,
+}  // namespace
+
+void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& compiled_pipeline,
+                             v1::subgraphs::Context& compiled_context,
                              const BehaviorRole role,
                              const bool handles_function_prologue) {
     compiled_context.put<BehaviorRole>(role);
     compiled_pipeline.registration.group = ov::npuw::patterns::moe::GPTOSSExpert::group_name();
     compiled_pipeline.registration.name = behavior_name(role);
-    ov::npuw::v1::subgraphs::RuntimeBehaviorSpec spec;
+    v1::subgraphs::RuntimeBehaviorSpec spec;
     spec.registration = compiled_pipeline.registration;
     spec.context = compiled_context;
     spec.factory = make_runtime_factory();
@@ -74,6 +76,8 @@ void attach_runtime_behavior(ov::npuw::v1::subgraphs::CompiledPipeline& compiled
     compiled_pipeline.runtime_behavior = std::move(spec);
 }
 
+namespace {
+
 template <typename T>
 T* get_compiled_state(ov::npuw::v1::subgraphs::Context& context) {
     auto* state = context.get_if<std::shared_ptr<T>>();
diff --git a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp
index 2aa5a8b8b16065..c044c876b24ea2 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/moe/moe_subgraph.hpp
@@ -53,6 +53,11 @@ void serialize_compiled_state(v1::subgraphs::Context& context,
                               ov::npuw::s11n::Stream& stream,
                               const ov::npuw::s11n::SubmodelDeserializeCtx* submodel_ctx);
 
+void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& pipeline,
+                             v1::subgraphs::Context& context,
+                             BehaviorRole role,
+                             bool handles_function_prologue);
+
 std::vector<ov::npuw::v1::subgraphs::ScopedPatternRegistration> register_patterns(
     ov::npuw::v1::subgraphs::PatternRegistry& registry,
     std::size_t moe_chunk_size);