openvinotoolkit · intelgaoxiong · Jun 2, 2026 · Jun 4, 2026
@@ -890,16 +890,38 @@ void ov::npuw::CompiledModel::CompiledModelDesc::serialize(ov::npuw::s11n::Strea
         host_gather.idx_idx & quant_unpack_gather.dst_idx & quant_unpack_gather.src_w_idx &
         quant_unpack_gather.src_z_idx & quant_unpack_gather.src_s_idx & quant_unpack_gather.idx_idx & spatial;
 
-    ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
-    ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
-
-    if (stream.input()) {
-        if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) {
-            ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Dynamic);
-        } else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) {
-            ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::Pyramid);
-        } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
-            ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
+    // Function calls share pipeline.context with their function body at runtime,
+    // so the compiled moe/attn state is not serialized per call: doing so would
+    // re-import NPU blobs for every repeated layer (one per call), causing
+    // O(N_layers) memory growth on import. State is written/read (and runtime behavior
+    // re-attached on import) only when replaced_by is absent or compiled_model is set.
+    const bool is_fcall = replaced_by.has_value() && !static_cast<bool>(compiled_model);
+    if (!is_fcall) {
+        ov::npuw::moe::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
+        ov::npuw::attn::serialize_compiled_state(pipeline.context, stream, submodel_ctx);
+
+        if (stream.input()) {
+            if (ov::npuw::attn::get_compiled_dynamic(pipeline.context) != nullptr) {
+                ov::npuw::attn::attach_runtime_behavior(pipeline,
+                                                        pipeline.context,
+                                                        ov::npuw::attn::BehaviorKind::Dynamic);
+            } else if (ov::npuw::attn::get_compiled_pyramid(pipeline.context) != nullptr) {
+                ov::npuw::attn::attach_runtime_behavior(pipeline,
+                                                        pipeline.context,
+                                                        ov::npuw::attn::BehaviorKind::Pyramid);
+            } else if (ov::npuw::attn::get_compiled_hfa(pipeline.context) != nullptr) {
+                ov::npuw::attn::attach_runtime_behavior(pipeline, pipeline.context, ov::npuw::attn::BehaviorKind::HFA);
+            } else if (ov::npuw::moe::get_compiled_experts(pipeline.context) != nullptr) {
+                ov::npuw::moe::attach_runtime_behavior(pipeline,
+                                                       pipeline.context,
+                                                       ov::npuw::moe::BehaviorRole::EXPERTS,
+                                                       true);
+            } else if (ov::npuw::moe::get_compiled_downstream(pipeline.context) != nullptr) {
+                ov::npuw::moe::attach_runtime_behavior(pipeline,
+                                                       pipeline.context,
+                                                       ov::npuw::moe::BehaviorRole::DOWNSTREAM,
+                                                       true);
+            }
         }
     }
 

@@ -59,21 +59,25 @@ ov::npuw::v1::subgraphs::RuntimeBehaviorFactory make_runtime_factory() {
     };
 }
 
-void attach_runtime_behavior(ov::npuw::v1::subgraphs::CompiledPipeline& compiled_pipeline,
-                             ov::npuw::v1::subgraphs::Context& compiled_context,
+}  // namespace
+
+void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& compiled_pipeline,  // receives registration and spec
+                             v1::subgraphs::Context& compiled_context,  // stores role; assigned to spec.context
                              const BehaviorRole role,
                              const bool handles_function_prologue) {
     compiled_context.put<BehaviorRole>(role);
     compiled_pipeline.registration.group = ov::npuw::patterns::moe::GPTOSSExpert::group_name();
     compiled_pipeline.registration.name = behavior_name(role);
-    ov::npuw::v1::subgraphs::RuntimeBehaviorSpec spec;
+    v1::subgraphs::RuntimeBehaviorSpec spec;  // populated below, then moved into compiled_pipeline.runtime_behavior
     spec.registration = compiled_pipeline.registration;
     spec.context = compiled_context;
     spec.factory = make_runtime_factory();
     spec.handles_function_prologue = handles_function_prologue;
     compiled_pipeline.runtime_behavior = std::move(spec);
 }
 
+namespace {
+
 template <typename T>
 T* get_compiled_state(ov::npuw::v1::subgraphs::Context& context) {
     auto* state = context.get_if<std::shared_ptr<T>>();

@@ -53,6 +53,11 @@ void serialize_compiled_state(v1::subgraphs::Context& context,
                               ov::npuw::s11n::Stream& stream,
                               const ov::npuw::s11n::SubmodelDeserializeCtx* submodel_ctx);
 
+void attach_runtime_behavior(v1::subgraphs::CompiledPipeline& pipeline,  // receives registration and behavior spec
+                             v1::subgraphs::Context& context,            // role is recorded in this context
+                             BehaviorRole role,                          // selects the registered behavior name
+                             bool handles_function_prologue);            // forwarded into the behavior spec
+
 std::vector<ov::npuw::v1::subgraphs::ScopedPatternRegistration> register_patterns(
     ov::npuw::v1::subgraphs::PatternRegistry& registry,
     std::size_t moe_chunk_size);