pytorch
diff --git a/‎backends/xnnpack/runtime/XNNExecutor.h‎
Lines changed: 21 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNExecutor.h‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎backends/xnnpack/runtime/XNNPACKBackend.cpp‎
Lines changed: 61 additions & 22 deletions b/‎backends/xnnpack/runtime/XNNPACKBackend.cpp‎
Lines changed: 61 additions & 22 deletions
diff --git a/‎backends/xnnpack/runtime/XNNWeightsCache.h‎
Lines changed: 39 additions & 5 deletions b/‎backends/xnnpack/runtime/XNNWeightsCache.h‎
Lines changed: 39 additions & 5 deletions
diff --git a/‎backends/xnnpack/runtime/XNNWeightsCacheManager.cpp‎
Lines changed: 104 additions & 0 deletions b/‎backends/xnnpack/runtime/XNNWeightsCacheManager.cpp‎
Lines changed: 104 additions & 0 deletions
@@ -25,6 +25,9 @@ namespace backends {
 namespace xnnpack {
 namespace delegate {
 
+// Forward-declared to keep XNNWeightsCache.h out of this header.
+class XNNWeightsCache;
+
 class XNNExecutor {
  private:
   std::unique_ptr<xnn_runtime, decltype(&xnn_delete_runtime)> runtime_{
@@ -37,6 +40,10 @@ class XNNExecutor {
   std::vector<xnn_external_value> externals_;
   std::vector<std::string> packed_data_names_;
   std::shared_ptr<XNNWorkspace> workspace_;
+  // Owned so the cache outlives delete_packed_data in destroy(),
+  // even when every other executor sharing it is gone. Empty when the
+  // weights cache is disabled for this executor (init never sets it).
+  std::shared_ptr<XNNWeightsCache> weights_cache_;
   std::atomic<bool> in_use_{false};
   std::atomic<bool> destroyed_{false};
 
@@ -71,6 +78,20 @@ class XNNExecutor {
     return workspace_;
   }
 
+  // Set once by XNNPACKBackend::init after compileModel succeeds; the
+  // executor then shares ownership of `cache`. init skips this call when
+  // the weights cache is disabled, which leaves weights_cache_ empty.
+  inline void set_weights_cache(std::shared_ptr<XNNWeightsCache> cache) {
+    weights_cache_ = std::move(cache);
+  }
+
+  // Returns a copy of the weights cache shared_ptr (may be empty), so the
+  // caller co-owns the instance. XNNPACKBackend::execute and ::destroy use
+  // it to lock the cache's mutex; destroy() also calls delete_packed_data.
+  inline std::shared_ptr<XNNWeightsCache> get_weights_cache() const {
+    return weights_cache_;
+  }
+
   /**
    * Initialize the XNNExecutor with a given runtime and input/output ids.
    * The input/output ids are expected to be sorted in order of their
 
@@ -91,18 +91,36 @@ class XnnpackBackend final
     auto workspace = workspace_result.get();
 
     bool use_weight_cache = options_.resolve_weight_cache(context);
-    // Hold the lock for the entire init-compile-finalize sequence to prevent
-    // concurrent inits from resetting is_finalized_ or overwriting
-    // named_data_map_ while compileModel is using the shared weights cache.
-    std::unique_lock<std::mutex> lock_weights_cache(
-        options_.weights_cache_mutex(), std::defer_lock);
+    // Per-path weights cache: acquire (or create) the shared instance
+    // for this PTE's cache file path, then hold its instance mutex for
+    // the entire init-compile sequence. Two PTEs targeting the same
+    // path get the same shared instance and serialize on its mutex;
+    // PTEs targeting different paths get independent instances and
+    // proceed in parallel (the singleton design forced full
+    // serialization here).
+    std::shared_ptr<xnnpack::delegate::XNNWeightsCache> weights_cache;
+    std::unique_lock<std::mutex> lock_weights_cache;
     if (use_weight_cache) {
-      lock_weights_cache.lock();
-
-      const auto& cache_path = options_.get_packed_cache_path();
-      options_.weights_cache().set_packed_cache_path(cache_path);
+      // Per-PTE: only use a packed cache path when this PTE opted in
+      // via runtime_spec (LoadBackendOptionsMap passed to load_method).
+      // Ignoring the backend-singleton's global path prevents a
+      // non-opt-in PTE from inheriting another model's cache file
+      // when multiple models share this backend in one process.
+      std::string cache_path;
+      auto path_spec = context.get_runtime_spec<const char*>(
+          xnnpack::packed_cache_path_option_key);
+      if (path_spec.ok()) {
+        cache_path = path_spec.get();
+      }
+      auto wc_result = options_.get_or_create_weights_cache(cache_path);
+      if (!wc_result.ok()) {
+        return wc_result.error();
+      }
+      weights_cache = wc_result.get();
+      lock_weights_cache =
+          std::unique_lock<std::mutex>(weights_cache->mutex());
 
-      options_.weights_cache().initialize_for_runtime(
+      weights_cache->initialize_for_runtime(
           context.get_runtime_allocator(), named_data_map);
       workspace->set_uses_weight_cache();
     }
@@ -118,7 +136,7 @@ class XnnpackBackend final
         processed->data(),
         processed->size(),
         executor,
-        &options_.weights_cache(),
+        weights_cache.get(),
         workspace_ptr,
         named_data_map,
         use_weight_cache);
@@ -135,6 +153,14 @@ class XnnpackBackend final
       return err;
     }
 
+    // Publish the cache into the executor so execute() / destroy() can
+    // reach it without going through options_. Held by shared_ptr so
+    // the instance survives until this executor is destroyed even if
+    // every other PTE sharing the same cache has already torn down.
+    if (use_weight_cache) {
+      executor->set_weights_cache(std::move(weights_cache));
+    }
+
     return executor;
   }
 
@@ -146,10 +172,15 @@ class XnnpackBackend final
 
     auto workspace = executor->get_workspace();
 
-    std::unique_lock<std::mutex> lock_weights_cache(
-        options_.weights_cache_mutex(), std::defer_lock);
-    if (executor->uses_weight_cache() || workspace->uses_weight_cache()) {
-      lock_weights_cache.lock();
+    // Per-executor cache lock: serializes concurrent execute() and
+    // destroy() against any other PTE that shares this cache instance.
+    // Different-path PTEs hold different mutexes and proceed in
+    // parallel. The empty-shared_ptr branch covers executors whose
+    // init ran with the weights cache disabled (nothing to lock).
+    auto cache = executor->get_weights_cache();
+    std::unique_lock<std::mutex> lock_weights_cache;
+    if (cache) {
+      lock_weights_cache = std::unique_lock<std::mutex>(cache->mutex());
     }
 
     auto [raii_lock, _] = workspace->acquire();
@@ -176,17 +207,23 @@ class XnnpackBackend final
     if (handle != nullptr) {
       auto executor = static_cast<xnnpack::delegate::XNNExecutor*>(handle);
       auto workspace = executor->get_workspace();
-
-      const std::lock_guard<std::mutex> lock_weights_cache(
-          options_.weights_cache_mutex());
+      auto cache = executor->get_weights_cache();
+
+      // Per-executor cache lock: same semantics as execute(). Keeps a
+      // local shared_ptr so the instance lives through delete_packed_data
+      // even if dropping it from the executor below was the last
+      // outside reference.
+      std::unique_lock<std::mutex> lock_weights_cache;
+      if (cache) {
+        lock_weights_cache = std::unique_lock<std::mutex>(cache->mutex());
+      }
 
 #ifdef ENABLE_XNNPACK_PROFILING
       executor->print_avg_op_timings();
 #endif
 
-      if (executor->uses_weight_cache()) {
-        options_.weights_cache().delete_packed_data(
-            executor->get_packed_data_names());
+      if (cache && executor->uses_weight_cache()) {
+        cache->delete_packed_data(executor->get_packed_data_names());
       }
 
       // This is needed to serialize access to xnn_delete_runtime which is not
@@ -237,7 +274,9 @@ class XnnpackBackend final
   mutable xnnpack::XnnpackBackendOptions options_;
 
   // Lock hierarchy for mutexes:
-  //   options_.weights_cache_mutex()
+  //   weights_cache_manager_.meta_mutex_  (leaf — held only during
+  //                                        get_or_create map ops)
+  //   XNNWeightsCache::instance_mutex_    (one per cache instance)
   //   workspace_meta_mutex_
   //   workspace_mutex_ (owned by executor)
 };
 
@@ -14,6 +14,7 @@
 #include <executorch/runtime/core/memory_allocator.h>
 #include <executorch/runtime/core/result.h>
 #include <executorch/runtime/executor/pte_data_map.h>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -143,17 +144,44 @@ class XNNWeightsCache {
    * When set, reserve_space() allocates from a MAP_SHARED file instead
    * of heap, and finalize_for_runtime() calls msync to make pages clean.
    *
-   * The path MUST be unique per XNNWeightsCache instance — sharing it
-   * across instances (or processes) would mean O_TRUNC corrupts the other
-   * holder's mappings (SIGBUS on access). initialize_for_runtime() takes
-   * an advisory exclusive flock on the file; if the lock fails the mmap
-   * path is disabled for this instance and allocations fall back to heap.
+   * MUST be called BEFORE any other method on this instance, and never
+   * again afterward. Production callers go through XNNWeightsCacheManager,
+   * which sets the path once before publishing the shared_ptr; the
+   * per-instance mutex() does NOT need to be held for this single
+   * pre-publish setter call because no other thread can observe the
+   * instance yet. Tests that construct XNNWeightsCache directly must
+   * still respect the call-once contract.
+   *
+   * Multiple instances pointing at the same path WILL corrupt each
+   * other's state (O_TRUNC → SIGBUS); the manager prevents this by
+   * deduping per-path.
    */
   void set_packed_cache_path(const std::string& path);
 
   /** Save packed weight index so subsequent loads skip packing. */
   Error save_packed_index();
 
+  /**
+   * Returns the per-instance mutex. Callers MUST hold it around every
+   * method call on this XNNWeightsCache (initialize_for_runtime,
+   * finalize_for_runtime, load_unpacked_data, delete_packed_data,
+   * save_packed_index) AND around any XNNPACK callback path that
+   * touches this cache (xnn_create_runtime invokes look_up /
+   * reserve_space / look_up_or_insert during compile). The cache has
+   * no internal synchronization; only this mutex serializes
+   * concurrent use.
+   *
+   * Held by XNNPACKBackend::init from before initialize_for_runtime
+   * through compileModel; by ::execute around the runtime invocation
+   * when the executor uses this cache; by ::destroy around
+   * delete_packed_data; and by XNNWeightsCacheManager::save_all
+   * around each save_packed_index, which first releases the manager's
+   * meta_mutex_ so the two locks are never held together there.
+   */
+  std::mutex& mutex() noexcept {
+    return instance_mutex_;
+  }
+
  private:
   static constexpr uint32_t kCacheMagic = 0x58505743; // "XPWC"
   // Bump when the on-disk layout (footer or per-entry record) changes.
@@ -215,6 +243,12 @@ class XNNWeightsCache {
   // in mmap_regions_, so delete_packed_data() can munmap when ref_count==0.
   std::unordered_map<void*, size_t> file_ptr_to_region_index_;
 
+  // Per-instance lock. Documented contract on mutex() — the cache itself
+  // never touches this field; callers (XNNPACKBackend, manager, tests)
+  // are responsible for acquiring before any other public method.
+  std::mutex instance_mutex_;
+
+
   // Function pointers to override XNNPACK's default xnn_weights_cache_provider
   // functions.
   static size_t look_up(
 
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <executorch/backends/xnnpack/runtime/XNNWeightsCacheManager.h>
+
+#include <executorch/runtime/core/error.h>
+
+#include <utility>
+#include <vector>
+
+namespace executorch::backends::xnnpack {
+
+using executorch::runtime::Error;
+using executorch::runtime::Result;
+
+Result<std::shared_ptr<delegate::XNNWeightsCache>>
+XNNWeightsCacheManager::get_or_create(const std::string& cache_file_path) {
+  // An empty path selects the single shared, heap-only instance. All
+  // empty-path callers (NGTTS sub-runners, FLLM classifier, PLLM methods
+  // when mmap MC is off) then dedupe packed weights against each other
+  // through XNNPACK's in-memory `look_up_or_insert` name match. Handing
+  // every `XnnpackBackend::init` a private instance instead duplicated
+  // each packed weight and regressed heap-only memory by hundreds of MB
+  // on LoRA-multimethod models — see header comment.
+  if (cache_file_path.empty()) {
+    std::scoped_lock<std::mutex> guard(empty_path_mutex_);
+    auto shared = empty_path_cache_.lock();
+    if (!shared) {
+      shared = std::make_shared<delegate::XNNWeightsCache>();
+      empty_path_cache_ = shared;
+    }
+    return shared;
+  }
+
+  std::scoped_lock<std::mutex> guard(meta_mutex_);
+  const auto found = caches_.find(cache_file_path);
+  if (found != caches_.end()) {
+    auto existing = found->second.lock();
+    if (existing) {
+      return existing;
+    }
+    // The tracked instance has expired. The assignment further down
+    // would replace the dead entry regardless; erasing it here just
+    // spells the intent out.
+    caches_.erase(found);
+  }
+
+  // Assign the path before the instance becomes reachable through the
+  // map, so a concurrent caller that locks the weak_ptr always sees a
+  // fully initialized cache. set_packed_cache_path is a plain string
+  // copy (no heavy work, no I/O), so running it under meta_mutex_ is safe.
+  auto fresh = std::make_shared<delegate::XNNWeightsCache>();
+  fresh->set_packed_cache_path(cache_file_path);
+  caches_[cache_file_path] = fresh;
+  return fresh;
+}
+
+Error XNNWeightsCacheManager::save_all() {
+  // Collect strong references to the live caches under meta_mutex_, then
+  // release it before taking any instance mutex: this keeps the two lock
+  // levels apart and lets get_or_create proceed during the save walk.
+  std::vector<std::shared_ptr<delegate::XNNWeightsCache>> snapshot;
+  {
+    std::scoped_lock<std::mutex> guard(meta_mutex_);
+    snapshot.reserve(caches_.size());
+    for (auto entry = caches_.begin(); entry != caches_.end();) {
+      auto strong = entry->second.lock();
+      if (!strong) {
+        entry = caches_.erase(entry); // prune the expired entry
+        continue;
+      }
+      snapshot.push_back(std::move(strong));
+      ++entry;
+    }
+  }
+
+  Error outcome = Error::Ok; // first failure seen; later saves still run
+  for (const auto& instance : snapshot) {
+    std::lock_guard<std::mutex> instance_guard(instance->mutex());
+    const Error status = instance->save_packed_index();
+    if (outcome == Error::Ok && status != Error::Ok) {
+      outcome = status;
+    }
+  }
+  return outcome;
+}
+
+size_t XNNWeightsCacheManager::live_count() const {
+  // Only the path-keyed entries in caches_ are counted; expired
+  // weak_ptrs that have not been pruned yet contribute nothing.
+  std::scoped_lock<std::mutex> guard(meta_mutex_);
+  size_t alive = 0;
+  for (const auto& slot : caches_) {
+    alive += static_cast<size_t>(!slot.second.expired());
+  }
+  return alive;
+}
+
+} // namespace executorch::backends::xnnpack