From 586cce1cb3329f2740559e3a90d7120885ca7ece Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 15 Apr 2026 10:05:55 -0700
Subject: [PATCH 1/2] [ET-VK] Fix use-after-free in PrepackNode when TensorRefs
 are shared
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a model has shared/tied weights (e.g. tied embeddings in transformers), the serialization deduplicates them into a single TensorRef that multiple PrepackNodes reference. Previously, `PrepackNode::create_staging_buffer()` called `tref->free_buffer()` unconditionally after copying weight data to a GPU staging buffer. This meant the first PrepackNode to execute would free the underlying host memory, and subsequent PrepackNodes sharing the same TensorRef would read from a dangling pointer — producing garbage/NaN values in prepacked weight and bias tensors on the GPU.

The fix adds a `prepack_use_count` field to `TensorRef` that tracks how many PrepackNodes still need to read from it. Each PrepackNode increments the count in its constructor and decrements it after copying data. The buffer is only freed when the count reaches zero. This preserves the original eager-free behavior for non-shared weights (freeing immediately after the single consumer copies) while correctly deferring the free for shared weights until the last consumer is done — avoiding both the use-after-free and unnecessary peak memory increase.

Differential Revision: [D101009402](https://our.internmc.facebook.com/intern/diff/D101009402/)

[ghstack-poisoned]
---
 backends/vulkan/runtime/graph/ComputeGraph.cpp      |  2 --
 backends/vulkan/runtime/graph/containers/Constant.h |  8 ++++++--
 backends/vulkan/runtime/graph/ops/PrepackNode.cpp   | 10 +++++++---
 3 files changed, 13 insertions(+), 7 deletions(-)
diff --git a/backends/vulkan/runtime/graph/ComputeGraph.cpp b/backends/vulkan/runtime/graph/ComputeGraph.cpp
index b14d0f6ab0b..f0b61e128bb 100644
--- a/backends/vulkan/runtime/graph/ComputeGraph.cpp
+++ b/backends/vulkan/runtime/graph/ComputeGraph.cpp
@@ -1135,8 +1135,6 @@ void ComputeGraph::prepack() {
   int i = 0;
   bool submitted = false;
   const bool reduce_peak_memory = total_constant_nbytes_ > 10 * MB;
-  // int count = 0;
-
   context_->set_cmd();
   for (std::unique_ptr<PrepackNode>& node : prepack_nodes_) {
     // Do not trigger on the first or last prepack node.
diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h
index a18c284a219..2b8c134a04f 100644
--- a/backends/vulkan/runtime/graph/containers/Constant.h
+++ b/backends/vulkan/runtime/graph/containers/Constant.h
@@ -29,6 +29,12 @@ struct TensorRef final {
   // This will be empty (default constructed) for the raw pointer constructor
   executorch::runtime::FreeableBuffer buffer;
 
+  // Number of PrepackNodes that still need to read from this TensorRef. When
+  // this reaches 0, the buffer can be safely freed. This prevents
+  // use-after-free when multiple PrepackNodes reference the same TensorRef
+  // (e.g. shared/tied weights).
+  uint32_t prepack_use_count{0};
+
   explicit TensorRef(
       const std::vector<int64_t>& t_sizes,
       vkapi::ScalarType t_dtype,
@@ -44,8 +50,6 @@ struct TensorRef final {
     return utils::multiply_integers(sizes) * vkapi::element_size(dtype);
   }
 
-  // Manually free the buffer if needed (though it will be freed automatically
-  // on destruction)
   void free_buffer() {
     buffer.Free();
   }
diff --git a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
index bb21f4b7c2b..9d89a45e168 100644
--- a/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
+++ b/backends/vulkan/runtime/graph/ops/PrepackNode.cpp
@@ -44,6 +44,9 @@ PrepackNode::PrepackNode(
       push_constants_(push_constants) {
   graph.update_descriptor_counts(shader, /*execute = */ false);
   graph.update_descriptor_counts(noop_shader_, /*execute = */ false);
+  if (!graph.val_is_none(tref)) {
+    graph.get_tref(tref)->prepack_use_count++;
+  }
 }
 
 api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
@@ -100,9 +103,10 @@ api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
     }
   }
 
-  // Once the staging buffer is copied, if the TensorRef owns a FreeableBuffer,
-  // it can be freed.
-  tref->free_buffer();
+  if (--tref->prepack_use_count == 0) {
+    tref->free_buffer();
+  }
+
   return staging;
 }
 

From 5fd2bae5f9b7e2ef73f45fc484779a1ddb47a4fc Mon Sep 17 00:00:00 2001
From: ssjia <ssjia@devvm26340.ftw0.facebook.com>
Date: Wed, 15 Apr 2026 10:09:12 -0700
Subject: [PATCH 2/2] Update on "[ET-VK] Fix use-after-free in PrepackNode when
 TensorRefs are shared"
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When a model has shared/tied weights (e.g. tied embeddings in transformers), the serialization deduplicates them into a single TensorRef that multiple PrepackNodes reference. Previously, `PrepackNode::create_staging_buffer()` called `tref->free_buffer()` unconditionally after copying weight data to a GPU staging buffer. This meant the first PrepackNode to execute would free the underlying host memory, and subsequent PrepackNodes sharing the same TensorRef would read from a dangling pointer — producing garbage/NaN values in prepacked weight and bias tensors on the GPU.

The fix adds a `prepack_use_count` field to `TensorRef` that tracks how many PrepackNodes still need to read from it. Each PrepackNode increments the count in its constructor and decrements it after copying data. The buffer is only freed when the count reaches zero. This preserves the original eager-free behavior for non-shared weights (freeing immediately after the single consumer copies) while correctly deferring the free for shared weights until the last consumer is done — avoiding both the use-after-free and unnecessary peak memory increase.

Differential Revision: [D101009402](https://our.internmc.facebook.com/intern/diff/D101009402/)

[ghstack-poisoned]
---
 backends/vulkan/runtime/graph/containers/Constant.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/backends/vulkan/runtime/graph/containers/Constant.h b/backends/vulkan/runtime/graph/containers/Constant.h
index 2b8c134a04f..690a25dd9c8 100644
--- a/backends/vulkan/runtime/graph/containers/Constant.h
+++ b/backends/vulkan/runtime/graph/containers/Constant.h
@@ -33,7 +33,7 @@ struct TensorRef final {
   // this reaches 0, the buffer can be safely freed. This prevents
   // use-after-free when multiple PrepackNodes reference the same TensorRef
   // (e.g. shared/tied weights).
-  uint32_t prepack_use_count{0};
+  int32_t prepack_use_count{0};
 
   explicit TensorRef(
       const std::vector<int64_t>& t_sizes,