pytorch
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
Lines changed: 132 additions & 206 deletions
diff --git a/backends/cuda/runtime/utils.h b/backends/cuda/runtime/utils.h
Lines changed: 80 additions & 21 deletions
diff --git a/backends/cuda/tests/test_cuda_export.py b/backends/cuda/tests/test_cuda_export.py
Lines changed: 70 additions & 36 deletions
diff --git a/examples/models/gemma4_31b/export.py b/examples/models/gemma4_31b/export.py
Lines changed: 0 additions & 1 deletion
diff --git a/examples/models/gemma4_31b/main.cpp b/examples/models/gemma4_31b/main.cpp
Lines changed: 1 addition & 2 deletions
diff --git a/examples/models/gemma4_31b/model.md b/examples/models/gemma4_31b/model.md
Lines changed: 7 additions & 5 deletions
diff --git a/examples/models/gemma4_31b/model.py b/examples/models/gemma4_31b/model.py
Lines changed: 6 additions & 2 deletions
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
Lines changed: 11 additions & 8 deletions
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
Lines changed: 1 addition & 4 deletions
@@ -147,11 +147,13 @@ inline void _strided_copy(
 }
 
 // Copy data from SlimTensor to ETensor, rearranging if strides differ.
-// When stream is non-null, GPU copies use that stream (async fast path).
-// When stream is null, GPU copies are synchronous.
+// dst_device selects the destination memory space (CPU for D2H, a CUDA device
+// for D2D). When stream is non-null, GPU copies use that stream (async fast
+// path). When stream is null, GPU copies are synchronous.
 inline executorch::runtime::Error _copy_slimtensor_to_etensor_impl(
     const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
     executorch::runtime::etensor::Tensor* etensor,
+    const executorch::backends::aoti::slim::c10::Device& dst_device,
     cudaStream_t stream) {
   ET_CHECK_OK_OR_RETURN_ERROR(_check_tensor_metadata(slim_tensor, etensor));
 
@@ -165,7 +167,7 @@ inline executorch::runtime::Error _copy_slimtensor_to_etensor_impl(
 
   if (_strides_match(slim_tensor, etensor)) {
     // Fast path: strides match, raw byte copy
-    if (slim_tensor->is_cpu()) {
+    if (slim_tensor->is_cpu() && dst_device.is_cpu()) {
       std::memcpy(dst_data, src_data, nbytes);
     } else if (stream) {
       executorch::backends::aoti::slim::DeviceTraits<
@@ -174,23 +176,19 @@ inline executorch::runtime::Error _copy_slimtensor_to_etensor_impl(
               dst_data,
               src_data,
               nbytes,
-              executorch::backends::aoti::slim::CPU_DEVICE,
+              dst_device,
               slim_tensor->device(),
               stream);
     } else {
       executorch::backends::aoti::slim::DeviceTraits<
           executorch::backends::aoti::slim::c10::DeviceType::CUDA>::
-          memcpy(
-              dst_data,
-              src_data,
-              nbytes,
-              executorch::backends::aoti::slim::CPU_DEVICE,
-              slim_tensor->device());
+          memcpy(dst_data, src_data, nbytes, dst_device, slim_tensor->device());
     }
   } else {
     // Slow path: strides differ (e.g., AOTI delegate output layout differs
-    // from .pte's dim_order). Copy to a temp CPU buffer, then rearrange
-    // element-by-element to match the ETensor's expected layout.
+    // from .pte's dim_order). Copy to a temp CPU buffer, rearrange
+    // element-by-element to match the ETensor's expected layout, then move the
+    // result to the destination (CPU stays in place; GPU gets an H2D copy).
     std::vector<char> tmp(nbytes);
     if (slim_tensor->is_cpu()) {
       std::memcpy(tmp.data(), src_data, nbytes);
@@ -218,13 +216,38 @@ inline executorch::runtime::Error _copy_slimtensor_to_etensor_impl(
 
     size_t elem_size = executorch::backends::aoti::slim::c10::elementSize(
         slim_tensor->dtype());
-    _strided_copy(
-        dst_data,
-        tmp.data(),
-        elem_size,
-        sizes_vec,
-        src_strides_vec,
-        dst_strides_vec);
+
+    if (dst_device.is_cpu()) {
+      _strided_copy(
+          dst_data,
+          tmp.data(),
+          elem_size,
+          sizes_vec,
+          src_strides_vec,
+          dst_strides_vec);
+    } else {
+      // Rearrange into a CPU staging buffer, then copy to the GPU destination.
+      std::vector<char> rearranged(nbytes);
+      _strided_copy(
+          rearranged.data(),
+          tmp.data(),
+          elem_size,
+          sizes_vec,
+          src_strides_vec,
+          dst_strides_vec);
+      if (stream) {
+        ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpyAsync(
+            dst_data,
+            rearranged.data(),
+            nbytes,
+            cudaMemcpyHostToDevice,
+            stream));
+        ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamSynchronize(stream));
+      } else {
+        ET_CUDA_CHECK_OR_RETURN_ERROR(cudaMemcpy(
+            dst_data, rearranged.data(), nbytes, cudaMemcpyHostToDevice));
+      }
+    }
   }
 
   return executorch::runtime::Error::Ok;
@@ -251,7 +274,39 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
     const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
     executorch::runtime::etensor::Tensor* etensor,
     cudaStream_t stream) {
-  return _copy_slimtensor_to_etensor_impl(slim_tensor, etensor, stream);
+  return _copy_slimtensor_to_etensor_impl(
+      slim_tensor,
+      etensor,
+      executorch::backends::aoti::slim::CPU_DEVICE,
+      stream);
+}
+
+/**
+ * Copies data from a SlimTensor to a GPU-resident ETensor asynchronously
+ * (device-to-device).
+ *
+ * Used when the destination ETensor's storage lives in a planned GPU arena.
+ * The destination device is taken from the source SlimTensor, so this only
+ * supports same-device D2D copies (source and destination on the same GPU).
+ *
+ * When strides match (common case), performs a fast async D2D copy on the
+ * provided stream. When strides differ, falls back to a staged copy with
+ * element-by-element rearrangement on the host; the final host-to-device
+ * upload of that fallback synchronizes the stream when one is supplied.
+ *
+ * NOTE: In the fast path the copy is asynchronous. The caller must synchronize
+ * the stream before consuming the ETensor data.
+ *
+ * NOTE(review): because the destination device mirrors the source, a
+ * CPU-resident SlimTensor makes the impl treat the destination as host memory
+ * (plain memcpy in the fast path). Confirm callers only pass CUDA-resident
+ * sources.
+ *
+ * @param slim_tensor Pointer to the source SlimTensor. A null pointer is
+ *        rejected with Error::InvalidArgument.
+ * @param etensor Pointer to the destination GPU ETensor (must not be null).
+ * @param stream The CUDA stream to use for async copy.
+ * @return Error::Ok on success, Error::InvalidArgument if slim_tensor is null,
+ *         or another error code if the copy itself fails.
+ */
+inline executorch::runtime::Error copy_slimtensor_to_device_etensor_async(
+    const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
+    executorch::runtime::etensor::Tensor* etensor,
+    cudaStream_t stream) {
+  // slim_tensor->device() is evaluated while building the argument list, i.e.
+  // before the impl can validate anything, so the null guard must live here.
+  if (slim_tensor == nullptr) {
+    return executorch::runtime::Error::InvalidArgument;
+  }
+  return _copy_slimtensor_to_etensor_impl(
+      slim_tensor, etensor, slim_tensor->device(), stream);
 }
 
 /**
@@ -267,7 +322,11 @@ inline executorch::runtime::Error copy_slimtensor_to_etensor_async(
 inline executorch::runtime::Error copy_slimtensor_to_etensor(
     const executorch::backends::aoti::slim::SlimTensor* slim_tensor,
     executorch::runtime::etensor::Tensor* etensor) {
-  return _copy_slimtensor_to_etensor_impl(slim_tensor, etensor, nullptr);
+  return _copy_slimtensor_to_etensor_impl(
+      slim_tensor,
+      etensor,
+      executorch::backends::aoti::slim::CPU_DEVICE, // destination is host memory
+      nullptr); // null stream: GPU copies are synchronous
 }
 
 /**
 
@@ -328,18 +328,23 @@ def test_triton_kernel_mode_off(self):
 
     def test_device_info_propagated_to_cuda_delegate_outputs(self):
         """
-        Test that device info is correctly propagated from export to serialization
-        for CUDA delegate outputs.
-
-        This verifies the device propagation flow:
-        1. CudaPartitioner adds target_device="cuda:0" CompileSpec
-        2. PropagateDevicePass sets TensorSpec.device = CUDA for delegate outputs
-        3. Emitter serializes device info into ExtraTensorInfo.device_type
-        4. Serialized tensors have device_type = DeviceType.CUDA
-
-        Note: At this stage, the tensor memory is still on CPU. The CUDA backend
-        will copy data to GPU device at runtime. Device info tagging is the first
-        step toward full device-aware memory allocation.
+        Verify that, for a CUDA-delegated graph, every memory-planned tensor's
+        actual planned memory location matches its device_type tag.
+
+        With device memory planning (the default), the flow is:
+        1. CudaPartitioner adds target_device="cuda:0" CompileSpec.
+        2. PropagateDevicePass tags delegate IO TensorSpecs as CUDA and inserts
+           et_copy._h2d_copy / _d2h_copy ops at the delegate boundary, so the
+           method inputs/outputs stay on CPU while the delegate IO is CUDA.
+        3. Device-aware memory planning allocates each non-CPU tensor into a CUDA
+           buffer, recorded in ExecutionPlan.non_const_buffer_device.
+        4. The emitter serializes device info into ExtraTensorInfo.device_type.
+
+        The core check: for each planned tensor, the device of the buffer it is
+        allocated into (non_const_buffer_device) must agree with the tensor's
+        own device_type. A CUDA-tagged tensor planned into a CPU buffer (or vice
+        versa) means planning and device tagging disagree about where the
+        tensor's real memory lives.
         """
 
         class AddModule(torch.nn.Module):
@@ -354,7 +359,8 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         edge_program_manager = self._export_to_cuda_with_lower(module, inputs)
         self.assertIsNotNone(edge_program_manager, "CUDA export failed")
 
-        # Convert to ExecuTorch and access the serialized program
+        # Convert to ExecuTorch and access the serialized program. The default
+        # config enables device memory planning, so delegate IO is GPU-resident.
         et_prog = edge_program_manager.to_executorch()
         program = et_prog._emitter_output.program
 
@@ -366,32 +372,60 @@ def forward(self, x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
             "Expected at least one delegate in the execution plan",
         )
 
-        # Count tensors by device type
-        cpu_tensors = []
-        cuda_tensors = []
-
+        # Build buffer_idx -> device map from the per-buffer device mapping.
+        # Buffers without an entry default to CPU.
+        buffer_device: dict[int, schema.DeviceType] = {}
+        for entry in plan.non_const_buffer_device or []:
+            buffer_device[entry.buffer_idx] = entry.device_type
+
+        def tensor_device(t: schema.Tensor) -> schema.DeviceType:
+            if t.extra_tensor_info is not None:
+                return t.extra_tensor_info.device_type
+            return schema.DeviceType.CPU
+
+        # Walk every memory-planned tensor in the graph and assert its declared
+        # device_type matches the device of the buffer it lives in.
+        cuda_planned = 0
+        cpu_planned = 0
         for value in plan.values:
-            if isinstance(value.val, schema.Tensor):
-                tensor = value.val
-                if (
-                    tensor.extra_tensor_info is not None
-                    and tensor.extra_tensor_info.device_type == schema.DeviceType.CUDA
-                ):
-                    cuda_tensors.append(tensor)
-                else:
-                    # Either no extra_tensor_info or device_type is CPU (default)
-                    cpu_tensors.append(tensor)
-
-        # Both input and output tensors should be on CUDA device for now.
+            if not isinstance(value.val, schema.Tensor):
+                continue
+            tensor = value.val
+            # Only memory-planned (non-constant) tensors have allocation_info;
+            # their memory_id indexes into the non_const buffers.
+            if tensor.allocation_info is None:
+                continue
+
+            declared = tensor_device(tensor)
+            mem_id = tensor.allocation_info.memory_id
+            planned = buffer_device.get(mem_id, schema.DeviceType.CPU)
+
+            self.assertEqual(
+                planned,
+                declared,
+                f"Tensor planned into buffer {mem_id} has device_type="
+                f"{declared.name} but the buffer is allocated on "
+                f"{planned.name}; planned memory location and device tag "
+                f"must agree.",
+            )
+            if declared == schema.DeviceType.CUDA:
+                cuda_planned += 1
+            else:
+                cpu_planned += 1
+
+        # AddModule has 2 inputs + 1 output. With device memory planning the
+        # delegate IO is CUDA-resident (2 h2d copies + 1 delegate output) and
+        # the host-side method inputs/outputs stay on CPU (2 inputs + 1 d2h
+        # output), giving exactly 3 CUDA- and 3 CPU-resident planned tensors.
         self.assertEqual(
-            len(cpu_tensors),
-            0,
-            f"Expected no CPU tensors: method inputs/outputs should be tagged "
-            f"CUDA, but found {len(cpu_tensors)}",
+            cuda_planned,
+            3,
+            f"Expected exactly 3 CUDA-resident planned tensors (2 h2d copies + "
+            f"1 delegate output), but found {cuda_planned}.",
         )
         self.assertEqual(
-            len(cuda_tensors),
+            cpu_planned,
             3,
-            f"Expected 3 CUDA tensors (2 method inputs + 1 method output), "
-            f"but found {len(cuda_tensors)}",
+            f"Expected exactly 3 CPU-resident planned tensors (2 method inputs "
+            f"+ 1 d2h output), but found {cpu_planned}.",
         )
@@ -268,7 +268,6 @@ def _export_cuda(
             do_quant_fusion_and_const_prop=True,
             memory_planning_pass=MemoryPlanningPass(
                 alloc_graph_input=False,
-                share_mutable_buffers=True,
             ),
             emit_mutable_buffer_names=True,
         ),
 
@@ -158,8 +158,7 @@ int main(int argc, char** argv) {
       Module::LoadMode::MmapUseMlockIgnoreErrors,
       /*event_tracer=*/nullptr,
       /*memory_allocator=*/nullptr,
-      /*temp_allocator=*/nullptr,
-      /*share_memory_arenas=*/true);
+      /*temp_allocator=*/nullptr);
 
   // Get metadata
   auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());
 
@@ -109,11 +109,13 @@ Decoder norms per layer: `input_layernorm`, `post_attention_layernorm`,
 | `decode`  | tokens `(1, 1)` + input_pos `(1,)` + temperature `(1,)`    | `(1, 1)` float   |
 | `prefill` | tokens `(1, T)` + input_pos `(T,)` + temperature `(1,)`, T∈[5, min(max_seq_len-1, 2×sliding_window)] | `(1, 1)` float   |
 
-Both methods share the same KV-cache buffers via
-`MemoryPlanningPass(share_mutable_buffers=True)` and
-`emit_mutable_buffer_names=True`. The exported program performs Gumbel-max
-sampling on-device and returns a single token ID per call so the C++ runner
-only has to feed tokens.
+Both methods share the same KV-cache buffers. On the CUDA/AOTI backend the
+stateful buffers are lifted into the delegate as constants and shared across
+`decode`/`prefill` at runtime via the backend's per-FQN buffer cache, so the
+CUDA export leaves `share_mutable_buffers` off (other backends, e.g. MLX, instead
+share graph-level buffers via `share_mutable_buffers`). The exported program
+performs Gumbel-max sampling on-device and returns a single token ID per call so
+the C++ runner only has to feed tokens.
 
 ### MLX (`--backend mlx`)
 
 
@@ -8,8 +8,12 @@
 Gemma 4 31B-IT — export-friendly reference implementation for ExecuTorch.
 
 Model definition designed for torch.export(strict=True) with the CUDA backend.
-All stateful buffers (KV cache, RoPE inv_freq) are registered buffers so they
-are captured by share_mutable_buffers across prefill/decode. The numerically
+All stateful buffers (KV cache, RoPE inv_freq) are registered buffers with
+in-place updates. On the CUDA/AOTI backend they are lifted into the delegate as
+constants and shared across prefill/decode at runtime via the backend's per-FQN
+buffer cache (so the CUDA export leaves share_mutable_buffers off); backends that
+keep these buffers at the graph level (e.g. MLX) instead share them via
+share_mutable_buffers. The numerically
 sensitive primitives — RMSNorm, GELU-tanh MLP, proportional/full RoPE, and
 the BHSD KV cache — are imported from ``examples.models.gemma4.text_decoder``
 so the 31B and E2B/E4B paths share them.
 
@@ -623,8 +623,10 @@ def _materialize_buffers(model, config):
 
     Replaces meta buffers with real tensors on CPU, recomputes RoPE
     inv_freq and causal masks. State buffers (KV cache, conv/recurrent
-    state) are zero-initialized registered buffers that will be shared
-    across methods via share_mutable_buffers.
+    state) are zero-initialized registered buffers. On the CUDA/AOTI backend
+    they are lifted into the delegate as constants and shared across methods at
+    runtime via the backend's per-FQN buffer cache; backends that keep them at
+    the graph level instead share them via share_mutable_buffers.
     """
     # Masks stay bool, inv_freq stays float32.
     for fqn, buf in list(model.named_buffers()):
@@ -922,8 +924,12 @@ def _export_cuda(model, config, args):
         via fused_moe_batched_gemm, with dynamic sequence length.
 
     Both methods share mutable state buffers (KV cache, conv_state,
-    recurrent_state) via share_mutable_buffers=True. The model uses
-    registered buffers with in-place updates — no state in/out args.
+    recurrent_state): the model uses registered buffers with in-place
+    updates (no state in/out args). On the CUDA/AOTI backend these buffers
+    are lifted into the delegate as constants and shared across the
+    decode/prefill methods at runtime via the backend's per-FQN buffer cache
+    (share_mutable_buffers is left off for CUDA); backends that keep them at
+    the graph level instead share them via share_mutable_buffers.
     """
     import torch._inductor.config as inductor_config
 
@@ -1031,10 +1037,7 @@ def _export_cuda(model, config, args):
         config=ExecutorchBackendConfig(
             extract_delegate_segments=True,
             do_quant_fusion_and_const_prop=True,
-            memory_planning_pass=MemoryPlanningPass(
-                alloc_graph_input=False,
-                share_mutable_buffers=True,
-            ),
+            memory_planning_pass=MemoryPlanningPass(alloc_graph_input=False),
             emit_mutable_buffer_names=True,
         ),
     )
 
@@ -144,8 +144,6 @@ int main(int argc, char** argv) {
 
   stats.model_load_start_ms = llm::time_in_ms();
 
-  // Create Module with share_memory_arenas=true so prefill and decode
-  // share mutable buffers (KV cache, conv_state, recurrent_state).
   std::vector<std::string> data_files;
   if (!FLAGS_data_path.empty()) {
     data_files.push_back(FLAGS_data_path);
@@ -156,8 +154,7 @@ int main(int argc, char** argv) {
       Module::LoadMode::File,
       /*event_tracer=*/nullptr,
       /*memory_allocator=*/nullptr,
-      /*temp_allocator=*/nullptr,
-      /*share_memory_arenas=*/true);
+      /*temp_allocator=*/nullptr);
 
   // Get metadata
   auto metadata_result = llm::get_llm_metadata(tokenizer.get(), module.get());