pytorch · Gasoonjia · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026
diff --git a/backends/aoti/aoti_delegate_handle.h b/backends/aoti/aoti_delegate_handle.h
@@ -84,8 +84,6 @@ struct AOTIDelegateHandle {
   void* so_handle;
   std::string so_path;
   AOTInductorModelContainerHandle container_handle;
-  void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
-                     // dependency
   std::string method_name;
 
   // Function pointers specific to this handle's shared library

diff --git a/backends/aoti/slim/core/storage.h b/backends/aoti/slim/core/storage.h
@@ -127,16 +127,47 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
   /// @param ptr Pointer to device memory to free.
   static void free(void* ptr) {
     // Get the current stream for the current device
+    // Currently all cuda slimtensors should be on the same device same stream,
+    // so we can just use the stream on current device.
+    // TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
+    // support multiple devices.
     auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
-    if (stream_result.ok()) {
-      ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+    ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
+    ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
+  }
+
+  /// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
+  /// @param dst Destination pointer.
+  /// @param src Source pointer.
+  /// @param nbytes Number of bytes to copy.
+  /// @param dst_device Destination device.
+  /// @param src_device Source device.
+  /// @param stream CUDA stream for async copy.
+  static void memcpy_async(
+      void* dst,
+      const void* src,
+      size_t nbytes,
+      const c10::Device& dst_device,
+      const c10::Device& src_device,
+      cudaStream_t stream) {
+    cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;
+
+    if (src_device.is_cpu()) {
+      direction = cudaMemcpyHostToDevice;
+    } else if (dst_device.is_cpu()) {
+      direction = cudaMemcpyDeviceToHost;
     } else {
-      // Fallback to synchronous free if we can't get the stream
-      ET_CUDA_LOG_WARN(cudaFree(ptr));
+      ET_CHECK_MSG(
+          src_device.index() == dst_device.index(),
+          "CUDA memcpy across different device indices not supported: %d != %d",
+          static_cast<int>(src_device.index()),
+          static_cast<int>(dst_device.index()));
     }
+
+    ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
   }
 
-  /// Copies memory between CPU and CUDA or CUDA and CUDA.
+  /// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
   /// @param dst Destination pointer.
   /// @param src Source pointer.
   /// @param nbytes Number of bytes to copy.

diff --git a/backends/cuda/runtime/TARGETS b/backends/cuda/runtime/TARGETS
@@ -75,6 +75,9 @@ runtime.cxx_library(
     srcs = [
         "cuda_backend.cpp",
     ],
+    headers = [
+        "cuda_delegate_handle.h",
+    ],
     # @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
     link_whole = True,
     supports_python_dlopen = True,

diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
@@ -37,6 +37,7 @@
 
 // Include our shim layer headers
 #include <executorch/backends/aoti/aoti_delegate_handle.h>
+#include <executorch/backends/cuda/runtime/cuda_delegate_handle.h>
 #include <executorch/backends/cuda/runtime/platform/platform.h>
 #include <executorch/backends/cuda/runtime/shims/memory.h>
 #include <executorch/backends/cuda/runtime/utils.h>
@@ -77,6 +78,7 @@ using slim::c10::DeviceType;
 namespace {
 constexpr char kSkipCopyOutputToCpuForMethod[] =
     "skip_copy_output_to_cpu_for_method";
+constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
 } // anonymous namespace
 
 class ET_EXPERIMENTAL CudaBackend final
@@ -143,6 +145,33 @@ class ET_EXPERIMENTAL CudaBackend final
     return method_in_csv(method_name, skip_copy_method_);
   }
 
+  // Create the shared CUDA stream. Called when use_shared_cuda_stream option
+  // is set to true. The presence of shared_cuda_stream_ indicates shared mode.
+  void create_shared_cuda_stream() {
+    std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
+    if (shared_cuda_stream_ != nullptr) {
+      return; // Already created
+    }
+    shared_cuda_stream_ = cuda::create_cuda_stream();
+    if (shared_cuda_stream_ == nullptr) {
+      ET_LOG(Error, "Failed to create shared CUDA stream");
+      return;
+    }
+    ET_LOG(Info, "Created shared CUDA stream: %p", *shared_cuda_stream_);
+  }
+
+  // Get the shared CUDA stream. Returns nullptr if not in shared mode.
+  std::shared_ptr<cudaStream_t> get_shared_cuda_stream() const {
+    std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
+    return shared_cuda_stream_;
+  }
+
+  // Check if we're using shared CUDA stream mode.
+  bool is_using_shared_cuda_stream() const {
+    std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
+    return shared_cuda_stream_ != nullptr;
+  }
+
   Error load_function_pointers_into_handle(
       void* so_handle,
       AOTIDelegateHandle* handle) const {
@@ -201,6 +230,15 @@ class ET_EXPERIMENTAL CudaBackend final
               kSkipCopyOutputToCpuForMethod);
           return Error::InvalidArgument;
         }
+      } else if (std::strcmp(option.key, kUseSharedCudaStream) == 0) {
+        if (auto* val = std::get_if<bool>(&option.value)) {
+          if (*val) {
+            create_shared_cuda_stream();
+          }
+        } else {
+          ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream);
+          return Error::InvalidArgument;
+        }
       }
     }
     return Error::Ok;
@@ -282,7 +320,7 @@ class ET_EXPERIMENTAL CudaBackend final
     processed->Free();
 
     // Create handle and load function pointers into it
-    AOTIDelegateHandle* handle = new AOTIDelegateHandle();
+    cuda::CudaDelegateHandle* handle = new cuda::CudaDelegateHandle();
     handle->so_handle = lib_handle;
     handle->so_path = so_path.string();
     handle->method_name = method_name;
@@ -313,10 +351,31 @@ class ET_EXPERIMENTAL CudaBackend final
           handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
       buffer_res->Free();
     }
-    // Create a CUDA stream for asynchronous execution
-    cudaStream_t cuda_stream;
-    ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
-    handle->cuda_stream = static_cast<void*>(cuda_stream);
+
+    // Use shared CUDA stream if enabled via options, otherwise create one.
+    // A shared stream ensures proper ordering across multiple methods
+    // (e.g., encoder, decoder, sampler) when using skip-copy optimization.
+    if (is_using_shared_cuda_stream()) {
+      // Shared stream mode: all handles share the same stream.
+      handle->cuda_stream = get_shared_cuda_stream();
+      ET_LOG(
+          Info,
+          "Using shared CUDA stream %p for method %s",
+          handle->get_cuda_stream(),
+          method_name.c_str());
+    } else {
+      // Per-handle stream mode: each handle owns its own stream.
+      handle->cuda_stream = cuda::create_cuda_stream();
+      if (handle->cuda_stream == nullptr) {
+        delete handle;
+        return Error::Internal;
+      }
+      ET_LOG(
+          Info,
+          "Created new CUDA stream %p for method %s",
+          handle->get_cuda_stream(),
+          method_name.c_str());
+    }
 
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
@@ -326,14 +385,16 @@ class ET_EXPERIMENTAL CudaBackend final
       BackendExecutionContext& context,
       DelegateHandle* handle_,
       Span<EValue*> args) const override {
-    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+    cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;
 
     size_t n_inputs;
     handle->get_num_inputs(handle->container_handle, &n_inputs);
 
     size_t n_outputs;
     handle->get_num_outputs(handle->container_handle, &n_outputs);
 
+    setCurrentCUDAStream(handle->get_cuda_stream(), 0);
+
     ET_CHECK_OR_RETURN_ERROR(
         n_inputs + n_outputs == args.size(),
         InvalidArgument,
@@ -406,13 +467,18 @@ class ET_EXPERIMENTAL CudaBackend final
     // expects ETensor* as input/output. We avoid changing its signature since
     // it's shared with the Metal backend. Instead, we reinterpret_cast
     // SlimTensor* to Tensor*
+    //
+    // Get current CUDA stream and pass it to AOTInductorModelContainerRun
+    Result<cudaStream_t> cuda_stream_ret = getCurrentCUDAStream(0);
+    cudaStream_t cuda_stream = cuda_stream_ret.get();
+    ET_CHECK_OK_OR_RETURN_ERROR(cuda_stream_ret.error());
     AOTIRuntimeError error = handle->run(
         handle->container_handle,
         reinterpret_cast<Tensor**>(gpu_inputs.data()),
         n_inputs,
         reinterpret_cast<Tensor**>(gpu_outputs.data()),
         n_outputs,
-        handle->cuda_stream,
+        static_cast<void*>(cuda_stream),
         nullptr);
 
     ET_CHECK_OR_RETURN_ERROR(
@@ -423,22 +489,12 @@ class ET_EXPERIMENTAL CudaBackend final
 
     const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);
 
-    // Synchronize CUDA stream to ensure kernel execution is complete
-    // before accessing output data (either for copy or skip-copy path)
-    cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
-    cudaError_t sync_err = cudaStreamSynchronize(cuda_stream);
-    ET_CHECK_OR_RETURN_ERROR(
-        sync_err == cudaSuccess,
-        Internal,
-        "cudaStreamSynchronize failed: %s",
-        cudaGetErrorString(sync_err));
-
     if (copy_outputs) {
-      // Deep copy GPU SlimTensor results back to CPU ETensors
       for (size_t i = 0; i < n_outputs; i++) {
         auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
         ET_CHECK_OK_OR_RETURN_ERROR(
-            copy_slimtensor_to_etensor(gpu_outputs[i], cpu_output_tensor),
+            copy_slimtensor_to_etensor_async(
+                gpu_outputs[i], cpu_output_tensor, cuda_stream),
             "Failed to copy GPU output %zu back to CPU ETensor",
             i);
       }
@@ -483,7 +539,7 @@ class ET_EXPERIMENTAL CudaBackend final
     if (handle_ == nullptr) {
       return;
     }
-    AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
+    cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;
 
     // Clean up cached output tensors for this handle
     {
@@ -495,16 +551,10 @@ class ET_EXPERIMENTAL CudaBackend final
       }
     }
 
-    // Destroy the CUDA stream if it exists
-    if (handle->cuda_stream != nullptr) {
-      cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
-      cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
-      ET_CHECK_OR_LOG_ERROR(
-          stream_err == cudaSuccess,
-          "Failed to destroy CUDA stream: %s",
-          cudaGetErrorString(stream_err));
-      handle->cuda_stream = nullptr;
-    }
+    // The CUDA stream is managed by shared_ptr in the handle.
+    // It will be automatically destroyed when the last handle using it
+    // is destroyed. Just reset our reference.
+    handle->cuda_stream.reset();
 
     // NOTE: AOTInductorModelContainerDelete does not work correctly with
     // multiple .so files. Deleting one container frees shared resources,
@@ -541,13 +591,22 @@ class ET_EXPERIMENTAL CudaBackend final
   mutable std::mutex skip_copy_method_mutex_;
   std::string skip_copy_method_;
 
+  // Shared CUDA stream for all methods. When set (non-null), all methods use
+  // the same stream to ensure proper ordering (critical for skip-copy
+  // optimization). Created when use_shared_cuda_stream option is set to true.
+  // Managed via shared_ptr so it's automatically cleaned up when last handle
+  // is destroyed.
+  mutable std::mutex cuda_stream_mutex_;
+  std::shared_ptr<cudaStream_t> shared_cuda_stream_ = nullptr;
+
   // Cached output tensors for skip-copy optimization.
   // When skip-copy is enabled, output SlimTensors are cached here to keep
   // the underlying GPU memory alive while the caller processes the results.
-  // Maps each AOTIDelegateHandle* to its vector of cached output tensors.
+  // Maps each CudaDelegateHandle* to its vector of cached output tensors.
   mutable std::mutex cached_outputs_mutex_;
-  mutable std::unordered_map<AOTIDelegateHandle*, std::vector<SlimTensor*>>
-      cached_outputs_;
+  mutable std::
+      unordered_map<cuda::CudaDelegateHandle*, std::vector<SlimTensor*>>
+          cached_outputs_;
 };
 
 } // namespace executorch::backends::cuda

diff --git a/backends/cuda/runtime/cuda_delegate_handle.h b/backends/cuda/runtime/cuda_delegate_handle.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <cuda_runtime.h>
+#include <executorch/backends/aoti/aoti_delegate_handle.h>
+#include <memory>
+
+namespace executorch {
+namespace backends {
+namespace cuda {
+
+// Shared CUDA stream wrapper with proper RAII cleanup.
+// This ensures the stream is destroyed when all handles using it are destroyed.
+struct CudaStreamDeleter {
+  void operator()(cudaStream_t* stream) const {
+    if (stream != nullptr && *stream != nullptr) {
+      cudaStreamDestroy(*stream);
+    }
+    delete stream;
+  }
+};
+
+// Creates a new shared CUDA stream.
+// Returns nullptr on failure.
+inline std::shared_ptr<cudaStream_t> create_cuda_stream() {
+  cudaStream_t stream;
+  cudaError_t err = cudaStreamCreate(&stream);
+  if (err != cudaSuccess) {
+    return nullptr;
+  }
+  return std::shared_ptr<cudaStream_t>(
+      new cudaStream_t(stream), CudaStreamDeleter());
+}
+// CUDA-specific delegate handle that extends AOTIDelegateHandle.
+// This consolidates CUDA stream management into a single location.
+struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
+  // CUDA stream for this handle, support both shared mode and single mode.
+  // In shared mode, all cuda delegate handles share the same stream (e.g., for
+  // skip-copy optimization), they will all hold a reference to the same
+  // shared_ptr. The stream is automatically destroyed when the last handle is
+  // destroyed. In single mode, every cuda delegate handle has its own stream.
+  std::shared_ptr<cudaStream_t> cuda_stream;
+
+  // Get the raw CUDA stream pointer for use in CUDA API calls.
+  // Returns nullptr if no stream is set.
+  cudaStream_t get_cuda_stream() const {
+    return cuda_stream ? *cuda_stream : nullptr;
+  }
+
+  // Check if this handle has a valid CUDA stream.
+  bool has_cuda_stream() const {
+    return cuda_stream != nullptr && *cuda_stream != nullptr;
+  }
+};
+
+} // namespace cuda
+} // namespace backends
+} // namespace executorch