Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
0b42608
Remove unnecessary cuda sync for better perf
Gasoonjia Feb 9, 2026
0014dcb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 9, 2026
75c0995
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
28a0792
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
3b38ed7
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
1d27edd
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
f4672cd
Update base for Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 10, 2026
3a29c04
Qualcomm AI Engine Direct - Optimize UT execution time re-submit (#17…
chenweng-quic Feb 10, 2026
a5c5f70
fix to allocate samsung device issue
Jiseong-oh Feb 10, 2026
5cd3504
Add death test matchers to verify assertion messages (#16543)
rascani Feb 10, 2026
b7e063a
Validate dim_order is a permutation in dim_order_to_stride (#17314)
rascani Feb 10, 2026
23de893
[ET-VK] Fix missing memory barrier for first-use writes on aliased te…
Feb 10, 2026
aed50d1
consolidate cuda stream
Gasoonjia Feb 11, 2026
bddcb88
reformat
Gasoonjia Feb 11, 2026
6916192
rename for better clarification
Gasoonjia Feb 11, 2026
7d06ab5
rebase to latest main
Gasoonjia Feb 11, 2026
55cb555
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 11, 2026
8dcbc3e
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 11, 2026
69be4bb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 12, 2026
7d83c14
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 12, 2026
75acabb
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 19, 2026
d055d95
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 19, 2026
c5f5ca4
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 19, 2026
0eaea12
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 19, 2026
7e62b56
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 20, 2026
e76fba9
Update on "Remove unnecessary cuda sync for better perf"
Gasoonjia Feb 20, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions backends/aoti/aoti_delegate_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ struct AOTIDelegateHandle {
void* so_handle;
std::string so_path;
AOTInductorModelContainerHandle container_handle;
void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
// dependency
std::string method_name;

// Function pointers specific to this handle's shared library
Expand Down
41 changes: 36 additions & 5 deletions backends/aoti/slim/core/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,16 +127,47 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
/// @param ptr Pointer to device memory to free.
static void free(void* ptr) {
// Get the current stream for the current device
// Currently all cuda slimtensors should be on the same device same stream,
// so we can just use the stream on current device.
// TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
// support multiple devices.
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
if (stream_result.ok()) {
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
}

/// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
/// @param dst_device Destination device.
/// @param src_device Source device.
/// @param stream CUDA stream for async copy.
static void memcpy_async(
void* dst,
const void* src,
size_t nbytes,
const c10::Device& dst_device,
const c10::Device& src_device,
cudaStream_t stream) {
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;

if (src_device.is_cpu()) {
direction = cudaMemcpyHostToDevice;
} else if (dst_device.is_cpu()) {
direction = cudaMemcpyDeviceToHost;
} else {
// Fallback to synchronous free if we can't get the stream
ET_CUDA_LOG_WARN(cudaFree(ptr));
ET_CHECK_MSG(
src_device.index() == dst_device.index(),
"CUDA memcpy across different device indices not supported: %d != %d",
static_cast<int>(src_device.index()),
static_cast<int>(dst_device.index()));
}

ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
}

/// Copies memory between CPU and CUDA or CUDA and CUDA.
/// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
Expand Down
3 changes: 3 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ runtime.cxx_library(
srcs = [
"cuda_backend.cpp",
],
headers = [
"cuda_delegate_handle.h",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
Expand Down
125 changes: 92 additions & 33 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

// Include our shim layer headers
#include <executorch/backends/aoti/aoti_delegate_handle.h>
#include <executorch/backends/cuda/runtime/cuda_delegate_handle.h>
#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/utils.h>
Expand Down Expand Up @@ -77,6 +78,7 @@ using slim::c10::DeviceType;
namespace {
constexpr char kSkipCopyOutputToCpuForMethod[] =
"skip_copy_output_to_cpu_for_method";
constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
} // anonymous namespace

class ET_EXPERIMENTAL CudaBackend final
Expand Down Expand Up @@ -143,6 +145,33 @@ class ET_EXPERIMENTAL CudaBackend final
return method_in_csv(method_name, skip_copy_method_);
}

// Create the shared CUDA stream. Called when use_shared_cuda_stream option
// is set to true. The presence of shared_cuda_stream_ indicates shared mode.
void create_shared_cuda_stream() {
std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
if (shared_cuda_stream_ != nullptr) {
return; // Already created
}
shared_cuda_stream_ = cuda::create_cuda_stream();
if (shared_cuda_stream_ == nullptr) {
ET_LOG(Error, "Failed to create shared CUDA stream");
return;
}
ET_LOG(Info, "Created shared CUDA stream: %p", *shared_cuda_stream_);
}

// Get the shared CUDA stream. Returns nullptr if not in shared mode.
std::shared_ptr<cudaStream_t> get_shared_cuda_stream() const {
std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
return shared_cuda_stream_;
}

// Check if we're using shared CUDA stream mode.
bool is_using_shared_cuda_stream() const {
std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
return shared_cuda_stream_ != nullptr;
}

Error load_function_pointers_into_handle(
void* so_handle,
AOTIDelegateHandle* handle) const {
Expand Down Expand Up @@ -201,6 +230,15 @@ class ET_EXPERIMENTAL CudaBackend final
kSkipCopyOutputToCpuForMethod);
return Error::InvalidArgument;
}
} else if (std::strcmp(option.key, kUseSharedCudaStream) == 0) {
if (auto* val = std::get_if<bool>(&option.value)) {
if (*val) {
create_shared_cuda_stream();
}
} else {
ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream);
return Error::InvalidArgument;
}
}
}
return Error::Ok;
Expand Down Expand Up @@ -282,7 +320,7 @@ class ET_EXPERIMENTAL CudaBackend final
processed->Free();

// Create handle and load function pointers into it
AOTIDelegateHandle* handle = new AOTIDelegateHandle();
cuda::CudaDelegateHandle* handle = new cuda::CudaDelegateHandle();
handle->so_handle = lib_handle;
handle->so_path = so_path.string();
handle->method_name = method_name;
Expand Down Expand Up @@ -313,10 +351,31 @@ class ET_EXPERIMENTAL CudaBackend final
handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
buffer_res->Free();
}
// Create a CUDA stream for asynchronous execution
cudaStream_t cuda_stream;
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
handle->cuda_stream = static_cast<void*>(cuda_stream);

// Use shared CUDA stream if enabled via options, otherwise create one.
// A shared stream ensures proper ordering across multiple methods
// (e.g., encoder, decoder, sampler) when using skip-copy optimization.
if (is_using_shared_cuda_stream()) {
// Shared stream mode: all handles share the same stream.
handle->cuda_stream = get_shared_cuda_stream();
ET_LOG(
Info,
"Using shared CUDA stream %p for method %s",
handle->get_cuda_stream(),
method_name.c_str());
} else {
// Per-handle stream mode: each handle owns its own stream.
handle->cuda_stream = cuda::create_cuda_stream();
if (handle->cuda_stream == nullptr) {
delete handle;
return Error::Internal;
}
ET_LOG(
Info,
"Created new CUDA stream %p for method %s",
handle->get_cuda_stream(),
method_name.c_str());
}

return (DelegateHandle*)handle; // Return the handle post-processing
}
Expand All @@ -326,14 +385,16 @@ class ET_EXPERIMENTAL CudaBackend final
BackendExecutionContext& context,
DelegateHandle* handle_,
Span<EValue*> args) const override {
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;

size_t n_inputs;
handle->get_num_inputs(handle->container_handle, &n_inputs);

size_t n_outputs;
handle->get_num_outputs(handle->container_handle, &n_outputs);

setCurrentCUDAStream(handle->get_cuda_stream(), 0);

ET_CHECK_OR_RETURN_ERROR(
n_inputs + n_outputs == args.size(),
InvalidArgument,
Expand Down Expand Up @@ -406,13 +467,18 @@ class ET_EXPERIMENTAL CudaBackend final
// expects ETensor* as input/output. We avoid changing its signature since
// it's shared with the Metal backend. Instead, we reinterpret_cast
// SlimTensor* to Tensor*
//
// Get current CUDA stream and pass it to AOTInductorModelContainerRun
Result<cudaStream_t> cuda_stream_ret = getCurrentCUDAStream(0);
cudaStream_t cuda_stream = cuda_stream_ret.get();
ET_CHECK_OK_OR_RETURN_ERROR(cuda_stream_ret.error());
AOTIRuntimeError error = handle->run(
handle->container_handle,
reinterpret_cast<Tensor**>(gpu_inputs.data()),
n_inputs,
reinterpret_cast<Tensor**>(gpu_outputs.data()),
n_outputs,
handle->cuda_stream,
static_cast<void*>(cuda_stream),
nullptr);

ET_CHECK_OR_RETURN_ERROR(
Expand All @@ -423,22 +489,12 @@ class ET_EXPERIMENTAL CudaBackend final

const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);

// Synchronize CUDA stream to ensure kernel execution is complete
// before accessing output data (either for copy or skip-copy path)
cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
cudaError_t sync_err = cudaStreamSynchronize(cuda_stream);
ET_CHECK_OR_RETURN_ERROR(
sync_err == cudaSuccess,
Internal,
"cudaStreamSynchronize failed: %s",
cudaGetErrorString(sync_err));

if (copy_outputs) {
// Deep copy GPU SlimTensor results back to CPU ETensors
for (size_t i = 0; i < n_outputs; i++) {
auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
ET_CHECK_OK_OR_RETURN_ERROR(
copy_slimtensor_to_etensor(gpu_outputs[i], cpu_output_tensor),
copy_slimtensor_to_etensor_async(
gpu_outputs[i], cpu_output_tensor, cuda_stream),
"Failed to copy GPU output %zu back to CPU ETensor",
i);
}
Expand Down Expand Up @@ -483,7 +539,7 @@ class ET_EXPERIMENTAL CudaBackend final
if (handle_ == nullptr) {
return;
}
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;

// Clean up cached output tensors for this handle
{
Expand All @@ -495,16 +551,10 @@ class ET_EXPERIMENTAL CudaBackend final
}
}

// Destroy the CUDA stream if it exists
if (handle->cuda_stream != nullptr) {
cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
ET_CHECK_OR_LOG_ERROR(
stream_err == cudaSuccess,
"Failed to destroy CUDA stream: %s",
cudaGetErrorString(stream_err));
handle->cuda_stream = nullptr;
}
// The CUDA stream is managed by shared_ptr in the handle.
// It will be automatically destroyed when the last handle using it
// is destroyed. Just reset our reference.
handle->cuda_stream.reset();

// NOTE: AOTInductorModelContainerDelete does not work correctly with
// multiple .so files. Deleting one container frees shared resources,
Expand Down Expand Up @@ -541,13 +591,22 @@ class ET_EXPERIMENTAL CudaBackend final
mutable std::mutex skip_copy_method_mutex_;
std::string skip_copy_method_;

// Shared CUDA stream for all methods. When set (non-null), all methods use
// the same stream to ensure proper ordering (critical for skip-copy
// optimization). Created when use_shared_cuda_stream option is set to true.
// Managed via shared_ptr so it's automatically cleaned up when last handle
// is destroyed.
mutable std::mutex cuda_stream_mutex_;
std::shared_ptr<cudaStream_t> shared_cuda_stream_ = nullptr;

// Cached output tensors for skip-copy optimization.
// When skip-copy is enabled, output SlimTensors are cached here to keep
// the underlying GPU memory alive while the caller processes the results.
// Maps each AOTIDelegateHandle* to its vector of cached output tensors.
// Maps each CudaDelegateHandle* to its vector of cached output tensors.
mutable std::mutex cached_outputs_mutex_;
mutable std::unordered_map<AOTIDelegateHandle*, std::vector<SlimTensor*>>
cached_outputs_;
mutable std::
unordered_map<cuda::CudaDelegateHandle*, std::vector<SlimTensor*>>
cached_outputs_;
};

} // namespace executorch::backends::cuda
Expand Down
65 changes: 65 additions & 0 deletions backends/cuda/runtime/cuda_delegate_handle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cuda_runtime.h>
#include <executorch/backends/aoti/aoti_delegate_handle.h>
#include <memory>

namespace executorch {
namespace backends {
namespace cuda {

// Shared CUDA stream wrapper with proper RAII cleanup.
// This ensures the stream is destroyed when all handles using it are destroyed.
struct CudaStreamDeleter {
void operator()(cudaStream_t* stream) const {
if (stream != nullptr && *stream != nullptr) {
cudaStreamDestroy(*stream);
}
delete stream;
}
};

// Creates a new shared CUDA stream.
// Returns nullptr on failure.
inline std::shared_ptr<cudaStream_t> create_cuda_stream() {
cudaStream_t stream;
cudaError_t err = cudaStreamCreate(&stream);
if (err != cudaSuccess) {
return nullptr;
}
return std::shared_ptr<cudaStream_t>(
new cudaStream_t(stream), CudaStreamDeleter());
}
// CUDA-specific delegate handle that extends AOTIDelegateHandle.
// This consolidates CUDA stream management into a single location.
struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
// CUDA stream for this handle, support both shared mode and single mode.
// In shared mode, all cuda delegate handles share the same stream (e.g., for
// skip-copy optimization), they will all hold a reference to the same
// shared_ptr. The stream is automatically destroyed when the last handle is
// destroyed. In single mode, every cuda delegate handle has its own stream.
std::shared_ptr<cudaStream_t> cuda_stream;

// Get the raw CUDA stream pointer for use in CUDA API calls.
// Returns nullptr if no stream is set.
cudaStream_t get_cuda_stream() const {
return cuda_stream ? *cuda_stream : nullptr;
}

// Check if this handle has a valid CUDA stream.
bool has_cuda_stream() const {
return cuda_stream != nullptr && *cuda_stream != nullptr;
}
};

} // namespace cuda
} // namespace backends
} // namespace executorch
Loading
Loading