Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 0 additions & 2 deletions backends/aoti/aoti_delegate_handle.h
Original file line number Diff line number Diff line change
Expand Up @@ -84,8 +84,6 @@ struct AOTIDelegateHandle {
void* so_handle;
std::string so_path;
AOTInductorModelContainerHandle container_handle;
void* cuda_stream; // cudaStream_t stored as void* to avoid CUDA header
// dependency
std::string method_name;

// Function pointers specific to this handle's shared library
Expand Down
41 changes: 36 additions & 5 deletions backends/aoti/slim/core/storage.h
Original file line number Diff line number Diff line change
Expand Up @@ -127,16 +127,47 @@ struct DeviceTraits<c10::DeviceType::CUDA> {
/// @param ptr Pointer to device memory to free.
static void free(void* ptr) {
// Get the current stream for the current device
// Currently all cuda slimtensors should be on the same device same stream,
// so we can just use the stream on current device.
// TODO(gasoonjia): add cuda stream as a member of MaybeOwningStorage to
// support multiple devices.
auto stream_result = executorch::backends::cuda::getCurrentCUDAStream(-1);
if (stream_result.ok()) {
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
ET_CHECK_MSG(stream_result.ok(), "Failed to get current CUDA stream");
ET_CUDA_LOG_WARN(cudaFreeAsync(ptr, stream_result.get()));
}

/// Copies memory between CPU and CUDA or CUDA and CUDA asynchronously.
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
/// @param dst_device Destination device.
/// @param src_device Source device.
/// @param stream CUDA stream for async copy.
static void memcpy_async(
void* dst,
const void* src,
size_t nbytes,
const c10::Device& dst_device,
const c10::Device& src_device,
cudaStream_t stream) {
cudaMemcpyKind direction = cudaMemcpyDeviceToDevice;

if (src_device.is_cpu()) {
direction = cudaMemcpyHostToDevice;
} else if (dst_device.is_cpu()) {
direction = cudaMemcpyDeviceToHost;
} else {
// Fallback to synchronous free if we can't get the stream
ET_CUDA_LOG_WARN(cudaFree(ptr));
ET_CHECK_MSG(
src_device.index() == dst_device.index(),
"CUDA memcpy across different device indices not supported: %d != %d",
static_cast<int>(src_device.index()),
static_cast<int>(dst_device.index()));
}

ET_CUDA_CHECK(cudaMemcpyAsync(dst, src, nbytes, direction, stream));
}

/// Copies memory between CPU and CUDA or CUDA and CUDA.
/// Copies memory between CPU and CUDA or CUDA and CUDA synchronously.
/// @param dst Destination pointer.
/// @param src Source pointer.
/// @param nbytes Number of bytes to copy.
Expand Down
3 changes: 3 additions & 0 deletions backends/cuda/runtime/TARGETS
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,9 @@ runtime.cxx_library(
srcs = [
"cuda_backend.cpp",
],
headers = [
"cuda_delegate_handle.h",
],
# @lint-ignore BUCKLINT: Avoid `link_whole=True` (https://fburl.com/avoid-link-whole)
link_whole = True,
supports_python_dlopen = True,
Expand Down
125 changes: 92 additions & 33 deletions backends/cuda/runtime/cuda_backend.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@

// Include our shim layer headers
#include <executorch/backends/aoti/aoti_delegate_handle.h>
#include <executorch/backends/cuda/runtime/cuda_delegate_handle.h>
#include <executorch/backends/cuda/runtime/platform/platform.h>
#include <executorch/backends/cuda/runtime/shims/memory.h>
#include <executorch/backends/cuda/runtime/utils.h>
Expand Down Expand Up @@ -77,6 +78,7 @@ using slim::c10::DeviceType;
namespace {
constexpr char kSkipCopyOutputToCpuForMethod[] =
"skip_copy_output_to_cpu_for_method";
constexpr char kUseSharedCudaStream[] = "use_shared_cuda_stream";
} // anonymous namespace

class ET_EXPERIMENTAL CudaBackend final
Expand Down Expand Up @@ -143,6 +145,33 @@ class ET_EXPERIMENTAL CudaBackend final
return method_in_csv(method_name, skip_copy_method_);
}

// Create the shared CUDA stream. Called when use_shared_cuda_stream option
// is set to true. The presence of shared_cuda_stream_ indicates shared mode.
void create_shared_cuda_stream() {
std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
if (shared_cuda_stream_ != nullptr) {
return; // Already created
}
shared_cuda_stream_ = cuda::create_cuda_stream();
if (shared_cuda_stream_ == nullptr) {
ET_LOG(Error, "Failed to create shared CUDA stream");
return;
}
ET_LOG(Info, "Created shared CUDA stream: %p", *shared_cuda_stream_);
}

// Get the shared CUDA stream. Returns nullptr if not in shared mode.
std::shared_ptr<cudaStream_t> get_shared_cuda_stream() const {
std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
return shared_cuda_stream_;
}

// Check if we're using shared CUDA stream mode.
bool is_using_shared_cuda_stream() const {
std::lock_guard<std::mutex> guard(cuda_stream_mutex_);
return shared_cuda_stream_ != nullptr;
}

Error load_function_pointers_into_handle(
void* so_handle,
AOTIDelegateHandle* handle) const {
Expand Down Expand Up @@ -201,6 +230,15 @@ class ET_EXPERIMENTAL CudaBackend final
kSkipCopyOutputToCpuForMethod);
return Error::InvalidArgument;
}
} else if (std::strcmp(option.key, kUseSharedCudaStream) == 0) {
if (auto* val = std::get_if<bool>(&option.value)) {
if (*val) {
create_shared_cuda_stream();
}
} else {
ET_LOG(Error, "Option %s must be a boolean.", kUseSharedCudaStream);
return Error::InvalidArgument;
}
}
}
return Error::Ok;
Expand Down Expand Up @@ -282,7 +320,7 @@ class ET_EXPERIMENTAL CudaBackend final
processed->Free();

// Create handle and load function pointers into it
AOTIDelegateHandle* handle = new AOTIDelegateHandle();
cuda::CudaDelegateHandle* handle = new cuda::CudaDelegateHandle();
handle->so_handle = lib_handle;
handle->so_path = so_path.string();
handle->method_name = method_name;
Expand Down Expand Up @@ -313,10 +351,31 @@ class ET_EXPERIMENTAL CudaBackend final
handle->container_handle, static_cast<const uint8_t*>(weights_blob)));
buffer_res->Free();
}
// Create a CUDA stream for asynchronous execution
cudaStream_t cuda_stream;
ET_CUDA_CHECK_OR_RETURN_ERROR(cudaStreamCreate(&cuda_stream));
handle->cuda_stream = static_cast<void*>(cuda_stream);

// Use shared CUDA stream if enabled via options, otherwise create one.
// A shared stream ensures proper ordering across multiple methods
// (e.g., encoder, decoder, sampler) when using skip-copy optimization.
if (is_using_shared_cuda_stream()) {
// Shared stream mode: all handles share the same stream.
handle->cuda_stream = get_shared_cuda_stream();
ET_LOG(
Info,
"Using shared CUDA stream %p for method %s",
handle->get_cuda_stream(),
method_name.c_str());
} else {
// Per-handle stream mode: each handle owns its own stream.
handle->cuda_stream = cuda::create_cuda_stream();
if (handle->cuda_stream == nullptr) {
delete handle;
return Error::Internal;
}
ET_LOG(
Info,
"Created new CUDA stream %p for method %s",
handle->get_cuda_stream(),
method_name.c_str());
}

return (DelegateHandle*)handle; // Return the handle post-processing
}
Expand All @@ -326,14 +385,16 @@ class ET_EXPERIMENTAL CudaBackend final
BackendExecutionContext& context,
DelegateHandle* handle_,
Span<EValue*> args) const override {
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;

size_t n_inputs;
handle->get_num_inputs(handle->container_handle, &n_inputs);

size_t n_outputs;
handle->get_num_outputs(handle->container_handle, &n_outputs);

setCurrentCUDAStream(handle->get_cuda_stream(), 0);

ET_CHECK_OR_RETURN_ERROR(
n_inputs + n_outputs == args.size(),
InvalidArgument,
Expand Down Expand Up @@ -406,13 +467,18 @@ class ET_EXPERIMENTAL CudaBackend final
// expects ETensor* as input/output. We avoid changing its signature since
// it's shared with the Metal backend. Instead, we reinterpret_cast
// SlimTensor* to Tensor*
//
// Get current CUDA stream and pass it to AOTInductorModelContainerRun
Result<cudaStream_t> cuda_stream_ret = getCurrentCUDAStream(0);
cudaStream_t cuda_stream = cuda_stream_ret.get();
ET_CHECK_OK_OR_RETURN_ERROR(cuda_stream_ret.error());
AOTIRuntimeError error = handle->run(
handle->container_handle,
reinterpret_cast<Tensor**>(gpu_inputs.data()),
n_inputs,
reinterpret_cast<Tensor**>(gpu_outputs.data()),
n_outputs,
handle->cuda_stream,
static_cast<void*>(cuda_stream),
nullptr);

ET_CHECK_OR_RETURN_ERROR(
Expand All @@ -423,22 +489,12 @@ class ET_EXPERIMENTAL CudaBackend final

const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);

// Synchronize CUDA stream to ensure kernel execution is complete
// before accessing output data (either for copy or skip-copy path)
cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
cudaError_t sync_err = cudaStreamSynchronize(cuda_stream);
ET_CHECK_OR_RETURN_ERROR(
sync_err == cudaSuccess,
Internal,
"cudaStreamSynchronize failed: %s",
cudaGetErrorString(sync_err));

if (copy_outputs) {
// Deep copy GPU SlimTensor results back to CPU ETensors
for (size_t i = 0; i < n_outputs; i++) {
auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
ET_CHECK_OK_OR_RETURN_ERROR(
copy_slimtensor_to_etensor(gpu_outputs[i], cpu_output_tensor),
copy_slimtensor_to_etensor_async(
gpu_outputs[i], cpu_output_tensor, cuda_stream),
"Failed to copy GPU output %zu back to CPU ETensor",
i);
}
Expand Down Expand Up @@ -483,7 +539,7 @@ class ET_EXPERIMENTAL CudaBackend final
if (handle_ == nullptr) {
return;
}
AOTIDelegateHandle* handle = (AOTIDelegateHandle*)handle_;
cuda::CudaDelegateHandle* handle = (cuda::CudaDelegateHandle*)handle_;

// Clean up cached output tensors for this handle
{
Expand All @@ -495,16 +551,10 @@ class ET_EXPERIMENTAL CudaBackend final
}
}

// Destroy the CUDA stream if it exists
if (handle->cuda_stream != nullptr) {
cudaStream_t cuda_stream = static_cast<cudaStream_t>(handle->cuda_stream);
cudaError_t stream_err = cudaStreamDestroy(cuda_stream);
ET_CHECK_OR_LOG_ERROR(
stream_err == cudaSuccess,
"Failed to destroy CUDA stream: %s",
cudaGetErrorString(stream_err));
handle->cuda_stream = nullptr;
}
// The CUDA stream is managed by shared_ptr in the handle.
// It will be automatically destroyed when the last handle using it
// is destroyed. Just reset our reference.
handle->cuda_stream.reset();

// NOTE: AOTInductorModelContainerDelete does not work correctly with
// multiple .so files. Deleting one container frees shared resources,
Expand Down Expand Up @@ -541,13 +591,22 @@ class ET_EXPERIMENTAL CudaBackend final
mutable std::mutex skip_copy_method_mutex_;
std::string skip_copy_method_;

// Shared CUDA stream for all methods. When set (non-null), all methods use
// the same stream to ensure proper ordering (critical for skip-copy
// optimization). Created when use_shared_cuda_stream option is set to true.
// Managed via shared_ptr so it's automatically cleaned up when last handle
// is destroyed.
mutable std::mutex cuda_stream_mutex_;
std::shared_ptr<cudaStream_t> shared_cuda_stream_ = nullptr;

// Cached output tensors for skip-copy optimization.
// When skip-copy is enabled, output SlimTensors are cached here to keep
// the underlying GPU memory alive while the caller processes the results.
// Maps each AOTIDelegateHandle* to its vector of cached output tensors.
// Maps each CudaDelegateHandle* to its vector of cached output tensors.
mutable std::mutex cached_outputs_mutex_;
mutable std::unordered_map<AOTIDelegateHandle*, std::vector<SlimTensor*>>
cached_outputs_;
mutable std::
unordered_map<cuda::CudaDelegateHandle*, std::vector<SlimTensor*>>
cached_outputs_;
};

} // namespace executorch::backends::cuda
Expand Down
65 changes: 65 additions & 0 deletions backends/cuda/runtime/cuda_delegate_handle.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* All rights reserved.
*
* This source code is licensed under the BSD-style license found in the
* LICENSE file in the root directory of this source tree.
*/

#pragma once

#include <cuda_runtime.h>
#include <executorch/backends/aoti/aoti_delegate_handle.h>
#include <memory>

namespace executorch {
namespace backends {
namespace cuda {

// Shared CUDA stream wrapper with proper RAII cleanup.
// This ensures the stream is destroyed when all handles using it are destroyed.
struct CudaStreamDeleter {
void operator()(cudaStream_t* stream) const {
if (stream != nullptr && *stream != nullptr) {
cudaStreamDestroy(*stream);
}
delete stream;
}
};

// Creates a new shared CUDA stream.
// Returns nullptr on failure.
inline std::shared_ptr<cudaStream_t> create_cuda_stream() {
cudaStream_t stream;
cudaError_t err = cudaStreamCreate(&stream);
if (err != cudaSuccess) {
return nullptr;
}
return std::shared_ptr<cudaStream_t>(
new cudaStream_t(stream), CudaStreamDeleter());
}
// CUDA-specific delegate handle that extends AOTIDelegateHandle.
// This consolidates CUDA stream management into a single location.
struct CudaDelegateHandle : public aoti::AOTIDelegateHandle {
// CUDA stream for this handle, support both shared mode and single mode.
// In shared mode, all cuda delegate handles share the same stream (e.g., for
// skip-copy optimization), they will all hold a reference to the same
// shared_ptr. The stream is automatically destroyed when the last handle is
// destroyed. In single mode, every cuda delegate handle has its own stream.
std::shared_ptr<cudaStream_t> cuda_stream;

// Get the raw CUDA stream pointer for use in CUDA API calls.
// Returns nullptr if no stream is set.
cudaStream_t get_cuda_stream() const {
return cuda_stream ? *cuda_stream : nullptr;
}

// Check if this handle has a valid CUDA stream.
bool has_cuda_stream() const {
return cuda_stream != nullptr && *cuda_stream != nullptr;
}
};

} // namespace cuda
} // namespace backends
} // namespace executorch
Loading
Loading