Skip to content

Commit 3915a90

Browse files
committed
[ET Device Support] CUDA-native Qwen 3.5 MoE inference with device tensor pipeline
Pull Request resolved: #18788 Integrate the ET device tensor pipeline into the Qwen 3.5 MoE model to eliminate unnecessary H2D/D2H copies during inference. - Export: Multi-method export (`forward` + `sample`) with device memory planning enabled and method-level H2D/D2H skipping. - Runner: Custom CUDA-native inference loop that keeps logits on GPU between forward and sample, reuses CUDA tensors across iterations, and only copies the 8-byte token ID back to CPU for EOS checking. ghstack-source-id: 386793196 @exported-using-ghexport Differential Revision: [D100133933](https://our.internmc.facebook.com/intern/diff/D100133933/)
1 parent f6fc389 commit 3915a90

9 files changed

Lines changed: 277 additions & 46 deletions

File tree

backends/cuda/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -107,12 +107,13 @@ set(_aoti_cuda_shim_sources runtime/cuda_allocator.cpp runtime/shims/memory.cpp
107107
runtime/shims/cuda_guard.cpp
108108
)
109109

110-
# Only build CUDA shims when CUDA language/toolchain is available.
110+
# Only build CUDA-specific shims when CUDA language/toolchain is available.
111111
if(CMAKE_CUDA_COMPILER)
112112
list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu
113113
runtime/shims/int4_plain_mm.cu runtime/shims/sort.cu
114114
runtime/shims/rand.cu
115115
)
116+
list(APPEND _aoti_cuda_shim_sources runtime/shims/randint.cu)
116117
endif()
117118

118119
add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})

backends/cuda/cuda_backend.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -253,8 +253,7 @@ def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any]
253253
mode = spec.value.decode("utf-8").upper()
254254
if mode not in ["ON", "OFF"]:
255255
raise ValueError(
256-
f"Invalid triton_kernel_mode: {mode}. "
257-
f"Expected 'ON' or 'OFF'."
256+
f"Invalid triton_kernel_mode: {mode}. Expected 'ON' or 'OFF'."
258257
)
259258
triton_kernel_mode = mode
260259
passes = [MoveCondPredicateToCpuPass()]

backends/cuda/runtime/cuda_backend.cpp

Lines changed: 46 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,16 @@ class ET_EXPERIMENTAL CudaBackend final
469469
return (DelegateHandle*)handle; // Return the handle post-processing
470470
}
471471

472-
// Once per execution
472+
// Execute the AOTI-compiled CUDA kernel for one inference step.
473+
//
474+
// Currently supports both CPU and CUDA memory for IO tensors:
475+
// - Inputs: detected via cudaPointerGetAttributes; CUDA data is wrapped
476+
// in-place (no copy), CPU data is copied to GPU via from_etensor().
477+
// - Outputs: either copied to ETensor's backing memory (CPU or CUDA),
478+
// or the ETensor is rewired to point at GPU memory (skip-copy mode).
479+
//
480+
// TODO: Once the device tensor pipeline is fully adopted, all IO tensors
481+
// will reside in CUDA memory. Remove the CPU fallback paths.
473482
Error execute(
474483
BackendExecutionContext& context,
475484
DelegateHandle* handle_,
@@ -494,14 +503,17 @@ class ET_EXPERIMENTAL CudaBackend final
494503
n_outputs,
495504
args.size())
496505

497-
// Verify device info on all memory-planned, ET-driven IO tensors.
498-
// All input and output tensors should have device_type = CUDA, which
499-
// is set during serialization by PropagateDevicePass based on the
500-
// target_device compile spec from CudaPartitioner.
506+
// Verify device metadata on all IO tensors.
507+
// All tensors should have device_type = CUDA, set during serialization
508+
// by PropagateDevicePass based on the target_device compile spec from
509+
// CudaPartitioner.
501510
//
502-
// Note: At this stage, the tensor memory is still on CPU. The device_type
503-
// is metadata indicating where the tensor *should* reside. The backend
504-
// is responsible for copying data to the actual CUDA device.
511+
// Note: device_type is metadata — the actual memory location may be
512+
// either CPU (legacy path with H2D copy ops) or CUDA (when device
513+
// memory planning is enabled via enable_non_cpu_memory_planning,
514+
// which allocates delegate IO in CUDA memory). The backend detects
515+
// the actual location via cudaPointerGetAttributes and handles both
516+
// cases.
505517
for (size_t i = 0; i < n_inputs + n_outputs; i++) {
506518
auto* tensor = &(args[i]->toTensor());
507519
auto device_type = tensor->unsafeGetTensorImpl()->device_type();
@@ -582,13 +594,13 @@ class ET_EXPERIMENTAL CudaBackend final
582594
std::vector<SlimTensor*> gpu_inputs(n_inputs);
583595
std::vector<SlimTensor*> gpu_outputs(n_outputs);
584596

585-
// Process input tensors: convert ETensor (CPU) to SlimTensor (GPU)
597+
// Process input tensors: convert ETensor to SlimTensor
586598
for (size_t i = 0; i < n_inputs; i++) {
587-
auto* cpu_tensor = &(args[i]->toTensor());
599+
auto* input_tensor = &(args[i]->toTensor());
588600

589601
// CAPTURE step: allocate persistent static GPU buffers
590602
if (is_capture_step) {
591-
size_t nbytes = cpu_tensor->nbytes();
603+
size_t nbytes = input_tensor->nbytes();
592604

593605
void* static_ptr = nullptr;
594606
cudaError_t merr = cudaMalloc(&static_ptr, nbytes);
@@ -601,46 +613,49 @@ class ET_EXPERIMENTAL CudaBackend final
601613

602614
cudaMemcpy(
603615
static_ptr,
604-
cpu_tensor->const_data_ptr(),
616+
input_tensor->const_data_ptr(),
605617
nbytes,
606618
cudaMemcpyHostToDevice);
607619

608620
handle->cuda_graph_state.static_input_ptrs.push_back(static_ptr);
609621
handle->cuda_graph_state.static_input_nbytes.push_back(nbytes);
610622

611623
gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata(
612-
static_ptr, cpu_tensor);
624+
static_ptr, input_tensor);
613625
continue;
614626
}
615627

616628
// Check if input data is already on GPU (skip-copy optimization for
617629
// inputs) This can happen when the caller has pre-staged data on GPU
618630
cudaPointerAttributes attributes{};
619-
const void* data_ptr = cpu_tensor->const_data_ptr();
631+
const void* data_ptr = input_tensor->const_data_ptr();
620632
if (data_ptr != nullptr) {
621633
cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr);
622634
if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) {
623635
// Data is already on GPU - wrap it directly without copy
624636
gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata(
625-
const_cast<void*>(data_ptr), cpu_tensor);
637+
const_cast<void*>(data_ptr), input_tensor);
626638

627639
continue;
628640
}
629641
}
630642

631-
// Data is on CPU - use from_etensor to copy to GPU
643+
// Data is in CPU memory (legacy path) — copy to GPU via from_etensor.
644+
// TODO: Remove this path once all callers use the device tensor pipeline.
632645
gpu_inputs[i] = new SlimTensor(
633-
from_etensor(*cpu_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
646+
from_etensor(*input_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
634647
}
635648

636-
// Process output tensors: create GPU SlimTensors for kernel output.
637-
// Save pre-run handles to detect orphans after run().
649+
// Allocate GPU SlimTensors for kernel outputs. These are always
650+
// freshly allocated on GPU regardless of the input memory mode.
651+
// Save pre-run handles to detect orphans after run() (the AOTI
652+
// runtime may replace output handles with its own allocations).
638653
std::vector<SlimTensor*> pre_run_outputs(n_outputs, nullptr);
639654
for (size_t i = 0; i < n_outputs; i++) {
640-
auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
641-
auto sizes = cpu_output_tensor->sizes();
642-
auto strides = cpu_output_tensor->strides();
643-
auto scalar_type = cpu_output_tensor->scalar_type();
655+
auto* output_tensor = &(args[i + n_inputs]->toTensor());
656+
auto sizes = output_tensor->sizes();
657+
auto strides = output_tensor->strides();
658+
auto scalar_type = output_tensor->scalar_type();
644659

645660
std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
646661
std::vector<int64_t> strides_vec(strides.begin(), strides.end());
@@ -801,13 +816,18 @@ class ET_EXPERIMENTAL CudaBackend final
801816

802817
const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);
803818

819+
// Output disposition: copy to ETensor backing memory or keep on GPU.
820+
// When copy_outputs is true (default), results are copied to the
821+
// ETensor's memory (which may be CPU or CUDA planned memory).
822+
// When false (skip-copy optimization), the ETensor is rewired to
823+
// point at the GPU SlimTensor's memory directly.
804824
if (copy_outputs) {
805825
for (size_t i = 0; i < n_outputs; i++) {
806-
auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
826+
auto* output_tensor = &(args[i + n_inputs]->toTensor());
807827
ET_CHECK_OK_OR_RETURN_ERROR(
808828
copy_slimtensor_to_etensor_async(
809-
gpu_outputs[i], cpu_output_tensor, cuda_stream),
810-
"Failed to copy GPU output %zu back to CPU ETensor",
829+
gpu_outputs[i], output_tensor, cuda_stream),
830+
"Failed to copy GPU output %zu back to ETensor",
811831
i);
812832
delete gpu_outputs[i];
813833
gpu_outputs[i] = nullptr;
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#include <cuda_runtime.h>
10+
#include <curand.h>
11+
12+
#include <executorch/backends/cuda/runtime/shims/randint.h>
13+
#include <executorch/runtime/platform/assert.h>
14+
#include <executorch/runtime/platform/log.h>
15+
16+
#include <cstdint>
17+
#include <ctime>
18+
19+
namespace executorch::backends::cuda {
20+
21+
using executorch::runtime::Error;
22+
23+
namespace {
24+
25+
// Transform cuRAND uniform doubles (0, 1] to int64 values in [low, high).
26+
__global__ void uniform_to_randint_kernel(
27+
int64_t* out,
28+
const double* uniform,
29+
int64_t numel,
30+
int64_t low,
31+
int64_t range) {
32+
int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
33+
if (idx < numel) {
34+
// uniform is in (0, 1], so (uniform * range) is in (0, range].
35+
// Subtract 1 and clamp to get [0, range-1], then add low for [low, high-1].
36+
int64_t val = static_cast<int64_t>(uniform[idx] * range);
37+
out[idx] = low + (val >= range ? range - 1 : val);
38+
}
39+
}
40+
41+
curandGenerator_t get_or_create_generator() {
42+
static curandGenerator_t gen = nullptr;
43+
if (gen == nullptr) {
44+
curandCreateGenerator(&gen, CURAND_RNG_PSEUDO_DEFAULT);
45+
curandSetPseudoRandomGeneratorSeed(
46+
gen, static_cast<unsigned long long>(time(nullptr)));
47+
}
48+
return gen;
49+
}
50+
51+
} // anonymous namespace
52+
53+
extern "C" {
54+
55+
AOTITorchError aoti_torch_cuda_randint_low_out(
56+
SlimTensor* out,
57+
int64_t low,
58+
int64_t high,
59+
const int64_t* size,
60+
int64_t size_len_) {
61+
ET_CHECK_OR_RETURN_ERROR(
62+
out != nullptr,
63+
InvalidArgument,
64+
"aoti_torch_cuda_randint_low_out: out tensor is null");
65+
66+
ET_CHECK_OR_RETURN_ERROR(
67+
high > low,
68+
InvalidArgument,
69+
"aoti_torch_cuda_randint_low_out: requires high > low");
70+
71+
int64_t numel = 1;
72+
for (int64_t i = 0; i < size_len_; i++) {
73+
numel *= size[i];
74+
}
75+
if (numel == 0) {
76+
return Error::Ok;
77+
}
78+
79+
int64_t range = high - low;
80+
int64_t* out_data = static_cast<int64_t*>(out->data_ptr());
81+
82+
// Allocate temporary buffer for uniform doubles on device.
83+
double* d_uniform = nullptr;
84+
auto alloc_err = cudaMalloc(&d_uniform, numel * sizeof(double));
85+
ET_CHECK_OR_RETURN_ERROR(
86+
alloc_err == cudaSuccess,
87+
Internal,
88+
"aoti_torch_cuda_randint_low_out: cudaMalloc failed (%d)",
89+
static_cast<int>(alloc_err));
90+
91+
// Generate uniform doubles in (0, 1].
92+
auto gen = get_or_create_generator();
93+
curandGenerateUniformDouble(gen, d_uniform, numel);
94+
95+
// Transform to integers in [low, high).
96+
constexpr int kThreads = 256;
97+
int blocks = static_cast<int>((numel + kThreads - 1) / kThreads);
98+
uniform_to_randint_kernel<<<blocks, kThreads>>>(
99+
out_data, d_uniform, numel, low, range);
100+
101+
cudaFree(d_uniform);
102+
103+
return Error::Ok;
104+
}
105+
106+
} // extern "C"
107+
108+
} // namespace executorch::backends::cuda
Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
#pragma once
10+
11+
#include <executorch/backends/aoti/common_shims_slim.h>
12+
#include <executorch/backends/aoti/export.h>
13+
14+
namespace executorch::backends::cuda {
15+
16+
using executorch::backends::aoti::AOTITorchError;
17+
using SlimTensor = executorch::backends::aoti::slim::SlimTensor;
18+
19+
extern "C" {
20+
21+
/**
22+
* Fills a pre-allocated CUDA tensor with random integers in [low, high).
23+
*
24+
* Used by AOTI-generated code when the model calls torch.randint or ops
25+
* that decompose into randint (e.g. torch.rand_like on some dtypes).
26+
*
27+
* @param out Pre-allocated output tensor on CUDA (must not be null).
28+
* @param low Lower bound (inclusive) of the random range.
29+
* @param high Upper bound (exclusive) of the random range.
30+
* @param size Pointer to array of output dimension sizes.
31+
* @param size_len_ Number of dimensions.
32+
* @return AOTITorchError error code (Error::Ok on success).
33+
*/
34+
AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_randint_low_out(
35+
SlimTensor* out,
36+
int64_t low,
37+
int64_t high,
38+
const int64_t* size,
39+
int64_t size_len_);
40+
41+
} // extern "C"
42+
43+
} // namespace executorch::backends::cuda

examples/models/qwen3_5_moe/CMakeLists.txt

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -32,14 +32,8 @@ list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
3232
executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
3333

3434
# Extensions
35-
list(
36-
APPEND
37-
link_libraries
38-
extension_llm_runner
39-
extension_module
40-
extension_data_loader
41-
extension_tensor
42-
extension_flat_tensor
35+
list(APPEND link_libraries extension_module extension_data_loader
36+
extension_tensor extension_flat_tensor
4337
)
4438

4539
# Backend selection
@@ -48,7 +42,7 @@ if(EXECUTORCH_BUILD_METAL)
4842
executorch_target_link_options_shared_lib(metal_backend)
4943
elseif(EXECUTORCH_BUILD_CUDA)
5044
find_package(CUDAToolkit REQUIRED)
51-
list(APPEND link_libraries aoti_cuda_backend)
45+
list(APPEND link_libraries aoti_cuda_backend CUDA::cudart)
5246
executorch_target_link_options_shared_lib(aoti_cuda_backend)
5347
add_compile_definitions(EXECUTORCH_BUILD_CUDA)
5448
else()
@@ -60,7 +54,12 @@ endif()
6054
# Tokenizer
6155
list(APPEND link_libraries tokenizers::tokenizers)
6256

63-
add_executable(qwen3_5_moe_runner main.cpp)
57+
add_executable(
58+
qwen3_5_moe_runner
59+
main.cpp ${EXECUTORCH_ROOT}/runtime/core/device_allocator.cpp
60+
${EXECUTORCH_ROOT}/runtime/core/device_memory_buffer.cpp
61+
${EXECUTORCH_ROOT}/backends/cuda/runtime/cuda_allocator.cpp
62+
)
6463
target_include_directories(
6564
qwen3_5_moe_runner PUBLIC ${_common_include_directories}
6665
)

0 commit comments

Comments
 (0)