[ET Device Support] CUDA-native Qwen 3.5 MoE inference with device tensor pipeline

Gasoonjia · Gasoonjia · commit 3915a905ea25 · 2026-05-27T15:09:08.000-07:00
Pull Request resolved: #18788 Integrate the ET device tensor pipeline into the Qwen 3.5 MoE model to eliminate unnecessary H2D/D2H copies during inference. - Export: Multi-method export (`forward` + `sample`) with device memory planning enabled and method-level H2D/D2H skipping. - Runner: Custom CUDA-native inference loop that keeps logits on GPU between forward and sample, reuses CUDA tensors across iterations, and only copies the 8-byte token ID back to CPU for EOS checking. ghstack-source-id: 386793196 @exported-using-ghexport Differential Revision: [D100133933](https://our.internmc.facebook.com/intern/diff/D100133933/)
diff --git a/backends/cuda/CMakeLists.txt b/backends/cuda/CMakeLists.txt
@@ -107,12 +107,13 @@ set(_aoti_cuda_shim_sources runtime/cuda_allocator.cpp runtime/shims/memory.cpp
                             runtime/shims/cuda_guard.cpp
 )
 
-# Only build CUDA shims when CUDA language/toolchain is available.
+# Only build CUDA-specific shims when CUDA language/toolchain is available.
 if(CMAKE_CUDA_COMPILER)
   list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu
        runtime/shims/int4_plain_mm.cu runtime/shims/sort.cu
        runtime/shims/rand.cu
   )
+  list(APPEND _aoti_cuda_shim_sources runtime/shims/randint.cu)
 endif()
 
 add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})
diff --git a/backends/cuda/cuda_backend.py b/backends/cuda/cuda_backend.py
@@ -253,8 +253,7 @@ def get_custom_passes(cls, compile_specs: List[CompileSpec]) -> List[typing.Any]
                 mode = spec.value.decode("utf-8").upper()
                 if mode not in ["ON", "OFF"]:
                     raise ValueError(
-                        f"Invalid triton_kernel_mode: {mode}. "
-                        f"Expected 'ON' or 'OFF'."
+                        f"Invalid triton_kernel_mode: {mode}. Expected 'ON' or 'OFF'."
                     )
                 triton_kernel_mode = mode
         passes = [MoveCondPredicateToCpuPass()]
diff --git a/backends/cuda/runtime/cuda_backend.cpp b/backends/cuda/runtime/cuda_backend.cpp
@@ -469,7 +469,16 @@ class ET_EXPERIMENTAL CudaBackend final
     return (DelegateHandle*)handle; // Return the handle post-processing
   }
 
-  // Once per execution
+  // Execute the AOTI-compiled CUDA kernel for one inference step.
+  //
+  // Currently supports both CPU and CUDA memory for IO tensors:
+  //   - Inputs: detected via cudaPointerGetAttributes; CUDA data is wrapped
+  //     in-place (no copy), CPU data is copied to GPU via from_etensor().
+  //   - Outputs: either copied to ETensor's backing memory (CPU or CUDA),
+  //     or the ETensor is rewired to point at GPU memory (skip-copy mode).
+  //
+  // TODO: Once the device tensor pipeline is fully adopted, all IO tensors
+  // will reside in CUDA memory. Remove the CPU fallback paths.
   Error execute(
       BackendExecutionContext& context,
       DelegateHandle* handle_,
@@ -494,14 +503,17 @@ class ET_EXPERIMENTAL CudaBackend final
         n_outputs,
         args.size())
 
-    // Verify device info on all memory-planned, ET-driven IO tensors.
-    // All input and output tensors should have device_type = CUDA, which
-    // is set during serialization by PropagateDevicePass based on the
-    // target_device compile spec from CudaPartitioner.
+    // Verify device metadata on all IO tensors.
+    // All tensors should have device_type = CUDA, set during serialization
+    // by PropagateDevicePass based on the target_device compile spec from
+    // CudaPartitioner.
     //
-    // Note: At this stage, the tensor memory is still on CPU. The device_type
-    // is metadata indicating where the tensor *should* reside. The backend
-    // is responsible for copying data to the actual CUDA device.
+    // Note: device_type is metadata — the actual memory location may be
+    // either CPU (legacy path with H2D copy ops) or CUDA (when device
+    // memory planning is enabled via enable_non_cpu_memory_planning,
+    // which allocates delegate IO in CUDA memory). The backend detects
+    // the actual location via cudaPointerGetAttributes and handles both
+    // cases.
     for (size_t i = 0; i < n_inputs + n_outputs; i++) {
       auto* tensor = &(args[i]->toTensor());
       auto device_type = tensor->unsafeGetTensorImpl()->device_type();
@@ -582,13 +594,13 @@ class ET_EXPERIMENTAL CudaBackend final
     std::vector<SlimTensor*> gpu_inputs(n_inputs);
     std::vector<SlimTensor*> gpu_outputs(n_outputs);
 
-    // Process input tensors: convert ETensor (CPU) to SlimTensor (GPU)
+    // Process input tensors: convert ETensor to SlimTensor
     for (size_t i = 0; i < n_inputs; i++) {
-      auto* cpu_tensor = &(args[i]->toTensor());
+      auto* input_tensor = &(args[i]->toTensor());
 
       // CAPTURE step: allocate persistent static GPU buffers
       if (is_capture_step) {
-        size_t nbytes = cpu_tensor->nbytes();
+        size_t nbytes = input_tensor->nbytes();
 
         void* static_ptr = nullptr;
         cudaError_t merr = cudaMalloc(&static_ptr, nbytes);
@@ -601,46 +613,49 @@ class ET_EXPERIMENTAL CudaBackend final
 
         cudaMemcpy(
             static_ptr,
-            cpu_tensor->const_data_ptr(),
+            input_tensor->const_data_ptr(),
             nbytes,
             cudaMemcpyHostToDevice);
 
         handle->cuda_graph_state.static_input_ptrs.push_back(static_ptr);
         handle->cuda_graph_state.static_input_nbytes.push_back(nbytes);
 
         gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata(
-            static_ptr, cpu_tensor);
+            static_ptr, input_tensor);
         continue;
       }
 
       // Check if input data is already on GPU (skip-copy optimization for
       // inputs) This can happen when the caller has pre-staged data on GPU
       cudaPointerAttributes attributes{};
-      const void* data_ptr = cpu_tensor->const_data_ptr();
+      const void* data_ptr = input_tensor->const_data_ptr();
       if (data_ptr != nullptr) {
         cudaError_t err = cudaPointerGetAttributes(&attributes, data_ptr);
         if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) {
           // Data is already on GPU - wrap it directly without copy
           gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata(
-              const_cast<void*>(data_ptr), cpu_tensor);
+              const_cast<void*>(data_ptr), input_tensor);
 
           continue;
         }
       }
 
-      // Data is on CPU - use from_etensor to copy to GPU
+      // Data is in CPU memory (legacy path) — copy to GPU via from_etensor.
+      // TODO: Remove this path once all callers use the device tensor pipeline.
       gpu_inputs[i] = new SlimTensor(
-          from_etensor(*cpu_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
+          from_etensor(*input_tensor, CPU_DEVICE, DEFAULT_CUDA_DEVICE));
     }
 
-    // Process output tensors: create GPU SlimTensors for kernel output.
-    // Save pre-run handles to detect orphans after run().
+    // Allocate GPU SlimTensors for kernel outputs. These are always
+    // freshly allocated on GPU regardless of the input memory mode.
+    // Save pre-run handles to detect orphans after run() (the AOTI
+    // runtime may replace output handles with its own allocations).
     std::vector<SlimTensor*> pre_run_outputs(n_outputs, nullptr);
     for (size_t i = 0; i < n_outputs; i++) {
-      auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
-      auto sizes = cpu_output_tensor->sizes();
-      auto strides = cpu_output_tensor->strides();
-      auto scalar_type = cpu_output_tensor->scalar_type();
+      auto* output_tensor = &(args[i + n_inputs]->toTensor());
+      auto sizes = output_tensor->sizes();
+      auto strides = output_tensor->strides();
+      auto scalar_type = output_tensor->scalar_type();
 
       std::vector<int64_t> sizes_vec(sizes.begin(), sizes.end());
       std::vector<int64_t> strides_vec(strides.begin(), strides.end());
@@ -801,13 +816,18 @@ class ET_EXPERIMENTAL CudaBackend final
 
     const bool copy_outputs = !should_skip_copy_for_method(handle->method_name);
 
+    // Output disposition: copy to ETensor backing memory or keep on GPU.
+    // When copy_outputs is true (default), results are copied to the
+    // ETensor's memory (which may be CPU or CUDA planned memory).
+    // When false (skip-copy optimization), the ETensor is rewired to
+    // point at the GPU SlimTensor's memory directly.
     if (copy_outputs) {
       for (size_t i = 0; i < n_outputs; i++) {
-        auto* cpu_output_tensor = &(args[i + n_inputs]->toTensor());
+        auto* output_tensor = &(args[i + n_inputs]->toTensor());
         ET_CHECK_OK_OR_RETURN_ERROR(
             copy_slimtensor_to_etensor_async(
-                gpu_outputs[i], cpu_output_tensor, cuda_stream),
-            "Failed to copy GPU output %zu back to CPU ETensor",
+                gpu_outputs[i], output_tensor, cuda_stream),
+            "Failed to copy GPU output %zu back to ETensor",
             i);
         delete gpu_outputs[i];
         gpu_outputs[i] = nullptr;
diff --git a/backends/cuda/runtime/shims/randint.cu b/backends/cuda/runtime/shims/randint.cu
@@ -0,0 +1,108 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#include <cuda_runtime.h>
+#include <curand.h>
+
+#include <executorch/backends/cuda/runtime/shims/randint.h>
+#include <executorch/runtime/platform/assert.h>
+#include <executorch/runtime/platform/log.h>
+
+#include <cstdint>
+#include <ctime>
+
+namespace executorch::backends::cuda {
+
+using executorch::runtime::Error;
+
+namespace {
+
+// Maps each cuRAND uniform double onto an int64 in [low, low + range).
+//
+// One thread handles one element; threads whose flat index falls at or past
+// `numel` do nothing. Each sample is scaled by `range` and truncated toward
+// zero. Because the generator's interval is (0, 1], the scaled value can land
+// exactly on `range`; that single case is folded down to `range - 1` so the
+// upper bound stays exclusive. No subtraction is applied to the other values.
+__global__ void uniform_to_randint_kernel(
+    int64_t* out,
+    const double* uniform,
+    int64_t numel,
+    int64_t low,
+    int64_t range) {
+  // Widen before multiplying so the flat index cannot wrap in 32 bits.
+  const int64_t i =
+      static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
+  if (i >= numel) {
+    return;
+  }
+
+  int64_t offset = static_cast<int64_t>(uniform[i] * range);
+  if (offset >= range) {
+    // Only reachable when the sample is (numerically) 1.0.
+    offset = range - 1;
+  }
+  out[i] = low + offset;
+}
+
+// Returns the shared cuRAND generator, creating and seeding it on first use.
+//
+// The generator is seeded from the wall clock, so sequences differ between
+// runs. It is cached in a function-local static only after BOTH creation and
+// seeding succeed; on any cuRAND failure the error is logged, the partially
+// initialised generator is destroyed, and nullptr is returned so that a later
+// call can retry instead of reusing a broken handle.
+//
+// No locking is performed around the lazy initialisation.
+curandGenerator_t get_or_create_generator() {
+  static curandGenerator_t gen = nullptr;
+  if (gen != nullptr) {
+    return gen;
+  }
+
+  curandGenerator_t fresh = nullptr;
+  curandStatus_t status =
+      curandCreateGenerator(&fresh, CURAND_RNG_PSEUDO_DEFAULT);
+  if (status != CURAND_STATUS_SUCCESS) {
+    ET_LOG(
+        Error,
+        "get_or_create_generator: curandCreateGenerator failed (%d)",
+        static_cast<int>(status));
+    return nullptr;
+  }
+
+  status = curandSetPseudoRandomGeneratorSeed(
+      fresh, static_cast<unsigned long long>(time(nullptr)));
+  if (status != CURAND_STATUS_SUCCESS) {
+    ET_LOG(
+        Error,
+        "get_or_create_generator: curandSetPseudoRandomGeneratorSeed failed (%d)",
+        static_cast<int>(status));
+    // Do not leak or cache an unseeded generator.
+    curandDestroyGenerator(fresh);
+    return nullptr;
+  }
+
+  gen = fresh;
+  return gen;
+}
+
+} // anonymous namespace
+
+extern "C" {
+
+// Fills `out` with pseudo-random int64 values drawn from [low, high).
+//
+// The element count is the product of the `size_len_` entries of `size`; a
+// product of zero is a successful no-op. Uniform doubles are generated into a
+// temporary device buffer with the shared cuRAND generator and then mapped to
+// integers by uniform_to_randint_kernel.
+//
+// Returns Error::InvalidArgument for a null `out`, a null `size` with a
+// positive `size_len_`, a negative dimension, `high <= low`, a `high - low`
+// that does not fit in int64, or an element count too large for one launch.
+// Returns Error::Internal when a CUDA or cuRAND call fails. The temporary
+// buffer is released on every path after it has been allocated.
+//
+// NOTE(review): `out` is assumed to hold at least `numel` int64 elements; this
+// is not verified against the tensor's own metadata here — confirm callers.
+AOTITorchError aoti_torch_cuda_randint_low_out(
+    SlimTensor* out,
+    int64_t low,
+    int64_t high,
+    const int64_t* size,
+    int64_t size_len_) {
+  ET_CHECK_OR_RETURN_ERROR(
+      out != nullptr,
+      InvalidArgument,
+      "aoti_torch_cuda_randint_low_out: out tensor is null");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      high > low,
+      InvalidArgument,
+      "aoti_torch_cuda_randint_low_out: requires high > low");
+
+  // high - low overflows int64 only when low is negative and high is large.
+  ET_CHECK_OR_RETURN_ERROR(
+      low >= 0 || high <= INT64_MAX + low,
+      InvalidArgument,
+      "aoti_torch_cuda_randint_low_out: high - low overflows int64");
+
+  ET_CHECK_OR_RETURN_ERROR(
+      size != nullptr || size_len_ <= 0,
+      InvalidArgument,
+      "aoti_torch_cuda_randint_low_out: size is null");
+
+  int64_t numel = 1;
+  for (int64_t i = 0; i < size_len_; i++) {
+    ET_CHECK_OR_RETURN_ERROR(
+        size[i] >= 0,
+        InvalidArgument,
+        "aoti_torch_cuda_randint_low_out: negative size at dim %lld",
+        static_cast<long long>(i));
+    numel *= size[i];
+  }
+  if (numel == 0) {
+    return Error::Ok;
+  }
+
+  const int64_t range = high - low;
+  int64_t* out_data = static_cast<int64_t*>(out->data_ptr());
+
+  // Compute the launch shape in 64 bits and validate it before narrowing; the
+  // kernel processes one element per thread, so the grid must cover numel.
+  constexpr int kThreads = 256;
+  const int64_t blocks = (numel + kThreads - 1) / kThreads;
+  ET_CHECK_OR_RETURN_ERROR(
+      blocks <= INT32_MAX,
+      InvalidArgument,
+      "aoti_torch_cuda_randint_low_out: numel too large for a single launch");
+
+  auto gen = get_or_create_generator();
+  ET_CHECK_OR_RETURN_ERROR(
+      gen != nullptr,
+      Internal,
+      "aoti_torch_cuda_randint_low_out: cuRAND generator unavailable");
+
+  // Allocate temporary buffer for uniform doubles on device.
+  double* d_uniform = nullptr;
+  auto alloc_err = cudaMalloc(&d_uniform, numel * sizeof(double));
+  ET_CHECK_OR_RETURN_ERROR(
+      alloc_err == cudaSuccess,
+      Internal,
+      "aoti_torch_cuda_randint_low_out: cudaMalloc failed (%d)",
+      static_cast<int>(alloc_err));
+
+  // Generate uniform doubles in (0, 1].
+  const curandStatus_t gen_status =
+      curandGenerateUniformDouble(gen, d_uniform, numel);
+  if (gen_status != CURAND_STATUS_SUCCESS) {
+    // Release the scratch buffer before bailing out.
+    cudaFree(d_uniform);
+    ET_LOG(
+        Error,
+        "aoti_torch_cuda_randint_low_out: curandGenerateUniformDouble failed (%d)",
+        static_cast<int>(gen_status));
+    return Error::Internal;
+  }
+
+  // Transform to integers in [low, high).
+  uniform_to_randint_kernel<<<static_cast<int>(blocks), kThreads>>>(
+      out_data, d_uniform, numel, low, range);
+  // Launch-configuration errors are only visible through cudaGetLastError.
+  const cudaError_t launch_err = cudaGetLastError();
+
+  // Always free the scratch buffer, then report whichever step failed.
+  const cudaError_t free_err = cudaFree(d_uniform);
+
+  ET_CHECK_OR_RETURN_ERROR(
+      launch_err == cudaSuccess,
+      Internal,
+      "aoti_torch_cuda_randint_low_out: kernel launch failed (%d)",
+      static_cast<int>(launch_err));
+  ET_CHECK_OR_RETURN_ERROR(
+      free_err == cudaSuccess,
+      Internal,
+      "aoti_torch_cuda_randint_low_out: cudaFree failed (%d)",
+      static_cast<int>(free_err));
+
+  return Error::Ok;
+}
+
+} // extern "C"
+
+} // namespace executorch::backends::cuda
diff --git a/backends/cuda/runtime/shims/randint.h b/backends/cuda/runtime/shims/randint.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under the BSD-style license found in the
+ * LICENSE file in the root directory of this source tree.
+ */
+
+#pragma once
+
+#include <executorch/backends/aoti/common_shims_slim.h>
+#include <executorch/backends/aoti/export.h>
+
+namespace executorch::backends::cuda {
+
+using executorch::backends::aoti::AOTITorchError;
+using SlimTensor = executorch::backends::aoti::slim::SlimTensor;
+
+extern "C" {
+
+/**
+ * Fills a pre-allocated CUDA tensor with random integers in [low, high).
+ *
+ * Used by AOTI-generated code when the model calls torch.randint or ops
+ * that decompose into randint (e.g. torch.rand_like on some dtypes).
+ *
+ * @param out Pre-allocated output tensor on CUDA (must not be null). Its
+ *            data buffer is written as int64_t elements.
+ * @param low Lower bound (inclusive) of the random range.
+ * @param high Upper bound (exclusive) of the random range.
+ * @param size Pointer to array of output dimension sizes.
+ * @param size_len_ Number of dimensions.
+ * @return AOTITorchError error code (Error::Ok on success).
+ */
+AOTI_SHIM_EXPORT AOTITorchError aoti_torch_cuda_randint_low_out(
+    SlimTensor* out,
+    int64_t low,
+    int64_t high,
+    const int64_t* size,
+    int64_t size_len_);
+
+} // extern "C"
+
+} // namespace executorch::backends::cuda
diff --git a/examples/models/qwen3_5_moe/CMakeLists.txt b/examples/models/qwen3_5_moe/CMakeLists.txt
@@ -32,14 +32,8 @@ list(APPEND link_libraries optimized_native_cpu_ops_lib cpublas eigen_blas)
 executorch_target_link_options_shared_lib(optimized_native_cpu_ops_lib)
 
 # Extensions
-list(
-  APPEND
-  link_libraries
-  extension_llm_runner
-  extension_module
-  extension_data_loader
-  extension_tensor
-  extension_flat_tensor
+list(APPEND link_libraries extension_module extension_data_loader
+     extension_tensor extension_flat_tensor
 )
 
 # Backend selection
@@ -48,7 +42,7 @@ if(EXECUTORCH_BUILD_METAL)
   executorch_target_link_options_shared_lib(metal_backend)
 elseif(EXECUTORCH_BUILD_CUDA)
   find_package(CUDAToolkit REQUIRED)
-  list(APPEND link_libraries aoti_cuda_backend)
+  list(APPEND link_libraries aoti_cuda_backend CUDA::cudart)
   executorch_target_link_options_shared_lib(aoti_cuda_backend)
   add_compile_definitions(EXECUTORCH_BUILD_CUDA)
 else()
@@ -60,7 +54,12 @@ endif()
 # Tokenizer
 list(APPEND link_libraries tokenizers::tokenizers)
 
-add_executable(qwen3_5_moe_runner main.cpp)
+add_executable(
+  qwen3_5_moe_runner
+  main.cpp ${EXECUTORCH_ROOT}/runtime/core/device_allocator.cpp
+  ${EXECUTORCH_ROOT}/runtime/core/device_memory_buffer.cpp
+  ${EXECUTORCH_ROOT}/backends/cuda/runtime/cuda_allocator.cpp
+)
 target_include_directories(
   qwen3_5_moe_runner PUBLIC ${_common_include_directories}
 )
diff --git a/examples/models/qwen3_5_moe/export.py b/examples/models/qwen3_5_moe/export.py
diff --git a/examples/models/qwen3_5_moe/main.cpp b/examples/models/qwen3_5_moe/main.cpp
diff --git a/examples/models/qwen3_5_moe/model.py b/examples/models/qwen3_5_moe/model.py

Original file line number	Diff line number	Diff line change
`@@ -107,12 +107,13 @@ set(_aoti_cuda_shim_sources runtime/cuda_allocator.cpp runtime/shims/memory.cpp`
`107`	`107`	`runtime/shims/cuda_guard.cpp`
`108`	`108`	`)`
`109`	`109`
`110`		`-# Only build CUDA shims when CUDA language/toolchain is available.`
	`110`	`+# Only build CUDA-specific shims when CUDA language/toolchain is available.`
`111`	`111`	`if(CMAKE_CUDA_COMPILER)`
`112`	`112`	`list(APPEND _aoti_cuda_shim_sources runtime/shims/int4mm.cu`
`113`	`113`	`runtime/shims/int4_plain_mm.cu runtime/shims/sort.cu`
`114`	`114`	`runtime/shims/rand.cu`
`115`	`115`	`)`
	`116`	`+ list(APPEND _aoti_cuda_shim_sources runtime/shims/randint.cu)`
`116`	`117`	`endif()`
`117`	`118`
`118`	`119`	`add_library(aoti_cuda_shims SHARED ${_aoti_cuda_shim_sources})`