@@ -469,7 +469,16 @@ class ET_EXPERIMENTAL CudaBackend final
469469 return (DelegateHandle*)handle; // Return the handle post-processing
470470 }
471471
472- // Once per execution
472+ // Execute the AOTI-compiled CUDA kernel for one inference step.
473+ //
474+ // Currently supports both CPU and CUDA memory for IO tensors:
475+ // - Inputs: detected via cudaPointerGetAttributes; CUDA data is wrapped
476+ // in-place (no copy), CPU data is copied to GPU via from_etensor().
477+ // - Outputs: either copied to ETensor's backing memory (CPU or CUDA),
478+ // or the ETensor is rewired to point at GPU memory (skip-copy mode).
479+ //
480+ // TODO: Once the device tensor pipeline is fully adopted, all IO tensors
481+ // will reside in CUDA memory. Remove the CPU fallback paths.
473482 Error execute (
474483 BackendExecutionContext& context,
475484 DelegateHandle* handle_,
@@ -494,14 +503,17 @@ class ET_EXPERIMENTAL CudaBackend final
494503 n_outputs,
495504 args.size ())
496505
497- // Verify device info on all memory-planned, ET-driven IO tensors.
498- // All input and output tensors should have device_type = CUDA, which
499- // is set during serialization by PropagateDevicePass based on the
500- // target_device compile spec from CudaPartitioner.
506+ // Verify device metadata on all IO tensors.
507+ // All tensors should have device_type = CUDA, set during serialization
508+ // by PropagateDevicePass based on the target_device compile spec from
509+ // CudaPartitioner.
501510 //
502- // Note: At this stage, the tensor memory is still on CPU. The device_type
503- // is metadata indicating where the tensor *should* reside. The backend
504- // is responsible for copying data to the actual CUDA device.
511+ // Note: device_type is metadata — the actual memory location may be
512+ // either CPU (legacy path with H2D copy ops) or CUDA (when device
513+ // memory planning is enabled via enable_non_cpu_memory_planning,
514+ // which allocates delegate IO in CUDA memory). The backend detects
515+ // the actual location via cudaPointerGetAttributes and handles both
516+ // cases.
505517 for (size_t i = 0 ; i < n_inputs + n_outputs; i++) {
506518 auto * tensor = &(args[i]->toTensor ());
507519 auto device_type = tensor->unsafeGetTensorImpl ()->device_type ();
@@ -582,13 +594,13 @@ class ET_EXPERIMENTAL CudaBackend final
582594 std::vector<SlimTensor*> gpu_inputs (n_inputs);
583595 std::vector<SlimTensor*> gpu_outputs (n_outputs);
584596
585- // Process input tensors: convert ETensor (CPU) to SlimTensor (GPU)
597+ // Process input tensors: convert ETensor to SlimTensor
586598 for (size_t i = 0 ; i < n_inputs; i++) {
587- auto * cpu_tensor = &(args[i]->toTensor ());
599+ auto * input_tensor = &(args[i]->toTensor ());
588600
589601 // CAPTURE step: allocate persistent static GPU buffers
590602 if (is_capture_step) {
591- size_t nbytes = cpu_tensor ->nbytes ();
603+ size_t nbytes = input_tensor ->nbytes ();
592604
593605 void * static_ptr = nullptr ;
594606 cudaError_t merr = cudaMalloc (&static_ptr, nbytes);
@@ -601,46 +613,49 @@ class ET_EXPERIMENTAL CudaBackend final
601613
602614 cudaMemcpy (
603615 static_ptr,
604- cpu_tensor ->const_data_ptr (),
616+ input_tensor ->const_data_ptr (),
605617 nbytes,
606618 cudaMemcpyHostToDevice);
607619
608620 handle->cuda_graph_state .static_input_ptrs .push_back (static_ptr);
609621 handle->cuda_graph_state .static_input_nbytes .push_back (nbytes);
610622
611623 gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata (
612- static_ptr, cpu_tensor );
624+ static_ptr, input_tensor );
613625 continue ;
614626 }
615627
616628 // Check if input data is already on GPU (skip-copy optimization for
617629 // inputs). This can happen when the caller has pre-staged data on GPU.
618630 cudaPointerAttributes attributes{};
619- const void * data_ptr = cpu_tensor ->const_data_ptr ();
631+ const void * data_ptr = input_tensor ->const_data_ptr ();
620632 if (data_ptr != nullptr ) {
621633 cudaError_t err = cudaPointerGetAttributes (&attributes, data_ptr);
622634 if (err == cudaSuccess && attributes.type == cudaMemoryTypeDevice) {
623635 // Data is already on GPU - wrap it directly without copy
624636 gpu_inputs[i] = make_slimtensor_from_blob_with_etensor_metadata (
625- const_cast <void *>(data_ptr), cpu_tensor );
637+ const_cast <void *>(data_ptr), input_tensor );
626638
627639 continue ;
628640 }
629641 }
630642
631- // Data is on CPU - use from_etensor to copy to GPU
643+ // Data is in CPU memory (legacy path) — copy to GPU via from_etensor.
644+ // TODO: Remove this path once all callers use the device tensor pipeline.
632645 gpu_inputs[i] = new SlimTensor (
633- from_etensor (*cpu_tensor , CPU_DEVICE , DEFAULT_CUDA_DEVICE ));
646+ from_etensor (*input_tensor , CPU_DEVICE , DEFAULT_CUDA_DEVICE ));
634647 }
635648
636- // Process output tensors: create GPU SlimTensors for kernel output.
637- // Save pre-run handles to detect orphans after run().
649+ // Allocate GPU SlimTensors for kernel outputs. These are always
650+ // freshly allocated on GPU regardless of the input memory mode.
651+ // Save pre-run handles to detect orphans after run() (the AOTI
652+ // runtime may replace output handles with its own allocations).
638653 std::vector<SlimTensor*> pre_run_outputs (n_outputs, nullptr );
639654 for (size_t i = 0 ; i < n_outputs; i++) {
640- auto * cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
641- auto sizes = cpu_output_tensor ->sizes ();
642- auto strides = cpu_output_tensor ->strides ();
643- auto scalar_type = cpu_output_tensor ->scalar_type ();
655+ auto * output_tensor = &(args[i + n_inputs]->toTensor ());
656+ auto sizes = output_tensor ->sizes ();
657+ auto strides = output_tensor ->strides ();
658+ auto scalar_type = output_tensor ->scalar_type ();
644659
645660 std::vector<int64_t > sizes_vec (sizes.begin (), sizes.end ());
646661 std::vector<int64_t > strides_vec (strides.begin (), strides.end ());
@@ -801,13 +816,18 @@ class ET_EXPERIMENTAL CudaBackend final
801816
802817 const bool copy_outputs = !should_skip_copy_for_method (handle->method_name );
803818
819+ // Output disposition: copy to ETensor backing memory or keep on GPU.
820+ // When copy_outputs is true (default), results are copied to the
821+ // ETensor's memory (which may be CPU or CUDA planned memory).
822+ // When false (skip-copy optimization), the ETensor is rewired to
823+ // point at the GPU SlimTensor's memory directly.
804824 if (copy_outputs) {
805825 for (size_t i = 0 ; i < n_outputs; i++) {
806- auto * cpu_output_tensor = &(args[i + n_inputs]->toTensor ());
826+ auto * output_tensor = &(args[i + n_inputs]->toTensor ());
807827 ET_CHECK_OK_OR_RETURN_ERROR (
808828 copy_slimtensor_to_etensor_async (
809- gpu_outputs[i], cpu_output_tensor , cuda_stream),
810- " Failed to copy GPU output %zu back to CPU ETensor" ,
829+ gpu_outputs[i], output_tensor , cuda_stream),
830+ " Failed to copy GPU output %zu back to ETensor" ,
811831 i);
812832 delete gpu_outputs[i];
813833 gpu_outputs[i] = nullptr ;
0 commit comments