Skip to content

Commit b68f982

Browse files
authored
[ET Device Support] Module: allocate device memory for planned buffers
Differential Revision: D97850705 Pull Request resolved: #18476
1 parent cbab086 commit b68f982

7 files changed

Lines changed: 328 additions & 3 deletions

File tree

extension/module/module.cpp

Lines changed: 76 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
#include <executorch/extension/flat_tensor/flat_tensor_data_map.h>
1414
#include <executorch/extension/memory_allocator/malloc_memory_allocator.h>
1515
#include <executorch/extension/named_data_map/merged_data_map.h>
16+
#include <executorch/runtime/core/device_memory_buffer.h>
1617
#include <executorch/runtime/platform/runtime.h>
1718

1819
namespace executorch {
@@ -367,6 +368,51 @@ Module::make_planned_memory_with_shared_arenas(
367368
return planned;
368369
}
369370

371+
std::unique_ptr<Module::PlannedMemory> Module::make_planned_memory_with_devices(
372+
const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta) {
373+
auto planned = std::make_unique<PlannedMemory>();
374+
const size_t num_buffers = method_meta.num_memory_planned_buffers();
375+
planned->planned_buffers.reserve(num_buffers);
376+
planned->planned_spans.reserve(num_buffers);
377+
planned->device_buffers.reserve(num_buffers);
378+
planned->planned_devices.reserve(num_buffers);
379+
380+
for (size_t i = 0; i < num_buffers; ++i) {
381+
auto size = method_meta.memory_planned_buffer_size(i);
382+
ET_CHECK_MSG(size.ok(), "Failed to get buffer size for index %zu", i);
383+
auto device = method_meta.memory_planned_buffer_device(i);
384+
ET_CHECK_MSG(device.ok(), "Failed to get buffer device for index %zu", i);
385+
planned->planned_devices.push_back(device.get());
386+
387+
if (device->is_cpu()) {
388+
planned->planned_buffers.emplace_back(size.get());
389+
planned->planned_spans.emplace_back(
390+
planned->planned_buffers.back().data(), size.get());
391+
} else {
392+
// Allocate device memory via DeviceAllocator and store the RAII buffer.
393+
planned->planned_buffers.emplace_back(); // empty CPU placeholder
394+
auto dmb = runtime::DeviceMemoryBuffer::create(
395+
size.get(), device->type(), device->index());
396+
ET_CHECK_MSG(
397+
dmb.ok(),
398+
"Failed to allocate device memory for buffer %zu (device_type=%d)",
399+
i,
400+
static_cast<int>(device->type()));
401+
planned->planned_spans.emplace_back(dmb->as_span());
402+
planned->device_buffers.push_back(std::move(dmb.get()));
403+
}
404+
}
405+
406+
// HierarchicalAllocator owns the per-buffer Device metadata so the
407+
// MemoryManager can later expose it via planned_buffer_devices().
408+
planned->planned_memory = std::make_unique<runtime::HierarchicalAllocator>(
409+
runtime::Span<runtime::Span<uint8_t>>(
410+
planned->planned_spans.data(), planned->planned_spans.size()),
411+
runtime::Span<const runtime::etensor::Device>(
412+
planned->planned_devices.data(), planned->planned_devices.size()));
413+
return planned;
414+
}
415+
370416
runtime::Result<std::vector<size_t>> Module::get_mem_planned_buffer_sizes(
371417
const std::string& method_name) {
372418
auto meta_res = program_->method_meta(method_name.c_str());
@@ -422,10 +468,38 @@ runtime::Error Module::load_method(
422468
MethodHolder method_holder;
423469

424470
if (!planned_memory) {
425-
if (!share_memory_arenas_) {
471+
// Check if any buffers need device memory allocation.
472+
auto meta_res = program_->method_meta(method_name.c_str());
473+
ET_CHECK_OK_OR_RETURN_ERROR(meta_res.error());
474+
auto& meta = meta_res.get();
475+
476+
bool has_device_buffers = false;
477+
for (size_t i = 0; i < meta.num_memory_planned_buffers(); ++i) {
478+
auto dev = meta.memory_planned_buffer_device(i);
479+
if (dev.ok() && !dev->is_cpu()) {
480+
has_device_buffers = true;
481+
break;
482+
}
483+
}
484+
485+
if (has_device_buffers) {
486+
// Device memory with shared arenas is not yet supported.
487+
ET_CHECK_OR_RETURN_ERROR(
488+
!share_memory_arenas_,
489+
NotSupported,
490+
"Device memory buffers are not yet compatible with "
491+
"share_memory_arenas. Please disable share_memory_arenas "
492+
"when using models with device-planned memory.");
493+
494+
// Device-aware path: allocate CPU and device buffers. The device
495+
// span is owned by the HierarchicalAllocator inside PlannedMemory.
496+
method_holder.planned_memory = make_planned_memory_with_devices(meta);
497+
planned_memory = method_holder.planned_memory->planned_memory.get();
498+
} else if (!share_memory_arenas_) {
426499
auto sizes_res = get_mem_planned_buffer_sizes(method_name);
427500
ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
428501
method_holder.planned_memory = make_planned_memory(sizes_res.get());
502+
planned_memory = method_holder.planned_memory->planned_memory.get();
429503
} else {
430504
auto sizes_res = get_mem_planned_buffer_sizes(method_name);
431505
ET_CHECK_OK_OR_RETURN_ERROR(sizes_res.error());
@@ -442,8 +516,8 @@ runtime::Error Module::load_method(
442516
}
443517
method_holder.planned_memory =
444518
make_planned_memory_with_shared_arenas(sizes, shared_arenas_);
519+
planned_memory = method_holder.planned_memory->planned_memory.get();
445520
}
446-
planned_memory = method_holder.planned_memory->planned_memory.get();
447521
}
448522

449523
method_holder.memory_manager = std::make_unique<runtime::MemoryManager>(

extension/module/module.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,8 @@
1818
#include <executorch/runtime/backend/options.h>
1919
#include <executorch/runtime/executor/program.h>
2020

21+
#include <executorch/runtime/core/device_memory_buffer.h>
22+
2123
#ifdef USE_ATEN_LIB
2224
#define ET_MODULE_NAMESPACE module::aten
2325
#else // !USE_ATEN_LIB
@@ -716,13 +718,20 @@ class Module {
716718
/// Bundle of everything that backs one method's planned memory: the host
/// and device storage, the spans/metadata describing it, and the
/// HierarchicalAllocator built on top.
///
/// NOTE: member order matters. C++ destroys members in reverse declaration
/// order, so `planned_memory` (declared last) is destroyed before the
/// vectors it was constructed from.
struct PlannedMemory {
717719
/// Host-side storage, one entry per planned buffer. In
/// make_planned_memory_with_devices() a non-CPU buffer gets an empty
/// placeholder entry here.
std::vector<std::vector<uint8_t>> planned_buffers;
718720
/// One span per planned buffer; this array is what the
/// HierarchicalAllocator is constructed over.
std::vector<runtime::Span<uint8_t>> planned_spans;
721+
/// DeviceMemoryBuffer objects created for non-CPU planned buffers (only
/// populated by make_planned_memory_with_devices()).
std::vector<runtime::DeviceMemoryBuffer> device_buffers;
722+
/// Per-buffer Device (type + index) metadata used by
723+
/// HierarchicalAllocator. Owns the storage backing the device span the
724+
/// allocator references, so it must outlive `planned_memory`.
725+
std::vector<runtime::etensor::Device> planned_devices;
719726
/// Allocator handed to the MemoryManager. Declared last so it is destroyed
/// first.
std::unique_ptr<runtime::HierarchicalAllocator> planned_memory;
720727
};
721728
std::unique_ptr<PlannedMemory> make_planned_memory(
722729
const std::vector<size_t>& buffer_sizes);
723730
std::unique_ptr<PlannedMemory> make_planned_memory_with_shared_arenas(
724731
const std::vector<size_t>& buffer_sizes,
725732
std::vector<std::vector<uint8_t>>& shared_arenas);
733+
std::unique_ptr<PlannedMemory> make_planned_memory_with_devices(
734+
const ET_RUNTIME_NAMESPACE::MethodMeta& method_meta);
726735
runtime::Result<std::vector<size_t>> get_mem_planned_buffer_sizes(
727736
const std::string& method_name);
728737
runtime::Result<std::vector<size_t>> get_max_mem_planned_buffer_sizes();

extension/module/targets.bzl

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@ def define_common_targets():
3030
"//executorch/runtime/backend:backend_options",
3131
"//executorch/runtime/backend:backend_options_map",
3232
"//executorch/runtime/executor:program_no_prim_ops" + aten_suffix,
33+
"//executorch/runtime/core:device_memory_buffer",
3334
],
3435
)
3536

Lines changed: 218 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,218 @@
1+
/*
2+
* Copyright (c) Meta Platforms, Inc. and affiliates.
3+
* All rights reserved.
4+
*
5+
* This source code is licensed under the BSD-style license found in the
6+
* LICENSE file in the root directory of this source tree.
7+
*/
8+
9+
/**
10+
* Tests that Module's device-aware memory allocation path works correctly.
11+
*
12+
* Uses ModuleAddWithDevice.pte which has:
13+
* non_const_buffer_sizes: [0, 48] (1 buffer, index 0 reserved)
14+
* non_const_buffer_device: [{buffer_idx=1, device_type=CUDA, device_index=0}]
15+
*
16+
* Since we don't have a real CUDA backend, we test that:
17+
* 1. CPU-only models load through Module without invoking device allocator
18+
* 2. Device-annotated models trigger DeviceMemoryBuffer::create via a mock
19+
*/
20+
21+
#include <executorch/extension/module/module.h>

#include <cstdlib>
#include <memory>
#include <vector>

#include <gtest/gtest.h>

#include <executorch/runtime/core/device_allocator.h>
#include <executorch/runtime/core/device_memory_buffer.h>
#include <executorch/runtime/platform/runtime.h>
28+
29+
using executorch::extension::Module;
30+
using executorch::runtime::DeviceAllocator;
31+
using executorch::runtime::DeviceMemoryBuffer;
32+
using executorch::runtime::Error;
33+
using executorch::runtime::register_device_allocator;
34+
using executorch::runtime::Result;
35+
using executorch::runtime::etensor::DeviceIndex;
36+
using executorch::runtime::etensor::DeviceType;
37+
38+
namespace {
39+
40+
class MockCudaAllocator : public DeviceAllocator {
41+
public:
42+
Result<void*> allocate(
43+
size_t nbytes,
44+
DeviceIndex index,
45+
size_t alignment = kDefaultAlignment) override {
46+
(void)alignment;
47+
allocate_count_++;
48+
last_allocate_size_ = nbytes;
49+
last_allocate_index_ = index;
50+
buffer_ = std::make_unique<uint8_t[]>(nbytes);
51+
return static_cast<void*>(buffer_.get());
52+
}
53+
54+
void deallocate(void* ptr, DeviceIndex index) override {
55+
deallocate_count_++;
56+
buffer_.reset();
57+
}
58+
59+
Error copy_host_to_device(void*, const void*, size_t, DeviceIndex) override {
60+
return Error::Ok;
61+
}
62+
63+
Error copy_device_to_host(void*, const void*, size_t, DeviceIndex) override {
64+
return Error::Ok;
65+
}
66+
67+
DeviceType device_type() const override {
68+
return DeviceType::CUDA;
69+
}
70+
71+
int allocate_count_ = 0;
72+
int deallocate_count_ = 0;
73+
size_t last_allocate_size_ = 0;
74+
DeviceIndex last_allocate_index_ = -1;
75+
76+
private:
77+
std::unique_ptr<uint8_t[]> buffer_;
78+
};
79+
80+
} // namespace
81+
82+
static MockCudaAllocator g_mock_cuda;
83+
84+
class ModuleDeviceMemoryTest : public ::testing::Test {
85+
protected:
86+
static void SetUpTestSuite() {
87+
executorch::runtime::runtime_init();
88+
register_device_allocator(&g_mock_cuda);
89+
}
90+
91+
void SetUp() override {
92+
g_mock_cuda.allocate_count_ = 0;
93+
g_mock_cuda.deallocate_count_ = 0;
94+
g_mock_cuda.last_allocate_size_ = 0;
95+
g_mock_cuda.last_allocate_index_ = -1;
96+
}
97+
};
98+
99+
// Loading a model with no device-annotated buffers must never reach the
// registered device allocator.
TEST_F(ModuleDeviceMemoryTest, CpuOnlyModelDoesNotAllocateDeviceMemory) {
  const char* model_path = std::getenv("ET_MODULE_ADD_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_PATH not set";

  Module cpu_module(model_path);
  const auto load_status = cpu_module.load_method("forward");
  ASSERT_EQ(load_status, Error::Ok);

  EXPECT_EQ(g_mock_cuda.allocate_count_, 0)
      << "CPU-only model should not allocate device memory";
}
110+
111+
// Exercises DeviceMemoryBuffer::create directly against the registered mock:
// creation must hit allocate() once, and destroying the buffer must hit
// deallocate() once.
TEST_F(ModuleDeviceMemoryTest, DeviceMemoryBufferCreateCallsAllocator) {
  {
    auto created = DeviceMemoryBuffer::create(48, DeviceType::CUDA, 0);
    ASSERT_TRUE(created.ok());
    auto device_buffer = std::move(created.get());

    // The allocator saw exactly the request we made.
    EXPECT_EQ(g_mock_cuda.allocate_count_, 1);
    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48);
    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0);
    EXPECT_NE(device_buffer.data(), nullptr);
    EXPECT_EQ(device_buffer.size(), 48);

    // as_span() must view the same pointer/size; this is the form handed to
    // HierarchicalAllocator.
    auto view = device_buffer.as_span();
    EXPECT_EQ(view.data(), static_cast<uint8_t*>(device_buffer.data()));
    EXPECT_EQ(view.size(), 48);

    // Still alive, so nothing has been released yet.
    EXPECT_EQ(g_mock_cuda.deallocate_count_, 0);
  }
  // Leaving the scope destroyed the buffer, which must release the memory.
  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1);
}
135+
136+
// Checks, via MethodMeta only (the method itself is not loaded), that the
// device-annotated model reports a single 48-byte planned buffer on cuda:0.
TEST_F(ModuleDeviceMemoryTest, DeviceModelMethodMetaReportsCudaBuffer) {
  const char* model_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";

  Module device_module(model_path);
  const auto load_status = device_module.load();
  ASSERT_EQ(load_status, Error::Ok);

  auto forward_meta = device_module.method_meta("forward");
  ASSERT_TRUE(forward_meta.ok());

  // Exactly one planned buffer is expected for ModuleAddWithDevice.
  ASSERT_EQ(forward_meta->num_memory_planned_buffers(), 1);

  auto buffer_size = forward_meta->memory_planned_buffer_size(0);
  ASSERT_TRUE(buffer_size.ok());
  EXPECT_EQ(buffer_size.get(), 48);

  auto buffer_device = forward_meta->memory_planned_buffer_device(0);
  ASSERT_TRUE(buffer_device.ok());
  EXPECT_EQ(buffer_device->type(), DeviceType::CUDA);
  EXPECT_EQ(buffer_device->index(), 0);
}
161+
162+
// A device-annotated model combined with share_memory_arenas=true is an
// unsupported configuration; load_method must report NotSupported.
TEST_F(ModuleDeviceMemoryTest, DeviceModelWithSharedArenasReturnsNotSupported) {
  const char* model_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";

  Module shared_arena_module(
      model_path,
      Module::LoadMode::File,
      /*event_tracer=*/nullptr,
      /*memory_allocator=*/nullptr,
      /*temp_allocator=*/nullptr,
      /*share_memory_arenas=*/true);

  const auto load_status = shared_arena_module.load_method("forward");
  EXPECT_EQ(load_status, Error::NotSupported);
}
178+
179+
// End-to-end check through Module::load_method: the CUDA-planned buffer must
// be requested from the device allocator exactly once, and it must have been
// released by the time the Module is gone.
TEST_F(
    ModuleDeviceMemoryTest,
    LoadMethodAllocatesDeviceMemoryAndDeallocatesOnDestroy) {
  const char* model_path = std::getenv("ET_MODULE_ADD_WITH_DEVICE_PATH");
  ASSERT_NE(model_path, nullptr) << "ET_MODULE_ADD_WITH_DEVICE_PATH not set";

  {
    Module device_module(model_path);
    const auto load_status = device_module.load_method("forward");

    // The assertions below hold whether or not load_method succeeded: the
    // test expects planned-memory allocation to happen before backend init,
    // so a later backend failure does not change the allocation count.
    EXPECT_EQ(g_mock_cuda.allocate_count_, 1)
        << "Expected 1 device allocation for the CUDA buffer"
        << " (actual: " << g_mock_cuda.allocate_count_ << ")"
        << ", deallocate_count=" << g_mock_cuda.deallocate_count_
        << ", load_method returned error=" << static_cast<int>(load_status);
    EXPECT_EQ(g_mock_cuda.last_allocate_size_, 48)
        << "Expected 48 bytes allocated (3 CUDA tensors sharing one buffer)";
    EXPECT_EQ(g_mock_cuda.last_allocate_index_, 0)
        << "Expected device_index=0 (cuda:0)";

    if (load_status != Error::Ok) {
      // Failure: the planned memory built inside load_method was destroyed
      // when it returned, so the buffer has already been released.
      EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
          << "RAII deallocation on error path";
    } else {
      // Success: the loaded method (and its device buffer) is kept by the
      // still-living Module, so nothing has been released.
      EXPECT_EQ(g_mock_cuda.deallocate_count_, 0)
          << "No deallocation while method is loaded";
    }
  }

  // The Module is out of scope; either way the buffer must be freed by now.
  EXPECT_EQ(g_mock_cuda.deallocate_count_, 1)
      << "Expected deallocation after Module destroyed";
}

0 commit comments

Comments
 (0)