Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions ggml/src/ggml-sycl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,17 @@ endif()

target_compile_options(ggml-sycl PRIVATE "-Wno-narrowing")

# Link against Level Zero loader for direct device memory allocation.
# Avoids sycl::malloc_device triggering DMA-buf/TTM system RAM staging
# in the xe kernel driver during multi-GPU inference.
find_library(ZE_LOADER_LIB ze_loader HINTS ${ONEAPI_ROOT}/lib ENV LD_LIBRARY_PATH)
if(ZE_LOADER_LIB)
target_link_libraries(ggml-sycl PRIVATE ${ZE_LOADER_LIB})
message(STATUS "Level Zero loader found: ${ZE_LOADER_LIB}")
else()
message(WARNING "Level Zero loader (ze_loader) not found, multi-GPU may use excessive system RAM")
endif()

# Link against oneDNN
set(GGML_SYCL_DNNL 0)
if(GGML_SYCL_DNN)
Expand Down
16 changes: 14 additions & 2 deletions ggml/src/ggml-sycl/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
//

#include "common.hpp"
#include <sycl/backend.hpp>
#include <level_zero/ze_api.h>

#include "ggml-backend-impl.h"
#include "ggml-impl.h"
Expand Down Expand Up @@ -75,8 +77,18 @@ void release_extra_gpu(ggml_tensor_extra_gpu * extra, std::vector<queue_ptr> str
}
if (extra->data_device[i] != nullptr && streams.size()>0) {
ggml_sycl_set_device(i);
SYCL_CHECK(
CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
bool freed = false;
try {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(
streams[i]->get_context());
if (zeMemFree(ze_ctx, extra->data_device[i]) == ZE_RESULT_SUCCESS) {
freed = true;
}
} catch (...) {}
if (!freed) {
SYCL_CHECK(
CHECK_TRY_ERROR(sycl::free(extra->data_device[i], *(streams[i]))));
}
}
}
delete extra;
Expand Down
10 changes: 10 additions & 0 deletions ggml/src/ggml-sycl/dpct/helper.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@

#include <sycl/sycl.hpp>
#include <sycl/half_type.hpp>
#include <sycl/backend.hpp>
#include <level_zero/ze_api.h>
#include <oneapi/mkl.hpp>

#include <map>
Expand Down Expand Up @@ -1307,6 +1309,14 @@ namespace dpct

static inline void *dpct_malloc(size_t size, sycl::queue &q)
{
try {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
ze_device_mem_alloc_desc_t desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0};
void *ptr = nullptr;
if (zeMemAllocDevice(ze_ctx, &desc, size, 64, ze_dev, &ptr) == ZE_RESULT_SUCCESS && ptr)
return ptr;
} catch (...) {}
return sycl::malloc_device(size, q.get_device(), q.get_context());
}

Expand Down
73 changes: 54 additions & 19 deletions ggml/src/ggml-sycl/ggml-sycl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@
#include <regex>

#include <sycl/sycl.hpp>
#include <sycl/backend.hpp>
#include <level_zero/ze_api.h>
#if defined(GGML_SYCL_GRAPH) && SYCL_EXT_ONEAPI_ASYNC_MEMORY_ALLOC
# include <sycl/ext/oneapi/experimental/async_alloc/async_alloc.hpp>
#endif
Expand Down Expand Up @@ -345,6 +347,10 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

// Forward declarations for Level Zero allocation helpers (defined after this struct)
static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q);
static void ggml_sycl_free_device(void *ptr, sycl::queue &q);

// sycl buffer

struct ggml_backend_sycl_buffer_context {
Expand All @@ -365,7 +371,7 @@ struct ggml_backend_sycl_buffer_context {
~ggml_backend_sycl_buffer_context() {
if (dev_ptr != nullptr) {
ggml_sycl_set_device(device);
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(dev_ptr, *stream)));
ggml_sycl_free_device(dev_ptr, *stream);
}

//release extra used by tensors
Expand Down Expand Up @@ -487,8 +493,50 @@ catch (sycl::exception const &exc) {
std::exit(1);
}

// Use Level Zero zeMemAllocDevice to avoid sycl::malloc_device triggering
// DMA-buf/TTM system RAM staging in the xe kernel driver.
// sycl::malloc_device creates a 1:1 host memory mirror of every VRAM allocation
// via xe_gem_prime_export, consuming system RAM equal to VRAM allocated.
// zeMemAllocDevice uses the SVM/P2P path with no host staging.
static void * ggml_sycl_malloc_device(size_t size, sycl::queue &q) {
void *ptr = nullptr;
try {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_device());
ze_device_mem_alloc_desc_t alloc_desc = {ZE_STRUCTURE_TYPE_DEVICE_MEM_ALLOC_DESC, nullptr, 0, 0};
ze_result_t r = zeMemAllocDevice(ze_ctx, &alloc_desc, size, 64, ze_dev, &ptr);
if (r == ZE_RESULT_SUCCESS && ptr) {
return ptr;
}
} catch (...) {}
return sycl::malloc_device(size, q);
}

static void ggml_sycl_free_device(void *ptr, sycl::queue &q) {
if (!ptr) return;
try {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q.get_context());
if (zeMemFree(ze_ctx, ptr) == ZE_RESULT_SUCCESS) return;
} catch (...) {}
sycl::free(ptr, q);
}

static void dev2dev_memcpy(sycl::queue &q_dst, sycl::queue &q_src, void *ptr_dst,
const void *ptr_src, size_t size) {
try {
auto ze_ctx = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_context());
auto ze_dev = sycl::get_native<sycl::backend::ext_oneapi_level_zero>(q_dst.get_device());
ze_command_queue_desc_t cq_desc = {ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0,
0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};
ze_command_list_handle_t cl;
ze_result_t r = zeCommandListCreateImmediate(ze_ctx, ze_dev, &cq_desc, &cl);
if (r == ZE_RESULT_SUCCESS) {
zeCommandListAppendMemoryCopy(cl, ptr_dst, ptr_src, size, nullptr, 0, nullptr);
zeCommandListDestroy(cl);
return;
}
} catch (...) {}
// Fallback to host-staged copy
char *host_buf = (char *)malloc(size);
q_src.memcpy(host_buf, (const char *)ptr_src, size).wait();
q_dst.memcpy((char *)ptr_dst, host_buf, size).wait();
Expand Down Expand Up @@ -655,9 +703,7 @@ ggml_backend_sycl_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft,
const queue_ptr stream = buft_ctx->stream;
size = std::max(size, (size_t)1); // syclMalloc returns null for size 0

void * dev_ptr;
SYCL_CHECK(CHECK_TRY_ERROR(dev_ptr = (void *)sycl::malloc_device(
size, *stream)));
void * dev_ptr = ggml_sycl_malloc_device(size, *stream);
if (!dev_ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
return nullptr;
Expand Down Expand Up @@ -898,18 +944,9 @@ ggml_backend_sycl_split_buffer_init_tensor(ggml_backend_buffer_t buffer,
size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
}

// FIXME: do not crash if SYCL Buffer alloc fails
// currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
ggml_sycl_set_device(i);
const queue_ptr stream = ctx->streams[i];
char * buf;
/*
DPCT1009:208: SYCL uses exceptions to report errors and does not use the
error codes. The original code was commented out and a warning string
was inserted. You need to rewrite this code.
*/
SYCL_CHECK(CHECK_TRY_ERROR(buf = (char *)sycl::malloc_device(
size, *stream)));
char * buf = (char *)ggml_sycl_malloc_device(size, *stream);
if (!buf) {
char err_buf[1024];
snprintf(err_buf, 1023, "%s: can't allocate %lu Bytes of memory on device\n", __func__, size);
Expand Down Expand Up @@ -1268,7 +1305,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
for (int i = 0; i < MAX_SYCL_BUFFERS; ++i) {
ggml_sycl_buffer & b = buffer_pool[i];
if (b.ptr != nullptr) {
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(b.ptr, *qptr)));
ggml_sycl_free_device(b.ptr, *qptr);
pool_size -= b.size;
}
}
Expand Down Expand Up @@ -1316,9 +1353,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
void * ptr;
size_t look_ahead_size = (size_t) (1.05 * size);

SYCL_CHECK(
CHECK_TRY_ERROR(ptr = (void *)sycl::malloc_device(
look_ahead_size, *qptr)));
ptr = ggml_sycl_malloc_device(look_ahead_size, *qptr);
if (!ptr) {
GGML_LOG_ERROR("%s: can't allocate %lu Bytes of memory on device/GPU\n", __func__, look_ahead_size);
return nullptr;
Expand Down Expand Up @@ -1346,7 +1381,7 @@ struct ggml_sycl_pool_leg : public ggml_sycl_pool {
}
}
GGML_LOG_WARN("WARNING: sycl buffer pool full, increase MAX_sycl_BUFFERS\n");
SYCL_CHECK(CHECK_TRY_ERROR(sycl::free(ptr, *qptr)));
ggml_sycl_free_device(ptr, *qptr);
pool_size -= size;
}
};
Expand Down
Loading