Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions common/arg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2249,6 +2249,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
params.use_hugepages = true;
}
).set_env("LLAMA_ARG_HUGEPAGES"));
add_opt(common_arg(
{"--gpu-host-import"},
{"--no-gpu-host-import"},
"import model weights to the GPU via host-pointer aliasing (buffer_from_host_ptr) in lieu of separate device buffer.\n"
"requires backend capability (e.g., HIP on integrated APUs). automatically enabled with --hugepages and disabled otherwise",
[](common_params & params, bool value) {
params.gpu_host_import = value ? LLAMA_GPU_HOST_IMPORT_ON : LLAMA_GPU_HOST_IMPORT_OFF;
}
).set_env("LLAMA_ARG_GPU_HOST_IMPORT"));
add_opt(common_arg(
{"-dio", "--direct-io"},
{"-ndio", "--no-direct-io"},
Expand Down
1 change: 1 addition & 0 deletions common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1432,6 +1432,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
mparams.check_tensors = params.check_tensors;
mparams.use_extra_bufts = !params.no_extra_bufts;
mparams.no_host = params.no_host;
mparams.gpu_host_import = params.gpu_host_import;

if (params.kv_overrides.empty()) {
mparams.kv_overrides = NULL;
Expand Down
2 changes: 2 additions & 0 deletions common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -547,6 +547,8 @@ struct common_params {

bool single_turn = false; // single turn chat conversation

llama_gpu_host_import_mode gpu_host_import = LLAMA_GPU_HOST_IMPORT_AUTO; // bfhp defaults to auto

ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V

Expand Down
96 changes: 41 additions & 55 deletions ggml/src/ggml-cuda/ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -633,25 +633,11 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
const size_t original_size = ggml_nbytes(tensor);
const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);

if (padded_size > original_size) {
if (ctx->owned) {
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
} else {
// Externally-owned buffer (buffer_from_host_ptr): the GPU-side
// cudaMemset is unusable here because hipMemset through a
// hipHostGetDevicePointer-derived address is unsupported on
// GFX1151 / ROCm 7.2.0 (Mapped regions are effectively
// read-only from the GPU side). Zero the padding via the
// host-side mapping instead — the caller's host buffer must
// be writable through the end of all init_tensor calls. The
// hugepages loader satisfies this by mapping PROT_READ|
// PROT_WRITE during load and downgrading to PROT_READ after
// load_all_data completes.
const size_t pad_offset = (size_t)((const char *)tensor->data + original_size
- (const char *)ctx->dev_ptr);
memset((uint8_t *)ctx->host_ptr + pad_offset, 0, padded_size - original_size);
}
// As some backend(s) cannot safely memset via device alias (e.g., GFX1151 / ROCm 7.2.0), padding for extern buffer is zero-filled
// in the loader. Mapped regions are effectively readonly from the GPU, and host buffer protection may prevent writes.
if (padded_size > original_size && ctx->owned) {
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
}
}
return GGML_STATUS_SUCCESS;
Expand Down Expand Up @@ -749,20 +735,37 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
/* .reset = */ NULL,
};

// Buffer interface for externally-owned (buffer_from_host_ptr) buffers.
// The underlying host memory is read-only (mmap'd PROT_READ on Linux, and
// hipHostRegister+Mapped regions are effectively read-only from the GPU
// side on GFX1151 / ROCm 7.2.0 regardless of flags). Write ops are NULL'd
// to enforce read-only semantics at the type level.
static void ggml_backend_cuda_imported_buffer_set_tensor(
ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {
GGML_ABORT("ggml-cuda: imported buffer is read-only (set_tensor not supported)");
}

static void ggml_backend_cuda_imported_buffer_memset_tensor(
ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {
GGML_ABORT("ggml-cuda: imported buffer is read-only (memset_tensor not supported)");
}

static bool ggml_backend_cuda_imported_buffer_cpy_tensor(
ggml_backend_buffer_t, const ggml_tensor *, ggml_tensor *) {
GGML_ABORT("ggml-cuda: imported buffer is read-only (cpy_tensor not supported)");
}

// clear hook for imported (buffer_from_host_ptr) buffers.
// Never writes anything: every call aborts, because imported buffers are treated as read-only.
// The message names the operation as "clear" so it matches the interface slot this function fills.
static void ggml_backend_cuda_imported_buffer_clear(
        ggml_backend_buffer_t, uint8_t) {
    GGML_ABORT("ggml-cuda: imported buffer is read-only (clear not supported)");
}

// Buffer interface for external (buffer_from_host_ptr) buffers. Host memory is read-only (mmap'd PROT_READ on Linux), and hipHostRegister+Mapped
// regions are effectively read-only from some GPU(s) (e.g., GFX1151 / ROCm 7.2.0) regardless of flags. Writes trigger an abort to enforce semantics.
static const ggml_backend_buffer_i ggml_backend_cuda_imported_buffer_interface = {
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
/* .memset_tensor = */ NULL,
/* .set_tensor = */ NULL,
/* .memset_tensor = */ ggml_backend_cuda_imported_buffer_memset_tensor,
/* .set_tensor = */ ggml_backend_cuda_imported_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
/* .cpy_tensor = */ NULL,
/* .clear = */ NULL,
/* .cpy_tensor = */ ggml_backend_cuda_imported_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cuda_imported_buffer_clear,
/* .reset = */ NULL,
};

Expand Down Expand Up @@ -4744,13 +4747,8 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
bool events = true;
#endif

// buffer_from_host_ptr is currently enabled only on HIP integrated GPUs
// (validated on Strix Halo / ROCm 7.2.0). NVIDIA Jetson reports
// prop.integrated == 1 too and may benefit from the same path, but it
// has not been tested on that platform; #15034's cuda_host-buffer
// corruption is in a different code path (see ggml-cuda.cu:243) and
// is not expected to apply here, but validation is required before
// extending beyond HIP.
// buffer_from_host_ptr is currently enabled only on HIP integrated GPUs (validated on Strix Halo). NVIDIA Jetson may benefit from this
// path but has not been tested; its host-buffer corruption is separate, but validation is required before extending beyond HIP.
bool buffer_from_host_ptr = false;
#if defined(GGML_USE_HIP)
cudaDeviceProp prop;
Expand Down Expand Up @@ -4783,18 +4781,10 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
}

#if defined(GGML_USE_HIP)
// Import a host-allocated memory region as a GPU-accessible buffer via
// cudaHostRegister + cudaHostGetDevicePointer. HIP-only for now; only
// validated on Strix Halo / ROCm 7.2.0. See the capability flag in
// get_props() for context. TODO: extend to CUDA / Jetson after validating
// that #15034's corruption mode does not apply to this code path.
// Import host-allocated memory region as GPU-accessible buffer via cudaHostRegister + cudaHostGetDevicePointer. Only validated on
// HIP / ROCm 7.2.0; see get_props() for context. TODO: extend to CUDA / Jetson after validating corruption mode does not apply here.
//
// PRECONDITION: the caller's host buffer must be writable through the
// end of all init_tensor calls. The buffer's quantized-tensor padding
// is zeroed via host-side memset during init_tensor (GPU-side memset
// is unsupported on GFX1151 / ROCm 7.2.0 for Mapped regions). The
// hugepages loader satisfies this by mapping PROT_READ|PROT_WRITE
// during load and downgrading to PROT_READ after load_all_data.
// The loader ensures quantized-tensor padding bytes are zeroed in the host buffer before calling; already handled for hugepages and file mmap.
static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(
ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
GGML_UNUSED(max_tensor_size);
Expand All @@ -4808,15 +4798,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(
return nullptr;
}

// ReadOnly is intentionally not set: empirically, hipHostRegisterReadOnly
// on GFX1151 / ROCm 7.2.0 incurs a ~14% TG regression vs Portable|Mapped
// alone (measured on Qwen3-30B-A3B Q4_K_M, 2026-04-25). The host-side
// mmap is PROT_READ after the loader finalizes; the imported buffer
// interface enforces read-only access at the type level by NULLing
// write ops, so ReadOnly adds no contract enforcement we don't
// already have. Its only observable effect is the regression.
cudaError_t err = cudaHostRegister(ptr, size,
cudaHostRegisterPortable | cudaHostRegisterMapped);
// ReadOnly is intentionally absent: empirically, hipHostRegisterReadOnly incurs a performance penalty vs Portable|Mapped alone on some
// platforms (e.g., on GFX1151 / ROCm 7.2.0 the penalty is ~14% of TG on Qwen3 30B Q4_K_M). Host-side mmap is PROT_READ after the loader
// finalizes; the imported buffer interface enforces the read-only contract at the type level by aborting on any call to a write op.
cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterPortable | cudaHostRegisterMapped);

if (err != cudaSuccess) {
(void)cudaGetLastError();
GGML_LOG_ERROR("%s: cudaHostRegister failed: %s\n", __func__, cudaGetErrorString(err));
Expand Down
10 changes: 10 additions & 0 deletions include/llama.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,14 @@ extern "C" {
LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported
};

// Support for buffer from host pages; allows passing of pages to GPU on unified memory systems.
// Needed to avoid double memory and copy with some memory modes (e.g., Linux hugepages with preallocation).
// Selects whether the loader imports weights via buffer_from_host_ptr.
enum llama_gpu_host_import_mode {
LLAMA_GPU_HOST_IMPORT_AUTO = 0, // on by default if and only if use_hugepages is active
LLAMA_GPU_HOST_IMPORT_ON = 1, // forced on
LLAMA_GPU_HOST_IMPORT_OFF = 2, // forced off
};

// TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
typedef struct llama_token_data {
llama_token id; // token id
Expand Down Expand Up @@ -320,6 +328,8 @@ extern "C" {
bool no_host; // bypass host buffer allowing extra buffers to be used
bool no_alloc; // only load metadata and simulate memory allocations
bool use_hugepages; // back model memory with anonymous hugetlb pages (Linux only)

enum llama_gpu_host_import_mode gpu_host_import; // whether loader imports weights via buffer_from_host_ptr (default: auto)
};

struct llama_sampler_seq_config {
Expand Down
51 changes: 50 additions & 1 deletion src/llama-model-loader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
#include <future>
#include <regex>

#ifdef __linux__
#if defined(__linux__) || defined(__APPLE__)
#include <sys/mman.h>
#endif

Expand Down Expand Up @@ -1382,6 +1382,55 @@ void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void *
}
}

// Zero-fill the allocation padding that follows each quantized tensor inside the idx-th file mapping.
//
//   idx  - index into `mappings`; out-of-range indices are ignored
//   ctx  - ggml context whose tensors are scanned; only tensors whose weight lives in file `idx` are touched
//   buft - buffer type whose alloc size defines how much padding a tensor needs
//
// For MAP_PRIVATE file mmaps this triggers COW on the touched pages (~one page per padded tensor). Anonymous hugetlb
// mappings are skipped entirely. The mapping is made PROT_READ|PROT_WRITE for the duration of the pass and set to
// PROT_READ on exit. Linux/Apple-only (uses mprotect); no-op on other platforms.
//
// NOTE(review): the padding range [offs + nbytes, offs + alloc_size) is written in place inside the file mapping. This
// assumes those bytes are not the start of the next tensor's data in the file - TODO confirm against the file's tensor
// alignment before relying on this for tensors whose padding exceeds the inter-tensor gap.
void llama_model_loader::zero_padding_in_mapping(uint32_t idx, ggml_context * ctx, ggml_backend_buffer_type_t buft) {
#if defined(__linux__) || defined(__APPLE__)
    if (idx >= mappings.size()) {
        return;
    }
    auto & mapping = mappings.at(idx);
    if (mapping->is_hugetlb()) {
        // Kernel zero-fills anonymous hugetlb mappings, so padding bytes are already zero. Skipping here also avoids
        // leaving the mapping PROT_READ before load_all_data completes; the lockdown happens in load_all_data's
        // cleanup instead.
        return;
    }
    void * base = mapping->addr();
    const size_t mapsize = mapping->mmap_size();

    if (mprotect(base, mapsize, PROT_READ | PROT_WRITE) != 0) {
        LLAMA_LOG_WARN("%s: mprotect(RW) failed: %s; skipping pre-bfhp zero-pass\n", __func__, strerror(errno));
        return;
    }

    for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
        const auto * weight = get_weight(ggml_get_name(tensor));
        if (!weight || weight->idx != idx) {
            continue;
        }
        if (!ggml_is_quantized(tensor->type)) {
            continue;
        }
        if (tensor->view_src != nullptr) {
            continue;
        }
        const size_t orig_size   = ggml_nbytes(tensor);
        const size_t padded_size = ggml_backend_buft_get_alloc_size(buft, tensor);
        if (padded_size <= orig_size) {
            continue;
        }

        // Only the range [base, base + mapsize) was made writable above. Clamp the padding range to it so that a
        // padded tensor at the tail of the mapping cannot cause a write past the end of the mapped region.
        const size_t pad_begin = weight->offs + orig_size;
        if (pad_begin >= mapsize) {
            continue;
        }
        const size_t pad_want = padded_size - orig_size;
        const size_t pad_room = mapsize - pad_begin;
        const size_t pad_len  = pad_want < pad_room ? pad_want : pad_room;

        memset((char *) base + pad_begin, 0, pad_len);
    }

    if (mprotect(base, mapsize, PROT_READ) != 0) {
        LLAMA_LOG_WARN("%s: mprotect(RO) restore failed: %s\n", __func__, strerror(errno));
    }
#else
    GGML_UNUSED(idx);
    GGML_UNUSED(ctx);
    GGML_UNUSED(buft);
#endif
}

void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
const auto & w = require_weight(ggml_get_name(cur));

Expand Down
3 changes: 3 additions & 0 deletions src/llama-model-loader.h
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,9 @@ struct llama_model_loader {

void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;

// Ensure quantized-tensor padding bytes are zero in the mapped region of the idx-th file.
void zero_padding_in_mapping(uint32_t idx, struct ggml_context * ctx, ggml_backend_buffer_type_t buft);

// for backwards compatibility, does not support ggml-backend
void load_data_for(struct ggml_tensor * cur) const;

Expand Down
15 changes: 14 additions & 1 deletion src/llama-model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7797,9 +7797,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
ggml_backend_dev_get_props(dev, &props);
bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
bool gpu_host_import_enabled;

// Resolve bfhp state. AUTO --> match hugepages to avoid unnecessary copying and memory consumption.
// ON/OFF are overrides from user flags.
switch (params.gpu_host_import) {
case LLAMA_GPU_HOST_IMPORT_ON: gpu_host_import_enabled = true; break;
case LLAMA_GPU_HOST_IMPORT_OFF: gpu_host_import_enabled = false; break;
case LLAMA_GPU_HOST_IMPORT_AUTO:
default: gpu_host_import_enabled = params.use_hugepages; break;
}

std::vector<ggml_backend_buffer_ptr> bufs;
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft && gpu_host_import_enabled) {
GGML_ASSERT(!ml.no_alloc);
for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
// only the mmap region containing the tensors in the model is mapped to the backend buffer
Expand All @@ -7812,6 +7822,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
if (first >= last) {
continue;
}
ml.zero_padding_in_mapping(idx, ctx, buft); // Zero out padding before importing host buffer (invariant for all backends)

const size_t max_size = ggml_get_max_tensor_size(ctx);
ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
if (buf == nullptr) {
Expand Down Expand Up @@ -8955,6 +8967,7 @@ llama_model_params llama_model_default_params() {
/*.no_host =*/ false,
/*.no_alloc =*/ false,
/*.use_hugepages =*/ false,
/*.gpu_host_import =*/ LLAMA_GPU_HOST_IMPORT_AUTO,
};

return result;
Expand Down