diff --git a/common/arg.cpp b/common/arg.cpp index a06edb491..ca0c653b8 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2249,6 +2249,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex params.use_hugepages = true; } ).set_env("LLAMA_ARG_HUGEPAGES")); + add_opt(common_arg( + {"--gpu-host-import"}, + {"--no-gpu-host-import"}, + "import model weights to the GPU via host-pointer aliasing (buffer_from_host_ptr) in lieu of separate device buffer.\n" + "requires backend capability (e.g., HIP on integrated APUs). automatically enabled with --hugepages and disabled otherwise", + [](common_params & params, bool value) { + params.gpu_host_import = value ? LLAMA_GPU_HOST_IMPORT_ON : LLAMA_GPU_HOST_IMPORT_OFF; + } + ).set_env("LLAMA_ARG_GPU_HOST_IMPORT")); add_opt(common_arg( {"-dio", "--direct-io"}, {"-ndio", "--no-direct-io"}, diff --git a/common/common.cpp b/common/common.cpp index 0bdf09d11..6a5d51107 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1432,6 +1432,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) { mparams.check_tensors = params.check_tensors; mparams.use_extra_bufts = !params.no_extra_bufts; mparams.no_host = params.no_host; + mparams.gpu_host_import = params.gpu_host_import; if (params.kv_overrides.empty()) { mparams.kv_overrides = NULL; diff --git a/common/common.h b/common/common.h index e2250ff98..1911b92cf 100644 --- a/common/common.h +++ b/common/common.h @@ -547,6 +547,8 @@ struct common_params { bool single_turn = false; // single turn chat conversation + llama_gpu_host_import_mode gpu_host_import = LLAMA_GPU_HOST_IMPORT_AUTO; // bfhp defaults to auto + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu index c979e3b75..8f9f5222a 100644 --- a/ggml/src/ggml-cuda/ggml-cuda.cu +++ 
b/ggml/src/ggml-cuda/ggml-cuda.cu @@ -633,25 +633,11 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer const size_t original_size = ggml_nbytes(tensor); const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor); - if (padded_size > original_size) { - if (ctx->owned) { - ggml_cuda_set_device(ctx->device); - CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); - } else { - // Externally-owned buffer (buffer_from_host_ptr): the GPU-side - // cudaMemset is unusable here because hipMemset through a - // hipHostGetDevicePointer-derived address is unsupported on - // GFX1151 / ROCm 7.2.0 (Mapped regions are effectively - // read-only from the GPU side). Zero the padding via the - // host-side mapping instead — the caller's host buffer must - // be writable through the end of all init_tensor calls. The - // hugepages loader satisfies this by mapping PROT_READ| - // PROT_WRITE during load and downgrading to PROT_READ after - // load_all_data completes. - const size_t pad_offset = (size_t)((const char *)tensor->data + original_size - - (const char *)ctx->dev_ptr); - memset((uint8_t *)ctx->host_ptr + pad_offset, 0, padded_size - original_size); - } + // As some backend(s) cannot safely memset via device alias (e.g., GFX1151 / ROCm 7.2.0), padding for extern buffer is zero-filled + // in the loader. Mapped regions are effectively readonly from the GPU, and host buffer protection may prevent writes. + if (padded_size > original_size && ctx->owned) { + ggml_cuda_set_device(ctx->device); + CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size)); } } return GGML_STATUS_SUCCESS; @@ -749,20 +735,37 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = { /* .reset = */ NULL, }; -// Buffer interface for externally-owned (buffer_from_host_ptr) buffers. 
-// The underlying host memory is read-only (mmap'd PROT_READ on Linux, and -// hipHostRegister+Mapped regions are effectively read-only from the GPU -// side on GFX1151 / ROCm 7.2.0 regardless of flags). Write ops are NULL'd -// to enforce read-only semantics at the type level. +static void ggml_backend_cuda_imported_buffer_set_tensor( + ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) { + GGML_ABORT("ggml-cuda: imported buffer is read-only (set_tensor not supported)"); +} + +static void ggml_backend_cuda_imported_buffer_memset_tensor( + ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) { + GGML_ABORT("ggml-cuda: imported buffer is read-only (memset_tensor not supported)"); +} + +static bool ggml_backend_cuda_imported_buffer_cpy_tensor( + ggml_backend_buffer_t, const ggml_tensor *, ggml_tensor *) { + GGML_ABORT("ggml-cuda: imported buffer is read-only (cpy_tensor not supported)"); +} + +static void ggml_backend_cuda_imported_buffer_clear( + ggml_backend_buffer_t, uint8_t) { + GGML_ABORT("ggml-cuda: imported buffer is read-only (clear_tensor not supported)"); +} + +// Buffer interface for external (buffer_from_host_ptr) buffers. Host memory is read-only (mmap'd PROT_READ on Linux), and hipHostRegister+Mapped +// regions are effectively read-only from some GPU(s) (e.g., GFX1151 / ROCm 7.2.0) regardless of flags. Writes trigger an abort to enforce semantics. 
static const ggml_backend_buffer_i ggml_backend_cuda_imported_buffer_interface = { /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer, /* .get_base = */ ggml_backend_cuda_buffer_get_base, /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor, - /* .memset_tensor = */ NULL, - /* .set_tensor = */ NULL, + /* .memset_tensor = */ ggml_backend_cuda_imported_buffer_memset_tensor, + /* .set_tensor = */ ggml_backend_cuda_imported_buffer_set_tensor, /* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor, - /* .cpy_tensor = */ NULL, - /* .clear = */ NULL, + /* .cpy_tensor = */ ggml_backend_cuda_imported_buffer_cpy_tensor, + /* .clear = */ ggml_backend_cuda_imported_buffer_clear, /* .reset = */ NULL, }; @@ -4744,13 +4747,8 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back bool events = true; #endif - // buffer_from_host_ptr is currently enabled only on HIP integrated GPUs - // (validated on Strix Halo / ROCm 7.2.0). NVIDIA Jetson reports - // prop.integrated == 1 too and may benefit from the same path, but it - // has not been tested on that platform; #15034's cuda_host-buffer - // corruption is in a different code path (see ggml-cuda.cu:243) and - // is not expected to apply here, but validation is required before - // extending beyond HIP. + // buffer_from_host_ptr is currently enabled only on HIP integrated GPUs (validated on Strix Halo). NVIDIA Jetson may benefit from this + // path but has not been tested; its host-buffer corruption is separate, but validation is required before extending beyond HIP. bool buffer_from_host_ptr = false; #if defined(GGML_USE_HIP) cudaDeviceProp prop; @@ -4783,18 +4781,10 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type( } #if defined(GGML_USE_HIP) -// Import a host-allocated memory region as a GPU-accessible buffer via -// cudaHostRegister + cudaHostGetDevicePointer. HIP-only for now; only -// validated on Strix Halo / ROCm 7.2.0. 
See the capability flag in -// get_props() for context. TODO: extend to CUDA / Jetson after validating -// that #15034's corruption mode does not apply to this code path. +// Import host-allocated memory region as GPU-accessible buffer via cudaHostRegister + cudaHostGetDevicePointer. Only validated on +// HIP / ROCm 7.2.0; see get_props() for context. TODO: extend to CUDA / Jetson after validating corruption mode does not apply here. // -// PRECONDITION: the caller's host buffer must be writable through the -// end of all init_tensor calls. The buffer's quantized-tensor padding -// is zeroed via host-side memset during init_tensor (GPU-side memset -// is unsupported on GFX1151 / ROCm 7.2.0 for Mapped regions). The -// hugepages loader satisfies this by mapping PROT_READ|PROT_WRITE -// during load and downgrading to PROT_READ after load_all_data. +// Loader ensures quantized-tensor padding bytes are zeroed in host buffer before calling; already handled for hugepages and file mmap. static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr( ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) { GGML_UNUSED(max_tensor_size); @@ -4808,15 +4798,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr( return nullptr; } - // ReadOnly is intentionally not set: empirically, hipHostRegisterReadOnly - // on GFX1151 / ROCm 7.2.0 incurs a ~14% TG regression vs Portable|Mapped - // alone (measured on Qwen3-30B-A3B Q4_K_M, 2026-04-25). The host-side - // mmap is PROT_READ after the loader finalizes; the imported buffer - // interface enforces read-only access at the type level by NULLing - // write ops, so ReadOnly adds no contract enforcement we don't - // already have. Its only observable effect is the regression.
- cudaError_t err = cudaHostRegister(ptr, size, - cudaHostRegisterPortable | cudaHostRegisterMapped); + // ReadOnly is intentionally absent: empirically, hipHostRegisterReadOnly on some platforms incurs a performance penalty vs Portable|Mapped + // alone (e.g., GFX1151 / ROCm 7.2.0 penalty is ~14% of TG on Qwen3 30B Q4_K_M). Host-side mmap is PROT_READ after loader finalizes; the + // imported buffer interface enforces read-only access at the type level by aborting on calls to write ops to enforce contract semantics. + cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterPortable | cudaHostRegisterMapped); + if (err != cudaSuccess) { (void)cudaGetLastError(); GGML_LOG_ERROR("%s: cudaHostRegister failed: %s\n", __func__, cudaGetErrorString(err)); diff --git a/include/llama.h b/include/llama.h index 7c55d8df9..40cbab8e3 100644 --- a/include/llama.h +++ b/include/llama.h @@ -198,6 +198,14 @@ extern "C" { LLAMA_SPLIT_MODE_ROW = 2, // split layers and KV across GPUs, use tensor parallelism if supported }; + // Support for buffer from host pages; allows passing of pages to GPU on unified memory systems.
+ // Needed to avoid double memory and copy with some memory modes (e.g., Linux humepages with preallocation) + enum llama_gpu_host_import_mode { + LLAMA_GPU_HOST_IMPORT_AUTO = 0, // on by default if and only if use_hugepages is active + LLAMA_GPU_HOST_IMPORT_ON = 1, // forced on + LLAMA_GPU_HOST_IMPORT_OFF = 2, // forced off + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -320,6 +328,8 @@ extern "C" { bool no_host; // bypass host buffer allowing extra buffers to be used bool no_alloc; // only load metadata and simulate memory allocations bool use_hugepages; // back model memory with anonymous hugetlb pages (Linux only) + + enum llama_gpu_host_import_mode gpu_host_import; // whether loader imports weights via buffer_from_host_ptr (default: auto) }; struct llama_sampler_seq_config { diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp index e72e91914..a11664050 100644 --- a/src/llama-model-loader.cpp +++ b/src/llama-model-loader.cpp @@ -14,7 +14,7 @@ #include #include -#ifdef __linux__ +#if defined(__linux__) || defined(__APPLE__) #include #endif @@ -1382,6 +1382,55 @@ void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void * } } +// Zero-fill padding. For MAP_PRIVATE file mmaps, triggers COW on touched pages (~one page per padded tensor); no-op +// for anonymous hugetlb. Restores PROT_READ on exit. Linux/Apple-only (uses mprotect); no-op on other platforms. +void llama_model_loader::zero_padding_in_mapping(uint32_t idx, ggml_context * ctx, ggml_backend_buffer_type_t buft) { +#if defined(__linux__) || defined(__APPLE__) + if (idx >= mappings.size()) { + return; + } + auto & mapping = mappings.at(idx); + if (mapping->is_hugetlb()) { + // Kernel zero-fills anonymous hugetlb mappings, so padding bytes are already zero; skipping here avoids poisoning the + // mapping with PROT_READ before load_all_data completes. 
and the lockdown happens in load_all_data's cleanup instead. + return; + } + void * base = mapping->addr(); + const size_t mapsize = mapping->mmap_size(); + + if (mprotect(base, mapsize, PROT_READ | PROT_WRITE) != 0) { + LLAMA_LOG_WARN("%s: mprotect(RW) failed: %s; skipping pre-bfhp zero-pass\n", __func__, strerror(errno)); + return; + } + + for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) { + const auto * weight = get_weight(ggml_get_name(tensor)); + if (!weight || weight->idx != idx) { + continue; + } + if (!ggml_is_quantized(tensor->type)) { + continue; + } + if (tensor->view_src != nullptr) { + continue; + } + const size_t orig_size = ggml_nbytes(tensor); + const size_t padded_size = ggml_backend_buft_get_alloc_size(buft, tensor); + if (padded_size > orig_size) { + memset((char *) base + weight->offs + orig_size, 0, padded_size - orig_size); + } + } + + if (mprotect(base, mapsize, PROT_READ) != 0) { + LLAMA_LOG_WARN("%s: mprotect(RO) restore failed: %s\n", __func__, strerror(errno)); + } +#else + GGML_UNUSED(idx); + GGML_UNUSED(ctx); + GGML_UNUSED(buft); +#endif +} + void llama_model_loader::load_data_for(struct ggml_tensor * cur) const { const auto & w = require_weight(ggml_get_name(cur)); diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h index ce6eb5d64..f919e018c 100644 --- a/src/llama-model-loader.h +++ b/src/llama-model-loader.h @@ -192,6 +192,9 @@ struct llama_model_loader { void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const; + // Ensure quantized-tensor padding bytes are zero in file-ifx-th mapped region. 
+ void zero_padding_in_mapping(uint32_t idx, struct ggml_context * ctx, ggml_backend_buffer_type_t buft); + // for backwards compatibility, does not support ggml-backend void load_data_for(struct ggml_tensor * cur) const; diff --git a/src/llama-model.cpp b/src/llama-model.cpp index d766305b6..556a870d8 100644 --- a/src/llama-model.cpp +++ b/src/llama-model.cpp @@ -7797,9 +7797,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) { ggml_backend_dev_get_props(dev, &props); bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr; bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev); + bool gpu_host_import_enabled; + + // Resolve bfhp state. AUTO --> match hugepages to avoid unnecessarily copying and memory consumption. + // ON/OFF are overrides from user flags. + switch (params.gpu_host_import) { + case LLAMA_GPU_HOST_IMPORT_ON: gpu_host_import_enabled = true; break; + case LLAMA_GPU_HOST_IMPORT_OFF: gpu_host_import_enabled = false; break; + case LLAMA_GPU_HOST_IMPORT_AUTO: + default: gpu_host_import_enabled = params.use_hugepages; break; + } std::vector bufs; - if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) { + if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft && gpu_host_import_enabled) { GGML_ASSERT(!ml.no_alloc); for (uint32_t idx = 0; idx < ml.files.size(); idx++) { // only the mmap region containing the tensors in the model is mapped to the backend buffer @@ -7812,6 +7822,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) { if (first >= last) { continue; } + ml.zero_padding_in_mapping(idx, ctx, buft); // Zero out padding before importing host buffer (invariant for all backends) + const size_t max_size = ggml_get_max_tensor_size(ctx); ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size); if (buf == nullptr) { @@ -8955,6 +8967,7 @@ llama_model_params 
llama_model_default_params() { /*.no_host =*/ false, /*.no_alloc =*/ false, /*.use_hugepages =*/ false, + /*.gpu_host_import =*/ LLAMA_GPU_HOST_IMPORT_AUTO, }; return result;