domvox · doctorjei · Apr 26, 2026 · Apr 26, 2026 · Apr 26, 2026
diff --git a/common/arg.cpp b/common/arg.cpp
@@ -2249,6 +2249,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             params.use_hugepages = true;
         }
     ).set_env("LLAMA_ARG_HUGEPAGES"));
+    add_opt(common_arg(
+        {"--gpu-host-import"},
+        {"--no-gpu-host-import"},
+        "import model weights to the GPU via host-pointer aliasing (buffer_from_host_ptr) in lieu of separate device buffer.\n"
+        "requires backend capability (e.g., HIP on integrated APUs). automatically enabled with --hugepages and disabled otherwise",
+        [](common_params & params, bool value) {
+            params.gpu_host_import = value ? LLAMA_GPU_HOST_IMPORT_ON : LLAMA_GPU_HOST_IMPORT_OFF;
+        }
+    ).set_env("LLAMA_ARG_GPU_HOST_IMPORT"));
     add_opt(common_arg(
         {"-dio", "--direct-io"},
         {"-ndio", "--no-direct-io"},

diff --git a/common/common.cpp b/common/common.cpp
@@ -1432,6 +1432,7 @@ struct llama_model_params common_model_params_to_llama(common_params & params) {
     mparams.check_tensors   = params.check_tensors;
     mparams.use_extra_bufts = !params.no_extra_bufts;
     mparams.no_host         = params.no_host;
+    mparams.gpu_host_import = params.gpu_host_import;
 
     if (params.kv_overrides.empty()) {
         mparams.kv_overrides = NULL;

diff --git a/common/common.h b/common/common.h
@@ -547,6 +547,8 @@ struct common_params {
 
     bool single_turn       = false; // single turn chat conversation
 
+    llama_gpu_host_import_mode gpu_host_import = LLAMA_GPU_HOST_IMPORT_AUTO; // bfhp defaults to auto
+
     ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
     ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 

diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -633,25 +633,11 @@ static enum ggml_status ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer
         const size_t original_size = ggml_nbytes(tensor);
         const size_t padded_size = ggml_backend_buft_get_alloc_size(buffer->buft, tensor);
 
-        if (padded_size > original_size) {
-            if (ctx->owned) {
-                ggml_cuda_set_device(ctx->device);
-                CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
-            } else {
-                // Externally-owned buffer (buffer_from_host_ptr): the GPU-side
-                // cudaMemset is unusable here because hipMemset through a
-                // hipHostGetDevicePointer-derived address is unsupported on
-                // GFX1151 / ROCm 7.2.0 (Mapped regions are effectively
-                // read-only from the GPU side). Zero the padding via the
-                // host-side mapping instead — the caller's host buffer must
-                // be writable through the end of all init_tensor calls. The
-                // hugepages loader satisfies this by mapping PROT_READ|
-                // PROT_WRITE during load and downgrading to PROT_READ after
-                // load_all_data completes.
-                const size_t pad_offset = (size_t)((const char *)tensor->data + original_size
-                                                   - (const char *)ctx->dev_ptr);
-                memset((uint8_t *)ctx->host_ptr + pad_offset, 0, padded_size - original_size);
-            }
+        // As some backend(s) cannot safely memset via device alias (e.g., GFX1151 / ROCm 7.2.0), padding for extern buffer is zero-filled
+        // in the loader. Mapped regions are effectively readonly from the GPU, and host buffer protection may prevent writes.
+        if (padded_size > original_size && ctx->owned) {
+            ggml_cuda_set_device(ctx->device);
+            CUDA_CHECK(cudaMemset((char *)tensor->data + original_size, 0, padded_size - original_size));
         }
     }
     return GGML_STATUS_SUCCESS;
@@ -749,20 +735,37 @@ static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
     /* .reset           = */ NULL,
 };
 
-// Buffer interface for externally-owned (buffer_from_host_ptr) buffers.
-// The underlying host memory is read-only (mmap'd PROT_READ on Linux, and
-// hipHostRegister+Mapped regions are effectively read-only from the GPU
-// side on GFX1151 / ROCm 7.2.0 regardless of flags). Write ops are NULL'd
-// to enforce read-only semantics at the type level.
+// Imported (buffer_from_host_ptr) buffers are read-only: reaching set_tensor on one is a contract violation, so it aborts.
+static void ggml_backend_cuda_imported_buffer_set_tensor(ggml_backend_buffer_t, ggml_tensor *, const void *, size_t, size_t) {
+    GGML_ABORT("ggml-cuda: imported buffer is read-only (set_tensor not supported)");
+}
+
+// Imported (buffer_from_host_ptr) buffers are read-only: reaching memset_tensor on one is a contract violation, so it aborts.
+static void ggml_backend_cuda_imported_buffer_memset_tensor(ggml_backend_buffer_t, ggml_tensor *, uint8_t, size_t, size_t) {
+    GGML_ABORT("ggml-cuda: imported buffer is read-only (memset_tensor not supported)");
+}
+
+// Imported (buffer_from_host_ptr) buffers are read-only: reaching cpy_tensor on one aborts, so no bool is ever returned.
+static bool ggml_backend_cuda_imported_buffer_cpy_tensor(ggml_backend_buffer_t, const ggml_tensor *, ggml_tensor *) {
+    GGML_ABORT("ggml-cuda: imported buffer is read-only (cpy_tensor not supported)");
+}
+
+// Imported (buffer_from_host_ptr) buffers are read-only: reaching the .clear op on one is a contract violation, so it aborts.
+static void ggml_backend_cuda_imported_buffer_clear(ggml_backend_buffer_t, uint8_t) {
+    GGML_ABORT("ggml-cuda: imported buffer is read-only (clear not supported)");
+}
+
+// Buffer interface for external (buffer_from_host_ptr) buffers. Host memory is read-only (mmap'd PROT_READ on Linux), and hipHostRegister+Mapped
+// regions are effectively read-only from some GPU(s) (e.g., GFX1151 / ROCm 7.2.0) regardless of flags. Write ops abort to enforce read-only semantics.
 static const ggml_backend_buffer_i ggml_backend_cuda_imported_buffer_interface = {
     /* .free_buffer     = */ ggml_backend_cuda_buffer_free_buffer,
     /* .get_base        = */ ggml_backend_cuda_buffer_get_base,
     /* .init_tensor     = */ ggml_backend_cuda_buffer_init_tensor,
-    /* .memset_tensor   = */ NULL,
-    /* .set_tensor      = */ NULL,
+    /* .memset_tensor   = */ ggml_backend_cuda_imported_buffer_memset_tensor,
+    /* .set_tensor      = */ ggml_backend_cuda_imported_buffer_set_tensor,
     /* .get_tensor      = */ ggml_backend_cuda_buffer_get_tensor,
-    /* .cpy_tensor      = */ NULL,
-    /* .clear           = */ NULL,
+    /* .cpy_tensor      = */ ggml_backend_cuda_imported_buffer_cpy_tensor,
+    /* .clear           = */ ggml_backend_cuda_imported_buffer_clear,
     /* .reset           = */ NULL,
 };
 
@@ -4744,13 +4747,8 @@ static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_back
     bool events = true;
 #endif
 
-    // buffer_from_host_ptr is currently enabled only on HIP integrated GPUs
-    // (validated on Strix Halo / ROCm 7.2.0). NVIDIA Jetson reports
-    // prop.integrated == 1 too and may benefit from the same path, but it
-    // has not been tested on that platform; #15034's cuda_host-buffer
-    // corruption is in a different code path (see ggml-cuda.cu:243) and
-    // is not expected to apply here, but validation is required before
-    // extending beyond HIP.
+    // buffer_from_host_ptr is currently enabled only on HIP integrated GPUs (validated on Strix Halo). NVIDIA Jetson may benefit from this
+    // path but has not been tested; its host-buffer corruption is separate, but validation is required before extending beyond HIP.
     bool buffer_from_host_ptr = false;
 #if defined(GGML_USE_HIP)
     cudaDeviceProp prop;
@@ -4783,18 +4781,10 @@ static ggml_backend_buffer_type_t ggml_backend_cuda_device_get_host_buffer_type(
 }
 
 #if defined(GGML_USE_HIP)
-// Import a host-allocated memory region as a GPU-accessible buffer via
-// cudaHostRegister + cudaHostGetDevicePointer. HIP-only for now; only
-// validated on Strix Halo / ROCm 7.2.0. See the capability flag in
-// get_props() for context. TODO: extend to CUDA / Jetson after validating
-// that #15034's corruption mode does not apply to this code path.
+// Import host-allocated memory region as GPU-accessible buffer via cudaHostRegister + cudaHostGetDevicePointer. Only validated on
+// HIP / ROCm 7.2.0; see get_props() for context. TODO: extend to CUDA / Jetson after validating corruption mode does not apply here.
 //
-// PRECONDITION: the caller's host buffer must be writable through the
-// end of all init_tensor calls. The buffer's quantized-tensor padding
-// is zeroed via host-side memset during init_tensor (GPU-side memset
-// is unsupported on GFX1151 / ROCm 7.2.0 for Mapped regions). The
-// hugepages loader satisfies this by mapping PROT_READ|PROT_WRITE
-// during load and downgrading to PROT_READ after load_all_data.
+// Loader ensures quantized-tensor padding bytes are zeroed in the host buffer before calling; already handled for hugepages and file mmap.
 static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(
         ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
     GGML_UNUSED(max_tensor_size);
@@ -4808,15 +4798,11 @@ static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(
         return nullptr;
     }
 
-    // ReadOnly is intentionally not set: empirically, hipHostRegisterReadOnly
-    // on GFX1151 / ROCm 7.2.0 incurs a ~14% TG regression vs Portable|Mapped
-    // alone (measured on Qwen3-30B-A3B Q4_K_M, 2026-04-25). The host-side
-    // mmap is PROT_READ after the loader finalizes; the imported buffer
-    // interface enforces read-only access at the type level by NULLing
-    // write ops, so ReadOnly adds no contract enforcement we don't
-    // already have. Its only observable effect is the regression.
-    cudaError_t err = cudaHostRegister(ptr, size,
-        cudaHostRegisterPortable | cudaHostRegisterMapped);
+    // ReadOnly is intentionally absent: empirically, hipHostRegisterReadOnly incurs a performance penalty on some platforms vs Portable|Mapped
+    // alone (e.g., GFX1151 / ROCm 7.2.0 penalty is ~14% of TG on Qwen3 30B Q4_K_M). Host-side mmap is PROT_READ after the loader finalizes; the
+    // imported buffer interface enforces the read-only contract by aborting on any call to a write op.
+    cudaError_t err = cudaHostRegister(ptr, size, cudaHostRegisterPortable | cudaHostRegisterMapped);
+
     if (err != cudaSuccess) {
         (void)cudaGetLastError();
         GGML_LOG_ERROR("%s: cudaHostRegister failed: %s\n", __func__, cudaGetErrorString(err));

diff --git a/include/llama.h b/include/llama.h
@@ -198,6 +198,14 @@ extern "C" {
         LLAMA_SPLIT_MODE_ROW   = 2, // split layers and KV across GPUs, use tensor parallelism if supported
     };
 
+    // Support for buffer from host pages; allows passing of pages to the GPU (via buffer_from_host_ptr) on unified memory systems.
+    // Needed to avoid double memory use and a copy with some memory modes (e.g., Linux hugepages with preallocation).
+    enum llama_gpu_host_import_mode {
+        LLAMA_GPU_HOST_IMPORT_AUTO = 0, // default: on if and only if use_hugepages is active
+        LLAMA_GPU_HOST_IMPORT_ON   = 1, // forced on (the loader still requires backend support for buffer_from_host_ptr)
+        LLAMA_GPU_HOST_IMPORT_OFF  = 2, // forced off
+    };
+
     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
     typedef struct llama_token_data {
         llama_token id; // token id
@@ -320,6 +328,8 @@ extern "C" {
         bool no_host;         // bypass host buffer allowing extra buffers to be used
         bool no_alloc;        // only load metadata and simulate memory allocations
         bool use_hugepages;   // back model memory with anonymous hugetlb pages (Linux only)
+
+        enum llama_gpu_host_import_mode gpu_host_import; // whether loader imports weights via buffer_from_host_ptr (default: auto)
     };
 
     struct llama_sampler_seq_config {

diff --git a/src/llama-model-loader.cpp b/src/llama-model-loader.cpp
@@ -14,7 +14,7 @@
 #include <future>
 #include <regex>
 
-#ifdef __linux__
+#if defined(__linux__) || defined(__APPLE__)
 #include <sys/mman.h>
 #endif
 
@@ -1382,6 +1382,55 @@ void llama_model_loader::get_mapping_range(size_t * first, size_t * last, void *
     }
 }
 
+// Zero the quantized-tensor padding bytes of mapping `idx` in place, so they are already zero when the mapping is imported as a host
+// buffer. For MAP_PRIVATE file mmaps this triggers COW on the touched pages (~one page per padded tensor); anonymous hugetlb mappings
+// are skipped. Restores PROT_READ on exit. Linux/Apple-only (uses mprotect); no-op elsewhere. Writes are clamped to the mapping size.
+// NOTE(review): padding longer than the gap to the next tensor would zero that tensor's leading bytes - confirm against the file layout.
+void llama_model_loader::zero_padding_in_mapping(uint32_t idx, ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+#if defined(__linux__) || defined(__APPLE__)
+    if (idx >= mappings.size()) {
+        return;
+    }
+    auto & mapping = mappings.at(idx);
+    if (mapping->is_hugetlb()) {
+        // Kernel zero-fills anonymous hugetlb mappings, so padding bytes are already zero; skipping here also avoids poisoning the
+        // mapping with PROT_READ before load_all_data completes. The lockdown happens in load_all_data's cleanup instead.
+        return;
+    }
+    void * base = mapping->addr();
+    const size_t mapsize = mapping->mmap_size();
+
+    if (mprotect(base, mapsize, PROT_READ | PROT_WRITE) != 0) {
+        LLAMA_LOG_WARN("%s: mprotect(RW) failed: %s; skipping pre-bfhp zero-pass\n", __func__, strerror(errno));
+        return;
+    }
+
+    for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+        const auto * weight = get_weight(ggml_get_name(tensor));
+        if (!weight || weight->idx != idx || !ggml_is_quantized(tensor->type) || tensor->view_src != nullptr) {
+            continue; // only non-view quantized tensors stored in this mapping are considered
+        }
+        const size_t orig_size   = ggml_nbytes(tensor);
+        const size_t padded_size = ggml_backend_buft_get_alloc_size(buft, tensor);
+        if (padded_size <= orig_size || weight->offs >= mapsize || orig_size >= mapsize - weight->offs) {
+            continue; // no padding, or the padding would start at/after the end of the mapping
+        }
+        const size_t pad_begin = weight->offs + orig_size;
+        const size_t pad_avail = mapsize - pad_begin; // bytes left in the mapping after the tensor data
+        const size_t pad_len   = padded_size - orig_size < pad_avail ? padded_size - orig_size : pad_avail;
+        memset((char *) base + pad_begin, 0, pad_len);
+    }
+
+    if (mprotect(base, mapsize, PROT_READ) != 0) {
+        LLAMA_LOG_WARN("%s: mprotect(RO) restore failed: %s\n", __func__, strerror(errno));
+    }
+#else
+    GGML_UNUSED(idx);
+    GGML_UNUSED(ctx);
+    GGML_UNUSED(buft);
+#endif
+}
+
 void llama_model_loader::load_data_for(struct ggml_tensor * cur) const {
     const auto & w = require_weight(ggml_get_name(cur));
 

diff --git a/src/llama-model-loader.h b/src/llama-model-loader.h
@@ -192,6 +192,9 @@ struct llama_model_loader {
 
     void get_mapping_range(size_t * first, size_t * last, void ** addr, int idx, ggml_context * ctx) const;
 
+    // Ensure quantized-tensor padding bytes are zero in the idx-th file's mapped region.
+    void zero_padding_in_mapping(uint32_t idx, struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+
     // for backwards compatibility, does not support ggml-backend
     void load_data_for(struct ggml_tensor * cur) const;
 

diff --git a/src/llama-model.cpp b/src/llama-model.cpp
@@ -7797,9 +7797,19 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
         ggml_backend_dev_get_props(dev, &props);
         bool buffer_from_host_ptr_supported = props.caps.buffer_from_host_ptr;
         bool is_default_buft = buft == ggml_backend_dev_buffer_type(dev);
+        bool gpu_host_import_enabled;
+
+        // Resolve bfhp (buffer_from_host_ptr) state. AUTO --> follow use_hugepages, to avoid an unnecessary copy and extra memory consumption.
+        // ON/OFF are overrides from user flags.
+        switch (params.gpu_host_import) {
+            case LLAMA_GPU_HOST_IMPORT_ON:  gpu_host_import_enabled = true;                break;
+            case LLAMA_GPU_HOST_IMPORT_OFF: gpu_host_import_enabled = false;               break;
+            case LLAMA_GPU_HOST_IMPORT_AUTO:
+            default:                        gpu_host_import_enabled = params.use_hugepages; break;
+        }
 
         std::vector<ggml_backend_buffer_ptr> bufs;
-        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft) {
+        if (ml.use_mmap && use_mmap_buffer && buffer_from_host_ptr_supported && is_default_buft && gpu_host_import_enabled) {
             GGML_ASSERT(!ml.no_alloc);
             for (uint32_t idx = 0; idx < ml.files.size(); idx++) {
                 // only the mmap region containing the tensors in the model is mapped to the backend buffer
@@ -7812,6 +7822,8 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                 if (first >= last) {
                     continue;
                 }
+                ml.zero_padding_in_mapping(idx, ctx, buft); // Zero out padding before importing host buffer (invariant for all backends)
+
                 const size_t max_size = ggml_get_max_tensor_size(ctx);
                 ggml_backend_buffer_t buf = ggml_backend_dev_buffer_from_host_ptr(dev, (char *) addr + first, last - first, max_size);
                 if (buf == nullptr) {
@@ -8955,6 +8967,7 @@ llama_model_params llama_model_default_params() {
         /*.no_host                     =*/ false,
         /*.no_alloc                    =*/ false,
         /*.use_hugepages               =*/ false,
+        /*.gpu_host_import             =*/ LLAMA_GPU_HOST_IMPORT_AUTO,
     };
 
     return result;