From 75c458da489ec2c543121da118601966e68d2727 Mon Sep 17 00:00:00 2001
From: Mike Hutu <159610786+Mikehutu@users.noreply.github.com>
Date: Fri, 15 May 2026 11:04:07 +0300
Subject: [PATCH] cuda: fix HMM path on coherent unified-memory systems (GB10
 NVLink-C2C)

When cudaHostRegister() returns cudaErrorNotSupported, the GPU already
has full HMM access to all host virtual addresses and no registration
is needed. The previous code left the else-branch empty after clearing
the error, so g_model_hmm_direct was never set and
cuda_model_range_is_cached() kept returning 0 for every tensor range.
This caused cuda_model_range_ptr() to fall through to the per-range
cudaMalloc+cudaMemcpy path for every tensor on first access, silently
allocating ~87 GB of redundant device copies of the model weights on a
DGX Spark GB10 (128 GB unified LPDDR5x):

Before: free -h shows ~98 GB used / ~23 GB available
        nvidia-smi process entry: 87,992 MiB
After:  free -h shows ~26 GB used / ~95 GB available
        nvidia-smi process entry: ~26,000 MiB (KV cache only)

Fix: when registration is skipped, call cuda_model_prefetch_range(),
which was already fully implemented but had no call site on the normal
Linux/CUDA path. It issues cudaMemPrefetchAsync over the full mmap
region (async, on a side stream) and sets g_model_hmm_direct=1 on
success. The companion hunk in cuda_model_range_is_cached()
short-circuits to 1 when g_model_hmm_direct is set, so the per-range
copy path is never reached.

Tested manually on a DGX Spark GB10 (SM_121, 128 GB, CUDA 13.0, driver
580.142) with DeepSeek V4 Flash IQ2_XXS-w2Q2K (80.76 GiB): 13.81 t/s,
94% GPU util.
---
 ds4_cuda.cu | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/ds4_cuda.cu b/ds4_cuda.cu
index 1eee21de..aa72cb5c 100644
--- a/ds4_cuda.cu
+++ b/ds4_cuda.cu
@@ -305,6 +305,8 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset,
 static int cuda_model_range_is_cached(const void *model_map, uint64_t offset, uint64_t bytes) {
     if (bytes == 0) return 1;
     if (g_model_device_owned || g_model_registered) return 1;
+    /* On the HMM direct path every range is immediately accessible. */
+    if (g_model_hmm_direct) return 1;
     const uint64_t end = offset + bytes;
     if (end < offset) return 0;
@@ -1487,6 +1489,12 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size)
     } else {
         fprintf(stderr, "ds4: CUDA host registration skipped: %s\n", cudaGetErrorString(err));
         (void)cudaGetLastError();
+        /* On coherent unified memory systems (e.g. Grace-Blackwell NVLink-C2C) the
+         * host mmap pointer is directly addressable by the GPU via HMM page faults.
+         * Queue an async prefetch of the full model so pages are resident before
+         * inference starts, avoiding per-kernel page-fault stalls. The function
+         * also sets g_model_hmm_direct=1 on success. */
+        (void)cuda_model_prefetch_range(model_map, model_size, 0, model_size);
     }
     return 1;
 }
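
Reviewer note (not part of the patch): cuda_model_prefetch_range() itself is
not shown in this diff; it already exists in ds4_cuda.cu. The sketch below is
only an illustration of the behaviour the commit message describes (an async
cudaMemPrefetchAsync over the mapped region on a non-blocking side stream,
setting g_model_hmm_direct=1 on success). It is not the repository's actual
implementation: the stream global, error handling, and bounds checks are
assumed, and it uses the classic cudaMemPrefetchAsync(ptr, count, device,
stream) signature (newer CUDA releases also offer a cudaMemLocation-based
overload).

    /* Illustrative sketch only -- not the ds4_cuda.cu implementation. */
    #include <cuda_runtime.h>
    #include <stdint.h>
    #include <stdio.h>

    static int          g_model_hmm_direct      = 0;    /* assumed global: GPU faults host pages directly */
    static cudaStream_t g_model_prefetch_stream = NULL; /* assumed side stream so prefetch overlaps setup */

    static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size,
                                         uint64_t offset, uint64_t bytes) {
        if (bytes == 0 || offset > model_size || model_size - offset < bytes) return 0;

        int device = 0;
        if (cudaGetDevice(&device) != cudaSuccess) return 0;

        /* Lazily create a non-blocking stream so the prefetch runs in the background. */
        if (g_model_prefetch_stream == NULL &&
            cudaStreamCreateWithFlags(&g_model_prefetch_stream, cudaStreamNonBlocking) != cudaSuccess) {
            return 0;
        }

        /* Queue migration of the range toward the GPU. On a coherent unified-memory
         * system this only warms pages the GPU could already fault in on demand. */
        const char  *base = (const char *)model_map + offset;
        cudaError_t  err  = cudaMemPrefetchAsync(base, (size_t)bytes, device, g_model_prefetch_stream);
        if (err != cudaSuccess) {
            fprintf(stderr, "ds4: model prefetch skipped: %s\n", cudaGetErrorString(err));
            (void)cudaGetLastError();
            return 0;
        }

        g_model_hmm_direct = 1;  /* per the commit message: direct HMM path is now usable */
        return 1;
    }

Under these assumptions, the call site added by the patch,
cuda_model_prefetch_range(model_map, model_size, 0, model_size), simply
prefetches the whole mapping.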