diff --git a/Makefile b/Makefile index 005bb45e..e20b1ce8 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc LDLIBS ?= -lm -pthread METAL_SRCS := $(wildcard metal/*.metal) +GPU_EXTRA_DEPS := ifeq ($(UNAME_S),Darwin) METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal @@ -26,12 +27,14 @@ ROCM_PATH ?= /opt/rocm GPU_CC = $(ROCM_PATH)/bin/hipcc ROCM_ARCH ?= gfx1151 +CFLAGS += -DDS4_ROCM_BUILD GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH) +GPU_CFLAGS += -DDS4_ROCM_BUILD GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas -@echo "ROCM_ARCH: $(ROCM_ARCH)" +$(info ROCM_ARCH: $(ROCM_ARCH)) -EXTRA_DEPS = ds4_rocm.h +GPU_EXTRA_DEPS += ds4_rocm.h else @@ -177,7 +180,7 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) $(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m -ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS) +ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(GPU_EXTRA_DEPS) $(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o @@ -195,4 +198,3 @@ test: ds4_test clean: rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o - diff --git a/README.md b/README.md index 25af9206..edda84b6 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ Then build: make # macOS Metal make cuda-spark # Linux CUDA, DGX Spark / GB10 make cuda-generic # Linux CUDA, other local CUDA GPUs +make rocm ROCM_ARCH=gfx1151 # Linux ROCm, AMD GPUs make cpu # CPU-only diagnostics build ``` @@ -646,11 +647,13 @@ the kv cache files include the verbatim prompt cached. 
## Backends -The default graph backend is Metal on macOS and CUDA in CUDA builds: +The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in +ROCm builds: ```sh ./ds4 -p "Hello" --metal ./ds4 -p "Hello" --cuda +./ds4 -p "Hello" --rocm ``` On Linux, plain `make` prints the available build targets instead of selecting a @@ -662,8 +665,13 @@ when cross-building or when you need a known target: ```sh make cuda CUDA_ARCH=sm_120 make cuda CUDA_ARCH=native +make rocm ROCM_ARCH=gfx1151 ``` +In ROCm builds, `--rocm` and `--backend rocm` are the canonical runtime names. +`--cuda` and `--backend cuda` remain accepted as compatibility aliases because +the shared GPU implementation still uses the CUDA backend enum internally. + There is also a CPU reference/debug path: ```sh @@ -675,8 +683,8 @@ make cpu Do not treat the CPU path as the production target. The CLI and `ds4-server` support the CPU backend for reference/debug use and share the same KV session -and snapshot format as Metal and CUDA, but normal inference should use Metal or -CUDA. +and snapshot format as Metal, CUDA, and ROCm, but normal inference should use +Metal, CUDA, or ROCm. 
## Steering diff --git a/ds4.c b/ds4.c index 51410e33..39d760bb 100644 --- a/ds4.c +++ b/ds4.c @@ -1453,7 +1453,7 @@ static bool accelerator_cache_model_tensors(ds4_backend backend, const ds4_model const double t1 = now_sec(); if (ds4_log_is_tty(stderr)) fputc('\n', stderr); fprintf(stderr, - "ds4: CUDA startup model cache prepared %.2f GiB of tensor spans in %.3fs\n", + "ds4: " DS4_GPU_BACKEND_DISPLAY " startup model cache prepared %.2f GiB of tensor spans in %.3fs\n", (double)cached / 1073741824.0, t1 - t0); } @@ -16995,7 +16995,8 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { #ifndef DS4_NO_GPU if (e->backend == DS4_BACKEND_CUDA) { #ifdef __APPLE__ - fprintf(stderr, "ds4: CUDA backend requested but this build is linked with Metal, not CUDA\n"); + fprintf(stderr, "ds4: %s backend requested but this build is linked with Metal, not %s\n", + DS4_GPU_BACKEND_DISPLAY, DS4_GPU_BACKEND_DISPLAY); ds4_engine_close(e); *out = NULL; return 1; @@ -17003,7 +17004,8 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { } if (e->backend == DS4_BACKEND_METAL) { #ifndef __APPLE__ - fprintf(stderr, "ds4: Metal backend requested but this build is linked with CUDA, not Metal\n"); + fprintf(stderr, "ds4: Metal backend requested but this build is linked with %s, not Metal\n", + DS4_GPU_BACKEND_DISPLAY); ds4_engine_close(e); *out = NULL; return 1; @@ -17033,11 +17035,12 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { *out = NULL; return 1; } - if (e->mtp_ready && + const bool mtp_needs_global_model_map = e->mtp_ready && e->backend != DS4_BACKEND_CUDA; + if (mtp_needs_global_model_map && !ds4_gpu_set_model_map_range(e->mtp_model.map, - e->mtp_model.size, - e->mtp_model.tensor_data_pos, - e->mtp_model.size - e->mtp_model.tensor_data_pos)) + e->mtp_model.size, + e->mtp_model.tensor_data_pos, + e->mtp_model.size - e->mtp_model.tensor_data_pos)) { fprintf(stderr, "ds4: %s failed to map MTP model views; aborting 
startup. " @@ -17047,7 +17050,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { *out = NULL; return 1; } - if (!e->mtp_ready && !accelerator_cache_model_tensors(e->backend, &e->model)) { + if (!accelerator_cache_model_tensors(e->backend, &e->model)) { fprintf(stderr, "ds4: %s failed to prepare startup model cache\n", ds4_backend_name(e->backend)); ds4_engine_close(e); diff --git a/ds4.h b/ds4.h index 950d8dca..2916070f 100644 --- a/ds4.h +++ b/ds4.h @@ -20,6 +20,18 @@ typedef enum { DS4_BACKEND_CPU, } ds4_backend; +#ifdef DS4_ROCM_BUILD +#define DS4_GPU_BACKEND_CLI_NAME "rocm" +#define DS4_GPU_BACKEND_FLAG "--rocm" +#define DS4_GPU_BACKEND_DISPLAY "ROCm" +#define DS4_GPU_BACKEND_LIST "metal, rocm, cuda, cpu" +#else +#define DS4_GPU_BACKEND_CLI_NAME "cuda" +#define DS4_GPU_BACKEND_FLAG "--cuda" +#define DS4_GPU_BACKEND_DISPLAY "CUDA" +#define DS4_GPU_BACKEND_LIST "metal, cuda, cpu" +#endif + typedef enum { DS4_THINK_NONE, DS4_THINK_HIGH, diff --git a/ds4_bench.c b/ds4_bench.c index 027b2b31..c7817f1e 100644 --- a/ds4_bench.c +++ b/ds4_bench.c @@ -63,8 +63,8 @@ static void usage(FILE *fp) { "\n" "Model and backend:\n" " -m, --model FILE GGUF model path. Default: ds4flash.gguf\n" - " --metal | --cuda | --cpu | --backend NAME\n" - " Select backend explicitly. Defaults to Metal on macOS, CUDA elsewhere.\n" + " --metal | " DS4_GPU_BACKEND_FLAG " | --cpu | --backend NAME\n" + " Select backend explicitly. 
Defaults to Metal on macOS, " DS4_GPU_BACKEND_DISPLAY " in " DS4_GPU_BACKEND_DISPLAY " builds, CUDA elsewhere.\n" " -t, --threads N CPU helper threads.\n" " --quality Prefer exact kernels where applicable.\n" " --warm-weights Touch mapped tensor pages before benchmarking.\n" @@ -112,10 +112,13 @@ static const char *need_arg(int *i, int argc, char **argv, const char *opt) { static ds4_backend parse_backend(const char *s, const char *opt) { if (!strcmp(s, "metal")) return DS4_BACKEND_METAL; + if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA; +#ifdef DS4_ROCM_BUILD if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA; +#endif if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU; fprintf(stderr, "ds4-bench: invalid value for %s: %s\n", opt, s); - fprintf(stderr, "ds4-bench: valid backends are: metal, cuda, cpu\n"); + fprintf(stderr, "ds4-bench: valid backends are: %s\n", DS4_GPU_BACKEND_LIST); exit(2); } @@ -213,7 +216,7 @@ static bench_config parse_options(int argc, char **argv) { c.backend = parse_backend(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, "--metal")) { c.backend = DS4_BACKEND_METAL; - } else if (!strcmp(arg, "--cuda")) { + } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) { c.backend = DS4_BACKEND_CUDA; } else if (!strcmp(arg, "--cpu")) { c.backend = DS4_BACKEND_CPU; diff --git a/ds4_cli.c b/ds4_cli.c index bc70e659..f58dcae1 100644 --- a/ds4_cli.c +++ b/ds4_cli.c @@ -93,12 +93,12 @@ static void usage(FILE *fp) { " Context size allocated for the session. Default: 32768\n" " --metal\n" " Use the Metal graph backend. This is the normal fast path on macOS.\n" - " --cuda\n" - " Use the CUDA graph backend. This is the normal fast path on CUDA builds.\n" + " " DS4_GPU_BACKEND_FLAG "\n" + " Use the " DS4_GPU_BACKEND_DISPLAY " graph backend. This is the normal fast path on " DS4_GPU_BACKEND_DISPLAY " builds.\n" " --cpu\n" " Use the CPU reference/debug backend. 
Not recommended for normal inference.\n" " --backend NAME\n" - " Select backend explicitly: metal, cuda, or cpu.\n" + " Select backend explicitly: " DS4_GPU_BACKEND_LIST ".\n" " -t, --threads N\n" " CPU helper threads for host-side or reference work.\n" " --quality\n" @@ -223,10 +223,13 @@ static float parse_float_range(const char *s, const char *opt, float min, float static ds4_backend parse_backend(const char *s) { if (!strcmp(s, "metal")) return DS4_BACKEND_METAL; + if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA; +#ifdef DS4_ROCM_BUILD if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA; +#endif if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU; fprintf(stderr, "ds4: invalid backend: %s\n", s); - fprintf(stderr, "ds4: valid backends are: metal, cuda, cpu\n"); + fprintf(stderr, "ds4: valid backends are: %s\n", DS4_GPU_BACKEND_LIST); exit(2); } @@ -1260,7 +1263,7 @@ static cli_config parse_options(int argc, char **argv) { c.engine.backend = DS4_BACKEND_CPU; } else if (!strcmp(arg, "--metal")) { c.engine.backend = DS4_BACKEND_METAL; - } else if (!strcmp(arg, "--cuda")) { + } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) { c.engine.backend = DS4_BACKEND_CUDA; } else if (!strcmp(arg, "--dump-tokens")) { c.gen.dump_tokens = true; diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 8b6241ca..1481c220 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -34,6 +34,16 @@ #define CUDA_QK_K 256 #define DS4_CUDA_UNUSED __attribute__((unused)) +#ifdef DS4_ROCM_BUILD +#define DS4_CUDA_BACKEND_DISPLAY "ROCm" +#define DS4_CUDA_BLAS_DISPLAY "hipBLAS" +#else +#define DS4_CUDA_BACKEND_DISPLAY "CUDA" +#define DS4_CUDA_BLAS_DISPLAY "cuBLAS" +#endif + +#define DS4_CUDA_LOG_PREFIX "ds4: " DS4_CUDA_BACKEND_DISPLAY + enum { /* attention_decode_mixed_kernel stores raw-window scores plus visible * compressed scores in shared memory. 
The host routes larger unmasked @@ -185,7 +195,7 @@ static void *cuda_tmp_alloc(uint64_t bytes, const char *what) { void *ptr = NULL; cudaError_t err = cudaMalloc(&ptr, (size_t)bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA temp alloc failed for %s (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " temp alloc failed for %s (%.2f MiB): %s\n", what ? what : "scratch", (double)bytes / 1048576.0, cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -206,7 +216,9 @@ static const char *cuda_model_ptr(const void *model_map, uint64_t offset) { static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, uint64_t bytes, const char *what) { if (bytes == 0) return cuda_model_ptr(model_map, offset); - if (g_model_device_owned || g_model_registered) return cuda_model_ptr(model_map, offset); + if ((g_model_device_owned || g_model_registered) && model_map == g_model_host_base) { + return cuda_model_ptr(model_map, offset); + } if (g_model_hmm_direct && getenv("DS4_CUDA_WEIGHT_CACHE") == NULL && getenv("DS4_CUDA_WEIGHT_PRELOAD") == NULL) { @@ -258,13 +270,13 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, g_model_ranges.push_back({model_map, offset, bytes, dev_ptr, (void *)reg_addr, (char *)reg_dev, reg_bytes, 1, 0}); g_model_range_by_offset[offset] = g_model_ranges.size() - 1u; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA mapped %s %.2f MiB\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " mapped %s %.2f MiB\n", what ? what : "weights", (double)bytes / 1048576.0); } return dev_ptr; } - fprintf(stderr, "ds4: CUDA model range map pointer failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range map pointer failed for %s: %s\n", what ? 
what : "weights", cudaGetErrorString(err)); (void)cudaHostUnregister((void *)reg_addr); (void)cudaGetLastError(); @@ -278,7 +290,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, err = cudaMalloc(&dev, (size_t)bytes); if (err != cudaSuccess) { (void)cudaGetLastError(); - fprintf(stderr, "ds4: CUDA model range alloc failed for %s (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range alloc failed for %s (%.2f MiB): %s\n", what ? what : "weights", (double)bytes / 1048576.0, cudaGetErrorString(err)); return NULL; } @@ -289,7 +301,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, uint64_t n = bytes - done < chunk ? bytes - done : chunk; err = cudaMemcpy((char *)dev + done, src + done, (size_t)n, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model range copy failed for %s at %.2f/%.2f MiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range copy failed for %s at %.2f/%.2f MiB: %s\n", what ? what : "weights", (double)done / 1048576.0, (double)bytes / 1048576.0, @@ -303,7 +315,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, g_model_range_by_offset[offset] = g_model_ranges.size() - 1u; g_model_range_bytes += bytes; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA cached %s %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " cached %s %.2f MiB (total %.2f GiB)\n", what ? 
what : "weights", (double)bytes / 1048576.0, (double)g_model_range_bytes / 1073741824.0); @@ -313,7 +325,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, static int cuda_model_range_is_cached(const void *model_map, uint64_t offset, uint64_t bytes) { if (bytes == 0) return 1; - if (g_model_device_owned || g_model_registered) return 1; + if ((g_model_device_owned || g_model_registered) && model_map == g_model_host_base) return 1; const uint64_t end = offset + bytes; if (end < offset) return 0; @@ -393,7 +405,7 @@ static void cuda_q8_f16_cache_budget_notice( g_q8_f16_budget_notice_printed = 1; if (limit_bytes != UINT64_MAX && free_bytes == 0 && total_bytes == 0 && reserve_bytes == 0) { fprintf(stderr, - "ds4: CUDA q8 fp16 cache %s; using q8 kernels " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache %s; using q8 kernels " "(request=%.2f MiB cached=%.2f GiB limit=%.2f GiB)\n", reason, (double)request_bytes / 1048576.0, @@ -401,7 +413,7 @@ static void cuda_q8_f16_cache_budget_notice( (double)limit_bytes / 1073741824.0); } else if (limit_bytes == UINT64_MAX) { fprintf(stderr, - "ds4: CUDA q8 fp16 cache %s; using q8 kernels " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache %s; using q8 kernels " "(request=%.2f MiB cached=%.2f GiB free=%.2f GiB reserve=%.2f GiB total=%.2f GiB)\n", reason, (double)request_bytes / 1048576.0, @@ -411,7 +423,7 @@ static void cuda_q8_f16_cache_budget_notice( (double)total_bytes / 1073741824.0); } else { fprintf(stderr, - "ds4: CUDA q8 fp16 cache %s; using q8 kernels " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache %s; using q8 kernels " "(request=%.2f MiB cached=%.2f GiB limit=%.2f GiB free=%.2f GiB reserve=%.2f GiB total=%.2f GiB)\n", reason, (double)request_bytes / 1048576.0, @@ -436,7 +448,7 @@ static int cuda_q8_f16_cache_has_budget(uint64_t request_bytes, const char *labe size_t total_b = 0; cudaError_t err = cudaMemGetInfo(&free_b, &total_b); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA q8 fp16 cache memory query failed: 
%s; using q8 kernels\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " q8 fp16 cache memory query failed: %s; using q8 kernels\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; @@ -458,7 +470,7 @@ static int cuda_q8_f16_cache_has_budget(uint64_t request_bytes, const char *labe static void cuda_q8_f16_cache_disable_after_failure(const char *what, uint64_t request_bytes) { if (!g_q8_f16_disabled_after_oom) { fprintf(stderr, - "ds4: CUDA q8 fp16 cache disabled after %s " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache disabled after %s " "(request=%.2f MiB cached=%.2f GiB); using q8 kernels\n", what ? what : "allocation failure", (double)request_bytes / 1048576.0, @@ -559,7 +571,7 @@ static const __half *cuda_q8_f16_ptr( __half *dev = NULL; cudaError_t err = cudaMalloc(&dev, (size_t)out_bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA q8 fp16 cache alloc failed (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " q8 fp16 cache alloc failed (%.2f MiB): %s\n", (double)out_bytes / 1048576.0, cudaGetErrorString(err)); cuda_q8_f16_cache_disable_after_failure("allocation failure", out_bytes); return NULL; @@ -580,7 +592,7 @@ static const __half *cuda_q8_f16_ptr( g_q8_f16_by_offset[offset] = g_q8_f16_ranges.size() - 1u; g_q8_f16_bytes += out_bytes; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA cached q8 fp16 %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " cached q8 fp16 %.2f MiB (total %.2f GiB)\n", (double)out_bytes / 1048576.0, (double)g_q8_f16_bytes / 1073741824.0); } @@ -611,7 +623,7 @@ static float *cuda_q8_f32_ptr( float *dev = NULL; cudaError_t err = cudaMalloc(&dev, (size_t)out_bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA q8 fp32 cache alloc failed (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " q8 fp32 cache alloc failed (%.2f MiB): %s\n", (double)out_bytes / 1048576.0, cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -631,7 +643,7 @@ static float 
*cuda_q8_f32_ptr( g_q8_f32_by_offset[offset] = g_q8_f32_ranges.size() - 1u; g_q8_f32_bytes += out_bytes; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA cached q8 fp32 %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " cached q8 fp32 %.2f MiB (total %.2f GiB)\n", (double)out_bytes / 1048576.0, (double)g_q8_f32_bytes / 1073741824.0); } @@ -640,7 +652,7 @@ static float *cuda_q8_f32_ptr( static int cuda_ok(cudaError_t err, const char *what) { if (err == cudaSuccess) return 1; - fprintf(stderr, "ds4: CUDA %s failed: %s\n", what, cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " %s failed: %s\n", what, cudaGetErrorString(err)); return 0; } @@ -673,9 +685,9 @@ static void cuda_model_load_progress_note(uint64_t cached_bytes) { 1024ull * 1024ull * 1024ull; g_model_load_progress_last = now; if (g_model_load_progress_tty) { - fprintf(stderr, "ds4: CUDA loading model tensors into device cache: 0.00 GiB"); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " loading model tensors into device cache: 0.00 GiB"); } else { - fprintf(stderr, "ds4: CUDA loading model tensors into device cache\n"); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " loading model tensors into device cache\n"); } } @@ -685,10 +697,10 @@ static void cuda_model_load_progress_note(uint64_t cached_bytes) { } if (g_model_load_progress_tty) { - fprintf(stderr, "\rds4: CUDA loading model tensors into device cache: %.2f GiB", + fprintf(stderr, "\r" DS4_CUDA_LOG_PREFIX " loading model tensors into device cache: %.2f GiB", (double)cached_bytes / 1073741824.0); } else { - fprintf(stderr, "ds4: CUDA loading model tensors %.2f GiB cached\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " loading model tensors %.2f GiB cached\n", (double)cached_bytes / 1073741824.0); } fflush(stderr); @@ -737,13 +749,13 @@ static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size, const double t0 = cuda_wall_sec(); err = cudaMemAdvise(pre_ptr, (size_t)pre_bytes, 
cudaMemAdviseSetReadMostly, loc); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model read-mostly advise skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model read-mostly advise skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } err = cudaMemAdvise(pre_ptr, (size_t)pre_bytes, cudaMemAdviseSetPreferredLocation, loc); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model preferred-location advise skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model preferred-location advise skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -751,7 +763,7 @@ static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size, if (!g_model_prefetch_stream) { err = cudaStreamCreateWithFlags(&g_model_prefetch_stream, cudaStreamNonBlocking); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model prefetch stream creation skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model prefetch stream creation skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -759,21 +771,21 @@ static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size, err = cudaMemPrefetchAsync(pre_ptr, (size_t)pre_bytes, loc, 0, g_model_prefetch_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model prefetch skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model prefetch skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } if (getenv("DS4_CUDA_MODEL_PREFETCH_SYNC") != NULL) { err = cudaStreamSynchronize(g_model_prefetch_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model prefetch sync failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model prefetch sync failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } } const double t1 = 
cuda_wall_sec(); fprintf(stderr, - "ds4: CUDA ATS/HMM prefetch queued %.2f GiB of model tensors in %.3fs\n", + DS4_CUDA_LOG_PREFIX " ATS/HMM prefetch queued %.2f GiB of model tensors in %.3fs\n", (double)map_size / 1073741824.0, t1 - t0); g_model_hmm_direct = 1; @@ -857,7 +869,7 @@ static int cuda_model_stage_pool_alloc(uint64_t bytes) { if (!g_model_upload_stream) { cudaError_t err = cudaStreamCreateWithFlags(&g_model_upload_stream, cudaStreamNonBlocking); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model upload stream creation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model upload stream creation failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -865,14 +877,14 @@ static int cuda_model_stage_pool_alloc(uint64_t bytes) { for (size_t i = 0; i < 4; i++) { cudaError_t err = cudaMallocHost(&g_model_stage_raw[i], (size_t)bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } g_model_stage[i] = cuda_align_ptr(g_model_stage_raw[i], g_model_direct_align); err = cudaEventCreateWithFlags(&g_model_stage_event[i], cudaEventDisableTiming); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model staging event creation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model staging event creation failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -918,7 +930,7 @@ static int cuda_model_stage_read(void *stage, uint64_t stage_bytes, const int direct_errno = errno; if (direct_errno == EINVAL || direct_errno == EFAULT || direct_errno == ENOTSUP || direct_errno == EOPNOTSUPP) { if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA direct model read disabled: %s\n", strerror(direct_errno)); 
+ fprintf(stderr, DS4_CUDA_LOG_PREFIX " direct model read disabled: %s\n", strerror(direct_errno)); } (void)close(g_model_direct_fd); g_model_direct_fd = -1; @@ -985,7 +997,7 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) { void *dev = NULL; cudaError_t err = cudaMalloc(&dev, (size_t)chunk); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model arena alloc failed for %s (%.2f MiB chunk): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model arena alloc failed for %s (%.2f MiB chunk): %s\n", what ? what : "weights", (double)chunk / 1048576.0, cudaGetErrorString(err)); @@ -997,7 +1009,7 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) { if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { uint64_t arena_bytes = 0; for (const cuda_model_arena &a : g_model_arenas) arena_bytes += a.bytes; - fprintf(stderr, "ds4: CUDA model arena allocated %.2f MiB (arenas %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model arena allocated %.2f MiB (arenas %.2f GiB)\n", (double)chunk / 1048576.0, (double)arena_bytes / 1073741824.0); } @@ -1014,7 +1026,7 @@ static const char *cuda_model_range_ptr_from_fd( const uint64_t limit = cuda_model_cache_limit_bytes(); if (g_model_range_bytes > limit || bytes > limit - g_model_range_bytes) { if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA direct %s %.2f MiB (cache budget %.2f GiB exhausted)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " direct %s %.2f MiB (cache budget %.2f GiB exhausted)\n", what ? what : "weights", (double)bytes / 1048576.0, (double)limit / 1073741824.0); @@ -1041,7 +1053,7 @@ static const char *cuda_model_range_ptr_from_fd( if (chunk_idx >= 4u) { err = cudaEventSynchronize(g_model_stage_event[bi]); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model staging wait failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model staging wait failed for %s: %s\n", what ? 
what : "weights", cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -1050,7 +1062,7 @@ static const char *cuda_model_range_ptr_from_fd( const char *payload = NULL; if (!cuda_model_stage_read(g_model_stage[bi], g_model_stage_bytes, offset + copied, n, &payload)) { - fprintf(stderr, "ds4: CUDA model range read failed for %s at %.2f MiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range read failed for %s at %.2f MiB: %s\n", what ? what : "weights", (double)copied / 1048576.0, strerror(errno)); @@ -1059,7 +1071,7 @@ static const char *cuda_model_range_ptr_from_fd( err = cudaMemcpyAsync(dev + copied, payload, (size_t)n, cudaMemcpyHostToDevice, g_model_upload_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model range copy failed for %s at %.2f MiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range copy failed for %s at %.2f MiB: %s\n", what ? what : "weights", (double)copied / 1048576.0, cudaGetErrorString(err)); @@ -1068,7 +1080,7 @@ static const char *cuda_model_range_ptr_from_fd( } err = cudaEventRecord(g_model_stage_event[bi], g_model_upload_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model staging record failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model staging record failed for %s: %s\n", what ? what : "weights", cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -1081,7 +1093,7 @@ static const char *cuda_model_range_ptr_from_fd( } err = cudaStreamSynchronize(g_model_upload_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model range upload sync failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range upload sync failed for %s: %s\n", what ? 
what : "weights", cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -1092,7 +1104,7 @@ static const char *cuda_model_range_ptr_from_fd( g_model_range_bytes += bytes; cuda_model_load_progress_note(g_model_range_bytes); if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA fd-cached %s %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " fd-cached %s %.2f MiB (total %.2f GiB)\n", what ? what : "weights", (double)bytes / 1048576.0, (double)g_model_range_bytes / 1073741824.0); @@ -1114,19 +1126,19 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u const double t0 = cuda_wall_sec(); cudaError_t err = cudaMalloc(&dev, (size_t)model_size); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model allocation skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model allocation skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } - fprintf(stderr, "ds4: CUDA chunk-copying %.2f GiB model image\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " chunk-copying %.2f GiB model image\n", (double)model_size / 1073741824.0); const uint64_t chunk = cuda_model_copy_chunk_bytes(); void *stage = NULL; err = cudaMallocHost(&stage, (size_t)chunk); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); (void)cudaFree(dev); (void)cudaGetLastError(); return 0; @@ -1139,7 +1151,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u memcpy(stage, (const char *)model_map + copied_header, (size_t)n); err = cudaMemcpy((char *)dev + copied_header, stage, (size_t)n, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model header copy failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " 
model header copy failed: %s\n", cudaGetErrorString(err)); (void)cudaFreeHost(stage); (void)cudaFree(dev); (void)cudaGetLastError(); @@ -1157,7 +1169,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u memcpy(stage, (const char *)model_map + off, (size_t)n); err = cudaMemcpy((char *)dev + off, stage, (size_t)n, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model chunk copy failed at %.2f GiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model chunk copy failed at %.2f GiB: %s\n", (double)copied / 1073741824.0, cudaGetErrorString(err)); (void)cudaFreeHost(stage); (void)cudaFree(dev); @@ -1168,7 +1180,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u copied += n; const double now = cuda_wall_sec(); if (getenv("DS4_CUDA_MODEL_COPY_VERBOSE") != NULL && now - last_report >= 2.0) { - fprintf(stderr, "ds4: CUDA model chunk copy %.2f/%.2f GiB\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model chunk copy %.2f/%.2f GiB\n", (double)copied / 1073741824.0, (double)map_size / 1073741824.0); last_report = now; @@ -1181,7 +1193,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u g_model_hmm_direct = 0; const double t1 = cuda_wall_sec(); fprintf(stderr, - "ds4: CUDA model chunk copy complete in %.3fs (%.2f GiB tensors)\n", + DS4_CUDA_LOG_PREFIX " model chunk copy complete in %.3fs (%.2f GiB tensors)\n", t1 - t0, (double)map_size / 1073741824.0); return 1; @@ -1207,7 +1219,7 @@ static void cuda_model_range_release_all(void) { static int cublas_ok(cublasStatus_t st, const char *what) { if (st == CUBLAS_STATUS_SUCCESS) return 1; - fprintf(stderr, "ds4: cuBLAS %s failed: status %d\n", what, (int)st); + fprintf(stderr, "ds4: " DS4_CUDA_BLAS_DISPLAY " %s failed: status %d\n", what, (int)st); return 0; } @@ -1216,8 +1228,14 @@ extern "C" int ds4_gpu_init(void) { if (!cuda_ok(cudaSetDevice(dev), "set device")) return 0; cudaDeviceProp prop; if 
(cudaGetDeviceProperties(&prop, dev) == cudaSuccess) { - fprintf(stderr, "ds4: CUDA backend initialized on %s (sm_%d%d)\n", +#ifdef DS4_ROCM_BUILD + const char *arch = prop.gcnArchName[0] ? prop.gcnArchName : "gfx"; + fprintf(stderr, DS4_CUDA_LOG_PREFIX " backend initialized on %s (%s)\n", + prop.name, arch); +#else + fprintf(stderr, DS4_CUDA_LOG_PREFIX " backend initialized on %s (sm_%d%d)\n", prop.name, prop.major, prop.minor); +#endif } if (!g_cublas_ready) { if (!cublas_ok(cublasCreate(&g_cublas), "create handle")) return 0; @@ -1411,21 +1429,21 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size) const double t0 = clock() / (double)CLOCKS_PER_SEC; cudaError_t err = cudaMalloc(&dev, (size_t)model_size); if (err == cudaSuccess) { - fprintf(stderr, "ds4: CUDA copying %.2f GiB model to device memory\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " copying %.2f GiB model to device memory\n", (double)model_size / 1073741824.0); err = cudaMemcpy(dev, model_map, (size_t)model_size, cudaMemcpyHostToDevice); if (err == cudaSuccess) { g_model_device_base = (const char *)dev; g_model_device_owned = 1; const double t1 = clock() / (double)CLOCKS_PER_SEC; - fprintf(stderr, "ds4: CUDA model copy complete in %.3fs\n", t1 - t0); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model copy complete in %.3fs\n", t1 - t0); return 1; } - fprintf(stderr, "ds4: CUDA model copy failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model copy failed: %s\n", cudaGetErrorString(err)); (void)cudaFree(dev); (void)cudaGetLastError(); } else { - fprintf(stderr, "ds4: CUDA model allocation skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model allocation skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); } } @@ -1438,14 +1456,14 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size) if (err == cudaSuccess && dev) { g_model_device_base = (const char *)dev; 
g_model_registered = 1; - fprintf(stderr, "ds4: CUDA registered %.2f GiB model mapping for device access\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " registered %.2f GiB model mapping for device access\n", (double)model_size / 1073741824.0); } else { - fprintf(stderr, "ds4: CUDA host registration pointer lookup failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " host registration pointer lookup failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); } } else { - fprintf(stderr, "ds4: CUDA host registration skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " host registration skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); } return 1; @@ -1484,11 +1502,11 @@ extern "C" int ds4_gpu_set_model_fd(int fd) { g_model_direct_fd = direct_fd; if (g_model_direct_align < 512) g_model_direct_align = 512; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA model direct I/O enabled (align=%llu)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model direct I/O enabled (align=%llu)\n", (unsigned long long)g_model_direct_align); } } else if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA model direct I/O unavailable: %s\n", strerror(errno)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model direct I/O unavailable: %s\n", strerror(errno)); } } #endif @@ -1524,7 +1542,7 @@ extern "C" int ds4_gpu_cache_q8_f16_range(const void *model_map, uint64_t model_ extern "C" void ds4_gpu_print_memory_report(const char *label) { size_t free_b = 0, total_b = 0; (void)cudaMemGetInfo(&free_b, &total_b); - fprintf(stderr, "ds4: CUDA memory report %s: free %.2f MiB total %.2f MiB\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " memory report %s: free %.2f MiB total %.2f MiB\n", label ? 
label : "", (double)free_b / 1048576.0, (double)total_b / 1048576.0); } @@ -5211,10 +5229,10 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (st == CUBLAS_STATUS_SUCCESS) return 1; - fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st); - cuda_q8_f16_cache_disable_after_failure("cuBLAS f16 matmul failure", + fprintf(stderr, "ds4: " DS4_CUDA_BLAS_DISPLAY " q8 f16 matmul failed: status %d\n", (int)st); + cuda_q8_f16_cache_disable_after_failure("BLAS f16 matmul failure", in_dim * out_dim * sizeof(__half)); - /* The F16 expansion cache is only an optimization. If cuBLAS + /* The F16 expansion cache is only an optimization. If BLAS * rejects the cached path under memory pressure, retry the same * operation through the native Q8 kernels below. */ } @@ -6114,7 +6132,7 @@ extern "C" int ds4_gpu_attention_decode_heads_tensor( head_dim); return cuda_ok(cudaGetLastError(), "attention decode online launch"); } - fprintf(stderr, "ds4: CUDA attention score buffer too small for %u compressed rows\n", n_comp); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " attention score buffer too small for %u compressed rows\n", n_comp); return 0; } dim3 grid(1, n_head, 1); @@ -6168,7 +6186,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads, if (!tmp) return 0; float *scores = tmp; float *out_tmp = (float *)((char *)tmp + out_offset); - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6285,7 +6303,7 @@ static int attention_decode_batch_launch( head_dim); return cuda_ok(cudaGetLastError(), "attention decode online launch"); } - fprintf(stderr, "ds4: CUDA attention score buffer too small for %u compressed rows\n", n_comp); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " attention score buffer too small for %u compressed rows\n", 
n_comp); return 0; } if (!use_comp_mask && n_tokens > 1 && head_dim == 512 && @@ -6538,7 +6556,7 @@ static int attention_prefill_mixed_launch( n_comp, head_dim); if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0; - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -9655,7 +9673,7 @@ static int routed_moe_launch( (void)cudaEventElapsedTime(&ms_sum, prof_ev[5], prof_ev[6]); (void)cudaEventElapsedTime(&ms_total, prof_ev[0], prof_ev[6]); fprintf(stderr, - "ds4: CUDA MoE profile tokens=%u pairs=%u xq=%.3f sort=%.3f gateup=%.3f midq=%.3f down=%.3f sum=%.3f total=%.3f ms\n", + DS4_CUDA_LOG_PREFIX " MoE profile tokens=%u pairs=%u xq=%.3f sort=%.3f gateup=%.3f midq=%.3f down=%.3f sum=%.3f total=%.3f ms\n", n_tokens, pair_count, ms_xq, ms_sort, ms_gate, ms_midq, ms_down, ms_sum, ms_total); } for (uint32_t i = 0; i < 7u; i++) (void)cudaEventDestroy(prof_ev[i]); diff --git a/ds4_server.c b/ds4_server.c index bc8abbbd..53993846 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -7906,8 +7906,8 @@ static void usage(FILE *fp) { " Apply steering after attention outputs. Default: 0\n" " --warm-weights\n" " Touch mapped tensor pages before serving. Slower startup, fewer first-use stalls.\n" - " --metal | --cuda | --cpu | --backend NAME\n" - " Select backend explicitly. Defaults to Metal on macOS and CUDA on CUDA builds.\n" + " --metal | " DS4_GPU_BACKEND_FLAG " | --cpu | --backend NAME\n" + " Select backend explicitly. 
Defaults to Metal on macOS, " DS4_GPU_BACKEND_DISPLAY " in " DS4_GPU_BACKEND_DISPLAY " builds, and CUDA otherwise.\n" "\n" "HTTP API:\n" " --host HOST\n" @@ -7967,10 +7967,13 @@ static void usage(FILE *fp) { static ds4_backend parse_backend_arg(const char *s, const char *arg) { if (!strcmp(s, "metal")) return DS4_BACKEND_METAL; + if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA; +#ifdef DS4_ROCM_BUILD if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA; +#endif if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU; server_log(DS4_LOG_DEFAULT, "ds4-server: invalid %s value: %s", arg, s); - server_log(DS4_LOG_DEFAULT, "ds4-server: valid server backends are: metal, cuda, cpu"); + server_log(DS4_LOG_DEFAULT, "ds4-server: valid server backends are: %s", DS4_GPU_BACKEND_LIST); exit(2); } @@ -8060,7 +8063,7 @@ static server_config parse_options(int argc, char **argv) { c.engine.warm_weights = true; } else if (!strcmp(arg, "--metal")) { c.engine.backend = DS4_BACKEND_METAL; - } else if (!strcmp(arg, "--cuda")) { + } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) { c.engine.backend = DS4_BACKEND_CUDA; } else if (!strcmp(arg, "--backend")) { c.engine.backend = parse_backend_arg(need_arg(&i, argc, argv, arg), arg);