antirez · adv0r · May 15, 2026 · May 15, 2026
diff --git a/Makefile b/Makefile
@@ -12,6 +12,7 @@ OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc
 
 LDLIBS ?= -lm -pthread
 METAL_SRCS := $(wildcard metal/*.metal)
+GPU_EXTRA_DEPS :=
 
 ifeq ($(UNAME_S),Darwin)
 METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
@@ -26,12 +27,14 @@ ROCM_PATH ?= /opt/rocm
 GPU_CC = $(ROCM_PATH)/bin/hipcc
 ROCM_ARCH ?= gfx1151
 
+CFLAGS += -DDS4_ROCM_BUILD
 GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
+GPU_CFLAGS += -DDS4_ROCM_BUILD
 GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas
 
 @echo "ROCM_ARCH: $(ROCM_ARCH)"
 
-EXTRA_DEPS = ds4_rocm.h
+GPU_EXTRA_DEPS += ds4_rocm.h
 
 else
 
@@ -177,7 +180,7 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
 ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
 	$(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m
 
-ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS)
+ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(GPU_EXTRA_DEPS)
 	$(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu
 
 tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
@@ -195,4 +198,3 @@ test: ds4_test
 
 clean:
 	rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o
-
diff --git a/README.md b/README.md
@@ -136,6 +136,7 @@ Then build:
 make                  # macOS Metal
 make cuda-spark       # Linux CUDA, DGX Spark / GB10
 make cuda-generic     # Linux CUDA, other local CUDA GPUs
+make rocm ROCM_ARCH=gfx1151
 make cpu              # CPU-only diagnostics build
 ```
 
@@ -646,11 +647,13 @@ the kv cache files include the verbatim prompt cached.
 
 ## Backends
 
-The default graph backend is Metal on macOS and CUDA in CUDA builds:
+The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in
+ROCm builds:
 
 ```sh
 ./ds4 -p "Hello" --metal
 ./ds4 -p "Hello" --cuda
+./ds4 -p "Hello" --rocm
 ```
 
 On Linux, plain `make` prints the available build targets instead of selecting a
@@ -662,8 +665,13 @@ when cross-building or when you need a known target:
 ```sh
 make cuda CUDA_ARCH=sm_120
 make cuda CUDA_ARCH=native
+make rocm ROCM_ARCH=gfx1151
 ```
 
+In ROCm builds, `--rocm` and `--backend rocm` are the canonical runtime names.
+`--cuda` and `--backend cuda` remain accepted as compatibility aliases because
+the shared GPU implementation still uses the CUDA backend enum internally.
+
 There is also a CPU reference/debug path:
 
 ```sh
@@ -675,8 +683,8 @@ make cpu
 
 Do not treat the CPU path as the production target. The CLI and `ds4-server`
 support the CPU backend for reference/debug use and share the same KV session
-and snapshot format as Metal and CUDA, but normal inference should use Metal or
-CUDA.
+and snapshot format as Metal, CUDA, and ROCm, but normal inference should use
+Metal, CUDA, or ROCm.
 
 ## Steering
 

diff --git a/ds4.c b/ds4.c
@@ -1453,7 +1453,7 @@ static bool accelerator_cache_model_tensors(ds4_backend backend, const ds4_model
         const double t1 = now_sec();
         if (ds4_log_is_tty(stderr)) fputc('\n', stderr);
         fprintf(stderr,
-                "ds4: CUDA startup model cache prepared %.2f GiB of tensor spans in %.3fs\n",
+                "ds4: " DS4_GPU_BACKEND_DISPLAY " startup model cache prepared %.2f GiB of tensor spans in %.3fs\n",
                 (double)cached / 1073741824.0,
                 t1 - t0);
     }
@@ -16995,15 +16995,17 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
 #ifndef DS4_NO_GPU
     if (e->backend == DS4_BACKEND_CUDA) {
 #ifdef __APPLE__
-        fprintf(stderr, "ds4: CUDA backend requested but this build is linked with Metal, not CUDA\n");
+        fprintf(stderr, "ds4: %s backend requested but this build is linked with Metal, not %s\n",
+                DS4_GPU_BACKEND_DISPLAY, DS4_GPU_BACKEND_DISPLAY);
         ds4_engine_close(e);
         *out = NULL;
         return 1;
 #endif
     }
     if (e->backend == DS4_BACKEND_METAL) {
 #ifndef __APPLE__
-        fprintf(stderr, "ds4: Metal backend requested but this build is linked with CUDA, not Metal\n");
+        fprintf(stderr, "ds4: Metal backend requested but this build is linked with %s, not Metal\n",
+                DS4_GPU_BACKEND_DISPLAY);
         ds4_engine_close(e);
         *out = NULL;
         return 1;
@@ -17033,11 +17035,12 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
             *out = NULL;
             return 1;
         }
-        if (e->mtp_ready &&
+        const bool mtp_needs_global_model_map = e->mtp_ready && e->backend != DS4_BACKEND_CUDA;
+        if (mtp_needs_global_model_map &&
             !ds4_gpu_set_model_map_range(e->mtp_model.map,
-                                           e->mtp_model.size,
-                                           e->mtp_model.tensor_data_pos,
-                                           e->mtp_model.size - e->mtp_model.tensor_data_pos))
+                                          e->mtp_model.size,
+                                          e->mtp_model.tensor_data_pos,
+                                          e->mtp_model.size - e->mtp_model.tensor_data_pos))
         {
             fprintf(stderr,
                     "ds4: %s failed to map MTP model views; aborting startup. "
@@ -17047,7 +17050,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
             *out = NULL;
             return 1;
         }
-        if (!e->mtp_ready && !accelerator_cache_model_tensors(e->backend, &e->model)) {
+        if (!accelerator_cache_model_tensors(e->backend, &e->model)) {
             fprintf(stderr, "ds4: %s failed to prepare startup model cache\n",
                     ds4_backend_name(e->backend));
             ds4_engine_close(e);

diff --git a/ds4.h b/ds4.h
@@ -20,6 +20,18 @@ typedef enum {
     DS4_BACKEND_CPU,
 } ds4_backend;
 
+#ifdef DS4_ROCM_BUILD
+#define DS4_GPU_BACKEND_CLI_NAME "rocm"
+#define DS4_GPU_BACKEND_FLAG "--rocm"
+#define DS4_GPU_BACKEND_DISPLAY "ROCm"
+#define DS4_GPU_BACKEND_LIST "metal, rocm, cuda, cpu"
+#else
+#define DS4_GPU_BACKEND_CLI_NAME "cuda"
+#define DS4_GPU_BACKEND_FLAG "--cuda"
+#define DS4_GPU_BACKEND_DISPLAY "CUDA"
+#define DS4_GPU_BACKEND_LIST "metal, cuda, cpu"
+#endif
+
 typedef enum {
     DS4_THINK_NONE,
     DS4_THINK_HIGH,

diff --git a/ds4_bench.c b/ds4_bench.c
@@ -63,8 +63,8 @@ static void usage(FILE *fp) {
         "\n"
         "Model and backend:\n"
         "  -m, --model FILE       GGUF model path. Default: ds4flash.gguf\n"
-        "  --metal | --cuda | --cpu | --backend NAME\n"
-        "      Select backend explicitly. Defaults to Metal on macOS, CUDA elsewhere.\n"
+        "  --metal | " DS4_GPU_BACKEND_FLAG " | --cpu | --backend NAME\n"
+        "      Select backend explicitly. Defaults to Metal on macOS, " DS4_GPU_BACKEND_DISPLAY " in " DS4_GPU_BACKEND_DISPLAY " builds, CUDA elsewhere.\n"
         "  -t, --threads N        CPU helper threads.\n"
         "  --quality              Prefer exact kernels where applicable.\n"
         "  --warm-weights         Touch mapped tensor pages before benchmarking.\n"
@@ -112,10 +112,13 @@ static const char *need_arg(int *i, int argc, char **argv, const char *opt) {
 
 static ds4_backend parse_backend(const char *s, const char *opt) {
     if (!strcmp(s, "metal")) return DS4_BACKEND_METAL;
+    if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA;
+#ifdef DS4_ROCM_BUILD
     if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA;
+#endif
     if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU;
     fprintf(stderr, "ds4-bench: invalid value for %s: %s\n", opt, s);
-    fprintf(stderr, "ds4-bench: valid backends are: metal, cuda, cpu\n");
+    fprintf(stderr, "ds4-bench: valid backends are: %s\n", DS4_GPU_BACKEND_LIST);
     exit(2);
 }
 
@@ -213,7 +216,7 @@ static bench_config parse_options(int argc, char **argv) {
             c.backend = parse_backend(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "--metal")) {
             c.backend = DS4_BACKEND_METAL;
-        } else if (!strcmp(arg, "--cuda")) {
+        } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) {
             c.backend = DS4_BACKEND_CUDA;
         } else if (!strcmp(arg, "--cpu")) {
             c.backend = DS4_BACKEND_CPU;

diff --git a/ds4_cli.c b/ds4_cli.c
@@ -93,12 +93,12 @@ static void usage(FILE *fp) {
         "      Context size allocated for the session. Default: 32768\n"
         "  --metal\n"
         "      Use the Metal graph backend. This is the normal fast path on macOS.\n"
-        "  --cuda\n"
-        "      Use the CUDA graph backend. This is the normal fast path on CUDA builds.\n"
+        "  " DS4_GPU_BACKEND_FLAG "\n"
+        "      Use the " DS4_GPU_BACKEND_DISPLAY " graph backend. This is the normal fast path on " DS4_GPU_BACKEND_DISPLAY " builds.\n"
         "  --cpu\n"
         "      Use the CPU reference/debug backend. Not recommended for normal inference.\n"
         "  --backend NAME\n"
-        "      Select backend explicitly: metal, cuda, or cpu.\n"
+        "      Select backend explicitly: " DS4_GPU_BACKEND_LIST ".\n"
         "  -t, --threads N\n"
         "      CPU helper threads for host-side or reference work.\n"
         "  --quality\n"
@@ -223,10 +223,13 @@ static float parse_float_range(const char *s, const char *opt, float min, float
 
 static ds4_backend parse_backend(const char *s) {
     if (!strcmp(s, "metal")) return DS4_BACKEND_METAL;
+    if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA;
+#ifdef DS4_ROCM_BUILD
     if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA;
+#endif
     if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU;
     fprintf(stderr, "ds4: invalid backend: %s\n", s);
-    fprintf(stderr, "ds4: valid backends are: metal, cuda, cpu\n");
+    fprintf(stderr, "ds4: valid backends are: %s\n", DS4_GPU_BACKEND_LIST);
     exit(2);
 }
 
@@ -1260,7 +1263,7 @@ static cli_config parse_options(int argc, char **argv) {
             c.engine.backend = DS4_BACKEND_CPU;
         } else if (!strcmp(arg, "--metal")) {
             c.engine.backend = DS4_BACKEND_METAL;
-        } else if (!strcmp(arg, "--cuda")) {
+        } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) {
             c.engine.backend = DS4_BACKEND_CUDA;
         } else if (!strcmp(arg, "--dump-tokens")) {
             c.gen.dump_tokens = true;