diff --git a/Makefile b/Makefile index 005bb45e..e20b1ce8 100644 --- a/Makefile +++ b/Makefile @@ -12,6 +12,7 @@ OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc LDLIBS ?= -lm -pthread METAL_SRCS := $(wildcard metal/*.metal) +GPU_EXTRA_DEPS := ifeq ($(UNAME_S),Darwin) METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal @@ -26,12 +27,14 @@ ROCM_PATH ?= /opt/rocm GPU_CC = $(ROCM_PATH)/bin/hipcc ROCM_ARCH ?= gfx1151 +CFLAGS += -DDS4_ROCM_BUILD GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH) +GPU_CFLAGS += -DDS4_ROCM_BUILD GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas -@echo "ROCM_ARCH: $(ROCM_ARCH)" +$(info ROCM_ARCH: $(ROCM_ARCH)) -EXTRA_DEPS = ds4_rocm.h +GPU_EXTRA_DEPS += ds4_rocm.h else @@ -177,7 +180,7 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS) $(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m -ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS) +ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(GPU_EXTRA_DEPS) $(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o @@ -195,4 +198,3 @@ test: ds4_test clean: rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o - diff --git a/README.md b/README.md index 25af9206..edda84b6 100644 --- a/README.md +++ b/README.md @@ -136,6 +136,7 @@ Then build: make # macOS Metal make cuda-spark # Linux CUDA, DGX Spark / GB10 make cuda-generic # Linux CUDA, other local CUDA GPUs +make rocm ROCM_ARCH=gfx1151 # Linux ROCm, AMD GPUs make cpu # CPU-only diagnostics build ``` @@ -646,11 +647,13 @@ the kv cache files include the verbatim prompt cached. 
## Backends -The default graph backend is Metal on macOS and CUDA in CUDA builds: +The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in +ROCm builds: ```sh ./ds4 -p "Hello" --metal ./ds4 -p "Hello" --cuda +./ds4 -p "Hello" --rocm ``` On Linux, plain `make` prints the available build targets instead of selecting a @@ -662,8 +665,13 @@ when cross-building or when you need a known target: ```sh make cuda CUDA_ARCH=sm_120 make cuda CUDA_ARCH=native +make rocm ROCM_ARCH=gfx1151 ``` +In ROCm builds, `--rocm` and `--backend rocm` are the canonical runtime names. +`--cuda` and `--backend cuda` remain accepted as compatibility aliases because +the shared GPU implementation still uses the CUDA backend enum internally. + There is also a CPU reference/debug path: ```sh @@ -675,8 +683,8 @@ make cpu Do not treat the CPU path as the production target. The CLI and `ds4-server` support the CPU backend for reference/debug use and share the same KV session -and snapshot format as Metal and CUDA, but normal inference should use Metal or -CUDA. +and snapshot format as Metal, CUDA, and ROCm, but normal inference should use +Metal, CUDA, or ROCm. 
## Steering diff --git a/ds4.c b/ds4.c index 51410e33..39d760bb 100644 --- a/ds4.c +++ b/ds4.c @@ -1453,7 +1453,7 @@ static bool accelerator_cache_model_tensors(ds4_backend backend, const ds4_model const double t1 = now_sec(); if (ds4_log_is_tty(stderr)) fputc('\n', stderr); fprintf(stderr, - "ds4: CUDA startup model cache prepared %.2f GiB of tensor spans in %.3fs\n", + "ds4: " DS4_GPU_BACKEND_DISPLAY " startup model cache prepared %.2f GiB of tensor spans in %.3fs\n", (double)cached / 1073741824.0, t1 - t0); } @@ -16995,7 +16995,8 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { #ifndef DS4_NO_GPU if (e->backend == DS4_BACKEND_CUDA) { #ifdef __APPLE__ - fprintf(stderr, "ds4: CUDA backend requested but this build is linked with Metal, not CUDA\n"); + fprintf(stderr, "ds4: %s backend requested but this build is linked with Metal, not %s\n", + DS4_GPU_BACKEND_DISPLAY, DS4_GPU_BACKEND_DISPLAY); ds4_engine_close(e); *out = NULL; return 1; @@ -17003,7 +17004,8 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { } if (e->backend == DS4_BACKEND_METAL) { #ifndef __APPLE__ - fprintf(stderr, "ds4: Metal backend requested but this build is linked with CUDA, not Metal\n"); + fprintf(stderr, "ds4: Metal backend requested but this build is linked with %s, not Metal\n", + DS4_GPU_BACKEND_DISPLAY); ds4_engine_close(e); *out = NULL; return 1; @@ -17033,11 +17035,12 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { *out = NULL; return 1; } - if (e->mtp_ready && + const bool mtp_needs_global_model_map = e->mtp_ready && e->backend != DS4_BACKEND_CUDA; + if (mtp_needs_global_model_map && !ds4_gpu_set_model_map_range(e->mtp_model.map, - e->mtp_model.size, - e->mtp_model.tensor_data_pos, - e->mtp_model.size - e->mtp_model.tensor_data_pos)) + e->mtp_model.size, + e->mtp_model.tensor_data_pos, + e->mtp_model.size - e->mtp_model.tensor_data_pos)) { fprintf(stderr, "ds4: %s failed to map MTP model views; aborting 
startup. " @@ -17047,7 +17050,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) { *out = NULL; return 1; } - if (!e->mtp_ready && !accelerator_cache_model_tensors(e->backend, &e->model)) { + if (!accelerator_cache_model_tensors(e->backend, &e->model)) { fprintf(stderr, "ds4: %s failed to prepare startup model cache\n", ds4_backend_name(e->backend)); ds4_engine_close(e); diff --git a/ds4.h b/ds4.h index 950d8dca..2916070f 100644 --- a/ds4.h +++ b/ds4.h @@ -20,6 +20,18 @@ typedef enum { DS4_BACKEND_CPU, } ds4_backend; +#ifdef DS4_ROCM_BUILD +#define DS4_GPU_BACKEND_CLI_NAME "rocm" +#define DS4_GPU_BACKEND_FLAG "--rocm" +#define DS4_GPU_BACKEND_DISPLAY "ROCm" +#define DS4_GPU_BACKEND_LIST "metal, rocm, cuda, cpu" +#else +#define DS4_GPU_BACKEND_CLI_NAME "cuda" +#define DS4_GPU_BACKEND_FLAG "--cuda" +#define DS4_GPU_BACKEND_DISPLAY "CUDA" +#define DS4_GPU_BACKEND_LIST "metal, cuda, cpu" +#endif + typedef enum { DS4_THINK_NONE, DS4_THINK_HIGH, diff --git a/ds4_bench.c b/ds4_bench.c index 027b2b31..c7817f1e 100644 --- a/ds4_bench.c +++ b/ds4_bench.c @@ -63,8 +63,8 @@ static void usage(FILE *fp) { "\n" "Model and backend:\n" " -m, --model FILE GGUF model path. Default: ds4flash.gguf\n" - " --metal | --cuda | --cpu | --backend NAME\n" - " Select backend explicitly. Defaults to Metal on macOS, CUDA elsewhere.\n" + " --metal | " DS4_GPU_BACKEND_FLAG " | --cpu | --backend NAME\n" + " Select backend explicitly. 
Defaults to Metal on macOS, " DS4_GPU_BACKEND_DISPLAY " in " DS4_GPU_BACKEND_DISPLAY " builds, CUDA elsewhere.\n" " -t, --threads N CPU helper threads.\n" " --quality Prefer exact kernels where applicable.\n" " --warm-weights Touch mapped tensor pages before benchmarking.\n" @@ -112,10 +112,13 @@ static const char *need_arg(int *i, int argc, char **argv, const char *opt) { static ds4_backend parse_backend(const char *s, const char *opt) { if (!strcmp(s, "metal")) return DS4_BACKEND_METAL; + if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA; +#ifdef DS4_ROCM_BUILD if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA; +#endif if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU; fprintf(stderr, "ds4-bench: invalid value for %s: %s\n", opt, s); - fprintf(stderr, "ds4-bench: valid backends are: metal, cuda, cpu\n"); + fprintf(stderr, "ds4-bench: valid backends are: %s\n", DS4_GPU_BACKEND_LIST); exit(2); } @@ -213,7 +216,7 @@ static bench_config parse_options(int argc, char **argv) { c.backend = parse_backend(need_arg(&i, argc, argv, arg), arg); } else if (!strcmp(arg, "--metal")) { c.backend = DS4_BACKEND_METAL; - } else if (!strcmp(arg, "--cuda")) { + } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) { c.backend = DS4_BACKEND_CUDA; } else if (!strcmp(arg, "--cpu")) { c.backend = DS4_BACKEND_CPU; diff --git a/ds4_cli.c b/ds4_cli.c index bc70e659..f58dcae1 100644 --- a/ds4_cli.c +++ b/ds4_cli.c @@ -93,12 +93,12 @@ static void usage(FILE *fp) { " Context size allocated for the session. Default: 32768\n" " --metal\n" " Use the Metal graph backend. This is the normal fast path on macOS.\n" - " --cuda\n" - " Use the CUDA graph backend. This is the normal fast path on CUDA builds.\n" + " " DS4_GPU_BACKEND_FLAG "\n" + " Use the " DS4_GPU_BACKEND_DISPLAY " graph backend. This is the normal fast path on " DS4_GPU_BACKEND_DISPLAY " builds.\n" " --cpu\n" " Use the CPU reference/debug backend. 
Not recommended for normal inference.\n" " --backend NAME\n" - " Select backend explicitly: metal, cuda, or cpu.\n" + " Select backend explicitly: " DS4_GPU_BACKEND_LIST ".\n" " -t, --threads N\n" " CPU helper threads for host-side or reference work.\n" " --quality\n" @@ -223,10 +223,13 @@ static float parse_float_range(const char *s, const char *opt, float min, float static ds4_backend parse_backend(const char *s) { if (!strcmp(s, "metal")) return DS4_BACKEND_METAL; + if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA; +#ifdef DS4_ROCM_BUILD if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA; +#endif if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU; fprintf(stderr, "ds4: invalid backend: %s\n", s); - fprintf(stderr, "ds4: valid backends are: metal, cuda, cpu\n"); + fprintf(stderr, "ds4: valid backends are: %s\n", DS4_GPU_BACKEND_LIST); exit(2); } @@ -1260,7 +1263,7 @@ static cli_config parse_options(int argc, char **argv) { c.engine.backend = DS4_BACKEND_CPU; } else if (!strcmp(arg, "--metal")) { c.engine.backend = DS4_BACKEND_METAL; - } else if (!strcmp(arg, "--cuda")) { + } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) { c.engine.backend = DS4_BACKEND_CUDA; } else if (!strcmp(arg, "--dump-tokens")) { c.gen.dump_tokens = true; diff --git a/ds4_cuda.cu b/ds4_cuda.cu index 8b6241ca..1481c220 100644 --- a/ds4_cuda.cu +++ b/ds4_cuda.cu @@ -34,6 +34,16 @@ #define CUDA_QK_K 256 #define DS4_CUDA_UNUSED __attribute__((unused)) +#ifdef DS4_ROCM_BUILD +#define DS4_CUDA_BACKEND_DISPLAY "ROCm" +#define DS4_CUDA_BLAS_DISPLAY "hipBLAS" +#else +#define DS4_CUDA_BACKEND_DISPLAY "CUDA" +#define DS4_CUDA_BLAS_DISPLAY "cuBLAS" +#endif + +#define DS4_CUDA_LOG_PREFIX "ds4: " DS4_CUDA_BACKEND_DISPLAY + enum { /* attention_decode_mixed_kernel stores raw-window scores plus visible * compressed scores in shared memory. 
The host routes larger unmasked @@ -185,7 +195,7 @@ static void *cuda_tmp_alloc(uint64_t bytes, const char *what) { void *ptr = NULL; cudaError_t err = cudaMalloc(&ptr, (size_t)bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA temp alloc failed for %s (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " temp alloc failed for %s (%.2f MiB): %s\n", what ? what : "scratch", (double)bytes / 1048576.0, cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -206,7 +216,9 @@ static const char *cuda_model_ptr(const void *model_map, uint64_t offset) { static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, uint64_t bytes, const char *what) { if (bytes == 0) return cuda_model_ptr(model_map, offset); - if (g_model_device_owned || g_model_registered) return cuda_model_ptr(model_map, offset); + if ((g_model_device_owned || g_model_registered) && model_map == g_model_host_base) { + return cuda_model_ptr(model_map, offset); + } if (g_model_hmm_direct && getenv("DS4_CUDA_WEIGHT_CACHE") == NULL && getenv("DS4_CUDA_WEIGHT_PRELOAD") == NULL) { @@ -258,13 +270,13 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, g_model_ranges.push_back({model_map, offset, bytes, dev_ptr, (void *)reg_addr, (char *)reg_dev, reg_bytes, 1, 0}); g_model_range_by_offset[offset] = g_model_ranges.size() - 1u; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA mapped %s %.2f MiB\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " mapped %s %.2f MiB\n", what ? what : "weights", (double)bytes / 1048576.0); } return dev_ptr; } - fprintf(stderr, "ds4: CUDA model range map pointer failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range map pointer failed for %s: %s\n", what ? 
what : "weights", cudaGetErrorString(err)); (void)cudaHostUnregister((void *)reg_addr); (void)cudaGetLastError(); @@ -278,7 +290,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, err = cudaMalloc(&dev, (size_t)bytes); if (err != cudaSuccess) { (void)cudaGetLastError(); - fprintf(stderr, "ds4: CUDA model range alloc failed for %s (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range alloc failed for %s (%.2f MiB): %s\n", what ? what : "weights", (double)bytes / 1048576.0, cudaGetErrorString(err)); return NULL; } @@ -289,7 +301,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, uint64_t n = bytes - done < chunk ? bytes - done : chunk; err = cudaMemcpy((char *)dev + done, src + done, (size_t)n, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model range copy failed for %s at %.2f/%.2f MiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range copy failed for %s at %.2f/%.2f MiB: %s\n", what ? what : "weights", (double)done / 1048576.0, (double)bytes / 1048576.0, @@ -303,7 +315,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, g_model_range_by_offset[offset] = g_model_ranges.size() - 1u; g_model_range_bytes += bytes; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA cached %s %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " cached %s %.2f MiB (total %.2f GiB)\n", what ? 
what : "weights", (double)bytes / 1048576.0, (double)g_model_range_bytes / 1073741824.0); @@ -313,7 +325,7 @@ static const char *cuda_model_range_ptr(const void *model_map, uint64_t offset, static int cuda_model_range_is_cached(const void *model_map, uint64_t offset, uint64_t bytes) { if (bytes == 0) return 1; - if (g_model_device_owned || g_model_registered) return 1; + if ((g_model_device_owned || g_model_registered) && model_map == g_model_host_base) return 1; const uint64_t end = offset + bytes; if (end < offset) return 0; @@ -393,7 +405,7 @@ static void cuda_q8_f16_cache_budget_notice( g_q8_f16_budget_notice_printed = 1; if (limit_bytes != UINT64_MAX && free_bytes == 0 && total_bytes == 0 && reserve_bytes == 0) { fprintf(stderr, - "ds4: CUDA q8 fp16 cache %s; using q8 kernels " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache %s; using q8 kernels " "(request=%.2f MiB cached=%.2f GiB limit=%.2f GiB)\n", reason, (double)request_bytes / 1048576.0, @@ -401,7 +413,7 @@ static void cuda_q8_f16_cache_budget_notice( (double)limit_bytes / 1073741824.0); } else if (limit_bytes == UINT64_MAX) { fprintf(stderr, - "ds4: CUDA q8 fp16 cache %s; using q8 kernels " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache %s; using q8 kernels " "(request=%.2f MiB cached=%.2f GiB free=%.2f GiB reserve=%.2f GiB total=%.2f GiB)\n", reason, (double)request_bytes / 1048576.0, @@ -411,7 +423,7 @@ static void cuda_q8_f16_cache_budget_notice( (double)total_bytes / 1073741824.0); } else { fprintf(stderr, - "ds4: CUDA q8 fp16 cache %s; using q8 kernels " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache %s; using q8 kernels " "(request=%.2f MiB cached=%.2f GiB limit=%.2f GiB free=%.2f GiB reserve=%.2f GiB total=%.2f GiB)\n", reason, (double)request_bytes / 1048576.0, @@ -436,7 +448,7 @@ static int cuda_q8_f16_cache_has_budget(uint64_t request_bytes, const char *labe size_t total_b = 0; cudaError_t err = cudaMemGetInfo(&free_b, &total_b); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA q8 fp16 cache memory query failed: 
%s; using q8 kernels\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " q8 fp16 cache memory query failed: %s; using q8 kernels\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; @@ -458,7 +470,7 @@ static int cuda_q8_f16_cache_has_budget(uint64_t request_bytes, const char *labe static void cuda_q8_f16_cache_disable_after_failure(const char *what, uint64_t request_bytes) { if (!g_q8_f16_disabled_after_oom) { fprintf(stderr, - "ds4: CUDA q8 fp16 cache disabled after %s " + DS4_CUDA_LOG_PREFIX " q8 fp16 cache disabled after %s " "(request=%.2f MiB cached=%.2f GiB); using q8 kernels\n", what ? what : "allocation failure", (double)request_bytes / 1048576.0, @@ -559,7 +571,7 @@ static const __half *cuda_q8_f16_ptr( __half *dev = NULL; cudaError_t err = cudaMalloc(&dev, (size_t)out_bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA q8 fp16 cache alloc failed (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " q8 fp16 cache alloc failed (%.2f MiB): %s\n", (double)out_bytes / 1048576.0, cudaGetErrorString(err)); cuda_q8_f16_cache_disable_after_failure("allocation failure", out_bytes); return NULL; @@ -580,7 +592,7 @@ static const __half *cuda_q8_f16_ptr( g_q8_f16_by_offset[offset] = g_q8_f16_ranges.size() - 1u; g_q8_f16_bytes += out_bytes; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA cached q8 fp16 %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " cached q8 fp16 %.2f MiB (total %.2f GiB)\n", (double)out_bytes / 1048576.0, (double)g_q8_f16_bytes / 1073741824.0); } @@ -611,7 +623,7 @@ static float *cuda_q8_f32_ptr( float *dev = NULL; cudaError_t err = cudaMalloc(&dev, (size_t)out_bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA q8 fp32 cache alloc failed (%.2f MiB): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " q8 fp32 cache alloc failed (%.2f MiB): %s\n", (double)out_bytes / 1048576.0, cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -631,7 +643,7 @@ static float 
*cuda_q8_f32_ptr( g_q8_f32_by_offset[offset] = g_q8_f32_ranges.size() - 1u; g_q8_f32_bytes += out_bytes; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA cached q8 fp32 %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " cached q8 fp32 %.2f MiB (total %.2f GiB)\n", (double)out_bytes / 1048576.0, (double)g_q8_f32_bytes / 1073741824.0); } @@ -640,7 +652,7 @@ static float *cuda_q8_f32_ptr( static int cuda_ok(cudaError_t err, const char *what) { if (err == cudaSuccess) return 1; - fprintf(stderr, "ds4: CUDA %s failed: %s\n", what, cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " %s failed: %s\n", what, cudaGetErrorString(err)); return 0; } @@ -673,9 +685,9 @@ static void cuda_model_load_progress_note(uint64_t cached_bytes) { 1024ull * 1024ull * 1024ull; g_model_load_progress_last = now; if (g_model_load_progress_tty) { - fprintf(stderr, "ds4: CUDA loading model tensors into device cache: 0.00 GiB"); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " loading model tensors into device cache: 0.00 GiB"); } else { - fprintf(stderr, "ds4: CUDA loading model tensors into device cache\n"); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " loading model tensors into device cache\n"); } } @@ -685,10 +697,10 @@ static void cuda_model_load_progress_note(uint64_t cached_bytes) { } if (g_model_load_progress_tty) { - fprintf(stderr, "\rds4: CUDA loading model tensors into device cache: %.2f GiB", + fprintf(stderr, "\r" DS4_CUDA_LOG_PREFIX " loading model tensors into device cache: %.2f GiB", (double)cached_bytes / 1073741824.0); } else { - fprintf(stderr, "ds4: CUDA loading model tensors %.2f GiB cached\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " loading model tensors %.2f GiB cached\n", (double)cached_bytes / 1073741824.0); } fflush(stderr); @@ -737,13 +749,13 @@ static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size, const double t0 = cuda_wall_sec(); err = cudaMemAdvise(pre_ptr, (size_t)pre_bytes, 
cudaMemAdviseSetReadMostly, loc); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model read-mostly advise skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model read-mostly advise skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } err = cudaMemAdvise(pre_ptr, (size_t)pre_bytes, cudaMemAdviseSetPreferredLocation, loc); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model preferred-location advise skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model preferred-location advise skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -751,7 +763,7 @@ static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size, if (!g_model_prefetch_stream) { err = cudaStreamCreateWithFlags(&g_model_prefetch_stream, cudaStreamNonBlocking); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model prefetch stream creation skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model prefetch stream creation skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -759,21 +771,21 @@ static int cuda_model_prefetch_range(const void *model_map, uint64_t model_size, err = cudaMemPrefetchAsync(pre_ptr, (size_t)pre_bytes, loc, 0, g_model_prefetch_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model prefetch skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model prefetch skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } if (getenv("DS4_CUDA_MODEL_PREFETCH_SYNC") != NULL) { err = cudaStreamSynchronize(g_model_prefetch_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model prefetch sync failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model prefetch sync failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } } const double t1 = 
cuda_wall_sec(); fprintf(stderr, - "ds4: CUDA ATS/HMM prefetch queued %.2f GiB of model tensors in %.3fs\n", + DS4_CUDA_LOG_PREFIX " ATS/HMM prefetch queued %.2f GiB of model tensors in %.3fs\n", (double)map_size / 1073741824.0, t1 - t0); g_model_hmm_direct = 1; @@ -857,7 +869,7 @@ static int cuda_model_stage_pool_alloc(uint64_t bytes) { if (!g_model_upload_stream) { cudaError_t err = cudaStreamCreateWithFlags(&g_model_upload_stream, cudaStreamNonBlocking); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model upload stream creation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model upload stream creation failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -865,14 +877,14 @@ static int cuda_model_stage_pool_alloc(uint64_t bytes) { for (size_t i = 0; i < 4; i++) { cudaError_t err = cudaMallocHost(&g_model_stage_raw[i], (size_t)bytes); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } g_model_stage[i] = cuda_align_ptr(g_model_stage_raw[i], g_model_direct_align); err = cudaEventCreateWithFlags(&g_model_stage_event[i], cudaEventDisableTiming); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model staging event creation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model staging event creation failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } @@ -918,7 +930,7 @@ static int cuda_model_stage_read(void *stage, uint64_t stage_bytes, const int direct_errno = errno; if (direct_errno == EINVAL || direct_errno == EFAULT || direct_errno == ENOTSUP || direct_errno == EOPNOTSUPP) { if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA direct model read disabled: %s\n", strerror(direct_errno)); 
+ fprintf(stderr, DS4_CUDA_LOG_PREFIX " direct model read disabled: %s\n", strerror(direct_errno)); } (void)close(g_model_direct_fd); g_model_direct_fd = -1; @@ -985,7 +997,7 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) { void *dev = NULL; cudaError_t err = cudaMalloc(&dev, (size_t)chunk); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model arena alloc failed for %s (%.2f MiB chunk): %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model arena alloc failed for %s (%.2f MiB chunk): %s\n", what ? what : "weights", (double)chunk / 1048576.0, cudaGetErrorString(err)); @@ -997,7 +1009,7 @@ static char *cuda_model_arena_alloc(uint64_t bytes, const char *what) { if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { uint64_t arena_bytes = 0; for (const cuda_model_arena &a : g_model_arenas) arena_bytes += a.bytes; - fprintf(stderr, "ds4: CUDA model arena allocated %.2f MiB (arenas %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model arena allocated %.2f MiB (arenas %.2f GiB)\n", (double)chunk / 1048576.0, (double)arena_bytes / 1073741824.0); } @@ -1014,7 +1026,7 @@ static const char *cuda_model_range_ptr_from_fd( const uint64_t limit = cuda_model_cache_limit_bytes(); if (g_model_range_bytes > limit || bytes > limit - g_model_range_bytes) { if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA direct %s %.2f MiB (cache budget %.2f GiB exhausted)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " direct %s %.2f MiB (cache budget %.2f GiB exhausted)\n", what ? what : "weights", (double)bytes / 1048576.0, (double)limit / 1073741824.0); @@ -1041,7 +1053,7 @@ static const char *cuda_model_range_ptr_from_fd( if (chunk_idx >= 4u) { err = cudaEventSynchronize(g_model_stage_event[bi]); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model staging wait failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model staging wait failed for %s: %s\n", what ? 
what : "weights", cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -1050,7 +1062,7 @@ static const char *cuda_model_range_ptr_from_fd( const char *payload = NULL; if (!cuda_model_stage_read(g_model_stage[bi], g_model_stage_bytes, offset + copied, n, &payload)) { - fprintf(stderr, "ds4: CUDA model range read failed for %s at %.2f MiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range read failed for %s at %.2f MiB: %s\n", what ? what : "weights", (double)copied / 1048576.0, strerror(errno)); @@ -1059,7 +1071,7 @@ static const char *cuda_model_range_ptr_from_fd( err = cudaMemcpyAsync(dev + copied, payload, (size_t)n, cudaMemcpyHostToDevice, g_model_upload_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model range copy failed for %s at %.2f MiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range copy failed for %s at %.2f MiB: %s\n", what ? what : "weights", (double)copied / 1048576.0, cudaGetErrorString(err)); @@ -1068,7 +1080,7 @@ static const char *cuda_model_range_ptr_from_fd( } err = cudaEventRecord(g_model_stage_event[bi], g_model_upload_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model staging record failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model staging record failed for %s: %s\n", what ? what : "weights", cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -1081,7 +1093,7 @@ static const char *cuda_model_range_ptr_from_fd( } err = cudaStreamSynchronize(g_model_upload_stream); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model range upload sync failed for %s: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model range upload sync failed for %s: %s\n", what ? 
what : "weights", cudaGetErrorString(err)); (void)cudaGetLastError(); return NULL; @@ -1092,7 +1104,7 @@ static const char *cuda_model_range_ptr_from_fd( g_model_range_bytes += bytes; cuda_model_load_progress_note(g_model_range_bytes); if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA fd-cached %s %.2f MiB (total %.2f GiB)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " fd-cached %s %.2f MiB (total %.2f GiB)\n", what ? what : "weights", (double)bytes / 1048576.0, (double)g_model_range_bytes / 1073741824.0); @@ -1114,19 +1126,19 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u const double t0 = cuda_wall_sec(); cudaError_t err = cudaMalloc(&dev, (size_t)model_size); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model allocation skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model allocation skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); return 0; } - fprintf(stderr, "ds4: CUDA chunk-copying %.2f GiB model image\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " chunk-copying %.2f GiB model image\n", (double)model_size / 1073741824.0); const uint64_t chunk = cuda_model_copy_chunk_bytes(); void *stage = NULL; err = cudaMallocHost(&stage, (size_t)chunk); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " pinned model staging allocation failed: %s\n", cudaGetErrorString(err)); (void)cudaFree(dev); (void)cudaGetLastError(); return 0; @@ -1139,7 +1151,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u memcpy(stage, (const char *)model_map + copied_header, (size_t)n); err = cudaMemcpy((char *)dev + copied_header, stage, (size_t)n, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model header copy failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " 
model header copy failed: %s\n", cudaGetErrorString(err)); (void)cudaFreeHost(stage); (void)cudaFree(dev); (void)cudaGetLastError(); @@ -1157,7 +1169,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u memcpy(stage, (const char *)model_map + off, (size_t)n); err = cudaMemcpy((char *)dev + off, stage, (size_t)n, cudaMemcpyHostToDevice); if (err != cudaSuccess) { - fprintf(stderr, "ds4: CUDA model chunk copy failed at %.2f GiB: %s\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model chunk copy failed at %.2f GiB: %s\n", (double)copied / 1073741824.0, cudaGetErrorString(err)); (void)cudaFreeHost(stage); (void)cudaFree(dev); @@ -1168,7 +1180,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u copied += n; const double now = cuda_wall_sec(); if (getenv("DS4_CUDA_MODEL_COPY_VERBOSE") != NULL && now - last_report >= 2.0) { - fprintf(stderr, "ds4: CUDA model chunk copy %.2f/%.2f GiB\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model chunk copy %.2f/%.2f GiB\n", (double)copied / 1073741824.0, (double)map_size / 1073741824.0); last_report = now; @@ -1181,7 +1193,7 @@ static int cuda_model_copy_chunked(const void *model_map, uint64_t model_size, u g_model_hmm_direct = 0; const double t1 = cuda_wall_sec(); fprintf(stderr, - "ds4: CUDA model chunk copy complete in %.3fs (%.2f GiB tensors)\n", + DS4_CUDA_LOG_PREFIX " model chunk copy complete in %.3fs (%.2f GiB tensors)\n", t1 - t0, (double)map_size / 1073741824.0); return 1; @@ -1207,7 +1219,7 @@ static void cuda_model_range_release_all(void) { static int cublas_ok(cublasStatus_t st, const char *what) { if (st == CUBLAS_STATUS_SUCCESS) return 1; - fprintf(stderr, "ds4: cuBLAS %s failed: status %d\n", what, (int)st); + fprintf(stderr, "ds4: " DS4_CUDA_BLAS_DISPLAY " %s failed: status %d\n", what, (int)st); return 0; } @@ -1216,8 +1228,14 @@ extern "C" int ds4_gpu_init(void) { if (!cuda_ok(cudaSetDevice(dev), "set device")) return 0; cudaDeviceProp prop; if 
(cudaGetDeviceProperties(&prop, dev) == cudaSuccess) { - fprintf(stderr, "ds4: CUDA backend initialized on %s (sm_%d%d)\n", +#ifdef DS4_ROCM_BUILD + const char *arch = prop.gcnArchName[0] ? prop.gcnArchName : "gfx"; + fprintf(stderr, DS4_CUDA_LOG_PREFIX " backend initialized on %s (%s)\n", + prop.name, arch); +#else + fprintf(stderr, DS4_CUDA_LOG_PREFIX " backend initialized on %s (sm_%d%d)\n", prop.name, prop.major, prop.minor); +#endif } if (!g_cublas_ready) { if (!cublas_ok(cublasCreate(&g_cublas), "create handle")) return 0; @@ -1411,21 +1429,21 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size) const double t0 = clock() / (double)CLOCKS_PER_SEC; cudaError_t err = cudaMalloc(&dev, (size_t)model_size); if (err == cudaSuccess) { - fprintf(stderr, "ds4: CUDA copying %.2f GiB model to device memory\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " copying %.2f GiB model to device memory\n", (double)model_size / 1073741824.0); err = cudaMemcpy(dev, model_map, (size_t)model_size, cudaMemcpyHostToDevice); if (err == cudaSuccess) { g_model_device_base = (const char *)dev; g_model_device_owned = 1; const double t1 = clock() / (double)CLOCKS_PER_SEC; - fprintf(stderr, "ds4: CUDA model copy complete in %.3fs\n", t1 - t0); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model copy complete in %.3fs\n", t1 - t0); return 1; } - fprintf(stderr, "ds4: CUDA model copy failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model copy failed: %s\n", cudaGetErrorString(err)); (void)cudaFree(dev); (void)cudaGetLastError(); } else { - fprintf(stderr, "ds4: CUDA model allocation skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model allocation skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); } } @@ -1438,14 +1456,14 @@ extern "C" int ds4_gpu_set_model_map(const void *model_map, uint64_t model_size) if (err == cudaSuccess && dev) { g_model_device_base = (const char *)dev; 
g_model_registered = 1; - fprintf(stderr, "ds4: CUDA registered %.2f GiB model mapping for device access\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " registered %.2f GiB model mapping for device access\n", (double)model_size / 1073741824.0); } else { - fprintf(stderr, "ds4: CUDA host registration pointer lookup failed: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " host registration pointer lookup failed: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); } } else { - fprintf(stderr, "ds4: CUDA host registration skipped: %s\n", cudaGetErrorString(err)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " host registration skipped: %s\n", cudaGetErrorString(err)); (void)cudaGetLastError(); } return 1; @@ -1484,11 +1502,11 @@ extern "C" int ds4_gpu_set_model_fd(int fd) { g_model_direct_fd = direct_fd; if (g_model_direct_align < 512) g_model_direct_align = 512; if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA model direct I/O enabled (align=%llu)\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model direct I/O enabled (align=%llu)\n", (unsigned long long)g_model_direct_align); } } else if (getenv("DS4_CUDA_WEIGHT_CACHE_VERBOSE")) { - fprintf(stderr, "ds4: CUDA model direct I/O unavailable: %s\n", strerror(errno)); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " model direct I/O unavailable: %s\n", strerror(errno)); } } #endif @@ -1524,7 +1542,7 @@ extern "C" int ds4_gpu_cache_q8_f16_range(const void *model_map, uint64_t model_ extern "C" void ds4_gpu_print_memory_report(const char *label) { size_t free_b = 0, total_b = 0; (void)cudaMemGetInfo(&free_b, &total_b); - fprintf(stderr, "ds4: CUDA memory report %s: free %.2f MiB total %.2f MiB\n", + fprintf(stderr, DS4_CUDA_LOG_PREFIX " memory report %s: free %.2f MiB total %.2f MiB\n", label ? 
label : "", (double)free_b / 1048576.0, (double)total_b / 1048576.0); } @@ -5211,10 +5229,10 @@ static int cuda_matmul_q8_0_tensor_labeled(ds4_gpu_tensor *out, const void *mode CUBLAS_COMPUTE_32F, CUBLAS_GEMM_DEFAULT); if (st == CUBLAS_STATUS_SUCCESS) return 1; - fprintf(stderr, "ds4: cuBLAS q8 f16 matmul failed: status %d\n", (int)st); - cuda_q8_f16_cache_disable_after_failure("cuBLAS f16 matmul failure", + fprintf(stderr, "ds4: " DS4_CUDA_BLAS_DISPLAY " q8 f16 matmul failed: status %d\n", (int)st); + cuda_q8_f16_cache_disable_after_failure("BLAS f16 matmul failure", in_dim * out_dim * sizeof(__half)); - /* The F16 expansion cache is only an optimization. If cuBLAS + /* The F16 expansion cache is only an optimization. If BLAS * rejects the cached path under memory pressure, retry the same * operation through the native Q8 kernels below. */ } @@ -6114,7 +6132,7 @@ extern "C" int ds4_gpu_attention_decode_heads_tensor( head_dim); return cuda_ok(cudaGetLastError(), "attention decode online launch"); } - fprintf(stderr, "ds4: CUDA attention score buffer too small for %u compressed rows\n", n_comp); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " attention score buffer too small for %u compressed rows\n", n_comp); return 0; } dim3 grid(1, n_head, 1); @@ -6168,7 +6186,7 @@ extern "C" int ds4_gpu_attention_prefill_raw_heads_tensor(ds4_gpu_tensor *heads, if (!tmp) return 0; float *scores = tmp; float *out_tmp = (float *)((char *)tmp + out_offset); - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -6285,7 +6303,7 @@ static int attention_decode_batch_launch( head_dim); return cuda_ok(cudaGetLastError(), "attention decode online launch"); } - fprintf(stderr, "ds4: CUDA attention score buffer too small for %u compressed rows\n", n_comp); + fprintf(stderr, DS4_CUDA_LOG_PREFIX " attention score buffer too small for %u compressed rows\n", 
n_comp); return 0; } if (!use_comp_mask && n_tokens > 1 && head_dim == 512 && @@ -6538,7 +6556,7 @@ static int attention_prefill_mixed_launch( n_comp, head_dim); if (!cuda_ok(cudaGetLastError(), "attention mixed kv pack launch")) return 0; - const float alpha = rsqrtf((float)head_dim); + const float alpha = 1.0f / sqrtf((float)head_dim); const float beta = 0.0f; cublasStatus_t st = cublasSgemmStridedBatched(g_cublas, CUBLAS_OP_T, @@ -9655,7 +9673,7 @@ static int routed_moe_launch( (void)cudaEventElapsedTime(&ms_sum, prof_ev[5], prof_ev[6]); (void)cudaEventElapsedTime(&ms_total, prof_ev[0], prof_ev[6]); fprintf(stderr, - "ds4: CUDA MoE profile tokens=%u pairs=%u xq=%.3f sort=%.3f gateup=%.3f midq=%.3f down=%.3f sum=%.3f total=%.3f ms\n", + DS4_CUDA_LOG_PREFIX " MoE profile tokens=%u pairs=%u xq=%.3f sort=%.3f gateup=%.3f midq=%.3f down=%.3f sum=%.3f total=%.3f ms\n", n_tokens, pair_count, ms_xq, ms_sort, ms_gate, ms_midq, ms_down, ms_sum, ms_total); } for (uint32_t i = 0; i < 7u; i++) (void)cudaEventDestroy(prof_ev[i]); diff --git a/ds4_server.c b/ds4_server.c index bc8abbbd..53993846 100644 --- a/ds4_server.c +++ b/ds4_server.c @@ -7906,8 +7906,8 @@ static void usage(FILE *fp) { " Apply steering after attention outputs. Default: 0\n" " --warm-weights\n" " Touch mapped tensor pages before serving. Slower startup, fewer first-use stalls.\n" - " --metal | --cuda | --cpu | --backend NAME\n" - " Select backend explicitly. Defaults to Metal on macOS and CUDA on CUDA builds.\n" + " --metal | " DS4_GPU_BACKEND_FLAG " | --cpu | --backend NAME\n" + " Select backend explicitly. 
Defaults to Metal on macOS, " DS4_GPU_BACKEND_DISPLAY " in " DS4_GPU_BACKEND_DISPLAY " builds, and CUDA otherwise.\n" "\n" "HTTP API:\n" " --host HOST\n" @@ -7967,10 +7967,13 @@ static void usage(FILE *fp) { static ds4_backend parse_backend_arg(const char *s, const char *arg) { if (!strcmp(s, "metal")) return DS4_BACKEND_METAL; + if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA; +#ifdef DS4_ROCM_BUILD if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA; +#endif if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU; server_log(DS4_LOG_DEFAULT, "ds4-server: invalid %s value: %s", arg, s); - server_log(DS4_LOG_DEFAULT, "ds4-server: valid server backends are: metal, cuda, cpu"); + server_log(DS4_LOG_DEFAULT, "ds4-server: valid server backends are: %s", DS4_GPU_BACKEND_LIST); exit(2); } @@ -8060,7 +8063,7 @@ static server_config parse_options(int argc, char **argv) { c.engine.warm_weights = true; } else if (!strcmp(arg, "--metal")) { c.engine.backend = DS4_BACKEND_METAL; - } else if (!strcmp(arg, "--cuda")) { + } else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) { c.engine.backend = DS4_BACKEND_CUDA; } else if (!strcmp(arg, "--backend")) { c.engine.backend = parse_backend_arg(need_arg(&i, argc, argv, arg), arg);