Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 5 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ OBJCFLAGS ?= -O3 -ffast-math $(NATIVE_CPU_FLAG) -Wall -Wextra -fobjc-arc

LDLIBS ?= -lm -pthread
METAL_SRCS := $(wildcard metal/*.metal)
GPU_EXTRA_DEPS :=

ifeq ($(UNAME_S),Darwin)
METAL_LDLIBS := $(LDLIBS) -framework Foundation -framework Metal
Expand All @@ -26,12 +27,14 @@ ROCM_PATH ?= /opt/rocm
GPU_CC = $(ROCM_PATH)/bin/hipcc
ROCM_ARCH ?= gfx1151

CFLAGS += -DDS4_ROCM_BUILD
GPU_CFLAGS ?= -O3 -fno-finite-math-only -pthread -D__HIP_PLATFORM_AMD__ -Wno-unused-command-line-argument --offload-arch=$(ROCM_ARCH)
GPU_CFLAGS += -DDS4_ROCM_BUILD
GPU_LDLIBS = -lm -pthread -L$(ROCM_PATH)/lib -lhipblas

@echo "ROCM_ARCH: $(ROCM_ARCH)"

EXTRA_DEPS = ds4_rocm.h
GPU_EXTRA_DEPS += ds4_rocm.h

else

Expand Down Expand Up @@ -177,7 +180,7 @@ ds4_bench_cpu.o: ds4_bench.c ds4.h
ds4_metal.o: ds4_metal.m ds4_gpu.h $(METAL_SRCS)
$(CC) $(OBJCFLAGS) -c -o $@ ds4_metal.m

ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(EXTRA_DEPS)
ds4_cuda.o: ds4_cuda.cu ds4_gpu.h ds4_iq2_tables_cuda.inc $(GPU_EXTRA_DEPS)
$(GPU_CC) $(GPU_CFLAGS) -c -o $@ ds4_cuda.cu

tests/cuda_long_context_smoke: tests/cuda_long_context_smoke.o ds4_cuda.o
Expand All @@ -195,4 +198,3 @@ test: ds4_test

clean:
rm -f ds4 ds4-server ds4-bench ds4_cpu ds4_native ds4_server_test ds4_test *.o tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o

14 changes: 11 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ Then build:
make # macOS Metal
make cuda-spark # Linux CUDA, DGX Spark / GB10
make cuda-generic # Linux CUDA, other local CUDA GPUs
make rocm ROCM_ARCH=gfx1151
make cpu # CPU-only diagnostics build
```

Expand Down Expand Up @@ -646,11 +647,13 @@ the kv cache files include the verbatim prompt cached.

## Backends

The default graph backend is Metal on macOS and CUDA in CUDA builds:
The default graph backend is Metal on macOS, CUDA in CUDA builds, and ROCm in
ROCm builds:

```sh
./ds4 -p "Hello" --metal
./ds4 -p "Hello" --cuda
./ds4 -p "Hello" --rocm
```

On Linux, plain `make` prints the available build targets instead of selecting a
Expand All @@ -662,8 +665,13 @@ when cross-building or when you need a known target:
```sh
make cuda CUDA_ARCH=sm_120
make cuda CUDA_ARCH=native
make rocm ROCM_ARCH=gfx1151
```

In ROCm builds, `--rocm` and `--backend rocm` are the canonical runtime names.
`--cuda` and `--backend cuda` remain accepted as compatibility aliases because
the shared GPU implementation still uses the CUDA backend enum internally.

There is also a CPU reference/debug path:

```sh
Expand All @@ -675,8 +683,8 @@ make cpu

Do not treat the CPU path as the production target. The CLI and `ds4-server`
support the CPU backend for reference/debug use and share the same KV session
and snapshot format as Metal and CUDA, but normal inference should use Metal or
CUDA.
and snapshot format as Metal, CUDA, and ROCm, but normal inference should use
Metal, CUDA, or ROCm.

## Steering

Expand Down
19 changes: 11 additions & 8 deletions ds4.c
Original file line number Diff line number Diff line change
Expand Up @@ -1453,7 +1453,7 @@ static bool accelerator_cache_model_tensors(ds4_backend backend, const ds4_model
const double t1 = now_sec();
if (ds4_log_is_tty(stderr)) fputc('\n', stderr);
fprintf(stderr,
"ds4: CUDA startup model cache prepared %.2f GiB of tensor spans in %.3fs\n",
"ds4: " DS4_GPU_BACKEND_DISPLAY " startup model cache prepared %.2f GiB of tensor spans in %.3fs\n",
(double)cached / 1073741824.0,
t1 - t0);
}
Expand Down Expand Up @@ -16995,15 +16995,17 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
#ifndef DS4_NO_GPU
if (e->backend == DS4_BACKEND_CUDA) {
#ifdef __APPLE__
fprintf(stderr, "ds4: CUDA backend requested but this build is linked with Metal, not CUDA\n");
fprintf(stderr, "ds4: %s backend requested but this build is linked with Metal, not %s\n",
DS4_GPU_BACKEND_DISPLAY, DS4_GPU_BACKEND_DISPLAY);
ds4_engine_close(e);
*out = NULL;
return 1;
#endif
}
if (e->backend == DS4_BACKEND_METAL) {
#ifndef __APPLE__
fprintf(stderr, "ds4: Metal backend requested but this build is linked with CUDA, not Metal\n");
fprintf(stderr, "ds4: Metal backend requested but this build is linked with %s, not Metal\n",
DS4_GPU_BACKEND_DISPLAY);
ds4_engine_close(e);
*out = NULL;
return 1;
Expand Down Expand Up @@ -17033,11 +17035,12 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
*out = NULL;
return 1;
}
if (e->mtp_ready &&
const bool mtp_needs_global_model_map = e->mtp_ready && e->backend != DS4_BACKEND_CUDA;
if (mtp_needs_global_model_map &&
!ds4_gpu_set_model_map_range(e->mtp_model.map,
e->mtp_model.size,
e->mtp_model.tensor_data_pos,
e->mtp_model.size - e->mtp_model.tensor_data_pos))
e->mtp_model.size,
e->mtp_model.tensor_data_pos,
e->mtp_model.size - e->mtp_model.tensor_data_pos))
{
fprintf(stderr,
"ds4: %s failed to map MTP model views; aborting startup. "
Expand All @@ -17047,7 +17050,7 @@ int ds4_engine_open(ds4_engine **out, const ds4_engine_options *opt) {
*out = NULL;
return 1;
}
if (!e->mtp_ready && !accelerator_cache_model_tensors(e->backend, &e->model)) {
if (!accelerator_cache_model_tensors(e->backend, &e->model)) {
fprintf(stderr, "ds4: %s failed to prepare startup model cache\n",
ds4_backend_name(e->backend));
ds4_engine_close(e);
Expand Down
12 changes: 12 additions & 0 deletions ds4.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ typedef enum {
DS4_BACKEND_CPU,
} ds4_backend;

#ifdef DS4_ROCM_BUILD
#define DS4_GPU_BACKEND_CLI_NAME "rocm"
#define DS4_GPU_BACKEND_FLAG "--rocm"
#define DS4_GPU_BACKEND_DISPLAY "ROCm"
#define DS4_GPU_BACKEND_LIST "metal, rocm, cuda, cpu"
#else
#define DS4_GPU_BACKEND_CLI_NAME "cuda"
#define DS4_GPU_BACKEND_FLAG "--cuda"
#define DS4_GPU_BACKEND_DISPLAY "CUDA"
#define DS4_GPU_BACKEND_LIST "metal, cuda, cpu"
#endif

typedef enum {
DS4_THINK_NONE,
DS4_THINK_HIGH,
Expand Down
11 changes: 7 additions & 4 deletions ds4_bench.c
Original file line number Diff line number Diff line change
Expand Up @@ -63,8 +63,8 @@ static void usage(FILE *fp) {
"\n"
"Model and backend:\n"
" -m, --model FILE GGUF model path. Default: ds4flash.gguf\n"
" --metal | --cuda | --cpu | --backend NAME\n"
" Select backend explicitly. Defaults to Metal on macOS, CUDA elsewhere.\n"
" --metal | " DS4_GPU_BACKEND_FLAG " | --cpu | --backend NAME\n"
" Select backend explicitly. Defaults to Metal on macOS, " DS4_GPU_BACKEND_DISPLAY " in " DS4_GPU_BACKEND_DISPLAY " builds, CUDA elsewhere.\n"
" -t, --threads N CPU helper threads.\n"
" --quality Prefer exact kernels where applicable.\n"
" --warm-weights Touch mapped tensor pages before benchmarking.\n"
Expand Down Expand Up @@ -112,10 +112,13 @@ static const char *need_arg(int *i, int argc, char **argv, const char *opt) {

static ds4_backend parse_backend(const char *s, const char *opt) {
if (!strcmp(s, "metal")) return DS4_BACKEND_METAL;
if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA;
#ifdef DS4_ROCM_BUILD
if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA;
#endif
if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU;
fprintf(stderr, "ds4-bench: invalid value for %s: %s\n", opt, s);
fprintf(stderr, "ds4-bench: valid backends are: metal, cuda, cpu\n");
fprintf(stderr, "ds4-bench: valid backends are: %s\n", DS4_GPU_BACKEND_LIST);
exit(2);
}

Expand Down Expand Up @@ -213,7 +216,7 @@ static bench_config parse_options(int argc, char **argv) {
c.backend = parse_backend(need_arg(&i, argc, argv, arg), arg);
} else if (!strcmp(arg, "--metal")) {
c.backend = DS4_BACKEND_METAL;
} else if (!strcmp(arg, "--cuda")) {
} else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) {
c.backend = DS4_BACKEND_CUDA;
} else if (!strcmp(arg, "--cpu")) {
c.backend = DS4_BACKEND_CPU;
Expand Down
13 changes: 8 additions & 5 deletions ds4_cli.c
Original file line number Diff line number Diff line change
Expand Up @@ -93,12 +93,12 @@ static void usage(FILE *fp) {
" Context size allocated for the session. Default: 32768\n"
" --metal\n"
" Use the Metal graph backend. This is the normal fast path on macOS.\n"
" --cuda\n"
" Use the CUDA graph backend. This is the normal fast path on CUDA builds.\n"
" " DS4_GPU_BACKEND_FLAG "\n"
" Use the " DS4_GPU_BACKEND_DISPLAY " graph backend. This is the normal fast path on " DS4_GPU_BACKEND_DISPLAY " builds.\n"
" --cpu\n"
" Use the CPU reference/debug backend. Not recommended for normal inference.\n"
" --backend NAME\n"
" Select backend explicitly: metal, cuda, or cpu.\n"
" Select backend explicitly: " DS4_GPU_BACKEND_LIST ".\n"
" -t, --threads N\n"
" CPU helper threads for host-side or reference work.\n"
" --quality\n"
Expand Down Expand Up @@ -223,10 +223,13 @@ static float parse_float_range(const char *s, const char *opt, float min, float

static ds4_backend parse_backend(const char *s) {
if (!strcmp(s, "metal")) return DS4_BACKEND_METAL;
if (!strcmp(s, DS4_GPU_BACKEND_CLI_NAME)) return DS4_BACKEND_CUDA;
#ifdef DS4_ROCM_BUILD
if (!strcmp(s, "cuda")) return DS4_BACKEND_CUDA;
#endif
if (!strcmp(s, "cpu")) return DS4_BACKEND_CPU;
fprintf(stderr, "ds4: invalid backend: %s\n", s);
fprintf(stderr, "ds4: valid backends are: metal, cuda, cpu\n");
fprintf(stderr, "ds4: valid backends are: %s\n", DS4_GPU_BACKEND_LIST);
exit(2);
}

Expand Down Expand Up @@ -1260,7 +1263,7 @@ static cli_config parse_options(int argc, char **argv) {
c.engine.backend = DS4_BACKEND_CPU;
} else if (!strcmp(arg, "--metal")) {
c.engine.backend = DS4_BACKEND_METAL;
} else if (!strcmp(arg, "--cuda")) {
} else if (!strcmp(arg, DS4_GPU_BACKEND_FLAG) || !strcmp(arg, "--cuda")) {
c.engine.backend = DS4_BACKEND_CUDA;
} else if (!strcmp(arg, "--dump-tokens")) {
c.gen.dump_tokens = true;
Expand Down
Loading