From 0b850f126df72245cdfb8db08bcb681a6d59d240 Mon Sep 17 00:00:00 2001
From: Mitch Fultz <57411549+fitchmultz@users.noreply.github.com>
Date: Thu, 14 May 2026 15:43:08 -0600
Subject: [PATCH 1/2] metal: add M5 Max runtime fast paths

---
 ds4_metal.m       | 74 +++++++++++++++++++++++++++++++++++++++++++----
 metal/dense.metal |  8 +++--
 metal/moe.metal   |  7 +++--
 3 files changed, 78 insertions(+), 11 deletions(-)

diff --git a/ds4_metal.m b/ds4_metal.m
index 0a6ae748..1cff84d9 100644
--- a/ds4_metal.m
+++ b/ds4_metal.m
@@ -186,6 +186,33 @@ static void ds4_gpu_print_device_summary(void) {
     }
 }
 
+static int ds4_gpu_is_m5_device(void) {
+    static int initialized;
+    static int is_m5;
+    if (!initialized) {
+        const char *device_name = g_device.name ? [g_device.name UTF8String] : "";
+        is_m5 = strstr(device_name, "M5") != NULL;
+        initialized = 1;
+    }
+    return is_m5;
+}
+
+static int ds4_gpu_use_m5_private_scratch(void) {
+    static int initialized;
+    static int enabled;
+    if (!initialized) {
+        enabled = getenv("DS4_METAL_DISABLE_M5_PRIVATE_SCRATCH") == NULL && ds4_gpu_is_m5_device();
+        initialized = 1;
+    }
+    return enabled;
+}
+
+static int ds4_gpu_scratch_needs_cpu_access(const char *label) {
+    if (!label) return 0;
+    return strstr(label, "mask") != NULL ||
+           strcmp(label, "ds4_attention_output_group_ids") == 0;
+}
+
 #define DS4_METAL_MAX_MODEL_VIEWS 16
 #define DS4_METAL_MODEL_MAX_TENSOR_BYTES 704643072ull
 
@@ -297,7 +324,20 @@ static int ds4_gpu_ensure_scratch_buffer(
     if (bytes == 0) bytes = 1;
     if (bytes > NSUIntegerMax) return 0;
 
-    *buffer = [g_device newBufferWithLength:bytes options:MTLResourceStorageModeShared];
+    MTLResourceOptions options = MTLResourceStorageModeShared;
+    if (ds4_gpu_use_m5_private_scratch() && !ds4_gpu_scratch_needs_cpu_access(label)) {
+        /*
+         * Keep Metal's default hazard tracking. These scratch buffers are
+         * reused by dependent kernels across many compute encoders, and the
+         * graph does not insert explicit fences for untracked resources.
+         */
+        options = MTLResourceStorageModePrivate;
+    }
+
+    *buffer = [g_device newBufferWithLength:bytes options:options];
+    if (!*buffer && options != MTLResourceStorageModeShared) {
+        *buffer = [g_device newBufferWithLength:bytes options:MTLResourceStorageModeShared];
+    }
     if (!*buffer) {
         fprintf(stderr, "ds4: failed to allocate Metal scratch buffer %s (%llu bytes)\n",
                 label, (unsigned long long)bytes);
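The three helpers above share one compute-once, env-gated toggle shape, which the patch reuses again below for the simdgroup-matrix path. A minimal standalone C sketch of that pattern, with a hypothetical `fake_is_m5_device()` standing in for the real `[g_device name]` check:

```c
#include <stdio.h>
#include <stdlib.h>

/* Hypothetical stand-in for ds4_gpu_is_m5_device(); the real check
 * substring-matches the MTLDevice name. */
static int fake_is_m5_device(void) { return 1; }

/* Same shape as ds4_gpu_use_m5_private_scratch(): honor the kill switch,
 * compute the answer once on first call, then return the cached value. */
static int use_m5_private_scratch(void) {
    static int initialized;
    static int enabled;
    if (!initialized) {
        enabled = getenv("DS4_METAL_DISABLE_M5_PRIVATE_SCRATCH") == NULL &&
                  fake_is_m5_device();
        initialized = 1;
    }
    return enabled;
}

int main(void) {
    printf("private scratch enabled: %d\n", use_m5_private_scratch());
    return 0;
}
```

A first-call race between two threads is benign here: both would compute the same value before setting `initialized`.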
@@ -551,18 +591,25 @@ static int ds4_gpu_map_model_views(
     return buffer;
 }
 
+static int ds4_gpu_use_m5_simdgroup_matrix(void);
+
 static id<MTLComputePipelineState> ds4_gpu_get_mul_mm_pipeline(
     const char *function_name,
     bool bc_inp,
     bool bc_out) {
-    NSString *key = [NSString stringWithFormat:@"%s_bci=%d_bco=%d",
-                     function_name, bc_inp ? 1 : 0, bc_out ? 1 : 0];
+    bool m5_sgmatrix = ds4_gpu_use_m5_simdgroup_matrix() != 0;
+    NSString *key = [NSString stringWithFormat:@"%s_bci=%d_bco=%d_m5sg=%d",
+                     function_name,
+                     bc_inp ? 1 : 0,
+                     bc_out ? 1 : 0,
+                     m5_sgmatrix ? 1 : 0];
 
     id<MTLComputePipelineState> cached = [g_pipeline_cache objectForKey:key];
     if (cached) return cached;
 
     MTLFunctionConstantValues *constants = [[MTLFunctionConstantValues alloc] init];
     [constants setConstantValue:&bc_inp type:MTLDataTypeBool atIndex:700];
     [constants setConstantValue:&bc_out type:MTLDataTypeBool atIndex:701];
+    [constants setConstantValue:&m5_sgmatrix type:MTLDataTypeBool atIndex:702];
 
     NSError *error = nil;
     NSString *name = [NSString stringWithUTF8String:function_name];
@@ -590,13 +637,17 @@ static int ds4_gpu_map_model_views(
 static id<MTLComputePipelineState> ds4_gpu_get_mul_mm_id_pipeline(
     const char *function_name,
     bool bc_inp) {
-    NSString *key = [NSString stringWithFormat:@"%s_bci=%d",
-                     function_name, bc_inp ? 1 : 0];
+    bool m5_sgmatrix = ds4_gpu_use_m5_simdgroup_matrix() != 0;
+    NSString *key = [NSString stringWithFormat:@"%s_bci=%d_m5sg=%d",
+                     function_name,
+                     bc_inp ? 1 : 0,
+                     m5_sgmatrix ? 1 : 0];
 
     id<MTLComputePipelineState> cached = [g_pipeline_cache objectForKey:key];
     if (cached) return cached;
 
     MTLFunctionConstantValues *constants = [[MTLFunctionConstantValues alloc] init];
     [constants setConstantValue:&bc_inp type:MTLDataTypeBool atIndex:700];
+    [constants setConstantValue:&m5_sgmatrix type:MTLDataTypeBool atIndex:702];
 
     NSError *error = nil;
     NSString *name = [NSString stringWithUTF8String:function_name];
@@ -673,6 +724,18 @@ static int ds4_gpu_use_compressor_pair_nr4(void) {
     return enabled;
 }
 
+static int ds4_gpu_use_m5_simdgroup_matrix(void) {
+    static int initialized;
+    static int enabled;
+    if (!initialized) {
+        const char *disable = getenv("DS4_METAL_DISABLE_M5_SIMDGROUP_MATRIX");
+        const char *force = getenv("DS4_METAL_FORCE_M5_SIMDGROUP_MATRIX");
+        enabled = disable ? 0 : (force ? 1 : ds4_gpu_is_m5_device());
+        initialized = 1;
+    }
+    return enabled;
+}
+
 static int ds4_gpu_warm_model_views(void) {
     if (g_model_view_count == 0) return 1;
 
@@ -1165,6 +1228,7 @@ void ds4_gpu_set_quality(bool quality) {
 "#define N_SG_Q8_0 4\n"
 "#define FC_MUL_MV 600\n"
 "#define FC_MUL_MM 700\n"
+"#define FC_MUL_MM_M5_SGMATRIX 702\n"
 "#define FC_BIN 1300\n"
 "#define FOR_UNROLL(x) _Pragma(\"clang loop unroll(full)\") for (x)\n"
 "#define M_PI_F 3.14159265358979323846f\n"
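The pipeline-cache key now has to encode every function-constant input; otherwise two differently specialized pipelines would collide under one entry and the wrong variant could be returned. A C sketch of the same keying rule (illustrative names, not the Objective-C cache itself):

```c
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Mirror of the patch's key format: one field per function constant, so
 * kernel_mul_mm specialized with and without the M5 simdgroup-matrix path
 * can never alias in the cache. */
static void make_pipeline_key(char *out, size_t cap, const char *fn,
                              bool bc_inp, bool bc_out, bool m5_sgmatrix) {
    snprintf(out, cap, "%s_bci=%d_bco=%d_m5sg=%d",
             fn, bc_inp ? 1 : 0, bc_out ? 1 : 0, m5_sgmatrix ? 1 : 0);
}

int main(void) {
    char key[128];
    make_pipeline_key(key, sizeof key, "kernel_mul_mm", true, false, true);
    puts(key); /* prints: kernel_mul_mm_bci=1_bco=0_m5sg=1 */
    return 0;
}
```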
diff --git a/metal/dense.metal b/metal/dense.metal
index a84927e9..aa3233a5 100644
--- a/metal/dense.metal
+++ b/metal/dense.metal
@@ -909,6 +909,7 @@ template [[host_name("kernel_mul_mv_ext_q8_0_f32_r1_5")]] kernel mul_mv_ext_q4_f
 
 constant bool FC_mul_mm_bc_inp  [[function_constant(FC_MUL_MM + 0)]];
 constant bool FC_mul_mm_bc_out  [[function_constant(FC_MUL_MM + 1)]];
+constant bool FC_mul_mm_m5_sgmatrix [[function_constant(FC_MUL_MM_M5_SGMATRIX)]];
 
 // Tiled matrix-matrix kernel used for prompt batches larger than 8. DS4 uses
 // this to turn prefill into large simdgroup matrix operations; each block_q
@@ -1047,20 +1048,21 @@ kernel void kernel_mul_mm(
     threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
     threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
 
+    // M5 compiles this as a tighter simdgroup_matrix load/MMA chain without no-op barriers.
     FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
-        simdgroup_barrier(mem_flags::mem_none);
+        if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(mem_flags::mem_none);
 
         FOR_UNROLL (short i = 0; i < 4; i++) {
             simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
         }
 
-        simdgroup_barrier(mem_flags::mem_none);
+        if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(mem_flags::mem_none);
 
         FOR_UNROLL (short i = 0; i < 2; i++) {
             simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
         }
 
-        simdgroup_barrier(mem_flags::mem_none);
+        if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(mem_flags::mem_none);
 
         FOR_UNROLL (short i = 0; i < 8; i++){
             simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
diff --git a/metal/moe.metal b/metal/moe.metal
index 65074d7d..fc5ff166 100644
--- a/metal/moe.metal
+++ b/metal/moe.metal
@@ -1656,20 +1656,21 @@ kernel void kernel_mul_mm_id(
     threadgroup const S0 * lsma = (sa + 4*64*(sgitg%2));
     threadgroup const S1 * lsmb = (sb + 2*64*(sgitg/2));
 
+    // M5 compiles this as a tighter simdgroup_matrix load/MMA chain without no-op barriers.
     FOR_UNROLL (short ik = 0; ik < NK/8; ik++) {
-        simdgroup_barrier(mem_flags::mem_none);
+        if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(mem_flags::mem_none);
 
         FOR_UNROLL (short i = 0; i < 4; i++) {
             simdgroup_load(ma[i], lsma + 64*i, 8, 0, false);
         }
 
-        simdgroup_barrier(mem_flags::mem_none);
+        if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(mem_flags::mem_none);
 
         FOR_UNROLL (short i = 0; i < 2; i++) {
             simdgroup_load(mb[i], lsmb + 64*i, 8, 0, false);
         }
 
-        simdgroup_barrier(mem_flags::mem_none);
+        if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(mem_flags::mem_none);
 
         FOR_UNROLL (short i = 0; i < 8; i++){
             simdgroup_multiply_accumulate(mc[i], mb[i/4], ma[i%4], mc[i]);
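Because `FC_mul_mm_m5_sgmatrix` is a Metal function constant, each `if (!FC_mul_mm_m5_sgmatrix) simdgroup_barrier(...)` above is resolved when the pipeline is specialized, not per loop iteration; on M5 the no-op barriers vanish from the compiled kernel entirely. A rough C analogy of that fold-away pattern, using a compile-time macro and a stub in place of the real barrier:

```c
#include <stdio.h>

/* Stand-in for the Metal function constant: baked in at build time, so the
 * guarded call below is dead code the compiler removes, much as the Metal
 * compiler removes the guarded simdgroup_barrier when specializing for M5. */
#define M5_SGMATRIX 1

static void barrier_stub(void) { puts("barrier"); }

int main(void) {
    for (int ik = 0; ik < 4; ik++) {
        if (!M5_SGMATRIX) barrier_stub(); /* folded away when M5_SGMATRIX is 1 */
        /* simdgroup loads and multiply-accumulates would go here */
    }
    return 0;
}
```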
From c1ee32a1903376245d53657e0a076a572e3fdc59 Mon Sep 17 00:00:00 2001
From: Mitch Fultz <57411549+fitchmultz@users.noreply.github.com>
Date: Thu, 14 May 2026 15:43:08 -0600
Subject: [PATCH 2/2] metal: default M5 Max to safe 4096 prefill

---
 ds4.c        | 104 ++++++++++++++++++++++++++++++++++++++++++++++----
 ds4.h        |   2 +
 ds4_server.c |  16 +++++---
 3 files changed, 108 insertions(+), 14 deletions(-)

diff --git a/ds4.c b/ds4.c
index 51410e33..a3423af8 100644
--- a/ds4.c
+++ b/ds4.c
@@ -30,6 +30,9 @@
 #include 
 #include 
 #include 
+#ifdef __APPLE__
+#include <sys/sysctl.h>
+#endif
 #include 
 #include 
 #include 
@@ -6098,7 +6101,35 @@ static uint32_t ds4_default_raw_cap(uint32_t ctx_size) {
     return raw_cap;
 }
 
-static uint32_t ds4_default_prefill_cap_for_prompt(int prompt_len) {
+static bool ds4_host_is_apple_m5_max(void) {
+#if defined(__APPLE__)
+    static int initialized;
+    static int is_m5_max;
+    if (!initialized) {
+        char brand[128];
+        size_t len = sizeof(brand);
+        if (sysctlbyname("machdep.cpu.brand_string", brand, &len, NULL, 0) == 0) {
+            brand[sizeof(brand) - 1] = '\0';
+            is_m5_max = strstr(brand, "Apple M5 Max") != NULL;
+        }
+        initialized = 1;
+    }
+    return is_m5_max != 0;
+#else
+    return false;
+#endif
+}
+
+uint32_t ds4_backend_default_prefill_chunk(ds4_backend backend) {
+    if (backend == DS4_BACKEND_METAL && ds4_host_is_apple_m5_max()) return 4096u;
+    return 2048u;
+}
+
+uint32_t ds4_backend_default_kv_boundary_align_tokens(ds4_backend backend) {
+    return ds4_backend_default_prefill_chunk(backend);
+}
+
+static uint32_t ds4_default_prefill_cap_for_prompt(int prompt_len, ds4_backend backend) {
     if (prompt_len <= 0) return 1;
 
     uint32_t cap = (uint32_t)prompt_len;
@@ -6109,9 +6140,23 @@ static uint32_t ds4_default_prefill_cap_for_prompt(int prompt_len) {
         if (endp != env) {
             if (v <= 0) return cap;
             cap = (uint32_t)v;
+            if (ds4_backend_default_prefill_chunk(backend) >= 4096u &&
+                cap > 4096u && getenv("DS4_METAL_ALLOW_UNSAFE_PREFILL_CHUNK") == NULL)
+            {
+                static int warned_large_prefill_chunk;
+                if (!warned_large_prefill_chunk) {
+                    fprintf(stderr,
+                            "ds4: DS4_METAL_PREFILL_CHUNK=%u exceeds the correctness-gated 4096-token limit; "
+                            "clamping to 4096 (set DS4_METAL_ALLOW_UNSAFE_PREFILL_CHUNK=1 to experiment)\n",
+                            cap);
+                    warned_large_prefill_chunk = 1;
+                }
+                cap = 4096u;
+            }
         }
-    } else if (prompt_len > 2048) {
-        cap = 2048u;
+    } else {
+        const uint32_t default_chunk = ds4_backend_default_prefill_chunk(backend);
+        if (prompt_len > (int)default_chunk) cap = default_chunk;
     }
 
     if (cap == 0) cap = 1;
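The chunk-cap policy above reduces to a small pure function. A standalone C replica for experimentation (hypothetical `prefill_cap()` helper; the real code also prints the one-shot clamp warning):

```c
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Replica of ds4_default_prefill_cap_for_prompt()'s decision tree, assuming
 * default_chunk comes from ds4_backend_default_prefill_chunk(). */
static uint32_t prefill_cap(int prompt_len, uint32_t default_chunk) {
    if (prompt_len <= 0) return 1;
    uint32_t cap = (uint32_t)prompt_len;
    const char *env = getenv("DS4_METAL_PREFILL_CHUNK");
    if (env) {
        char *endp = NULL;
        long v = strtol(env, &endp, 10);
        if (endp != env) {
            if (v <= 0) return cap;
            cap = (uint32_t)v;
            if (default_chunk >= 4096u && cap > 4096u &&
                getenv("DS4_METAL_ALLOW_UNSAFE_PREFILL_CHUNK") == NULL) {
                cap = 4096u; /* correctness-gated ceiling */
            }
        }
    } else if (prompt_len > (int)default_chunk) {
        cap = default_chunk;
    }
    return cap == 0 ? 1 : cap;
}

int main(void) {
    printf("%u\n", (unsigned)prefill_cap(10000, 4096)); /* 4096: long prompt, M5 Max default */
    printf("%u\n", (unsigned)prefill_cap(1500, 4096));  /* 1500: whole prompt fits one chunk */
    return 0;
}
```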
@@ -10970,6 +11015,11 @@ static bool metal_graph_q_stage_profile_boundary(
     return ds4_gpu_begin_commands() != 0;
 }
 
+static bool metal_graph_use_m5_large_prefill_schedule(const ds4_gpu_graph *g) {
+    return g && g->prefill_cap >= 4096u &&
+           ds4_backend_default_prefill_chunk(DS4_BACKEND_METAL) >= 4096u;
+}
+
 static bool metal_graph_encode_layer_attention_batch(
     ds4_gpu_graph *g,
     const ds4_model *model,
@@ -11986,7 +12036,16 @@ static bool metal_graph_encode_layer_attention_batch(
     }
 
     const bool topk_prefill_needed = ratio == 4 && n_comp > DS4_N_INDEXER_TOP_K;
-    if (ok && zero_prefix && topk_prefill_needed && n_comp != 0) {
+    /* The all-at-once zero-prefix indexed attention path selects one top-k
+     * set per token from the complete compressed prefix. For very large
+     * first chunks, high-scoring future compressed rows can crowd out older
+     * visible rows before the attention kernel applies its causal `visible`
+     * cutoff, which breaks long-memory prompts. Keep the fast batched path
+     * for the correctness-gated 2048-token chunk size; larger experimental
+     * chunks fall through to the per-token indexed path below. */
+    if (ok && zero_prefix && topk_prefill_needed && n_comp != 0 &&
+        (!metal_graph_use_m5_large_prefill_schedule(g) || n_tokens <= 2048u))
+    {
         const float index_scale = 1.0f / sqrtf((float)(DS4_N_INDEXER_HEAD_DIM * DS4_N_INDEXER_HEAD));
         double index_stage_t0 = 0.0;
         if (index_stage_profile) {
@@ -13274,6 +13333,19 @@ static bool metal_graph_prefill_layer_major(
     return ok;
 }
 
+static bool metal_graph_prefill_chunked_range(
+    ds4_gpu_graph *g,
+    const ds4_model *model,
+    const ds4_weights *weights,
+    const token_vec *prompt,
+    uint32_t start,
+    uint32_t n_tokens,
+    float *logits,
+    bool show_progress,
+    ds4_session_progress_fn progress,
+    void *progress_ud,
+    ds4_imatrix_collector *imatrix);
+
 static bool metal_graph_prefill_raw_swa(
     ds4_gpu_graph *g,
     const ds4_model *model,
@@ -13284,6 +13356,19 @@ static bool metal_graph_prefill_raw_swa(
     const ds4_weights *weights,
     const token_vec *prompt,
     int n_tokens,
     float *logits,
     bool show_progress) {
     if (n_tokens <= 0 || n_tokens > prompt->len) return false;
     if ((uint32_t)n_tokens > g->prefill_cap) return false;
+    if (metal_graph_use_m5_large_prefill_schedule(g) && n_tokens > 2048) {
+        return metal_graph_prefill_chunked_range(g,
+                                                 model,
+                                                 weights,
+                                                 prompt,
+                                                 0,
+                                                 (uint32_t)n_tokens,
+                                                 logits,
+                                                 show_progress,
+                                                 NULL,
+                                                 NULL,
+                                                 NULL);
+    }
     return metal_graph_prefill_layer_major(g, model, weights, prompt, n_tokens, logits, show_progress, NULL);
 }
@@ -13364,6 +13449,9 @@ static bool metal_graph_prefill_chunked_range(
     for (uint32_t pos0 = start; pos0 < end; ) {
         const uint32_t remaining = end - pos0;
         uint32_t local_cap = chunk_cap;
+        if (metal_graph_use_m5_large_prefill_schedule(g) && pos0 == 0 && local_cap > 2048u) {
+            local_cap = 2048u;
+        }
         if (start != 0 && g->prefill_cap != 0) {
             const uint32_t mod = pos0 % g->prefill_cap;
             if (mod != 0) {
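Combined with the 4096-token cap, the clamp above yields a schedule where only the first chunk is held to 2048 tokens (keeping the batched top-k index path on its correctness-gated size) and later chunks run at the full cap. A sketch of the resulting boundaries for the simple `start == 0` case; the real scheduler also aligns resumed ranges to the prefill cap:

```c
#include <stdint.h>
#include <stdio.h>

int main(void) {
    const uint32_t n_tokens = 10000, chunk_cap = 4096;
    for (uint32_t pos0 = 0; pos0 < n_tokens; ) {
        /* first chunk is clamped to 2048, as in metal_graph_prefill_chunked_range */
        uint32_t local_cap = (pos0 == 0 && chunk_cap > 2048u) ? 2048u : chunk_cap;
        uint32_t remaining = n_tokens - pos0;
        uint32_t n = remaining < local_cap ? remaining : local_cap;
        printf("chunk [%u, %u)\n", (unsigned)pos0, (unsigned)(pos0 + n));
        pos0 += n;
    }
    return 0; /* prints [0, 2048), [2048, 6144), [6144, 10000) */
}
```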
@@ -13772,9 +13860,9 @@ static uint32_t metal_graph_raw_cap_for_context(int ctx_size, uint32_t prefill_c
 }
 
 /* Choose the prefill ubatch size. Whole-batch is fastest for normal prompts;
- * long prompts default to 2048-token chunks. */
+ * long prompts default to a backend-tuned chunk size. */
 static uint32_t metal_graph_prefill_cap_for_prompt(int prompt_len) {
-    return ds4_default_prefill_cap_for_prompt(prompt_len);
+    return ds4_default_prefill_cap_for_prompt(prompt_len, DS4_BACKEND_METAL);
 }
 
 /* When a server request shares a large prefix with the live checkpoint, extend
@@ -13798,7 +13886,7 @@ ds4_context_memory ds4_context_memory_estimate(ds4_backend backend, int ctx_size
     uint32_t ctx = ctx_size > 0 ? (uint32_t)ctx_size : 1u;
 
     if (ds4_backend_uses_graph(backend)) {
-        m.prefill_cap = metal_graph_prefill_cap_for_prompt((int)ctx);
+        m.prefill_cap = ds4_default_prefill_cap_for_prompt((int)ctx, backend);
         m.raw_cap = metal_graph_raw_cap_for_context((int)ctx, m.prefill_cap);
 
         uint32_t min_ratio = UINT32_MAX;
@@ -17097,7 +17185,7 @@ int ds4_session_create(ds4_session **out, ds4_engine *e, int ctx_size) {
     ds4_session *s = xcalloc(1, sizeof(*s));
     s->engine = e;
     s->ctx_size = ctx_size;
-    s->prefill_cap = ds4_default_prefill_cap_for_prompt(ctx_size);
+    s->prefill_cap = ds4_default_prefill_cap_for_prompt(ctx_size, e->backend);
     kv_cache_init(&s->cpu_cache, (uint32_t)ctx_size, 0);
     cpu_decode_scratch_init(&s->cpu_scratch, (uint32_t)ctx_size);
     s->logits = xmalloc((size_t)DS4_N_VOCAB * sizeof(s->logits[0]));
diff --git a/ds4.h b/ds4.h
index 950d8dca..bcfad65b 100644
--- a/ds4.h
+++ b/ds4.h
@@ -97,6 +97,8 @@ const char *ds4_think_mode_name(ds4_think_mode mode);
 const char *ds4_think_max_prefix(void);
 uint32_t ds4_think_max_min_context(void);
 ds4_think_mode ds4_think_mode_for_context(ds4_think_mode mode, int ctx_size);
+uint32_t ds4_backend_default_prefill_chunk(ds4_backend backend);
+uint32_t ds4_backend_default_kv_boundary_align_tokens(ds4_backend backend);
 ds4_context_memory ds4_context_memory_estimate(ds4_backend backend, int ctx_size);
 bool ds4_log_is_tty(FILE *fp);
 void ds4_log(FILE *fp, ds4_log_type type, const char *fmt, ...);
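With the two accessors exported from ds4.h, callers can derive the prefill chunk and the KV boundary alignment from one source of truth instead of hard-coding 2048. A hedged usage sketch, assuming a translation unit that can include ds4.h and link against the library:

```c
#include <stdint.h>
#include <stdio.h>
#include "ds4.h"

int main(void) {
    uint32_t chunk = ds4_backend_default_prefill_chunk(DS4_BACKEND_METAL);
    uint32_t align = ds4_backend_default_kv_boundary_align_tokens(DS4_BACKEND_METAL);
    /* On an M5 Max host both print 4096; elsewhere both print 2048. */
    printf("prefill chunk: %u, kv boundary align: %u\n",
           (unsigned)chunk, (unsigned)align);
    return 0;
}
```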
diff --git a/ds4_server.c b/ds4_server.c
index fb2bf3ad..99704243 100644
--- a/ds4_server.c
+++ b/ds4_server.c
@@ -7658,11 +7658,10 @@ static void apply_openai_stream_tool_ids(tool_calls *calls,
 /* Tokenizers may merge text across the prompt boundary. Trimming a small tail
  * still improves the cheap token-prefix path, while text-prefix lookup handles
  * the cases where canonical prompt tokenization spells the same bytes
- * differently. The 2048 alignment also matches the Metal prefill chunk
- * schedule, which keeps compressor row finalization identical to a cold full
- * prompt. */
+ * differently. The alignment should match the backend prefill chunk schedule,
+ * which keeps compressor row finalization identical to a cold full prompt. */
 #define KV_CACHE_DEFAULT_BOUNDARY_TRIM_TOKENS 32
-#define KV_CACHE_DEFAULT_BOUNDARY_ALIGN_TOKENS 2048
+#define KV_CACHE_FALLBACK_BOUNDARY_ALIGN_TOKENS 2048
 #define KV_CACHE_DEFAULT_CONTINUED_INTERVAL_TOKENS 10000
 #define KV_CACHE_DEFAULT_MB 4096
 #define KV_EXT_TOOL_MAP (1u << 0)
@@ -7697,7 +7696,7 @@ static kv_cache_options kv_cache_default_options(void) {
         .cold_max_tokens = KV_CACHE_DEFAULT_COLD_MAX_TOKENS,
         .continued_interval_tokens = KV_CACHE_DEFAULT_CONTINUED_INTERVAL_TOKENS,
         .boundary_trim_tokens = KV_CACHE_DEFAULT_BOUNDARY_TRIM_TOKENS,
-        .boundary_align_tokens = KV_CACHE_DEFAULT_BOUNDARY_ALIGN_TOKENS,
+        .boundary_align_tokens = KV_CACHE_FALLBACK_BOUNDARY_ALIGN_TOKENS,
     };
 }
 
@@ -11002,7 +11001,7 @@ static void usage(FILE *fp) {
 "  --kv-cache-boundary-trim-tokens N\n"
 "      Trim this many tail tokens before cold boundary saves to avoid tokenizer boundary merges. Default: 32\n"
 "  --kv-cache-boundary-align-tokens N\n"
-"      Align cold boundary saves down to this token multiple. 0 disables alignment. Default: 2048\n"
+"      Align cold boundary saves down to this token multiple. 0 disables alignment. Default: backend prefill chunk (4096 on M5 Max Metal, otherwise 2048)\n"
 "  --kv-cache-reject-different-quant\n"
 "      Refuse checkpoints written by the same model with a different routed-expert quantization.\n"
 "  --disable-exact-dsml-tool-replay\n"
@@ -11064,6 +11063,7 @@ static server_config parse_options(int argc, char **argv) {
     c.kv_cache = kv_cache_default_options();
 
     bool directional_steering_scale_set = false;
+    bool kv_boundary_align_set = false;
     for (int i = 1; i < argc; i++) {
         const char *arg = argv[i];
         if (!strcmp(arg, "-h") || !strcmp(arg, "--help")) {
@@ -11103,6 +11103,7 @@ static server_config parse_options(int argc, char **argv) {
             c.kv_cache.boundary_trim_tokens = parse_nonneg_int_arg(need_arg(&i, argc, argv, arg), arg);
         } else if (!strcmp(arg, "--kv-cache-boundary-align-tokens")) {
             c.kv_cache.boundary_align_tokens = parse_nonneg_int_arg(need_arg(&i, argc, argv, arg), arg);
+            kv_boundary_align_set = true;
         } else if (!strcmp(arg, "--kv-cache-reject-different-quant")) {
             c.kv_cache_reject_different_quant = true;
         } else if (!strcmp(arg, "--disable-exact-dsml-tool-replay")) {
@@ -11135,6 +11136,9 @@ static server_config parse_options(int argc, char **argv) {
             exit(2);
         }
     }
+    if (!kv_boundary_align_set) {
+        c.kv_cache.boundary_align_tokens = ds4_backend_default_kv_boundary_align_tokens(c.engine.backend);
+    }
 
     if (c.kv_cache.cold_max_tokens > 0 && c.kv_cache.cold_max_tokens < c.kv_cache.min_tokens) {
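For reference, the arithmetic implied by "align cold boundary saves down to this token multiple" is a plain round-down; an illustrative helper (not the server's actual function name):

```c
#include <stdint.h>
#include <stdio.h>

/* Round a save position down to the alignment multiple; 0 disables it. */
static uint32_t align_down(uint32_t pos, uint32_t align) {
    return align ? pos - (pos % align) : pos;
}

int main(void) {
    printf("%u\n", (unsigned)align_down(9000, 4096)); /* 8192 on M5 Max Metal */
    printf("%u\n", (unsigned)align_down(9000, 2048)); /* 8192 with the fallback */
    printf("%u\n", (unsigned)align_down(9000, 0));    /* 9000, alignment disabled */
    return 0;
}
```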