From 2969cfa0d80c5b3066557ca72cd9ca07887145a1 Mon Sep 17 00:00:00 2001 From: Yiwei Shao <44545837+njsyw1997@users.noreply.github.com> Date: Fri, 3 Apr 2026 22:09:46 -0700 Subject: [PATCH 1/6] hexagon: add async HMX worker Introduce hmx-worker (dedicated thread for HMX compute) to overlap HMX matmul with HVX dequant/DMA stages in the pipeline path, replacing the previous synchronous HMX calls that blocked the main thread. --- ggml/src/ggml-hexagon/htp/CMakeLists.txt | 1 + ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 93 +++++++--- ggml/src/ggml-hexagon/htp/hmx-worker.c | 193 +++++++++++++++++++++ ggml/src/ggml-hexagon/htp/hmx-worker.h | 54 ++++++ ggml/src/ggml-hexagon/htp/htp-ctx.h | 5 + ggml/src/ggml-hexagon/htp/main.c | 16 +- 6 files changed, 336 insertions(+), 26 deletions(-) create mode 100644 ggml/src/ggml-hexagon/htp/hmx-worker.c create mode 100644 ggml/src/ggml-hexagon/htp/hmx-worker.h diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt index 2b60f427ad..1cddd6a2c0 100644 --- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt +++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt @@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx) if (_hmx_idx GREATER_EQUAL 0) target_sources(${HTP_LIB} PRIVATE + hmx-worker.c hmx-matmul-ops.c ) diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index ec191c1498..288a4caa6a 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -22,6 +22,7 @@ #include "htp-ctx.h" #include "htp-ops.h" +#include "hmx-worker.h" #include "hmx-utils.h" #include "hmx-ops.h" #include "hmx-profile.h" @@ -675,6 +676,39 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const } } +// --- Async HMX matmul job (for pipeline overlap) --- + +typedef struct { + __fp16 *output; + const __fp16 *activation; + const __fp16 *weight; + const __fp16 *scales; + int n_row_tiles; + int 
n_col_tiles; + int n_dot_tiles; +} hmx_matmul_job_t; + +static void hmx_matmul_worker_fn(void *data) { + hmx_matmul_job_t *job = (hmx_matmul_job_t *) data; + core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, + job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); +} + +static inline void hmx_matmul_job_init( + hmx_matmul_job_t *job, + __fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales, + int n_row_tiles, int n_col_tiles, int n_dot_tiles) { + job->output = output; + job->activation = activation; + job->weight = weight; + job->scales = scales; + job->n_row_tiles = n_row_tiles; + job->n_col_tiles = n_col_tiles; + job->n_dot_tiles = n_dot_tiles; +} + +// --- End async HMX matmul job --- + static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) { assert(n_cols % HMX_FP16_TILE_N_COLS == 0); const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS; @@ -1256,9 +1290,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols, (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - if (!use_pipeline) { + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { // transfer activation matrix chunk into VTCM size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); @@ -1318,20 +1351,23 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds TIMER_STOP(output_store); } } + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); } else { // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D) - // stage B and D (dequantize and store) are expected to be on the critical path + // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D). 
// A --> B: vtcm_qweight, 1 buffer // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers // C --> D: vtcm_output0/vtcm_output1, 2 buffers - // - // LD ||A3| | B3 || - // MM || C2 || - // ST || D1 | || + // Async timeline (C overlaps B+D): + // main+HVX: [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2] + // HMX worker: [████ C0 ████████][████ C1 ████████████][████ C2 ████████] int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols); + hmx_matmul_job_t job_slots[2]; // persistent double-buffered job descriptors + + hmx_worker_begin(ctx->hmx_worker); for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) { const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows); @@ -1352,31 +1388,33 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k); } - // prologue: B0, A1, C0, B1 + // prologue: B0, A1, submit C0 (async), B1 (overlaps C0) { - // B0 + // B0: wait for DMA, dequant weight chunk 0 dma_queue_pop(ctx->dma[0]); dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type); - // A1 + // A1: issue DMA for weight chunk 1 const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols); if (1 < n_chunk_cnt) { const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride; dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1); } - // C0 - core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, - hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + // submit C0 (non-blocking — HMX worker executes in parallel) + hmx_matmul_job_init(&job_slots[0], + (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, + 
hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[0]); - // B1 + // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker) if (1 < n_chunk_cnt) { dma_queue_pop(ctx->dma[0]); dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type); } } - // main loop + // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1}) for (int i = 0; i < n_chunk_cnt; ++i) { const size_t nc = i * n_chunk_n_cols; const size_t nc_p1 = nc + 1 * n_chunk_n_cols; @@ -1386,36 +1424,41 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols); const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols); - // issue A_{i+2} + // issue A_{i+2}: DMA push (non-blocking) if (i + 2 < n_chunk_cnt) { const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride; dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2); } - // wait for HMX (C_{i}) -- C_{i} is done - - // result of B_{i+1} (input of C_{i+1}) should be ready now + // wait C_i: block until prologue/previous C completes + hmx_worker_wait(ctx->hmx_worker); - // issue C_{i+1} + // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below) + // job_slots[(i+1)%2] is safe: C_i just completed, freeing slot i%2's + // counterpart — and (i+1)%2 was last used by C_{i-1} which completed + // before C_i was submitted. 
if (i + 1 < n_chunk_cnt) { - core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales, + hmx_matmul_job_init(&job_slots[(i + 1) % 2], + (__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, + (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]); } - // compute D_{i} + // D_i: store output (multi-thread HVX, parallel with C_{i+1}) float *output_chunk = dst + (mr * n + nc); transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n); - // wait for DMA (A_{i+2}), compute B_{i+2} + // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1}) if (i + 2 < n_chunk_cnt) { dma_queue_pop(ctx->dma[0]); dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type); } } } - } - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); + hmx_worker_end(ctx->hmx_worker); + } TIMER_STOP(total); diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.c b/ggml/src/ggml-hexagon/htp/hmx-worker.c new file mode 100644 index 0000000000..657d16b2a9 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-worker.c @@ -0,0 +1,193 @@ +#include "hmx-worker.h" + +#include +#include +#include +#include + +#include +#include + +// --------------------------------------------------------------------------- +// Internal types +// --------------------------------------------------------------------------- + +enum hmx_worker_cmd { + HMX_WORKER_CMD_BEGIN, // acquire HMX lock + HMX_WORKER_CMD_JOB, // execute fn(data) + HMX_WORKER_CMD_END, // release HMX lock + HMX_WORKER_CMD_KILL, // exit thread +}; + +struct hmx_worker_context { + // Command channel: main thread → worker + atomic_uint cmd_seqn; // bumped by main 
thread for each command + enum hmx_worker_cmd cmd_type; + hmx_worker_fn_t fn; + void *data; + + // Completion channel: worker → main thread + atomic_uint done_seqn; // set to cmd_seqn when command completes + + // Configuration + uint32_t vtcm_rctx; + + // Thread resources + qurt_thread_t thread; + void *stack; // single allocation: stack + context +}; + +// --------------------------------------------------------------------------- +// Worker thread entry point +// --------------------------------------------------------------------------- + +static void hmx_worker_main(void *arg) { + struct hmx_worker_context *ctx = (struct hmx_worker_context *) arg; + + FARF(HIGH, "hmx-worker: thread started"); + + unsigned int prev_seqn = 0; + for (;;) { + unsigned int seqn = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire); + if (seqn == prev_seqn) { + qurt_futex_wait(&ctx->cmd_seqn, prev_seqn); + continue; + } + prev_seqn = seqn; + + switch (ctx->cmd_type) { + case HMX_WORKER_CMD_BEGIN: + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); + break; + + case HMX_WORKER_CMD_JOB: + ctx->fn(ctx->data); + break; + + case HMX_WORKER_CMD_END: + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); + break; + + case HMX_WORKER_CMD_KILL: + atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release); + qurt_futex_wake(&ctx->done_seqn, 1); + FARF(HIGH, "hmx-worker: thread stopped"); + return; + } + + atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release); + qurt_futex_wake(&ctx->done_seqn, 1); + } +} + +// --------------------------------------------------------------------------- +// Internal helpers +// --------------------------------------------------------------------------- + +// Issue a command to the worker (non-blocking). 
+static void hmx_worker_issue(struct hmx_worker_context *ctx, + enum hmx_worker_cmd type, + hmx_worker_fn_t fn, void *data) { + ctx->cmd_type = type; + ctx->fn = fn; + ctx->data = data; + atomic_fetch_add_explicit(&ctx->cmd_seqn, 1, memory_order_release); + qurt_futex_wake(&ctx->cmd_seqn, 1); +} + +// Block until the worker has completed the most recently issued command. +static void hmx_worker_drain(struct hmx_worker_context *ctx) { + unsigned int expected = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire); + while (atomic_load_explicit(&ctx->done_seqn, memory_order_acquire) != expected) { + qurt_futex_wait(&ctx->done_seqn, + atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed)); + } +} + +// --------------------------------------------------------------------------- +// Public API +// --------------------------------------------------------------------------- + +#define LOWEST_USABLE_QURT_PRIO (254) + +AEEResult hmx_worker_init(hmx_worker_context_t *out, uint32_t stack_size, uint32_t vtcm_rctx) { + if (!out) { + return AEE_EBADPARM; + } + + // Single allocation: stack followed by context struct. + size_t total = stack_size + sizeof(struct hmx_worker_context); + unsigned char *blob = (unsigned char *) malloc(total); + if (!blob) { + FARF(ERROR, "hmx-worker: allocation failed (%zu bytes)", total); + return AEE_ENOMEMORY; + } + memset(blob, 0, total); + + struct hmx_worker_context *ctx = (struct hmx_worker_context *) (blob + stack_size); + ctx->stack = blob; + ctx->vtcm_rctx = vtcm_rctx; + atomic_init(&ctx->cmd_seqn, 0); + atomic_init(&ctx->done_seqn, 0); + + // Match caller thread priority (same pattern as worker-pool.c). 
+ int prio = qurt_thread_get_priority(qurt_thread_get_id()); + if (prio < 1) prio = 1; + if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO; + + qurt_thread_attr_t attr; + qurt_thread_attr_init(&attr); + qurt_thread_attr_set_stack_addr(&attr, blob); + qurt_thread_attr_set_stack_size(&attr, stack_size); + qurt_thread_attr_set_priority(&attr, prio); + qurt_thread_attr_set_name(&attr, "hmx_worker"); + + int err = qurt_thread_create(&ctx->thread, &attr, hmx_worker_main, ctx); + if (err) { + FARF(ERROR, "hmx-worker: thread create failed (%d)", err); + free(blob); + return AEE_EQURTTHREADCREATE; + } + + *out = ctx; + return AEE_SUCCESS; +} + +void hmx_worker_release(hmx_worker_context_t ctx) { + if (!ctx) return; + + // Tell the worker to exit. + hmx_worker_issue(ctx, HMX_WORKER_CMD_KILL, NULL, NULL); + hmx_worker_drain(ctx); + + int status; + qurt_thread_join(ctx->thread, &status); + + free(ctx->stack); +} + +AEEResult hmx_worker_begin(hmx_worker_context_t ctx) { + hmx_worker_issue(ctx, HMX_WORKER_CMD_BEGIN, NULL, NULL); + hmx_worker_drain(ctx); // wait until HMX lock is acquired + return AEE_SUCCESS; +} + +AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data) { + // Caller is expected to have called wait() for any previous job. + // Safety: drain any residual (should be instant in normal flow). 
+ hmx_worker_drain(ctx); + hmx_worker_issue(ctx, HMX_WORKER_CMD_JOB, fn, data); + return AEE_SUCCESS; +} + +AEEResult hmx_worker_wait(hmx_worker_context_t ctx) { + hmx_worker_drain(ctx); + return AEE_SUCCESS; +} + +AEEResult hmx_worker_end(hmx_worker_context_t ctx) { + hmx_worker_drain(ctx); // ensure no in-flight job + hmx_worker_issue(ctx, HMX_WORKER_CMD_END, NULL, NULL); + hmx_worker_drain(ctx); // wait until HMX lock is released + return AEE_SUCCESS; +} diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.h b/ggml/src/ggml-hexagon/htp/hmx-worker.h new file mode 100644 index 0000000000..9c0477b974 --- /dev/null +++ b/ggml/src/ggml-hexagon/htp/hmx-worker.h @@ -0,0 +1,54 @@ +#ifndef HMX_WORKER_H +#define HMX_WORKER_H + +// Async HMX worker: single dedicated thread for HMX compute, +// allowing the main thread to run HVX/DMA work in parallel. +// +// Lifecycle per matmul op: +// hmx_worker_begin — worker thread acquires HMX lock +// hmx_worker_submit — fire a job (non-blocking) +// hmx_worker_wait — block until current job completes +// ... — repeat submit/wait as needed +// hmx_worker_end — worker thread releases HMX lock +// +// Design: single-producer single-consumer, 1 in-flight job max. + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void (*hmx_worker_fn_t)(void *data); + +typedef struct hmx_worker_context *hmx_worker_context_t; + +// Create worker thread. Thread starts idle (no HMX lock held). +AEEResult hmx_worker_init(hmx_worker_context_t *ctx, uint32_t stack_size, uint32_t vtcm_rctx); + +// Destroy worker thread. Must not be called while a job is in-flight. +void hmx_worker_release(hmx_worker_context_t ctx); + +// Worker thread acquires HMX lock. Blocks until lock is held. +AEEResult hmx_worker_begin(hmx_worker_context_t ctx); + +// Submit a job (non-blocking). Caller must have called wait() for any +// previous job before submitting a new one. +// |data| must remain valid until the corresponding wait() returns. 
+AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data); + +// Block until the current in-flight job completes. +// Returns immediately if no job is in-flight. +AEEResult hmx_worker_wait(hmx_worker_context_t ctx); + +// Ensure no in-flight job, then worker thread releases HMX lock. +// Blocks until unlock is complete. +AEEResult hmx_worker_end(hmx_worker_context_t ctx); + +#ifdef __cplusplus +} +#endif + +#endif /* HMX_WORKER_H */ diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 4c36a6ea0c..7b09bb4f41 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -3,6 +3,7 @@ #include "hex-dma.h" #include "htp-ops.h" +#include "hmx-worker.h" #include "worker-pool.h" #include @@ -72,6 +73,10 @@ struct htp_context { atomic_bool vtcm_needs_release; struct htp_ops_context octx; + +#ifdef HTP_HAS_HMX + hmx_worker_context_t hmx_worker; // Async HMX worker for pipeline overlap +#endif }; int op_matmul(struct htp_ops_context * octx); diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c index 8b34703942..b6ca127c1d 100644 --- a/ggml/src/ggml-hexagon/htp/main.c +++ b/ggml/src/ggml-hexagon/htp/main.c @@ -324,6 +324,14 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que #ifdef HTP_HAS_HMX ctx->hmx_enabled = use_hmx; + ctx->hmx_worker = NULL; + if (use_hmx) { + AEEResult hmx_worker_err = hmx_worker_init(&ctx->hmx_worker, 8192, ctx->vtcm_rctx); + if (hmx_worker_err != AEE_SUCCESS) { + FARF(ERROR, "hmx_worker_init failed: %d", hmx_worker_err); + return hmx_worker_err; + } + } FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? 
"enabled" : "disabled", use_hmx); #endif @@ -389,7 +397,13 @@ AEEResult htp_iface_stop(remote_handle64 handle) { } #ifdef HTP_HAS_HMX - ctx->hmx_enabled = 0; + if (ctx->hmx_enabled) { + if (ctx->hmx_worker) { + hmx_worker_release(ctx->hmx_worker); + ctx->hmx_worker = NULL; + } + ctx->hmx_enabled = 0; + } #endif vtcm_free(ctx); From d7a8634de6a58c0f383b33727e3808fdcfbe4bb6 Mon Sep 17 00:00:00 2001 From: njsyw1997 <44545837+njsyw1997@users.noreply.github.com> Date: Tue, 7 Apr 2026 02:01:34 -0700 Subject: [PATCH 2/6] hexagon: cost-based VTCM chunk search for out-stationary matmul --- ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 205 ++++++++++++++------- ggml/src/ggml-hexagon/htp/hmx-worker.c | 105 ++++++----- ggml/src/ggml-hexagon/htp/hmx-worker.h | 8 +- 3 files changed, 193 insertions(+), 125 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index 288a4caa6a..92526273bc 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -16,15 +16,16 @@ #include "ggml-common.h" #include "hex-dma.h" +#include "worker-pool.h" + #include "hvx-utils.h" #include "hvx-dump.h" -#include "worker-pool.h" #include "htp-ctx.h" #include "htp-ops.h" -#include "hmx-worker.h" -#include "hmx-utils.h" #include "hmx-ops.h" +#include "hmx-utils.h" +#include "hmx-worker.h" #include "hmx-profile.h" static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { @@ -110,36 +111,45 @@ static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) { return false; } -// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget. +// Search for optimal (mc, nc) chunk sizes within VTCM budget. +// +// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead +// +// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost. 
+// All matmul paths repeat weight processing per M-block and activation loading +// per N-block, so discrete block counts drive total overhead. +// Tie-break: when cost is equal, prefer larger mc * nc. // -// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead -// per_n_cost: bytes per nc column (weight + scratch buffers) -// per_m_cost: bytes per mc row (activation) -// per_mn_cost: bytes per mc*nc element (output) -// overhead: fixed bytes (scales 256B, eye_tile 2048B, etc.) +// Caller-provided coefficients: +// m_block_cost: penalty per extra M-block (weight redundancy, scales with n). +// n_block_cost: penalty per extra N-block (activation redundancy, scales with m). // // Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max. // Returns 0 on success, -1 if VTCM is insufficient. -static int hmx_compute_chunks( - size_t vtcm_total, size_t overhead, - size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost, - int m, int n, - size_t *m_chunk_out, size_t *n_chunk_out, - size_t *total_out) -{ +static int hmx_compute_chunks(size_t vtcm_total, + size_t overhead, + size_t per_n_cost, + size_t per_m_cost, + size_t per_mn_cost, + int m, + int n, + size_t m_block_cost, + size_t n_block_cost, + size_t * m_chunk_out, + size_t * n_chunk_out, + size_t * total_out) { if (m <= 0 || n <= 0) return -1; if (vtcm_total <= overhead) return -1; if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1; const size_t usable = vtcm_total - overhead; - size_t best_mn = 0, best_m = 0, best_n = 0; + + size_t best_cost = SIZE_MAX; + size_t best_mn = 0; + size_t best_m = 0, best_n = 0; const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS); for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) { - // Early exit: if nc * m_max cannot beat best, smaller nc won't either - if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn) - break; - size_t n_fixed = 0, ncmn = 0, 
mc_denom = 0; if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue; if (n_fixed >= usable) goto next_nc; @@ -153,10 +163,19 @@ static int hmx_compute_chunks( mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS); mc = hex_smin(mc, (size_t)m); - if (mc > 0 && mc * nc > best_mn) { - best_mn = mc * nc; - best_m = mc; - best_n = nc; + if (mc == 0) { + goto next_nc; + } + + size_t mblocks = ((size_t) m + mc - 1) / mc; + size_t nblocks = ((size_t) n + nc - 1) / nc; + size_t cost = mblocks * m_block_cost + nblocks * n_block_cost; + size_t mn = mc * nc; + if (cost < best_cost || (cost == best_cost && mn > best_mn)) { + best_cost = cost; + best_mn = mn; + best_m = mc; + best_n = nc; } } @@ -679,25 +698,29 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const // --- Async HMX matmul job (for pipeline overlap) --- typedef struct { - __fp16 *output; - const __fp16 *activation; - const __fp16 *weight; - const __fp16 *scales; - int n_row_tiles; - int n_col_tiles; - int n_dot_tiles; + __fp16 * output; + const __fp16 * activation; + const __fp16 * weight; + const __fp16 * scales; + int n_row_tiles; + int n_col_tiles; + int n_dot_tiles; } hmx_matmul_job_t; -static void hmx_matmul_worker_fn(void *data) { - hmx_matmul_job_t *job = (hmx_matmul_job_t *) data; - core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, - job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles); +static void hmx_matmul_worker_fn(void * data) { + hmx_matmul_job_t * job = (hmx_matmul_job_t *) data; + core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles, + job->n_dot_tiles); } -static inline void hmx_matmul_job_init( - hmx_matmul_job_t *job, - __fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales, - int n_row_tiles, int n_col_tiles, int n_dot_tiles) { +static inline void hmx_matmul_job_init(hmx_matmul_job_t * job, + __fp16 * output, + const __fp16 * activation, + const __fp16 * 
weight, + const __fp16 * scales, + int n_row_tiles, + int n_col_tiles, + int n_dot_tiles) { job->output = output; job->activation = activation; job->weight = weight; @@ -866,12 +889,13 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0; size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0; + // FP16 weight: interleave and activation load have similar per-element cost. if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, - /*per_n=*/3 * vec_dot_size, - /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m, - /*per_mn=*/sizeof(__fp16), - params->m, params->n, - &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { + /*per_n=*/3 * vec_dot_size, + /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m, + /*per_mn=*/sizeof(__fp16), params->m, params->n, + /*m_block_cost=*/(size_t) params->n, + /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__); return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params); } @@ -1040,13 +1064,15 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co const size_t f32_scratch_per_m = use_dma_activation ? (size_t) k * sizeof(float) : 0; size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0; + // FP16 weight: interleave and activation load have similar per-element cost. 
if (hmx_compute_chunks(vtcm_budget, - /*overhead=*/ 256, - /*per_n=*/ 3 * vec_dot_size, // W + S0 + S1 - /*per_m=*/ vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch - /*per_mn=*/ sizeof(__fp16), // O - m, n, - &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { + /*overhead=*/256, + /*per_n=*/3 * vec_dot_size, // W + S0 + S1 + /*per_m=*/vec_dot_size + f32_scratch_per_m, // A + optional F32 scratch + /*per_mn=*/sizeof(__fp16), // O + m, n, + /*m_block_cost=*/(size_t) n, + /*n_block_cost=*/(size_t) m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget); return -1; } @@ -1191,6 +1217,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m, int k, int n, int w_type); +#define FALLBACK_TO_STANDARD 1 + int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation, const uint8_t *restrict permuted_weight, int m, int k, int n, int weight_type) { @@ -1203,9 +1231,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds // for large m, k (e.g. 
prefill FFN Down), use out-stationary version if (m >= 128 && k > n && n > 1024) { - FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)", - m, k, n, weight_type, (k + 511) / 512); - return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type); + int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type); + if (rc != FALLBACK_TO_STANDARD) { + return rc; // 0 success, -1 error + } + FARF(MEDIUM, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n); + // fall through to standard path } size_t row_stride = get_x4x2_row_stride(weight_type, k); @@ -1231,9 +1262,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds } size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0; - if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, - per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, - m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { + // Quantized weight: dequant ~1.5x more expensive per element than activation load. 
+ if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, m, n, + /*m_block_cost=*/(size_t) n * 3, + /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) { FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)", __func__, m, k, n, use_pipeline, vtcm_budget); return -1; @@ -1402,9 +1434,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds } // submit C0 (non-blocking — HMX worker executes in parallel) - hmx_matmul_job_init(&job_slots[0], - (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, - hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, + (__fp16 *) vtcm_weight_bufs[0], vtcm_scales, + hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), + hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[0]); // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker) @@ -1438,10 +1471,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds // counterpart — and (i+1)%2 was last used by C_{i-1} which completed // before C_i was submitted. 
if (i + 1 < n_chunk_cnt) { - hmx_matmul_job_init(&job_slots[(i + 1) % 2], - (__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, - (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales, - hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); + hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2], + (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], + vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), + hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS); hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]); } @@ -1583,12 +1616,41 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict const size_t vtcm_budget = ctx->vtcm_size; - const size_t M_BLOCK_SIZE = 512; - const size_t N_BLOCK_SIZE = 512; - const size_t K_BLOCK_SIZE = 512; + const size_t K_BLOCK_SIZE = 1024; + + // Fallback: if k doesn't need K-blocking, out-stationary has no advantage + const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE; + if (k_iters_check <= 1) { + FARF(MEDIUM, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k); + return FALLBACK_TO_STANDARD; + } - // Compute precise buffer sizes + // Dynamic M,N search via hmx_compute_chunks const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE); + const size_t per_m = K_BLOCK_SIZE * sizeof(float) // scratch1: M×K×4 (act DMA staging F32) + + K_BLOCK_SIZE * sizeof(__fp16); // activation: M×K×2 (F16 tiles) + const size_t per_n = sub_row_stride_alloc // scratch0: N×sub_row(K) (packed quant) + + K_BLOCK_SIZE * sizeof(__fp16); // weight: N×K×2 (F16 tiles) + const size_t per_mn = sizeof(__fp16); // output: M×N×2 (out-stationary) + // Alignment margin: hex_align_up can add up to 2047 bytes per buffer; + // scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need 
margin + const size_t align_margin = 4 * HMX_FP16_TILE_SIZE; + const size_t overhead = HMX_FP16_TILE_SIZE + 256 + align_margin; // eye_tile + scales + alignment + + size_t M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used; + // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost. + // From profiling: wt_dequant per element ≈ 1.5× activation load per element. + // m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive). + // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper). + const size_t m_block_cost = (size_t) n * 3; + const size_t n_block_cost = (size_t) m * 2; + if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE, + &N_BLOCK_SIZE, &vtcm_used) != 0) { + FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget); + return -1; + } + + // Compute precise buffer sizes from searched M,N and fixed K const size_t weight_size = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); const size_t act_size = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); const size_t out_size = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE); @@ -1597,7 +1659,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256; if (total_vtcm > vtcm_budget) { - FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n); + FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm, + vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE); return -1; } @@ -1611,8 +1674,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict __fp16 *vtcm_scales = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256); 
assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget); - FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type, - (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget); + FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type, + M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget); // initialize eye tile (32x32 identity matrix) { diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.c b/ggml/src/ggml-hexagon/htp/hmx-worker.c index 657d16b2a9..6826dde7de 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-worker.c +++ b/ggml/src/ggml-hexagon/htp/hmx-worker.c @@ -1,48 +1,47 @@ #include "hmx-worker.h" +#include +#include #include #include #include #include -#include -#include - // --------------------------------------------------------------------------- // Internal types // --------------------------------------------------------------------------- enum hmx_worker_cmd { HMX_WORKER_CMD_BEGIN, // acquire HMX lock - HMX_WORKER_CMD_JOB, // execute fn(data) - HMX_WORKER_CMD_END, // release HMX lock - HMX_WORKER_CMD_KILL, // exit thread + HMX_WORKER_CMD_JOB, // execute fn(data) + HMX_WORKER_CMD_END, // release HMX lock + HMX_WORKER_CMD_KILL, // exit thread }; struct hmx_worker_context { // Command channel: main thread → worker - atomic_uint cmd_seqn; // bumped by main thread for each command - enum hmx_worker_cmd cmd_type; - hmx_worker_fn_t fn; - void *data; + atomic_uint cmd_seqn; // bumped by main thread for each command + enum hmx_worker_cmd cmd_type; + hmx_worker_fn_t fn; + void * data; // Completion channel: worker → main thread - atomic_uint done_seqn; // set to cmd_seqn when command completes + atomic_uint done_seqn; // set to cmd_seqn when command completes // Configuration - uint32_t vtcm_rctx; + uint32_t vtcm_rctx; // Thread resources - qurt_thread_t thread; - void *stack; // single allocation: stack + context
+ qurt_thread_t thread; + void * stack; // single allocation: stack + context }; // --------------------------------------------------------------------------- // Worker thread entry point // --------------------------------------------------------------------------- -static void hmx_worker_main(void *arg) { - struct hmx_worker_context *ctx = (struct hmx_worker_context *) arg; +static void hmx_worker_main(void * arg) { + struct hmx_worker_context * ctx = (struct hmx_worker_context *) arg; FARF(HIGH, "hmx-worker: thread started"); @@ -56,23 +55,23 @@ static void hmx_worker_main(void *arg) { prev_seqn = seqn; switch (ctx->cmd_type) { - case HMX_WORKER_CMD_BEGIN: - HAP_compute_res_hmx_lock(ctx->vtcm_rctx); - break; - - case HMX_WORKER_CMD_JOB: - ctx->fn(ctx->data); - break; - - case HMX_WORKER_CMD_END: - HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); - break; - - case HMX_WORKER_CMD_KILL: - atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release); - qurt_futex_wake(&ctx->done_seqn, 1); - FARF(HIGH, "hmx-worker: thread stopped"); - return; + case HMX_WORKER_CMD_BEGIN: + HAP_compute_res_hmx_lock(ctx->vtcm_rctx); + break; + + case HMX_WORKER_CMD_JOB: + ctx->fn(ctx->data); + break; + + case HMX_WORKER_CMD_END: + HAP_compute_res_hmx_unlock(ctx->vtcm_rctx); + break; + + case HMX_WORKER_CMD_KILL: + atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release); + qurt_futex_wake(&ctx->done_seqn, 1); + FARF(HIGH, "hmx-worker: thread stopped"); + return; } atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release); @@ -85,9 +84,10 @@ static void hmx_worker_main(void *arg) { // --------------------------------------------------------------------------- // Issue a command to the worker (non-blocking). 
-static void hmx_worker_issue(struct hmx_worker_context *ctx, - enum hmx_worker_cmd type, - hmx_worker_fn_t fn, void *data) { +static void hmx_worker_issue(struct hmx_worker_context * ctx, + enum hmx_worker_cmd type, + hmx_worker_fn_t fn, + void * data) { ctx->cmd_type = type; ctx->fn = fn; ctx->data = data; @@ -96,11 +96,10 @@ static void hmx_worker_issue(struct hmx_worker_context *ctx, } // Block until the worker has completed the most recently issued command. -static void hmx_worker_drain(struct hmx_worker_context *ctx) { +static void hmx_worker_drain(struct hmx_worker_context * ctx) { unsigned int expected = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire); while (atomic_load_explicit(&ctx->done_seqn, memory_order_acquire) != expected) { - qurt_futex_wait(&ctx->done_seqn, - atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed)); + qurt_futex_wait(&ctx->done_seqn, atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed)); } } @@ -110,30 +109,34 @@ static void hmx_worker_drain(struct hmx_worker_context *ctx) { #define LOWEST_USABLE_QURT_PRIO (254) -AEEResult hmx_worker_init(hmx_worker_context_t *out, uint32_t stack_size, uint32_t vtcm_rctx) { +AEEResult hmx_worker_init(hmx_worker_context_t * out, uint32_t stack_size, uint32_t vtcm_rctx) { if (!out) { return AEE_EBADPARM; } // Single allocation: stack followed by context struct. 
- size_t total = stack_size + sizeof(struct hmx_worker_context); - unsigned char *blob = (unsigned char *) malloc(total); + size_t total = stack_size + sizeof(struct hmx_worker_context); + unsigned char * blob = (unsigned char *) malloc(total); if (!blob) { FARF(ERROR, "hmx-worker: allocation failed (%zu bytes)", total); return AEE_ENOMEMORY; } memset(blob, 0, total); - struct hmx_worker_context *ctx = (struct hmx_worker_context *) (blob + stack_size); - ctx->stack = blob; - ctx->vtcm_rctx = vtcm_rctx; - atomic_init(&ctx->cmd_seqn, 0); + struct hmx_worker_context * ctx = (struct hmx_worker_context *) (blob + stack_size); + ctx->stack = blob; + ctx->vtcm_rctx = vtcm_rctx; + atomic_init(&ctx->cmd_seqn, 0); atomic_init(&ctx->done_seqn, 0); // Match caller thread priority (same pattern as worker-pool.c). int prio = qurt_thread_get_priority(qurt_thread_get_id()); - if (prio < 1) prio = 1; - if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO; + if (prio < 1) { + prio = 1; + } + if (prio > LOWEST_USABLE_QURT_PRIO) { + prio = LOWEST_USABLE_QURT_PRIO; + } qurt_thread_attr_t attr; qurt_thread_attr_init(&attr); @@ -154,7 +157,9 @@ AEEResult hmx_worker_init(hmx_worker_context_t *out, uint32_t stack_size, uint32 } void hmx_worker_release(hmx_worker_context_t ctx) { - if (!ctx) return; + if (!ctx) { + return; + } // Tell the worker to exit. hmx_worker_issue(ctx, HMX_WORKER_CMD_KILL, NULL, NULL); @@ -172,7 +177,7 @@ AEEResult hmx_worker_begin(hmx_worker_context_t ctx) { return AEE_SUCCESS; } -AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data) { +AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void * data) { // Caller is expected to have called wait() for any previous job. // Safety: drain any residual (should be instant in normal flow). 
hmx_worker_drain(ctx); diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.h b/ggml/src/ggml-hexagon/htp/hmx-worker.h index 9c0477b974..36aec15858 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-worker.h +++ b/ggml/src/ggml-hexagon/htp/hmx-worker.h @@ -21,12 +21,12 @@ extern "C" { #endif -typedef void (*hmx_worker_fn_t)(void *data); +typedef void (*hmx_worker_fn_t)(void * data); -typedef struct hmx_worker_context *hmx_worker_context_t; +typedef struct hmx_worker_context * hmx_worker_context_t; // Create worker thread. Thread starts idle (no HMX lock held). -AEEResult hmx_worker_init(hmx_worker_context_t *ctx, uint32_t stack_size, uint32_t vtcm_rctx); +AEEResult hmx_worker_init(hmx_worker_context_t * ctx, uint32_t stack_size, uint32_t vtcm_rctx); // Destroy worker thread. Must not be called while a job is in-flight. void hmx_worker_release(hmx_worker_context_t ctx); @@ -37,7 +37,7 @@ AEEResult hmx_worker_begin(hmx_worker_context_t ctx); // Submit a job (non-blocking). Caller must have called wait() for any // previous job before submitting a new one. // |data| must remain valid until the corresponding wait() returns. -AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data); +AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void * data); // Block until the current in-flight job completes. // Returns immediately if no job is in-flight. 
From 266120dfe668512f812fbe62e8dcf5aa1d712c27 Mon Sep 17 00:00:00 2001 From: njsyw1997 <44545837+njsyw1997@users.noreply.github.com> Date: Sat, 11 Apr 2026 16:32:52 -0700 Subject: [PATCH 3/6] hexagon: fix futex race in hmx_worker_drain Store the observed done_seqn value in a local variable to avoid loading the atomic twice --- ggml/src/ggml-hexagon/htp/hmx-worker.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.c b/ggml/src/ggml-hexagon/htp/hmx-worker.c index 6826dde7de..013173c7e5 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-worker.c +++ b/ggml/src/ggml-hexagon/htp/hmx-worker.c @@ -98,8 +98,15 @@ static void hmx_worker_issue(struct hmx_worker_context * ctx, // Block until the worker has completed the most recently issued command. static void hmx_worker_drain(struct hmx_worker_context * ctx) { unsigned int expected = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire); - while (atomic_load_explicit(&ctx->done_seqn, memory_order_acquire) != expected) { - qurt_futex_wait(&ctx->done_seqn, atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed)); + for (;;) { + unsigned int seen = atomic_load_explicit(&ctx->done_seqn, memory_order_acquire); + if (seen == expected) { + return; + } + // Pass the same observed value to futex_wait(). If the worker completes + // between the load above and the futex call, the value mismatch makes the + // wait return immediately instead of sleeping forever on the new seqn.
+ qurt_futex_wait(&ctx->done_seqn, seen); } } From f5833aac101d06a8a45d4dff37267293aad83a9f Mon Sep 17 00:00:00 2001 From: Kim-Chyan Gan Date: Fri, 10 Apr 2026 15:07:01 -0700 Subject: [PATCH 4/6] hex-mm: hmx optimize scatter/transpose and use HMX intrinsics --- ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 135 +++++++++++---------- ggml/src/ggml-hexagon/htp/hmx-utils.h | 56 --------- ggml/src/ggml-hexagon/htp/hvx-base.h | 5 + 3 files changed, 77 insertions(+), 119 deletions(-) diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c index 92526273bc..fd64a1d0d3 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c +++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c @@ -49,7 +49,8 @@ static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = { static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = { 0*128, 1*128, 2*128, 3*128, 4*128, 5*128, 6*128, 7*128, 8*128, 9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16*128, 17*128, 18*128, 19*128, 20*128, 21*128, 22*128, 23*128, + 24*128, 25*128, 26*128, 27*128, 28*128, 29*128, 30*128, 31*128 }; // Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes @@ -253,7 +254,7 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx( const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); HVX_Vector v_scales = hvx_vec_splat_f16(*scale); // q4x4x2 stores two int4 values per byte. Keep only the selected nibble. - HVX_Vector v_quants = upper_nibbles ? 
Q6_Vub_vlsr_VubR(vq, 4) : vq; + HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); v_quants = Q6_V_vand_VV(v_quants, mask_h4); // Shuffle before LUT v_quants = Q6_Vb_vshuff_Vb(v_quants); @@ -277,7 +278,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx( // Load all 128 packed bytes (4 contiguous 32-byte groups) HVX_Vector vq = hvx_vmemu(packed_128); const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F); - HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq; + HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles); v_quants = Q6_V_vand_VV(v_quants, mask_h4); // Shuffle before LUT @@ -297,10 +298,8 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx( v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23)); // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter - out[0] = v_lo; // group0 already in [0:63] - out[1] = Q6_V_vror_VR(v_lo, 64); // group1 rotated to [0:63] - out[2] = v_hi; // group2 already in [0:63] - out[3] = Q6_V_vror_VR(v_hi, 64); // group3 rotated to [0:63] + out[0] = v_lo; // group0 already in [0:63] + out[1] = v_hi; // group2 already in [0:63] } // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes. @@ -404,8 +403,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( size_t row_stride, int weight_type, int start_tile, int end_tile) { - const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS; - const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2); + const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS; + const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL); + const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block; const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) : (weight_type == HTP_TYPE_MXFP4) ? 
hvx_vmem(mxfp4_to_fp16_lut) : @@ -418,47 +418,46 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( const HVX_Vector v_scat_step = Q6_V_vsplat_R(4); // 4 bytes = 1 column step const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64); // first 16 words (64 bytes) - for (int t = start_tile; t < end_tile; ) { - int ct = t / n_k_tiles; // column tile index - int kt = t % n_k_tiles; // K tile index + unsigned ct = (unsigned)start_tile / n_k_tiles; // column tile index + unsigned kt = (unsigned)start_tile % n_k_tiles; // K tile index + for (unsigned t = start_tile; t < end_tile; ) { + if (kt >= n_k_tiles) { kt = 0; ct++; } - // --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row --- - if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) && - ((t + 3) / n_k_tiles == ct)) { - int blk_idx = (kt * 32) / QK_Q4_0x4x2; - int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4 - bool upper = (sub_blk_base >= 4); - int packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes - int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE - + sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales + // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row --- + if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) { + unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; + unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32; // 0 or 4 + bool upper = (sub_blk_base >= 4); + unsigned packed_off = blk_idx * (QK_Q4_0x4x2 / 2); // 128 contiguous packed bytes + unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + + sub_blk_base * (int)sizeof(__fp16); // 4 consecutive scales __fp16 *tile_bases[4]; - for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; } + for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; } HVX_Vector v_off = v_scat_base; 
- for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - const uint8_t *r0 = vtcm_src + row0 * row_stride; - const uint8_t *r1 = vtcm_src + row1 * row_stride; - HVX_Vector v0[4], v1[4]; - dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0); - if (row1 < n_cols) { - dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1); - } else { - v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero(); - } + unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride; + unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; - for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); } + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { + HVX_Vector v0[2]; + const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride; + dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0); + Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]); + Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]); v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); - for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); } + + + r0 = vtcm_src + row_offset; row_offset += row_stride; + dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0); + Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]); + Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]); v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step); } for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); } - - t += 4; + t += 4; kt += 4; continue; } @@ -515,20 +514,19 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( // --- 
Single-tile fallback --- __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS; - if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) { - int blk_idx = (kt * 32) / QK_Q4_0x4x2; - int sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; - bool upper = (sub_blk >= 4); - int byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; - int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); + if (is_q4) { + unsigned blk_idx = (kt * 32) / QK_Q4_0x4x2; + unsigned sub_blk = ((kt * 32) % QK_Q4_0x4x2) / 32; + bool upper = (sub_blk >= 4); + unsigned byte_off = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32; + unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16); HVX_Vector v_off = v_scat_base; // reset to column 0 - for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) { - int row0 = ct * HMX_FP16_TILE_N_COLS + r; - int row1 = row0 + 1; - - const uint8_t *r0 = vtcm_src + row0 * row_stride; - const uint8_t *r1 = vtcm_src + row1 * row_stride; + unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride; + unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1; + for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) { + const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride; + const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride; HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx( r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt); @@ -605,7 +603,7 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task( } (void) *(volatile HVX_Vector *)(tile_base); } - ++t; + ++t; ++kt; } // Drain HVX scatter write buffer: a vmem load on the same HW thread retires @@ -673,9 +671,13 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles( // --- End x4x2 dequantizers --- // requires external HMX lock -static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales, +static void 
core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales, int n_row_tiles, int n_col_tiles, int n_dot_tiles) { - hmx_set_output_scales(scales); + __builtin_assume(n_row_tiles > 0); + __builtin_assume(n_col_tiles > 0); + __builtin_assume(n_dot_tiles > 0); + + Q6_bias_mxmem2_A((void *)scales); for (int r = 0; r < n_row_tiles; ++r) { for (int c = 0; c < n_col_tiles; ++c) { @@ -685,12 +687,14 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS; for (int k = 0; k < n_dot_tiles; ++k) { - int offset = k * HMX_FP16_TILE_N_ELMS; - hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset); + Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047); + Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047); + row_tiles += HMX_FP16_TILE_N_ELMS; + col_tiles += HMX_FP16_TILE_N_ELMS; } __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS; - hmx_consume_accumulator_fp16(out_tile); + Q6_mxmem_AR_after_hf(out_tile, 0); } } } @@ -1510,10 +1514,13 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds } // C += AB -void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile, +void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile, int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) { + __builtin_assume(n_row_tiles > 0); + __builtin_assume(n_col_tiles > 0); + __builtin_assume(n_dot_tiles > 0); - hmx_set_output_scales(col_scales); + Q6_bias_mxmem2_A((void *)col_scales); for (int i = 0; i < n_row_tiles; ++i) { for (int j = 0; j < n_col_tiles; ++j) { @@ -1524,15 +1531,17 @@ void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp __fp16 *accum_tile 
= c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS; if (!zero_init) { - hmx_load_tile_pair_fp16(accum_tile, eye_tile); + Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, 2047); + Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, 2047); } for (int k = 0; k < n_dot_tiles; ++k) { - int offset = k * HMX_FP16_TILE_N_ELMS; - hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset); + Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047); + Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047); + row_tiles += HMX_FP16_TILE_N_ELMS; + col_tiles += HMX_FP16_TILE_N_ELMS; } - - hmx_consume_accumulator_fp16(accum_tile); + Q6_mxmem_AR_after_hf(accum_tile, 0); } } } diff --git a/ggml/src/ggml-hexagon/htp/hmx-utils.h b/ggml/src/ggml-hexagon/htp/hmx-utils.h index aacfbcda28..af04619ceb 100644 --- a/ggml/src/ggml-hexagon/htp/hmx-utils.h +++ b/ggml/src/ggml-hexagon/htp/hmx-utils.h @@ -14,10 +14,6 @@ #define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline)) -static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) { - asm volatile("bias = mxmem2(%0)" :: "r"(scales)); -} - // Initialise aligned 256-byte area with scale vector + zero padding. static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) { HVX_Vector *pv = (HVX_Vector *)out_scales; @@ -25,58 +21,6 @@ static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vecto *pv = Q6_V_vzero(); } -// Load multiple contiguous tiles with :deep streaming. -// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt]. -// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank -// boundary, otherwise the mxmem instruction will raise a precise bus error. -// Callers must ensure their VTCM layout satisfies this constraint. 
-static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles, - const __fp16 *col_tiles, - size_t n_tiles) { - size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1; - asm volatile( - "{ activation.hf = mxmem(%0, %1):deep\n" - "weight.hf = mxmem(%2, %3) }\n" - :: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit) - : "memory"); -} - -// Load a single activation+weight tile pair (no :deep streaming). -// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula -// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047. -// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation -// places a tile near a 4 MB bank boundary, the oversized region crosses it and -// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly -// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047). -static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile, - const __fp16 *wt_tile) { - asm volatile( - "{ activation.hf = mxmem(%0, %1)\n" - "weight.hf = mxmem(%2, %3) }\n" - :: "r"(act_tile), "r"(2047), - "r"(wt_tile), "r"(2047) - : "memory"); -} - -static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) { - // Use the combined convert-and-store instruction (matches the reference - // Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence - // "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter. - asm volatile( - "mxmem(%0, %1):after.hf = acc\n" - :: "r"(out), "r"(0) - : "memory"); -} - -// Compute inner product of two vectors of tiles and store result. 
-static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out, - const __fp16 *row_tiles, - const __fp16 *col_tiles, - size_t n_tiles) { - hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles); - hmx_consume_accumulator_fp16(out); -} - // --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) --- static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) { diff --git a/ggml/src/ggml-hexagon/htp/hvx-base.h b/ggml/src/ggml-hexagon/htp/hvx-base.h index db05ab40d2..ed6026e762 100644 --- a/ggml/src/ggml-hexagon/htp/hvx-base.h +++ b/ggml/src/ggml-hexagon/htp/hvx-base.h @@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) { } static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) { +#if __HVX_ARCH__ >= 81 + HVX_Vector q0 = Q6_Vqf32_equals_Vsf(v0); + HVX_Vector q1 = Q6_Vqf32_equals_Vsf(v1); +#else const HVX_Vector zero = Q6_V_vzero(); HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero); HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero); +#endif return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0)); } From 56ae47942c4bca918a96d678d7f4639b9bff34ed Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Sat, 11 Apr 2026 17:52:52 -0700 Subject: [PATCH 5/6] hex-vmem: drop vmem limit a touch under 3GB on v73 --- ggml/src/ggml-hexagon/htp/htp-ops.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h index 44a6ab4f73..fa84b674cd 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ops.h +++ b/ggml/src/ggml-hexagon/htp/htp-ops.h @@ -91,7 +91,12 @@ enum htp_op_code { #define HTP_OP_MAX_BUFS 8 #define HTP_OP_MAX_REQS 256 #define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS) + +#if __HVX_ARCH__ < 75 +#define HTP_OP_MAX_VMEM (3167538380u) +#else #define HTP_OP_MAX_VMEM (3221225472u) +#endif enum htp_tensor_flags { HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights) From 
0d7997717c60c9d9665f745a4d10c8b3f2bdcc91 Mon Sep 17 00:00:00 2001 From: njsyw1997 <44545837+njsyw1997@users.noreply.github.com> Date: Sat, 11 Apr 2026 18:19:58 -0700 Subject: [PATCH 6/6] hexagon: add fwd declaration of htp_context --- ggml/src/ggml-hexagon/htp/htp-ctx.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h index 7b09bb4f41..65eaf1b4bc 100644 --- a/ggml/src/ggml-hexagon/htp/htp-ctx.h +++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h @@ -31,6 +31,8 @@ struct htp_spad { uint32_t size_per_thread; // size per thread }; +struct htp_context; + // Context while processing an Op // TODO: fold this into the main context struct htp_ops_context {