From 2969cfa0d80c5b3066557ca72cd9ca07887145a1 Mon Sep 17 00:00:00 2001
From: Yiwei Shao <44545837+njsyw1997@users.noreply.github.com>
Date: Fri, 3 Apr 2026 22:09:46 -0700
Subject: [PATCH 1/6] hexagon: add async HMX worker

Introduce hmx-worker (dedicated thread for HMX compute) to overlap HMX
matmul with HVX dequant/DMA stages in the pipeline path, replacing the
previous synchronous HMX calls that blocked the main thread.
---
 ggml/src/ggml-hexagon/htp/CMakeLists.txt   |   1 +
 ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c |  93 +++++++---
 ggml/src/ggml-hexagon/htp/hmx-worker.c     | 193 +++++++++++++++++++++
 ggml/src/ggml-hexagon/htp/hmx-worker.h     |  54 ++++++
 ggml/src/ggml-hexagon/htp/htp-ctx.h        |   5 +
 ggml/src/ggml-hexagon/htp/main.c           |  16 +-
 6 files changed, 336 insertions(+), 26 deletions(-)
 create mode 100644 ggml/src/ggml-hexagon/htp/hmx-worker.c
 create mode 100644 ggml/src/ggml-hexagon/htp/hmx-worker.h

diff --git a/ggml/src/ggml-hexagon/htp/CMakeLists.txt b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
index 2b60f427ad..1cddd6a2c0 100644
--- a/ggml/src/ggml-hexagon/htp/CMakeLists.txt
+++ b/ggml/src/ggml-hexagon/htp/CMakeLists.txt
@@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)
 
 if (_hmx_idx GREATER_EQUAL 0)
     target_sources(${HTP_LIB} PRIVATE
+        hmx-worker.c
         hmx-matmul-ops.c
     )
 
diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index ec191c1498..288a4caa6a 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -22,6 +22,7 @@
 #include "htp-ctx.h"
 #include "htp-ops.h"
 
+#include "hmx-worker.h"
 #include "hmx-utils.h"
 #include "hmx-ops.h"
 #include "hmx-profile.h"
@@ -675,6 +676,39 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const
     }
 }
 
+// --- Async HMX matmul job (for pipeline overlap) ---
+
+typedef struct {
+    __fp16       *output;
+    const __fp16 *activation;
+    const __fp16 *weight;
+    const __fp16 *scales;
+    int           n_row_tiles;
+    int           n_col_tiles;
+    int           n_dot_tiles;
+} hmx_matmul_job_t;
+
+static void hmx_matmul_worker_fn(void *data) {
+    hmx_matmul_job_t *job = (hmx_matmul_job_t *) data;
+    core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales,
+                        job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
+}
+
+static inline void hmx_matmul_job_init(
+    hmx_matmul_job_t *job,
+    __fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
+    int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
+    job->output      = output;
+    job->activation  = activation;
+    job->weight      = weight;
+    job->scales      = scales;
+    job->n_row_tiles = n_row_tiles;
+    job->n_col_tiles = n_col_tiles;
+    job->n_dot_tiles = n_dot_tiles;
+}
+
+// --- End async HMX matmul job ---
+
 static void transfer_output_chunk_fp16_to_fp32(float *restrict dst, const __fp16 *restrict vtcm_src, int n_rows, int n_cols, int n) {
     assert(n_cols % HMX_FP16_TILE_N_COLS == 0);
     const int n_col_tiles = n_cols / HMX_FP16_TILE_N_COLS;
@@ -1256,9 +1290,8 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
          use_pipeline ? "PIPELINE" : "SEQUENTIAL", m_chunk_n_rows, n_chunk_n_cols,
          (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
 
-    HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-
     if (!use_pipeline) {
+        HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
         for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
             // transfer activation matrix chunk into VTCM
             size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
@@ -1318,20 +1351,23 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                 TIMER_STOP(output_store);
             }
         }
+        HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
     } else {
         // 4-stage pipeline: DMA load (A), dequantize (B), HMX matmul (C), store (D)
-        // stage B and D (dequantize and store) are expected to be on the critical path
+        // HMX compute (C) runs on dedicated worker thread, overlapping with HVX stages (B, D).
 
         // A --> B: vtcm_qweight, 1 buffer
         // B --> C: vtcm_weight0/vtcm_weight1, 2 buffers
         // C --> D: vtcm_output0/vtcm_output1, 2 buffers
 
-        //
-        // LD ||A3|  | B3 ||
-        // MM ||    C2    ||
-        // ST || D1 |     ||
+        // Async timeline (C overlaps B+D):
+        //   main+HVX:   [A0][Act][B0][A1][sub C0][B1‖C0][A2][wait,sub C1][D0+B2‖C1][wait,sub C2][D1‖C2][wait][D2]
+        //   HMX worker:                   [████ C0 ████████][████ C1 ████████████][████ C2 ████████]
 
         int n_chunk_cnt = hmx_ceil_div(n, n_chunk_n_cols);
+        hmx_matmul_job_t job_slots[2];  // persistent double-buffered job descriptors
+
+        hmx_worker_begin(ctx->hmx_worker);
         for (size_t mr = 0; mr < m; mr += m_chunk_n_rows) {
             const size_t n_rows = hex_smin(m - mr, m_chunk_n_rows);
 
@@ -1352,31 +1388,33 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                 transfer_activation_chunk_threaded(ctx, vtcm_activation, activation_chunk, n_rows, k, k);
             }
 
-            // prologue: B0, A1, C0, B1
+            // prologue: B0, A1, submit C0 (async), B1 (overlaps C0)
             {
-                // B0
+                // B0: wait for DMA, dequant weight chunk 0
                 dma_queue_pop(ctx->dma[0]);
                 dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[0], vtcm_qweight, n_cols_A0, k, row_stride, weight_type);
 
-                // A1
+                // A1: issue DMA for weight chunk 1
                 const size_t n_cols_A1 = hex_smin(n - 1 * n_chunk_n_cols, n_chunk_n_cols);
                 if (1 < n_chunk_cnt) {
                     const uint8_t *qweight_chunk_A1 = permuted_weight + n_chunk_n_cols * row_stride;
                     dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_A1), row_stride, row_stride, row_stride, n_cols_A1);
                 }
 
-                // C0
-                core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
-                         hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                // submit C0 (non-blocking — HMX worker executes in parallel)
+                hmx_matmul_job_init(&job_slots[0],
+                    (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
+                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[0]);
 
-                // B1
+                // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
                 if (1 < n_chunk_cnt) {
                     dma_queue_pop(ctx->dma[0]);
                     dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[1], vtcm_qweight, n_cols_A1, k, row_stride, weight_type);
                 }
             }
 
-            // main loop
+            // main loop: wait C_i → submit C_{i+1} → D_i + B_{i+2} (parallel with C_{i+1})
             for (int i = 0; i < n_chunk_cnt; ++i) {
                 const size_t nc    = i * n_chunk_n_cols;
                 const size_t nc_p1 = nc + 1 * n_chunk_n_cols;
@@ -1386,36 +1424,41 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                 const size_t n_cols_p1 = hex_smin(n - nc_p1, n_chunk_n_cols);
                 const size_t n_cols_p2 = hex_smin(n - nc_p2, n_chunk_n_cols);
 
-                // issue A_{i+2}
+                // issue A_{i+2}: DMA push (non-blocking)
                 if (i + 2 < n_chunk_cnt) {
                     const uint8_t *qweight_chunk_p2 = permuted_weight + nc_p2 * row_stride;
                     dma_queue_push(ctx->dma[0], dma_make_ptr(vtcm_qweight, qweight_chunk_p2), row_stride, row_stride, row_stride, n_cols_p2);
                 }
 
-                // wait for HMX (C_{i}) -- C_{i} is done
-
-                // result of B_{i+1} (input of C_{i+1}) should be ready now
+                // wait C_i: block until prologue/previous C completes
+                hmx_worker_wait(ctx->hmx_worker);
 
-                // issue C_{i+1}
+                // submit C_{i+1} (non-blocking, overlaps with D_i + B_{i+2} below)
+                // job_slots[(i+1)%2] is safe to overwrite: the wait above returned, so no
+                // job is in flight, and this slot was last used by C_{i-1}, which had
+                // completed before C_i was submitted.
                 if (i + 1 < n_chunk_cnt) {
-                    core_dot_chunk_fp16((__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
+                    hmx_matmul_job_init(&job_slots[(i + 1) % 2],
+                        (__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation,
+                        (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
                         hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                    hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]);
                 }
 
-                // compute D_{i}
+                // D_i: store output (multi-thread HVX, parallel with C_{i+1})
                 float *output_chunk = dst + (mr * n + nc);
                 transfer_output_chunk_threaded(ctx, output_chunk, vtcm_output_bufs[i % 2], n_rows, n_cols, n);
 
-                // wait for DMA (A_{i+2}), compute B_{i+2}
+                // B_{i+2}: DMA pop + dequant (multi-thread HVX, parallel with C_{i+1})
                 if (i + 2 < n_chunk_cnt) {
                     dma_queue_pop(ctx->dma[0]);
                     dequantize_x4x2_weight_chunk_to_fp16_tiles(ctx, vtcm_weight_bufs[(i + 2) % 2], vtcm_qweight, n_cols_p2, k, row_stride, weight_type);
                 }
             }
         }
-    }
 
-    HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
+        hmx_worker_end(ctx->hmx_worker);
+    }
 
     TIMER_STOP(total);
 
diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.c b/ggml/src/ggml-hexagon/htp/hmx-worker.c
new file mode 100644
index 0000000000..657d16b2a9
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp/hmx-worker.c
@@ -0,0 +1,193 @@
+#include "hmx-worker.h"
+
+#include <qurt.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <HAP_compute_res.h>
+#include <HAP_farf.h>
+
+// ---------------------------------------------------------------------------
+// Internal types
+// ---------------------------------------------------------------------------
+
+enum hmx_worker_cmd {
+    HMX_WORKER_CMD_BEGIN,  // acquire HMX lock
+    HMX_WORKER_CMD_JOB,   // execute fn(data)
+    HMX_WORKER_CMD_END,   // release HMX lock
+    HMX_WORKER_CMD_KILL,  // exit thread
+};
+
+struct hmx_worker_context {
+    // Command channel: main thread → worker
+    atomic_uint          cmd_seqn;   // bumped by main thread for each command
+    enum hmx_worker_cmd  cmd_type;
+    hmx_worker_fn_t      fn;
+    void                *data;
+
+    // Completion channel: worker → main thread
+    atomic_uint          done_seqn;  // set to cmd_seqn when command completes
+
+    // Configuration
+    uint32_t             vtcm_rctx;
+
+    // Thread resources
+    qurt_thread_t        thread;
+    void                *stack;      // single allocation: stack + context
+};
+
+// ---------------------------------------------------------------------------
+// Worker thread entry point
+// ---------------------------------------------------------------------------
+
+static void hmx_worker_main(void *arg) {
+    struct hmx_worker_context *ctx = (struct hmx_worker_context *) arg;
+
+    FARF(HIGH, "hmx-worker: thread started");
+
+    unsigned int prev_seqn = 0;
+    for (;;) {
+        unsigned int seqn = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire);
+        if (seqn == prev_seqn) {
+            qurt_futex_wait(&ctx->cmd_seqn, prev_seqn);
+            continue;
+        }
+        prev_seqn = seqn;
+
+        switch (ctx->cmd_type) {
+        case HMX_WORKER_CMD_BEGIN:
+            HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
+            break;
+
+        case HMX_WORKER_CMD_JOB:
+            ctx->fn(ctx->data);
+            break;
+
+        case HMX_WORKER_CMD_END:
+            HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
+            break;
+
+        case HMX_WORKER_CMD_KILL:
+            atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
+            qurt_futex_wake(&ctx->done_seqn, 1);
+            FARF(HIGH, "hmx-worker: thread stopped");
+            return;
+        }
+
+        atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
+        qurt_futex_wake(&ctx->done_seqn, 1);
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Internal helpers
+// ---------------------------------------------------------------------------
+
+// Issue a command to the worker (non-blocking).
+static void hmx_worker_issue(struct hmx_worker_context *ctx,
+                             enum hmx_worker_cmd type,
+                             hmx_worker_fn_t fn, void *data) {
+    ctx->cmd_type = type;
+    ctx->fn       = fn;
+    ctx->data     = data;
+    atomic_fetch_add_explicit(&ctx->cmd_seqn, 1, memory_order_release);
+    qurt_futex_wake(&ctx->cmd_seqn, 1);
+}
+
+// Block until the worker has completed the most recently issued command (done_seqn == cmd_seqn).
+static void hmx_worker_drain(struct hmx_worker_context *ctx) {
+    const unsigned int expected = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire);
+    unsigned int done;  // snapshot: the value compared must be the value waited on, or a wake can be lost
+    while ((done = atomic_load_explicit(&ctx->done_seqn, memory_order_acquire)) != expected) {
+        qurt_futex_wait(&ctx->done_seqn, done);  // sleeps only while done_seqn still equals `done`
+    }
+}
+
+// ---------------------------------------------------------------------------
+// Public API
+// ---------------------------------------------------------------------------
+
+#define LOWEST_USABLE_QURT_PRIO (254)
+
+AEEResult hmx_worker_init(hmx_worker_context_t *out, uint32_t stack_size, uint32_t vtcm_rctx) {
+    if (!out) {
+        return AEE_EBADPARM;
+    }
+
+    // Single allocation: stack followed by context struct.
+    size_t total = stack_size + sizeof(struct hmx_worker_context);
+    unsigned char *blob = (unsigned char *) malloc(total);
+    if (!blob) {
+        FARF(ERROR, "hmx-worker: allocation failed (%zu bytes)", total);
+        return AEE_ENOMEMORY;
+    }
+    memset(blob, 0, total);
+
+    struct hmx_worker_context *ctx = (struct hmx_worker_context *) (blob + stack_size);
+    ctx->stack     = blob;
+    ctx->vtcm_rctx = vtcm_rctx;
+    atomic_init(&ctx->cmd_seqn,  0);
+    atomic_init(&ctx->done_seqn, 0);
+
+    // Match caller thread priority (same pattern as worker-pool.c).
+    int prio = qurt_thread_get_priority(qurt_thread_get_id());
+    if (prio < 1)                       prio = 1;
+    if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+
+    qurt_thread_attr_t attr;
+    qurt_thread_attr_init(&attr);
+    qurt_thread_attr_set_stack_addr(&attr, blob);
+    qurt_thread_attr_set_stack_size(&attr, stack_size);
+    qurt_thread_attr_set_priority(&attr, prio);
+    qurt_thread_attr_set_name(&attr, "hmx_worker");
+
+    int err = qurt_thread_create(&ctx->thread, &attr, hmx_worker_main, ctx);
+    if (err) {
+        FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
+        free(blob);
+        return AEE_EQURTTHREADCREATE;
+    }
+
+    *out = ctx;
+    return AEE_SUCCESS;
+}
+
+void hmx_worker_release(hmx_worker_context_t ctx) {
+    if (!ctx) return;
+
+    // Tell the worker to exit.
+    hmx_worker_issue(ctx, HMX_WORKER_CMD_KILL, NULL, NULL);
+    hmx_worker_drain(ctx);
+
+    int status;
+    qurt_thread_join(ctx->thread, &status);
+
+    free(ctx->stack);
+}
+
+AEEResult hmx_worker_begin(hmx_worker_context_t ctx) {
+    hmx_worker_issue(ctx, HMX_WORKER_CMD_BEGIN, NULL, NULL);
+    hmx_worker_drain(ctx);  // wait until HMX lock is acquired
+    return AEE_SUCCESS;
+}
+
+AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data) {
+    // Caller is expected to have called wait() for any previous job.
+    // Safety: drain any residual (should be instant in normal flow).
+    hmx_worker_drain(ctx);
+    hmx_worker_issue(ctx, HMX_WORKER_CMD_JOB, fn, data);
+    return AEE_SUCCESS;
+}
+
+AEEResult hmx_worker_wait(hmx_worker_context_t ctx) {
+    hmx_worker_drain(ctx);
+    return AEE_SUCCESS;
+}
+
+AEEResult hmx_worker_end(hmx_worker_context_t ctx) {
+    hmx_worker_drain(ctx);  // ensure no in-flight job
+    hmx_worker_issue(ctx, HMX_WORKER_CMD_END, NULL, NULL);
+    hmx_worker_drain(ctx);  // wait until HMX lock is released
+    return AEE_SUCCESS;
+}
diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.h b/ggml/src/ggml-hexagon/htp/hmx-worker.h
new file mode 100644
index 0000000000..9c0477b974
--- /dev/null
+++ b/ggml/src/ggml-hexagon/htp/hmx-worker.h
@@ -0,0 +1,54 @@
+#ifndef HMX_WORKER_H
+#define HMX_WORKER_H
+
+// Async HMX worker: single dedicated thread for HMX compute,
+// allowing the main thread to run HVX/DMA work in parallel.
+//
+// Lifecycle per matmul op:
+//   hmx_worker_begin  — worker thread acquires HMX lock
+//   hmx_worker_submit — fire a job (non-blocking)
+//   hmx_worker_wait   — block until current job completes
+//   ...               — repeat submit/wait as needed
+//   hmx_worker_end    — worker thread releases HMX lock
+//
+// Design: single-producer single-consumer, 1 in-flight job max.
+
+#include <AEEStdDef.h>
+#include <AEEStdErr.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef void (*hmx_worker_fn_t)(void *data);
+
+typedef struct hmx_worker_context *hmx_worker_context_t;
+
+// Create worker thread.  Thread starts idle (no HMX lock held).
+AEEResult hmx_worker_init(hmx_worker_context_t *ctx, uint32_t stack_size, uint32_t vtcm_rctx);
+
+// Destroy worker thread.  Must not be called while a job is in-flight.
+void hmx_worker_release(hmx_worker_context_t ctx);
+
+// Worker thread acquires HMX lock.  Blocks until lock is held.
+AEEResult hmx_worker_begin(hmx_worker_context_t ctx);
+
+// Submit a job (non-blocking).  Caller must have called wait() for any
+// previous job before submitting a new one.
+// |data| must remain valid until the corresponding wait() returns.
+AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data);
+
+// Block until the current in-flight job completes.
+// Returns immediately if no job is in-flight.
+AEEResult hmx_worker_wait(hmx_worker_context_t ctx);
+
+// Ensure no in-flight job, then worker thread releases HMX lock.
+// Blocks until unlock is complete.
+AEEResult hmx_worker_end(hmx_worker_context_t ctx);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* HMX_WORKER_H */
diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h
index 4c36a6ea0c..7b09bb4f41 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -3,6 +3,7 @@
 
 #include "hex-dma.h"
 #include "htp-ops.h"
+#include "hmx-worker.h"
 #include "worker-pool.h"
 
 #include <assert.h>
@@ -72,6 +73,10 @@ struct htp_context {
     atomic_bool            vtcm_needs_release;
 
     struct htp_ops_context octx;
+
+#ifdef HTP_HAS_HMX
+    hmx_worker_context_t   hmx_worker; // Async HMX worker for pipeline overlap
+#endif
 };
 
 int op_matmul(struct htp_ops_context * octx);
diff --git a/ggml/src/ggml-hexagon/htp/main.c b/ggml/src/ggml-hexagon/htp/main.c
index 8b34703942..b6ca127c1d 100644
--- a/ggml/src/ggml-hexagon/htp/main.c
+++ b/ggml/src/ggml-hexagon/htp/main.c
@@ -324,6 +324,14 @@ AEEResult htp_iface_start(remote_handle64 handle, uint32 sess_id, uint64 dsp_que
 
 #ifdef HTP_HAS_HMX
     ctx->hmx_enabled = use_hmx;
+    ctx->hmx_worker  = NULL;
+    if (use_hmx) {
+        AEEResult hmx_worker_err = hmx_worker_init(&ctx->hmx_worker, 8192, ctx->vtcm_rctx);
+        if (hmx_worker_err != AEE_SUCCESS) {
+            FARF(ERROR, "hmx_worker_init failed: %d", hmx_worker_err);
+            return hmx_worker_err;
+        }
+    }
     FARF(HIGH, "HMX %s (use_hmx=%d)", ctx->hmx_enabled ? "enabled" : "disabled", use_hmx);
 #endif
 
@@ -389,7 +397,13 @@ AEEResult htp_iface_stop(remote_handle64 handle) {
     }
 
 #ifdef HTP_HAS_HMX
-    ctx->hmx_enabled = 0;
+    if (ctx->hmx_enabled) {
+        if (ctx->hmx_worker) {
+            hmx_worker_release(ctx->hmx_worker);
+            ctx->hmx_worker = NULL;
+        }
+        ctx->hmx_enabled = 0;
+    }
 #endif
 
     vtcm_free(ctx);

From d7a8634de6a58c0f383b33727e3808fdcfbe4bb6 Mon Sep 17 00:00:00 2001
From: njsyw1997 <44545837+njsyw1997@users.noreply.github.com>
Date: Tue, 7 Apr 2026 02:01:34 -0700
Subject: [PATCH 2/6] hexagon: cost-based VTCM chunk search for out-stationary
 matmul

---
 ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 205 ++++++++++++++-------
 ggml/src/ggml-hexagon/htp/hmx-worker.c     | 105 ++++++-----
 ggml/src/ggml-hexagon/htp/hmx-worker.h     |   8 +-
 3 files changed, 193 insertions(+), 125 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index 288a4caa6a..92526273bc 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -16,15 +16,16 @@
 #include "ggml-common.h"
 
 #include "hex-dma.h"
+#include "worker-pool.h"
+
 #include "hvx-utils.h"
 #include "hvx-dump.h"
-#include "worker-pool.h"
 #include "htp-ctx.h"
 #include "htp-ops.h"
 
-#include "hmx-worker.h"
-#include "hmx-utils.h"
 #include "hmx-ops.h"
+#include "hmx-utils.h"
+#include "hmx-worker.h"
 #include "hmx-profile.h"
 
 static const __fp16 q4_0_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
@@ -110,36 +111,45 @@ static inline bool hmx_add_overflow(size_t a, size_t b, size_t *out) {
     return false;
 }
 
-// Search for optimal (mc, nc) chunk sizes that maximize mc * nc within VTCM budget.
+// Search for optimal (mc, nc) chunk sizes within VTCM budget.
+//
+// VTCM model: nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
+//
+// Minimize ceil(m/mc) * m_block_cost + ceil(n/nc) * n_block_cost.
+// All matmul paths repeat weight processing per M-block and activation loading
+// per N-block, so discrete block counts drive total overhead.
+// Tie-break: when cost is equal, prefer larger mc * nc.
 //
-// Cost model: total = nc * per_n_cost + mc * per_m_cost + mc * nc * per_mn_cost + overhead
-//   per_n_cost:  bytes per nc column (weight + scratch buffers)
-//   per_m_cost:  bytes per mc row (activation)
-//   per_mn_cost: bytes per mc*nc element (output)
-//   overhead:    fixed bytes (scales 256B, eye_tile 2048B, etc.)
+// Caller-provided coefficients:
+//   m_block_cost: penalty per extra M-block (weight redundancy, scales with n).
+//   n_block_cost: penalty per extra N-block (activation redundancy, scales with m).
 //
 // Algorithm: nc sweeps from n_max down by 32, analytically solving for mc_max.
 // Returns 0 on success, -1 if VTCM is insufficient.
-static int hmx_compute_chunks(
-    size_t vtcm_total, size_t overhead,
-    size_t per_n_cost, size_t per_m_cost, size_t per_mn_cost,
-    int m, int n,
-    size_t *m_chunk_out, size_t *n_chunk_out,
-    size_t *total_out)
-{
+static int hmx_compute_chunks(size_t   vtcm_total,
+                              size_t   overhead,
+                              size_t   per_n_cost,
+                              size_t   per_m_cost,
+                              size_t   per_mn_cost,
+                              int      m,
+                              int      n,
+                              size_t   m_block_cost,
+                              size_t   n_block_cost,
+                              size_t * m_chunk_out,
+                              size_t * n_chunk_out,
+                              size_t * total_out) {
     if (m <= 0 || n <= 0) return -1;
     if (vtcm_total <= overhead) return -1;
     if (per_n_cost == 0 || per_m_cost == 0 || per_mn_cost == 0) return -1;
 
     const size_t usable = vtcm_total - overhead;
-    size_t best_mn = 0, best_m = 0, best_n = 0;
+
+    size_t best_cost = SIZE_MAX;
+    size_t best_mn   = 0;
+    size_t best_m = 0, best_n = 0;
 
     const size_t n_max = hex_align_down((size_t)n, HMX_FP16_TILE_N_COLS);
     for (size_t nc = n_max; nc >= HMX_FP16_TILE_N_COLS; nc -= HMX_FP16_TILE_N_COLS) {
-        // Early exit: if nc * m_max cannot beat best, smaller nc won't either
-        if (nc * hex_align_down((size_t)m, HMX_FP16_TILE_N_ROWS) <= best_mn)
-            break;
-
         size_t n_fixed = 0, ncmn = 0, mc_denom = 0;
         if (hmx_mul_overflow(nc, per_n_cost, &n_fixed)) continue;
         if (n_fixed >= usable) goto next_nc;
@@ -153,10 +163,19 @@ static int hmx_compute_chunks(
             mc = hex_align_down(mc, HMX_FP16_TILE_N_ROWS);
             mc = hex_smin(mc, (size_t)m);
 
-            if (mc > 0 && mc * nc > best_mn) {
-                best_mn = mc * nc;
-                best_m  = mc;
-                best_n  = nc;
+            if (mc == 0) {
+                goto next_nc;
+            }
+
+            size_t mblocks = ((size_t) m + mc - 1) / mc;
+            size_t nblocks = ((size_t) n + nc - 1) / nc;
+            size_t cost    = mblocks * m_block_cost + nblocks * n_block_cost;
+            size_t mn      = mc * nc;
+            if (cost < best_cost || (cost == best_cost && mn > best_mn)) {
+                best_cost = cost;
+                best_mn   = mn;
+                best_m    = mc;
+                best_n    = nc;
             }
         }
 
@@ -679,25 +698,29 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const
 // --- Async HMX matmul job (for pipeline overlap) ---
 
 typedef struct {
-    __fp16       *output;
-    const __fp16 *activation;
-    const __fp16 *weight;
-    const __fp16 *scales;
-    int           n_row_tiles;
-    int           n_col_tiles;
-    int           n_dot_tiles;
+    __fp16 *       output;
+    const __fp16 * activation;
+    const __fp16 * weight;
+    const __fp16 * scales;
+    int            n_row_tiles;
+    int            n_col_tiles;
+    int            n_dot_tiles;
 } hmx_matmul_job_t;
 
-static void hmx_matmul_worker_fn(void *data) {
-    hmx_matmul_job_t *job = (hmx_matmul_job_t *) data;
-    core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales,
-                        job->n_row_tiles, job->n_col_tiles, job->n_dot_tiles);
+static void hmx_matmul_worker_fn(void * data) {
+    hmx_matmul_job_t * job = (hmx_matmul_job_t *) data;
+    core_dot_chunk_fp16(job->output, job->activation, job->weight, job->scales, job->n_row_tiles, job->n_col_tiles,
+                        job->n_dot_tiles);
 }
 
-static inline void hmx_matmul_job_init(
-    hmx_matmul_job_t *job,
-    __fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
-    int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
+static inline void hmx_matmul_job_init(hmx_matmul_job_t * job,
+                                       __fp16 *           output,
+                                       const __fp16 *     activation,
+                                       const __fp16 *     weight,
+                                       const __fp16 *     scales,
+                                       int                n_row_tiles,
+                                       int                n_col_tiles,
+                                       int                n_dot_tiles) {
     job->output      = output;
     job->activation  = activation;
     job->weight      = weight;
@@ -866,12 +889,13 @@ int hmx_mat_mul_permuted_w16a32_batched(struct htp_context *ctx, const hmx_matmu
     const size_t f32_scratch_per_m = use_dma_activation ? (size_t) params->k * sizeof(float) : 0;
 
     size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
+    // FP16 weight: interleave and activation load have similar per-element cost.
     if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
-                             /*per_n=*/3 * vec_dot_size,
-                             /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
-                             /*per_mn=*/sizeof(__fp16),
-                             params->m, params->n,
-                             &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
+                           /*per_n=*/3 * vec_dot_size,
+                           /*per_m=*/group_size * vec_dot_size + f32_scratch_per_m,
+                           /*per_mn=*/sizeof(__fp16), params->m, params->n,
+                           /*m_block_cost=*/(size_t) params->n,
+                           /*n_block_cost=*/(size_t) params->m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: grouped path does not fit VTCM, falling back to legacy batched loop", __func__);
         return hmx_mat_mul_permuted_w16a32_batched_legacy(ctx, params);
     }
@@ -1040,13 +1064,15 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
     const size_t f32_scratch_per_m = use_dma_activation ? (size_t) k * sizeof(float) : 0;
 
     size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
+    // FP16 weight: interleave and activation load have similar per-element cost.
     if (hmx_compute_chunks(vtcm_budget,
-                              /*overhead=*/ 256,
-                              /*per_n=*/    3 * vec_dot_size,  // W + S0 + S1
-                              /*per_m=*/    vec_dot_size + f32_scratch_per_m,  // A + optional F32 scratch
-                              /*per_mn=*/   sizeof(__fp16),     // O
-                              m, n,
-                              &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
+                           /*overhead=*/256,
+                           /*per_n=*/3 * vec_dot_size,                  // W + S0 + S1
+                           /*per_m=*/vec_dot_size + f32_scratch_per_m,  // A + optional F32 scratch
+                           /*per_mn=*/sizeof(__fp16),                   // O
+                           m, n,
+                           /*m_block_cost=*/(size_t) n,
+                           /*n_block_cost=*/(size_t) m, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
         return -1;
     }
@@ -1191,6 +1217,8 @@ int hmx_mat_mul_permuted_w16a32(struct htp_context *ctx, float *restrict dst, co
 int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict out, const float *restrict x, const uint8_t *restrict w, int m,
                                        int k, int n, int w_type);
 
+#define FALLBACK_TO_STANDARD 1
+
 int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict dst, const float *restrict activation,
                                      const uint8_t *restrict permuted_weight, int m, int k, int n,
                                      int weight_type) {
@@ -1203,9 +1231,12 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
 
     // for large m, k (e.g. prefill FFN Down), use out-stationary version
     if (m >= 128 && k > n && n > 1024) {
-        FARF(MEDIUM, "hmx_matmul_qk: OUT-STATIONARY path m=%d k=%d n=%d type=%d (K_BLOCK=512, %d K-iters with fp16 intermediate)",
-             m, k, n, weight_type, (k + 511) / 512);
-        return mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
+        int rc = mat_mul_qk_0_d16a32_out_stationary(ctx, dst, activation, permuted_weight, m, k, n, weight_type);
+        if (rc != FALLBACK_TO_STANDARD) {
+            return rc;  // 0 success, -1 error
+        }
+        FARF(MEDIUM, "hmx_matmul_qk: out-stationary fallback to standard m=%d k=%d n=%d", m, k, n);
+        // fall through to standard path
     }
 
     size_t row_stride = get_x4x2_row_stride(weight_type, k);
@@ -1231,9 +1262,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
     }
 
     size_t m_chunk_n_rows = 0, n_chunk_n_cols = 0, vtcm_used = 0;
-    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256,
-                              per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost,
-                              m, n, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
+    // Quantized weight: dequant ~1.5x more expensive per element than activation load.
+    if (hmx_compute_chunks(vtcm_budget, /*overhead=*/256, per_n_cost, /*per_m=*/vec_dot_size, per_mn_cost, m, n,
+                           /*m_block_cost=*/(size_t) n * 3,
+                           /*n_block_cost=*/(size_t) m * 2, &m_chunk_n_rows, &n_chunk_n_cols, &vtcm_used) != 0) {
         FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d pipe=%d budget=%zu)",
              __func__, m, k, n, use_pipeline, vtcm_budget);
         return -1;
@@ -1402,9 +1434,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                 }
 
                 // submit C0 (non-blocking — HMX worker executes in parallel)
-                hmx_matmul_job_init(&job_slots[0],
-                    (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
-                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                hmx_matmul_job_init(&job_slots[0], (__fp16 *) vtcm_output_bufs[0], (__fp16 *) vtcm_activation,
+                                    (__fp16 *) vtcm_weight_bufs[0], vtcm_scales,
+                                    hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                    hmx_ceil_div(n_cols_A0, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                 hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[0]);
 
                 // B1: DMA pop + dequant (runs in parallel with C0 on HMX worker)
@@ -1438,10 +1471,10 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
                 // counterpart — and (i+1)%2 was last used by C_{i-1} which completed
                 // before C_i was submitted.
                 if (i + 1 < n_chunk_cnt) {
-                    hmx_matmul_job_init(&job_slots[(i + 1) % 2],
-                        (__fp16 *) vtcm_output_bufs[(i + 1) % 2], (__fp16 *) vtcm_activation,
-                        (__fp16 *) vtcm_weight_bufs[(i + 1) % 2], vtcm_scales,
-                        hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS), hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
+                    hmx_matmul_job_init(&job_slots[(i + 1) % 2], (__fp16 *) vtcm_output_bufs[(i + 1) % 2],
+                                        (__fp16 *) vtcm_activation, (__fp16 *) vtcm_weight_bufs[(i + 1) % 2],
+                                        vtcm_scales, hmx_ceil_div(n_rows, HMX_FP16_TILE_N_ROWS),
+                                        hmx_ceil_div(n_cols_p1, HMX_FP16_TILE_N_COLS), k / HMX_FP16_TILE_N_ROWS);
                     hmx_worker_submit(ctx->hmx_worker, hmx_matmul_worker_fn, &job_slots[(i + 1) % 2]);
                 }
 
@@ -1583,12 +1616,41 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
 
     const size_t vtcm_budget = ctx->vtcm_size;
 
-    const size_t M_BLOCK_SIZE = 512;
-    const size_t N_BLOCK_SIZE = 512;
-    const size_t K_BLOCK_SIZE = 512;
+    const size_t K_BLOCK_SIZE = 1024;
+
+    // Fallback: if k doesn't need K-blocking, out-stationary has no advantage
+    const size_t k_iters_check = (k + K_BLOCK_SIZE - 1) / K_BLOCK_SIZE;
+    if (k_iters_check <= 1) {
+        FARF(MEDIUM, "%s: K_BLK=%zu >= k=%d, fallback to standard path", __func__, K_BLOCK_SIZE, k);
+        return FALLBACK_TO_STANDARD;
+    }
 
-    // Compute precise buffer sizes
+    // Dynamic M,N search via hmx_compute_chunks
     const size_t sub_row_stride_alloc = get_x4x2_row_stride(weight_type, K_BLOCK_SIZE);
+    const size_t per_m                = K_BLOCK_SIZE * sizeof(float)  // scratch1: M×K×4 (act DMA staging F32)
+                         + K_BLOCK_SIZE * sizeof(__fp16);             // activation: M×K×2 (F16 tiles)
+    const size_t per_n = sub_row_stride_alloc                         // scratch0: N×sub_row(K) (packed quant)
+                         + K_BLOCK_SIZE * sizeof(__fp16);             // weight: N×K×2 (F16 tiles)
+    const size_t per_mn       = sizeof(__fp16);                       // output: M×N×2 (out-stationary)
+    // Alignment margin: hex_align_up can add up to 2047 bytes per buffer;
+    // scratch1 (mc×6144) is naturally 2048-aligned, remaining 4 buffers need margin
+    const size_t align_margin = 4 * HMX_FP16_TILE_SIZE;
+    const size_t overhead     = HMX_FP16_TILE_SIZE + 256 + align_margin;  // eye_tile + scales + alignment
+
+    size_t       M_BLOCK_SIZE, N_BLOCK_SIZE, vtcm_used;
+    // Cost-based search: minimize ceil(m/mc)*m_block_cost + ceil(n/nc)*n_block_cost.
+    // From profiling: wt_dequant per element ≈ 1.5× activation load per element.
+    // m_block_cost = n*3: each extra M-block re-dequants all N×K weight (expensive).
+    // n_block_cost = m*2: each extra N-block re-loads all M×K activation (cheaper).
+    const size_t m_block_cost = (size_t) n * 3;
+    const size_t n_block_cost = (size_t) m * 2;
+    if (hmx_compute_chunks(vtcm_budget, overhead, per_n, per_m, per_mn, m, n, m_block_cost, n_block_cost, &M_BLOCK_SIZE,
+                           &N_BLOCK_SIZE, &vtcm_used) != 0) {
+        FARF(HIGH, "%s: VTCM too small (m=%d k=%d n=%d budget=%zu)", __func__, m, k, n, vtcm_budget);
+        return -1;
+    }
+
+    // Compute precise buffer sizes from searched M,N and fixed K
     const size_t weight_size  = hex_align_up(N_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
     const size_t act_size     = hex_align_up(M_BLOCK_SIZE * K_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
     const size_t out_size     = hex_align_up(M_BLOCK_SIZE * N_BLOCK_SIZE * sizeof(__fp16), HMX_FP16_TILE_SIZE);
@@ -1597,7 +1659,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
 
     const size_t total_vtcm = weight_size + act_size + out_size + scratch0_sz + scratch1_sz + HMX_FP16_TILE_SIZE + 256;
     if (total_vtcm > vtcm_budget) {
-        FARF(HIGH, "%s: VTCM too small: need %zu have %zu (m=%d k=%d n=%d)", __func__, total_vtcm, vtcm_budget, m, k, n);
+        FARF(HIGH, "%s: VTCM overflow after search: need %zu have %zu (M=%zu N=%zu K=%zu)", __func__, total_vtcm,
+             vtcm_budget, M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE);
         return -1;
     }
 
@@ -1611,8 +1674,8 @@ int mat_mul_qk_0_d16a32_out_stationary(struct htp_context *ctx, float *restrict
     __fp16  *vtcm_scales     = (__fp16 *) vtcm_seq_alloc(&vtcm_ptr, 256);
     assert((size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base) <= vtcm_budget);
 
-    FARF(MEDIUM, "%s: m=%d k=%d n=%d wtype=%d vtcm=%zu/%zu", __func__, m, k, n, weight_type,
-         (size_t)(vtcm_ptr - (uint8_t *)ctx->vtcm_base), vtcm_budget);
+    FARF(HIGH, "hmx-mm: m=%d k=%d n=%d wtype=%d block M=%zu N=%zu K=%zu vtcm=%zu/%zu", m, k, n, weight_type,
+         M_BLOCK_SIZE, N_BLOCK_SIZE, K_BLOCK_SIZE, (size_t) (vtcm_ptr - (uint8_t *) ctx->vtcm_base), vtcm_budget);
 
     // initialize eye tile (32x32 identity matrix)
     {
diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.c b/ggml/src/ggml-hexagon/htp/hmx-worker.c
index 657d16b2a9..6826dde7de 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-worker.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-worker.c
@@ -1,48 +1,47 @@
 #include "hmx-worker.h"
 
+#include <HAP_compute_res.h>
+#include <HAP_farf.h>
 #include <qurt.h>
 #include <stdatomic.h>
 #include <stdlib.h>
 #include <string.h>
 
-#include <HAP_compute_res.h>
-#include <HAP_farf.h>
-
 // ---------------------------------------------------------------------------
 // Internal types
 // ---------------------------------------------------------------------------
 
 enum hmx_worker_cmd {
     HMX_WORKER_CMD_BEGIN,  // acquire HMX lock
-    HMX_WORKER_CMD_JOB,   // execute fn(data)
-    HMX_WORKER_CMD_END,   // release HMX lock
-    HMX_WORKER_CMD_KILL,  // exit thread
+    HMX_WORKER_CMD_JOB,    // execute fn(data)
+    HMX_WORKER_CMD_END,    // release HMX lock
+    HMX_WORKER_CMD_KILL,   // exit thread
 };
 
 struct hmx_worker_context {
     // Command channel: main thread → worker
-    atomic_uint          cmd_seqn;   // bumped by main thread for each command
-    enum hmx_worker_cmd  cmd_type;
-    hmx_worker_fn_t      fn;
-    void                *data;
+    atomic_uint         cmd_seqn;  // bumped by main thread for each command
+    enum hmx_worker_cmd cmd_type;
+    hmx_worker_fn_t     fn;
+    void *              data;
 
     // Completion channel: worker → main thread
-    atomic_uint          done_seqn;  // set to cmd_seqn when command completes
+    atomic_uint done_seqn;  // set to cmd_seqn when command completes
 
     // Configuration
-    uint32_t             vtcm_rctx;
+    uint32_t vtcm_rctx;
 
     // Thread resources
-    qurt_thread_t        thread;
-    void                *stack;      // single allocation: stack + context
+    qurt_thread_t thread;
+    void *        stack;  // single allocation: stack + context
 };
 
 // ---------------------------------------------------------------------------
 // Worker thread entry point
 // ---------------------------------------------------------------------------
 
-static void hmx_worker_main(void *arg) {
-    struct hmx_worker_context *ctx = (struct hmx_worker_context *) arg;
+static void hmx_worker_main(void * arg) {
+    struct hmx_worker_context * ctx = (struct hmx_worker_context *) arg;
 
     FARF(HIGH, "hmx-worker: thread started");
 
@@ -56,23 +55,23 @@ static void hmx_worker_main(void *arg) {
         prev_seqn = seqn;
 
         switch (ctx->cmd_type) {
-        case HMX_WORKER_CMD_BEGIN:
-            HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
-            break;
-
-        case HMX_WORKER_CMD_JOB:
-            ctx->fn(ctx->data);
-            break;
-
-        case HMX_WORKER_CMD_END:
-            HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
-            break;
-
-        case HMX_WORKER_CMD_KILL:
-            atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
-            qurt_futex_wake(&ctx->done_seqn, 1);
-            FARF(HIGH, "hmx-worker: thread stopped");
-            return;
+            case HMX_WORKER_CMD_BEGIN:
+                HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
+                break;
+
+            case HMX_WORKER_CMD_JOB:
+                ctx->fn(ctx->data);
+                break;
+
+            case HMX_WORKER_CMD_END:
+                HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
+                break;
+
+            case HMX_WORKER_CMD_KILL:
+                atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
+                qurt_futex_wake(&ctx->done_seqn, 1);
+                FARF(HIGH, "hmx-worker: thread stopped");
+                return;
         }
 
         atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
@@ -85,9 +84,10 @@ static void hmx_worker_main(void *arg) {
 // ---------------------------------------------------------------------------
 
 // Issue a command to the worker (non-blocking).
-static void hmx_worker_issue(struct hmx_worker_context *ctx,
-                             enum hmx_worker_cmd type,
-                             hmx_worker_fn_t fn, void *data) {
+static void hmx_worker_issue(struct hmx_worker_context * ctx,
+                             enum hmx_worker_cmd         type,
+                             hmx_worker_fn_t             fn,
+                             void *                      data) {
     ctx->cmd_type = type;
     ctx->fn       = fn;
     ctx->data     = data;
@@ -96,11 +96,10 @@ static void hmx_worker_issue(struct hmx_worker_context *ctx,
 }
 
 // Block until the worker has completed the most recently issued command.
-static void hmx_worker_drain(struct hmx_worker_context *ctx) {
+static void hmx_worker_drain(struct hmx_worker_context * ctx) {
     unsigned int expected = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire);
     while (atomic_load_explicit(&ctx->done_seqn, memory_order_acquire) != expected) {
-        qurt_futex_wait(&ctx->done_seqn,
-                        atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed));
+        qurt_futex_wait(&ctx->done_seqn, atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed));
     }
 }
 
@@ -110,30 +109,34 @@ static void hmx_worker_drain(struct hmx_worker_context *ctx) {
 
 #define LOWEST_USABLE_QURT_PRIO (254)
 
-AEEResult hmx_worker_init(hmx_worker_context_t *out, uint32_t stack_size, uint32_t vtcm_rctx) {
+AEEResult hmx_worker_init(hmx_worker_context_t * out, uint32_t stack_size, uint32_t vtcm_rctx) {
     if (!out) {
         return AEE_EBADPARM;
     }
 
     // Single allocation: stack followed by context struct.
-    size_t total = stack_size + sizeof(struct hmx_worker_context);
-    unsigned char *blob = (unsigned char *) malloc(total);
+    size_t          total = stack_size + sizeof(struct hmx_worker_context);
+    unsigned char * blob  = (unsigned char *) malloc(total);
     if (!blob) {
         FARF(ERROR, "hmx-worker: allocation failed (%zu bytes)", total);
         return AEE_ENOMEMORY;
     }
     memset(blob, 0, total);
 
-    struct hmx_worker_context *ctx = (struct hmx_worker_context *) (blob + stack_size);
-    ctx->stack     = blob;
-    ctx->vtcm_rctx = vtcm_rctx;
-    atomic_init(&ctx->cmd_seqn,  0);
+    struct hmx_worker_context * ctx = (struct hmx_worker_context *) (blob + stack_size);
+    ctx->stack                      = blob;
+    ctx->vtcm_rctx                  = vtcm_rctx;
+    atomic_init(&ctx->cmd_seqn, 0);
     atomic_init(&ctx->done_seqn, 0);
 
     // Match caller thread priority (same pattern as worker-pool.c).
     int prio = qurt_thread_get_priority(qurt_thread_get_id());
-    if (prio < 1)                       prio = 1;
-    if (prio > LOWEST_USABLE_QURT_PRIO) prio = LOWEST_USABLE_QURT_PRIO;
+    if (prio < 1) {
+        prio = 1;
+    }
+    if (prio > LOWEST_USABLE_QURT_PRIO) {
+        prio = LOWEST_USABLE_QURT_PRIO;
+    }
 
     qurt_thread_attr_t attr;
     qurt_thread_attr_init(&attr);
@@ -154,7 +157,9 @@ AEEResult hmx_worker_init(hmx_worker_context_t *out, uint32_t stack_size, uint32
 }
 
 void hmx_worker_release(hmx_worker_context_t ctx) {
-    if (!ctx) return;
+    if (!ctx) {
+        return;
+    }
 
     // Tell the worker to exit.
     hmx_worker_issue(ctx, HMX_WORKER_CMD_KILL, NULL, NULL);
@@ -172,7 +177,7 @@ AEEResult hmx_worker_begin(hmx_worker_context_t ctx) {
     return AEE_SUCCESS;
 }
 
-AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data) {
+AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void * data) {
     // Caller is expected to have called wait() for any previous job.
     // Safety: drain any residual (should be instant in normal flow).
     hmx_worker_drain(ctx);
diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.h b/ggml/src/ggml-hexagon/htp/hmx-worker.h
index 9c0477b974..36aec15858 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-worker.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-worker.h
@@ -21,12 +21,12 @@
 extern "C" {
 #endif
 
-typedef void (*hmx_worker_fn_t)(void *data);
+typedef void (*hmx_worker_fn_t)(void * data);
 
-typedef struct hmx_worker_context *hmx_worker_context_t;
+typedef struct hmx_worker_context * hmx_worker_context_t;
 
 // Create worker thread.  Thread starts idle (no HMX lock held).
-AEEResult hmx_worker_init(hmx_worker_context_t *ctx, uint32_t stack_size, uint32_t vtcm_rctx);
+AEEResult hmx_worker_init(hmx_worker_context_t * ctx, uint32_t stack_size, uint32_t vtcm_rctx);
 
 // Destroy worker thread.  Must not be called while a job is in-flight.
 void hmx_worker_release(hmx_worker_context_t ctx);
@@ -37,7 +37,7 @@ AEEResult hmx_worker_begin(hmx_worker_context_t ctx);
 // Submit a job (non-blocking).  Caller must have called wait() for any
 // previous job before submitting a new one.
 // |data| must remain valid until the corresponding wait() returns.
-AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void *data);
+AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void * data);
 
 // Block until the current in-flight job completes.
 // Returns immediately if no job is in-flight.

From 266120dfe668512f812fbe62e8dcf5aa1d712c27 Mon Sep 17 00:00:00 2001
From: njsyw1997 <44545837+njsyw1997@users.noreply.github.com>
Date: Sat, 11 Apr 2026 16:32:52 -0700
Subject: [PATCH 3/6] hexagon: fix futex race in hmx_worker_drain

Store the observed done_seqn value in a local variable and pass that same
value to qurt_futex_wait(), instead of loading the atomic a second time.

---
 ggml/src/ggml-hexagon/htp/hmx-worker.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hmx-worker.c b/ggml/src/ggml-hexagon/htp/hmx-worker.c
index 6826dde7de..013173c7e5 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-worker.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-worker.c
@@ -98,8 +98,15 @@ static void hmx_worker_issue(struct hmx_worker_context * ctx,
 // Block until the worker has completed the most recently issued command.
 static void hmx_worker_drain(struct hmx_worker_context * ctx) {
     unsigned int expected = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire);
-    while (atomic_load_explicit(&ctx->done_seqn, memory_order_acquire) != expected) {
-        qurt_futex_wait(&ctx->done_seqn, atomic_load_explicit(&ctx->done_seqn, memory_order_relaxed));
+    for (;;) {
+        unsigned int seen = atomic_load_explicit(&ctx->done_seqn, memory_order_acquire);
+        if (seen == expected) {
+            return;
+        }
+        // Pass the same observed value to futex_wait().  If the worker completes
+        // between the load above and the futex call, the value mismatch makes the
+        // wait return immediately instead of sleeping forever on the new seqn.
+        qurt_futex_wait(&ctx->done_seqn, seen);
     }
 }
 

From f5833aac101d06a8a45d4dff37267293aad83a9f Mon Sep 17 00:00:00 2001
From: Kim-Chyan Gan <kgan@qti.qualcomm.com>
Date: Fri, 10 Apr 2026 15:07:01 -0700
Subject: [PATCH 4/6] hex-mm: hmx optimize scatter/transpose and use HMX
 intrinsics

---
 ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c | 135 +++++++++++----------
 ggml/src/ggml-hexagon/htp/hmx-utils.h      |  56 ---------
 ggml/src/ggml-hexagon/htp/hvx-base.h       |   5 +
 3 files changed, 77 insertions(+), 119 deletions(-)

diff --git a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
index 92526273bc..fd64a1d0d3 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
+++ b/ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c
@@ -49,7 +49,8 @@ static const __fp16 iq4_nl_to_fp16_lut[64] __attribute__((aligned(VLEN))) = {
 static const int32_t weight_transpose_scatter_offsets[32] __attribute__((aligned(VLEN))) = {
     0*128,  1*128,  2*128,  3*128,  4*128,  5*128,  6*128,  7*128,
     8*128,  9*128, 10*128, 11*128, 12*128, 13*128, 14*128, 15*128,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    16*128, 17*128, 18*128, 19*128, 20*128, 21*128, 22*128, 23*128,
+    24*128, 25*128, 26*128, 27*128, 28*128, 29*128, 30*128, 31*128
 };
 
 // Scales per x4x2 logical block: 8 × sizeof(__fp16) = 16 bytes
@@ -253,7 +254,7 @@ static inline HVX_Vector dequantize_x4x2_q4_0_group_hvx(
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
     HVX_Vector v_scales = hvx_vec_splat_f16(*scale);
     // q4x4x2 stores two int4 values per byte. Keep only the selected nibble.
-    HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
+    HVX_Vector v_quants =  Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
     // Shuffle before LUT
     v_quants = Q6_Vb_vshuff_Vb(v_quants);
@@ -277,7 +278,7 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
     // Load all 128 packed bytes (4 contiguous 32-byte groups)
     HVX_Vector vq = hvx_vmemu(packed_128);
     const HVX_Vector mask_h4 = Q6_Vb_vsplat_R(0x0F);
-    HVX_Vector v_quants = upper_nibbles ? Q6_Vub_vlsr_VubR(vq, 4) : vq;
+    HVX_Vector v_quants = Q6_Vub_vlsr_VubR(vq, 4 * upper_nibbles);
     v_quants = Q6_V_vand_VV(v_quants, mask_h4);
 
     // Shuffle before LUT
@@ -297,10 +298,8 @@ static inline void dequantize_x4x2_q4_0_x4groups_hvx(
     v_hi = Q6_Vhf_equals_Vqf16(Q6_Vqf16_vmpy_VhfVhf(v_hi, v_sc23));
 
     // Extract individual groups: scatter uses q_mask64 so only first 64 bytes matter
-    out[0] = v_lo;                      // group0 already in [0:63]
-    out[1] = Q6_V_vror_VR(v_lo, 64);    // group1 rotated to [0:63]
-    out[2] = v_hi;                      // group2 already in [0:63]
-    out[3] = Q6_V_vror_VR(v_hi, 64);    // group3 rotated to [0:63]
+    out[0] = v_lo; // groups 0-1: group0 in bytes [0:63], group1 in [64:127]
+    out[1] = v_hi; // groups 2-3: group2 in bytes [0:63], group3 in [64:127]
 }
 
 // Dequantize one x4x2 Q8_0 group (32 int8 quants) -> 32 FP16 in first 64 bytes.
@@ -404,8 +403,9 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
         size_t row_stride, int weight_type,
         int start_tile, int end_tile) {
 
-    const int n_k_tiles = k_block / HMX_FP16_TILE_N_COLS;
-    const int qrow_size = (weight_type == HTP_TYPE_Q8_0) ? k_block : (k_block / 2);
+    const int n_k_tiles = (unsigned)k_block / HMX_FP16_TILE_N_COLS;
+    const bool is_q4 = (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL);
+    const int qrow_size = is_q4 ? ((unsigned)k_block / 2) : k_block;
 
     const HVX_Vector vlut_cvt = (weight_type == HTP_TYPE_IQ4_NL) ? hvx_vmem(iq4_nl_to_fp16_lut) :
                                 (weight_type == HTP_TYPE_MXFP4)  ? hvx_vmem(mxfp4_to_fp16_lut) :
@@ -418,47 +418,46 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
     const HVX_Vector v_scat_step = Q6_V_vsplat_R(4);  // 4 bytes = 1 column step
     const HVX_VectorPred q_mask64 = Q6_Q_vsetq_R(64);  // first 16 words (64 bytes)
 
-    for (int t = start_tile; t < end_tile; ) {
-        int ct = t / n_k_tiles;  // column tile index
-        int kt = t % n_k_tiles;  // K tile index
+    unsigned ct = (unsigned)start_tile / n_k_tiles;  // column tile index
+    unsigned kt = (unsigned)start_tile % n_k_tiles;  // K tile index
+    for (unsigned t = start_tile; t < end_tile; ) {
+        if (kt >= n_k_tiles) { kt = 0; ct++; }
 
-        // --- Batch-4 fast path for Q4_0/IQ4_NL: process 4 contiguous K-tiles with one vlut16 per row ---
-        if ((weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) && (kt % 4 == 0) && (t + 4 <= end_tile) &&
-            ((t + 3) / n_k_tiles == ct)) {
-            int blk_idx      = (kt * 32) / QK_Q4_0x4x2;
-            int sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
-            bool upper       = (sub_blk_base >= 4);
-            int packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
-            int scale_off    = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
-                             + sub_blk_base * (int)sizeof(__fp16);   // 4 consecutive scales
+        // --- Batch-4 fast path for Q4: process 4 contiguous K-tiles with one vlut16 per row ---
+        if (is_q4 && (kt % 4 == 0) && (t + 4 <= end_tile) && ((t + 3) / n_k_tiles == ct)) {
+            unsigned blk_idx      = (kt * 32) / QK_Q4_0x4x2;
+            unsigned sub_blk_base = ((kt * 32) % QK_Q4_0x4x2) / 32;  // 0 or 4
+            bool upper            = (sub_blk_base >= 4);
+            unsigned packed_off   = blk_idx * (QK_Q4_0x4x2 / 2);     // 128 contiguous packed bytes
+            unsigned scale_off    = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE
+                                  + sub_blk_base * (int)sizeof(__fp16);   // 4 consecutive scales
 
             __fp16 *tile_bases[4];
-            for (int g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
+            for (unsigned g = 0; g < 4; g++) { tile_bases[g] = vtcm_dst + (t + g) * HMX_FP16_TILE_N_ELMS; }
 
             HVX_Vector v_off = v_scat_base;
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
-                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
-                int row1 = row0 + 1;
-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;
 
-                HVX_Vector v0[4], v1[4];
-                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
-                if (row1 < n_cols) {
-                    dequantize_x4x2_q4_0_x4groups_hvx(r1 + packed_off, upper, (const __fp16 *)(r1 + scale_off), vlut_cvt, v1);
-                } else {
-                    v1[0] = v1[1] = v1[2] = v1[3] = Q6_V_vzero();
-                }
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
 
-                for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v0[g]); }
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                HVX_Vector v0[2];
+                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
-                for (int g = 0; g < 4; g++) { Q6_vscatter_QRMVwV(q_mask64, (size_t)tile_bases[g], HMX_FP16_TILE_SIZE - 1, v_off, v1[g]); }
+
+
+                r0 = vtcm_src + row_offset; row_offset += row_stride;
+                dequantize_x4x2_q4_0_x4groups_hvx(r0 + packed_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt, v0);
+                Q6_vscatter_RMVwV((size_t)tile_bases[0], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[0]);
+                Q6_vscatter_RMVwV((size_t)tile_bases[2], 2 * HMX_FP16_TILE_SIZE - 1, v_off, v0[1]);
                 v_off = Q6_Vw_vadd_VwVw(v_off, v_scat_step);
             }
 
             for (int g = 0; g < 4; g++) { (void) *(volatile HVX_Vector *)(tile_bases[g]); }
-
-            t += 4;
+            t += 4; kt += 4;
             continue;
         }
 
@@ -515,20 +514,19 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
         // --- Single-tile fallback ---
         __fp16 *tile_base = vtcm_dst + t * HMX_FP16_TILE_N_ELMS;
 
-        if (weight_type == HTP_TYPE_Q4_0 || weight_type == HTP_TYPE_IQ4_NL) {
-            int blk_idx  = (kt * 32) / QK_Q4_0x4x2;
-            int sub_blk  = ((kt * 32) % QK_Q4_0x4x2) / 32;
-            bool upper   = (sub_blk >= 4);
-            int byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
-            int scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
+        if (is_q4) {
+            unsigned blk_idx   = (kt * 32) / QK_Q4_0x4x2;
+            unsigned sub_blk   = ((kt * 32) % QK_Q4_0x4x2) / 32;
+            bool upper         = (sub_blk >= 4);
+            unsigned byte_off  = blk_idx * (QK_Q4_0x4x2 / 2) + (upper ? (sub_blk - 4) : sub_blk) * 32;
+            unsigned scale_off = qrow_size + blk_idx * HMX_X4X2_DBLK_SIZE + sub_blk * (int)sizeof(__fp16);
 
             HVX_Vector v_off = v_scat_base;  // reset to column 0
-            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2) {
-                int row0 = ct * HMX_FP16_TILE_N_COLS + r;
-                int row1 = row0 + 1;
-
-                const uint8_t *r0 = vtcm_src + row0 * row_stride;
-                const uint8_t *r1 = vtcm_src + row1 * row_stride;
+            unsigned row_offset = ct * HMX_FP16_TILE_N_COLS * row_stride;
+            unsigned row1 = ct * HMX_FP16_TILE_N_COLS + 1;
+            for (int r = 0; r < HMX_FP16_TILE_N_ROWS; r += 2, row1 += 2) {
+                const uint8_t *r0 = vtcm_src + row_offset; row_offset += row_stride;
+                const uint8_t *r1 = vtcm_src + row_offset; row_offset += row_stride;
 
                 HVX_Vector v0 = dequantize_x4x2_q4_0_group_hvx(
                     r0 + byte_off, upper, (const __fp16 *)(r0 + scale_off), vlut_cvt);
@@ -605,7 +603,7 @@ static void dequantize_x4x2_weight_to_fp16_tiles_task(
             }
             (void) *(volatile HVX_Vector *)(tile_base);
         }
-        ++t;
+        ++t; ++kt;
     }
 
     // Drain HVX scatter write buffer: a vmem load on the same HW thread retires
@@ -673,9 +671,13 @@ static void dequantize_x4x2_weight_chunk_to_fp16_tiles(
 // --- End x4x2 dequantizers ---
 
 // requires external HMX lock
-static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const __fp16 *weight, const __fp16 *scales,
+static void core_dot_chunk_fp16(__fp16 *restrict output, const __fp16 *restrict activation, const __fp16 *restrict weight, const __fp16 *restrict scales,
                                 int n_row_tiles, int n_col_tiles, int n_dot_tiles) {
-    hmx_set_output_scales(scales);
+    __builtin_assume(n_row_tiles > 0);
+    __builtin_assume(n_col_tiles > 0);
+    __builtin_assume(n_dot_tiles > 0);
+
+    Q6_bias_mxmem2_A((void *)scales);
 
     for (int r = 0; r < n_row_tiles; ++r) {
         for (int c = 0; c < n_col_tiles; ++c) {
@@ -685,12 +687,14 @@ static void core_dot_chunk_fp16(__fp16 *output, const __fp16 *activation, const
             const __fp16 *col_tiles = weight + c * n_dot_tiles * HMX_FP16_TILE_N_ELMS;
 
             for (int k = 0; k < n_dot_tiles; ++k) {
-                int offset = k * HMX_FP16_TILE_N_ELMS;
-                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
+                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, 2047);
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, 2047);
+                row_tiles += HMX_FP16_TILE_N_ELMS;
+                col_tiles += HMX_FP16_TILE_N_ELMS;
             }
 
             __fp16 *out_tile = output + (r * n_col_tiles + c) * HMX_FP16_TILE_N_ELMS;
-            hmx_consume_accumulator_fp16(out_tile);
+            Q6_mxmem_AR_after_hf(out_tile, 0);
         }
     }
 }
@@ -1510,10 +1514,13 @@ int hmx_mat_mul_permuted_qk_0_d16a32(struct htp_context *ctx, float *restrict ds
 }
 
 // C += AB
-void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp16 *col_scales, const __fp16 *eye_tile,
+void core_mma_chunk_fp16(__fp16 *restrict c, const __fp16 *restrict a, const __fp16 *restrict b, const __fp16 *restrict col_scales, const __fp16 *restrict eye_tile,
                          int n_row_tiles, int n_col_tiles, int n_dot_tiles, bool zero_init) {
+    __builtin_assume(n_row_tiles > 0);  // optimizer hint: behavior is undefined if a caller passes <= 0
+    __builtin_assume(n_col_tiles > 0);  // same contract for the column-tile count
+    __builtin_assume(n_dot_tiles > 0);  // same contract for the dot-tile count
 
-    hmx_set_output_scales(col_scales);
+    Q6_bias_mxmem2_A((void *)col_scales);  // set HMX output scales (bias = mxmem2(col_scales)); cast drops const for the intrinsic
 
     for (int i = 0; i < n_row_tiles; ++i) {
         for (int j = 0; j < n_col_tiles; ++j) {
@@ -1524,15 +1531,17 @@ void core_mma_chunk_fp16(__fp16 *c, const __fp16 *a, const __fp16 *b, const __fp
 
             __fp16 *accum_tile = c + (i * n_col_tiles + j) * HMX_FP16_TILE_N_ELMS;
             if (!zero_init) {
-                hmx_load_tile_pair_fp16(accum_tile, eye_tile);
+                Q6_activation_hf_mxmem_RR((unsigned int)accum_tile, HMX_FP16_TILE_SIZE - 1);  // presumably preloads C via C * eye_tile -- confirm
+                Q6_weight_hf_mxmem_RR((unsigned int)eye_tile, HMX_FP16_TILE_SIZE - 1);        // Rt = tile bytes - 1: access stays inside one tile
             }
 
             for (int k = 0; k < n_dot_tiles; ++k) {
-                int offset = k * HMX_FP16_TILE_N_ELMS;
-                hmx_load_tile_pair_fp16(row_tiles + offset, col_tiles + offset);
+                Q6_activation_hf_mxmem_RR((unsigned int)row_tiles, HMX_FP16_TILE_SIZE - 1);  // Rt = tile bytes - 1: access stays inside one tile
+                Q6_weight_hf_mxmem_RR((unsigned int)col_tiles, HMX_FP16_TILE_SIZE - 1);      // a larger Rt could cross a 4 MB VTCM bank boundary
+                row_tiles += HMX_FP16_TILE_N_ELMS;  // advance to the next tile of A
+                col_tiles += HMX_FP16_TILE_N_ELMS;  // advance to the next tile of B
             }
-
-            hmx_consume_accumulator_fp16(accum_tile);
+            Q6_mxmem_AR_after_hf(accum_tile, 0);  // combined convert-and-store: mxmem(accum_tile, 0):after.hf = acc
         }
     }
 }
diff --git a/ggml/src/ggml-hexagon/htp/hmx-utils.h b/ggml/src/ggml-hexagon/htp/hmx-utils.h
index aacfbcda28..af04619ceb 100644
--- a/ggml/src/ggml-hexagon/htp/hmx-utils.h
+++ b/ggml/src/ggml-hexagon/htp/hmx-utils.h
@@ -14,10 +14,6 @@
 
 #define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))
 
-static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
-    asm volatile("bias = mxmem2(%0)" :: "r"(scales));
-}
-
 // Initialise aligned 256-byte area with scale vector + zero padding.
 static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
     HVX_Vector *pv = (HVX_Vector *)out_scales;
@@ -25,58 +21,6 @@ static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vecto
     *pv   = Q6_V_vzero();
 }
 
-// Load multiple contiguous tiles with :deep streaming.
-// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
-// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
-// boundary, otherwise the mxmem instruction will raise a precise bus error.
-// Callers must ensure their VTCM layout satisfies this constraint.
-static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
-                                                   const __fp16 *col_tiles,
-                                                   size_t n_tiles) {
-    size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
-    asm volatile(
-        "{ activation.hf = mxmem(%0, %1):deep\n"
-        "weight.hf = mxmem(%2, %3) }\n"
-        :: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
-        : "memory");
-}
-
-// Load a single activation+weight tile pair (no :deep streaming).
-// Rt defines the accessible region [Rs, Rs+Rt].  Following the reference formula
-// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
-// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
-// places a tile near a 4 MB bank boundary, the oversized region crosses it and
-// triggers a precise bus error (0x2601).  Rt=2047 confines accesses to exactly
-// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
-static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
-                                                       const __fp16 *wt_tile) {
-    asm volatile(
-        "{ activation.hf = mxmem(%0, %1)\n"
-        "weight.hf = mxmem(%2, %3) }\n"
-        :: "r"(act_tile), "r"(2047),
-           "r"(wt_tile),  "r"(2047)
-        : "memory");
-}
-
-static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
-    // Use the combined convert-and-store instruction (matches the reference
-    // Q6_mxmem_AR_after_hf intrinsic).  The previous two-instruction sequence
-    // "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
-    asm volatile(
-        "mxmem(%0, %1):after.hf = acc\n"
-        :: "r"(out), "r"(0)
-        : "memory");
-}
-
-// Compute inner product of two vectors of tiles and store result.
-static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out,
-                                            const __fp16 *row_tiles,
-                                            const __fp16 *col_tiles,
-                                            size_t n_tiles) {
-    hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);
-    hmx_consume_accumulator_fp16(out);
-}
-
 // --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---
 
 static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
diff --git a/ggml/src/ggml-hexagon/htp/hvx-base.h b/ggml/src/ggml-hexagon/htp/hvx-base.h
index db05ab40d2..ed6026e762 100644
--- a/ggml/src/ggml-hexagon/htp/hvx-base.h
+++ b/ggml/src/ggml-hexagon/htp/hvx-base.h
@@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
 }
 
 static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
+#if __HVX_ARCH__ >= 81  // newer HVX: convert sf -> qf32 directly
+    HVX_Vector q0 = Q6_Vqf32_equals_Vsf(v0);  // takes the place of the add-with-zero used in the #else branch
+    HVX_Vector q1 = Q6_Vqf32_equals_Vsf(v1);
+#else  // older HVX: obtain qf32 by adding zero
     const HVX_Vector zero = Q6_V_vzero();
     HVX_Vector q0 = Q6_Vqf32_vadd_VsfVsf(v0, zero);
     HVX_Vector q1 = Q6_Vqf32_vadd_VsfVsf(v1, zero);
+#endif
     return Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(q1, q0));
 }
 

From 56ae47942c4bca918a96d678d7f4639b9bff34ed Mon Sep 17 00:00:00 2001
From: Max Krasnyansky <maxk@qti.qualcomm.com>
Date: Sat, 11 Apr 2026 17:52:52 -0700
Subject: [PATCH 5/6] hex-vmem: drop vmem limit a touch under 3GB on v73

---
 ggml/src/ggml-hexagon/htp/htp-ops.h | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/ggml/src/ggml-hexagon/htp/htp-ops.h b/ggml/src/ggml-hexagon/htp/htp-ops.h
index 44a6ab4f73..fa84b674cd 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ops.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ops.h
@@ -91,7 +91,12 @@ enum htp_op_code {
 #define HTP_OP_MAX_BUFS    8
 #define HTP_OP_MAX_REQS    256
 #define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)
+
+#if __HVX_ARCH__ < 75  // NOTE(review): an undefined __HVX_ARCH__ evaluates to 0 and takes this branch -- confirm for any non-HVX include
+#define HTP_OP_MAX_VMEM    (3167538380u)  // ~2.95 GiB, slightly below the 3 GiB (3221225472) used otherwise
+#else  // __HVX_ARCH__ >= 75
 #define HTP_OP_MAX_VMEM    (3221225472u)
+#endif
 
 enum htp_tensor_flags {
     HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)

From 0d7997717c60c9d9665f745a4d10c8b3f2bdcc91 Mon Sep 17 00:00:00 2001
From: njsyw1997 <44545837+njsyw1997@users.noreply.github.com>
Date: Sat, 11 Apr 2026 18:19:58 -0700
Subject: [PATCH 6/6] hexagon: add fwd declaration of htp_context

---
 ggml/src/ggml-hexagon/htp/htp-ctx.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ggml/src/ggml-hexagon/htp/htp-ctx.h b/ggml/src/ggml-hexagon/htp/htp-ctx.h
index 7b09bb4f41..65eaf1b4bc 100644
--- a/ggml/src/ggml-hexagon/htp/htp-ctx.h
+++ b/ggml/src/ggml-hexagon/htp/htp-ctx.h
@@ -31,6 +31,8 @@ struct htp_spad {
     uint32_t                  size_per_thread; // size per thread
 };
 
+struct htp_context;  // forward declaration; presumably referenced by pointer before its full definition -- confirm
+
 // Context while processing an Op
 // TODO: fold this into the main context
 struct htp_ops_context {