Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions ggml/src/ggml-hexagon/htp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ list(FIND HTP_HMX_VERSIONS ${DSP_VERSION} _hmx_idx)

if (_hmx_idx GREATER_EQUAL 0)
target_sources(${HTP_LIB} PRIVATE
hmx-worker.c
hmx-matmul-ops.c
)

Expand Down
389 changes: 252 additions & 137 deletions ggml/src/ggml-hexagon/htp/hmx-matmul-ops.c

Large diffs are not rendered by default.

56 changes: 0 additions & 56 deletions ggml/src/ggml-hexagon/htp/hmx-utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,69 +14,13 @@

#define HMX_INLINE_ALWAYS inline __attribute__((unused, always_inline))

// Issue the HMX `bias = mxmem2(Rs)` instruction with Rs = scales.
// scales: address handed to the instruction in a general-purpose register.
// NOTE(review): presumably points at the 256-byte scale area prepared by
// hmx_init_column_scales() — confirm against callers.
// NOTE(review): unlike the tile-load helpers in this file, this asm has no
// "memory" clobber; confirm the compiler cannot reorder the preceding
// stores to *scales past it.
static HMX_INLINE_ALWAYS void hmx_set_output_scales(const void *scales) {
asm volatile("bias = mxmem2(%0)" :: "r"(scales));
}

// Fill an aligned 256-byte area with the scale vector followed by zero padding.
//   out_scales: destination, written as two consecutive HVX_Vector slots.
//   v_scale:    vector stored in the first slot; the second slot is zeroed.
static HMX_INLINE_ALWAYS void hmx_init_column_scales(void *out_scales, HVX_Vector v_scale) {
    HVX_Vector *slots = (HVX_Vector *) out_scales;

    slots[0] = v_scale;       // scale values
    slots[1] = Q6_V_vzero();  // zero padding
}

// Load multiple contiguous tiles with :deep streaming.
//
// row_tiles: base address (Rs) of the activation tiles.
// col_tiles: base address (Rs) of the weight tiles.
// n_tiles:   number of contiguous tiles in each operand.
//
// Rt = total region size - 1; the hardware streams through [Rs, Rs + Rt].
// The same limit (n_tiles * HMX_FP16_TILE_SIZE - 1) is used for both operands.
//
// IMPORTANT: the tile region [Rs, Rs + Rt] must NOT cross a VTCM 4 MB bank
// boundary, otherwise the mxmem instruction will raise a precise bus error.
// Callers must ensure their VTCM layout satisfies this constraint.
//
// NOTE(review): n_tiles == 0 makes `limit` wrap around (unsigned arithmetic);
// callers presumably never pass 0 — confirm.
static HMX_INLINE_ALWAYS void hmx_load_tiles_fp16(const __fp16 *row_tiles,
const __fp16 *col_tiles,
size_t n_tiles) {
size_t limit = n_tiles * HMX_FP16_TILE_SIZE - 1;
// Both loads are issued in one instruction packet ({ ... }).
asm volatile(
"{ activation.hf = mxmem(%0, %1):deep\n"
"weight.hf = mxmem(%2, %3) }\n"
:: "r"(row_tiles), "r"(limit), "r"(col_tiles), "r"(limit)
: "memory");
}

// Load a single activation+weight tile pair (no :deep streaming).
//
// act_tile: base address (Rs) of the activation tile.
// wt_tile:  base address (Rs) of the weight tile.
//
// Rt defines the accessible region [Rs, Rs+Rt]. Following the reference formula
// (limit = n_tiles * HMX_FP16_TILE_SIZE - 1), for a single tile Rt = 2047.
// The original code used Rt=0x7FFF (32 KB region); when dynamic VTCM allocation
// places a tile near a 4 MB bank boundary, the oversized region crosses it and
// triggers a precise bus error (0x2601). Rt=2047 confines accesses to exactly
// one 2048-byte tile while covering all 16 HVX vectors (offsets 0..2047).
static HMX_INLINE_ALWAYS void hmx_load_tile_pair_fp16(const __fp16 *act_tile,
const __fp16 *wt_tile) {
// Both loads are issued in one instruction packet ({ ... }); 2047 is the Rt
// limit described above, passed separately for each operand.
asm volatile(
"{ activation.hf = mxmem(%0, %1)\n"
"weight.hf = mxmem(%2, %3) }\n"
:: "r"(act_tile), "r"(2047),
"r"(wt_tile), "r"(2047)
: "memory");
}

// Convert the HMX accumulator to fp16 and store it at `out`.
// out: destination address passed as the first mxmem operand; the second
//      operand is the constant 0.
static HMX_INLINE_ALWAYS void hmx_consume_accumulator_fp16(__fp16 *out) {
// Use the combined convert-and-store instruction (matches the reference
// Q6_mxmem_AR_after_hf intrinsic). The previous two-instruction sequence
// "cvt.hf = acc(2); mxmem = cvt" used an undocumented Rs=2 parameter.
asm volatile(
"mxmem(%0, %1):after.hf = acc\n"
:: "r"(out), "r"(0)
: "memory");
}

// Inner product of two tile vectors, with the result written to `out`.
//   out:       destination for the converted fp16 accumulator.
//   row_tiles: activation tiles, n_tiles of them, contiguous.
//   col_tiles: weight tiles, n_tiles of them, contiguous.
//   n_tiles:   tile count shared by both operands.
static HMX_INLINE_ALWAYS void hmx_dot_fp16(__fp16 *out, const __fp16 *row_tiles, const __fp16 *col_tiles,
                                           size_t n_tiles) {
    // Step 1: stream both operands into the HMX unit.
    hmx_load_tiles_fp16(row_tiles, col_tiles, n_tiles);

    // Step 2: convert the accumulator and store it.
    hmx_consume_accumulator_fp16(out);
}

// --- VTCM sequential allocator (from htp-ops-lib/include/dsp/vtcm_mgr.h) ---

static inline uint8_t *vtcm_seq_alloc(uint8_t **vtcm_ptr, size_t size) {
Expand Down
205 changes: 205 additions & 0 deletions ggml/src/ggml-hexagon/htp/hmx-worker.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,205 @@
#include "hmx-worker.h"

#include <HAP_compute_res.h>
#include <HAP_farf.h>
#include <qurt.h>
#include <stdatomic.h>
#include <stdlib.h>
#include <string.h>

// ---------------------------------------------------------------------------
// Internal types
// ---------------------------------------------------------------------------

// Commands the main thread can post to the HMX worker thread.
// Values are spelled out; they match the implicit numbering (0..3).
enum hmx_worker_cmd {
    HMX_WORKER_CMD_BEGIN = 0,  // worker acquires the HMX lock
    HMX_WORKER_CMD_JOB   = 1,  // worker runs fn(data)
    HMX_WORKER_CMD_END   = 2,  // worker releases the HMX lock
    HMX_WORKER_CMD_KILL  = 3,  // worker thread exits
};

// State shared between the issuing (main) thread and the HMX worker thread.
// The plain (non-atomic) command fields are written by the issuer before it
// bumps cmd_seqn with release ordering, and read by the worker after it
// observes the new cmd_seqn with acquire ordering.
struct hmx_worker_context {
// Command channel: main thread -> worker
atomic_uint cmd_seqn; // bumped by main thread for each command; also the futex word the worker sleeps on
enum hmx_worker_cmd cmd_type; // which command to run
hmx_worker_fn_t fn; // job callback (used by HMX_WORKER_CMD_JOB)
void * data; // opaque argument passed to fn

// Completion channel: worker -> main thread
atomic_uint done_seqn; // set to cmd_seqn when command completes; also the futex word the issuer sleeps on

// Configuration
uint32_t vtcm_rctx; // context id passed to HAP_compute_res_hmx_lock/unlock

// Thread resources
qurt_thread_t thread; // worker thread handle (joined on release)
void * stack; // single allocation: stack + context (base pointer to free)
};

// ---------------------------------------------------------------------------
// Worker thread entry point
// ---------------------------------------------------------------------------

static void hmx_worker_main(void * arg) {
struct hmx_worker_context * ctx = (struct hmx_worker_context *) arg;

FARF(HIGH, "hmx-worker: thread started");

unsigned int prev_seqn = 0;
for (;;) {
unsigned int seqn = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire);
if (seqn == prev_seqn) {
qurt_futex_wait(&ctx->cmd_seqn, prev_seqn);
continue;
}
prev_seqn = seqn;

switch (ctx->cmd_type) {
case HMX_WORKER_CMD_BEGIN:
HAP_compute_res_hmx_lock(ctx->vtcm_rctx);
break;

case HMX_WORKER_CMD_JOB:
ctx->fn(ctx->data);
break;

case HMX_WORKER_CMD_END:
HAP_compute_res_hmx_unlock(ctx->vtcm_rctx);
break;

case HMX_WORKER_CMD_KILL:
atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
qurt_futex_wake(&ctx->done_seqn, 1);
FARF(HIGH, "hmx-worker: thread stopped");
return;
}

atomic_store_explicit(&ctx->done_seqn, seqn, memory_order_release);
qurt_futex_wake(&ctx->done_seqn, 1);
}
}

// ---------------------------------------------------------------------------
// Internal helpers
// ---------------------------------------------------------------------------

// Post one command to the worker thread and return without waiting.
//   ctx:  worker context shared with the worker thread.
//   type: command to execute.
//   fn:   job callback (only meaningful for HMX_WORKER_CMD_JOB).
//   data: opaque argument for fn.
static void hmx_worker_issue(struct hmx_worker_context * ctx,
                             enum hmx_worker_cmd         type,
                             hmx_worker_fn_t             fn,
                             void *                      data) {
    // Fill in the payload first; these are plain stores.
    ctx->data     = data;
    ctx->fn       = fn;
    ctx->cmd_type = type;

    // The release increment publishes the payload stores above to the worker.
    atomic_fetch_add_explicit(&ctx->cmd_seqn, 1, memory_order_release);

    // Wake the worker if it is sleeping on cmd_seqn.
    qurt_futex_wake(&ctx->cmd_seqn, 1);
}

// Wait for the worker to finish the most recently issued command, i.e. until
// done_seqn equals the cmd_seqn value sampled on entry.
static void hmx_worker_drain(struct hmx_worker_context * ctx) {
    const unsigned int target = atomic_load_explicit(&ctx->cmd_seqn, memory_order_acquire);

    unsigned int observed = atomic_load_explicit(&ctx->done_seqn, memory_order_acquire);
    while (observed != target) {
        // Sleep only while done_seqn still holds the value just read. If the
        // worker completed in between, the mismatch makes the wait return
        // immediately instead of sleeping forever on the new seqn.
        qurt_futex_wait(&ctx->done_seqn, observed);
        observed = atomic_load_explicit(&ctx->done_seqn, memory_order_acquire);
    }
}

// ---------------------------------------------------------------------------
// Public API
// ---------------------------------------------------------------------------

#define LOWEST_USABLE_QURT_PRIO (254)

// Create the HMX worker thread.
//
//   out:        receives the new handle on success; set to NULL on failure.
//   stack_size: requested worker stack size in bytes. It is rounded up to the
//               alignment of struct hmx_worker_context, because the context
//               is placed immediately after the stack in the same allocation.
//   vtcm_rctx:  context id later passed to HAP_compute_res_hmx_lock/unlock.
//
// Returns AEE_SUCCESS, or AEE_EBADPARM (NULL out / unusable stack size),
// AEE_ENOMEMORY (allocation failed), AEE_EQURTTHREADCREATE (thread creation
// failed). On failure nothing is left allocated.
AEEResult hmx_worker_init(hmx_worker_context_t * out, uint32_t stack_size, uint32_t vtcm_rctx) {
    if (!out) {
        return AEE_EBADPARM;
    }
    *out = NULL;  // a failed init leaves a handle that hmx_worker_release() accepts

    const size_t ctx_size  = sizeof(struct hmx_worker_context);
    const size_t ctx_align = _Alignof(struct hmx_worker_context);

    // Reject sizes that are zero or would overflow the layout arithmetic below.
    if (stack_size == 0 || (size_t) stack_size > SIZE_MAX - ctx_size - (ctx_align - 1)) {
        FARF(ERROR, "hmx-worker: invalid stack size (%u)", (unsigned int) stack_size);
        return AEE_EBADPARM;
    }

    // Round the stack up so that (blob + stack_bytes) is correctly aligned for
    // the context struct (alignments are powers of two).
    const size_t stack_bytes = ((size_t) stack_size + ctx_align - 1) & ~(ctx_align - 1);

    // Single zero-initialised allocation: stack followed by context struct.
    const size_t    total = stack_bytes + ctx_size;
    unsigned char * blob  = (unsigned char *) calloc(1, total);
    if (!blob) {
        FARF(ERROR, "hmx-worker: allocation failed (%zu bytes)", total);
        return AEE_ENOMEMORY;
    }

    struct hmx_worker_context * ctx = (struct hmx_worker_context *) (blob + stack_bytes);
    ctx->stack     = blob;  // base pointer, freed by hmx_worker_release()
    ctx->vtcm_rctx = vtcm_rctx;
    atomic_init(&ctx->cmd_seqn, 0);
    atomic_init(&ctx->done_seqn, 0);

    // Match caller thread priority (same pattern as worker-pool.c), clamped to
    // the range [1, LOWEST_USABLE_QURT_PRIO].
    int prio = qurt_thread_get_priority(qurt_thread_get_id());
    if (prio < 1) {
        prio = 1;
    }
    if (prio > LOWEST_USABLE_QURT_PRIO) {
        prio = LOWEST_USABLE_QURT_PRIO;
    }

    qurt_thread_attr_t attr;
    qurt_thread_attr_init(&attr);
    qurt_thread_attr_set_stack_addr(&attr, blob);
    qurt_thread_attr_set_stack_size(&attr, (unsigned int) stack_bytes);
    qurt_thread_attr_set_priority(&attr, prio);
    qurt_thread_attr_set_name(&attr, "hmx_worker");

    int err = qurt_thread_create(&ctx->thread, &attr, hmx_worker_main, ctx);
    if (err) {
        FARF(ERROR, "hmx-worker: thread create failed (%d)", err);
        free(blob);
        return AEE_EQURTTHREADCREATE;
    }

    *out = ctx;
    return AEE_SUCCESS;
}

// Stop the worker thread and free its resources. A NULL handle is ignored.
// Posts HMX_WORKER_CMD_KILL, waits for the worker to acknowledge it, joins the
// thread, then frees the combined stack + context allocation.
void hmx_worker_release(hmx_worker_context_t ctx) {
    if (ctx == NULL) {
        return;
    }

    // Ask the worker to exit and wait for its acknowledgement.
    hmx_worker_issue(ctx, HMX_WORKER_CMD_KILL, NULL, NULL);
    hmx_worker_drain(ctx);

    // Reap the thread; its exit status is not used.
    int exit_status;
    qurt_thread_join(ctx->thread, &exit_status);

    // ctx lives inside this allocation, so it is invalid after the free.
    void * blob = ctx->stack;
    free(blob);
}

// Have the worker thread acquire the HMX lock; blocks until the worker has
// processed the request.
// Returns AEE_EBADPARM for a NULL handle (e.g. after a failed
// hmx_worker_init), otherwise AEE_SUCCESS.
AEEResult hmx_worker_begin(hmx_worker_context_t ctx) {
    if (!ctx) {
        return AEE_EBADPARM;
    }

    hmx_worker_issue(ctx, HMX_WORKER_CMD_BEGIN, NULL, NULL);
    hmx_worker_drain(ctx);  // wait until the worker has run the lock command
    return AEE_SUCCESS;
}

// Submit a job to the worker thread (non-blocking once any previous command
// has completed).
//   ctx:  worker handle.
//   fn:   callback executed on the worker thread; must not be NULL.
//   data: opaque argument passed to fn.
// Returns AEE_EBADPARM for a NULL handle or NULL callback, otherwise
// AEE_SUCCESS.
AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void * data) {
    // A NULL callback would be called on the worker thread; reject it here,
    // where the caller can still see the error.
    if (!ctx || !fn) {
        return AEE_EBADPARM;
    }

    // Caller is expected to have called wait() for any previous job.
    // Safety: drain any residual (should be instant in normal flow), so the
    // command fields are not overwritten while the worker may still read them.
    hmx_worker_drain(ctx);
    hmx_worker_issue(ctx, HMX_WORKER_CMD_JOB, fn, data);
    return AEE_SUCCESS;
}

// Block until the most recently issued command has completed; returns
// immediately when nothing is in flight.
// Returns AEE_EBADPARM for a NULL handle, otherwise AEE_SUCCESS.
AEEResult hmx_worker_wait(hmx_worker_context_t ctx) {
    if (!ctx) {
        return AEE_EBADPARM;
    }

    hmx_worker_drain(ctx);
    return AEE_SUCCESS;
}

// Wait for any in-flight job, then have the worker thread release the HMX
// lock; blocks until the worker has processed the request.
// Returns AEE_EBADPARM for a NULL handle, otherwise AEE_SUCCESS.
AEEResult hmx_worker_end(hmx_worker_context_t ctx) {
    if (!ctx) {
        return AEE_EBADPARM;
    }

    hmx_worker_drain(ctx);  // ensure no in-flight job
    hmx_worker_issue(ctx, HMX_WORKER_CMD_END, NULL, NULL);
    hmx_worker_drain(ctx);  // wait until the worker has run the unlock command
    return AEE_SUCCESS;
}
54 changes: 54 additions & 0 deletions ggml/src/ggml-hexagon/htp/hmx-worker.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef HMX_WORKER_H
#define HMX_WORKER_H

// Async HMX worker: single dedicated thread for HMX compute,
// allowing the main thread to run HVX/DMA work in parallel.
//
// Lifecycle per matmul op:
//   hmx_worker_begin  - worker thread acquires HMX lock
//   hmx_worker_submit - fire a job (non-blocking)
//   hmx_worker_wait   - block until current job completes
//   ...               - repeat submit/wait as needed
//   hmx_worker_end    - worker thread releases HMX lock
//
// Design: single-producer single-consumer, 1 in-flight job max.
// All functions below are meant to be called from the one producer thread.

#include <AEEStdDef.h>
#include <AEEStdErr.h>
#include <stdint.h>

#ifdef __cplusplus
extern "C" {
#endif

// Job callback executed on the worker thread; |data| is the pointer given to
// hmx_worker_submit().
typedef void (*hmx_worker_fn_t)(void * data);

// Opaque worker handle (pointer to an internal context struct).
typedef struct hmx_worker_context * hmx_worker_context_t;

// Create worker thread. Thread starts idle (no HMX lock held).
//   ctx:        receives the new handle on success.
//   stack_size: worker thread stack size in bytes.
//   vtcm_rctx:  compute-resource context id used for the HMX lock/unlock calls.
// Returns AEE_SUCCESS, or an AEE_E* error code (bad parameter, out of memory,
// thread creation failure).
AEEResult hmx_worker_init(hmx_worker_context_t * ctx, uint32_t stack_size, uint32_t vtcm_rctx);

// Destroy worker thread and free its resources. A NULL handle is ignored.
// Must not be called while a job is in-flight.
void hmx_worker_release(hmx_worker_context_t ctx);

// Worker thread acquires HMX lock. Blocks until the worker has processed the
// request.
AEEResult hmx_worker_begin(hmx_worker_context_t ctx);

// Submit a job (non-blocking). Caller must have called wait() for any
// previous job before submitting a new one.
// |fn| runs on the worker thread with |data| as its argument.
// |data| must remain valid until the corresponding wait() returns.
AEEResult hmx_worker_submit(hmx_worker_context_t ctx, hmx_worker_fn_t fn, void * data);

// Block until the current in-flight job completes.
// Returns immediately if no job is in-flight.
AEEResult hmx_worker_wait(hmx_worker_context_t ctx);

// Ensure no in-flight job, then worker thread releases HMX lock.
// Blocks until unlock is complete.
AEEResult hmx_worker_end(hmx_worker_context_t ctx);

#ifdef __cplusplus
}
#endif

#endif /* HMX_WORKER_H */
7 changes: 7 additions & 0 deletions ggml/src/ggml-hexagon/htp/htp-ctx.h
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

#include "hex-dma.h"
#include "htp-ops.h"
#include "hmx-worker.h"
#include "worker-pool.h"

#include <assert.h>
Expand Down Expand Up @@ -30,6 +31,8 @@ struct htp_spad {
uint32_t size_per_thread; // size per thread
};

struct htp_context;

// Context while processing an Op
// TODO: fold this into the main context
struct htp_ops_context {
Expand Down Expand Up @@ -72,6 +75,10 @@ struct htp_context {
atomic_bool vtcm_needs_release;

struct htp_ops_context octx;

#ifdef HTP_HAS_HMX
hmx_worker_context_t hmx_worker; // Async HMX worker for pipeline overlap
#endif
};

int op_matmul(struct htp_ops_context * octx);
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-hexagon/htp/htp-ops.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,12 @@ enum htp_op_code {
#define HTP_OP_MAX_BUFS 8
#define HTP_OP_MAX_REQS 256
#define HTP_OP_MAX_TENSORS (HTP_OP_MAX_REQS * HTP_OP_MAX_INPUTS + HTP_OP_MAX_REQS)

// Architecture-dependent value of HTP_OP_MAX_VMEM, in bytes.
// NOTE(review): presumably the maximum virtual memory usable for op buffers —
// confirm against the code that consumes this macro.
#if __HVX_ARCH__ < 75
// 3167538380 bytes, roughly 2.95 GiB.
#define HTP_OP_MAX_VMEM (3167538380u)
#else
// 3221225472 bytes = 3 * 2^30, exactly 3 GiB.
#define HTP_OP_MAX_VMEM (3221225472u)
#endif

enum htp_tensor_flags {
HTP_TENSOR_COMPUTE = (1U << 0), // Tensor buffer temporal compute data (not weights)
Expand Down
5 changes: 5 additions & 0 deletions ggml/src/ggml-hexagon/htp/hvx-base.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,9 +116,14 @@ static inline HVX_VectorPred hvx_vec_is_nan_f16(HVX_Vector v) {
}

// Convert two vectors of IEEE fp32 values (v0, v1) into one vector of fp16
// values. Each input is first converted to qf32, the two qf32 vectors are
// combined into a pair (v1 as the first operand, v0 as the second), and the
// pair is converted to hf.
static inline HVX_Vector hvx_vec_f32_to_f16_shuff(HVX_Vector v0, HVX_Vector v1) {
#if __HVX_ARCH__ >= 81
    // Direct sf -> qf32 conversion.
    HVX_Vector qf_lo = Q6_Vqf32_equals_Vsf(v0);
    HVX_Vector qf_hi = Q6_Vqf32_equals_Vsf(v1);
#else
    // Older targets: obtain qf32 by adding zero.
    const HVX_Vector vzero = Q6_V_vzero();
    HVX_Vector qf_lo = Q6_Vqf32_vadd_VsfVsf(v0, vzero);
    HVX_Vector qf_hi = Q6_Vqf32_vadd_VsfVsf(v1, vzero);
#endif
    HVX_Vector packed_hf = Q6_Vhf_equals_Wqf32(Q6_W_vcombine_VV(qf_hi, qf_lo));
    return packed_hf;
}

Expand Down
Loading
Loading