Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions src/a2a3/platform/include/common/platform_config.h
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,19 @@ constexpr int PLATFORM_DUMP_READYQUEUE_SIZE = PLATFORM_MAX_AICPU_THREADS * PLATF
*/
constexpr int PLATFORM_DUMP_TIMEOUT_SECONDS = 30;

/**
 * Dump-args mask pool dimensions. The pool is keyed by (ring_id, slot) packed
 * from a PTO2 task_id, so it must span the largest ring depth and task window
 * any runtime built against this platform can use. The dump infra is shared by
 * every runtime (device-orch tensormap_and_ringbuffer at ring depth 4 and the
 * single-ring host-orch host_build_graph), so these are sized to the maximum
 * rather than coupled to one runtime's pto_runtime2_types.h — a runtime that
 * lowers its own PTO2_MAX_RING_DEPTH must not shrink the pool other runtimes
 * rely on (see set_dump_args_task_mask's ring_id bound check).
 *
 * PLATFORM_DUMP_MASK_POOL_MAX_SLOTS must stay a power of two: tensor_dump.h
 * derives its default slot mask as (MAX_SLOTS - 1), which only selects a valid
 * slot index when the slot count is a power of two. The static_asserts below
 * turn a bad edit of either value into a compile error.
 */
constexpr uint32_t PLATFORM_DUMP_MASK_POOL_MAX_RINGS = 4;
constexpr uint32_t PLATFORM_DUMP_MASK_POOL_MAX_SLOTS = 16384;

static_assert(PLATFORM_DUMP_MASK_POOL_MAX_RINGS > 0, "dump mask pool needs at least one ring");
static_assert(
    PLATFORM_DUMP_MASK_POOL_MAX_SLOTS != 0 &&
        (PLATFORM_DUMP_MASK_POOL_MAX_SLOTS & (PLATFORM_DUMP_MASK_POOL_MAX_SLOTS - 1)) == 0,
    "PLATFORM_DUMP_MASK_POOL_MAX_SLOTS must be a power of two (slot mask is MAX_SLOTS - 1)"
);

// =============================================================================
// PMU Profiling Configuration
// =============================================================================
Expand Down
5 changes: 2 additions & 3 deletions src/a2a3/platform/include/common/tensor_dump.h
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,6 @@
#include <cstdint>

#include "common/platform_config.h"
#include "host_build_graph/runtime/pto_runtime2_types.h"

// =============================================================================
// Constants
Expand Down Expand Up @@ -84,8 +83,8 @@ using TensorDumpArgMask = uint64_t;
// Zero preserves legacy "dump all tasks" behavior unless selective mode is enabled.
constexpr TensorDumpArgMask TENSOR_DUMP_ARG_MASK_NONE = 0;
constexpr uint32_t TENSOR_DUMP_ARG_MASK_BITS = 64;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PTO2_MAX_RING_DEPTH;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PTO2_TASK_WINDOW_SIZE;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_RINGS = PLATFORM_DUMP_MASK_POOL_MAX_RINGS;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_MAX_SLOTS = PLATFORM_DUMP_MASK_POOL_MAX_SLOTS;
constexpr uint32_t TENSOR_DUMP_MASK_POOL_DEFAULT_SLOT_MASK = TENSOR_DUMP_MASK_POOL_MAX_SLOTS - 1;
constexpr uint8_t TENSOR_DUMP_RECORD_FLAG_ARG_INDEX_AMBIGUOUS = 1u << 0;

Expand Down
37 changes: 37 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -968,6 +968,43 @@ int DeviceRunner::init_scope_stats(int num_threads, int device_id) {
return 0;
}

void *DeviceRunner::svm_register(void *dev_ptr, std::size_t bytes) {
if (dev_ptr == nullptr || bytes == 0) {
return nullptr;
}
if (load_hal_if_needed() != 0) {
LOG_ERROR("svm_register: failed to load ascend_hal: %s", dlerror());
return nullptr;
}
HalHostRegisterFn fn = get_halHostRegister();
if (fn == nullptr) {
LOG_ERROR("svm_register: halHostRegister symbol not found: %s", dlerror());
return nullptr;
}
void *host_va = nullptr;
int rc = fn(dev_ptr, bytes, DEV_SVM_MAP_HOST, device_id_, &host_va);
if (rc != 0) {
LOG_ERROR("svm_register: halHostRegister failed for dev_ptr %p (rc=%d)", dev_ptr, rc);
return nullptr;
}
return host_va;
}

void DeviceRunner::svm_unregister(void *dev_ptr) {
if (dev_ptr == nullptr) {
return;
}
// halHostUnregister is keyed by the device pointer (mirrors the profiling
// finalize path); the HAL maps it back to the host VA internally.
HalHostUnregisterFn fn = get_halHostUnregister();
if (fn != nullptr) {
int rc = fn(dev_ptr, device_id_);
if (rc != 0) {
LOG_ERROR("svm_unregister: halHostUnregister failed for dev_ptr %p (rc=%d)", dev_ptr, rc);
}
}
}

void DeviceRunner::finalize_collectors() {
auto unregister_cb = [](void *dev_ptr, int device_id) -> int {
HalHostUnregisterFn fn = get_halHostUnregister();
Expand Down
8 changes: 8 additions & 0 deletions src/a2a3/platform/onboard/host/device_runner.h
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,14 @@ class DeviceRunner : public DeviceRunnerBase {
*/
int run(Runtime &runtime, const CallConfig &config) override;

// SVM map/unmap a device buffer into host address space via
// halHostRegister(DEV_SVM_MAP_HOST) / halHostUnregister. host_build_graph
// uses these so its host-side orchestrator can read control tensors whose
// buffer.addr is a device address. The returned host VA may differ from
// dev_ptr — callers must use it for host access.
//
// svm_register returns nullptr when dev_ptr is null, bytes is 0, the HAL
// library or the halHostRegister symbol cannot be loaded, or the HAL call
// itself returns a non-zero code (each failure is logged).
//
// svm_unregister takes the device pointer (not the host VA). A null dev_ptr
// or a missing halHostUnregister symbol is a silent no-op; a non-zero HAL
// return code is logged and otherwise ignored.
void *svm_register(void *dev_ptr, std::size_t bytes) override;
void svm_unregister(void *dev_ptr) override;

/**
* a2a3-only `dep_gen` enablement setter. The shared
* `set_l2_swimlane_enabled`, `set_dump_tensor_enabled`,
Expand Down
202 changes: 160 additions & 42 deletions src/a2a3/runtime/host_build_graph/aicore/aicore_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,26 +14,65 @@
#include "aicore/l2_swimlane_collector_aicore.h"
#include "aicore/pmu_collector_aicore.h"
#include "common/l2_swimlane_profiling.h"
#include "common/platform_config.h" // Platform configuration (C/C++ compatible)
#include "common/platform_config.h" // Register-based communication
#include "pto2_dispatch_payload.h"
#include "runtime.h"

typedef void (*KernelFunc)(__gm__ int64_t *);
/**
* Unified function pointer type for kernel dispatch
*
* All kernels follow the same signature: void kernel(__gm__ int64_t* args)
* This enables simple, switch-free dispatch.
*/
typedef void (*UnifiedKernelFunc)(__gm__ int64_t *);

__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ Task *task) {
if (task->function_bin_addr == 0) {
/**
* Execute task from PTO2DispatchPayload.
*
* Reads function_bin_addr and args from the dispatch payload.
*
* @param payload Pointer to PTO2DispatchPayload in global memory
*/
__aicore__ __attribute__((always_inline)) static void execute_task(__gm__ PTO2DispatchPayload *payload) {
if (payload == nullptr || payload->function_bin_addr == 0) {
return;
}
KernelFunc kernel = (KernelFunc)task->function_bin_addr;
kernel(reinterpret_cast<__gm__ int64_t *>(task->args));

UnifiedKernelFunc kernel = (UnifiedKernelFunc)payload->function_bin_addr;
kernel(reinterpret_cast<__gm__ int64_t *>(payload->args));
OUT_OF_ORDER_STORE_BARRIER();
}

/**
* AICore main execution loop
*
* Implements the AICPU-AICore register-based dispatch protocol:
* 1. Wait for AICPU ready signal via handshake buffer
* 2. Report physical core ID and core type, signal AICore ready
* 3. Cache per-core PTO2DispatchPayload pointer from hank->task
* 4. Poll DATA_MAIN_BASE register for task dispatch until exit signal
*
* AICPU writes &s_payload_per_core[i] to hank->task before setting
* aicpu_ready=1. AICore caches this pointer and reads function_bin_addr +
* args pointer from it on each dispatch. reg_val is a monotonically
* increasing task ID used only for dispatch signaling and ACK/FIN protocol.
*
* Profiling state (enable flag, L2 swimlane rotation channel) is published into the platform
* via set_aicore_profiling_flag / set_aicore_l2_swimlane_ring at kernel entry —
* this routine reads it through the matching getters, so neither Handshake
* nor this signature carry profiling fields.
*
* @param runtime Pointer to Runtime in global memory
* @param block_idx Block index (core ID)
* @param core_type Core type (AIC or AIV)
*/
__aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, int block_idx, CoreType core_type) {
__gm__ Handshake *my_hank = (__gm__ Handshake *)(&runtime->workers[block_idx]);

// Phase 1: Wait for AICPU initialization signal
while (my_hank->aicpu_ready == 0) {
dcci(my_hank, SINGLE_CACHE_LINE);
SPIN_WAIT_HINT();
}

// Phase 2: Report physical core ID, signal ready
Expand All @@ -43,73 +82,139 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
dcci(&my_hank->aicore_regs_ready, SINGLE_CACHE_LINE, CACHELINE_OUT);
while (my_hank->aicpu_regs_ready == 0) {
dcci(&my_hank->aicpu_regs_ready, SINGLE_CACHE_LINE);
SPIN_WAIT_HINT();
}
// Report initial idle status via register
write_reg(RegId::COND, AICORE_IDLE_VALUE);

// Phase 3: Report core type, signal ready
my_hank->core_type = core_type;
OUT_OF_ORDER_STORE_BARRIER();
my_hank->aicore_done = block_idx + 1;
my_hank->aicore_done = block_idx + 1; // Signal ready (use block_idx + 1 to avoid 0)

dcci(my_hank, SINGLE_CACHE_LINE, CACHELINE_OUT);

// Cache per-core dispatch payload pointer (set by AICPU before aicpu_ready)
__gm__ PTO2DispatchPayload *payload = reinterpret_cast<__gm__ PTO2DispatchPayload *>(my_hank->task);

uint32_t enable_profiling_flag = get_aicore_profiling_flag();
bool l2_swimlane_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_L2_SWIMLANE);
bool dump_tensor_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_DUMP_TENSOR);
bool pmu_enabled = GET_PROFILING_FLAG(enable_profiling_flag, PROFILING_FLAG_PMU);

// Per-core L2SwimlaneActiveHead channel; see tensormap_and_ringbuffer/.../aicore_executor.cpp.
// Deferred until first task so AICPU's init has populated the rotation
// table (the dispatch itself proves init is done).
__gm__ L2SwimlaneActiveHead *l2_swimlane_head = nullptr;
// Per-core L2SwimlaneActiveHead channel. AICPU completes
// `l2_swimlane_aicpu_init` before writing `aicpu_ready = 1` in
// `handshake_all_cores`, and Phase 1 above has already observed
// `aicpu_ready == 1`, so the rotation-table slot is populated and the
// first deref is safe here — off the dispatch→start critical path.
__gm__ L2SwimlaneActiveHead *l2_swimlane_head = l2_swimlane_enabled ? get_l2_swimlane_aicore_head() : nullptr;
// cached_buf_seq must start != AICPU's initial head.current_buf_seq (0)
// so the first record_task observes a mismatch and loads the buffer ptr.
L2SwimlaneAicoreLocalState l2_swimlane_local = {nullptr, UINT32_MAX, 0};

volatile uint32_t task_id = AICPU_IDLE_TASK_ID;
volatile uint32_t last_task_id = AICPU_IDLE_TASK_ID;
// Phase 4: Main execution loop - poll register for tasks until exit signal
// Register encoding: AICPU_IDLE_TASK_ID=idle, task_id=task, AICORE_EXIT_SIGNAL=exit
uint32_t reg_val = AICPU_IDLE_TASK_ID;
uint32_t last_reg_val = AICPU_IDLE_TASK_ID;
bool exiting = false;

while (true) {
task_id = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
if (task_id == AICORE_EXIT_SIGNAL) {
reg_val = static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE));
if (reg_val == AICORE_EXIT_SIGNAL) {
// Signal exit acknowledgment to AICPU
write_reg(RegId::COND, AICORE_EXITED_VALUE);
break;
}

if (task_id == AICPU_IDLE_TASK_ID || task_id == last_task_id) {
// Execute task if new (reg_val encoding: AICPU_IDLE_TASK_ID=idle, task_id=task)
if (reg_val == AICPU_IDLE_TASK_ID || reg_val == last_reg_val) {
SPIN_WAIT_HINT();
continue;
}

{
// receive_time captures the instant DATA_MAIN_BASE returned a new
// task_id, BEFORE the ack write. Paired with start_time (captured
// after task_ptr resolve) it lets DFX split head_OH into the
// AICPU→AICore NoC propagation (dispatch_ts → receive_time,
// hardware-bound) and the AICore-local ack + task_ptr resolve
// (receive_time → start_time). host_build_graph has no per-task
// dcci so the local-setup span is naturally tighter than the
// tensormap_and_ringbuffer runtime; the field still records it.
// receive_time marks the moment AICPU's full "task is ready to
// execute" signal landed on this core. Paired with start_time
// (captured after the per-task dcci + ack pair) it lets DFX split
// head_OH into the AICPU→AICore-ready propagation (dispatch_ts →
// receive_time, hardware + scheduling-bound) and the AICore-local
// critical-path prep (receive_time → start_time, software-tunable).
// Stored in the record as a 32-bit delta `start_time - receive_time`.
//
// For the common path (not_ready == 0) the new task_id on
// DATA_MAIN_BASE is itself the ready signal, so receive_time is
// stamped immediately and local_setup covers dcci + ack.
//
// For the speculative early-dispatch path (not_ready == 1) the
// dcci ran BEFORE the dependency-wait spin, so its cost is hidden
// behind the doorbell-wait — not on the critical path between
// "task genuinely ready" and "kernel begins". receive_time is
// re-stamped after the doorbell arrives, so propagation absorbs
// both the original NoC delivery AND any speculation overshoot,
// while local_setup stays the pure ack-on-critical-path cost. This
// makes local_setup the clean "AICore prep we can't hide" figure
// for both paths.
uint64_t receive_time = get_sys_cnt_aicore();

uint32_t actual_task_id = task_id;
write_reg(RegId::COND, MAKE_ACK_VALUE(actual_task_id));
uint32_t task_id = reg_val; // Decode: register holds task_id directly

// Select dual-buffer slot: same bit as AICPU used when writing payload
__gm__ PTO2DispatchPayload *exec_payload = payload + (task_id & 1u);

// First-task lazy resolve of the rotation channel.
if (l2_swimlane_enabled && l2_swimlane_head == nullptr) {
l2_swimlane_head = get_l2_swimlane_aicore_head();
// Invalidate payload buffer (AICPU updates its content each dispatch)
dcci(exec_payload, ENTIRE_DATA_CACHE);

// Speculative early-dispatch gate. A not-ready task was staged on
// this core before its dependencies resolved; wait until AICPU rings
// the doorbell (DATA_MAIN_BASE high 32 == task_id) before executing.
// The ACK is deferred until AFTER the gate so the scheduler keeps the
// core off-limits (pending_occupied stays set, no ACK->pending_freed)
// while the task is gated — preventing a real task from being
// dual-issued behind it. The kernel's own input dcci runs inside
// execute_task() below — strictly AFTER this gate — so predecessor
// outputs are visible. not_ready == 0 (the common path) skips this.
if (exec_payload->not_ready) {
while (true) {
// Honor teardown: shutdown overwrites the low half with EXIT.
// Check it on the doorbell-match iteration too, so an EXIT that
// races in right after the matching doorbell still wins over
// executing the gated task.
if (read_dmb_high32() == task_id) {
if (static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE)) == AICORE_EXIT_SIGNAL) {
exiting = true;
}
break;
}
if (static_cast<uint32_t>(read_reg(RegId::DATA_MAIN_BASE)) == AICORE_EXIT_SIGNAL) {
exiting = true;
break;
}
SPIN_WAIT_HINT();
}
Comment thread
ChaoWao marked this conversation as resolved.
if (exiting) {
write_reg(RegId::COND, AICORE_EXITED_VALUE);
break;
}
// Re-stamp receive_time at the moment the doorbell landed: the
// dcci above ran during the speculative-staging window
// (overlapped with the dependency wait, off the critical path).
// Propagation now absorbs the speculation overshoot; local_setup
// = start - receive stays the pure ack-on-critical-path cost.
receive_time = get_sys_cnt_aicore();
}

__gm__ Task *task_ptr = &(runtime->tasks[actual_task_id]);
write_reg(RegId::COND, MAKE_ACK_VALUE(task_id));

// Performance profiling: record start time
uint64_t start_time = get_sys_cnt_aicore();

// PMU: start counting window around kernel execution
if (pmu_enabled) {
pmu_aicore_begin();
}

execute_task(task_ptr);
// Execute the task
execute_task(exec_payload);

if (pmu_enabled) {
pmu_aicore_end();
Expand All @@ -119,22 +224,35 @@ __aicore__ __attribute__((weak)) void aicore_execute(__gm__ Runtime *runtime, in
pipe_barrier(PIPE_ALL);
}

// Performance profiling: record task execution.
// Two identity fields go into the record (different roles):
// - task_token_raw (PTO2 ring/local) is pulled from the dispatch
// payload's LocalContext.async_ctx — already in AICore cache
// from the just-completed task, no extra GM load. Host uses
// it as the canonical task identity for JSON output / ring
// decoding.
// - reg_task_id is `task_id` (= reg_val, the per-core dispatch
// token AICore just read from DATA_MAIN_BASE). Per-dispatch
// unique within this core; host uses it as the join key
// against the AICPU record stream. Required for correctness
// under SPMD (block_num > num_cores) and MIX cluster spread,
// where multiple dispatches of the same task share the same
// task_token_raw.
last_reg_val = reg_val;
write_reg(RegId::COND, MAKE_FIN_VALUE(task_id));

// Sample end_time AFTER the FIN write so the op-event end marks the
// moment the AICPU can first observe completion — any compute-end ->
// FIN gap (epilogue / write-back) shows directly on the bar instead
// of being inferred. The record write itself stays off the critical
// path (it runs after FIN, so it no longer delays completion).
if (l2_swimlane_enabled) {
uint64_t end_time = get_sys_cnt_aicore();
// host_build_graph uses plain task indices; zero-extend into
// the task_token_raw slot (identity) AND pass as reg_task_id
// (join key). With block_num always == 1 in this runtime
// there is no dispatch fan-out per task, so identity and
// dispatch token coincide and a single value covers both.
uint64_t task_token_raw = exec_payload->local_context.async_ctx.task_token.raw;
l2_swimlane_aicore_record_task(
l2_swimlane_head, &l2_swimlane_local, static_cast<uint64_t>(actual_task_id),
static_cast<uint32_t>(actual_task_id), receive_time, start_time, end_time
l2_swimlane_head, &l2_swimlane_local, task_token_raw, task_id, receive_time, start_time, end_time
);
}

last_task_id = task_id;

write_reg(RegId::COND, MAKE_FIN_VALUE(actual_task_id));
}
}

Expand Down
Loading
Loading