From 445329d3492e615880553ac52b55b78b6a3dfe4c Mon Sep 17 00:00:00 2001 From: Chao Wang <26245345+ChaoWao@users.noreply.github.com> Date: Tue, 30 Jun 2026 19:10:46 +0800 Subject: [PATCH] perf(runtime): overlap AICore handshake wakeups; batch the release barrier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit handshake_all_cores paid two serial costs that can be overlapped or batched: 1. Step 2 blocked on core i (wait aicore_regs_ready, init regs, wait aicore_done) before looking at core i+1, so the wakeup latencies of the 72 AICore cores added up. The cores wake and advance independently, so Step 2 is now two phase-batched sweeps (each pass polls every outstanding core and services the ready ones): the per-core wakeup waits overlap instead of accumulating. The handshake flags are GM reads, not accesses to the nGnRE MMIO reg window, so sweeping is not subject to the serial-LDR constraint that applies to COND polling. 2. Step 1 raised aicpu_ready inside the per-core loop with a barrier on every iteration (71 redundant barriers). The task pointers are now all written and published with one barrier, and only then is aicpu_ready raised for all cores — one barrier suffices because AICore relies only on "all task stores visible before any aicpu_ready". Measured (qwen3-14B 3.5k decode, a2a3 onboard, PTO2_RING_TASK_WINDOW=524288): preamble 329us -> 150us per step. Output tokens identical. The residual ~150us is the physical floor: AICore launch (rtKernelLaunch lazy binary load) + NoC cold-wakeup, plus one structurally-required GM round-trip to bind logical block_idx <-> runtime-assigned physical_core_id (the register channel cannot bootstrap that binding — it needs the physical core id that the binding itself establishes, and has no host-preclearable "not ready" sentinel). Applied symmetrically to a2a3 and a5.
Co-Authored-By: Claude Opus 4.8 --- .../runtime/scheduler/scheduler_cold_path.cpp | 139 +++++++++++------- .../runtime/scheduler/scheduler_cold_path.cpp | 138 ++++++++++------- 2 files changed, 172 insertions(+), 105 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 948388aef..4539cf0de 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -703,12 +703,17 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { LOG_INFO_V0("Handshaking with %d cores", cores_total_num_); - // Step 1: Write per-core payload addresses and send handshake signal. - // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before - // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. + // Step 1: Write per-core payload addresses, then release all cores. The + // task pointers are written first and published with a single barrier, then + // aicpu_ready is raised for every core. One barrier (not one per core) + // suffices: the barrier guarantees every task store is globally visible + // before any aicpu_ready store, which is the only ordering AICore relies on + // (it reads task only after observing aicpu_ready==1). 
for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].task = reinterpret_cast(&payload_per_core_[i][0]); - OUT_OF_ORDER_STORE_BARRIER(); + } + OUT_OF_ORDER_STORE_BARRIER(); + for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].aicpu_ready = 1; } OUT_OF_ORDER_STORE_BARRIER(); @@ -716,70 +721,98 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { // Get platform physical cores count for validation uint32_t max_physical_cores_count = platform_get_physical_cores_count(); - // Step 2: Wait for all cores to respond, collect core type and register addresses + // Step 2: collect responses from all cores. The 72 AICore cores wake and + // advance their handshake phases in parallel, so we sweep — poll every + // outstanding core per pass and service whichever are ready — rather than + // blocking on core i before looking at core i+1. A per-core blocking loop + // serializes the wakeups (Σ per-core latency); sweeping overlaps them + // (≈ max per-core latency + one drain of the GM-flag polls). The flags are + // GM reads (not the nGnRE MMIO reg window), so the polls are not forced + // serial the way RegId::COND polling is. 
bool handshake_failed = false; - for (int32_t i = 0; i < cores_total_num_; i++) { - Handshake *hank = &all_handshakes[i]; - - while (hank->aicore_regs_ready == 0) { - SPIN_WAIT_HINT(); - } - - uint32_t physical_core_id = hank->physical_core_id; - - if (physical_core_id >= max_physical_cores_count) { - LOG_ERROR( - "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, - max_physical_cores_count - ); - handshake_failed = true; - continue; - } - - uint64_t *regs = reinterpret_cast(regs_); - uint64_t reg_addr = regs[physical_core_id]; - - // Initialize AICore registers after discovery (first round) - platform_init_aicore_regs(reg_addr); - OUT_OF_ORDER_STORE_BARRIER(); - hank->aicpu_regs_ready = 1; - - OUT_OF_ORDER_STORE_BARRIER(); - - while (hank->aicore_done == 0) { - SPIN_WAIT_HINT(); - } - - CoreType type = hank->core_type; + uint64_t *regs = reinterpret_cast(regs_); + bool regs_phase_done[RUNTIME_MAX_WORKER] = {false}; + uint64_t reg_addr_of[RUNTIME_MAX_WORKER] = {0}; + + // Sweep A: wait for aicore_regs_ready, init that core's regs, ack with + // aicpu_regs_ready=1. Servicing a ready core (regs init + ack) carries no + // cross-core dependency, so it is done in-pass while other cores are still + // waking. 
+ for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (regs_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_regs_ready == 0) { + SPIN_WAIT_HINT(); + continue; + } - core_exec_states_[i].reg_addr = reg_addr; - core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); + uint32_t physical_core_id = hank->physical_core_id; + if (physical_core_id >= max_physical_cores_count) { + LOG_ERROR( + "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, + max_physical_cores_count + ); + handshake_failed = true; + regs_phase_done[i] = true; + remaining--; + continue; + } + uint64_t reg_addr = regs[physical_core_id]; + reg_addr_of[i] = reg_addr; + platform_init_aicore_regs(reg_addr); + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; #if PTO2_PROFILING - // Record physical_core_id for PMU init later (CoreExecState has no room - // for this field under PTO2_PROFILING). - physical_core_ids_[i] = physical_core_id; + physical_core_ids_[i] = physical_core_id; #endif #if !PTO2_PROFILING - core_exec_states_[i].worker_id = i; - core_exec_states_[i].physical_core_id = physical_core_id; - core_exec_states_[i].core_type = type; + core_exec_states_[i].physical_core_id = physical_core_id; #endif - - if (type == CoreType::AIC) { - aic_worker_ids_[aic_count_++] = i; - LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } else { - aiv_worker_ids_[aiv_count_++] = i; - LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); + regs_phase_done[i] = true; + remaining--; } } + OUT_OF_ORDER_STORE_BARRIER(); if (handshake_failed) { emergency_shutdown(runtime); return -1; } + // Sweep B: wait for aicore_done, latch core type + register pointers. Same + // sweep so the second round-trip's wakeups also overlap. 
+ bool done_phase_done[RUNTIME_MAX_WORKER] = {false}; + for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (done_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_done == 0) { + SPIN_WAIT_HINT(); + continue; + } + + CoreType type = hank->core_type; + uint64_t reg_addr = reg_addr_of[i]; + core_exec_states_[i].reg_addr = reg_addr; + core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); +#if !PTO2_PROFILING + core_exec_states_[i].worker_id = i; + core_exec_states_[i].core_type = type; +#endif + if (type == CoreType::AIC) { + aic_worker_ids_[aic_count_++] = i; + LOG_INFO_V0("Core %d: AIC, reg_addr=0x%lx", i, reg_addr); + } else { + aiv_worker_ids_[aiv_count_++] = i; + LOG_INFO_V0("Core %d: AIV, reg_addr=0x%lx", i, reg_addr); + } + done_phase_done[i] = true; + remaining--; + } + } + LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); return 0; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 59b90ce47..655fcdd9b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -707,12 +707,17 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { LOG_INFO_V0("Handshaking with %d cores", cores_total_num_); - // Step 1: Write per-core payload addresses and send handshake signal. - // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before - // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. + // Step 1: Write per-core payload addresses, then release all cores. The + // task pointers are written first and published with a single barrier, then + // aicpu_ready is raised for every core. 
One barrier (not one per core) + // suffices: the barrier guarantees every task store is globally visible + // before any aicpu_ready store, which is the only ordering AICore relies on + // (it reads task only after observing aicpu_ready==1). for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].task = reinterpret_cast(&payload_per_core_[i][0]); - OUT_OF_ORDER_STORE_BARRIER(); + } + OUT_OF_ORDER_STORE_BARRIER(); + for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].aicpu_ready = 1; } OUT_OF_ORDER_STORE_BARRIER(); @@ -720,69 +725,98 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { // Get platform physical cores count for validation uint32_t max_physical_cores_count = platform_get_physical_cores_count(); - // Step 2: Wait for all cores to respond, collect core type and register addresses + // Step 2: collect responses from all cores. The AICore cores wake and + // advance their handshake phases in parallel, so we sweep — poll every + // outstanding core per pass and service whichever are ready — rather than + // blocking on core i before looking at core i+1. A per-core blocking loop + // serializes the wakeups (Σ per-core latency); sweeping overlaps them + // (≈ max per-core latency + one drain of the GM-flag polls). The flags are + // GM reads (not the nGnRE MMIO reg window), so the polls are not forced + // serial the way RegId::COND polling is. 
bool handshake_failed = false; - for (int32_t i = 0; i < cores_total_num_; i++) { - Handshake *hank = &all_handshakes[i]; - - while (hank->aicore_regs_ready == 0) { - SPIN_WAIT_HINT(); - } - - uint32_t physical_core_id = hank->physical_core_id; - - if (physical_core_id >= max_physical_cores_count) { - LOG_ERROR( - "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, - max_physical_cores_count - ); - handshake_failed = true; - continue; - } - - uint64_t *regs = reinterpret_cast(regs_); - uint64_t reg_addr = regs[physical_core_id]; - - // Initialize AICore registers after discovery (first round) - platform_init_aicore_regs(reg_addr); - OUT_OF_ORDER_STORE_BARRIER(); - hank->aicpu_regs_ready = 1; - - OUT_OF_ORDER_STORE_BARRIER(); - - while (hank->aicore_done == 0) { - SPIN_WAIT_HINT(); - } - - CoreType type = hank->core_type; + uint64_t *regs = reinterpret_cast(regs_); + bool regs_phase_done[RUNTIME_MAX_WORKER] = {false}; + uint64_t reg_addr_of[RUNTIME_MAX_WORKER] = {0}; + + // Sweep A: wait for aicore_regs_ready, init that core's regs, ack with + // aicpu_regs_ready=1. Servicing a ready core (regs init + ack) carries no + // cross-core dependency, so it is done in-pass while other cores are still + // waking. 
+ for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (regs_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_regs_ready == 0) { + SPIN_WAIT_HINT(); + continue; + } - core_exec_states_[i].reg_addr = reg_addr; - core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); + uint32_t physical_core_id = hank->physical_core_id; + if (physical_core_id >= max_physical_cores_count) { + LOG_ERROR( + "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, + max_physical_cores_count + ); + handshake_failed = true; + regs_phase_done[i] = true; + remaining--; + continue; + } + uint64_t reg_addr = regs[physical_core_id]; + reg_addr_of[i] = reg_addr; + platform_init_aicore_regs(reg_addr); + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; #if PTO2_PROFILING - physical_core_ids_[i] = physical_core_id; + physical_core_ids_[i] = physical_core_id; #endif - #if !PTO2_PROFILING - core_exec_states_[i].worker_id = i; - core_exec_states_[i].physical_core_id = physical_core_id; - core_exec_states_[i].core_type = type; + core_exec_states_[i].physical_core_id = physical_core_id; #endif - - if (type == CoreType::AIC) { - aic_worker_ids_[aic_count_++] = i; - LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } else { - aiv_worker_ids_[aiv_count_++] = i; - LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); + regs_phase_done[i] = true; + remaining--; } } + OUT_OF_ORDER_STORE_BARRIER(); if (handshake_failed) { emergency_shutdown(runtime); return -1; } + // Sweep B: wait for aicore_done, latch core type + register pointers. Same + // sweep so the second round-trip's wakeups also overlap. 
+ bool done_phase_done[RUNTIME_MAX_WORKER] = {false}; + for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (done_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_done == 0) { + SPIN_WAIT_HINT(); + continue; + } + + CoreType type = hank->core_type; + uint64_t reg_addr = reg_addr_of[i]; + core_exec_states_[i].reg_addr = reg_addr; + core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); +#if !PTO2_PROFILING + core_exec_states_[i].worker_id = i; + core_exec_states_[i].core_type = type; +#endif + if (type == CoreType::AIC) { + aic_worker_ids_[aic_count_++] = i; + LOG_INFO_V0("Core %d: AIC, reg_addr=0x%lx", i, reg_addr); + } else { + aiv_worker_ids_[aiv_count_++] = i; + LOG_INFO_V0("Core %d: AIV, reg_addr=0x%lx", i, reg_addr); + } + done_phase_done[i] = true; + remaining--; + } + } + LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); return 0; }