From 445329d3492e615880553ac52b55b78b6a3dfe4c Mon Sep 17 00:00:00 2001
From: Chao Wang <26245345+ChaoWao@users.noreply.github.com>
Date: Tue, 30 Jun 2026 19:10:46 +0800
Subject: [PATCH] perf(runtime): overlap AICore handshake wakeups; batch the
 release barrier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

handshake_all_cores paid two serial costs that can be overlapped or batched:

1. Step 2 blocked on core i (wait aicore_regs_ready, init regs, wait
   aicore_done) before looking at core i+1, so the 72 AICore cores' wakeup
   latencies summed. The cores wake and advance independently, so this is now
   two phase-batched sweeps (poll every outstanding core per pass, service the
   ready ones): the per-core wakeup waits overlap instead of accumulating. The
   handshake flags are GM reads, not the nGnRE MMIO reg window, so sweeping is
   not subject to the serial-LDR constraint that COND polling is.

2. Step 1 raised aicpu_ready inside the per-core loop with a barrier each
   iteration (71 redundant barriers). The task pointers are now published with
   one barrier, then aicpu_ready is raised for all cores — one barrier suffices
   since AICore only relies on "all task stores visible before any aicpu_ready".

Measured (qwen3-14B 3.5k decode, a2a3 onboard, PTO2_RING_TASK_WINDOW=524288):
preamble 329us -> 150us/step. Output tokens identical.

The residual ~150us is the physical floor: AICore launch (rtKernelLaunch lazy
binary load) + NoC cold-wakeup, plus one structurally required GM round-trip to
bind logical block_idx <-> runtime-assigned physical_core_id. The register
channel cannot bootstrap that binding: addressing it already requires the
physical core id that the binding establishes, and it has no host-preclearable
"not ready" sentinel.
Applied symmetrically to a2a3 and a5.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .../runtime/scheduler/scheduler_cold_path.cpp | 139 +++++++++++-------
 .../runtime/scheduler/scheduler_cold_path.cpp | 138 ++++++++++-------
 2 files changed, 172 insertions(+), 105 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 948388aef..4539cf0de 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -703,12 +703,17 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
 
     LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
 
-    // Step 1: Write per-core payload addresses and send handshake signal.
-    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
-    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
+    // Step 1: Write per-core payload addresses, then release all cores. The
+    // task pointers are written first and published with a single barrier, then
+    // aicpu_ready is raised for every core. One barrier (not one per core)
+    // suffices: the barrier guarantees every task store is globally visible
+    // before any aicpu_ready store, which is the only ordering AICore relies on
+    // (it reads task only after observing aicpu_ready==1).
     for (int32_t i = 0; i < cores_total_num_; i++) {
         all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
-        OUT_OF_ORDER_STORE_BARRIER();
+    }
+    OUT_OF_ORDER_STORE_BARRIER();
+    for (int32_t i = 0; i < cores_total_num_; i++) {
         all_handshakes[i].aicpu_ready = 1;
     }
     OUT_OF_ORDER_STORE_BARRIER();
@@ -716,70 +721,98 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
     // Get platform physical cores count for validation
     uint32_t max_physical_cores_count = platform_get_physical_cores_count();
 
-    // Step 2: Wait for all cores to respond, collect core type and register addresses
+    // Step 2: collect responses from all cores. The 72 AICore cores wake and
+    // advance their handshake phases in parallel, so we sweep — poll every
+    // outstanding core per pass and service whichever are ready — rather than
+    // blocking on core i before looking at core i+1. A per-core blocking loop
+    // serializes the wakeups (Σ per-core latency); sweeping overlaps them
+    // (≈ max per-core latency + one drain of the GM-flag polls). The flags are
+    // GM reads (not the nGnRE MMIO reg window), so the polls are not forced
+    // serial the way RegId::COND polling is.
     bool handshake_failed = false;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-
-        while (hank->aicore_regs_ready == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        uint32_t physical_core_id = hank->physical_core_id;
-
-        if (physical_core_id >= max_physical_cores_count) {
-            LOG_ERROR(
-                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
-                max_physical_cores_count
-            );
-            handshake_failed = true;
-            continue;
-        }
-
-        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
-        uint64_t reg_addr = regs[physical_core_id];
-
-        // Initialize AICore registers after discovery (first round)
-        platform_init_aicore_regs(reg_addr);
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-
-        OUT_OF_ORDER_STORE_BARRIER();
-
-        while (hank->aicore_done == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        CoreType type = hank->core_type;
+    uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+    bool regs_phase_done[RUNTIME_MAX_WORKER] = {false};
+    uint64_t reg_addr_of[RUNTIME_MAX_WORKER] = {0};
+
+    // Sweep A: wait for aicore_regs_ready, init that core's regs, ack with
+    // aicpu_regs_ready=1. Servicing a ready core (regs init + ack) carries no
+    // cross-core dependency, so it is done in-pass while other cores are still
+    // waking.
+    for (int32_t remaining = cores_total_num_; remaining > 0;) {
+        for (int32_t i = 0; i < cores_total_num_; i++) {
+            if (regs_phase_done[i]) continue;
+            Handshake *hank = &all_handshakes[i];
+            if (hank->aicore_regs_ready == 0) {
+                SPIN_WAIT_HINT();
+                continue;
+            }
 
-        core_exec_states_[i].reg_addr = reg_addr;
-        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+            uint32_t physical_core_id = hank->physical_core_id;
+            if (physical_core_id >= max_physical_cores_count) {
+                LOG_ERROR(
+                    "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
+                    max_physical_cores_count
+                );
+                handshake_failed = true;
+                regs_phase_done[i] = true;
+                remaining--;
+                continue;
+            }
 
+            uint64_t reg_addr = regs[physical_core_id];
+            reg_addr_of[i] = reg_addr;
+            platform_init_aicore_regs(reg_addr);
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
 #if PTO2_PROFILING
-        // Record physical_core_id for PMU init later (CoreExecState has no room
-        // for this field under PTO2_PROFILING).
-        physical_core_ids_[i] = physical_core_id;
+            physical_core_ids_[i] = physical_core_id;
 #endif
 #if !PTO2_PROFILING
-        core_exec_states_[i].worker_id = i;
-        core_exec_states_[i].physical_core_id = physical_core_id;
-        core_exec_states_[i].core_type = type;
+            core_exec_states_[i].physical_core_id = physical_core_id;
 #endif
-
-        if (type == CoreType::AIC) {
-            aic_worker_ids_[aic_count_++] = i;
-            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        } else {
-            aiv_worker_ids_[aiv_count_++] = i;
-            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
+            regs_phase_done[i] = true;
+            remaining--;
         }
     }
+    OUT_OF_ORDER_STORE_BARRIER();
 
     if (handshake_failed) {
         emergency_shutdown(runtime);
         return -1;
     }
 
+    // Sweep B: wait for aicore_done, latch core type + register pointers. Same
+    // sweep so the second round-trip's wakeups also overlap.
+    bool done_phase_done[RUNTIME_MAX_WORKER] = {false};
+    for (int32_t remaining = cores_total_num_; remaining > 0;) {
+        for (int32_t i = 0; i < cores_total_num_; i++) {
+            if (done_phase_done[i]) continue;
+            Handshake *hank = &all_handshakes[i];
+            if (hank->aicore_done == 0) {
+                SPIN_WAIT_HINT();
+                continue;
+            }
+
+            CoreType type = hank->core_type;
+            uint64_t reg_addr = reg_addr_of[i];
+            core_exec_states_[i].reg_addr = reg_addr;
+            core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+#if !PTO2_PROFILING
+            core_exec_states_[i].worker_id = i;
+            core_exec_states_[i].core_type = type;
+#endif
+            if (type == CoreType::AIC) {
+                aic_worker_ids_[aic_count_++] = i;
+                LOG_INFO_V0("Core %d: AIC, reg_addr=0x%lx", i, reg_addr);
+            } else {
+                aiv_worker_ids_[aiv_count_++] = i;
+                LOG_INFO_V0("Core %d: AIV, reg_addr=0x%lx", i, reg_addr);
+            }
+            done_phase_done[i] = true;
+            remaining--;
+        }
+    }
+
     LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
     return 0;
 }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
index 59b90ce47..655fcdd9b 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp
@@ -707,12 +707,17 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
 
     LOG_INFO_V0("Handshaking with %d cores", cores_total_num_);
 
-    // Step 1: Write per-core payload addresses and send handshake signal.
-    // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before
-    // aicpu_ready=1, so AICore reads the correct payload pointer after waking up.
+    // Step 1: Write per-core payload addresses, then release all cores. The
+    // task pointers are written first and published with a single barrier, then
+    // aicpu_ready is raised for every core. One barrier (not one per core)
+    // suffices: the barrier guarantees every task store is globally visible
+    // before any aicpu_ready store, which is the only ordering AICore relies on
+    // (it reads task only after observing aicpu_ready==1).
     for (int32_t i = 0; i < cores_total_num_; i++) {
         all_handshakes[i].task = reinterpret_cast<uint64_t>(&payload_per_core_[i][0]);
-        OUT_OF_ORDER_STORE_BARRIER();
+    }
+    OUT_OF_ORDER_STORE_BARRIER();
+    for (int32_t i = 0; i < cores_total_num_; i++) {
         all_handshakes[i].aicpu_ready = 1;
     }
     OUT_OF_ORDER_STORE_BARRIER();
@@ -720,69 +725,98 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) {
     // Get platform physical cores count for validation
     uint32_t max_physical_cores_count = platform_get_physical_cores_count();
 
-    // Step 2: Wait for all cores to respond, collect core type and register addresses
+    // Step 2: collect responses from all cores. The AICore cores wake and
+    // advance their handshake phases in parallel, so we sweep — poll every
+    // outstanding core per pass and service whichever are ready — rather than
+    // blocking on core i before looking at core i+1. A per-core blocking loop
+    // serializes the wakeups (Σ per-core latency); sweeping overlaps them
+    // (≈ max per-core latency + one drain of the GM-flag polls). The flags are
+    // GM reads (not the nGnRE MMIO reg window), so the polls are not forced
+    // serial the way RegId::COND polling is.
     bool handshake_failed = false;
-    for (int32_t i = 0; i < cores_total_num_; i++) {
-        Handshake *hank = &all_handshakes[i];
-
-        while (hank->aicore_regs_ready == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        uint32_t physical_core_id = hank->physical_core_id;
-
-        if (physical_core_id >= max_physical_cores_count) {
-            LOG_ERROR(
-                "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
-                max_physical_cores_count
-            );
-            handshake_failed = true;
-            continue;
-        }
-
-        uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
-        uint64_t reg_addr = regs[physical_core_id];
-
-        // Initialize AICore registers after discovery (first round)
-        platform_init_aicore_regs(reg_addr);
-        OUT_OF_ORDER_STORE_BARRIER();
-        hank->aicpu_regs_ready = 1;
-
-        OUT_OF_ORDER_STORE_BARRIER();
-
-        while (hank->aicore_done == 0) {
-            SPIN_WAIT_HINT();
-        }
-
-        CoreType type = hank->core_type;
+    uint64_t *regs = reinterpret_cast<uint64_t *>(regs_);
+    bool regs_phase_done[RUNTIME_MAX_WORKER] = {false};
+    uint64_t reg_addr_of[RUNTIME_MAX_WORKER] = {0};
+
+    // Sweep A: wait for aicore_regs_ready, init that core's regs, ack with
+    // aicpu_regs_ready=1. Servicing a ready core (regs init + ack) carries no
+    // cross-core dependency, so it is done in-pass while other cores are still
+    // waking.
+    for (int32_t remaining = cores_total_num_; remaining > 0;) {
+        for (int32_t i = 0; i < cores_total_num_; i++) {
+            if (regs_phase_done[i]) continue;
+            Handshake *hank = &all_handshakes[i];
+            if (hank->aicore_regs_ready == 0) {
+                SPIN_WAIT_HINT();
+                continue;
+            }
 
-        core_exec_states_[i].reg_addr = reg_addr;
-        core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+            uint32_t physical_core_id = hank->physical_core_id;
+            if (physical_core_id >= max_physical_cores_count) {
+                LOG_ERROR(
+                    "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id,
+                    max_physical_cores_count
+                );
+                handshake_failed = true;
+                regs_phase_done[i] = true;
+                remaining--;
+                continue;
+            }
 
+            uint64_t reg_addr = regs[physical_core_id];
+            reg_addr_of[i] = reg_addr;
+            platform_init_aicore_regs(reg_addr);
+            OUT_OF_ORDER_STORE_BARRIER();
+            hank->aicpu_regs_ready = 1;
 #if PTO2_PROFILING
-        physical_core_ids_[i] = physical_core_id;
+            physical_core_ids_[i] = physical_core_id;
 #endif
-
 #if !PTO2_PROFILING
-        core_exec_states_[i].worker_id = i;
-        core_exec_states_[i].physical_core_id = physical_core_id;
-        core_exec_states_[i].core_type = type;
+            core_exec_states_[i].physical_core_id = physical_core_id;
 #endif
-
-        if (type == CoreType::AIC) {
-            aic_worker_ids_[aic_count_++] = i;
-            LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
-        } else {
-            aiv_worker_ids_[aiv_count_++] = i;
-            LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr);
+            regs_phase_done[i] = true;
+            remaining--;
         }
     }
+    OUT_OF_ORDER_STORE_BARRIER();
 
     if (handshake_failed) {
         emergency_shutdown(runtime);
         return -1;
     }
 
+    // Sweep B: wait for aicore_done, latch core type + register pointers. Same
+    // sweep so the second round-trip's wakeups also overlap.
+    bool done_phase_done[RUNTIME_MAX_WORKER] = {false};
+    for (int32_t remaining = cores_total_num_; remaining > 0;) {
+        for (int32_t i = 0; i < cores_total_num_; i++) {
+            if (done_phase_done[i]) continue;
+            Handshake *hank = &all_handshakes[i];
+            if (hank->aicore_done == 0) {
+                SPIN_WAIT_HINT();
+                continue;
+            }
+
+            CoreType type = hank->core_type;
+            uint64_t reg_addr = reg_addr_of[i];
+            core_exec_states_[i].reg_addr = reg_addr;
+            core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND);
+#if !PTO2_PROFILING
+            core_exec_states_[i].worker_id = i;
+            core_exec_states_[i].core_type = type;
+#endif
+            if (type == CoreType::AIC) {
+                aic_worker_ids_[aic_count_++] = i;
+                LOG_INFO_V0("Core %d: AIC, reg_addr=0x%lx", i, reg_addr);
+            } else {
+                aiv_worker_ids_[aiv_count_++] = i;
+                LOG_INFO_V0("Core %d: AIV, reg_addr=0x%lx", i, reg_addr);
+            }
+            done_phase_done[i] = true;
+            remaining--;
+        }
+    }
+
     LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_);
     return 0;
 }