diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 948388aef..4539cf0de 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -703,12 +703,17 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { LOG_INFO_V0("Handshaking with %d cores", cores_total_num_); - // Step 1: Write per-core payload addresses and send handshake signal. - // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before - // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. + // Step 1: Write per-core payload addresses, then release all cores. The + // task pointers are written first and published with a single barrier, then + // aicpu_ready is raised for every core. One barrier (not one per core) + // suffices: the barrier guarantees every task store is globally visible + // before any aicpu_ready store, which is the only ordering AICore relies on + // (it reads task only after observing aicpu_ready==1). for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].task = reinterpret_cast(&payload_per_core_[i][0]); - OUT_OF_ORDER_STORE_BARRIER(); + } + OUT_OF_ORDER_STORE_BARRIER(); + for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].aicpu_ready = 1; } OUT_OF_ORDER_STORE_BARRIER(); @@ -716,70 +721,98 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { // Get platform physical cores count for validation uint32_t max_physical_cores_count = platform_get_physical_cores_count(); - // Step 2: Wait for all cores to respond, collect core type and register addresses + // Step 2: collect responses from all cores. The 72 AICore cores wake and + // advance their handshake phases in parallel, so we sweep — poll every + // outstanding core per pass and service whichever are ready — rather than + // blocking on core i before looking at core i+1. A per-core blocking loop + // serializes the wakeups (Σ per-core latency); sweeping overlaps them + // (≈ max per-core latency + one drain of the GM-flag polls). The flags are + // GM reads (not the nGnRE MMIO reg window), so the polls are not forced + // serial the way RegId::COND polling is. bool handshake_failed = false; - for (int32_t i = 0; i < cores_total_num_; i++) { - Handshake *hank = &all_handshakes[i]; - - while (hank->aicore_regs_ready == 0) { - SPIN_WAIT_HINT(); - } - - uint32_t physical_core_id = hank->physical_core_id; - - if (physical_core_id >= max_physical_cores_count) { - LOG_ERROR( - "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, - max_physical_cores_count - ); - handshake_failed = true; - continue; - } - - uint64_t *regs = reinterpret_cast(regs_); - uint64_t reg_addr = regs[physical_core_id]; - - // Initialize AICore registers after discovery (first round) - platform_init_aicore_regs(reg_addr); - OUT_OF_ORDER_STORE_BARRIER(); - hank->aicpu_regs_ready = 1; - - OUT_OF_ORDER_STORE_BARRIER(); - - while (hank->aicore_done == 0) { - SPIN_WAIT_HINT(); - } - - CoreType type = hank->core_type; + uint64_t *regs = reinterpret_cast(regs_); + bool regs_phase_done[RUNTIME_MAX_WORKER] = {false}; + uint64_t reg_addr_of[RUNTIME_MAX_WORKER] = {0}; + + // Sweep A: wait for aicore_regs_ready, init that core's regs, ack with + // aicpu_regs_ready=1. Servicing a ready core (regs init + ack) carries no + // cross-core dependency, so it is done in-pass while other cores are still + // waking. + for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (regs_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_regs_ready == 0) { + SPIN_WAIT_HINT(); + continue; + } - core_exec_states_[i].reg_addr = reg_addr; - core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); + uint32_t physical_core_id = hank->physical_core_id; + if (physical_core_id >= max_physical_cores_count) { + LOG_ERROR( + "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, + max_physical_cores_count + ); + handshake_failed = true; + regs_phase_done[i] = true; + remaining--; + continue; + } + uint64_t reg_addr = regs[physical_core_id]; + reg_addr_of[i] = reg_addr; + platform_init_aicore_regs(reg_addr); + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; #if PTO2_PROFILING - // Record physical_core_id for PMU init later (CoreExecState has no room - // for this field under PTO2_PROFILING). - physical_core_ids_[i] = physical_core_id; + physical_core_ids_[i] = physical_core_id; #endif #if !PTO2_PROFILING - core_exec_states_[i].worker_id = i; - core_exec_states_[i].physical_core_id = physical_core_id; - core_exec_states_[i].core_type = type; + core_exec_states_[i].physical_core_id = physical_core_id; #endif - - if (type == CoreType::AIC) { - aic_worker_ids_[aic_count_++] = i; - LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } else { - aiv_worker_ids_[aiv_count_++] = i; - LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); + regs_phase_done[i] = true; + remaining--; } } + OUT_OF_ORDER_STORE_BARRIER(); if (handshake_failed) { emergency_shutdown(runtime); return -1; } + // Sweep B: wait for aicore_done, latch core type + register pointers. Same + // sweep so the second round-trip's wakeups also overlap. + bool done_phase_done[RUNTIME_MAX_WORKER] = {false}; + for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (done_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_done == 0) { + SPIN_WAIT_HINT(); + continue; + } + + CoreType type = hank->core_type; + uint64_t reg_addr = reg_addr_of[i]; + core_exec_states_[i].reg_addr = reg_addr; + core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); +#if !PTO2_PROFILING + core_exec_states_[i].worker_id = i; + core_exec_states_[i].core_type = type; +#endif + if (type == CoreType::AIC) { + aic_worker_ids_[aic_count_++] = i; + LOG_INFO_V0("Core %d: AIC, reg_addr=0x%lx", i, reg_addr); + } else { + aiv_worker_ids_[aiv_count_++] = i; + LOG_INFO_V0("Core %d: AIV, reg_addr=0x%lx", i, reg_addr); + } + done_phase_done[i] = true; + remaining--; + } + } + LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); return 0; } diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp index 59b90ce47..655fcdd9b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/scheduler/scheduler_cold_path.cpp @@ -707,12 +707,17 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { LOG_INFO_V0("Handshaking with %d cores", cores_total_num_); - // Step 1: Write per-core payload addresses and send handshake signal. - // OUT_OF_ORDER_STORE_BARRIER() ensures task is globally visible before - // aicpu_ready=1, so AICore reads the correct payload pointer after waking up. + // Step 1: Write per-core payload addresses, then release all cores. The + // task pointers are written first and published with a single barrier, then + // aicpu_ready is raised for every core. One barrier (not one per core) + // suffices: the barrier guarantees every task store is globally visible + // before any aicpu_ready store, which is the only ordering AICore relies on + // (it reads task only after observing aicpu_ready==1). for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].task = reinterpret_cast(&payload_per_core_[i][0]); - OUT_OF_ORDER_STORE_BARRIER(); + } + OUT_OF_ORDER_STORE_BARRIER(); + for (int32_t i = 0; i < cores_total_num_; i++) { all_handshakes[i].aicpu_ready = 1; } OUT_OF_ORDER_STORE_BARRIER(); @@ -720,69 +725,98 @@ int32_t SchedulerContext::handshake_all_cores(Runtime *runtime) { // Get platform physical cores count for validation uint32_t max_physical_cores_count = platform_get_physical_cores_count(); - // Step 2: Wait for all cores to respond, collect core type and register addresses + // Step 2: collect responses from all cores. The AICore cores wake and + // advance their handshake phases in parallel, so we sweep — poll every + // outstanding core per pass and service whichever are ready — rather than + // blocking on core i before looking at core i+1. A per-core blocking loop + // serializes the wakeups (Σ per-core latency); sweeping overlaps them + // (≈ max per-core latency + one drain of the GM-flag polls). The flags are + // GM reads (not the nGnRE MMIO reg window), so the polls are not forced + // serial the way RegId::COND polling is. bool handshake_failed = false; - for (int32_t i = 0; i < cores_total_num_; i++) { - Handshake *hank = &all_handshakes[i]; - - while (hank->aicore_regs_ready == 0) { - SPIN_WAIT_HINT(); - } - - uint32_t physical_core_id = hank->physical_core_id; - - if (physical_core_id >= max_physical_cores_count) { - LOG_ERROR( - "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, - max_physical_cores_count - ); - handshake_failed = true; - continue; - } - - uint64_t *regs = reinterpret_cast(regs_); - uint64_t reg_addr = regs[physical_core_id]; - - // Initialize AICore registers after discovery (first round) - platform_init_aicore_regs(reg_addr); - OUT_OF_ORDER_STORE_BARRIER(); - hank->aicpu_regs_ready = 1; - - OUT_OF_ORDER_STORE_BARRIER(); - - while (hank->aicore_done == 0) { - SPIN_WAIT_HINT(); - } - - CoreType type = hank->core_type; + uint64_t *regs = reinterpret_cast(regs_); + bool regs_phase_done[RUNTIME_MAX_WORKER] = {false}; + uint64_t reg_addr_of[RUNTIME_MAX_WORKER] = {0}; + + // Sweep A: wait for aicore_regs_ready, init that core's regs, ack with + // aicpu_regs_ready=1. Servicing a ready core (regs init + ack) carries no + // cross-core dependency, so it is done in-pass while other cores are still + // waking. + for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (regs_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_regs_ready == 0) { + SPIN_WAIT_HINT(); + continue; + } - core_exec_states_[i].reg_addr = reg_addr; - core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); + uint32_t physical_core_id = hank->physical_core_id; + if (physical_core_id >= max_physical_cores_count) { + LOG_ERROR( + "Core %d reported invalid physical_core_id=%u (platform max=%u)", i, physical_core_id, + max_physical_cores_count + ); + handshake_failed = true; + regs_phase_done[i] = true; + remaining--; + continue; + } + uint64_t reg_addr = regs[physical_core_id]; + reg_addr_of[i] = reg_addr; + platform_init_aicore_regs(reg_addr); + OUT_OF_ORDER_STORE_BARRIER(); + hank->aicpu_regs_ready = 1; #if PTO2_PROFILING - physical_core_ids_[i] = physical_core_id; + physical_core_ids_[i] = physical_core_id; #endif - #if !PTO2_PROFILING - core_exec_states_[i].worker_id = i; - core_exec_states_[i].physical_core_id = physical_core_id; - core_exec_states_[i].core_type = type; + core_exec_states_[i].physical_core_id = physical_core_id; #endif - - if (type == CoreType::AIC) { - aic_worker_ids_[aic_count_++] = i; - LOG_INFO_V0("Core %d: AIC, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); - } else { - aiv_worker_ids_[aiv_count_++] = i; - LOG_INFO_V0("Core %d: AIV, physical_id=%u, reg_addr=0x%lx", i, physical_core_id, reg_addr); + regs_phase_done[i] = true; + remaining--; } } + OUT_OF_ORDER_STORE_BARRIER(); if (handshake_failed) { emergency_shutdown(runtime); return -1; } + // Sweep B: wait for aicore_done, latch core type + register pointers. Same + // sweep so the second round-trip's wakeups also overlap. + bool done_phase_done[RUNTIME_MAX_WORKER] = {false}; + for (int32_t remaining = cores_total_num_; remaining > 0;) { + for (int32_t i = 0; i < cores_total_num_; i++) { + if (done_phase_done[i]) continue; + Handshake *hank = &all_handshakes[i]; + if (hank->aicore_done == 0) { + SPIN_WAIT_HINT(); + continue; + } + + CoreType type = hank->core_type; + uint64_t reg_addr = reg_addr_of[i]; + core_exec_states_[i].reg_addr = reg_addr; + core_exec_states_[i].cond_ptr = get_reg_ptr(reg_addr, RegId::COND); +#if !PTO2_PROFILING + core_exec_states_[i].worker_id = i; + core_exec_states_[i].core_type = type; +#endif + if (type == CoreType::AIC) { + aic_worker_ids_[aic_count_++] = i; + LOG_INFO_V0("Core %d: AIC, reg_addr=0x%lx", i, reg_addr); + } else { + aiv_worker_ids_[aiv_count_++] = i; + LOG_INFO_V0("Core %d: AIV, reg_addr=0x%lx", i, reg_addr); + } + done_phase_done[i] = true; + remaining--; + } + } + LOG_INFO_V0("Core discovery complete: %d AIC, %d AIV", aic_count_, aiv_count_); return 0; }