diff --git a/docs/dfx/args-dump.md b/docs/dfx/args-dump.md index 9b4d78cef..d80944dbd 100644 --- a/docs/dfx/args-dump.md +++ b/docs/dfx/args-dump.md @@ -481,12 +481,12 @@ normal execution continues. `halHostRegister` maps device memory into host virtual address space so the host can read device buffers directly. -`TensorDumpCollector` runs two background threads on top of a +`TensorDumpCollector` runs split mgmt threads and collector shards on top of a [`BufferPoolManager`](../src/common/platform/include/host/buffer_pool_manager.h): -a mgmt thread that polls SPSC ready queues and recycles full -metadata buffers **while kernels are still executing**, plus a -poll thread that drains the L2 hand-off queue into -`on_buffer_collected`. +drain/refill shards poll SPSC ready queues and recycle full metadata +buffers **while kernels are still executing**, a replenish thread keeps +free queues topped up, and collector shards drain the host hand-off queues +into `on_buffer_collected`. ```text HOST DEVICE @@ -499,19 +499,19 @@ poll thread that drains the L2 hand-off queue into │ │ │ │ │ start() │ │ per-task run loop: │ │ ┌────────────────────┐ │ │ BEFORE_DISPATCH │ -│ │ mgmt thread │ │ │ dump_arg_record() │ -│ │ (BufferPool driver)│ │ SPSC ready │ → write to arena │ +│ │ drain/refill shard │ │ │ dump_arg_record() │ +│ │ + replenish thread │ │ SPSC ready │ → write to arena │ │ │ poll ready queue │<┼──queues──────<│ → append record │ │ │ recycle buffers │─┼──free queue──>│ → push to ready_q │ │ └────────────────────┘ │ │ dispatch kernel │ │ ┌────────────────────┐ │ │ wait FIN │ -│ │ poll thread │ │ │ AFTER_COMPLETION │ +│ │ collector shard │ │ │ AFTER_COMPLETION │ │ │ reads arena via │ │ shared mem │ dump_arg_record() │ │ │ host mapping │<┼──mapping─────<│ │ │ └────────────────────┘ │ │ │ │ │ │ dump_args_flush() │ │ stop() │ │ log per-thread stats │ -│ join mgmt → join poll │ └──────────────────────────┘ +│ join mgmt → collectors │ └──────────────────────────┘ │ 
reconcile_counters() │ │ recover leftovers │ │ + dropped accounting │ @@ -530,29 +530,28 @@ poll thread that drains the L2 hand-off queue into init_tensor_dump() dump_collector_.initialize(..., output_prefix_) kernel_args_.args.dump_data_base = dump_collector_.get_dump_shm_device_ptr() -start() ← spawn mgmt thread (drains L1 ringbuffer) - then spawn poll thread (consumes L2 queue) +start() ← spawn split mgmt threads (drain/refill + + replenish), then collector shards launch AICPU / AICore rtStreamSynchronize ← wait for kernel completion -stop() ← join mgmt (its final-drain pass into L2 - has poll as the consumer), then signal - poll and join it +stop() ← join mgmt/replenish after final drain, + then signal collector shards and join them reconcile_counters() ← recover leftover current buffers + dropped accounting export_dump_files() ``` -[`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h) +[`TensorDumpCollector`](../src/common/platform/include/host/tensor_dump_collector.h) on a2a3 inherits from [`profiling_common::ProfilerBase`](../src/common/platform/include/host/profiler_base.h): -the base class owns the mgmt thread, the poll thread, and the +the base class owns split mgmt threads, collector shards, and the `BufferPoolManager` they share. `TensorDumpCollector` only supplies the dump-specific pieces — the `DumpModule` trait that describes the shared-memory layout, `initialize` that allocates and pre-fills free queues, an `on_buffer_collected` callback that gathers payload bytes into the in-memory record list, plus `reconcile_counters` / `export_dump_files` / -`finalize`. The mgmt/poll threading, buffer pooling, and `Module` +`finalize`. The mgmt/collector threading, buffer pooling, and `Module` trait pattern are shared with PMU and L2Swimlane — see [profiling-framework.md](../profiling-framework.md) for the framework reference. @@ -561,7 +560,7 @@ framework reference. 
a5's `TensorDumpCollector` derives from `ProfilerBase` and shares the -mgmt + poll thread structure with a2a3. The single behavioral +split mgmt + collector shard structure with a2a3. The single behavioral deviation from §5.4 is the **transport channel**: a5 has no `halHostRegister`, so each device buffer is paired with a host-shadow `malloc()` and the mgmt loop synchronizes the two via @@ -597,8 +596,8 @@ the buffer's records. │ register_mapping(s) │ │ BEFORE_DISPATCH │ │ │ │ dump_arg_record() │ │ start(thread_factory) │ │ dispatch kernel │ -│ mgmt_thread starts │ │ wait FIN │ -│ poll_thread starts │ │ AFTER_COMPLETION │ +│ split mgmt starts │ │ wait FIN │ +│ collector shards start │ │ AFTER_COMPLETION │ │ │ │ dump_arg_record() │ │ mgmt every 10us tick: │ │ if buffer full: │ │ copy_from_device(shm) │<──memcpy─────<│ push ready entry, │ @@ -612,7 +611,7 @@ the buffer's records. │ for each modified │ │ │ │ field │ │ │ │ │ │ │ -│ poll thread: │ │ │ +│ collector shard: │ │ │ │ wait_pop_ready │ │ │ │ on_buffer_collected → │ │ │ │ copy arena slice │<──memcpy─────<│ │ @@ -622,7 +621,7 @@ the buffer's records. │ │ │ │ │ rtStreamSynchronize │ │ │ │ stop() │ │ │ -│ join mgmt + poll │ │ │ +│ join mgmt + collectors │ │ │ │ reconcile_counters() │ │ │ │ recover leftovers │ │ │ │ + dropped accounting │ │ │ @@ -638,17 +637,17 @@ the buffer's records. 
init_tensor_dump() dump_collector_.initialize(num_dump_threads, ..., output_prefix_) kernel_args_.args.dump_data_base = dump_collector_.get_dump_shm_device_ptr() -dump_collector_.start(thread_factory) ← mgmt + poll threads +dump_collector_.start(thread_factory) ← split mgmt + collector shards launch AICPU / AICore rtStreamSynchronize -dump_collector_.stop() ← join mgmt + poll, drain final batch +dump_collector_.stop() ← join mgmt + collectors, drain final batch dump_collector_.reconcile_counters() ← recover leftover current buffers + dropped accounting dump_collector_.export_dump_files() dump_collector_.finalize() ``` -[`TensorDumpCollector`](../src/a5/platform/include/host/tensor_dump_collector.h) +[`TensorDumpCollector`](../src/common/platform/include/host/tensor_dump_collector.h) on a5 inherits the same CRTP base ([`profiling_common::ProfilerBase`](../src/common/platform/include/host/profiler_base.h)) as a2a3 and parameterizes @@ -670,7 +669,7 @@ before that flush runs, `reconcile_counters` recovers a non-empty | Device-side layout | identical (same `DumpDataHeader` / `DumpMetaBuffer` / arena shape, `static_assert`-checked) | | | AICPU recording logic | identical | | | Buffer model | rotating pool (free + ready queues per thread) | identical | -| Host threads | mgmt + poll, streams during execution | identical | +| Host threads | split mgmt + collector shards, streams during execution | identical | | Host-class shape | `ProfilerBase` | identical | | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` | | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) | @@ -694,9 +693,10 @@ With `--dump-args`, AICPU records full `BEFORE_DISPATCH` / non-contiguous views). - The completion `pipe_barrier(PIPE_ALL)` before writing FIN, which serializes all device-side writes for dumped tasks. 
-- The arena and metadata writes themselves; the host transport - cost is taken concurrently on a2a3 (mgmt + poll threads) or after - the stream finishes on a5. +- The arena and metadata writes themselves; host drain/replenish and + collector work runs concurrently with the stream on both architectures. + a5 additionally pays `rtMemcpy`/`memcpy` transport cost to keep host + shadows in sync. For interactive debugging, total memory pressure is what to watch: the default per-thread arena is 128 MiB @@ -893,7 +893,7 @@ per-thread arena (default 128 MiB). Bump **`dropped_overwrite > 0` in summary.** On a5, the run produced more total payload than fits in the arena; on a2a3, the host -mgmt/poll threads couldn't keep up. Reduce the number of dumped +mgmt/collector pipeline couldn't keep up. Reduce the number of dumped tasks (filter by `func_id` upstream) or increase `PLATFORM_DUMP_BUFFERS_PER_THREAD`. diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md index e288d7454..ae882d177 100644 --- a/docs/dfx/l2-swimlane-profiling.md +++ b/docs/dfx/l2-swimlane-profiling.md @@ -609,11 +609,11 @@ sched overhead per session as price for unbounded session length). `halHostRegister` maps device memory into host virtual address space so the host can read device buffers directly. -`L2SwimlaneCollector` runs two background threads on top of a +`L2SwimlaneCollector` runs split mgmt threads and collector shards on top of a [`BufferPoolManager`](../src/common/platform/include/host/buffer_pool_manager.h): -a mgmt thread that polls SPSC ready queues and recycles full -buffers **while kernels are still executing**, plus a poll -thread that drains the L2 hand-off queue into +drain/refill shards poll SPSC ready queues and recycle full buffers +**while kernels are still executing**, a replenish thread keeps free +queues topped up, and collector shards drain the host hand-off queues into `on_buffer_collected`. 
`L2SwimlaneModule` declares four buffer kinds going through one ready @@ -641,19 +641,19 @@ are single-kind. │ │ │ │ │ start(tf) │ │ AICPU on FIN: │ │ ┌────────────────────┐ │ SPSC ready │ commit AicpuTask │ -│ │ mgmt thread │ │ queues │ record (kind 0); fill │ -│ │ (BufferPool driver)│ │<──4 kinds────<│ func_id / dispatch / │ +│ │ drain/refill shard │ │ queues │ record (kind 0); fill │ +│ │ + replenish thread │ │<──4 kinds────<│ func_id / dispatch / │ │ │ poll ready queue │<┼──multiplexed──│ finish; rotate buffer │ │ │ recycle buffers │─┼──free queue──>│ when full │ │ └────────────────────┘ │ │ AICPU scheduler thread: │ │ ┌────────────────────┐ │ │ per work iter: write │ -│ │ poll thread │ │ │ SchedPhaseRecord │ +│ │ collector shard │ │ │ SchedPhaseRecord │ │ │ reads via host │ │ shared mem │ (kind 1). Per submit: │ │ │ mapping; copies │<┼──mapping─────<│ write OrchPhaseRecord │ │ │ to host vectors │ │ │ (kind 2). │ │ └────────────────────┘ │ │ │ │ stop() │ │ │ -│ join mgmt → join poll │ │ │ +│ join mgmt → collectors │ │ │ │ read_phase_header_metadata() │ │ │ reconcile_counters() │ │ │ │ export_swimlane_json() │ │ │ @@ -667,10 +667,10 @@ are single-kind. 
init_l2_swimlane() l2_swimlane_collector_.initialize(num_aicore, ..., output_prefix_) kernel_args_.args.l2_swimlane_data_base = l2_swimlane_collector_.get_l2_swimlane_shm_device_ptr() -start(tf) ← spawn mgmt + poll threads +start(tf) ← spawn split mgmt + collector shards launch AICPU / AICore rtStreamSynchronize -stop() ← join mgmt → join poll +stop() ← join mgmt/replenish → join collectors read_phase_header_metadata() ← single-shot read of the core→thread mapping reconcile_counters() ← three-bucket accounting for both @@ -684,7 +684,7 @@ finalize(unregister, free) [`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h) on a2a3 inherits from [`profiling_common::ProfilerBase`](../src/common/platform/include/host/profiler_base.h): -the base class owns the mgmt thread, the poll thread, and the +the base class owns split mgmt threads, collector shards, and the `BufferPoolManager` they share. `L2SwimlaneCollector` supplies the L2-specific pieces — the `L2SwimlaneModule` trait (notably `kBufferKinds = 4` and `kind_of()`), `initialize` that @@ -694,7 +694,7 @@ allocates and pre-fills all four kinds of free queues, an to copy into the right per-core or per-thread vector, plus `read_phase_header_metadata` / `reconcile_counters` / `export_swimlane_json` / `finalize`. The -mgmt/poll threading and `Module` trait pattern are shared with +mgmt/collector threading and `Module` trait pattern are shared with PMU and TensorDump — see [profiling-framework.md](../profiling-framework.md) for the framework reference. @@ -702,9 +702,11 @@ framework reference. ### 5.3 a5 — same framework, host-shadow transport a5's `L2SwimlaneCollector` derives from -`ProfilerBase` and shares the -mgmt + poll thread structure with a2a3. 
The single behavioral -deviation from §5.2 is the **transport channel**: a5 has no +`ProfilerBase` and uses the same +framework abstractions as a2a3, including the same split mgmt + +collector shard shape (`kMgmtDrainThreadCount` = `kCollectorThreadCount` += `PLATFORM_MAX_AICPU_THREADS`, i.e. 7 on a5 vs 4 on a2a3). The +behavioral deviation from §5.2 is the **transport channel**: a5 has no `halHostRegister`, so each device buffer is paired with a host-shadow `malloc()` and the mgmt loop synchronizes the two via `profiling_copy.h` (`rtMemcpy` onboard, plain `memcpy` in sim). @@ -836,7 +838,7 @@ PHASE), same shape as a2a3. | AICPU commit on FIN | identical | | | Buffer model | rotating pool (free + ready queues) per kind | identical | | Ready queue | per-AICPU-thread, multiplexes 4 kinds via `ReadyQueueEntry::kind` | per-AICPU-thread, 2 kinds via `is_phase` | -| Host threads | mgmt + poll, streams during execution | identical | +| Host threads | split mgmt + collector shards, streams during execution | same split mgmt + collector shards (7 = `PLATFORM_MAX_AICPU_THREADS` vs a2a3's 4) | | Host-class shape | `ProfilerBase` (`kBufferKinds = 4`) | same base, `kBufferKinds = 2` | | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` | | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) | @@ -864,10 +866,11 @@ Phase-record overhead (only at `--enable-l2-swimlane >= 3`): - a5 — one 40 B `L2SwimlaneAicpuPhaseRecord` per emitted phase (legacy unified shape). -Both architectures drain buffers concurrently with execution via the -mgmt + poll thread pair; a5 additionally pays per-tick -`rtMemcpy`/`memcpy` round-trips to keep the host shadow in sync, -which overlap with device execution. 
+Both architectures drain buffers concurrently with execution through the +`ProfilerBase` mgmt/collector pipeline; both a2a3 and a5 use split mgmt plus +collector shards for this profiler (a5 with 7 shards, a2a3 with 4). a5 +additionally pays per-buffer `rtMemcpy`/`memcpy` round-trips to keep the +host shadow in sync, which overlap with device execution. `--rounds > 1` collects only on the first round so the steady-state benchmark is not perturbed. diff --git a/docs/dfx/pmu-profiling.md b/docs/dfx/pmu-profiling.md index 2b2617c3b..2d134f377 100644 --- a/docs/dfx/pmu-profiling.md +++ b/docs/dfx/pmu-profiling.md @@ -218,8 +218,8 @@ collected_on_host + dropped == total (a2a3, 2 buckets) AICPU reads the 8 PMU counters via MMIO (`read_reg(reg_base, PMU_CNTi)`) directly into a `PmuRecord` on every task FIN. Buffers rotate through an SPSC free queue per core; full buffers flow through a per-thread -ready queue to a host mgmt thread that recycles them, while a host -poll thread streams records to CSV during execution. +ready queue to host drain/refill shards that recycle them, while +collector shards stream records to CSV during execution. ```text HOST DEVICE @@ -233,20 +233,20 @@ poll thread streams records to CSV during execution. 
│ │ │ │ │ start(tf) │ │ per-task FIN: │ │ ┌────────────────────┐ │ │ read 8 PMU_CNTs+TOTAL │ -│ │ mgmt thread │ │ │ into records[count] │ -│ │ (BufferPool driver)│ │ SPSC ready │ if buffer full: │ +│ │ drain/refill shard │ │ │ into records[count] │ +│ │ + replenish thread │ │ SPSC ready │ if buffer full: │ │ │ poll ready queue │<┼──queues──────<│ push ready entry, │ │ │ recycle buffers │─┼──free queue──>│ pop next buffer │ │ └────────────────────┘ │ │ │ │ ┌────────────────────┐ │ shared mem │ pmu_aicpu_flush(): │ -│ │ poll thread │ │ mapping │ push remaining full │ +│ │ collector shard │ │ mapping │ push remaining full │ │ │ read records via │<┼──────────────<│ buffers to ready_q │ │ │ host mapping │ │ │ │ │ │ append to CSV │ │ │ │ │ └────────────────────┘ │ └──────────────────────────┘ │ │ │ stop() │ -│ join mgmt → join poll │ +│ join mgmt → collectors │ │ reconcile_counters() │ │ finalize() │ └──────────────────────────┘ @@ -278,13 +278,14 @@ PmuBuffer pool (rotated) (BUFFERS_PER_CORE per core) init_pmu() pmu_collector_.init(num_aicore, num_threads, csv_path, event_type, ...) 
kernel_args_.args.pmu_data_base = pmu_collector_.get_pmu_shm_device_ptr() -start(tf) ← spawn mgmt thread (drains AICPU L1 ready - queue, recycles full buffers via - BufferPoolManager) + poll thread (drains - L2 hand-off, appends to CSV) +start(tf) ← spawn split mgmt threads (drain AICPU ready + queues, refill free queues, and run + background replenish via BufferPoolManager) + + collector shards (drain host hand-off, + append to CSV) launch AICPU / AICore rtStreamSynchronize ← wait for kernel completion -stop() ← join mgmt → join poll +stop() ← join mgmt/replenish → join collectors reconcile_counters() ← assert collected + dropped == total; any non-empty current_buf_ptr is a flush bug, logged as ERROR @@ -294,12 +295,12 @@ finalize(unregister, free) [`PmuCollector`](../src/a2a3/platform/include/host/pmu_collector.h) inherits from [`profiling_common::ProfilerBase`](../src/common/platform/include/host/profiler_base.h): -the base class owns the mgmt thread, the poll thread, and the +the base class owns split mgmt threads, collector shards, and the `BufferPoolManager` they share. `PmuCollector` only supplies the PMU-specific pieces — the `PmuModule` trait that describes the shared-memory layout, an `init()` that allocates and pre-fills the free queues, an `on_buffer_collected()` callback that appends records to the -CSV, and `reconcile_counters()` / `finalize()`. The mgmt/poll threading, +CSV, and `reconcile_counters()` / `finalize()`. The mgmt/collector threading, buffer pooling, and `Module` trait pattern are shared with TensorDump and L2Swimlane — see [profiling-framework.md](../profiling-framework.md) for the framework reference. @@ -317,9 +318,12 @@ a2a3). At shutdown, AICPU flushes any partially-filled buffers via `pmu_aicpu_flush_buffers()`. a5's `PmuCollector` derives from -`ProfilerBase` and shares the mgmt + poll -thread structure with a2a3. 
The single behavioral deviation from -§5.2 is the **transport channel**: a5 has no `halHostRegister`, so +`ProfilerBase` and uses the same framework +abstractions as a2a3, including the same split mgmt + collector shard +shape (`kMgmtDrainThreadCount` = `kCollectorThreadCount` = +`PLATFORM_MAX_AICPU_THREADS`, i.e. 7 on a5 vs 4 on a2a3). The +behavioral deviation from §5.2 is the **transport channel**: a5 has no +`halHostRegister`, so each device buffer is paired with a host-shadow `malloc()` and the mgmt loop synchronizes the two via `profiling_copy.h` (`rtMemcpy` onboard, `memcpy` in sim). `MemoryOps` therefore carries five @@ -483,7 +487,7 @@ device-side counters. | Counter readout | AICPU MMIO `read_reg` | AICore MMIO `ld_dev` | | Per-core staging | direct write into `records[count]` | dual-issue slots, AICPU commits on FIN | | Buffer model | rotating pool (free + ready queues, SPSC protocol) | identical | -| Host threads | mgmt + poll, streams during execution | identical | +| Host threads | split mgmt + collector shards, streams during execution | same split mgmt + collector shards (7 = `PLATFORM_MAX_AICPU_THREADS` vs a2a3's 4) | | Host-class shape | `ProfilerBase` | identical | | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` | | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) | @@ -499,9 +503,10 @@ counter-read code paths are skipped. When enabled, the dominant per-task overhead is the MMIO counter read (8 reads on a2a3, 10 on a5) plus a single record copy. On both architectures, streaming keeps host-side work off the critical path — -the collector thread drains buffers concurrently with kernel execution. -On a5 the copy hooks add `rtMemcpy` round-trips that a2a3's shared -memory avoids, but these overlap with device execution. +the collector shards drain buffers concurrently with kernel execution. 
+Both a2a3 and a5 use split mgmt plus collector shards (a5 with 7 shards, +a2a3 with 4). a5's copy hooks add `rtMemcpy` round-trips that a2a3's +shared memory avoids, but these overlap with device execution. For meaningful per-task numbers on a2a3 the runtime collapses to single-issue dispatch automatically whenever `--enable-pmu` is set (see diff --git a/docs/dfx/scope-stats.md b/docs/dfx/scope-stats.md index 682760bd7..5970731be 100644 --- a/docs/dfx/scope-stats.md +++ b/docs/dfx/scope-stats.md @@ -331,7 +331,7 @@ ScopeStatsCollector platform scope_stats_collector_aicpu.cpp set kernel_args fields runtime: scope_stats_set_ring_capacity() launch kernel runtime: scope_stats_set_tensormap_capacity() │ │ - poll thread: on PTO2_SCOPE begin/end: + collector shard(s): on PTO2_SCOPE begin/end: append records to memory ◀──┐ runtime samples task/heap/dep_pool/tensormap │ │ runtime: scope_stats_begin()/end() │ │ └─ emit record, append to buffer; diff --git a/docs/profiling-framework.md b/docs/profiling-framework.md index f6aa030b6..a3678b92f 100644 --- a/docs/profiling-framework.md +++ b/docs/profiling-framework.md @@ -1,7 +1,8 @@ # Profiling Framework -Shared host-side infrastructure that the PMU, L2Swimlane, TensorDump, and -ScopeStats collectors are built on. The framework headers live in +Shared host-side infrastructure that the PMU, L2Swimlane, DepGen, +TensorDump, and ScopeStats collectors are built on. The framework headers +live in [`src/common/platform/include/host/`](../src/common/platform/include/host/) and are consumed verbatim by both a2a3 and a5 collectors (PR #944 unified the previously-divergent per-arch copies into one set). This page @@ -11,62 +12,65 @@ the collectors themselves still carry. 
The per-collector pages ([pmu-profiling.md](dfx/pmu-profiling.md), [l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md), +[dep_gen.md](dfx/dep_gen.md), [args-dump.md](dfx/args-dump.md), [scope-stats.md](dfx/scope-stats.md)) describe the data each subsystem collects and how it enables it on-device. ## 1. Why a shared framework -Each profiling subsystem on a2a3 needs the same plumbing on the host: +Each profiling subsystem needs the same plumbing on the host: -- A management thread that polls the AICPU's per-thread SPSC ready queues +- A management path that polls the AICPU's per-thread SPSC ready queues and recycles full buffers back to the device while kernels are still - running. -- A collector thread that drains the host-side hand-off queue and copies + running. A module may opt into split drain/refill threads plus a + replenish thread. +- Collector thread shards that drain host-side hand-off queues and copy records out of each ready buffer. - A pool of pre-registered device buffers (allocated up-front, refilled on - demand) keyed by "kind" — PMU has 1 kind, TensorDump has 1, L2Swimlane has 2 - (perf records + phase markers). + demand) keyed by "kind". PMU, DepGen, TensorDump, and ScopeStats have one + kind; L2Swimlane has four. - A dev↔host pointer map so the management thread can resolve a device pointer popped off a ready queue to the host-mapped pointer the collector thread will read. -- A teardown sequence that flushes both queues without losing late entries. +- A teardown sequence that flushes the device queues and host shards without + losing late entries. -Before unification this was three near-identical implementations. The -framework collapses it to one control-flow implementation parameterized on -a small per-subsystem trait. +Before unification this was near-identical control flow repeated across +collectors. The framework collapses it to one implementation parameterized +on a small per-subsystem trait. ## 2. 
Layered view ```text ┌──────────────────────────────────────────┐ - │ PmuCollector / L2SwimlaneCollector / │ Derived (CRTP) - │ TensorDumpCollector │ ─ on_buffer_collected + │ Pmu / L2Swimlane / DepGen / Dump / Scope │ Derived (CRTP) + │ collectors │ ─ on_buffer_collected └─────────────┬────────────────────────────┘ ─ kIdleTimeoutSec / kSubsystemName │ public ProfilerBase ┌─────────────▼────────────────────────────┐ │ ProfilerBase │ Thread orchestration - │ ─ owns mgmt thread + collector thread │ ─ start/stop lifecycle + │ ─ owns mgmt + collector thread(s) │ ─ start/stop lifecycle │ ─ runs ProfilerAlgorithms │ ─ consume → notify_copy_done └─────────────┬────────────────────────────┘ │ has-a ┌─────────────▼────────────────────────────┐ │ BufferPoolManager │ Data structures (no threads) - │ ─ ready_queue / done_queue │ ─ recycled pools (per kind) + │ ─ ready/done queue shards │ ─ recycled pools (per kind) │ ─ alloc_and_register / resolve_host_ptr │ ─ MemoryOps (type-erased) └──────────────────────────────────────────┘ ▲ │ Module trait wires layout into algorithms ┌───────────────┴────────────────┐ - │ PmuModule / L2SwimlaneModule / │ Pure static trait (no state) - │ DumpModule │ ─ DataHeader / ReadyEntry / FreeQueue + │ Pmu / L2Swimlane / DepGen / │ Pure static trait (no state) + │ Dump / Scope modules │ ─ DataHeader / ReadyEntry / FreeQueue └────────────────────────────────┘ ─ kBufferKinds / kReadyQueueSize ─ resolve_entry / for_each_instance ``` `ProfilerBase` is the owner: it holds `BufferPoolManager manager_` as a -member ([profiler_base.h:414](../src/common/platform/include/host/profiler_base.h#L414)), -spawns and joins both threads, and dispatches collected buffers to +member, spawns and joins the mgmt / collector threads, and dispatches +collected buffers to `Derived::on_buffer_collected` via CRTP. `BufferPoolManager` owns no threads — it is just the shared data structure both threads access. 
`Module` is a stateless trait that tells the generic algorithms how the @@ -79,17 +83,20 @@ subsystem's shared-memory layout is shaped. Defined in [`buffer_pool_manager.h`](../src/common/platform/include/host/buffer_pool_manager.h). Owns: -- `ready_queue_` — mgmt → collector hand-off, guarded by mutex+cv. -- `done_queue_` — collector → mgmt recycle channel, guarded by mutex. -- `recycled_[kind]` — per-kind pool of free device buffers (mgmt-only). +- `ready_shards_` — mgmt → collector hand-off shards, each guarded by + mutex+cv. +- `done_shards_` — collector → mgmt recycle shards, each guarded by mutex. +- `recycled_[shard][kind]` — shard-local pool of free device buffers, + guarded by one mutex per shard/kind. - `dev_to_host_` — single source of truth for `resolve_host_ptr`. - `MemoryOps` — type-erased `alloc / reg / free_` callbacks, plus the `shared_mem_host` and `device_id` stashed once at start. Owns no threads. Every entry point is documented as one of: -- mgmt-only (recycled pool ops, `drain_done_into_recycled`), -- collector-only (`notify_copy_done`), +- mgmt-only or internally locked (`drain_done_into_recycled`, recycled + pool ops), +- collector-only (`notify_copy_done`, one shard per collector), - shared with internal locking (`push_to_ready` / `wait_pop_ready` / `try_pop_ready`), - start/stop-only (`set_memory_context`, `release_owned_buffers`, @@ -100,28 +107,39 @@ Owns no threads. Every entry point is documented as one of: Defined in [`profiler_base.h`](../src/common/platform/include/host/profiler_base.h). Provides: -- The two threads and their lifecycle (`start` / `stop`). -- `mgmt_loop` — drains `done_queue` → recycled, polls every AICPU - per-thread ready queue (bounded by `PLATFORM_MAX_AICPU_THREADS`), - invokes `ProfilerAlgorithms::process_entry` per popped entry, - and tops up free queues with `proactive_replenish`. 
-- `poll_and_collect_loop` — `wait_pop_ready` with a 100 ms cv tick, - dispatches to `Derived::on_buffer_collected`, then calls +- The mgmt thread(s), collector thread(s), and their lifecycle (`start` / + `stop`). +- Split mgmt threads — `mgmt_drain_loop` drains ready queues and refills the + originating free queue from the current drain shard's local recycled pool + (`ProfilerAlgorithms::process_entry` per popped entry), while + `mgmt_replenish_loop` only drains done buffers into shard-local recycled + pools. A one-shot `proactive_replenish` seeds every free queue before the + threads start. Split drain threads do not bulk-mirror the whole + shared-memory region; they refresh only their queue indices / entries + before advancing `queue_heads`. On an empty scan, split drain does a short + busy-poll window before falling back to the 10 us sleep, so micro-bursts + are less likely to miss AICPU's bounded wait window. +- Optional collector sharding (`Module::kCollectorThreadCount`) — each + collector drains one host ready shard and returns finished buffers through + the matching done shard. +- `poll_and_collect_loop` — per-shard `wait_pop_ready` with a 100 ms cv + tick, dispatches to `Derived::on_buffer_collected`, then calls `manager_.notify_copy_done(...)` itself; idle-timeout hang detector. - `set_memory_context` / `clear_memory_context` so `Derived::init` can stash the alloc/reg/free callbacks before threads start; if init aborts before stashing, `start(tf)` becomes a no-op. -`ProfilerAlgorithms` (in the same header, [profiler_base.h:170](../src/common/platform/include/host/profiler_base.h#L170)) +`ProfilerAlgorithms` (in the same +[profiler_base.h](../src/common/platform/include/host/profiler_base.h)) is where the unified algorithms live: - `try_pop_aicpu_entry` — barrier-correct head/tail advance over the per-thread ready queue, with a range-check guard against device-side corruption. 
-- `process_entry` — three-level fallback (recycled → drain done → alloc) - to refill the originating free_queue with **exactly one** buffer per - popped entry, then resolve host_ptr and push to ready. The 1-in/1-out - ratio bounds per-tick latency. +- `process_entry` — shard-local fallback (local recycled → local done → + other recycled shard → alloc) to refill the originating free_queue until + it is full or no buffer is available, then resolve host_ptr and push to + ready. - `proactive_replenish` — drain done, then top every (kind, instance) free queue up to `kSlotCount`, batch-allocating `batch_size(kind)` buffers when the recycled pool of a kind drains mid-fill so recovery @@ -130,17 +148,21 @@ is where the unified algorithms live: ### 3.3 `Module` — trait layer A stateless `struct` per subsystem (`PmuModule`, `L2SwimlaneModule`, -`DumpModule`) that tells the generic algorithms what the shared-memory -layout looks like. The contract lives in the docblock at the top of +`DepGenModule`, `DumpModule`, `ScopeStatsModule`) that tells the generic +algorithms what the shared-memory layout looks like. 
The contract lives in the +docblock at the top of [`profiler_base.h`](../src/common/platform/include/host/profiler_base.h); the required members are: | Member | Purpose | | ------ | ------- | | `using DataHeader / ReadyEntry / ReadyBufferInfo / FreeQueue` | Layout types | -| `kBufferKinds` (PMU=1, Dump=1, L2Swimlane=2) | Number of per-kind recycled pools | +| `kBufferKinds` | Number of buffer kinds inside each recycled shard | | `kReadyQueueSize`, `kSlotCount` | AICPU ready queue / free queue depth | | `kSubsystemName` | Tag used in framework log lines | +| `kMgmtDrainThreadCount` | Optional; number of mgmt drain shards (defaults to 1) | +| `kCollectorThreadCount` | Optional number of collector / host ready-queue shards | +| `refresh_replenish_metadata(mgr, header)` | Optional hook to refresh cached queue metadata before a replenish pass | | `header_from_shm(void*) → DataHeader*` | Cast shared-memory base to header | | `batch_size(int kind) → int` | Per-kind batch-alloc count | | `resolve_entry(shm, header, q, entry) → optional` | Map a popped ready entry to (kind, free_queue, buffer_size, partial info); return `nullopt` to drop | @@ -150,7 +172,10 @@ the required members are: The Module structs are defined alongside their collectors in [pmu_collector.h](../src/a2a3/platform/include/host/pmu_collector.h), [l2_swimlane_collector.h](../src/a2a3/platform/include/host/l2_swimlane_collector.h), -and [tensor_dump_collector.h](../src/a2a3/platform/include/host/tensor_dump_collector.h) +[dep_gen_collector.h](../src/a2a3/platform/include/host/dep_gen_collector.h), +[tensor_dump_collector.h](../src/common/platform/include/host/tensor_dump_collector.h), +and +[scope_stats_collector.h](../src/common/platform/include/host/scope_stats_collector.h) — each is a few dozen lines of static methods over the subsystem's own `DataHeader` / ringbuffer types. @@ -178,34 +203,35 @@ and only has to provide: ## 4. 
End-to-end data flow ```text - AICPU mgmt thread collector thread - ───── ─────────── ──────────────── + AICPU mgmt thread(s) collector shard(s) + ───── ────────────── ────────────────── write record into drain_done_into_recycled current free buffer ──────────────────────────► try_pop_aicpu_entry(q) process_entry: - pop_recycled / alloc_and_register - (refill originating free_queue, 1-in/1-out) + pop local recycled / local done / alloc + (top up originating free_queue) resolve_host_ptr - push_to_ready ──────────────────► wait_pop_ready + push_to_ready(shard q) ─────────► wait_pop_ready(q) Derived::on_buffer_collected (copy records out) - notify_copy_done - ◄────────────────────────────────── (done_queue) + notify_copy_done(q) + ◄────────────────────────────────── done shard q (next tick) drain into recycled ▲ │ - proactive_replenish: top every - free_queue up to kSlotCount; - batch-alloc when a kind drains. + split runtime replenish: + drain done into shard-local + recycled pools only. ``` -Both queues plus the per-kind recycled pools and the dev↔host map all +The queue shards plus the shard-local recycled pools and the dev↔host map all live in the single `BufferPoolManager` instance owned by `ProfilerBase`. -The mgmt thread is the only writer to the ready queue; the collector -thread is the only writer to the done queue. Recycled pools are -mgmt-only. +Each ready shard has one collector consumer; each done shard is written by +its matching collector and drained into the same recycled shard. Split drain +refills the originating free queue on the hot path; split replenish no longer +writes free queues at runtime. ## 5. Lifecycle @@ -219,17 +245,17 @@ mgmt-only. 
assemble MemoryOps from stashed callbacks (sim mode installs an identity reg wrapper so register == nullptr is supported uniformly) manager_.set_memory_context(ops, shm_host, device_id) - spawn mgmt thread ← started first; mgmt is the only writer to L2 - spawn collector thread + spawn mgmt thread(s) ← started first; mgmt writes host ready shards + spawn collector thread(s) ... AICPU / AICore execute ... ProfilerBase::stop() mgmt_running_ = false - join mgmt thread ← mgmt's final-drain pass flushes the last - entries into ready_queue before exiting + join mgmt thread(s) ← mgmt final-drain flushes the last entries into + ready shards before exiting execution_complete_ = true - join collector thread ← drains ready_queue once more, then exits + join collector thread(s)← each shard drains once more, then exits Derived::finalize(unregister, free) manager_.release_owned_buffers([&](void* p) { unregister + free }) @@ -240,9 +266,9 @@ mgmt-only. The order in `stop()` is load-bearing: mgmt is joined **before** `execution_complete_` is signalled so its final-drain output has a -consumer; the collector then drains and exits. On return both queues are -empty and `on_buffer_collected` has been called for every entry that was -in either queue. +consumer; collectors then drain and exit. On return all host shards are +empty and `on_buffer_collected` has been called for every entry that was in +any shard. `Derived::finalize` is responsible for the buffers the collector still owns at stop time (`free_queue` slots and `current_buf_ptr`); the @@ -255,19 +281,28 @@ mid-run by the framework. 
| State | Reader(s) | Writer(s) | Synchronization | | ----- | --------- | --------- | --------------- | -| `ready_queue_` | collector | mgmt | `ready_mutex_` + `ready_cv_` | -| `done_queue_` | mgmt | collector | `done_mutex_` | -| `recycled_[kind]` | mgmt | mgmt | none (single-threaded access) | -| `dev_to_host_` | mgmt (`alloc_and_register`, `resolve_host_ptr`) | mgmt | none during run; collector touches it only in `release_owned_buffers` / `clear_mappings`, after `stop()` has joined mgmt | +| `ready_shards_[q]` | collector q | mgmt drain q | shard mutex + cv | +| `done_shards_[q]` | mgmt / replenish | collector q | shard mutex | +| `recycled_[shard][kind]` | drain shard / replenish | drain shard / replenish | shard/kind mutex | +| `dev_to_host_` | mgmt (`alloc_and_register`, `resolve_host_ptr`) | mgmt | `mapping_mutex_`; collector touches it only in `release_owned_buffers` / `clear_mappings`, after `stop()` has joined mgmt | | `MemoryOps` / `shared_mem_host_` / `device_id_` | both threads | start-only | `set_memory_context` is called once before threads spawn; read-only afterwards | -| AICPU per-thread ready queues (`header->queues[q]`) | mgmt (head advance) | AICPU (tail advance) | `rmb` / `wmb` paired with AICPU writers | -| Per-instance `FreeQueue` | AICPU (head advance) | mgmt (tail advance) | `rmb` / `wmb` paired with AICPU readers | +| AICPU per-thread ready queues (`header->queues[q]`) | mgmt (head advance) | AICPU (tail advance) | `read_range_from_device` in split drain, then `write_range_to_device` for `queue_heads[q]` | +| Per-instance `FreeQueue` | AICPU (head advance) | mgmt (tail advance) | per-free-queue writer lock; host refreshes `head` before writing `buffer_ptrs[]` / `tail` | Two things follow: -- `dev_to_host_` is unlocked because mgmt owns it during the run and the - collector only touches it when mgmt is joined. Adding a collector path - that mutates the map mid-run would require revisiting this. 
+- `dev_to_host_` has a narrow mapping lock; recycled pools are split by + collector shard and kind so the hot drain/refill path mostly stays local. +- Device-side queue backpressure is bounded for the profiling writers that + use this protocol. If the host does not make ready-queue space or + free-queue entries visible within the short wait budget, AICPU records a + drop and keeps the workload moving instead of spinning indefinitely. +- The AICPU writer publishes a full buffer to the ready queue before + acquiring its replacement buffer. If no replacement is visible yet, the + current pointer is cleared and later records first try to recover from + the free queue before counting a per-record drop. This matters under a + one-buffer stress shape: the host cannot return a replacement until it + first observes the full ready buffer. - The mgmt thread must never zero AICPU-owned fields (`count`, `head`, `tail` on the AICPU side). The AICPU is the sole writer to those and resets them itself on flush/drop/pop. @@ -295,11 +330,15 @@ Existing collectors are the canonical examples: - [`PmuCollector`](../src/a2a3/platform/include/host/pmu_collector.h) — single kind, per-core instances. See [pmu-profiling.md](dfx/pmu-profiling.md). -- [`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h) +- [`DepGenCollector`](../src/a2a3/platform/include/host/dep_gen_collector.h) + — single kind, one instance. See [dep_gen.md](dfx/dep_gen.md). +- [`TensorDumpCollector`](../src/common/platform/include/host/tensor_dump_collector.h) — single kind, per-AICPU-thread instances. See [args-dump.md](dfx/args-dump.md). +- [`ScopeStatsCollector`](../src/common/platform/include/host/scope_stats_collector.h) + — single kind, one instance. See [scope-stats.md](dfx/scope-stats.md). - [`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h) - — two kinds (perf records + phase markers), per-core / per-thread - instances; the canonical multi-kind example. 
See + — four kinds (AICPU task, scheduler phase, orchestrator phase, AICore + task), per-core / per-thread instances; the canonical multi-kind example. See [l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md). ## 8. a5 specifics — host-shadow transport @@ -332,8 +371,9 @@ changes capture that: **not** called from the mgmt loop — it would race with AICPU writes to device-only fields (`current_buf_ptr`, `total/dropped/mismatch` counters, `queue_tails`, `free_queue.head`, - `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them back - to whatever the host shadow had at the start of the tick. Per-buffer payloads (`L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` / + `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them + back to whatever the host shadow had at the start of the tick. Per-buffer + payloads (`L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` / `DumpMetaBuffer`) are still pulled on demand inside `ProfilerAlgorithms::process_entry` after resolving the host pointer for a popped ready entry. The bulk `mirror_shm_to_device` is kept @@ -389,7 +429,7 @@ rotating `L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` flips — flipping is now fully internal to `*_complete_record` and never crosses into Handshake. Everything else — Module concept contract, alloc policy -(1-in/1-out + proactive replenish), `kIdleTimeoutSec` / `kSubsystemName` +(drain-shard top-up + proactive replenish), `kIdleTimeoutSec` / `kSubsystemName` contract, mgmt-then-poll start/stop ordering, buffer-pool sizing constants — matches a2a3 exactly. New collectors should be reviewed against both arches when added. 
diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h index 757db37a5..78c7a88b9 100644 --- a/src/a2a3/platform/include/common/platform_config.h +++ b/src/a2a3/platform/include/common/platform_config.h @@ -129,13 +129,13 @@ constexpr int PLATFORM_PROF_SLOT_COUNT = 4; /** * L2SwimlaneAicpuTaskBuffer pre-allocation count per AICore. - * 1 goes into the free_queue at init, the rest into the recycled pool. + * Up to PLATFORM_PROF_SLOT_COUNT go into the free_queue at init, the rest into the recycled pool. */ constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8; /** * L2SwimlaneAicoreTaskBuffer pre-allocation count per AICore (AICore-as-producer pool). - * 1 goes into the free_queue at init, the rest into the recycled pool. + * Up to PLATFORM_PROF_SLOT_COUNT go into the free_queue at init, the rest into the recycled pool. * Mirrors PLATFORM_PROF_BUFFERS_PER_CORE in role; smaller because AICore records * are slim (32 B each) and the buffer is also smaller per the rotation design. */ @@ -144,8 +144,8 @@ constexpr int PLATFORM_AICORE_BUFFERS_PER_CORE = 4; /** * Host preallocation count per AICPU thread for the two phase pools, split per * kind (sched vs orch) because their throughput is asymmetric — a single shared - * value over-provisions the lighter one. 1 buffer seeds the free_queue at init, - * the rest the recycled pool. + * value over-provisions the lighter one. Up to PLATFORM_PROF_SLOT_COUNT buffers + * seed the free_queue at init, and the rest seed the recycled pool. * * Floor for both: SLOT_COUNT(4) + 1 = 5 (free_queue fillable + 1 active buffer). 
* Pure host preallocation — zero ABI (the device-visible ready_queue is decoupled diff --git a/src/a2a3/platform/include/host/dep_gen_collector.h b/src/a2a3/platform/include/host/dep_gen_collector.h index 789e695e2..e5f86a89d 100644 --- a/src/a2a3/platform/include/host/dep_gen_collector.h +++ b/src/a2a3/platform/include/host/dep_gen_collector.h @@ -16,16 +16,17 @@ * * Architecture: * - BufferPoolManager: shared mgmt-thread infrastructure that - * polls the per-thread ready queue, drains the done_queue, and replenishes - * the (single instance's) free_queue from a unified recycled pool. - * - DepGenCollector: collector thread pops full DepGenBuffers from the manager - * and appends their DepGenRecords to a binary file (submit_trace.bin). + * polls per-thread ready queues, drains done-queue shards, and replenishes + * the single instance's free_queue from a unified recycled pool. + * - DepGenCollector: collector thread shards pop full DepGenBuffers from the + * manager and append their DepGenRecords to a binary file + * (submit_trace.bin). * * Lifecycle: * init() — Allocate header + 1 BufferState + N DepGenBuffers * (pre-fills free_queue; surplus → recycled pool). * Calls set_memory_context() on the base. - * start(tf) — Inherited: launches mgmt + poll threads. + * start(tf) — Inherited: launches mgmt + collector threads. * [device execution] * stop() — Inherited: drain queues, join threads. * reconcile_counters() — Sanity-check current_buf_ptr is cleared by @@ -64,7 +65,7 @@ // --------------------------------------------------------------------------- /** - * Internal hand-off struct delivered from the mgmt thread to the collector. + * Internal hand-off struct delivered from a drain thread to a collector shard. * thread_index identifies the AICPU thread queue the entry was popped from * (always equal to the orchestrator thread index, since dep_gen is single- * instance — exposed for symmetry with PmuReadyBufferInfo). 
@@ -87,6 +88,8 @@ struct DepGenModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_DEP_GEN_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_DEP_GEN_SLOT_COUNT; static constexpr const char *kSubsystemName = "DepGenModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * Buffers grown by proactive_replenish are batch-allocated up to the @@ -104,7 +107,18 @@ struct DepGenModule { * resets it itself on flush/drop/pop. */ static std::optional> - resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) { + resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) { + if (shm == nullptr || header == nullptr) { + LOG_ERROR("DepGenModule: invalid shared memory/header while resolving ready entry"); + return std::nullopt; + } + if (header->num_instances != 1 || entry.instance_index >= header->num_instances) { + LOG_ERROR( + "DepGenModule: invalid ready entry instance=%u (num_instances=%u)", entry.instance_index, + header->num_instances + ); + return std::nullopt; + } DepGenBufferState *state = get_dep_gen_buffer_state(shm, static_cast(entry.instance_index)); profiling_common::EntrySite site; site.kind = 0; diff --git a/src/a2a3/platform/include/host/l2_swimlane_collector.h b/src/a2a3/platform/include/host/l2_swimlane_collector.h index c7297d0e0..b8bd2bb9b 100644 --- a/src/a2a3/platform/include/host/l2_swimlane_collector.h +++ b/src/a2a3/platform/include/host/l2_swimlane_collector.h @@ -16,9 +16,9 @@ * Architecture: * - BufferPoolManager: shared mgmt-thread infrastructure that polls * the AICPU ready queue, replenishes per-core / per-thread free queues, and - * hands full buffers off to the collector thread. - * - L2SwimlaneCollector: main thread copies records from the manager's ready queue - * into host vectors and exports the swimlane visualization. 
+ * hands full buffers off to collector thread shards. + * - L2SwimlaneCollector: collector thread shards copy records from manager ready queues + * into host vectors; the owner thread exports the swimlane visualization after stop(). * * Memory operations are injected through callbacks for sim/onboard portability. */ @@ -27,8 +27,11 @@ #define SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ #include +#include +#include #include #include +#include #include #include #include @@ -87,6 +90,8 @@ struct L2SwimlaneModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT; static constexpr const char *kSubsystemName = "L2SwimlaneModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * batch_size for proactive_replenish's alloc fallback. Sized so that a @@ -121,6 +126,13 @@ struct L2SwimlaneModule { static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); } + template + static void refresh_replenish_metadata(Mgr &mgr, DataHeader *header) { + mgr.read_range_from_device(&header->num_sched_phase_threads, sizeof(header->num_sched_phase_threads)); + mgr.read_range_from_device(&header->num_orch_phase_threads, sizeof(header->num_orch_phase_threads)); + rmb(); + } + /** * Branch on entry.kind to pick the per-core task state, per-thread sched- * or orch-phase state, or per-core AICore state. Returns nullopt for @@ -263,13 +275,11 @@ using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback; * 1. initialize() — allocate shared memory, pre-fill free_queues, * hand the memory context to the base via * set_memory_context(). - * 2. start(tf) — inherited from ProfilerBase: assembles a - * MemoryOps from the stashed callbacks and - * launches the mgmt + poll threads. + * 2. 
start(tf) — inherited from ProfilerBase; launches + * drain/refill, replenish, and collector threads. * 3. ... device execution ... - * 4. stop() — joins both threads in the correct order - * (mgmt first so its final-drain entries - * have a consumer). + * 4. stop() — joins drain/refill and replenish before + * letting collector threads exit. * 5. read_phase_header_metadata() — single-shot read of the core→thread * mapping from L2SwimlaneDataHeader. * 6. reconcile_counters() — device-side three-bucket accounting for @@ -329,7 +339,7 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase> collected_sched_phase_records_; std::vector> collected_orch_phase_records_; - bool has_phase_data_{false}; + std::atomic has_phase_data_{false}; // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned) std::vector core_to_thread_; // Running totals used at reconcile time to cross-check device-side counters. - uint64_t total_perf_collected_{0}; - uint64_t total_sched_phase_collected_{0}; - uint64_t total_orch_phase_collected_{0}; + std::atomic total_perf_collected_{0}; + std::atomic total_sched_phase_collected_{0}; + std::atomic total_orch_phase_collected_{0}; + + std::array perf_record_mutexes_; + std::array aicore_record_mutexes_; + std::array sched_phase_record_mutexes_; + std::array orch_phase_record_mutexes_; // Allocate a single buffer (any of the L2SwimlaneAicpu*Buffer kinds) and register it. // The RAII counterpart ``release_one_buffer`` lives on ProfilerBase and diff --git a/src/a2a3/platform/include/host/pmu_collector.h b/src/a2a3/platform/include/host/pmu_collector.h index 30b464aec..b1742790a 100644 --- a/src/a2a3/platform/include/host/pmu_collector.h +++ b/src/a2a3/platform/include/host/pmu_collector.h @@ -14,11 +14,11 @@ * @brief Host-side PMU buffer allocation, streaming collection, and CSV export. 
* * Architecture: - * - BufferPoolManager: shared mgmt-thread infrastructure that polls - * per-thread DumpReadyQueues, drains the done_queue, and replenishes the + * - BufferPoolManager: shared split-mgmt infrastructure that polls + * per-thread ready queues, drains done-queue shards, and replenishes the * per-core free_queues from a unified recycled pool. - * - PmuCollector: collector thread pops full PmuBuffers from the manager - * and appends them to the CSV file. + * - PmuCollector: collector thread shards pop full PmuBuffers from the manager + * and append them to the CSV file. * * Lifecycle: * init() — Allocate header + per-core states + PmuBuffers @@ -27,12 +27,12 @@ * on the base so start(tf) can launch threads. * start(tf) — Inherited from ProfilerBase: assembles * MemoryOps from the stashed callbacks and - * launches the mgmt + poll threads. + * launches the mgmt + collector threads. * [device execution] - * stop() — Stop mgmt → join mgmt → signal poll → - * drain L2 → join poll, in that order. On - * return both thread exits and queue drains - * are complete. + * stop() — Stop mgmt → join mgmt → signal collectors → + * drain ready shards → join collectors, in + * that order. On return both thread exits and + * queue drains are complete. * reconcile_counters() — Sanity-check PmuBufferState::current_buf_ptr * (any non-zero pointer with records is a * device-flush bug, logged as ERROR) and run @@ -78,9 +78,8 @@ */ /** - * Internal hand-off struct delivered from the mgmt thread to the collector. - * thread_index is the logical AICPU thread queue the entry was popped from, - * passed through by ProfilerBase's mgmt loop. + * Internal hand-off struct delivered from a drain thread to a collector shard. + * thread_index is the logical AICPU thread queue the entry was popped from. 
*/ struct PmuReadyBufferInfo { uint32_t core_index; @@ -100,6 +99,8 @@ struct PmuModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_PMU_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_PMU_SLOT_COUNT; static constexpr const char *kSubsystemName = "PmuModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * Buffers grown by proactive_replenish are batch-allocated up to the @@ -118,7 +119,18 @@ struct PmuModule { * resets it itself on flush/drop/pop. */ static std::optional> - resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) { + resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) { + if (shm == nullptr || header == nullptr) { + LOG_ERROR("PmuModule: invalid shared memory/header while resolving ready entry"); + return std::nullopt; + } + if (entry.core_index >= header->num_cores || entry.core_index >= static_cast(PLATFORM_MAX_CORES)) { + LOG_ERROR( + "PmuModule: invalid ready entry core=%u (num_cores=%u, max=%u)", entry.core_index, header->num_cores, + static_cast(PLATFORM_MAX_CORES) + ); + return std::nullopt; + } PmuBufferState *state = get_pmu_buffer_state(shm, static_cast(entry.core_index)); profiling_common::EntrySite site; site.kind = 0; diff --git a/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp index 9b934a2f4..e2db5c4a1 100644 --- a/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp +++ b/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp @@ -20,15 +20,17 @@ * - Host pushes free DepGenBuffers via free_queue. * - AICPU pops when current buffer fills; pushes full buffer to per-thread * ready_queue (indexed by orch_thread_idx). - * - On free_queue empty or ready_queue full: overwrite current buffer - * (record dropped_record_count, keep AICPU alive). 
Host reads dropped - * at finalize to decide whether to emit deps.json. + * - Full buffers are published before AICPU tries to recover a replacement. + * If recovery is delayed, later records are counted as dropped until host + * replenishes free_queue. Host reads dropped at finalize to decide whether + * to emit deps.json. */ #include "aicpu/dep_gen_collector_aicpu.h" #include +#include "aicpu/device_time.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "common/unified_log.h" @@ -41,6 +43,9 @@ static DepGenDataHeader *s_dep_gen_header = nullptr; static DepGenBufferState *s_dep_gen_state = nullptr; static int s_orch_thread_idx = -1; // set via dep_gen_aicpu_set_orch_thread_idx +static constexpr uint64_t kDepGenQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kDepGenQueueBackpressurePollMask = 1023; + extern "C" void set_platform_dep_gen_base(uint64_t dep_gen_data_base) { g_platform_dep_gen_base = dep_gen_data_base; } extern "C" uint64_t get_platform_dep_gen_base() { return g_platform_dep_gen_base; } @@ -56,26 +61,74 @@ void dep_gen_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thr // --------------------------------------------------------------------------- static int enqueue_dep_gen_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) { - if (s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + if (s_dep_gen_header == nullptr || s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) { return -1; } int q = s_orch_thread_idx; uint32_t capacity = PLATFORM_DEP_GEN_READYQUEUE_SIZE; - uint32_t current_tail = s_dep_gen_header->queue_tails[q]; - uint32_t current_head = s_dep_gen_header->queue_heads[q]; + uint32_t current_tail = 0; + uint32_t current_head = 0; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + current_tail = s_dep_gen_header->queue_tails[q]; + current_head = 
s_dep_gen_header->queue_heads[q]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + break; + } + if ((++spins & kDepGenQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) { + return -1; + } + } while (true); uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { - return -1; // Queue full - } - s_dep_gen_header->queues[q][current_tail].instance_index = 0; s_dep_gen_header->queues[q][current_tail].buffer_ptr = buffer_ptr; s_dep_gen_header->queues[q][current_tail].buffer_seq = buffer_seq; + wmb(); // publish: entry fields visible before the tail advance s_dep_gen_header->queue_tails[q] = next_tail; return 0; } +static DepGenBuffer *try_pop_dep_gen_buffer(uint32_t next_seq) { + if (s_dep_gen_state == nullptr) { + return nullptr; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + uint32_t head = 0; + uint32_t tail = 0; + + do { + head = s_dep_gen_state->free_queue.head; + tail = s_dep_gen_state->free_queue.tail; + if (head != tail) { + rmb(); // acquire: order the tail read before the buffer_ptrs read below + break; + } + if ((++spins & kDepGenQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) { + return nullptr; + } + } while (true); + + uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT]; + s_dep_gen_state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + return nullptr; + } + + DepGenBuffer *new_buf = reinterpret_cast<DepGenBuffer *>(new_buf_ptr); + new_buf->count = 0; + s_dep_gen_state->current_buf_ptr = new_buf_ptr; + s_dep_gen_state->current_buf_seq = next_seq; + wmb(); + return new_buf; +} + // --------------------------------------------------------------------------- // Internal: switch the current buffer // --------------------------------------------------------------------------- @@ -89,21 +142,6 @@ static void 
dep_gen_switch_buffer() { return; } - // Check free_queue before committing the full buffer - rmb(); - uint32_t head = s_dep_gen_state->free_queue.head; - uint32_t tail = s_dep_gen_state->free_queue.tail; - - if (head == tail) { - // No replacement buffer available — overwrite current buffer to keep - // the orch loop alive; account every record we drop. - LOG_WARN("dep_gen: no free buffer, overwriting current (dropped %u records)", full_buf->count); - s_dep_gen_state->dropped_record_count += full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - uint32_t seq = s_dep_gen_state->current_buf_seq; int rc = enqueue_dep_gen_ready_buffer(s_dep_gen_state->current_buf_ptr, seq); if (rc != 0) { @@ -114,16 +152,12 @@ static void dep_gen_switch_buffer() { return; } - // Pop next buffer from free_queue - uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT]; - rmb(); - s_dep_gen_state->free_queue.head = head + 1; - s_dep_gen_state->current_buf_ptr = new_buf_ptr; - s_dep_gen_state->current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + s_dep_gen_state->current_buf_ptr = 0; + s_dep_gen_state->current_buf_seq = next_seq; wmb(); - DepGenBuffer *new_buf = reinterpret_cast(new_buf_ptr); - new_buf->count = 0; + (void)try_pop_dep_gen_buffer(next_seq); } // --------------------------------------------------------------------------- @@ -144,14 +178,8 @@ void dep_gen_aicpu_init() { uint32_t tail = s_dep_gen_state->free_queue.tail; if (head != tail) { - uint64_t buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT]; - rmb(); - s_dep_gen_state->free_queue.head = head + 1; - s_dep_gen_state->current_buf_ptr = buf_ptr; - s_dep_gen_state->current_buf_seq = 0; - wmb(); - DepGenBuffer *buf = reinterpret_cast(buf_ptr); - buf->count = 0; + (void)try_pop_dep_gen_buffer(0); + uint64_t buf_ptr = s_dep_gen_state->current_buf_ptr; LOG_INFO_V0("dep_gen: popped initial buffer addr=0x%lx", buf_ptr); } else { 
LOG_ERROR("dep_gen: free_queue empty during init"); @@ -180,9 +208,13 @@ void dep_gen_aicpu_record_submit( rmb(); uint64_t cur_ptr = s_dep_gen_state->current_buf_ptr; if (cur_ptr == 0) { - s_dep_gen_state->dropped_record_count += 1; - wmb(); - return; + DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq); + if (recovered == nullptr) { + s_dep_gen_state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = s_dep_gen_state->current_buf_ptr; } DepGenBuffer *buf = reinterpret_cast(cur_ptr); @@ -205,9 +237,13 @@ void dep_gen_aicpu_record_submit( rmb(); cur_ptr = s_dep_gen_state->current_buf_ptr; if (cur_ptr == 0) { - s_dep_gen_state->dropped_record_count += 1; - wmb(); - return; + DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq); + if (recovered == nullptr) { + s_dep_gen_state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = s_dep_gen_state->current_buf_ptr; } buf = reinterpret_cast(cur_ptr); local_count = buf->count; // refresh after switch — new buffer starts at 0 diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index 5ed92cd61..0d030eb2e 100644 --- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -108,6 +108,59 @@ extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table() { } L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; } +static constexpr uint64_t kL2SwimlaneQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kL2SwimlaneQueueBackpressurePollMask = 1023; + +static bool +wait_for_ready_queue_space(L2SwimlaneDataHeader *header, int thread_idx, uint32_t *tail_out, uint32_t *head_out) { + if (header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return false; + } + const uint32_t capacity = 
PLATFORM_PROF_READYQUEUE_SIZE; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + uint32_t current_tail = header->queue_tails[thread_idx]; + uint32_t current_head = header->queue_heads[thread_idx]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + *tail_out = current_tail; + *head_out = current_head; + return true; + } + if ((++spins & kL2SwimlaneQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kL2SwimlaneQueueBackpressureWaitCycles) { + break; + } + } while (true); + return false; +} + +static bool wait_for_free_queue_entry(L2SwimlaneFreeQueue *free_queue, uint32_t *head_out, uint32_t *tail_out) { + if (free_queue == nullptr) { + return false; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + uint32_t head = free_queue->head; + uint32_t tail = free_queue->tail; + if (head != tail) { + *head_out = head; + *tail_out = tail; + rmb(); // acquire: order the tail read above before the caller's buffer_ptrs read + return true; + } + if ((++spins & kL2SwimlaneQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kL2SwimlaneQueueBackpressureWaitCycles) { + break; + } + } while (true); + return false; +} + /** * Enqueue ready buffer to per-thread queue * @@ -124,24 +177,50 @@ static int enqueue_ready_buffer( L2SwimlaneBufferKind kind ) { uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE; - uint32_t current_tail = header->queue_tails[thread_idx]; - uint32_t current_head = header->queue_heads[thread_idx]; + uint32_t current_tail = 0; + uint32_t current_head = 0; - // Check if queue is full - uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { + if (!wait_for_ready_queue_space(header, thread_idx, &current_tail, &current_head)) { return -1; } + uint32_t next_tail = (current_tail + 1) % capacity; header->queues[thread_idx][current_tail].core_index = core_index; header->queues[thread_idx][current_tail].kind = kind; 
header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; + wmb(); // publish: entry fields visible before the tail advance header->queue_tails[thread_idx] = next_tail; return 0; } +static L2SwimlaneAicpuTaskBuffer * +try_pop_records_buffer(int core_id, L2SwimlaneAicpuTaskPool *state, uint32_t next_seq) { + uint32_t head = 0; + uint32_t tail = 0; + if (!wait_for_free_queue_entry(&state->free_queue, &head, &tail)) { + return nullptr; + } + + uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; + rmb(); + state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + return nullptr; + } + + auto *new_buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(new_buf_ptr); + new_buf->count = 0; + wmb(); + + state->head.current_buf_ptr = new_buf_ptr; + state->head.current_buf_seq = next_seq; + s_current_aicpu_task_buffers[core_id] = new_buf; + wmb(); + return new_buf; +} + void l2_swimlane_aicpu_init(int worker_count) { // Reset cross-launch state up front. 
AICPU statics persist across launches // on the same loaded .so; without this reset, an enabled→disabled launch @@ -280,47 +359,34 @@ static void switch_records_buffer(int core_id, int thread_idx) { LOG_INFO_V0("Thread %d: Core %d buffer is full (count=%u)", thread_idx, core_id, full_buf->count); - // Check free_queue before committing the full buffer - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - - if (head == tail) { - // No replacement buffer available — overwrite current buffer to keep AICore alive - LOG_WARN("Thread %d: Core %d no free buffer, overwriting current buffer (data lost)", thread_idx, core_id); - state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - - // Enqueue full buffer to ReadyQueue uint32_t seq = state->head.current_buf_seq; + uint64_t full_buf_ptr = state->head.current_buf_ptr; int rc = enqueue_ready_buffer( - s_l2_swimlane_header, thread_idx, core_id, state->head.current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask + s_l2_swimlane_header, thread_idx, core_id, full_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask ); if (rc != 0) { LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id); - // Revert: discard data and keep writing state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count; full_buf->count = 0; wmb(); return; } - // Pop next buffer from free_queue - uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - state->head.current_buf_ptr = new_buf_ptr; - state->head.current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + state->head.current_buf_ptr = 0; + state->head.current_buf_seq = next_seq; + s_current_aicpu_task_buffers[core_id] = nullptr; wmb(); - L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); - new_buf->count = 0; - 
s_current_aicpu_task_buffers[core_id] = new_buf; + L2SwimlaneAicpuTaskBuffer *new_buf = try_pop_records_buffer(core_id, state, next_seq); + if (new_buf == nullptr) { + return; + } - LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr); + LOG_INFO_V0( + "Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, + reinterpret_cast(new_buf) + ); } // Try to rotate the AICore buffer for `core_id`. Called from the completion @@ -338,10 +404,9 @@ static void aicore_rotate(int core_id, int thread_idx) { uint64_t old_buf_ptr = ac_state->head.current_buf_ptr; uint32_t seq = ac_state->head.current_buf_seq; - rmb(); - uint32_t head = ac_state->free_queue.head; - uint32_t tail = ac_state->free_queue.tail; - if (head == tail) { + uint32_t head = 0; + uint32_t tail = 0; + if (!wait_for_free_queue_entry(&ac_state->free_queue, &head, &tail)) { // No replacement available — AICore continues to write into the old // buffer; its slot counter will hit BUFFER_SIZE and the slot guard // silently drops further records. We deliberately do NOT bump @@ -362,6 +427,16 @@ static void aicore_rotate(int core_id, int thread_idx) { return; } + uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; + rmb(); + if (new_buf_ptr == 0) { + LOG_WARN( + "Thread %d: Core %d AICore free_queue returned a null buffer at rotation; keeping old buffer active", + thread_idx, core_id + ); + return; + } + // Enqueue the just-filled AICore buffer with count = BUFFER_SIZE. if (old_buf_ptr != 0) { L2SwimlaneAicoreTaskBuffer *old_buf = reinterpret_cast(old_buf_ptr); @@ -393,8 +468,6 @@ static void aicore_rotate(int core_id, int thread_idx) { // detect rotation, then reads head.current_buf_ptr. Write ptr first so // AICore can never see a new seq with a stale ptr. new_buf->count=0 must // also be visible before AICore's slot writes begin. 
- uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; - rmb(); ac_state->free_queue.head = head + 1; L2SwimlaneAicoreTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; @@ -461,10 +534,14 @@ int l2_swimlane_aicpu_complete_task( L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id]; if (l2_swimlane_buf == nullptr) { - // No active records buffer (init ran out of free buffers); count as drop - // so host reconciliation stays consistent. - state->head.dropped_record_count += 1; - return -1; + l2_swimlane_buf = try_pop_records_buffer(core_id, state, state->head.current_buf_seq); + if (l2_swimlane_buf == nullptr) { + // No active records buffer (init ran out of free buffers or host has + // not refilled after the last published full buffer); count as drop + // so host reconciliation stays consistent. + state->head.dropped_record_count += 1; + return -1; + } } uint32_t count = l2_swimlane_buf->count; if (count >= PLATFORM_PROF_BUFFER_SIZE) { @@ -721,19 +798,22 @@ static void switch_phase_buffer_kind( ); state->head.dropped_record_count += full_buf->count; full_buf->count = 0; - *current_buf_out = nullptr; - state->head.current_buf_ptr = 0; wmb(); return; } - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - if (head != tail) { + uint32_t head = 0; + uint32_t tail = 0; + if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) { uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; rmb(); state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + *current_buf_out = nullptr; + state->head.current_buf_ptr = 0; + wmb(); + return; + } state->head.current_buf_ptr = new_buf_ptr; state->head.current_buf_seq = seq + 1; wmb(); @@ -764,13 +844,15 @@ static Record *acquire_phase_slot( ) { Buffer *buf = *current_buf_out; if (buf == nullptr) { - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = 
state->free_queue.tail; - if (head != tail) { + uint32_t head = 0; + uint32_t tail = 0; + if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) { uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; rmb(); state->free_queue.head = head + 1; + if (buf_ptr == 0) { + return nullptr; + } state->head.current_buf_ptr = buf_ptr; state->head.current_buf_seq += 1; wmb(); diff --git a/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp index 3633b9bba..b2592b1a9 100644 --- a/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp +++ b/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp @@ -19,14 +19,16 @@ * Buffer switching: * - SPSC free_queue: Host pushes free PmuBuffers, AICPU pops when switching. * - Per-thread ready_queue: AICPU enqueues full buffers for host collection. - * - On free_queue empty or ready_queue full: overwrite current buffer (data lost, - * avoids blocking the AICPU dispatch loop). + * - Full buffers are published before AICPU tries to recover a replacement. + * If recovery is delayed, later records are counted as dropped until host + * replenishes free_queue. */ #include "aicpu/pmu_collector_aicpu.h" #include +#include "aicpu/device_time.h" #include "aicpu/platform_regs.h" #include "common/memory_barrier.h" #include "common/platform_config.h" @@ -47,6 +49,9 @@ static PmuDataHeader *s_pmu_header = nullptr; // Populated by pmu_aicpu_init(); 0 means "no PMU for this core" (sim). 
static uint64_t s_pmu_reg_addrs[PLATFORM_MAX_CORES] = {0}; +static constexpr uint64_t kPmuQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kPmuQueueBackpressurePollMask = 1023; + extern "C" void set_platform_pmu_base(uint64_t pmu_data_base) { g_platform_pmu_base = pmu_data_base; } extern "C" uint64_t get_platform_pmu_base() { return g_platform_pmu_base; } @@ -101,22 +106,74 @@ static void pmu_read_counters(uint64_t reg_base, PmuRecord *out) { // --------------------------------------------------------------------------- static int enqueue_pmu_ready_buffer(int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq) { + if (s_pmu_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return -1; + } uint32_t capacity = PLATFORM_PMU_READYQUEUE_SIZE; - uint32_t current_tail = s_pmu_header->queue_tails[thread_idx]; - uint32_t current_head = s_pmu_header->queue_heads[thread_idx]; + uint32_t current_tail = 0; + uint32_t current_head = 0; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + current_tail = s_pmu_header->queue_tails[thread_idx]; + current_head = s_pmu_header->queue_heads[thread_idx]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + break; + } + if ((++spins & kPmuQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) { + return -1; + } + } while (true); uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { - return -1; // Queue full - } - s_pmu_header->queues[thread_idx][current_tail].core_index = core_index; s_pmu_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; s_pmu_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; + wmb(); // publish: entry fields visible before the tail advance s_pmu_header->queue_tails[thread_idx] = next_tail; return 0; } +static PmuBuffer 
*try_pop_pmu_buffer(int core_id, PmuBufferState *state, uint32_t next_seq) { + (void)core_id; + if (state == nullptr) { + return nullptr; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + uint32_t head = 0; + uint32_t tail = 0; + + do { + head = state->free_queue.head; + tail = state->free_queue.tail; + if (head != tail) { + rmb(); // acquire: order the tail read before the buffer_ptrs read below + break; + } + if ((++spins & kPmuQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) { + return nullptr; + } + } while (true); + + uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT]; + state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + return nullptr; + } + + PmuBuffer *new_buf = reinterpret_cast(new_buf_ptr); + new_buf->count = 0; + state->current_buf_ptr = new_buf_ptr; + state->current_buf_seq = next_seq; + wmb(); + return new_buf; +} + // --------------------------------------------------------------------------- // Internal: switch the current buffer for one core // --------------------------------------------------------------------------- @@ -132,20 +189,6 @@ static void pmu_switch_buffer(int core_id, int thread_idx) { return; } - // Check free_queue before committing the full buffer - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - - if (head == tail) { - // No replacement buffer available — overwrite current buffer to keep AICPU alive - LOG_WARN("Thread %d: Core %d no free PMU buffer, overwriting current buffer (data lost)", thread_idx, core_id); - state->dropped_record_count += full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - // Enqueue full buffer to ready_queue uint32_t seq = state->current_buf_seq; int rc = enqueue_pmu_ready_buffer(thread_idx, static_cast(core_id), state->current_buf_ptr, seq); @@ -159,18 +202,19 @@ static void pmu_switch_buffer(int core_id, int thread_idx) { return; 
} - // Pop next buffer from free_queue - uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - state->current_buf_ptr = new_buf_ptr; - state->current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + state->current_buf_ptr = 0; + state->current_buf_seq = next_seq; wmb(); - PmuBuffer *new_buf = reinterpret_cast(new_buf_ptr); - new_buf->count = 0; - - LOG_INFO_V0("Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr); + PmuBuffer *new_buf = try_pop_pmu_buffer(core_id, state, next_seq); + if (new_buf == nullptr) { + return; + } + LOG_INFO_V0( + "Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id, + reinterpret_cast(new_buf) + ); } // --------------------------------------------------------------------------- @@ -225,16 +269,8 @@ void pmu_aicpu_init(const uint32_t *physical_core_ids, int num_cores) { uint32_t tail = state->free_queue.tail; if (head != tail) { - uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - state->current_buf_ptr = buf_ptr; - state->current_buf_seq = 0; - wmb(); - - PmuBuffer *buf = reinterpret_cast(buf_ptr); - buf->count = 0; - + (void)try_pop_pmu_buffer(i, state, 0); + uint64_t buf_ptr = state->current_buf_ptr; LOG_DEBUG("Core %d: popped initial PMU buffer (addr=0x%lx)", i, buf_ptr); } else { LOG_ERROR("Core %d: PMU free_queue is empty during init!", i); @@ -266,12 +302,18 @@ void pmu_aicpu_record_task(int core_id, int thread_idx, uint64_t task_id, uint32 rmb(); uint64_t cur_ptr = state->current_buf_ptr; + PmuBuffer *pmu_buf = nullptr; if (cur_ptr == 0) { - state->dropped_record_count += 1; - wmb(); - return; + pmu_buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq); + if (pmu_buf == nullptr) { + state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = state->current_buf_ptr; + } else { + 
pmu_buf = reinterpret_cast(cur_ptr); } - PmuBuffer *pmu_buf = reinterpret_cast(cur_ptr); // Switch buffer if full if (pmu_buf->count >= static_cast(PLATFORM_PMU_RECORDS_PER_BUFFER)) { @@ -279,11 +321,16 @@ void pmu_aicpu_record_task(int core_id, int thread_idx, uint64_t task_id, uint32 rmb(); cur_ptr = state->current_buf_ptr; if (cur_ptr == 0) { - state->dropped_record_count += 1; - wmb(); - return; + pmu_buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq); + if (pmu_buf == nullptr) { + state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = state->current_buf_ptr; + } else { + pmu_buf = reinterpret_cast(cur_ptr); } - pmu_buf = reinterpret_cast(cur_ptr); } uint32_t idx = pmu_buf->count; diff --git a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp index b0a9123b2..091b146b1 100644 --- a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp +++ b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp @@ -20,7 +20,6 @@ #include "host/l2_swimlane_collector.h" -#include #include #include #include @@ -94,9 +93,9 @@ int L2SwimlaneCollector::initialize( aicpu_thread_num_ = aicpu_thread_num; l2_swimlane_level_ = l2_swimlane_level; output_prefix_ = output_prefix; - total_perf_collected_ = 0; - total_sched_phase_collected_ = 0; - total_orch_phase_collected_ = 0; + total_perf_collected_.store(0, std::memory_order_relaxed); + total_sched_phase_collected_.store(0, std::memory_order_relaxed); + total_orch_phase_collected_.store(0, std::memory_order_relaxed); // Stash the memory context on the base up-front so alloc_single_buffer // sees consistent values during init. 
shm_host_ stays nullptr until the @@ -179,7 +178,9 @@ int L2SwimlaneCollector::initialize( LOG_DEBUG(" buffer_capacity: %d", PLATFORM_PROF_BUFFER_SIZE); LOG_DEBUG(" queue capacity: %d", PLATFORM_PROF_READYQUEUE_SIZE); - // Step 5: Initialize L2SwimlaneAicpuTaskPools — 1 buffer per core in free_queue, rest to recycled pool + // Step 5: Initialize L2SwimlaneAicpuTaskPools. Seed as many buffers as + // the device-side free_queue can hold; any remaining buffers stay in the + // host recycled pool. for (int i = 0; i < num_aicore; i++) { L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(perf_host_ptr, i); memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool)); @@ -189,6 +190,9 @@ int L2SwimlaneCollector::initialize( state->head.current_buf_ptr = 0; state->head.current_buf_seq = 0; + const int initial_free_count = (PLATFORM_PROF_BUFFERS_PER_CORE < PLATFORM_PROF_SLOT_COUNT) ? + PLATFORM_PROF_BUFFERS_PER_CORE : + PLATFORM_PROF_SLOT_COUNT; for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_CORE; s++) { void *host_buf_ptr = nullptr; void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuTaskBuffer), &host_buf_ptr); @@ -200,14 +204,14 @@ int L2SwimlaneCollector::initialize( memset(buf, 0, sizeof(L2SwimlaneAicpuTaskBuffer)); buf->count = 0; - if (s == 0) { - state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + if (s < initial_free_count) { + state->free_queue.buffer_ptrs[s] = reinterpret_cast(dev_buf_ptr); } else { manager_.push_recycled(static_cast(ProfBufferType::AICPU_TASK), dev_buf_ptr); } } wmb(); - state->free_queue.tail = 1; + state->free_queue.tail = static_cast(initial_free_count); wmb(); } @@ -217,6 +221,9 @@ int L2SwimlaneCollector::initialize( L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i); memset(ac_state, 0, sizeof(L2SwimlaneAicoreTaskPool)); + const int initial_free_count = (PLATFORM_AICORE_BUFFERS_PER_CORE < PLATFORM_PROF_SLOT_COUNT) ? 
+ PLATFORM_AICORE_BUFFERS_PER_CORE : + PLATFORM_PROF_SLOT_COUNT; for (int s = 0; s < PLATFORM_AICORE_BUFFERS_PER_CORE; s++) { void *host_buf_ptr = nullptr; void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicoreTaskBuffer), &host_buf_ptr); @@ -228,21 +235,20 @@ int L2SwimlaneCollector::initialize( memset(buf, 0, sizeof(L2SwimlaneAicoreTaskBuffer)); buf->count = 0; - if (s == 0) { - ac_state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + if (s < initial_free_count) { + ac_state->free_queue.buffer_ptrs[s] = reinterpret_cast(dev_buf_ptr); } else { manager_.push_recycled(static_cast(ProfBufferType::AICORE_TASK), dev_buf_ptr); } } wmb(); - ac_state->free_queue.tail = 1; + ac_state->free_queue.tail = static_cast(initial_free_count); wmb(); } LOG_DEBUG( - "Initialized buffer pools: %d L2SwimlaneAicpuTaskBuffers/core + %d L2SwimlaneAicoreTaskBuffers/core (1 in " - "free_queue, " - "rest in recycled pool)", - PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE + "Initialized buffer pools: %d L2SwimlaneAicpuTaskBuffers/core + %d L2SwimlaneAicoreTaskBuffers/core (up to " + "%d in free_queue, rest in recycled pool)", + PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE, PLATFORM_PROF_SLOT_COUNT ); // Step 5c: Standalone uint64_t[num_aicore] table that will hold per-core @@ -265,9 +271,10 @@ int L2SwimlaneCollector::initialize( // Step 6: Initialize per-thread phase pools — both sched and orch. Each // pool is sized to its own PLATFORM_PROF_{SCHED,ORCH}_BUFFERS_PER_THREAD - // (1 in free_queue, rest in the recycled pool tagged by kind). Templated on the - // concrete TypedBuffer so the `count` zero-store uses the matching layout - // — sched and orch buffers have DIFFERENT sizes (64B vs 32B records), + // (seeded into free_queue up to slot capacity, rest in the recycled pool + // tagged by kind). 
Templated on the concrete TypedBuffer so the `count` + // zero-store uses the matching layout — sched and orch buffers have + // DIFFERENT sizes (64B vs 32B records), // so a single cast type for both would land the count store past the end // of the orch allocation and corrupt the heap. // state_count pool states are zeroed (so the host's [0, PLATFORM_MAX) @@ -284,6 +291,8 @@ int L2SwimlaneCollector::initialize( auto *state = get_state(perf_host_ptr, num_aicore, t); memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool)); if (t >= buffer_count) continue; // zeroed state only; no buffers (unused slot) + const int initial_free_count = + (buffers_per_thread < PLATFORM_PROF_SLOT_COUNT) ? buffers_per_thread : PLATFORM_PROF_SLOT_COUNT; for (int s = 0; s < buffers_per_thread; s++) { void *host_buf_ptr = nullptr; void *dev_buf_ptr = alloc_single_buffer(buffer_bytes, &host_buf_ptr); @@ -295,14 +304,14 @@ int L2SwimlaneCollector::initialize( // matching Buffer type. The records payload is overwritten by // AICPU on first use. 
reinterpret_cast(host_buf_ptr)->count = 0; - if (s == 0) { - state->free_queue.buffer_ptrs[0] = reinterpret_cast(dev_buf_ptr); + if (s < initial_free_count) { + state->free_queue.buffer_ptrs[s] = reinterpret_cast(dev_buf_ptr); } else { manager_.push_recycled(static_cast(recycle_kind), dev_buf_ptr); } } wmb(); - state->free_queue.tail = 1; + state->free_queue.tail = static_cast(initial_free_count); wmb(); } return 0; @@ -338,8 +347,10 @@ int L2SwimlaneCollector::initialize( return -1; } LOG_DEBUG( - "Initialized %d sched (%d buf/thread) + 1 orch (%d buf) PhaseBufferStates", num_phase_threads, - PLATFORM_PROF_SCHED_BUFFERS_PER_THREAD, PLATFORM_PROF_ORCH_BUFFERS_PER_THREAD + "Initialized %d sched (%d buf/thread) + 1 orch (%d buf) PhaseBufferStates (seeded up to %d free_queue " + "slots)", + num_phase_threads, PLATFORM_PROF_SCHED_BUFFERS_PER_THREAD, PLATFORM_PROF_ORCH_BUFFERS_PER_THREAD, + PLATFORM_PROF_SLOT_COUNT ); wmb(); @@ -378,10 +389,11 @@ void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) { } uint32_t core_index = info.index; if (core_index < static_cast(num_aicore_)) { + std::scoped_lock lock(perf_record_mutexes_[core_index]); for (uint32_t i = 0; i < count; i++) { collected_perf_records_[core_index].push_back(buf->records[i]); } - total_perf_collected_ += count; + total_perf_collected_.fetch_add(count, std::memory_order_relaxed); } } @@ -394,12 +406,13 @@ void L2SwimlaneCollector::copy_sched_phase_buffer(const ReadyBufferInfo &info) { } uint32_t tidx = info.index; if (tidx < collected_sched_phase_records_.size()) { + std::scoped_lock lock(sched_phase_record_mutexes_[tidx]); for (uint32_t i = 0; i < count; i++) { collected_sched_phase_records_[tidx].push_back(buf->records[i]); } - total_sched_phase_collected_ += count; + total_sched_phase_collected_.fetch_add(count, std::memory_order_relaxed); if (count > 0) { - has_phase_data_ = true; + has_phase_data_.store(true, std::memory_order_relaxed); } } } @@ -413,12 +426,13 @@ void 
L2SwimlaneCollector::copy_orch_phase_buffer(const ReadyBufferInfo &info) { } uint32_t tidx = info.index; if (tidx < collected_orch_phase_records_.size()) { + std::scoped_lock lock(orch_phase_record_mutexes_[tidx]); for (uint32_t i = 0; i < count; i++) { collected_orch_phase_records_[tidx].push_back(buf->records[i]); } - total_orch_phase_collected_ += count; + total_orch_phase_collected_.fetch_add(count, std::memory_order_relaxed); if (count > 0) { - has_phase_data_ = true; + has_phase_data_.store(true, std::memory_order_relaxed); } } } @@ -453,16 +467,19 @@ void L2SwimlaneCollector::copy_aicore_buffer(const ReadyBufferInfo &info) { if (count > static_cast(PLATFORM_AICORE_BUFFER_SIZE)) { count = PLATFORM_AICORE_BUFFER_SIZE; } - auto &dst = collected_aicore_records_[core_index]; - dst.reserve(dst.size() + count); uint32_t skipped = 0; - for (uint32_t i = 0; i < count; i++) { - const L2SwimlaneAicoreTaskRecord &r = buf->records[i]; - if (r.start_time == 0) { - skipped++; - continue; + { + std::scoped_lock lock(aicore_record_mutexes_[core_index]); + auto &dst = collected_aicore_records_[core_index]; + dst.reserve(dst.size() + count); + for (uint32_t i = 0; i < count; i++) { + const L2SwimlaneAicoreTaskRecord &r = buf->records[i]; + if (r.start_time == 0) { + skipped++; + continue; + } + dst.push_back(r); } - dst.push_back(r); } if (skipped > 0) { LOG_WARN( @@ -554,8 +571,7 @@ void L2SwimlaneCollector::reconcile_counters() { if (dropped_device > 0) { LOG_WARN( - "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / " - "ready_queue full).", + "L2Swimlane reconcile: %lu %s records dropped on device side.", static_cast(dropped_device), kind ); } @@ -591,7 +607,7 @@ void L2SwimlaneCollector::reconcile_counters() { [](void *host_ptr) { return reinterpret_cast(host_ptr)->count; }, - total_perf_collected_, /*optional=*/false + total_perf_collected_.load(std::memory_order_relaxed), /*optional=*/false ); reconcile_one( @@ -602,7 +618,7 @@ void 
L2SwimlaneCollector::reconcile_counters() { [](void *host_ptr) { return reinterpret_cast(host_ptr)->count; }, - total_sched_phase_collected_, /*optional=*/true + total_sched_phase_collected_.load(std::memory_order_relaxed), /*optional=*/true ); reconcile_one( @@ -613,7 +629,7 @@ void L2SwimlaneCollector::reconcile_counters() { [](void *host_ptr) { return reinterpret_cast(host_ptr)->count; }, - total_orch_phase_collected_, /*optional=*/true + total_orch_phase_collected_.load(std::memory_order_relaxed), /*optional=*/true ); } @@ -673,7 +689,10 @@ void L2SwimlaneCollector::read_phase_header_metadata() { LOG_INFO_V0(" Core-to-thread mapping: %d cores", num_phase_cores); } - LOG_INFO_V0("Phase metadata collection complete: has_phase_data=%s", has_phase_data_ ? "yes" : "no"); + LOG_INFO_V0( + "Phase metadata collection complete: has_phase_data=%s", + has_phase_data_.load(std::memory_order_relaxed) ? "yes" : "no" + ); } void L2SwimlaneCollector::set_core_types(const CoreType *types, int n) { @@ -1008,10 +1027,10 @@ int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, co collected_sched_phase_records_.clear(); collected_orch_phase_records_.clear(); core_to_thread_.clear(); - has_phase_data_ = false; - total_perf_collected_ = 0; - total_sched_phase_collected_ = 0; - total_orch_phase_collected_ = 0; + has_phase_data_.store(false, std::memory_order_relaxed); + total_perf_collected_.store(0, std::memory_order_relaxed); + total_sched_phase_collected_.store(0, std::memory_order_relaxed); + total_orch_phase_collected_.store(0, std::memory_order_relaxed); clear_memory_context(); LOG_DEBUG("Performance profiling cleanup complete"); diff --git a/src/a5/platform/include/host/dep_gen_collector.h b/src/a5/platform/include/host/dep_gen_collector.h index 96c1bcd9f..6b8f8cfb8 100644 --- a/src/a5/platform/include/host/dep_gen_collector.h +++ b/src/a5/platform/include/host/dep_gen_collector.h @@ -16,16 +16,17 @@ * * Architecture: * - BufferPoolManager: shared 
mgmt-thread infrastructure that - * polls the per-thread ready queue, drains the done_queue, and replenishes - * the (single instance's) free_queue from a unified recycled pool. - * - DepGenCollector: collector thread pops full DepGenBuffers from the manager - * and appends their DepGenRecords to a binary file (submit_trace.bin). + * polls per-thread ready queues, drains done-queue shards, and replenishes + * the single instance's free_queue from a unified recycled pool. + * - DepGenCollector: collector thread shards pop full DepGenBuffers from the + * manager and append their DepGenRecords to a binary file + * (submit_trace.bin). * * Lifecycle: * init() — Allocate header + 1 BufferState + N DepGenBuffers * (pre-fills free_queue; surplus → recycled pool). * Calls set_memory_context() on the base. - * start(tf) — Inherited: launches mgmt + poll threads. + * start(tf) — Inherited: launches mgmt + collector threads. * [device execution] * stop() — Inherited: drain queues, join threads. * reconcile_counters() — Sanity-check current_buf_ptr is cleared by @@ -64,7 +65,7 @@ // --------------------------------------------------------------------------- /** - * Internal hand-off struct delivered from the mgmt thread to the collector. + * Internal hand-off struct delivered from a drain thread to a collector shard. * thread_index identifies the AICPU thread queue the entry was popped from * (always equal to the orchestrator thread index, since dep_gen is single- * instance — exposed for symmetry with PmuReadyBufferInfo). 
@@ -87,6 +88,8 @@ struct DepGenModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_DEP_GEN_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_DEP_GEN_SLOT_COUNT; static constexpr const char *kSubsystemName = "DepGenModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * Buffers grown by proactive_replenish are batch-allocated up to the @@ -104,7 +107,18 @@ struct DepGenModule { * resets it itself on flush/drop/pop. */ static std::optional> - resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) { + resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) { + if (shm == nullptr || header == nullptr) { + LOG_ERROR("DepGenModule: invalid shared memory/header while resolving ready entry"); + return std::nullopt; + } + if (header->num_instances != 1 || entry.instance_index >= header->num_instances) { + LOG_ERROR( + "DepGenModule: invalid ready entry instance=%u (num_instances=%u)", entry.instance_index, + header->num_instances + ); + return std::nullopt; + } DepGenBufferState *state = get_dep_gen_buffer_state(shm, static_cast(entry.instance_index)); profiling_common::EntrySite site; site.kind = 0; diff --git a/src/a5/platform/include/host/l2_swimlane_collector.h b/src/a5/platform/include/host/l2_swimlane_collector.h index 24d6a037a..44d755611 100644 --- a/src/a5/platform/include/host/l2_swimlane_collector.h +++ b/src/a5/platform/include/host/l2_swimlane_collector.h @@ -16,9 +16,9 @@ * Architecture: * - BufferPoolManager: shared mgmt-thread infrastructure that polls * the AICPU ready queue, replenishes per-core / per-thread free queues, and - * hands full buffers off to the collector thread. - * - L2SwimlaneCollector: main thread copies records from the manager's ready queue - * into host vectors and exports the swimlane visualization. + * hands full buffers off to collector thread shards. 
+ * - L2SwimlaneCollector: collector thread shards copy records from manager ready queues + * into host vectors; the owner thread exports the swimlane visualization after stop(). * * Memory operations are injected through callbacks for sim/onboard portability. */ @@ -27,8 +27,11 @@ #define SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_ #include +#include +#include #include #include +#include #include #include #include @@ -87,6 +90,8 @@ struct L2SwimlaneModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT; static constexpr const char *kSubsystemName = "L2SwimlaneModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * batch_size for proactive_replenish's alloc fallback. Sized so that a @@ -121,6 +126,13 @@ struct L2SwimlaneModule { static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); } + template + static void refresh_replenish_metadata(Mgr &mgr, DataHeader *header) { + mgr.read_range_from_device(&header->num_sched_phase_threads, sizeof(header->num_sched_phase_threads)); + mgr.read_range_from_device(&header->num_orch_phase_threads, sizeof(header->num_orch_phase_threads)); + rmb(); + } + /** * Branch on entry.kind to pick the per-core task state, per-thread sched- * or orch-phase state, or per-core AICore state. Returns nullopt for @@ -459,15 +471,20 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase> collected_sched_phase_records_; std::vector> collected_orch_phase_records_; - bool has_phase_data_{false}; + std::atomic has_phase_data_{false}; // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned) std::vector core_to_thread_; // Running totals used at reconcile time to cross-check device-side counters. 
- uint64_t total_perf_collected_{0}; - uint64_t total_sched_phase_collected_{0}; - uint64_t total_orch_phase_collected_{0}; + std::atomic total_perf_collected_{0}; + std::atomic total_sched_phase_collected_{0}; + std::atomic total_orch_phase_collected_{0}; + + std::array perf_record_mutexes_; + std::array aicore_record_mutexes_; + std::array sched_phase_record_mutexes_; + std::array orch_phase_record_mutexes_; // Per-buffer-kind handlers used by on_buffer_collected. void copy_perf_buffer(const ReadyBufferInfo &info); diff --git a/src/a5/platform/include/host/pmu_collector.h b/src/a5/platform/include/host/pmu_collector.h index 7a7cdc79a..b42467aa1 100644 --- a/src/a5/platform/include/host/pmu_collector.h +++ b/src/a5/platform/include/host/pmu_collector.h @@ -14,11 +14,11 @@ * @brief Host-side PMU buffer allocation, streaming collection, and CSV export. * * Architecture: - * - BufferPoolManager: shared mgmt-thread infrastructure that - * polls per-thread PmuReadyQueues, drains the done_queue, and replenishes - * the per-core free_queues from a unified recycled pool. - * - PmuCollector: collector thread pops full PmuBuffers from the manager - * and appends them to the CSV file. + * - BufferPoolManager: shared split-mgmt infrastructure that polls + * per-thread ready queues, drains done-queue shards, and replenishes the + * per-core free_queues from a unified recycled pool. + * - PmuCollector: collector thread shards pop full PmuBuffers from the manager + * and append them to the CSV file. * * a5 specifics: device↔host transfers go through profiling_copy.h. The * framework's mgmt loop mirrors the shm region per tick; per-buffer @@ -32,12 +32,12 @@ * start(tf) can launch threads. * start(tf) — Inherited from ProfilerBase: assembles * MemoryOps from the stashed callbacks - * and launches the mgmt + poll threads. + * and launches the mgmt + collector threads. * [device execution] - * stop() — Stop mgmt → join mgmt → signal poll → - * drain L2 → join poll, in that order. 
On - * return both thread exits and queue - * drains are complete. + * stop() — Stop mgmt → join mgmt → signal collectors → + * drain ready shards → join collectors, in + * that order. On return both thread exits and + * queue drains are complete. * reconcile_counters() — Sanity-check PmuBufferState::current_buf_ptr * (any non-zero pointer with records is a * device-flush bug, logged as ERROR) and @@ -84,9 +84,8 @@ */ /** - * Internal hand-off struct delivered from the mgmt thread to the - * collector. thread_index is the logical AICPU thread queue the entry was - * popped from, passed through by ProfilerBase's mgmt loop. + * Internal hand-off struct delivered from a drain thread to a collector shard. + * thread_index is the logical AICPU thread queue the entry was popped from. */ struct PmuReadyBufferInfo { uint32_t core_index; @@ -106,6 +105,8 @@ struct PmuModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_PMU_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_PMU_SLOT_COUNT; static constexpr const char *kSubsystemName = "PmuModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * Buffers grown by proactive_replenish are batch-allocated up to the @@ -124,7 +125,18 @@ struct PmuModule { * and resets it itself when popping from free_queue. 
*/ static std::optional> - resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) { + resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) { + if (shm == nullptr || header == nullptr) { + LOG_ERROR("PmuModule: invalid shared memory/header while resolving ready entry"); + return std::nullopt; + } + if (entry.core_index >= header->num_cores || entry.core_index >= static_cast(PLATFORM_MAX_CORES)) { + LOG_ERROR( + "PmuModule: invalid ready entry core=%u (num_cores=%u, max=%u)", entry.core_index, header->num_cores, + static_cast(PLATFORM_MAX_CORES) + ); + return std::nullopt; + } PmuBufferState *state = get_pmu_buffer_state(shm, static_cast(entry.core_index)); profiling_common::EntrySite site; site.kind = 0; diff --git a/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp index 9b934a2f4..e2db5c4a1 100644 --- a/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp +++ b/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp @@ -20,15 +20,17 @@ * - Host pushes free DepGenBuffers via free_queue. * - AICPU pops when current buffer fills; pushes full buffer to per-thread * ready_queue (indexed by orch_thread_idx). - * - On free_queue empty or ready_queue full: overwrite current buffer - * (record dropped_record_count, keep AICPU alive). Host reads dropped - * at finalize to decide whether to emit deps.json. + * - Full buffers are published before AICPU tries to recover a replacement. + * If recovery is delayed, later records are counted as dropped until host + * replenishes free_queue. Host reads dropped at finalize to decide whether + * to emit deps.json. 
*/ #include "aicpu/dep_gen_collector_aicpu.h" #include +#include "aicpu/device_time.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "common/unified_log.h" @@ -41,6 +43,9 @@ static DepGenDataHeader *s_dep_gen_header = nullptr; static DepGenBufferState *s_dep_gen_state = nullptr; static int s_orch_thread_idx = -1; // set via dep_gen_aicpu_set_orch_thread_idx +static constexpr uint64_t kDepGenQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kDepGenQueueBackpressurePollMask = 1023; + extern "C" void set_platform_dep_gen_base(uint64_t dep_gen_data_base) { g_platform_dep_gen_base = dep_gen_data_base; } extern "C" uint64_t get_platform_dep_gen_base() { return g_platform_dep_gen_base; } @@ -56,26 +61,74 @@ void dep_gen_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thr // --------------------------------------------------------------------------- static int enqueue_dep_gen_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) { - if (s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + if (s_dep_gen_header == nullptr || s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) { return -1; } int q = s_orch_thread_idx; uint32_t capacity = PLATFORM_DEP_GEN_READYQUEUE_SIZE; - uint32_t current_tail = s_dep_gen_header->queue_tails[q]; - uint32_t current_head = s_dep_gen_header->queue_heads[q]; + uint32_t current_tail = 0; + uint32_t current_head = 0; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + current_tail = s_dep_gen_header->queue_tails[q]; + current_head = s_dep_gen_header->queue_heads[q]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + break; + } + if ((++spins & kDepGenQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) { + return -1; + } + } while (true); uint32_t next_tail = (current_tail + 1) % 
capacity; - if (next_tail == current_head) { - return -1; // Queue full - } - s_dep_gen_header->queues[q][current_tail].instance_index = 0; s_dep_gen_header->queues[q][current_tail].buffer_ptr = buffer_ptr; s_dep_gen_header->queues[q][current_tail].buffer_seq = buffer_seq; + wmb(); // publish: entry fields visible before the tail advance s_dep_gen_header->queue_tails[q] = next_tail; return 0; } +static DepGenBuffer *try_pop_dep_gen_buffer(uint32_t next_seq) { + if (s_dep_gen_state == nullptr) { + return nullptr; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + uint32_t head = 0; + uint32_t tail = 0; + + do { + head = s_dep_gen_state->free_queue.head; + tail = s_dep_gen_state->free_queue.tail; + if (head != tail) { + rmb(); // acquire: order the tail read before the buffer_ptrs read below + break; + } + if ((++spins & kDepGenQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) { + return nullptr; + } + } while (true); + + uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT]; + s_dep_gen_state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + return nullptr; + } + + DepGenBuffer *new_buf = reinterpret_cast(new_buf_ptr); + new_buf->count = 0; + s_dep_gen_state->current_buf_ptr = new_buf_ptr; + s_dep_gen_state->current_buf_seq = next_seq; + wmb(); + return new_buf; +} + // --------------------------------------------------------------------------- // Internal: switch the current buffer // --------------------------------------------------------------------------- @@ -89,21 +142,6 @@ static void dep_gen_switch_buffer() { return; } - // Check free_queue before committing the full buffer - rmb(); - uint32_t head = s_dep_gen_state->free_queue.head; - uint32_t tail = s_dep_gen_state->free_queue.tail; - - if (head == tail) { - // No replacement buffer available — overwrite current buffer to keep - // the orch loop alive; account every record we 
drop. - LOG_WARN("dep_gen: no free buffer, overwriting current (dropped %u records)", full_buf->count); - s_dep_gen_state->dropped_record_count += full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - uint32_t seq = s_dep_gen_state->current_buf_seq; int rc = enqueue_dep_gen_ready_buffer(s_dep_gen_state->current_buf_ptr, seq); if (rc != 0) { @@ -114,16 +152,12 @@ static void dep_gen_switch_buffer() { return; } - // Pop next buffer from free_queue - uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT]; - rmb(); - s_dep_gen_state->free_queue.head = head + 1; - s_dep_gen_state->current_buf_ptr = new_buf_ptr; - s_dep_gen_state->current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + s_dep_gen_state->current_buf_ptr = 0; + s_dep_gen_state->current_buf_seq = next_seq; wmb(); - DepGenBuffer *new_buf = reinterpret_cast(new_buf_ptr); - new_buf->count = 0; + (void)try_pop_dep_gen_buffer(next_seq); } // --------------------------------------------------------------------------- @@ -144,14 +178,8 @@ void dep_gen_aicpu_init() { uint32_t tail = s_dep_gen_state->free_queue.tail; if (head != tail) { - uint64_t buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT]; - rmb(); - s_dep_gen_state->free_queue.head = head + 1; - s_dep_gen_state->current_buf_ptr = buf_ptr; - s_dep_gen_state->current_buf_seq = 0; - wmb(); - DepGenBuffer *buf = reinterpret_cast(buf_ptr); - buf->count = 0; + (void)try_pop_dep_gen_buffer(0); + uint64_t buf_ptr = s_dep_gen_state->current_buf_ptr; LOG_INFO_V0("dep_gen: popped initial buffer addr=0x%lx", buf_ptr); } else { LOG_ERROR("dep_gen: free_queue empty during init"); @@ -180,9 +208,13 @@ void dep_gen_aicpu_record_submit( rmb(); uint64_t cur_ptr = s_dep_gen_state->current_buf_ptr; if (cur_ptr == 0) { - s_dep_gen_state->dropped_record_count += 1; - wmb(); - return; + DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq); + if (recovered == 
nullptr) { + s_dep_gen_state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = s_dep_gen_state->current_buf_ptr; } DepGenBuffer *buf = reinterpret_cast(cur_ptr); @@ -205,9 +237,13 @@ void dep_gen_aicpu_record_submit( rmb(); cur_ptr = s_dep_gen_state->current_buf_ptr; if (cur_ptr == 0) { - s_dep_gen_state->dropped_record_count += 1; - wmb(); - return; + DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq); + if (recovered == nullptr) { + s_dep_gen_state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = s_dep_gen_state->current_buf_ptr; } buf = reinterpret_cast(cur_ptr); local_count = buf->count; // refresh after switch — new buffer starts at 0 diff --git a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp index 5ed92cd61..0d030eb2e 100644 --- a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp +++ b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp @@ -108,6 +108,59 @@ extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table() { } L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; } +static constexpr uint64_t kL2SwimlaneQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kL2SwimlaneQueueBackpressurePollMask = 1023; + +static bool +wait_for_ready_queue_space(L2SwimlaneDataHeader *header, int thread_idx, uint32_t *tail_out, uint32_t *head_out) { + if (header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return false; + } + const uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + uint32_t current_tail = header->queue_tails[thread_idx]; + uint32_t current_head = header->queue_heads[thread_idx]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + *tail_out = current_tail; + *head_out = 
current_head; + return true; + } + if ((++spins & kL2SwimlaneQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kL2SwimlaneQueueBackpressureWaitCycles) { + break; + } + } while (true); + return false; +} + +static bool wait_for_free_queue_entry(L2SwimlaneFreeQueue *free_queue, uint32_t *head_out, uint32_t *tail_out) { + if (free_queue == nullptr) { + return false; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + uint32_t head = free_queue->head; + uint32_t tail = free_queue->tail; + if (head != tail) { + *head_out = head; + *tail_out = tail; + rmb(); // acquire: order the tail read above before the caller's buffer_ptrs read + return true; + } + if ((++spins & kL2SwimlaneQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kL2SwimlaneQueueBackpressureWaitCycles) { + break; + } + } while (true); + return false; +} + /** * Enqueue ready buffer to per-thread queue * @@ -124,24 +177,50 @@ static int enqueue_ready_buffer( L2SwimlaneBufferKind kind ) { uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE; - uint32_t current_tail = header->queue_tails[thread_idx]; - uint32_t current_head = header->queue_heads[thread_idx]; + uint32_t current_tail = 0; + uint32_t current_head = 0; - // Check if queue is full - uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { + if (!wait_for_ready_queue_space(header, thread_idx, ¤t_tail, ¤t_head)) { return -1; } + uint32_t next_tail = (current_tail + 1) % capacity; header->queues[thread_idx][current_tail].core_index = core_index; header->queues[thread_idx][current_tail].kind = kind; header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; + wmb(); // publish: entry fields visible before the tail advance header->queue_tails[thread_idx] = next_tail; return 0; } +static L2SwimlaneAicpuTaskBuffer * +try_pop_records_buffer(int core_id, L2SwimlaneAicpuTaskPool *state, 
uint32_t next_seq) { + uint32_t head = 0; + uint32_t tail = 0; + if (!wait_for_free_queue_entry(&state->free_queue, &head, &tail)) { + return nullptr; + } + + uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; + rmb(); + state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + return nullptr; + } + + auto *new_buf = reinterpret_cast(new_buf_ptr); + new_buf->count = 0; + wmb(); + + state->head.current_buf_ptr = new_buf_ptr; + state->head.current_buf_seq = next_seq; + s_current_aicpu_task_buffers[core_id] = new_buf; + wmb(); + return new_buf; +} + void l2_swimlane_aicpu_init(int worker_count) { // Reset cross-launch state up front. AICPU statics persist across launches // on the same loaded .so; without this reset, an enabled→disabled launch @@ -280,47 +359,34 @@ static void switch_records_buffer(int core_id, int thread_idx) { LOG_INFO_V0("Thread %d: Core %d buffer is full (count=%u)", thread_idx, core_id, full_buf->count); - // Check free_queue before committing the full buffer - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - - if (head == tail) { - // No replacement buffer available — overwrite current buffer to keep AICore alive - LOG_WARN("Thread %d: Core %d no free buffer, overwriting current buffer (data lost)", thread_idx, core_id); - state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - - // Enqueue full buffer to ReadyQueue uint32_t seq = state->head.current_buf_seq; + uint64_t full_buf_ptr = state->head.current_buf_ptr; int rc = enqueue_ready_buffer( - s_l2_swimlane_header, thread_idx, core_id, state->head.current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask + s_l2_swimlane_header, thread_idx, core_id, full_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask ); if (rc != 0) { LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id); - // Revert: 
discard data and keep writing state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count; full_buf->count = 0; wmb(); return; } - // Pop next buffer from free_queue - uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - state->head.current_buf_ptr = new_buf_ptr; - state->head.current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + state->head.current_buf_ptr = 0; + state->head.current_buf_seq = next_seq; + s_current_aicpu_task_buffers[core_id] = nullptr; wmb(); - L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); - new_buf->count = 0; - s_current_aicpu_task_buffers[core_id] = new_buf; + L2SwimlaneAicpuTaskBuffer *new_buf = try_pop_records_buffer(core_id, state, next_seq); + if (new_buf == nullptr) { + return; + } - LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr); + LOG_INFO_V0( + "Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, + reinterpret_cast(new_buf) + ); } // Try to rotate the AICore buffer for `core_id`. Called from the completion @@ -338,10 +404,9 @@ static void aicore_rotate(int core_id, int thread_idx) { uint64_t old_buf_ptr = ac_state->head.current_buf_ptr; uint32_t seq = ac_state->head.current_buf_seq; - rmb(); - uint32_t head = ac_state->free_queue.head; - uint32_t tail = ac_state->free_queue.tail; - if (head == tail) { + uint32_t head = 0; + uint32_t tail = 0; + if (!wait_for_free_queue_entry(&ac_state->free_queue, &head, &tail)) { // No replacement available — AICore continues to write into the old // buffer; its slot counter will hit BUFFER_SIZE and the slot guard // silently drops further records. 
We deliberately do NOT bump @@ -362,6 +427,16 @@ static void aicore_rotate(int core_id, int thread_idx) { return; } + uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; + rmb(); + if (new_buf_ptr == 0) { + LOG_WARN( + "Thread %d: Core %d AICore free_queue returned a null buffer at rotation; keeping old buffer active", + thread_idx, core_id + ); + return; + } + // Enqueue the just-filled AICore buffer with count = BUFFER_SIZE. if (old_buf_ptr != 0) { L2SwimlaneAicoreTaskBuffer *old_buf = reinterpret_cast(old_buf_ptr); @@ -393,8 +468,6 @@ static void aicore_rotate(int core_id, int thread_idx) { // detect rotation, then reads head.current_buf_ptr. Write ptr first so // AICore can never see a new seq with a stale ptr. new_buf->count=0 must // also be visible before AICore's slot writes begin. - uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; - rmb(); ac_state->free_queue.head = head + 1; L2SwimlaneAicoreTaskBuffer *new_buf = reinterpret_cast(new_buf_ptr); new_buf->count = 0; @@ -461,10 +534,14 @@ int l2_swimlane_aicpu_complete_task( L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id]; if (l2_swimlane_buf == nullptr) { - // No active records buffer (init ran out of free buffers); count as drop - // so host reconciliation stays consistent. - state->head.dropped_record_count += 1; - return -1; + l2_swimlane_buf = try_pop_records_buffer(core_id, state, state->head.current_buf_seq); + if (l2_swimlane_buf == nullptr) { + // No active records buffer (init ran out of free buffers or host has + // not refilled after the last published full buffer); count as drop + // so host reconciliation stays consistent. 
+ state->head.dropped_record_count += 1; + return -1; + } } uint32_t count = l2_swimlane_buf->count; if (count >= PLATFORM_PROF_BUFFER_SIZE) { @@ -721,19 +798,22 @@ static void switch_phase_buffer_kind( ); state->head.dropped_record_count += full_buf->count; full_buf->count = 0; - *current_buf_out = nullptr; - state->head.current_buf_ptr = 0; wmb(); return; } - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - if (head != tail) { + uint32_t head = 0; + uint32_t tail = 0; + if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) { uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; rmb(); state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + *current_buf_out = nullptr; + state->head.current_buf_ptr = 0; + wmb(); + return; + } state->head.current_buf_ptr = new_buf_ptr; state->head.current_buf_seq = seq + 1; wmb(); @@ -764,13 +844,15 @@ static Record *acquire_phase_slot( ) { Buffer *buf = *current_buf_out; if (buf == nullptr) { - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - if (head != tail) { + uint32_t head = 0; + uint32_t tail = 0; + if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) { uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT]; rmb(); state->free_queue.head = head + 1; + if (buf_ptr == 0) { + return nullptr; + } state->head.current_buf_ptr = buf_ptr; state->head.current_buf_seq += 1; wmb(); diff --git a/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp index b8477e9ff..6c6a215d3 100644 --- a/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp +++ b/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp @@ -16,8 +16,9 @@ * Buffer switching mirrors a2a3 pmu_collector_aicpu.cpp: * - SPSC free_queue: Host pushes free PmuBuffers, AICPU pops when switching. 
* - Per-thread ready_queue: AICPU enqueues full buffers for host collection. - * - On free_queue empty or ready_queue full: overwrite current buffer (data lost, - * same policy as a2a3 — avoids blocking the AICPU dispatch loop). + * - Full buffers are published before AICPU tries to recover a replacement. + * If recovery is delayed, later records are counted as dropped until host + * replenishes free_queue. * * a5-specific: AICore reads PMU MMIO itself (via ld_dev) and writes the * snapshot into a per-core stable PmuAicoreRing @@ -31,6 +32,7 @@ #include +#include "aicpu/device_time.h" #include "aicpu/platform_regs.h" #include "common/memory_barrier.h" #include "common/platform_config.h" @@ -58,6 +60,9 @@ static PmuAicoreRing *s_pmu_aicore_rings[PLATFORM_MAX_CORES]; // Populated by pmu_aicpu_init(); 0 means "no PMU for this core" (sim). static uint64_t s_pmu_reg_addrs[PLATFORM_MAX_CORES] = {0}; +static constexpr uint64_t kPmuQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kPmuQueueBackpressurePollMask = 1023; + extern "C" void set_platform_pmu_base(uint64_t pmu_data_base) { g_platform_pmu_base = pmu_data_base; } extern "C" uint64_t get_platform_pmu_base() { return g_platform_pmu_base; } @@ -107,22 +112,74 @@ static void pmu_stop(uint64_t reg_base, uint32_t saved_ctrl0, uint32_t saved_ctr // --------------------------------------------------------------------------- static int enqueue_pmu_ready_buffer(int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq) { + if (s_pmu_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return -1; + } uint32_t capacity = PLATFORM_PMU_READYQUEUE_SIZE; - uint32_t current_tail = s_pmu_header->queue_tails[thread_idx]; - uint32_t current_head = s_pmu_header->queue_heads[thread_idx]; + uint32_t current_tail = 0; + uint32_t current_head = 0; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + current_tail 
= s_pmu_header->queue_tails[thread_idx]; + current_head = s_pmu_header->queue_heads[thread_idx]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + break; + } + if ((++spins & kPmuQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) { + return -1; + } + } while (true); uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { - return -1; // Queue full - } - s_pmu_header->queues[thread_idx][current_tail].core_index = core_index; s_pmu_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; s_pmu_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; + wmb(); // publish: entry fields visible before the tail advance s_pmu_header->queue_tails[thread_idx] = next_tail; return 0; } +static PmuBuffer *try_pop_pmu_buffer(int core_id, PmuBufferState *state, uint32_t next_seq) { + (void)core_id; + if (state == nullptr) { + return nullptr; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + uint32_t head = 0; + uint32_t tail = 0; + + do { + head = state->free_queue.head; + tail = state->free_queue.tail; + if (head != tail) { + rmb(); // acquire: order the tail read before the buffer_ptrs read below + break; + } + if ((++spins & kPmuQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) { + return nullptr; + } + } while (true); + + uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT]; + state->free_queue.head = head + 1; + if (new_buf_ptr == 0) { + return nullptr; + } + + PmuBuffer *new_buf = reinterpret_cast(new_buf_ptr); + new_buf->count = 0; + state->current_buf_ptr = new_buf_ptr; + state->current_buf_seq = next_seq; + wmb(); + return new_buf; +} + // --------------------------------------------------------------------------- // Internal: switch the current buffer for one core (called from // complete_record when records[count] hits 
PLATFORM_PMU_RECORDS_PER_BUFFER) @@ -139,20 +196,6 @@ static void pmu_switch_buffer(int core_id, int thread_idx) { return; } - // Check free_queue before committing the full buffer - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - - if (head == tail) { - // No replacement buffer available — overwrite current buffer to keep AICPU alive - LOG_WARN("Thread %d: Core %d no free PMU buffer, overwriting current buffer (data lost)", thread_idx, core_id); - state->dropped_record_count += full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - // Enqueue full buffer to ready_queue uint32_t seq = state->current_buf_seq; int rc = enqueue_pmu_ready_buffer(thread_idx, static_cast(core_id), state->current_buf_ptr, seq); @@ -166,19 +209,20 @@ static void pmu_switch_buffer(int core_id, int thread_idx) { return; } - // Pop next buffer from free_queue - uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - state->current_buf_ptr = new_buf_ptr; - state->current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + state->current_buf_ptr = 0; + state->current_buf_seq = next_seq; wmb(); - PmuBuffer *new_buf = reinterpret_cast(new_buf_ptr); - new_buf->count = 0; - wmb(); + PmuBuffer *new_buf = try_pop_pmu_buffer(core_id, state, next_seq); + if (new_buf == nullptr) { + return; + } - LOG_INFO_V0("Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr); + LOG_INFO_V0( + "Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id, + reinterpret_cast(new_buf) + ); } // --------------------------------------------------------------------------- @@ -244,16 +288,8 @@ void pmu_aicpu_init(const uint32_t *physical_core_ids, int num_cores) { uint32_t tail = state->free_queue.tail; if (head != tail) { - uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT]; - rmb(); - 
state->free_queue.head = head + 1; - state->current_buf_ptr = buf_ptr; - state->current_buf_seq = 0; - wmb(); - - PmuBuffer *buf = reinterpret_cast(buf_ptr); - buf->count = 0; - + (void)try_pop_pmu_buffer(i, state, 0); + uint64_t buf_ptr = state->current_buf_ptr; LOG_DEBUG("Core %d: popped initial PMU buffer (addr=0x%lx)", i, buf_ptr); } else { LOG_ERROR("Core %d: PMU free_queue is empty during init!", i); @@ -300,12 +336,18 @@ void pmu_aicpu_complete_record( rmb(); uint64_t cur_ptr = state->current_buf_ptr; + PmuBuffer *buf = nullptr; if (cur_ptr == 0) { - state->dropped_record_count += 1; - wmb(); - return; + buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq); + if (buf == nullptr) { + state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = state->current_buf_ptr; + } else { + buf = reinterpret_cast(cur_ptr); } - PmuBuffer *buf = reinterpret_cast(cur_ptr); // Switch buffer if full (internal — ring address is unchanged) if (buf->count >= static_cast(PLATFORM_PMU_RECORDS_PER_BUFFER)) { @@ -313,11 +355,16 @@ void pmu_aicpu_complete_record( rmb(); cur_ptr = state->current_buf_ptr; if (cur_ptr == 0) { - state->dropped_record_count += 1; - wmb(); - return; + buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq); + if (buf == nullptr) { + state->dropped_record_count += 1; + wmb(); + return; + } + cur_ptr = state->current_buf_ptr; + } else { + buf = reinterpret_cast(cur_ptr); } - buf = reinterpret_cast(cur_ptr); } uint32_t idx = buf->count; diff --git a/src/a5/platform/shared/host/l2_swimlane_collector.cpp b/src/a5/platform/shared/host/l2_swimlane_collector.cpp index 752d5fd18..148c881f5 100644 --- a/src/a5/platform/shared/host/l2_swimlane_collector.cpp +++ b/src/a5/platform/shared/host/l2_swimlane_collector.cpp @@ -75,9 +75,9 @@ int L2SwimlaneCollector::initialize( aicpu_thread_num_ = aicpu_thread_num; l2_swimlane_level_ = l2_swimlane_level; output_prefix_ = output_prefix; - total_perf_collected_ = 0; - total_sched_phase_collected_ 
= 0; - total_orch_phase_collected_ = 0; + total_perf_collected_.store(0, std::memory_order_relaxed); + total_sched_phase_collected_.store(0, std::memory_order_relaxed); + total_orch_phase_collected_.store(0, std::memory_order_relaxed); // Stash the memory context on the base up-front so alloc_paired_buffer // sees consistent values during init. shm_host_ stays nullptr until the @@ -378,10 +378,11 @@ void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) { } uint32_t core_index = info.index; if (core_index < static_cast(num_aicore_)) { + std::scoped_lock lock(perf_record_mutexes_[core_index]); for (uint32_t i = 0; i < count; i++) { collected_perf_records_[core_index].push_back(buf->records[i]); } - total_perf_collected_ += count; + total_perf_collected_.fetch_add(count, std::memory_order_relaxed); } } @@ -394,12 +395,13 @@ void L2SwimlaneCollector::copy_sched_phase_buffer(const ReadyBufferInfo &info) { } uint32_t tidx = info.index; if (tidx < collected_sched_phase_records_.size()) { + std::scoped_lock lock(sched_phase_record_mutexes_[tidx]); for (uint32_t i = 0; i < count; i++) { collected_sched_phase_records_[tidx].push_back(buf->records[i]); } - total_sched_phase_collected_ += count; + total_sched_phase_collected_.fetch_add(count, std::memory_order_relaxed); if (count > 0) { - has_phase_data_ = true; + has_phase_data_.store(true, std::memory_order_relaxed); } } } @@ -413,12 +415,13 @@ void L2SwimlaneCollector::copy_orch_phase_buffer(const ReadyBufferInfo &info) { } uint32_t tidx = info.index; if (tidx < collected_orch_phase_records_.size()) { + std::scoped_lock lock(orch_phase_record_mutexes_[tidx]); for (uint32_t i = 0; i < count; i++) { collected_orch_phase_records_[tidx].push_back(buf->records[i]); } - total_orch_phase_collected_ += count; + total_orch_phase_collected_.fetch_add(count, std::memory_order_relaxed); if (count > 0) { - has_phase_data_ = true; + has_phase_data_.store(true, std::memory_order_relaxed); } } } @@ -453,16 +456,19 @@ void 
L2SwimlaneCollector::copy_aicore_buffer(const ReadyBufferInfo &info) { if (count > static_cast(PLATFORM_AICORE_BUFFER_SIZE)) { count = PLATFORM_AICORE_BUFFER_SIZE; } - auto &dst = collected_aicore_records_[core_index]; - dst.reserve(dst.size() + count); uint32_t skipped = 0; - for (uint32_t i = 0; i < count; i++) { - const L2SwimlaneAicoreTaskRecord &r = buf->records[i]; - if (r.start_time == 0) { - skipped++; - continue; + { + std::scoped_lock lock(aicore_record_mutexes_[core_index]); + auto &dst = collected_aicore_records_[core_index]; + dst.reserve(dst.size() + count); + for (uint32_t i = 0; i < count; i++) { + const L2SwimlaneAicoreTaskRecord &r = buf->records[i]; + if (r.start_time == 0) { + skipped++; + continue; + } + dst.push_back(r); } - dst.push_back(r); } if (skipped > 0) { LOG_WARN( @@ -566,8 +572,7 @@ void L2SwimlaneCollector::reconcile_counters() { if (dropped_device > 0) { LOG_WARN( - "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / " - "ready_queue full).", + "L2Swimlane reconcile: %lu %s records dropped on device side.", static_cast(dropped_device), kind ); } @@ -603,7 +608,7 @@ void L2SwimlaneCollector::reconcile_counters() { [](void *host_ptr) { return reinterpret_cast(host_ptr)->count; }, - sizeof(L2SwimlaneAicpuTaskBuffer), total_perf_collected_, /*optional=*/false + sizeof(L2SwimlaneAicpuTaskBuffer), total_perf_collected_.load(std::memory_order_relaxed), /*optional=*/false ); reconcile_one( @@ -614,7 +619,8 @@ void L2SwimlaneCollector::reconcile_counters() { [](void *host_ptr) { return reinterpret_cast(host_ptr)->count; }, - sizeof(L2SwimlaneAicpuSchedPhaseBuffer), total_sched_phase_collected_, /*optional=*/true + sizeof(L2SwimlaneAicpuSchedPhaseBuffer), total_sched_phase_collected_.load(std::memory_order_relaxed), + /*optional=*/true ); reconcile_one( @@ -625,7 +631,8 @@ void L2SwimlaneCollector::reconcile_counters() { [](void *host_ptr) { return reinterpret_cast(host_ptr)->count; }, - 
sizeof(L2SwimlaneAicpuOrchPhaseBuffer), total_orch_phase_collected_, /*optional=*/true + sizeof(L2SwimlaneAicpuOrchPhaseBuffer), total_orch_phase_collected_.load(std::memory_order_relaxed), + /*optional=*/true ); } @@ -691,7 +698,10 @@ void L2SwimlaneCollector::read_phase_header_metadata() { LOG_INFO_V0(" Core-to-thread mapping: %d cores", num_phase_cores); } - LOG_INFO_V0("Phase metadata collection complete: has_phase_data=%s", has_phase_data_ ? "yes" : "no"); + LOG_INFO_V0( + "Phase metadata collection complete: has_phase_data=%s", + has_phase_data_.load(std::memory_order_relaxed) ? "yes" : "no" + ); } void L2SwimlaneCollector::set_core_types(const CoreType *types, int n) { @@ -1036,10 +1046,10 @@ int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, co collected_sched_phase_records_.clear(); collected_orch_phase_records_.clear(); core_to_thread_.clear(); - has_phase_data_ = false; - total_perf_collected_ = 0; - total_sched_phase_collected_ = 0; - total_orch_phase_collected_ = 0; + has_phase_data_.store(false, std::memory_order_relaxed); + total_perf_collected_.store(0, std::memory_order_relaxed); + total_sched_phase_collected_.store(0, std::memory_order_relaxed); + total_orch_phase_collected_.store(0, std::memory_order_relaxed); clear_memory_context(); LOG_DEBUG("Performance profiling cleanup complete"); diff --git a/src/common/platform/include/host/buffer_pool_manager.h b/src/common/platform/include/host/buffer_pool_manager.h index 9eabb97ea..5156cb1e7 100644 --- a/src/common/platform/include/host/buffer_pool_manager.h +++ b/src/common/platform/include/host/buffer_pool_manager.h @@ -14,9 +14,9 @@ * @brief Generic buffer-pool data structure shared by L2Swimlane, TensorDump, * and PMU collectors. 
Owns: * - * - ready_queue (mgmt → collector) with mutex/cv, - * - done_queue (collector → mgmt) with mutex, - * - per-kind recycled-buffer pools, + * - ready_queue shard(s) (mgmt → collector) with mutex/cv, + * - done_queue shard(s) (collector → mgmt) with mutex, + * - shard-local per-kind recycled-buffer pools, * - dev↔host pointer mapping table, * - alloc_and_register / free_buffer / resolve_host_ptr helpers. * @@ -27,7 +27,7 @@ * Defines the shared types used by the framework: ThreadFactory (for thread * creation with optional device-context binding), MemoryOps (type-erased * alloc/reg/free/copy callbacks), and DoneInfo (per-buffer ownership info - * passed through done_queue). + * passed through done queues). * * SVM vs host-shadow (chosen at runtime by what the collector installs) * --------------------------------------------------------------------- @@ -71,10 +71,12 @@ #ifndef SRC_COMMON_PLATFORM_INCLUDE_HOST_BUFFER_POOL_MANAGER_H_ #define SRC_COMMON_PLATFORM_INCLUDE_HOST_BUFFER_POOL_MANAGER_H_ +#include #include #include #include #include +#include #include #include #include @@ -128,8 +130,8 @@ struct MemoryOps { }; /** - * Per-buffer ownership info threaded through the done_queue so that the mgmt - * thread, when it recycles a finished buffer, knows which per-kind pool it + * Per-buffer ownership info threaded through a done queue shard so that the + * mgmt thread, when it recycles a finished buffer, knows which per-kind pool it * came from. */ struct DoneInfo { @@ -137,6 +139,16 @@ struct DoneInfo { int kind; // [0, Module::kBufferKinds) }; +template +struct ProfilerModuleCollectorThreadCount { + static constexpr int value = 1; +}; + +template +struct ProfilerModuleCollectorThreadCount> { + static constexpr int value = Module::kCollectorThreadCount; +}; + template class BufferPoolManager { // Static checks for the Module concept. 
Required type aliases trigger @@ -149,9 +161,12 @@ class BufferPoolManager { public: using ReadyBufferInfo = typename Module::ReadyBufferInfo; + static constexpr int kCollectorShardCount = ProfilerModuleCollectorThreadCount::value; + static_assert(kCollectorShardCount >= 1, "Module::kCollectorThreadCount must be >= 1"); BufferPoolManager() : - recycled_(Module::kBufferKinds) {} + ready_shards_(kCollectorShardCount), + done_shards_(kCollectorShardCount) {} ~BufferPoolManager() = default; BufferPoolManager(const BufferPoolManager &) = delete; @@ -181,7 +196,7 @@ class BufferPoolManager { /** * Release every device buffer the framework currently owns: recycled - * pools, done_queue, and ready_queue. Buffers still in the per-pool + * pools, done queues, and ready queues. Buffers still in the per-pool * free_queue or held as current_buf_ptr are NOT touched — those belong * to the collector and must be released by it (the AICPU may still be * referencing them via shared memory until execution ends). 
@@ -215,23 +230,29 @@ class BufferPoolManager { } }; - for (auto &pool : recycled_) { - for (void *p : pool) - release_once(p); - pool.clear(); + for (auto &shard_pools : recycled_) { + for (auto &pool : shard_pools) { + for (void *p : pool) + release_once(p); + pool.clear(); + } } { - std::scoped_lock lock(done_mutex_); - while (!done_queue_.empty()) { - release_once(done_queue_.front().dev_ptr); - done_queue_.pop(); + for (auto &shard : done_shards_) { + std::scoped_lock lock(shard.mutex); + while (!shard.queue.empty()) { + release_once(shard.queue.front().dev_ptr); + shard.queue.pop(); + } } } { - std::scoped_lock lock(ready_mutex_); - while (!ready_queue_.empty()) { - release_once(ready_queue_.front().dev_buffer_ptr); - ready_queue_.pop(); + for (auto &shard : ready_shards_) { + std::scoped_lock lock(shard.mutex); + while (!shard.queue.empty()) { + release_once(shard.queue.front().dev_buffer_ptr); + shard.queue.pop(); + } } } } @@ -269,15 +290,17 @@ class BufferPoolManager { */ template void release_all_owned(const ReleaseFn &release_fn) { - for (auto &pool : recycled_) - pool.clear(); - { - std::scoped_lock lock(done_mutex_); - std::queue().swap(done_queue_); + for (auto &shard_pools : recycled_) { + for (auto &pool : shard_pools) + pool.clear(); } - { - std::scoped_lock lock(ready_mutex_); - std::queue().swap(ready_queue_); + for (auto &shard : done_shards_) { + std::scoped_lock lock(shard.mutex); + std::queue().swap(shard.queue); + } + for (auto &shard : ready_shards_) { + std::scoped_lock lock(shard.mutex); + std::queue().swap(shard.queue); } for (auto &kv : dev_to_host_) { if (kv.first != nullptr) { @@ -425,45 +448,50 @@ class BufferPoolManager { } // ------------------------------------------------------------------------- - // ready_queue: mgmt thread pushes, collector thread pops + // ready_queue shards: mgmt threads push, collector threads pop // ------------------------------------------------------------------------- - void push_to_ready(const 
ReadyBufferInfo &info) { + void push_to_ready(const ReadyBufferInfo &info, int shard_index = 0) { + auto &shard = ready_shards_[normalize_shard(shard_index)]; { - std::scoped_lock lock(ready_mutex_); - ready_queue_.push(info); + std::scoped_lock lock(shard.mutex); + shard.queue.push(info); } - ready_cv_.notify_one(); + shard.cv.notify_one(); } - bool try_pop_ready(ReadyBufferInfo &out) { - std::scoped_lock lock(ready_mutex_); - if (ready_queue_.empty()) return false; - out = ready_queue_.front(); - ready_queue_.pop(); + bool try_pop_ready(ReadyBufferInfo &out, int shard_index = 0) { + auto &shard = ready_shards_[normalize_shard(shard_index)]; + std::scoped_lock lock(shard.mutex); + if (shard.queue.empty()) return false; + out = shard.queue.front(); + shard.queue.pop(); return true; } - bool wait_pop_ready(ReadyBufferInfo &out, std::chrono::milliseconds timeout) { - std::unique_lock lock(ready_mutex_); - if (!ready_cv_.wait_for(lock, timeout, [this] { - return !ready_queue_.empty(); + bool wait_pop_ready(ReadyBufferInfo &out, std::chrono::milliseconds timeout, int shard_index = 0) { + auto &shard = ready_shards_[normalize_shard(shard_index)]; + std::unique_lock lock(shard.mutex); + if (!shard.cv.wait_for(lock, timeout, [&shard] { + return !shard.queue.empty(); })) { return false; } - out = ready_queue_.front(); - ready_queue_.pop(); + out = shard.queue.front(); + shard.queue.pop(); return true; } // ------------------------------------------------------------------------- - // done_queue: collector thread reports buffers it has finished copying; - // mgmt thread folds them back into the recycled pool of the right kind. + // done_queue shards: collector threads report buffers they have finished + // copying; mgmt folds them back into the same shard's recycled pool of the + // right kind. 
// ------------------------------------------------------------------------- - void notify_copy_done(void *dev_ptr, int kind) { - std::scoped_lock lock(done_mutex_); - done_queue_.push(DoneInfo{dev_ptr, kind}); + void notify_copy_done(void *dev_ptr, int kind, int shard_index = 0) { + auto &shard = done_shards_[normalize_shard(shard_index)]; + std::scoped_lock lock(shard.mutex); + shard.queue.push(DoneInfo{dev_ptr, kind}); } // ------------------------------------------------------------------------- @@ -497,7 +525,10 @@ class BufferPoolManager { return nullptr; } *host_ptr_out = host_ptr; - dev_to_host_[dev_ptr] = host_ptr; + { + std::scoped_lock lock(mapping_mutex_); + dev_to_host_[dev_ptr] = host_ptr; + } return dev_ptr; } @@ -508,15 +539,21 @@ class BufferPoolManager { */ void free_buffer(void *dev_ptr) { if (dev_ptr == nullptr) return; - auto it = dev_to_host_.find(dev_ptr); - void *host_ptr = (it != dev_to_host_.end()) ? it->second : nullptr; - if (it != dev_to_host_.end()) { - dev_to_host_.erase(it); + void *host_ptr = nullptr; + bool free_host_shadow = false; + { + std::scoped_lock lock(mapping_mutex_); + auto it = dev_to_host_.find(dev_ptr); + host_ptr = (it != dev_to_host_.end()) ? it->second : nullptr; + if (it != dev_to_host_.end()) { + dev_to_host_.erase(it); + } + free_host_shadow = (host_ptr != nullptr && malloc_shadows_.erase(host_ptr) > 0); } if (ops_.free_) { ops_.free_(dev_ptr); } - if (host_ptr != nullptr && malloc_shadows_.erase(host_ptr) > 0) { + if (free_host_shadow) { std::free(host_ptr); } } @@ -526,6 +563,7 @@ class BufferPoolManager { * alloc_and_register / register_mapping time. 
*/ void *resolve_host_ptr(void *dev_ptr) { + std::scoped_lock lock(mapping_mutex_); auto it = dev_to_host_.find(dev_ptr); if (it != dev_to_host_.end()) return it->second; LOG_ERROR("BufferPoolManager: no host mapping for dev_ptr=%p", dev_ptr); @@ -537,7 +575,10 @@ class BufferPoolManager { * initialize() when it pre-allocates buffers and wants the mgmt thread * to be able to resolve them later. */ - void register_mapping(void *dev_ptr, void *host_ptr) { dev_to_host_[dev_ptr] = host_ptr; } + void register_mapping(void *dev_ptr, void *host_ptr) { + std::scoped_lock lock(mapping_mutex_); + dev_to_host_[dev_ptr] = host_ptr; + } /** * Claim ownership of a host shadow that the framework malloc'd. Only @@ -547,6 +588,7 @@ class BufferPoolManager { */ void add_malloc_shadow(void *host_ptr) { if (host_ptr != nullptr) { + std::scoped_lock lock(mapping_mutex_); malloc_shadows_.insert(host_ptr); } } @@ -556,36 +598,82 @@ class BufferPoolManager { * empty. Caller is responsible for resolving host_ptr (via * resolve_host_ptr) before handing the buffer back to AICPU. 
*/ - void *pop_recycled(int kind) { - auto &pool = recycled_[kind]; + void *pop_recycled(int kind, int shard_index = 0) { + auto shard = normalize_shard(shard_index); + std::scoped_lock lock(recycled_mutexes_[shard][kind]); + auto &pool = recycled_[shard][kind]; if (pool.empty()) return nullptr; void *p = pool.back(); pool.pop_back(); return p; } - void push_recycled(int kind, void *dev_ptr) { recycled_[kind].push_back(dev_ptr); } + void *pop_recycled_any(int kind, int preferred_shard = 0) { + if (void *p = pop_recycled(kind, preferred_shard); p != nullptr) return p; + const auto preferred = normalize_shard(preferred_shard); + for (size_t s = 0; s < recycled_.size(); s++) { + if (s == preferred) continue; + if (void *p = pop_recycled(kind, static_cast(s)); p != nullptr) return p; + } + return nullptr; + } + + void push_recycled(int kind, void *dev_ptr, int shard_index = 0) { + auto shard = normalize_shard(shard_index); + std::scoped_lock lock(recycled_mutexes_[shard][kind]); + recycled_[shard][kind].push_back(dev_ptr); + } + + size_t recycled_count(int kind) const { + size_t total = 0; + for (size_t shard = 0; shard < recycled_.size(); shard++) { + std::scoped_lock lock(recycled_mutexes_[shard][kind]); + total += recycled_[shard][kind].size(); + } + return total; + } bool recycled_empty() const { - for (const auto &pool : recycled_) { - if (!pool.empty()) return false; + for (size_t shard = 0; shard < recycled_.size(); shard++) { + for (int kind = 0; kind < Module::kBufferKinds; kind++) { + std::scoped_lock lock(recycled_mutexes_[shard][kind]); + if (!recycled_[shard][kind].empty()) return false; + } } return true; } + template + decltype(auto) with_free_queue_writer(const void *queue_key, Fn &&fn) { + std::scoped_lock lock(free_queue_mutexes_[free_queue_lock_index(queue_key)]); + return fn(); + } + /** - * Drain everything currently in done_queue back into the per-kind + * Drain everything currently in done queue shards back into the per-kind * recycled pool. 
May be called from Module::process_entry when its * primary recycled pool ran out, to harvest buffers the collector freed * in the meantime. */ - void drain_done_into_recycled() { - std::scoped_lock lock(done_mutex_); - while (!done_queue_.empty()) { - const DoneInfo &info = done_queue_.front(); - recycled_[info.kind].push_back(info.dev_ptr); - done_queue_.pop(); + size_t drain_done_into_recycled(int shard_index) { + auto &shard = done_shards_[normalize_shard(shard_index)]; + size_t drained = 0; + std::scoped_lock lock(shard.mutex); + while (!shard.queue.empty()) { + const DoneInfo &info = shard.queue.front(); + push_recycled(info.kind, info.dev_ptr, shard_index); + shard.queue.pop(); + drained++; } + return drained; + } + + size_t drain_done_into_recycled() { + size_t drained = 0; + for (size_t shard = 0; shard < done_shards_.size(); shard++) { + drained += drain_done_into_recycled(static_cast(shard)); + } + return drained; } void *shared_mem_dev() const { return shared_mem_dev_; } @@ -593,6 +681,22 @@ class BufferPoolManager { int device_id() const { return device_id_; } private: + struct ReadyQueueShard { + std::mutex mutex; + std::condition_variable cv; + std::queue queue; + }; + + struct DoneQueueShard { + std::mutex mutex; + std::queue queue; + }; + + static size_t normalize_shard(int shard_index) { + if (shard_index < 0) return 0; + return static_cast(shard_index) % static_cast(kCollectorShardCount); + } + // Subsystem inputs (set by ProfilerBase::start via set_memory_context). void *shared_mem_dev_{nullptr}; void *shared_mem_host_{nullptr}; @@ -601,13 +705,21 @@ class BufferPoolManager { MemoryOps ops_; // mgmt → collector - std::mutex ready_mutex_; - std::condition_variable ready_cv_; - std::queue ready_queue_; + std::vector ready_shards_; // collector → mgmt - std::mutex done_mutex_; - std::queue done_queue_; + std::vector done_shards_; + + // Host-side pointer mappings are shared across all collector shards. 
+ mutable std::mutex mapping_mutex_; + static constexpr size_t kFreeQueueLockStripes = 64; + + static size_t free_queue_lock_index(const void *queue_key) { + auto raw = reinterpret_cast(queue_key); + return (raw >> 6) % kFreeQueueLockStripes; + } + + std::array free_queue_mutexes_; // dev → host mapping (single source of truth for resolve_host_ptr) std::unordered_map dev_to_host_; @@ -618,8 +730,9 @@ class BufferPoolManager { // HAL-managed mappings (halHostRegister) live outside this set. std::unordered_set malloc_shadows_; - // Per-kind recycled buffer pools (vector indexed by Module-defined kind id) - std::vector> recycled_; + // Local recycled buffer pools indexed by collector shard, then Module-defined kind id. + std::array, Module::kBufferKinds>, kCollectorShardCount> recycled_; + mutable std::array, kCollectorShardCount> recycled_mutexes_; }; } // namespace profiling_common diff --git a/src/common/platform/include/host/profiler_base.h b/src/common/platform/include/host/profiler_base.h index 7f46bc704..c0b22c51f 100644 --- a/src/common/platform/include/host/profiler_base.h +++ b/src/common/platform/include/host/profiler_base.h @@ -13,8 +13,8 @@ * @file profiler_base.h * @brief CRTP scaffolding shared by L2Swimlane / Dump / PMU collectors. * - * Owns the BufferPoolManager, the mgmt thread (which polls AICPU - * ready queues and recycles buffers), and the collector poll thread. + * Owns the BufferPoolManager, the mgmt thread(s) that poll AICPU + * ready queues / recycle buffers, and the collector poll thread(s). * * Module concept contract * ----------------------- @@ -26,7 +26,7 @@ * // Types * using DataHeader = ...; // Shared-memory header (e.g. L2SwimlaneDataHeader). * using ReadyEntry = ...; // Per-AICPU-thread ready-queue entry. - * using ReadyBufferInfo = ...; // Hand-off struct to the collector thread + * using ReadyBufferInfo = ...; // Hand-off struct to collector thread(s) * // (carries dev/host ptrs, optional kind * // discriminator, and the seq). 
* using FreeQueue = ...; // Per-instance SPSC queue of free buffer @@ -34,10 +34,17 @@ * // `buffer_ptrs[kSlotCount]`. * * // Constants - * static constexpr int kBufferKinds; // L2Swimlane=2 (perf+phase), Dump=1, PMU=1. + * static constexpr int kBufferKinds; // L2Swimlane=4, Dump=1, PMU=1. * static constexpr uint32_t kReadyQueueSize; // Per-thread ready-queue depth. * static constexpr uint32_t kSlotCount; // FreeQueue::buffer_ptrs[] length. * static constexpr const char* kSubsystemName; // "PMU" / "L2Swimlane" / "Dump". + * // Optional: number of mgmt drain shards (defaults to 1). + * static constexpr int kMgmtDrainThreadCount; + * // Optional: number of collector threads / host ready-queue shards. + * static constexpr int kCollectorThreadCount; + * // Optional: refresh cached queue metadata before a replenish pass. + * template + * static void refresh_replenish_metadata(Mgr&, DataHeader*); * * // Header pointer cast (host_ptr → DataHeader*) * static DataHeader* header_from_shm(void* shared_mem_host); @@ -66,16 +73,15 @@ * Alloc policy * ------------ * - * process_entry replenishes the originating free_queue with EXACTLY - * one buffer per call, matching the 1-in / 1-out - * ratio against the entry the AICPU just produced. - * Single allocation when both recycled and done are - * dry; bounds the per-tick latency. + * process_entry replenishes the originating free_queue from the + * current drain shard's local recycled pool until + * the free_queue is full or no buffer is available. * proactive_replenish fills to kSlotCount across all instances of every * kind. When recycled drains it batch-allocates * `batch_size(kind)` buffers at once to amortize the - * allocator cost — recovery from a double-empty - * condition takes one tick instead of N. + * allocator cost. Split-mgmt collectors use this + * only before threads start; runtime replenish only + * drains collector-done buffers into local pools. 
* * The above two algorithms live in ProfilerAlgorithms; Module only * supplies the data-access traits above. Implementors must NOT zero `count` @@ -89,17 +95,16 @@ * start(tf) becomes a no-op (shm_host_ stays nullptr). * 2. start(tf) — atomically: (a) assembles a MemoryOps from the stashed * callbacks, (b) hands it to the manager via set_memory_context, - * (c) launches the mgmt thread, (d) launches the poll thread. Mgmt is - * started before poll because mgmt is the only writer to L2 (the - * ready_queue) and poll is its sole consumer. + * (c) launches the mgmt thread(s), (d) launches the collector thread(s). + * Mgmt is started before collectors because mgmt is the only writer to + * the host ready queue shard(s) and collectors are their consumers. * 3. ... device execution ... * 4. stop() — atomically: - * a) flips mgmt_running_, joins the mgmt thread; the mgmt thread's + * a) flips mgmt_running_, joins the mgmt thread(s); the drain thread's * final-drain pass pushes the last L1→L2 entries before exiting. - * b) execution_complete_ is set; the poll loop sees it on its next - * idle tick, drains L2 (which now contains mgmt's final-drain - * output), and exits. - * c) collector thread joined. + * b) execution_complete_ is set; each collector loop sees it on its + * next idle tick, drains its host ready queue shard, and exits. + * c) collector thread(s) joined. * Caller is then guaranteed L1 and L2 are both empty and all collected * data has been delivered to Derived::on_buffer_collected. * @@ -108,18 +113,17 @@ * * - Collectors on platforms without SVM (a5: no halHostRegister) install * `copy_to_device` / `copy_from_device` in MemoryOps so every device - * read/write goes through rtMemcpy (onboard) or memcpy (sim). 
The - * mgmt_loop then pulls the device-side shared-memory region into the - * host shadow at the top of every tick (`mirror_shm_from_device`) and - * pushes the few host-modified fields (`queue_heads[q]` after pop, - * `free_queue.tail` + `buffer_ptrs[]` after refill) back as narrow - * `write_range_to_device` writes. The bulk `mirror_shm_to_device` is - * intentionally NOT called from mgmt_loop: it raced with AICPU writes - * to device-only fields (current_buf_ptr, total/dropped/mismatch - * counters, queue_tails, free_queue.head, and on a5 - * L2SwimlaneAicpuPhaseHeader::magic) and rolled them back to the - * host-shadow values mirrored in at the top of the tick. Buffer - * contents are mirrored on demand inside ProfilerAlgorithms. + * read/write goes through rtMemcpy (onboard) or memcpy (sim). The mgmt + * drain threads refresh only their own queue indices and the popped + * entry from device (narrow `read_range_from_device`) and push the few + * host-modified fields (`queue_heads[q]` after pop, `free_queue.tail` + + * `buffer_ptrs[]` after refill) back as narrow `write_range_to_device` + * writes. The bulk `mirror_shm_to_device` is intentionally NOT called: + * it would race with AICPU writes to device-only fields (current_buf_ptr, + * total/dropped/mismatch counters, queue_tails, free_queue.head, and on + * a5 L2SwimlaneAicpuPhaseHeader::magic) and roll them back to stale + * host-shadow values. Buffer contents are mirrored on demand inside + * ProfilerAlgorithms. * - On these platforms `reg` always allocates a paired host shadow; the * framework never falls back to identity-mapping (which would be wrong * without SVM). 
Collectors pass nullptr-safe callbacks via @@ -159,6 +163,7 @@ #include #include #include +#include #include #include @@ -170,6 +175,16 @@ namespace profiling_common { +template +struct ProfilerModuleDrainThreadCount { + static constexpr int value = 1; +}; + +template +struct ProfilerModuleDrainThreadCount> { + static constexpr int value = Module::kMgmtDrainThreadCount; +}; + // Common subsystem callback signatures. All four collectors (PMU / TensorDump // / L2Swimlane / DepGen) used to declare their own typedefs with identical // shapes; these are the canonical types stashed in ProfilerBase via @@ -307,7 +322,16 @@ struct ProfilerAlgorithms { // entry with `read_range_from_device` and skip the pop if the refreshed // entry still looks empty — try again next tick. template - static bool try_pop_aicpu_entry(Mgr &mgr, DataHeader *header, int q, ReadyEntry &out) { + static bool + try_pop_aicpu_entry(Mgr &mgr, DataHeader *header, int q, ReadyEntry &out, bool refresh_indices = false) { + if (refresh_indices) { + if (mgr.read_range_from_device(&header->queue_heads[q], sizeof(header->queue_heads[q])) != 0 || + mgr.read_range_from_device(&header->queue_tails[q], sizeof(header->queue_tails[q])) != 0) { + LOG_ERROR("%s: failed to refresh ready_queue indices for thread %d", Module::kSubsystemName, q); + return false; + } + rmb(); + } uint32_t head = header->queue_heads[q]; uint32_t tail = header->queue_tails[q]; if (head >= Module::kReadyQueueSize || tail >= Module::kReadyQueueSize) { @@ -326,26 +350,33 @@ struct ProfilerAlgorithms { // race described above. If the entry's `buffer_ptr` is still 0 the // producer hasn't finished publishing — treat the queue as empty // for this tick. 
- mgr.read_range_from_device(&header->queues[q][head], sizeof(header->queues[q][head])); + if (mgr.read_range_from_device(&header->queues[q][head], sizeof(header->queues[q][head])) != 0) { + LOG_ERROR("%s: failed to refresh ready_queue entry for thread %d", Module::kSubsystemName, q); + return false; + } rmb(); out = header->queues[q][head]; if (out.buffer_ptr == 0) { return false; } - head = (head + 1) % Module::kReadyQueueSize; - header->queue_heads[q] = head; + uint32_t old_head = head; + uint32_t next_head = (head + 1) % Module::kReadyQueueSize; + header->queue_heads[q] = next_head; wmb(); // Push the new head value back to device. The bulk mirror_shm_to_device // is intentionally not used here — see buffer_pool_manager.h. - mgr.write_range_to_device(&header->queue_heads[q], sizeof(header->queue_heads[q])); + if (mgr.write_range_to_device(&header->queue_heads[q], sizeof(header->queue_heads[q])) != 0) { + header->queue_heads[q] = old_head; + LOG_ERROR("%s: failed to advance ready_queue head for thread %d", Module::kSubsystemName, q); + return false; + } return true; } - // Refill the originating pool's free_queue with exactly one buffer - // (recycled → drain done → alloc), then push the popped buffer's - // ReadyBufferInfo to the collector LAST. Skips the push if host_ptr - // resolution fails — handing a null pointer to on_buffer_collected - // would crash the collector thread. + // Refill the originating pool's free_queue from this drain shard's local + // recycled pool, then push the popped buffer's ReadyBufferInfo to the + // collector LAST. Skips the push if host_ptr resolution fails — handing a + // null pointer to on_buffer_collected would crash the collector thread. // // a5 specifics: after resolving the popped buffer's host shadow, copy // the buffer contents from device to host before delivery. 
The host @@ -356,11 +387,6 @@ struct ProfilerAlgorithms { if (!site_opt.has_value()) return; auto &site = *site_opt; - void *new_dev = obtain_buffer(mgr, site.kind, site.buffer_size); - if (new_dev != nullptr) { - push_to_free_queue(mgr, *site.free_queue, new_dev); - } - site.info.host_buffer_ptr = mgr.resolve_host_ptr(site.info.dev_buffer_ptr); if (site.info.host_buffer_ptr == nullptr) { // resolve_host_ptr already logged. Drop rather than deliver null. @@ -368,34 +394,66 @@ struct ProfilerAlgorithms { } // a5: pull buffer contents from device into the host shadow before // the collector reads `count` and `records[]`. - mgr.copy_buffer_from_device(site.info.host_buffer_ptr, site.info.dev_buffer_ptr, site.buffer_size); + if (mgr.copy_buffer_from_device(site.info.host_buffer_ptr, site.info.dev_buffer_ptr, site.buffer_size) != 0) { + LOG_ERROR( + "%s: failed to copy ready buffer from device (kind=%d, thread=%d)", Module::kSubsystemName, site.kind, q + ); + return; + } + + (void)top_up_free_queue(mgr, site.kind, *site.free_queue, site.buffer_size, q); - mgr.push_to_ready(site.info); + mgr.push_to_ready(site.info, q); } - // Drain done_queue into recycled, then top up every (kind, instance) - // free_queue to kSlotCount. When the recycled pool of a given kind drains - // mid-fill, batch-allocate `batch_size(kind)` buffers and continue. + // Drain done_queue into local recycled pools, then top up every (kind, + // instance) free_queue to kSlotCount. Split-mgmt collectors call this only + // before threads start; their runtime replenish loop only drains done. 
template - static void proactive_replenish(Mgr &mgr, DataHeader *header) { + static uint64_t proactive_replenish(Mgr &mgr, DataHeader *header) { mgr.drain_done_into_recycled(); + return replenish_free_queues(mgr, header); + } + + template + static uint64_t replenish_free_queues(Mgr &mgr, DataHeader *header) { + uint64_t pushed = 0; + refresh_replenish_metadata(mgr, header, 0); Module::for_each_instance(mgr.shared_mem_host(), header, [&](int kind, FreeQueue *fq, size_t buf_size) { - top_up_free_queue(mgr, kind, *fq, buf_size); + pushed += top_up_free_queue(mgr, kind, *fq, buf_size); }); + return pushed; } private: - // Three-level fallback used by process_entry's 1-in/1-out replenish. + template + static auto refresh_replenish_metadata(Mgr &mgr, DataHeader *header, int) + -> decltype(M::refresh_replenish_metadata(mgr, header), void()) { + M::refresh_replenish_metadata(mgr, header); + } + + template + static void refresh_replenish_metadata(Mgr &, DataHeader *, long) {} + + // Fallback used by drain-shard free_queue top-up. 
template - static void *obtain_buffer(Mgr &mgr, int kind, size_t buf_size) { - void *p = mgr.pop_recycled(kind); + static void *obtain_buffer(Mgr &mgr, int kind, size_t buf_size, int shard_index) { + void *p = mgr.pop_recycled(kind, shard_index); if (p != nullptr) return p; - mgr.drain_done_into_recycled(); - p = mgr.pop_recycled(kind); + mgr.drain_done_into_recycled(shard_index); + p = mgr.pop_recycled(kind, shard_index); + if (p != nullptr) return p; + p = mgr.pop_recycled_any(kind, shard_index); if (p != nullptr) return p; - void *host_ptr = nullptr; - p = mgr.alloc_and_register(buf_size, &host_ptr); + const int batch = Module::batch_size(kind); + for (int i = 0; i < batch; i++) { + void *host_ptr = nullptr; + void *dev = mgr.alloc_and_register(buf_size, &host_ptr); + if (dev == nullptr) break; + mgr.push_recycled(kind, dev, shard_index); + } + p = mgr.pop_recycled(kind, shard_index); if (p == nullptr) { LOG_WARN( "%s: alloc failed for %zu bytes (kind=%d) — increase BUFFERS_PER_* to reduce drops", @@ -405,9 +463,9 @@ struct ProfilerAlgorithms { return p; } - // Append one buffer pointer to a per-instance free_queue. Caller owns - // the "queue is not full" guarantee (process_entry: 1-in/1-out; - // top_up_free_queue: explicit fq_used < kSlotCount). + // Append one buffer pointer to a per-instance free_queue if it has + // capacity. The manager serializes host writers so split drain shards and + // non-split/proactive refill paths never race on free_queue.tail. // // a5: write the new slot and the advanced tail back to device via // `write_range_to_device` so AICPU sees the refill without us bulk @@ -415,50 +473,67 @@ struct ProfilerAlgorithms { // written before the tail so AICPU never observes a tail update without // the corresponding pointer. 
template - static void push_to_free_queue(Mgr &mgr, FreeQueue &fq, void *dev_ptr) { - uint32_t fq_tail = fq.tail; - uint32_t slot_idx = fq_tail % Module::kSlotCount; - fq.buffer_ptrs[slot_idx] = reinterpret_cast(dev_ptr); - wmb(); - mgr.write_range_to_device(&fq.buffer_ptrs[slot_idx], sizeof(fq.buffer_ptrs[slot_idx])); - fq.tail = fq_tail + 1; - wmb(); - mgr.write_range_to_device(&fq.tail, sizeof(fq.tail)); + static bool try_push_to_free_queue(Mgr &mgr, FreeQueue &fq, void *dev_ptr) { + return mgr.with_free_queue_writer(&fq, [&]() { + if (mgr.read_range_from_device(&fq.head, sizeof(fq.head)) != 0) { + LOG_ERROR("%s: failed to refresh free_queue head", Module::kSubsystemName); + return false; + } + rmb(); + uint32_t fq_head = fq.head; + uint32_t fq_tail = fq.tail; + if (fq_tail - fq_head >= Module::kSlotCount) { + return false; + } + uint32_t slot_idx = fq_tail % Module::kSlotCount; + uint64_t old_slot = fq.buffer_ptrs[slot_idx]; + fq.buffer_ptrs[slot_idx] = reinterpret_cast(dev_ptr); + wmb(); + if (mgr.write_range_to_device(&fq.buffer_ptrs[slot_idx], sizeof(fq.buffer_ptrs[slot_idx])) != 0) { + fq.buffer_ptrs[slot_idx] = old_slot; + LOG_ERROR("%s: failed to publish free_queue slot", Module::kSubsystemName); + return false; + } + fq.tail = fq_tail + 1; + wmb(); + if (mgr.write_range_to_device(&fq.tail, sizeof(fq.tail)) != 0) { + fq.tail = fq_tail; + fq.buffer_ptrs[slot_idx] = old_slot; + LOG_ERROR("%s: failed to publish free_queue tail", Module::kSubsystemName); + return false; + } + return true; + }); } - // Fill one (kind, instance) free_queue to kSlotCount, batch-allocating - // when the recycled pool of this kind drains mid-fill. 
template - static void top_up_free_queue(Mgr &mgr, int kind, FreeQueue &fq, size_t buf_size) { - rmb(); - uint32_t fq_head = fq.head; - uint32_t fq_tail = fq.tail; - uint32_t fq_used = fq_tail - fq_head; - - while (fq_used < Module::kSlotCount) { - void *new_dev = mgr.pop_recycled(kind); - if (new_dev == nullptr) { - const int batch = Module::batch_size(kind); - for (int i = 0; i < batch; i++) { - void *host_ptr = nullptr; - void *dev = mgr.alloc_and_register(buf_size, &host_ptr); - if (dev == nullptr) break; - mgr.push_recycled(kind, dev); - } - new_dev = mgr.pop_recycled(kind); + static bool free_queue_has_space(Mgr &mgr, FreeQueue &fq) { + return mgr.with_free_queue_writer(&fq, [&]() { + if (mgr.read_range_from_device(&fq.head, sizeof(fq.head)) != 0) { + LOG_ERROR("%s: failed to refresh free_queue head", Module::kSubsystemName); + return false; } - if (new_dev == nullptr) return; + rmb(); + return fq.tail - fq.head < Module::kSlotCount; + }); + } - uint32_t slot_idx = fq_tail % Module::kSlotCount; - fq.buffer_ptrs[slot_idx] = reinterpret_cast(new_dev); - wmb(); - mgr.write_range_to_device(&fq.buffer_ptrs[slot_idx], sizeof(fq.buffer_ptrs[slot_idx])); - fq_tail++; - fq.tail = fq_tail; - wmb(); - mgr.write_range_to_device(&fq.tail, sizeof(fq.tail)); - fq_used++; + // Fill one (kind, instance) free_queue to kSlotCount from one drain + // shard's local recycled pool, batch-allocating when that shard is dry. 
+ template + static uint64_t top_up_free_queue(Mgr &mgr, int kind, FreeQueue &fq, size_t buf_size, int shard_index = 0) { + uint64_t pushed = 0; + + while (free_queue_has_space(mgr, fq)) { + void *new_dev = obtain_buffer(mgr, kind, buf_size, shard_index); + if (new_dev == nullptr) return pushed; + if (!try_push_to_free_queue(mgr, fq, new_dev)) { + mgr.push_recycled(kind, new_dev, shard_index); + return pushed; + } + pushed++; } + return pushed; } }; @@ -530,12 +605,12 @@ class ProfilerBase { /** * Assemble a MemoryOps from the callbacks stashed by set_memory_context() - * and launch the mgmt + poll threads. If shm_host_ is nullptr (Derived's + * and launch the mgmt + collector threads. If shm_host_ is nullptr (Derived's * init() aborted before set_memory_context, or finalize() has cleared * the context) this is a no-op. * - * Order matters: mgmt is started before poll because mgmt is the only - * writer to L2 (the ready_queue) and poll is its sole consumer. The + * Order matters: mgmt is started before collectors because mgmt is the + * only writer to L2 (the ready queues) and collectors are the consumers. 
The * register slot defaults to identity on the SVM path (copy_to_device_ * is null) or to a host-shadow malloc lambda on the non-SVM path * (copy_to_device_ installed) — so BufferPoolManager always has a @@ -588,29 +663,73 @@ class ProfilerBase { ops.copy_from_device = copy_from_device_; manager_.set_memory_context(std::move(ops), shm_dev_, shm_host_, shm_size_, device_id_); + execution_complete_.store(false, std::memory_order_release); + { + DataHeader *header = Module::header_from_shm(manager_.shared_mem_host()); + (void)ProfilerAlgorithms::proactive_replenish(manager_, header); + } + mgmt_running_.store(true, std::memory_order_release); - if (thread_factory) { - mgmt_thread_ = thread_factory([this]() { - mgmt_loop(); - }); - } else { - mgmt_thread_ = std::thread(&ProfilerBase::mgmt_loop, this); + { + constexpr int kDrainThreads = ProfilerModuleDrainThreadCount::value; + static_assert(kDrainThreads >= 1, "kMgmtDrainThreadCount must be >= 1"); + if constexpr (kDrainThreads == 1) { + if (thread_factory) { + mgmt_thread_ = thread_factory([this]() { + mgmt_drain_loop(0, 1); + }); + } else { + mgmt_thread_ = std::thread(&ProfilerBase::mgmt_drain_loop, this, 0, 1); + } + } else { + mgmt_drain_threads_.reserve(kDrainThreads); + for (int i = 0; i < kDrainThreads; i++) { + if (thread_factory) { + mgmt_drain_threads_.push_back(thread_factory([this, i]() { + mgmt_drain_loop(i, kDrainThreads); + })); + } else { + mgmt_drain_threads_.emplace_back(&ProfilerBase::mgmt_drain_loop, this, i, kDrainThreads); + } + } + } + if (thread_factory) { + mgmt_replenish_thread_ = thread_factory([this]() { + mgmt_replenish_loop(); + }); + } else { + mgmt_replenish_thread_ = std::thread(&ProfilerBase::mgmt_replenish_loop, this); + } } - execution_complete_.store(false, std::memory_order_release); - if (thread_factory) { - collector_thread_ = thread_factory([this]() { - poll_and_collect_loop(); - }); + constexpr int kCollectorThreads = ProfilerModuleCollectorThreadCount::value; + 
static_assert(kCollectorThreads >= 1, "kCollectorThreadCount must be >= 1"); + if constexpr (kCollectorThreads == 1) { + if (thread_factory) { + collector_thread_ = thread_factory([this]() { + poll_and_collect_loop(0, 1); + }); + } else { + collector_thread_ = std::thread(&ProfilerBase::poll_and_collect_loop, this, 0, 1); + } } else { - collector_thread_ = std::thread(&ProfilerBase::poll_and_collect_loop, this); + collector_threads_.reserve(kCollectorThreads); + for (int i = 0; i < kCollectorThreads; i++) { + if (thread_factory) { + collector_threads_.push_back(thread_factory([this, i]() { + poll_and_collect_loop(i, kCollectorThreads); + })); + } else { + collector_threads_.emplace_back(&ProfilerBase::poll_and_collect_loop, this, i, kCollectorThreads); + } + } } } /** * Stop the mgmt thread, drain whatever it pushes during its final pass, * and join the collector. Idempotent. Caller is guaranteed on return - * that mgmt's L1 ringbuffer and the host-side L2 ready_queue are both + * that mgmt's L1 ringbuffer and the host-side ready queue shard(s) are * empty and Derived::on_buffer_collected has been called for every * entry that was in either queue. Framework-owned buffers are NOT freed * here — Derived's finalize() must do that. 
@@ -624,10 +743,25 @@ class ProfilerBase { if (mgmt_thread_.joinable()) { mgmt_thread_.join(); } + for (auto &thread : mgmt_drain_threads_) { + if (thread.joinable()) { + thread.join(); + } + } + mgmt_drain_threads_.clear(); + if (mgmt_replenish_thread_.joinable()) { + mgmt_replenish_thread_.join(); + } execution_complete_.store(true, std::memory_order_release); if (collector_thread_.joinable()) { collector_thread_.join(); } + for (auto &thread : collector_threads_) { + if (thread.joinable()) { + thread.join(); + } + } + collector_threads_.clear(); } Manager &manager() { return manager_; } @@ -637,6 +771,7 @@ class ProfilerBase { Manager manager_; std::atomic execution_complete_{false}; std::thread collector_thread_; + std::vector collector_threads_; // Memory context stashed by Derived::init() via set_memory_context(). // Derived may read these from finalize() / alloc helpers via the @@ -744,100 +879,89 @@ class ProfilerBase { } private: - /** - * mgmt thread main loop. Each tick: - * 0) Mirror the device-side shared-memory region (DataHeader + all - * BufferStates) into the host shadow so subsequent reads see the - * latest queue_tails / current_buf_ptr / per-state counters. - * 1) Drain done_queue into recycled pools. - * 2) Iterate AICPU per-thread ready queues (PLATFORM_MAX_AICPU_THREADS - * upper bound; empty queues are O(1) head==tail checks) and call - * Module::process_entry per entry. process_entry pulls each - * popped buffer's contents from device on demand. - * try_pop_aicpu_entry / push_to_free_queue write the few host-modified - * fields (queue_heads[q], free_queue.tail/buffer_ptrs[]) back to - * device immediately via `write_range_to_device`. - * 3) Call Module::proactive_replenish to top up any depleted free - * queues. - * 4) Sleep 10 us if no work was done. 
- * - * The bulk `mirror_shm_to_device` deliberately is NOT called: it races - * with AICPU writes to device-only fields (current_buf_ptr, total/dropped/ - * mismatch counters, queue_tails, free_queue.head, core_to_thread[], - * and on a5 L2SwimlaneAicpuPhaseHeader::magic) and rolls them back to - * whatever was mirrored in at the start of the tick. Each host-side - * modification is written back as a narrow field write inside Alg. - * - * On exit (mgmt_running_ → false) it does one final drain pass without - * sleeping to flush any straggler entries the device pushed before - * stopping. - */ - void mgmt_loop() { + void mgmt_drain_loop(int queue_start, int queue_stride) { DataHeader *header = Module::header_from_shm(manager_.shared_mem_host()); using Alg = ProfilerAlgorithms; + constexpr int kIdleBusyPollLoops = 64; + int idle_busy_polls = 0; - while (mgmt_running_.load(std::memory_order_acquire)) { - manager_.mirror_shm_from_device(); - - manager_.drain_done_into_recycled(); - + while (mgmt_running_.load(std::memory_order_relaxed)) { bool found_any = false; - for (int q = 0; q < PLATFORM_MAX_AICPU_THREADS; q++) { + for (int q = queue_start; q < PLATFORM_MAX_AICPU_THREADS; q += queue_stride) { ReadyEntry entry; - while (Alg::try_pop_aicpu_entry(manager_, header, q, entry)) { + while (Alg::try_pop_aicpu_entry(manager_, header, q, entry, true)) { Alg::process_entry(manager_, header, q, entry); found_any = true; } } - - Alg::proactive_replenish(manager_, header); + if (found_any) { + idle_busy_polls = 0; + } if (!found_any) { - std::this_thread::sleep_for(std::chrono::microseconds(10)); + if (idle_busy_polls < kIdleBusyPollLoops) { + idle_busy_polls++; + } else { + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } } } - // Final drain after mgmt_running_ flipped: don't sleep, don't - // replenish. try_pop_aicpu_entry still pushes the advanced - // queue_heads back to device per-pop. 
- manager_.mirror_shm_from_device(); - for (int q = 0; q < PLATFORM_MAX_AICPU_THREADS; q++) { + for (int q = queue_start; q < PLATFORM_MAX_AICPU_THREADS; q += queue_stride) { ReadyEntry entry; - while (Alg::try_pop_aicpu_entry(manager_, header, q, entry)) { + while (Alg::try_pop_aicpu_entry(manager_, header, q, entry, true)) { Alg::process_entry(manager_, header, q, entry); } } } + void mgmt_replenish_loop() { + while (mgmt_running_.load(std::memory_order_relaxed)) { + size_t drained = manager_.drain_done_into_recycled(); + + if (drained == 0) { + std::this_thread::sleep_for(std::chrono::microseconds(10)); + } + } + } + /** - * Main collector loop. Blocks on the manager's ready_queue with a 100 ms + * Main collector loop. Blocks on one manager ready-queue shard with a 100 ms * cv-wait tick. On each hit it dispatches the buffer to Derived via * on_buffer_collected() and recycles the buffer. Exits in two cases: * - * 1. execution_complete_ was set (by stop()) and the ready_queue is + * 1. execution_complete_ was set (by stop()) and this ready_queue shard is * empty, after a final non-blocking drain pass. * 2. No buffer arrived for `Derived::kIdleTimeoutSec` consecutive * seconds AND execution_complete_ has not been signalled — this - * is a hang detector that logs an error and bails out. + * is a hang detector that logs an error and bails out. Multi-shard + * collectors arm this only after a shard has seen traffic, because + * an empty shard can be a valid run shape. 
*/ - void poll_and_collect_loop() { + void poll_and_collect_loop(int shard_index, int shard_count) { const auto wait_tick = std::chrono::milliseconds(100); const auto idle_timeout = std::chrono::seconds(Derived::kIdleTimeoutSec); std::optional idle_start; + bool has_seen_buffer = false; while (true) { ReadyBufferInfo info; - if (manager_.wait_pop_ready(info, wait_tick)) { - consume(info); + if (manager_.wait_pop_ready(info, wait_tick, shard_index)) { + consume(info, shard_index); + has_seen_buffer = true; idle_start.reset(); continue; } if (execution_complete_.load(std::memory_order_acquire)) { - while (manager_.try_pop_ready(info)) { - consume(info); + while (manager_.try_pop_ready(info, shard_index)) { + consume(info, shard_index); + has_seen_buffer = true; } break; } + if (shard_count > 1 && !has_seen_buffer) { + continue; + } if (!idle_start.has_value()) { idle_start = std::chrono::steady_clock::now(); } @@ -851,16 +975,18 @@ class ProfilerBase { } } - void consume(const ReadyBufferInfo &info) { + void consume(const ReadyBufferInfo &info, int shard_index) { static_cast(this)->on_buffer_collected(info); if constexpr (Module::kBufferKinds > 1) { - manager_.notify_copy_done(info.dev_buffer_ptr, Module::kind_of(info)); + manager_.notify_copy_done(info.dev_buffer_ptr, Module::kind_of(info), shard_index); } else { - manager_.notify_copy_done(info.dev_buffer_ptr, 0); + manager_.notify_copy_done(info.dev_buffer_ptr, 0, shard_index); } } std::thread mgmt_thread_; + std::vector mgmt_drain_threads_; + std::thread mgmt_replenish_thread_; std::atomic mgmt_running_{false}; }; diff --git a/src/common/platform/include/host/scope_stats_collector.h b/src/common/platform/include/host/scope_stats_collector.h index 583830294..21041be29 100644 --- a/src/common/platform/include/host/scope_stats_collector.h +++ b/src/common/platform/include/host/scope_stats_collector.h @@ -14,10 +14,10 @@ * @brief Host-side scope_stats streaming collector + NDJSON export. 
* * Architecture mirrors PmuCollector: BufferPoolManager runs - * the mgmt thread (polls the per-thread ready queue, recycles buffers, refills - * the single instance's free_queue); ScopeStatsCollector's poll thread appends - * each full buffer's ScopeStatsRecords to an in-memory vector. After stop(), - * write_jsonl() renders them to + * split mgmt threads (poll per-thread ready queues, recycle buffers, refill the + * single instance's free_queue); ScopeStatsCollector's collector thread shards + * append each full buffer's ScopeStatsRecords to an in-memory vector. After + * stop(), write_jsonl() renders them to * /scope_stats/scope_stats.jsonl. * * Memory mirroring is handled by the framework via the MemoryOps installed @@ -31,7 +31,7 @@ * Lifecycle: * init() — Allocate header + 1 BufferState + N ScopeStatsBuffers * (pre-fills free_queue; surplus → recycled pool). - * start(tf) — Inherited: launches mgmt + poll threads. + * start(tf) — Inherited: launches mgmt + collector threads. * [device execution] * stop() — Inherited: drain queues, join threads. 
* reconcile_counters() — Recover any un-flushed current buffer left by an @@ -89,6 +89,8 @@ struct ScopeStatsModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_SCOPE_STATS_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_SCOPE_STATS_SLOT_COUNT; static constexpr const char *kSubsystemName = "ScopeStatsModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; static constexpr int batch_size(int /*kind*/) { constexpr int kBatch = PLATFORM_SCOPE_STATS_BUFFERS_PER_INSTANCE - PLATFORM_SCOPE_STATS_SLOT_COUNT; @@ -98,7 +100,18 @@ struct ScopeStatsModule { static DataHeader *header_from_shm(void *shm) { return get_scope_stats_header(shm); } static std::optional> - resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) { + resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) { + if (shm == nullptr || header == nullptr) { + LOG_ERROR("ScopeStatsModule: invalid shared memory/header while resolving ready entry"); + return std::nullopt; + } + if (header->num_instances != 1 || entry.instance_index >= header->num_instances) { + LOG_ERROR( + "ScopeStatsModule: invalid ready entry instance=%u (num_instances=%u)", entry.instance_index, + header->num_instances + ); + return std::nullopt; + } ScopeStatsBufferState *state = get_scope_stats_buffer_state(shm, static_cast(entry.instance_index)); profiling_common::EntrySite site; site.kind = 0; diff --git a/src/common/platform/include/host/tensor_dump_collector.h b/src/common/platform/include/host/tensor_dump_collector.h index 67343fa80..e8f649a00 100644 --- a/src/common/platform/include/host/tensor_dump_collector.h +++ b/src/common/platform/include/host/tensor_dump_collector.h @@ -14,9 +14,9 @@ * @brief Host-side tensor dump collector with independent shared memory. 
* * Architecture: - * - BufferPoolManager: shared mgmt-thread infrastructure that - * polls per-thread DumpReadyQueues, replenishes free_queues, and hands - * full DumpMetaBuffers off to the collector thread. + * - BufferPoolManager: shared split-mgmt infrastructure that + * polls per-thread ready queues, replenishes free_queues, and hands + * full DumpMetaBuffers off to collector thread shards. * - TensorDumpCollector: copies tensor metadata + arena bytes into host * vectors and writes the result to disk (.bin + JSON). * @@ -86,6 +86,8 @@ struct DumpModule { static constexpr uint32_t kReadyQueueSize = PLATFORM_DUMP_READYQUEUE_SIZE; static constexpr uint32_t kSlotCount = PLATFORM_DUMP_SLOT_COUNT; static constexpr const char *kSubsystemName = "DumpModule"; + static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS; + static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS; /** * Tensor-dump bursts can be very large; the batch is sized so a fully @@ -100,7 +102,19 @@ struct DumpModule { static DataHeader *header_from_shm(void *shm) { return get_dump_header(shm); } static std::optional> - resolve_entry(void *shm, DataHeader * /*header*/, int /*q*/, const ReadyEntry &entry) { + resolve_entry(void *shm, DataHeader *header, int /*q*/, const ReadyEntry &entry) { + if (shm == nullptr || header == nullptr) { + LOG_ERROR("DumpModule: invalid shared memory/header while resolving ready entry"); + return std::nullopt; + } + if (entry.thread_index >= header->num_dump_threads || + entry.thread_index >= static_cast(PLATFORM_MAX_AICPU_THREADS)) { + LOG_ERROR( + "DumpModule: invalid ready entry thread=%u (num_dump_threads=%u, max=%u)", entry.thread_index, + header->num_dump_threads, static_cast(PLATFORM_MAX_AICPU_THREADS) + ); + return std::nullopt; + } DumpBufferState *state = get_dump_buffer_state(shm, static_cast(entry.thread_index)); profiling_common::EntrySite site; site.kind = 0; @@ -296,6 +310,7 @@ class TensorDumpCollector : public 
profiling_common::ProfilerBase write_queue_; diff --git a/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp b/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp index c7a8410bc..496d2773e 100644 --- a/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp +++ b/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp @@ -21,6 +21,7 @@ #include "aicpu/scope_stats_collector_aicpu.h" #include +#include "aicpu/device_time.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "common/scope_stats.h" @@ -44,6 +45,9 @@ static int s_orch_thread_idx = -1; // set via scope_stats_aicpu_set_orch_thread // unroll_heap_offset). Reset in set_platform_scope_stats_base. static uint64_t s_heap_wraps[PTO2_SCOPE_STATS_MAX_RING_DEPTH][2] = {}; +static constexpr uint64_t kScopeStatsQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kScopeStatsQueueBackpressurePollMask = 1023; + namespace { const char *s_pending_site_file = nullptr; @@ -89,19 +93,30 @@ inline void copy_basename(char (&dst)[32], const char *src) { // Enqueue a full buffer onto the orchestrator thread's ready_queue. Returns 0 // on success, -1 if the queue is full or the orch thread index is unset. 
int enqueue_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) { - if (s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + if (s_scope_stats_header == nullptr || s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) { return -1; } int q = s_orch_thread_idx; uint32_t capacity = PLATFORM_SCOPE_STATS_READYQUEUE_SIZE; - uint32_t current_tail = s_scope_stats_header->queue_tails[q]; - uint32_t current_head = s_scope_stats_header->queue_heads[q]; + uint32_t current_tail = 0; + uint32_t current_head = 0; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + current_tail = s_scope_stats_header->queue_tails[q]; + current_head = s_scope_stats_header->queue_heads[q]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + break; + } + if ((++spins & kScopeStatsQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kScopeStatsQueueBackpressureWaitCycles) { + return -1; + } + } while (true); uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { - return -1; // Queue full - } - s_scope_stats_header->queues[q][current_tail].instance_index = 0; s_scope_stats_header->queues[q][current_tail].buffer_ptr = buffer_ptr; s_scope_stats_header->queues[q][current_tail].buffer_seq = buffer_seq; @@ -115,23 +130,38 @@ int enqueue_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) { // Pop a free buffer into current_buf_ptr. Returns true if one was available. 
bool pop_free_buffer() { - rmb(); - uint32_t head = s_scope_stats_state->free_queue.head; - uint32_t tail = s_scope_stats_state->free_queue.tail; - if (head == tail) { - return false; - } + if (s_scope_stats_state == nullptr) return false; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + uint32_t head = 0; + uint32_t tail = 0; + do { + head = s_scope_stats_state->free_queue.head; + tail = s_scope_stats_state->free_queue.tail; + if (head != tail) { + rmb(); // acquire: order the tail read before the buffer_ptrs read below + break; + } + if ((++spins & kScopeStatsQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kScopeStatsQueueBackpressureWaitCycles) { + return false; + } + } while (true); + uint64_t buf_ptr = s_scope_stats_state->free_queue.buffer_ptrs[head % PLATFORM_SCOPE_STATS_SLOT_COUNT]; - rmb(); s_scope_stats_state->free_queue.head = head + 1; + if (buf_ptr == 0) { + return false; + } s_scope_stats_state->current_buf_ptr = buf_ptr; reinterpret_cast(buf_ptr)->count = 0; wmb(); return true; } -// Commit the full current buffer to the ready_queue and pop a replacement. On -// no free buffer / ready_queue full, drop the buffer's records and reuse it. +// Commit the full current buffer to the ready_queue before popping a +// replacement. If no replacement is available, later records drop until host +// replenishes free_queue. void switch_buffer() { if (s_scope_stats_state == nullptr) { return; @@ -141,20 +171,6 @@ void switch_buffer() { return; } - rmb(); - uint32_t head = s_scope_stats_state->free_queue.head; - uint32_t tail = s_scope_stats_state->free_queue.tail; - if (head == tail) { - // Host can't recycle buffers fast enough: drop silently (count only, no - // per-drop log). Logging here would make a slow host pay device-side - // hot-path cost — the device must not be coupled to host throughput. The - // total is surfaced via dropped_record_count in the finalize summary. 
- s_scope_stats_state->dropped_record_count += full_buf->count; - full_buf->count = 0; - wmb(); - return; - } - uint32_t seq = s_scope_stats_state->current_buf_seq; int rc = enqueue_ready_buffer(s_scope_stats_state->current_buf_ptr, seq); if (rc != 0) { @@ -164,13 +180,10 @@ void switch_buffer() { return; } - uint64_t new_buf_ptr = s_scope_stats_state->free_queue.buffer_ptrs[head % PLATFORM_SCOPE_STATS_SLOT_COUNT]; - rmb(); - s_scope_stats_state->free_queue.head = head + 1; - s_scope_stats_state->current_buf_ptr = new_buf_ptr; + s_scope_stats_state->current_buf_ptr = 0; s_scope_stats_state->current_buf_seq = seq + 1; - reinterpret_cast(new_buf_ptr)->count = 0; wmb(); + (void)pop_free_buffer(); } // Unroll a wrapping heap byte offset into a monotonic value using the diff --git a/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp b/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp index e62888b92..e2623057b 100644 --- a/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp +++ b/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp @@ -23,6 +23,7 @@ #include #include +#include "aicpu/device_time.h" #include "common/memory_barrier.h" #include "common/platform_config.h" #include "common/unified_log.h" @@ -49,15 +50,19 @@ static inline void account_dropped_records(DumpBufferState *state, uint32_t drop state->dropped_record_count = (next < prev) ? UINT32_MAX : next; } -extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dump_base = dump_data_base; } - -extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; } +static constexpr uint64_t kDumpQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000; // 20 us +static constexpr uint32_t kDumpQueueBackpressurePollMask = 1023; static bool g_enable_dump_args = false; // Dump level latched from the header in dump_args_init(). 
The selective // (PARTIAL) and json-only (FULL_JSON_ONLY) modes are derived from it rather // than tracked as separate flags — mirrors g_l2_swimlane_level. static DumpTensorLevel g_dump_args_level = DumpTensorLevel::OFF; + +extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dump_base = dump_data_base; } + +extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; } + struct DumpTaskMaskEntry { uint64_t task_id; TensorDumpArgMask mask; @@ -342,34 +347,79 @@ bool try_log_dump_args_layout_mismatch() { * Enqueue a full dump metadata buffer to the thread's ready queue. */ static int enqueue_dump_ready_buffer(int thread_idx, uint64_t buffer_ptr, uint32_t buffer_seq) { + if (s_dump_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { + return -1; + } uint32_t capacity = PLATFORM_DUMP_READYQUEUE_SIZE; - uint32_t current_tail = s_dump_header->queue_tails[thread_idx]; - uint32_t current_head = s_dump_header->queue_heads[thread_idx]; + uint32_t current_tail = 0; + uint32_t current_head = 0; + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + + do { + current_tail = s_dump_header->queue_tails[thread_idx]; + current_head = s_dump_header->queue_heads[thread_idx]; + uint32_t next_tail = (current_tail + 1) % capacity; + if (next_tail != current_head) { + break; + } + if ((++spins & kDumpQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kDumpQueueBackpressureWaitCycles) { + return -1; + } + } while (true); uint32_t next_tail = (current_tail + 1) % capacity; - if (next_tail == current_head) { - return -1; // Queue full - } - s_dump_header->queues[thread_idx][current_tail].thread_index = static_cast(thread_idx); s_dump_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr; s_dump_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq; - wmb(); + wmb(); // publish: entry fields visible before the tail advance s_dump_header->queue_tails[thread_idx] = 
next_tail; - wmb(); return 0; } -/** - * Maximum spin-wait iterations when free_queue or ready_queue is exhausted. - * Gives host mgmt_loop time to replenish before falling back to buffer overwrite. - */ -static constexpr uint32_t DUMP_SPIN_WAIT_LIMIT = 1000000; +static DumpMetaBuffer *try_pop_dump_meta_buffer(int thread_idx, DumpBufferState *state, uint32_t next_seq) { + if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS || state == nullptr) { + return nullptr; + } + const uint64_t start = get_sys_cnt_aicpu(); + uint32_t spins = 0; + uint32_t head = 0; + uint32_t tail = 0; + + do { + head = state->free_queue.head; + tail = state->free_queue.tail; + if (head != tail) { + rmb(); // acquire: order the tail read before the buffer_ptrs read below + break; + } + if ((++spins & kDumpQueueBackpressurePollMask) == 0 && + get_sys_cnt_aicpu() - start >= kDumpQueueBackpressureWaitCycles) { + return nullptr; + } + } while (true); + + uint64_t new_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT]; + state->free_queue.head = head + 1; + if (new_ptr == 0) { + return nullptr; + } + + DumpMetaBuffer *new_buf = reinterpret_cast(new_ptr); + new_buf->count = 0; + s_current_dump_buf[thread_idx] = new_buf; + state->current_buf_ptr = new_ptr; + state->current_buf_seq = next_seq; + wmb(); + return new_buf; +} /** - * Switch metadata buffer: enqueue the full buffer, pop a new one. - * Spin-waits briefly for host to replenish before falling back to overwrite. + * Switch metadata buffer: enqueue the full buffer first, then pop a new one. + * If no replacement is available, later records drop until host replenishes + * free_queue. 
*/ static int switch_dump_meta_buffer(int thread_idx) { if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) { @@ -381,77 +431,39 @@ static int switch_dump_meta_buffer(int thread_idx) { return -1; } - // Spin-wait for a free buffer, giving host mgmt_loop time to replenish - rmb(); - uint32_t head = state->free_queue.head; - uint32_t tail = state->free_queue.tail; - if (head == tail) { - for (uint32_t spin = 0; spin < DUMP_SPIN_WAIT_LIMIT; spin++) { - rmb(); - head = state->free_queue.head; - tail = state->free_queue.tail; - if (head != tail) { - break; - } - } - } - if (head == tail) { - // Still empty after spin — overwrite current buffer - account_dropped_records(state, cur->count); - cur->count = 0; - wmb(); - if (!s_logged_no_free_meta_buffer[thread_idx]) { - s_logged_no_free_meta_buffer[thread_idx] = true; - LOG_WARN( - "Args dump ran out of free metadata buffers on thread %d after spin-wait, " - "overwriting current buffer. Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.", - thread_idx - ); - } - return 0; - } - - // Enqueue the full buffer (spin-wait if ready queue is full) uint64_t buf_addr = reinterpret_cast(cur); uint32_t seq = state->current_buf_seq; int rc = enqueue_dump_ready_buffer(thread_idx, buf_addr, seq); if (rc != 0) { - for (uint32_t spin = 0; spin < DUMP_SPIN_WAIT_LIMIT; spin++) { - rmb(); - rc = enqueue_dump_ready_buffer(thread_idx, buf_addr, seq); - if (rc == 0) { - break; - } - } - } - if (rc != 0) { - // Still full after spin — overwrite current buffer account_dropped_records(state, cur->count); cur->count = 0; wmb(); if (!s_logged_ready_queue_full[thread_idx]) { s_logged_ready_queue_full[thread_idx] = true; LOG_WARN( - "Args dump ready queue full on thread %d after spin-wait, " - "overwriting current buffer. Increase PLATFORM_DUMP_READYQUEUE_SIZE.", + "Args dump ready queue full on thread %d after bounded wait, " + "dropping current metadata buffer. 
Increase PLATFORM_DUMP_READYQUEUE_SIZE.", thread_idx ); } return 0; } - // Pop next buffer from free_queue - uint64_t new_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - - DumpMetaBuffer *new_buf = reinterpret_cast(new_ptr); - new_buf->count = 0; - s_current_dump_buf[thread_idx] = new_buf; - state->current_buf_ptr = new_ptr; - state->current_buf_seq = seq + 1; + uint32_t next_seq = seq + 1; + s_current_dump_buf[thread_idx] = nullptr; + state->current_buf_ptr = 0; + state->current_buf_seq = next_seq; wmb(); + if (try_pop_dump_meta_buffer(thread_idx, state, next_seq) == nullptr && !s_logged_no_free_meta_buffer[thread_idx]) { + s_logged_no_free_meta_buffer[thread_idx] = true; + LOG_WARN( + "Args dump published a full metadata buffer on thread %d but no replacement was available; " + "records will drop until recovery. Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.", + thread_idx + ); + } + s_buffers_switched[thread_idx]++; return 0; @@ -588,17 +600,8 @@ void dump_args_init(int num_dump_threads) { uint32_t head = state->free_queue.head; uint32_t tail = state->free_queue.tail; if (head != tail) { - uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT]; - rmb(); - state->free_queue.head = head + 1; - wmb(); - - DumpMetaBuffer *buf = reinterpret_cast(buf_ptr); - buf->count = 0; - s_current_dump_buf[t] = buf; - state->current_buf_ptr = buf_ptr; - state->current_buf_seq = 0; - wmb(); + (void)try_pop_dump_meta_buffer(t, state, 0); + uint64_t buf_ptr = state->current_buf_ptr; LOG_DEBUG("Thread %d: popped initial dump buffer (addr=0x%lx)", t, buf_ptr); } else { LOG_ERROR("Thread %d: dump free_queue is empty during init!", t); @@ -625,7 +628,11 @@ int dump_arg_record(int thread_idx, const TensorDumpInfo &info) { DumpBufferState *state = s_dump_states[thread_idx]; DumpMetaBuffer *buf = s_current_dump_buf[thread_idx]; if (buf == nullptr) { - return -1; + buf = 
try_pop_dump_meta_buffer(thread_idx, state, state != nullptr ? state->current_buf_seq : 0); + if (buf == nullptr) { + account_dropped_records(state, 1); + return -1; + } } // Switch metadata buffer if full @@ -635,7 +642,11 @@ int dump_arg_record(int thread_idx, const TensorDumpInfo &info) { } buf = s_current_dump_buf[thread_idx]; if (buf == nullptr) { - return -1; + buf = try_pop_dump_meta_buffer(thread_idx, state, state != nullptr ? state->current_buf_seq : 0); + if (buf == nullptr) { + account_dropped_records(state, 1); + return -1; + } } } diff --git a/src/common/platform/shared/host/tensor_dump_collector.cpp b/src/common/platform/shared/host/tensor_dump_collector.cpp index 5af7c9cfa..fa186dcf2 100644 --- a/src/common/platform/shared/host/tensor_dump_collector.cpp +++ b/src/common/platform/shared/host/tensor_dump_collector.cpp @@ -306,6 +306,7 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) { } void TensorDumpCollector::on_buffer_collected(const DumpReadyBufferInfo &info) { + std::scoped_lock lock(collector_state_mutex_); start_writer_thread_once(); process_dump_buffer(info); buffers_collected_++; diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index d1ffb4d2a..9b92322ed 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -368,6 +368,10 @@ set_tests_properties(test_chip_callable_upload_immutable PROPERTIES LABELS "no_h add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) add_common_utils_test(test_device_arena common/test_device_arena.cpp) +add_common_utils_test(test_buffer_pool_manager common/test_buffer_pool_manager.cpp) +target_include_directories(test_buffer_pool_manager PRIVATE + ${CMAKE_SOURCE_DIR}/../../../src/common/log/include +) add_common_utils_test(test_l3_l2_orch_comm common/test_l3_l2_orch_comm.cpp) add_executable(test_l3_l2_orch_endpoint common/test_l3_l2_orch_endpoint.cpp 
diff --git a/tests/ut/cpp/common/test_buffer_pool_manager.cpp b/tests/ut/cpp/common/test_buffer_pool_manager.cpp new file mode 100644 index 000000000..210706e53 --- /dev/null +++ b/tests/ut/cpp/common/test_buffer_pool_manager.cpp @@ -0,0 +1,107 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include "host/buffer_pool_manager.h" + +#include + +#include +#include +#include + +namespace { + +struct TestHeader {}; +struct TestReadyEntry {}; + +struct TestReadyBufferInfo { + void *dev_buffer_ptr{nullptr}; + uint32_t shard_marker{0}; +}; + +struct TestModule { + using DataHeader = TestHeader; + using ReadyEntry = TestReadyEntry; + using ReadyBufferInfo = TestReadyBufferInfo; + + static constexpr int kBufferKinds = 2; + static constexpr int kCollectorThreadCount = 4; +}; + +void *ptr(uintptr_t value) { return reinterpret_cast(value); } + +} // namespace + +TEST(BufferPoolManagerShardingTest, ReadyShardsAreIndependent) { + using Manager = profiling_common::BufferPoolManager; + static_assert(Manager::kCollectorShardCount == 4); + + Manager manager; + manager.push_to_ready(TestReadyBufferInfo{ptr(0x1000), 0}, 0); + manager.push_to_ready(TestReadyBufferInfo{ptr(0x2000), 1}, 1); + manager.push_to_ready(TestReadyBufferInfo{ptr(0x5000), 5}, 5); // normalizes to shard 1 + + 
TestReadyBufferInfo out; + EXPECT_FALSE(manager.try_pop_ready(out, 2)); + + ASSERT_TRUE(manager.try_pop_ready(out, 0)); + EXPECT_EQ(out.dev_buffer_ptr, ptr(0x1000)); + EXPECT_EQ(out.shard_marker, 0u); + EXPECT_FALSE(manager.try_pop_ready(out, 0)); + + ASSERT_TRUE(manager.try_pop_ready(out, 1)); + EXPECT_EQ(out.dev_buffer_ptr, ptr(0x2000)); + EXPECT_EQ(out.shard_marker, 1u); + ASSERT_TRUE(manager.try_pop_ready(out, 1)); + EXPECT_EQ(out.dev_buffer_ptr, ptr(0x5000)); + EXPECT_EQ(out.shard_marker, 5u); + EXPECT_FALSE(manager.try_pop_ready(out, 1)); +} + +TEST(BufferPoolManagerShardingTest, DoneShardsRecycleByKind) { + profiling_common::BufferPoolManager manager; + + manager.notify_copy_done(ptr(0x1000), /*kind=*/0, /*shard_index=*/0); + manager.notify_copy_done(ptr(0x2000), /*kind=*/1, /*shard_index=*/1); + manager.notify_copy_done(ptr(0x5000), /*kind=*/1, /*shard_index=*/5); // normalizes to shard 1 + + EXPECT_EQ(manager.drain_done_into_recycled(), 3u); + EXPECT_EQ(manager.recycled_count(0), 1u); + EXPECT_EQ(manager.recycled_count(1), 2u); + + EXPECT_EQ(manager.pop_recycled(0), ptr(0x1000)); + + std::set kind_one; + kind_one.insert(manager.pop_recycled(1, 1)); + kind_one.insert(manager.pop_recycled(1, 1)); + EXPECT_EQ(kind_one, (std::set{ptr(0x2000), ptr(0x5000)})); +} + +TEST(BufferPoolManagerShardingTest, ReleaseOwnedBuffersVisitsAllShards) { + profiling_common::BufferPoolManager manager; + manager.push_recycled(/*kind=*/0, ptr(0x1000)); + manager.push_to_ready(TestReadyBufferInfo{ptr(0x2000), 2}, /*shard_index=*/2); + manager.notify_copy_done(ptr(0x3000), /*kind=*/1, /*shard_index=*/3); + + std::vector released; + manager.release_owned_buffers([&](void *p) { + released.push_back(p); + }); + + EXPECT_EQ( + std::set(released.begin(), released.end()), (std::set{ptr(0x1000), ptr(0x2000), ptr(0x3000)}) + ); + EXPECT_TRUE(manager.recycled_empty()); + + TestReadyBufferInfo out; + EXPECT_FALSE(manager.try_pop_ready(out, 2)); + EXPECT_EQ(manager.drain_done_into_recycled(), 
0u); +}