diff --git a/docs/dfx/args-dump.md b/docs/dfx/args-dump.md
index 9b4d78cef..d80944dbd 100644
--- a/docs/dfx/args-dump.md
+++ b/docs/dfx/args-dump.md
@@ -481,12 +481,12 @@ normal execution continues.
 
 `halHostRegister` maps device memory into host virtual address
 space so the host can read device buffers directly.
-`TensorDumpCollector` runs two background threads on top of a
+`TensorDumpCollector` runs split mgmt threads and collector shards on top of a
 [`BufferPoolManager<DumpModule>`](../src/common/platform/include/host/buffer_pool_manager.h):
-a mgmt thread that polls SPSC ready queues and recycles full
-metadata buffers **while kernels are still executing**, plus a
-poll thread that drains the L2 hand-off queue into
-`on_buffer_collected`.
+drain/refill shards poll SPSC ready queues and recycle full metadata
+buffers **while kernels are still executing**, a replenish thread keeps
+free queues topped up, and collector shards drain the host hand-off queues
+into `on_buffer_collected`.
 
 ```text
         HOST                                         DEVICE
@@ -499,19 +499,19 @@ poll thread that drains the L2 hand-off queue into
 │                          │               │                          │
 │ start()                  │               │ per-task run loop:       │
 │   ┌────────────────────┐ │               │   BEFORE_DISPATCH        │
-│   │ mgmt thread        │ │               │     dump_arg_record()    │
-│   │ (BufferPool driver)│ │ SPSC ready    │     → write to arena     │
+│   │ drain/refill shard │ │               │     dump_arg_record()    │
+│   │ + replenish thread │ │ SPSC ready    │     → write to arena     │
 │   │   poll ready queue │<┼──queues──────<│     → append record      │
 │   │   recycle buffers  │─┼──free queue──>│     → push to ready_q    │
 │   └────────────────────┘ │               │   dispatch kernel        │
 │   ┌────────────────────┐ │               │   wait FIN               │
-│   │ poll thread        │ │               │   AFTER_COMPLETION       │
+│   │ collector shard    │ │               │   AFTER_COMPLETION       │
 │   │   reads arena via  │ │ shared mem    │     dump_arg_record()    │
 │   │   host mapping     │<┼──mapping─────<│                          │
 │   └────────────────────┘ │               │                          │
 │                          │               │ dump_args_flush()        │
 │ stop()                   │               │   log per-thread stats   │
-│   join mgmt → join poll  │               └──────────────────────────┘
+│   join mgmt → collectors │               └──────────────────────────┘
 │ reconcile_counters()     │
 │   recover leftovers      │
 │   + dropped accounting   │
@@ -530,29 +530,28 @@ poll thread that drains the L2 hand-off queue into
 init_tensor_dump()
   dump_collector_.initialize(..., output_prefix_)
   kernel_args_.args.dump_data_base = dump_collector_.get_dump_shm_device_ptr()
-start()                          ← spawn mgmt thread (drains L1 ringbuffer)
-                                   then spawn poll thread (consumes L2 queue)
+start()                          ← spawn split mgmt threads (drain/refill
+                                   + replenish), then collector shards
 launch AICPU / AICore
 rtStreamSynchronize              ← wait for kernel completion
-stop()                           ← join mgmt (its final-drain pass into L2
-                                   has poll as the consumer), then signal
-                                   poll and join it
+stop()                           ← join mgmt/replenish after final drain,
+                                   then signal collector shards and join them
 reconcile_counters()             ← recover leftover current buffers
                                    + dropped accounting
 export_dump_files()
 ```
 
-[`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h)
+[`TensorDumpCollector`](../src/common/platform/include/host/tensor_dump_collector.h)
 on a2a3 inherits from
 [`profiling_common::ProfilerBase<TensorDumpCollector, DumpModule>`](../src/common/platform/include/host/profiler_base.h):
-the base class owns the mgmt thread, the poll thread, and the
+the base class owns split mgmt threads, collector shards, and the
 `BufferPoolManager<DumpModule>` they share. `TensorDumpCollector`
 only supplies the dump-specific pieces — the `DumpModule` trait
 that describes the shared-memory layout, `initialize` that
 allocates and pre-fills free queues, an `on_buffer_collected`
 callback that gathers payload bytes into the in-memory record
 list, plus `reconcile_counters` / `export_dump_files` /
-`finalize`. The mgmt/poll threading, buffer pooling, and `Module`
+`finalize`. The mgmt/collector threading, buffer pooling, and `Module`
 trait pattern are shared with PMU and L2Swimlane — see
 [profiling-framework.md](../profiling-framework.md) for the
 framework reference.
@@ -561,7 +560,7 @@ framework reference.
 
 a5's `TensorDumpCollector` derives from
 `ProfilerBase<TensorDumpCollector, DumpModule>` and shares the
-mgmt + poll thread structure with a2a3. The single behavioral
+split mgmt + collector shard structure with a2a3. The single behavioral
 deviation from §5.4 is the **transport channel**: a5 has no
 `halHostRegister`, so each device buffer is paired with a
 host-shadow `malloc()` and the mgmt loop synchronizes the two via
@@ -597,8 +596,8 @@ the buffer's records.
 │   register_mapping(s)    │               │   BEFORE_DISPATCH        │
 │                          │               │     dump_arg_record()    │
 │ start(thread_factory)    │               │   dispatch kernel        │
-│   mgmt_thread starts     │               │   wait FIN               │
-│   poll_thread starts     │               │   AFTER_COMPLETION       │
+│   split mgmt starts      │               │   wait FIN               │
+│   collector shards start │               │   AFTER_COMPLETION       │
 │                          │               │     dump_arg_record()    │
 │ mgmt every 10us tick:    │               │   if buffer full:        │
 │   copy_from_device(shm)  │<──memcpy─────<│     push ready entry,    │
@@ -612,7 +611,7 @@ the buffer's records.
 │     for each modified    │               │                          │
 │     field                │               │                          │
 │                          │               │                          │
-│ poll thread:             │               │                          │
+│ collector shard:         │               │                          │
 │   wait_pop_ready         │               │                          │
 │   on_buffer_collected →  │               │                          │
 │     copy arena slice     │<──memcpy─────<│                          │
@@ -622,7 +621,7 @@ the buffer's records.
 │                          │               │                          │
 │ rtStreamSynchronize      │               │                          │
 │ stop()                   │               │                          │
-│   join mgmt + poll       │               │                          │
+│   join mgmt + collectors │               │                          │
 │ reconcile_counters()     │               │                          │
 │   recover leftovers      │               │                          │
 │   + dropped accounting   │               │                          │
@@ -638,17 +637,17 @@ the buffer's records.
 init_tensor_dump()
   dump_collector_.initialize(num_dump_threads, ..., output_prefix_)
   kernel_args_.args.dump_data_base = dump_collector_.get_dump_shm_device_ptr()
-dump_collector_.start(thread_factory)   ← mgmt + poll threads
+dump_collector_.start(thread_factory)   ← split mgmt + collector shards
 launch AICPU / AICore
 rtStreamSynchronize
-dump_collector_.stop()                  ← join mgmt + poll, drain final batch
+dump_collector_.stop()                  ← join mgmt + collectors, drain final batch
 dump_collector_.reconcile_counters()    ← recover leftover current buffers
                                           + dropped accounting
 dump_collector_.export_dump_files()
 dump_collector_.finalize()
 ```
 
-[`TensorDumpCollector`](../src/a5/platform/include/host/tensor_dump_collector.h)
+[`TensorDumpCollector`](../src/common/platform/include/host/tensor_dump_collector.h)
 on a5 inherits the same CRTP base
 ([`profiling_common::ProfilerBase`](../src/common/platform/include/host/profiler_base.h))
 as a2a3 and parameterizes
@@ -670,7 +669,7 @@ before that flush runs, `reconcile_counters` recovers a non-empty
 | Device-side layout | identical (same `DumpDataHeader` / `DumpMetaBuffer` / arena shape, `static_assert`-checked) | |
 | AICPU recording logic | identical | |
 | Buffer model | rotating pool (free + ready queues per thread) | identical |
-| Host threads | mgmt + poll, streams during execution | identical |
+| Host threads | split mgmt + collector shards, streams during execution | identical |
 | Host-class shape | `ProfilerBase<TensorDumpCollector, DumpModule>` | identical |
 | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` |
 | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) |
@@ -694,9 +693,10 @@ With `--dump-args`, AICPU records full `BEFORE_DISPATCH` /
   non-contiguous views).
 - The completion `pipe_barrier(PIPE_ALL)` before writing FIN, which
   serializes all device-side writes for dumped tasks.
-- The arena and metadata writes themselves; the host transport
-  cost is taken concurrently on a2a3 (mgmt + poll threads) or after
-  the stream finishes on a5.
+- The arena and metadata writes themselves; host drain/replenish and
+  collector work runs concurrently with the stream on both architectures.
+  a5 additionally pays `rtMemcpy`/`memcpy` transport cost to keep host
+  shadows in sync.
 
 For interactive debugging, total memory pressure is what to watch:
 the default per-thread arena is 128 MiB
@@ -893,7 +893,7 @@ per-thread arena (default 128 MiB). Bump
 
 **`dropped_overwrite > 0` in summary.** On a5, the run produced
 more total payload than fits in the arena; on a2a3, the host
-mgmt/poll threads couldn't keep up. Reduce the number of dumped
+mgmt/collector pipeline couldn't keep up. Reduce the number of dumped
 tasks (filter by `func_id` upstream) or increase
 `PLATFORM_DUMP_BUFFERS_PER_THREAD`.
 
diff --git a/docs/dfx/l2-swimlane-profiling.md b/docs/dfx/l2-swimlane-profiling.md
index e288d7454..ae882d177 100644
--- a/docs/dfx/l2-swimlane-profiling.md
+++ b/docs/dfx/l2-swimlane-profiling.md
@@ -609,11 +609,11 @@ sched overhead per session as price for unbounded session length).
 
 `halHostRegister` maps device memory into host virtual address
 space so the host can read device buffers directly.
-`L2SwimlaneCollector` runs two background threads on top of a
+`L2SwimlaneCollector` runs split mgmt threads and collector shards on top of a
 [`BufferPoolManager<L2SwimlaneModule>`](../src/common/platform/include/host/buffer_pool_manager.h):
-a mgmt thread that polls SPSC ready queues and recycles full
-buffers **while kernels are still executing**, plus a poll
-thread that drains the L2 hand-off queue into
+drain/refill shards poll SPSC ready queues and recycle full buffers
+**while kernels are still executing**, a replenish thread keeps free
+queues topped up, and collector shards drain the host hand-off queues into
 `on_buffer_collected`.
 
 `L2SwimlaneModule` declares four buffer kinds going through one ready
@@ -641,19 +641,19 @@ are single-kind.
 │                          │               │                          │
 │ start(tf)                │               │ AICPU on FIN:            │
 │   ┌────────────────────┐ │ SPSC ready    │   commit AicpuTask       │
-│   │ mgmt thread        │ │ queues        │   record (kind 0); fill  │
-│   │ (BufferPool driver)│ │<──4 kinds────<│   func_id / dispatch /   │
+│   │ drain/refill shard │ │ queues        │   record (kind 0); fill  │
+│   │ + replenish thread │ │<──4 kinds────<│   func_id / dispatch /   │
 │   │   poll ready queue │<┼──multiplexed──│   finish; rotate buffer  │
 │   │   recycle buffers  │─┼──free queue──>│   when full              │
 │   └────────────────────┘ │               │ AICPU scheduler thread:  │
 │   ┌────────────────────┐ │               │   per work iter: write   │
-│   │ poll thread        │ │               │   SchedPhaseRecord       │
+│   │ collector shard    │ │               │   SchedPhaseRecord       │
 │   │   reads via host   │ │ shared mem    │   (kind 1). Per submit:  │
 │   │   mapping; copies  │<┼──mapping─────<│   write OrchPhaseRecord  │
 │   │   to host vectors  │ │               │   (kind 2).              │
 │   └────────────────────┘ │               │                          │
 │ stop()                   │               │                          │
-│   join mgmt → join poll  │               │                          │
+│   join mgmt → collectors │               │                          │
 │ read_phase_header_metadata()             │                          │
 │ reconcile_counters()     │               │                          │
 │ export_swimlane_json()   │               │                          │
@@ -667,10 +667,10 @@ are single-kind.
 init_l2_swimlane()
   l2_swimlane_collector_.initialize(num_aicore, ..., output_prefix_)
   kernel_args_.args.l2_swimlane_data_base = l2_swimlane_collector_.get_l2_swimlane_shm_device_ptr()
-start(tf)                          ← spawn mgmt + poll threads
+start(tf)                          ← spawn split mgmt + collector shards
 launch AICPU / AICore
 rtStreamSynchronize
-stop()                             ← join mgmt → join poll
+stop()                             ← join mgmt/replenish → join collectors
 read_phase_header_metadata()       ← single-shot read of the
                                      core→thread mapping
 reconcile_counters()               ← three-bucket accounting for both
@@ -684,7 +684,7 @@ finalize(unregister, free)
 [`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h)
 on a2a3 inherits from
 [`profiling_common::ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>`](../src/common/platform/include/host/profiler_base.h):
-the base class owns the mgmt thread, the poll thread, and the
+the base class owns split mgmt threads, collector shards, and the
 `BufferPoolManager<L2SwimlaneModule>` they share. `L2SwimlaneCollector`
 supplies the L2-specific pieces — the `L2SwimlaneModule` trait
 (notably `kBufferKinds = 4` and `kind_of()`), `initialize` that
@@ -694,7 +694,7 @@ allocates and pre-fills all four kinds of free queues, an
 to copy into the right per-core or per-thread vector, plus
 `read_phase_header_metadata` /
 `reconcile_counters` / `export_swimlane_json` / `finalize`. The
-mgmt/poll threading and `Module` trait pattern are shared with
+mgmt/collector threading and `Module` trait pattern are shared with
 PMU and TensorDump — see
 [profiling-framework.md](../profiling-framework.md) for the
 framework reference.
@@ -702,9 +702,11 @@ framework reference.
 ### 5.3 a5 — same framework, host-shadow transport
 
 a5's `L2SwimlaneCollector` derives from
-`ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>` and shares the
-mgmt + poll thread structure with a2a3. The single behavioral
-deviation from §5.2 is the **transport channel**: a5 has no
+`ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>` and uses the same
+framework abstractions as a2a3, including the same split mgmt +
+collector shard shape (`kMgmtDrainThreadCount` = `kCollectorThreadCount` =
+`PLATFORM_MAX_AICPU_THREADS`, i.e. 7 on a5 vs 4 on a2a3). The
+behavioral deviation from §5.2 is the **transport channel**: a5 has no
 `halHostRegister`, so each device buffer is paired with a
 host-shadow `malloc()` and the mgmt loop synchronizes the two via
 `profiling_copy.h` (`rtMemcpy` onboard, plain `memcpy` in sim).
@@ -836,7 +838,7 @@ PHASE), same shape as a2a3.
 | AICPU commit on FIN | identical | |
 | Buffer model | rotating pool (free + ready queues) per kind | identical |
 | Ready queue | per-AICPU-thread, multiplexes 4 kinds via `ReadyQueueEntry::kind` | per-AICPU-thread, 2 kinds via `is_phase` |
-| Host threads | mgmt + poll, streams during execution | identical |
+| Host threads | split mgmt + collector shards, streams during execution | same split mgmt + collector shards (7 = `PLATFORM_MAX_AICPU_THREADS` vs a2a3's 4) |
 | Host-class shape | `ProfilerBase<L2SwimlaneCollector, L2SwimlaneModule>` (`kBufferKinds = 4`) | same base, `kBufferKinds = 2` |
 | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` |
 | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) |
@@ -864,10 +866,11 @@ Phase-record overhead (only at `--enable-l2-swimlane >= 3`):
 - a5 — one 40 B `L2SwimlaneAicpuPhaseRecord` per emitted phase
   (legacy unified shape).
 
-Both architectures drain buffers concurrently with execution via the
-mgmt + poll thread pair; a5 additionally pays per-tick
-`rtMemcpy`/`memcpy` round-trips to keep the host shadow in sync,
-which overlap with device execution.
+Both architectures drain buffers concurrently with execution through the
+ProfilerBase mgmt/collector pipeline; both a2a3 and a5 use split mgmt plus
+collector shards for this profiler (a5 with 7 shards, a2a3 with 4). a5
+additionally pays per-buffer `rtMemcpy`/`memcpy` round-trips to keep the
+host shadow in sync, which overlap with device execution.
 
 `--rounds > 1` collects only on the first round so the steady-state
 benchmark is not perturbed.
diff --git a/docs/dfx/pmu-profiling.md b/docs/dfx/pmu-profiling.md
index 2b2617c3b..2d134f377 100644
--- a/docs/dfx/pmu-profiling.md
+++ b/docs/dfx/pmu-profiling.md
@@ -218,8 +218,8 @@ collected_on_host + dropped == total              (a2a3, 2 buckets)
 AICPU reads the 8 PMU counters via MMIO (`read_reg(reg_base, PMU_CNTi)`)
 directly into a `PmuRecord` on every task FIN. Buffers rotate through
 an SPSC free queue per core; full buffers flow through a per-thread
-ready queue to a host mgmt thread that recycles them, while a host
-poll thread streams records to CSV during execution.
+ready queue to host drain/refill shards that recycle them, while
+collector shards stream records to CSV during execution.
 
 ```text
         HOST                                         DEVICE
@@ -233,20 +233,20 @@ poll thread streams records to CSV during execution.
 │                          │               │                          │
 │ start(tf)                │               │ per-task FIN:            │
 │   ┌────────────────────┐ │               │   read 8 PMU_CNTs+TOTAL  │
-│   │ mgmt thread        │ │               │     into records[count]  │
-│   │ (BufferPool driver)│ │ SPSC ready    │   if buffer full:        │
+│   │ drain/refill shard │ │               │     into records[count]  │
+│   │ + replenish thread │ │ SPSC ready    │   if buffer full:        │
 │   │   poll ready queue │<┼──queues──────<│     push ready entry,    │
 │   │   recycle buffers  │─┼──free queue──>│     pop next buffer      │
 │   └────────────────────┘ │               │                          │
 │   ┌────────────────────┐ │ shared mem    │ pmu_aicpu_flush():       │
-│   │ poll thread        │ │ mapping       │   push remaining full    │
+│   │ collector shard    │ │ mapping       │   push remaining full    │
 │   │   read records via │<┼──────────────<│   buffers to ready_q     │
 │   │   host mapping     │ │               │                          │
 │   │   append to CSV    │ │               │                          │
 │   └────────────────────┘ │               └──────────────────────────┘
 │                          │
 │ stop()                   │
-│   join mgmt → join poll  │
+│   join mgmt → collectors │
 │ reconcile_counters()     │
 │ finalize()               │
 └──────────────────────────┘
@@ -278,13 +278,14 @@ PmuBuffer pool (rotated)                        (BUFFERS_PER_CORE per core)
 init_pmu()
   pmu_collector_.init(num_aicore, num_threads, csv_path, event_type, ...)
   kernel_args_.args.pmu_data_base = pmu_collector_.get_pmu_shm_device_ptr()
-start(tf)                       ← spawn mgmt thread (drains AICPU L1 ready
-                                  queue, recycles full buffers via
-                                  BufferPoolManager) + poll thread (drains
-                                  L2 hand-off, appends to CSV)
+start(tf)                       ← spawn split mgmt threads (drain AICPU ready
+                                  queues, refill free queues, and run
+                                  background replenish via BufferPoolManager)
+                                  + collector shards (drain host hand-off,
+                                  append to CSV)
 launch AICPU / AICore
 rtStreamSynchronize             ← wait for kernel completion
-stop()                          ← join mgmt → join poll
+stop()                          ← join mgmt/replenish → join collectors
 reconcile_counters()            ← assert collected + dropped == total;
                                   any non-empty current_buf_ptr is a
                                   flush bug, logged as ERROR
@@ -294,12 +295,12 @@ finalize(unregister, free)
 [`PmuCollector`](../src/a2a3/platform/include/host/pmu_collector.h)
 inherits from
 [`profiling_common::ProfilerBase<PmuCollector, PmuModule>`](../src/common/platform/include/host/profiler_base.h):
-the base class owns the mgmt thread, the poll thread, and the
+the base class owns split mgmt threads, collector shards, and the
 `BufferPoolManager<PmuModule>` they share. `PmuCollector` only supplies
 the PMU-specific pieces — the `PmuModule` trait that describes the
 shared-memory layout, an `init()` that allocates and pre-fills the free
 queues, an `on_buffer_collected()` callback that appends records to the
-CSV, and `reconcile_counters()` / `finalize()`. The mgmt/poll threading,
+CSV, and `reconcile_counters()` / `finalize()`. The mgmt/collector threading,
 buffer pooling, and `Module` trait pattern are shared with TensorDump
 and L2Swimlane — see [profiling-framework.md](../profiling-framework.md) for
 the framework reference.
@@ -317,9 +318,12 @@ a2a3). At shutdown, AICPU flushes any partially-filled buffers via
 `pmu_aicpu_flush_buffers()`.
 
 a5's `PmuCollector` derives from
-`ProfilerBase<PmuCollector, PmuModule>` and shares the mgmt + poll
-thread structure with a2a3. The single behavioral deviation from
-§5.2 is the **transport channel**: a5 has no `halHostRegister`, so
+`ProfilerBase<PmuCollector, PmuModule>` and uses the same framework
+abstractions as a2a3, including the same split mgmt + collector shard
+shape (`kMgmtDrainThreadCount` = `kCollectorThreadCount` =
+`PLATFORM_MAX_AICPU_THREADS`, i.e. 7 on a5 vs 4 on a2a3). The
+behavioral deviation from §5.2 is the **transport channel**: a5 has no
+`halHostRegister`, so
 each device buffer is paired with a host-shadow `malloc()` and the
 mgmt loop synchronizes the two via `profiling_copy.h` (`rtMemcpy`
 onboard, `memcpy` in sim). `MemoryOps` therefore carries five
@@ -483,7 +487,7 @@ device-side counters.
 | Counter readout | AICPU MMIO `read_reg` | AICore MMIO `ld_dev` |
 | Per-core staging | direct write into `records[count]` | dual-issue slots, AICPU commits on FIN |
 | Buffer model | rotating pool (free + ready queues, SPSC protocol) | identical |
-| Host threads | mgmt + poll, streams during execution | identical |
+| Host threads | split mgmt + collector shards, streams during execution | same split mgmt + collector shards (7 = `PLATFORM_MAX_AICPU_THREADS` vs a2a3's 4) |
 | Host-class shape | `ProfilerBase<PmuCollector, PmuModule>` | identical |
 | Host transport | `halHostRegister` shared memory | host-shadow `malloc` + per-tick `rtMemcpy`/`memcpy` |
 | `MemoryOps` callbacks | 3 (`alloc`, `reg`, `free_`) | 5 (+ `copy_to_device`, `copy_from_device`) |
@@ -499,9 +503,10 @@ counter-read code paths are skipped.
 When enabled, the dominant per-task overhead is the MMIO counter read
 (8 reads on a2a3, 10 on a5) plus a single record copy. On both
 architectures, streaming keeps host-side work off the critical path —
-the collector thread drains buffers concurrently with kernel execution.
-On a5 the copy hooks add `rtMemcpy` round-trips that a2a3's shared
-memory avoids, but these overlap with device execution.
+the collector shards drain buffers concurrently with kernel execution.
+Both a2a3 and a5 use split mgmt plus collector shards (a5 with 7 shards,
+a2a3 with 4). a5's copy hooks add `rtMemcpy` round-trips that a2a3's
+shared memory avoids, but these overlap with device execution.
 
 For meaningful per-task numbers on a2a3 the runtime collapses to
 single-issue dispatch automatically whenever `--enable-pmu` is set (see
diff --git a/docs/dfx/scope-stats.md b/docs/dfx/scope-stats.md
index 682760bd7..5970731be 100644
--- a/docs/dfx/scope-stats.md
+++ b/docs/dfx/scope-stats.md
@@ -331,7 +331,7 @@ ScopeStatsCollector                platform scope_stats_collector_aicpu.cpp
   set kernel_args fields             runtime: scope_stats_set_ring_capacity()
   launch kernel                      runtime: scope_stats_set_tensormap_capacity()
       │                                  │
-  poll thread:                       on PTO2_SCOPE begin/end:
+  collector shard(s):                on PTO2_SCOPE begin/end:
    append records to memory  ◀──┐      runtime samples task/heap/dep_pool/tensormap
       │                         │      runtime: scope_stats_begin()/end()
       │                         │         └─ emit record, append to buffer;
diff --git a/docs/profiling-framework.md b/docs/profiling-framework.md
index f6aa030b6..a3678b92f 100644
--- a/docs/profiling-framework.md
+++ b/docs/profiling-framework.md
@@ -1,7 +1,8 @@
 # Profiling Framework
 
-Shared host-side infrastructure that the PMU, L2Swimlane, TensorDump, and
-ScopeStats collectors are built on. The framework headers live in
+Shared host-side infrastructure that the PMU, L2Swimlane, DepGen,
+TensorDump, and ScopeStats collectors are built on. The framework headers
+live in
 [`src/common/platform/include/host/`](../src/common/platform/include/host/)
 and are consumed verbatim by both a2a3 and a5 collectors (PR #944
 unified the previously-divergent per-arch copies into one set). This page
@@ -11,62 +12,65 @@ the collectors themselves still carry.
 The per-collector pages
 ([pmu-profiling.md](dfx/pmu-profiling.md),
 [l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md),
+[dep_gen.md](dfx/dep_gen.md),
 [args-dump.md](dfx/args-dump.md),
 [scope-stats.md](dfx/scope-stats.md))
 describe the data each subsystem collects and how it enables it on-device.
 
 ## 1. Why a shared framework
 
-Each profiling subsystem on a2a3 needs the same plumbing on the host:
+Each profiling subsystem needs the same plumbing on the host:
 
-- A management thread that polls the AICPU's per-thread SPSC ready queues
+- A management path that polls the AICPU's per-thread SPSC ready queues
   and recycles full buffers back to the device while kernels are still
-  running.
-- A collector thread that drains the host-side hand-off queue and copies
+  running. A module may opt into split drain/refill threads plus a
+  replenish thread.
+- Collector thread shards that drain host-side hand-off queues and copy
   records out of each ready buffer.
 - A pool of pre-registered device buffers (allocated up-front, refilled on
-  demand) keyed by "kind" — PMU has 1 kind, TensorDump has 1, L2Swimlane has 2
-  (perf records + phase markers).
+  demand) keyed by "kind". PMU, DepGen, TensorDump, and ScopeStats have one
+  kind; L2Swimlane has four on a2a3 and two on a5.
 - A dev↔host pointer map so the management thread can resolve a device
   pointer popped off a ready queue to the host-mapped pointer the collector
   thread will read.
-- A teardown sequence that flushes both queues without losing late entries.
+- A teardown sequence that flushes the device queues and host shards without
+  losing late entries.
 
-Before unification this was three near-identical implementations. The
-framework collapses it to one control-flow implementation parameterized on
-a small per-subsystem trait.
+Before unification this was near-identical control flow repeated across
+collectors. The framework collapses it to one implementation parameterized
+on a small per-subsystem trait.
 
 ## 2. Layered view
 
 ```text
                 ┌──────────────────────────────────────────┐
-                │  PmuCollector / L2SwimlaneCollector /        │  Derived (CRTP)
-                │  TensorDumpCollector                     │  ─ on_buffer_collected
+                │  Pmu / L2Swimlane / DepGen / Dump /      │  Derived (CRTP)
+                │  Scope collectors                        │  ─ on_buffer_collected
                 └─────────────┬────────────────────────────┘  ─ kIdleTimeoutSec / kSubsystemName
                               │ public ProfilerBase<Derived, Module>
                 ┌─────────────▼────────────────────────────┐
                 │  ProfilerBase<Derived, Module>           │  Thread orchestration
-                │  ─ owns mgmt thread + collector thread   │  ─ start/stop lifecycle
+                │  ─ owns mgmt + collector thread(s)       │  ─ start/stop lifecycle
                 │  ─ runs ProfilerAlgorithms<Module>       │  ─ consume → notify_copy_done
                 └─────────────┬────────────────────────────┘
                               │ has-a
                 ┌─────────────▼────────────────────────────┐
                 │  BufferPoolManager<Module>               │  Data structures (no threads)
-                │  ─ ready_queue / done_queue              │  ─ recycled pools (per kind)
+                │  ─ ready/done queue shards               │  ─ recycled pools (per kind)
                 │  ─ alloc_and_register / resolve_host_ptr │  ─ MemoryOps (type-erased)
                 └──────────────────────────────────────────┘
                               ▲
                               │ Module trait wires layout into algorithms
               ┌───────────────┴────────────────┐
-              │  PmuModule / L2SwimlaneModule /    │  Pure static trait (no state)
-              │  DumpModule                    │  ─ DataHeader / ReadyEntry / FreeQueue
+              │  Pmu / L2Swimlane / DepGen /      │  Pure static trait (no state)
+              │  Dump / Scope modules             │  ─ DataHeader / ReadyEntry / FreeQueue
               └────────────────────────────────┘  ─ kBufferKinds / kReadyQueueSize
                                                   ─ resolve_entry / for_each_instance
 ```
 
 `ProfilerBase` is the owner: it holds `BufferPoolManager manager_` as a
-member ([profiler_base.h:414](../src/common/platform/include/host/profiler_base.h#L414)),
-spawns and joins both threads, and dispatches collected buffers to
+member, spawns and joins the mgmt / collector threads, and dispatches
+collected buffers to
 `Derived::on_buffer_collected` via CRTP. `BufferPoolManager` owns no
 threads — it is just the shared data structure both threads access.
 `Module` is a stateless trait that tells the generic algorithms how the
@@ -79,17 +83,20 @@ subsystem's shared-memory layout is shaped.
 Defined in [`buffer_pool_manager.h`](../src/common/platform/include/host/buffer_pool_manager.h).
 Owns:
 
-- `ready_queue_` — mgmt → collector hand-off, guarded by mutex+cv.
-- `done_queue_` — collector → mgmt recycle channel, guarded by mutex.
-- `recycled_[kind]` — per-kind pool of free device buffers (mgmt-only).
+- `ready_shards_` — mgmt → collector hand-off shards, each guarded by
+  mutex+cv.
+- `done_shards_` — collector → mgmt recycle shards, each guarded by mutex.
+- `recycled_[shard][kind]` — shard-local pool of free device buffers,
+  guarded by one mutex per shard/kind.
 - `dev_to_host_` — single source of truth for `resolve_host_ptr`.
 - `MemoryOps` — type-erased `alloc / reg / free_` callbacks, plus the
   `shared_mem_host` and `device_id` stashed once at start.
 
 Owns no threads. Every entry point is documented as one of:
 
-- mgmt-only (recycled pool ops, `drain_done_into_recycled`),
-- collector-only (`notify_copy_done`),
+- mgmt-only or internally locked (`drain_done_into_recycled`, recycled
+  pool ops),
+- collector-only (`notify_copy_done`, one shard per collector),
 - shared with internal locking (`push_to_ready` / `wait_pop_ready` /
   `try_pop_ready`),
 - start/stop-only (`set_memory_context`, `release_owned_buffers`,
@@ -100,28 +107,39 @@ Owns no threads. Every entry point is documented as one of:
 Defined in [`profiler_base.h`](../src/common/platform/include/host/profiler_base.h).
 Provides:
 
-- The two threads and their lifecycle (`start` / `stop`).
-- `mgmt_loop` — drains `done_queue` → recycled, polls every AICPU
-  per-thread ready queue (bounded by `PLATFORM_MAX_AICPU_THREADS`),
-  invokes `ProfilerAlgorithms<Module>::process_entry` per popped entry,
-  and tops up free queues with `proactive_replenish`.
-- `poll_and_collect_loop` — `wait_pop_ready` with a 100 ms cv tick,
-  dispatches to `Derived::on_buffer_collected`, then calls
+- The mgmt thread(s), collector thread(s), and their lifecycle (`start` /
+  `stop`).
+- Split mgmt threads — `mgmt_drain_loop` drains ready queues and refills the
+  originating free queue from the current drain shard's local recycled pool
+  (`ProfilerAlgorithms<Module>::process_entry` per popped entry), while
+  `mgmt_replenish_loop` only drains done buffers into shard-local recycled
+  pools. A one-shot `proactive_replenish` seeds every free queue before the
+  threads start. Split drain threads do not bulk-mirror the whole
+  shared-memory region; they refresh only their queue indices / entries
+  before advancing `queue_heads`. On an empty scan, split drain does a short
+  busy-poll window before falling back to the 10 us sleep, so micro-bursts
+  are less likely to miss AICPU's bounded wait window.
+- Optional collector sharding (`Module::kCollectorThreadCount`) — each
+  collector drains one host ready shard and returns finished buffers through
+  the matching done shard.
+- `poll_and_collect_loop` — per-shard `wait_pop_ready` with a 100 ms cv
+  tick, dispatches to `Derived::on_buffer_collected`, then calls
   `manager_.notify_copy_done(...)` itself; idle-timeout hang detector.
 - `set_memory_context` / `clear_memory_context` so `Derived::init` can
   stash the alloc/reg/free callbacks before threads start; if init aborts
   before stashing, `start(tf)` becomes a no-op.
 
-`ProfilerAlgorithms<Module>` (in the same header, [profiler_base.h:170](../src/common/platform/include/host/profiler_base.h#L170))
+`ProfilerAlgorithms<Module>` (in the same
+[profiler_base.h](../src/common/platform/include/host/profiler_base.h))
 is where the unified algorithms live:
 
 - `try_pop_aicpu_entry` — barrier-correct head/tail advance over the
   per-thread ready queue, with a range-check guard against device-side
   corruption.
-- `process_entry` — three-level fallback (recycled → drain done → alloc)
-  to refill the originating free_queue with **exactly one** buffer per
-  popped entry, then resolve host_ptr and push to ready. The 1-in/1-out
-  ratio bounds per-tick latency.
+- `process_entry` — shard-local fallback (local recycled → local done →
+  other recycled shard → alloc) to refill the originating free_queue until
+  it is full or no buffer is available, then resolve host_ptr and push to
+  ready.
 - `proactive_replenish` — drain done, then top every (kind, instance)
   free queue up to `kSlotCount`, batch-allocating `batch_size(kind)`
   buffers when the recycled pool of a kind drains mid-fill so recovery
@@ -130,17 +148,21 @@ is where the unified algorithms live:
 ### 3.3 `Module` — trait layer
 
 A stateless `struct` per subsystem (`PmuModule`, `L2SwimlaneModule`,
-`DumpModule`) that tells the generic algorithms what the shared-memory
-layout looks like. The contract lives in the docblock at the top of
+`DepGenModule`, `DumpModule`, `ScopeStatsModule`) that tells the generic
+algorithms what the shared-memory layout looks like. The contract lives in the
+docblock at the top of
 [`profiler_base.h`](../src/common/platform/include/host/profiler_base.h);
 the required members are:
 
 | Member | Purpose |
 | ------ | ------- |
 | `using DataHeader / ReadyEntry / ReadyBufferInfo / FreeQueue` | Layout types |
-| `kBufferKinds` (PMU=1, Dump=1, L2Swimlane=2) | Number of per-kind recycled pools |
+| `kBufferKinds` | Number of buffer kinds inside each recycled shard |
 | `kReadyQueueSize`, `kSlotCount` | AICPU ready queue / free queue depth |
 | `kSubsystemName` | Tag used in framework log lines |
+| `kMgmtDrainThreadCount` | Optional; number of mgmt drain shards (defaults to 1) |
+| `kCollectorThreadCount` | Optional; number of collector / host ready-queue shards |
+| `refresh_replenish_metadata(mgr, header)` | Optional hook to refresh cached queue metadata before a replenish pass |
 | `header_from_shm(void*) → DataHeader*` | Cast shared-memory base to header |
 | `batch_size(int kind) → int` | Per-kind batch-alloc count |
 | `resolve_entry(shm, header, q, entry) → optional<EntrySite>` | Map a popped ready entry to (kind, free_queue, buffer_size, partial info); return `nullopt` to drop |
@@ -150,7 +172,10 @@ the required members are:
 The Module structs are defined alongside their collectors in
 [pmu_collector.h](../src/a2a3/platform/include/host/pmu_collector.h),
 [l2_swimlane_collector.h](../src/a2a3/platform/include/host/l2_swimlane_collector.h),
-and [tensor_dump_collector.h](../src/a2a3/platform/include/host/tensor_dump_collector.h)
+[dep_gen_collector.h](../src/a2a3/platform/include/host/dep_gen_collector.h),
+[tensor_dump_collector.h](../src/common/platform/include/host/tensor_dump_collector.h),
+and
+[scope_stats_collector.h](../src/common/platform/include/host/scope_stats_collector.h)
 — each is a few dozen lines of static methods over the subsystem's own
 `DataHeader` / ringbuffer types.
 
@@ -178,34 +203,35 @@ and only has to provide:
 ## 4. End-to-end data flow
 
 ```text
-  AICPU                       mgmt thread                       collector thread
-  ─────                       ───────────                       ────────────────
+  AICPU                       mgmt thread(s)                    collector shard(s)
+  ─────                       ──────────────                    ──────────────────
   write record into         drain_done_into_recycled
   current free buffer       ──────────────────────────►
                             try_pop_aicpu_entry(q)
                             process_entry:
-                              pop_recycled / alloc_and_register
-                                (refill originating free_queue, 1-in/1-out)
+                              pop local recycled / local done / alloc
+                                (top up originating free_queue)
                               resolve_host_ptr
-                              push_to_ready ──────────────────► wait_pop_ready
+                              push_to_ready(shard q) ─────────► wait_pop_ready(q)
                                                                 Derived::on_buffer_collected
                                                                   (copy records out)
-                                                                notify_copy_done
-                            ◄────────────────────────────────── (done_queue)
+                                                                notify_copy_done(q)
+                            ◄────────────────────────────────── done shard q
                             (next tick) drain into recycled
 
                                           ▲
                                           │
-                            proactive_replenish: top every
-                            free_queue up to kSlotCount;
-                            batch-alloc when a kind drains.
+                            split runtime replenish:
+                            drain done into shard-local
+                            recycled pools only.
 ```
 
-Both queues plus the per-kind recycled pools and the dev↔host map all
+The queue shards plus the shard-local recycled pools and the dev↔host map all
 live in the single `BufferPoolManager` instance owned by `ProfilerBase`.
-The mgmt thread is the only writer to the ready queue; the collector
-thread is the only writer to the done queue. Recycled pools are
-mgmt-only.
+Each ready shard has one collector consumer; each done shard is written by
+its matching collector and drained into the same recycled shard. Split drain
+refills the originating free queue on the hot path; split replenish does not
+write free queues at runtime.
 
 ## 5. Lifecycle
 
@@ -219,17 +245,17 @@ mgmt-only.
     assemble MemoryOps from stashed callbacks (sim mode installs an
       identity reg wrapper so register == nullptr is supported uniformly)
     manager_.set_memory_context(ops, shm_host, device_id)
-    spawn mgmt thread       ← started first; mgmt is the only writer to L2
-    spawn collector thread
+    spawn mgmt thread(s)    ← started first; mgmt writes host ready shards
+    spawn collector thread(s)
 
     ... AICPU / AICore execute ...
 
   ProfilerBase::stop()
     mgmt_running_ = false
-    join mgmt thread        ← mgmt's final-drain pass flushes the last
-                              entries into ready_queue before exiting
+    join mgmt thread(s)     ← mgmt final-drain flushes the last entries into
+                              ready shards before exiting
     execution_complete_ = true
-    join collector thread   ← drains ready_queue once more, then exits
+    join collector thread(s)← each shard drains once more, then exits
 
   Derived::finalize(unregister, free)
     manager_.release_owned_buffers([&](void* p) { unregister + free })
@@ -240,9 +266,9 @@ mgmt-only.
 
 The order in `stop()` is load-bearing: mgmt is joined **before**
 `execution_complete_` is signalled so its final-drain output has a
-consumer; the collector then drains and exits. On return both queues are
-empty and `on_buffer_collected` has been called for every entry that was
-in either queue.
+consumer; collectors then drain and exit. On return all host shards are
+empty and `on_buffer_collected` has been called for every entry that was in
+any shard.
 
 `Derived::finalize` is responsible for the buffers the collector still
 owns at stop time (`free_queue` slots and `current_buf_ptr`); the
@@ -255,19 +281,28 @@ mid-run by the framework.
 
 | State | Reader(s) | Writer(s) | Synchronization |
 | ----- | --------- | --------- | --------------- |
-| `ready_queue_` | collector | mgmt | `ready_mutex_` + `ready_cv_` |
-| `done_queue_` | mgmt | collector | `done_mutex_` |
-| `recycled_[kind]` | mgmt | mgmt | none (single-threaded access) |
-| `dev_to_host_` | mgmt (`alloc_and_register`, `resolve_host_ptr`) | mgmt | none during run; collector touches it only in `release_owned_buffers` / `clear_mappings`, after `stop()` has joined mgmt |
+| `ready_shards_[q]` | collector q | mgmt drain q | shard mutex + cv |
+| `done_shards_[q]` | mgmt / replenish | collector q | shard mutex |
+| `recycled_[shard][kind]` | drain shard / replenish | drain shard / replenish | shard/kind mutex |
+| `dev_to_host_` | mgmt (`alloc_and_register`, `resolve_host_ptr`) | mgmt | `mapping_mutex_`; collector touches it only in `release_owned_buffers` / `clear_mappings`, after `stop()` has joined mgmt |
 | `MemoryOps` / `shared_mem_host_` / `device_id_` | both threads | start-only | `set_memory_context` is called once before threads spawn; read-only afterwards |
-| AICPU per-thread ready queues (`header->queues[q]`) | mgmt (head advance) | AICPU (tail advance) | `rmb` / `wmb` paired with AICPU writers |
-| Per-instance `FreeQueue` | AICPU (head advance) | mgmt (tail advance) | `rmb` / `wmb` paired with AICPU readers |
+| AICPU per-thread ready queues (`header->queues[q]`) | mgmt (head advance) | AICPU (tail advance) | `read_range_from_device` in split drain, then `write_range_to_device` for `queue_heads[q]` |
+| Per-instance `FreeQueue` | AICPU (head advance) | mgmt (tail advance) | per-free-queue writer lock; host refreshes `head` before writing `buffer_ptrs[]` / `tail` |
 
 Two things follow:
 
-- `dev_to_host_` is unlocked because mgmt owns it during the run and the
-  collector only touches it when mgmt is joined. Adding a collector path
-  that mutates the map mid-run would require revisiting this.
+- `dev_to_host_` has a narrow mapping lock; recycled pools are split by
+  collector shard and kind so the hot drain/refill path mostly stays local.
+- Device-side queue backpressure is bounded for the profiling writers that
+  use this protocol. If the host does not make ready-queue space or
+  free-queue entries visible within the short wait budget, AICPU records a
+  drop and keeps the workload moving instead of spinning indefinitely.
+- The AICPU writer publishes a full buffer to the ready queue before
+  acquiring its replacement buffer. If no replacement is visible yet, the
+  current pointer is cleared and later records first try to recover from
+  the free queue before counting a per-record drop. This matters under a
+  one-buffer stress shape: the host cannot return a replacement until it
+  first observes the full ready buffer.
 - The mgmt thread must never zero AICPU-owned fields (`count`, `head`,
   `tail` on the AICPU side). The AICPU is the sole writer to those and
   resets them itself on flush/drop/pop.
@@ -295,11 +330,15 @@ Existing collectors are the canonical examples:
 
 - [`PmuCollector`](../src/a2a3/platform/include/host/pmu_collector.h)
   — single kind, per-core instances. See [pmu-profiling.md](dfx/pmu-profiling.md).
-- [`TensorDumpCollector`](../src/a2a3/platform/include/host/tensor_dump_collector.h)
+- [`DepGenCollector`](../src/a2a3/platform/include/host/dep_gen_collector.h)
+  — single kind, one instance. See [dep_gen.md](dfx/dep_gen.md).
+- [`TensorDumpCollector`](../src/common/platform/include/host/tensor_dump_collector.h)
   — single kind, per-AICPU-thread instances. See [args-dump.md](dfx/args-dump.md).
+- [`ScopeStatsCollector`](../src/common/platform/include/host/scope_stats_collector.h)
+  — single kind, one instance. See [scope-stats.md](dfx/scope-stats.md).
 - [`L2SwimlaneCollector`](../src/a2a3/platform/include/host/l2_swimlane_collector.h)
-  — two kinds (perf records + phase markers), per-core / per-thread
-  instances; the canonical multi-kind example. See
+  — four kinds (AICPU task, scheduler phase, orchestrator phase, AICore
+  task), per-core / per-thread instances; the canonical multi-kind example. See
   [l2-swimlane-profiling.md](dfx/l2-swimlane-profiling.md).
 
 ## 8. a5 specifics — host-shadow transport
@@ -332,8 +371,9 @@ changes capture that:
    **not** called from the mgmt loop — it would race with AICPU writes
    to device-only fields (`current_buf_ptr`, `total/dropped/mismatch`
    counters, `queue_tails`, `free_queue.head`,
-   `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them back
-   to whatever the host shadow had at the start of the tick. Per-buffer payloads (`L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` /
+   `L2SwimlaneAicpuPhaseHeader::magic`, `core_to_thread[]`), rolling them
+   back to whatever the host shadow had at the start of the tick. Per-buffer
+   payloads (`L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` /
    `DumpMetaBuffer`) are still pulled on demand inside
    `ProfilerAlgorithms::process_entry` after resolving the host pointer
    for a popped ready entry. The bulk `mirror_shm_to_device` is kept
@@ -389,7 +429,7 @@ rotating `L2SwimlaneAicpuTaskBuffer` / `PmuBuffer` flips — flipping is now
 fully internal to `*_complete_record` and never crosses into Handshake.
 
 Everything else — Module concept contract, alloc policy
-(1-in/1-out + proactive replenish), `kIdleTimeoutSec` / `kSubsystemName`
+(drain-shard top-up + proactive replenish), `kIdleTimeoutSec` / `kSubsystemName`
 contract, mgmt-then-poll start/stop ordering, buffer-pool sizing
 constants — matches a2a3 exactly. New collectors should be reviewed
 against both arches when added.
diff --git a/src/a2a3/platform/include/common/platform_config.h b/src/a2a3/platform/include/common/platform_config.h
index 757db37a5..78c7a88b9 100644
--- a/src/a2a3/platform/include/common/platform_config.h
+++ b/src/a2a3/platform/include/common/platform_config.h
@@ -129,13 +129,13 @@ constexpr int PLATFORM_PROF_SLOT_COUNT = 4;
 
 /**
  * L2SwimlaneAicpuTaskBuffer pre-allocation count per AICore.
- * 1 goes into the free_queue at init, the rest into the recycled pool.
+ * Up to PLATFORM_PROF_SLOT_COUNT go into the free_queue at init, the rest into the recycled pool.
  */
 constexpr int PLATFORM_PROF_BUFFERS_PER_CORE = 8;
 
 /**
  * L2SwimlaneAicoreTaskBuffer pre-allocation count per AICore (AICore-as-producer pool).
- * 1 goes into the free_queue at init, the rest into the recycled pool.
+ * Up to PLATFORM_PROF_SLOT_COUNT go into the free_queue at init, the rest into the recycled pool.
  * Mirrors PLATFORM_PROF_BUFFERS_PER_CORE in role; smaller because AICore records
  * are slim (32 B each) and the buffer is also smaller per the rotation design.
  */
@@ -144,8 +144,8 @@ constexpr int PLATFORM_AICORE_BUFFERS_PER_CORE = 4;
 /**
  * Host preallocation count per AICPU thread for the two phase pools, split per
  * kind (sched vs orch) because their throughput is asymmetric — a single shared
- * value over-provisions the lighter one. 1 buffer seeds the free_queue at init,
- * the rest the recycled pool.
+ * value over-provisions the lighter one. Up to PLATFORM_PROF_SLOT_COUNT buffers
+ * seed the free_queue at init, and the rest seed the recycled pool.
  *
  * Floor for both: SLOT_COUNT(4) + 1 = 5 (free_queue fillable + 1 active buffer).
  * Pure host preallocation — zero ABI (the device-visible ready_queue is decoupled
diff --git a/src/a2a3/platform/include/host/dep_gen_collector.h b/src/a2a3/platform/include/host/dep_gen_collector.h
index 789e695e2..e5f86a89d 100644
--- a/src/a2a3/platform/include/host/dep_gen_collector.h
+++ b/src/a2a3/platform/include/host/dep_gen_collector.h
@@ -16,16 +16,17 @@
  *
  * Architecture:
  * - BufferPoolManager<DepGenModule>: shared mgmt-thread infrastructure that
- *   polls the per-thread ready queue, drains the done_queue, and replenishes
- *   the (single instance's) free_queue from a unified recycled pool.
- * - DepGenCollector: collector thread pops full DepGenBuffers from the manager
- *   and appends their DepGenRecords to a binary file (submit_trace.bin).
+ *   polls per-thread ready queues, drains done-queue shards, and replenishes
+ *   the single instance's free_queue from shard-local recycled pools.
+ * - DepGenCollector: collector thread shards pop full DepGenBuffers from the
+ *   manager and append their DepGenRecords to a binary file
+ *   (submit_trace.bin).
  *
  * Lifecycle:
  *   init()                       — Allocate header + 1 BufferState + N DepGenBuffers
  *                                  (pre-fills free_queue; surplus → recycled pool).
  *                                  Calls set_memory_context() on the base.
- *   start(tf)                    — Inherited: launches mgmt + poll threads.
+ *   start(tf)                    — Inherited: launches mgmt + collector threads.
  *   [device execution]
  *   stop()                       — Inherited: drain queues, join threads.
  *   reconcile_counters()         — Sanity-check current_buf_ptr is cleared by
@@ -64,7 +65,7 @@
 // ---------------------------------------------------------------------------
 
 /**
- * Internal hand-off struct delivered from the mgmt thread to the collector.
+ * Internal hand-off struct delivered from a drain thread to a collector shard.
  * thread_index identifies the AICPU thread queue the entry was popped from
  * (always equal to the orchestrator thread index, since dep_gen is single-
  * instance — exposed for symmetry with PmuReadyBufferInfo).
@@ -87,6 +88,8 @@ struct DepGenModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_DEP_GEN_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_DEP_GEN_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "DepGenModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * Buffers grown by proactive_replenish are batch-allocated up to the
@@ -104,7 +107,18 @@ struct DepGenModule {
      * resets it itself on flush/drop/pop.
      */
     static std::optional<profiling_common::EntrySite<DepGenModule>>
-    resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) {
+    resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) {
+        if (shm == nullptr || header == nullptr) {
+            LOG_ERROR("DepGenModule: invalid shared memory/header while resolving ready entry");
+            return std::nullopt;
+        }
+        if (header->num_instances != 1 || entry.instance_index >= header->num_instances) {
+            LOG_ERROR(
+                "DepGenModule: invalid ready entry instance=%u (num_instances=%u)", entry.instance_index,
+                header->num_instances
+            );
+            return std::nullopt;
+        }
         DepGenBufferState *state = get_dep_gen_buffer_state(shm, static_cast<int>(entry.instance_index));
         profiling_common::EntrySite<DepGenModule> site;
         site.kind = 0;
diff --git a/src/a2a3/platform/include/host/l2_swimlane_collector.h b/src/a2a3/platform/include/host/l2_swimlane_collector.h
index c7297d0e0..b8bd2bb9b 100644
--- a/src/a2a3/platform/include/host/l2_swimlane_collector.h
+++ b/src/a2a3/platform/include/host/l2_swimlane_collector.h
@@ -16,9 +16,9 @@
  * Architecture:
  * - BufferPoolManager<L2SwimlaneModule>: shared mgmt-thread infrastructure that polls
  *   the AICPU ready queue, replenishes per-core / per-thread free queues, and
- *   hands full buffers off to the collector thread.
- * - L2SwimlaneCollector: main thread copies records from the manager's ready queue
- *   into host vectors and exports the swimlane visualization.
+ *   hands full buffers off to collector thread shards.
+ * - L2SwimlaneCollector: collector thread shards copy records from manager ready queues
+ *   into host vectors; the owner thread exports the swimlane visualization after stop().
  *
  * Memory operations are injected through callbacks for sim/onboard portability.
  */
@@ -27,8 +27,11 @@
 #define SRC_A2A3_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
 
 #include <atomic>
+#include <array>
+#include <cstddef>
 #include <cstdint>
 #include <functional>
+#include <mutex>
 #include <string>
 #include <thread>
 #include <vector>
@@ -87,6 +90,8 @@ struct L2SwimlaneModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "L2SwimlaneModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * batch_size for proactive_replenish's alloc fallback. Sized so that a
@@ -121,6 +126,13 @@ struct L2SwimlaneModule {
 
     static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); }
 
+    template <typename Mgr>
+    static void refresh_replenish_metadata(Mgr &mgr, DataHeader *header) {  // re-read phase-thread counts via mgr
+        mgr.read_range_from_device(&header->num_sched_phase_threads, sizeof(header->num_sched_phase_threads));
+        mgr.read_range_from_device(&header->num_orch_phase_threads, sizeof(header->num_orch_phase_threads));
+        rmb();  // NOTE(review): presumably a read barrier so later header reads see the refreshed counts - confirm
+    }
+
     /**
      * Branch on entry.kind to pick the per-core task state, per-thread sched-
      * or orch-phase state, or per-core AICore state. Returns nullopt for
@@ -263,13 +275,11 @@ using L2SwimlaneFreeCallback = profiling_common::ProfFreeCallback;
  *   1. initialize()                — allocate shared memory, pre-fill free_queues,
  *                                    hand the memory context to the base via
  *                                    set_memory_context().
- *   2. start(tf)                   — inherited from ProfilerBase: assembles a
- *                                    MemoryOps from the stashed callbacks and
- *                                    launches the mgmt + poll threads.
+ *   2. start(tf)                   — inherited from ProfilerBase; launches
+ *                                    drain/refill, replenish, and collector threads.
  *   3. ... device execution ...
- *   4. stop()                      — joins both threads in the correct order
- *                                    (mgmt first so its final-drain entries
- *                                    have a consumer).
+ *   4. stop()                      — joins drain/refill and replenish before
+ *                                    letting collector threads exit.
  *   5. read_phase_header_metadata() — single-shot read of the core→thread
  *                                    mapping from L2SwimlaneDataHeader.
  *   6. reconcile_counters()        — device-side three-bucket accounting for
@@ -329,7 +339,7 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
     );
 
     /**
-     * Per-buffer callback invoked by ProfilerBase's poll loop. Dispatches on
+     * Per-buffer callback invoked by ProfilerBase's collector loop. Dispatches on
      * info.type to copy either an L2SwimlaneAicpuTaskBuffer (PERF_RECORD) into the per-core
      * record vector, or a L2SwimlaneAicpuSchedPhaseBuffer / L2SwimlaneAicpuOrchPhaseBuffer into the per-thread
      * phase-record vector.
@@ -459,15 +469,20 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
     // orch records (kind-tagged at routing time; no parse-time discrimination).
     std::vector<std::vector<L2SwimlaneAicpuSchedPhaseRecord>> collected_sched_phase_records_;
     std::vector<std::vector<L2SwimlaneAicpuOrchPhaseRecord>> collected_orch_phase_records_;
-    bool has_phase_data_{false};
+    std::atomic<bool> has_phase_data_{false};
 
     // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned)
     std::vector<int8_t> core_to_thread_;
 
     // Running totals used at reconcile time to cross-check device-side counters.
-    uint64_t total_perf_collected_{0};
-    uint64_t total_sched_phase_collected_{0};
-    uint64_t total_orch_phase_collected_{0};
+    std::atomic<uint64_t> total_perf_collected_{0};
+    std::atomic<uint64_t> total_sched_phase_collected_{0};
+    std::atomic<uint64_t> total_orch_phase_collected_{0};
+
+    std::array<std::mutex, PLATFORM_MAX_CORES> perf_record_mutexes_;
+    std::array<std::mutex, PLATFORM_MAX_CORES> aicore_record_mutexes_;
+    std::array<std::mutex, PLATFORM_MAX_AICPU_THREADS> sched_phase_record_mutexes_;
+    std::array<std::mutex, PLATFORM_MAX_AICPU_THREADS> orch_phase_record_mutexes_;
 
     // Allocate a single buffer (any of the L2SwimlaneAicpu*Buffer kinds) and register it.
     // The RAII counterpart ``release_one_buffer`` lives on ProfilerBase and
diff --git a/src/a2a3/platform/include/host/pmu_collector.h b/src/a2a3/platform/include/host/pmu_collector.h
index 30b464aec..b1742790a 100644
--- a/src/a2a3/platform/include/host/pmu_collector.h
+++ b/src/a2a3/platform/include/host/pmu_collector.h
@@ -14,11 +14,11 @@
  * @brief Host-side PMU buffer allocation, streaming collection, and CSV export.
  *
  * Architecture:
- * - BufferPoolManager<PmuModule>: shared mgmt-thread infrastructure that polls
- *   per-thread DumpReadyQueues, drains the done_queue, and replenishes the
+ * - BufferPoolManager<PmuModule>: shared split-mgmt infrastructure that polls
+ *   per-thread ready queues, drains done-queue shards, and replenishes the
  *   per-core free_queues from a unified recycled pool.
- * - PmuCollector: collector thread pops full PmuBuffers from the manager
- *   and appends them to the CSV file.
+ * - PmuCollector: collector thread shards pop full PmuBuffers from the manager
+ *   and append them to the CSV file.
  *
  * Lifecycle:
  *   init()                       — Allocate header + per-core states + PmuBuffers
@@ -27,12 +27,12 @@
  *                                  on the base so start(tf) can launch threads.
  *   start(tf)                    — Inherited from ProfilerBase: assembles
  *                                  MemoryOps from the stashed callbacks and
- *                                  launches the mgmt + poll threads.
+ *                                  launches the mgmt + collector threads.
  *   [device execution]
- *   stop()                       — Stop mgmt → join mgmt → signal poll →
- *                                  drain L2 → join poll, in that order. On
- *                                  return both thread exits and queue drains
- *                                  are complete.
+ *   stop()                       — Stop mgmt → join mgmt → signal collectors →
+ *                                  drain ready shards → join collectors, in
+ *                                  that order. On return both thread exits and
+ *                                  queue drains are complete.
  *   reconcile_counters()         — Sanity-check PmuBufferState::current_buf_ptr
  *                                  (any non-zero pointer with records is a
  *                                  device-flush bug, logged as ERROR) and run
@@ -78,9 +78,8 @@
  */
 
 /**
- * Internal hand-off struct delivered from the mgmt thread to the collector.
- * thread_index is the logical AICPU thread queue the entry was popped from,
- * passed through by ProfilerBase's mgmt loop.
+ * Internal hand-off struct delivered from a drain thread to a collector shard.
+ * thread_index is the logical AICPU thread queue the entry was popped from.
  */
 struct PmuReadyBufferInfo {
     uint32_t core_index;
@@ -100,6 +99,8 @@ struct PmuModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_PMU_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_PMU_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "PmuModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * Buffers grown by proactive_replenish are batch-allocated up to the
@@ -118,7 +119,18 @@ struct PmuModule {
      * resets it itself on flush/drop/pop.
      */
     static std::optional<profiling_common::EntrySite<PmuModule>>
-    resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) {
+    resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) {
+        if (shm == nullptr || header == nullptr) {
+            LOG_ERROR("PmuModule: invalid shared memory/header while resolving ready entry");
+            return std::nullopt;
+        }
+        if (entry.core_index >= header->num_cores || entry.core_index >= static_cast<uint32_t>(PLATFORM_MAX_CORES)) {
+            LOG_ERROR(
+                "PmuModule: invalid ready entry core=%u (num_cores=%u, max=%u)", entry.core_index, header->num_cores,
+                static_cast<uint32_t>(PLATFORM_MAX_CORES)
+            );
+            return std::nullopt;
+        }
         PmuBufferState *state = get_pmu_buffer_state(shm, static_cast<int>(entry.core_index));
         profiling_common::EntrySite<PmuModule> site;
         site.kind = 0;
diff --git a/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp
index 9b934a2f4..e2db5c4a1 100644
--- a/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp
+++ b/src/a2a3/platform/shared/aicpu/dep_gen_collector_aicpu.cpp
@@ -20,15 +20,17 @@
  *   - Host pushes free DepGenBuffers via free_queue.
  *   - AICPU pops when current buffer fills; pushes full buffer to per-thread
  *     ready_queue (indexed by orch_thread_idx).
- *   - On free_queue empty or ready_queue full: overwrite current buffer
- *     (record dropped_record_count, keep AICPU alive). Host reads dropped
- *     at finalize to decide whether to emit deps.json.
+ *   - Full buffers are published before AICPU tries to recover a replacement.
+ *     If recovery is delayed, later records are counted as dropped until host
+ *     replenishes free_queue. Host reads dropped at finalize to decide whether
+ *     to emit deps.json.
  */
 
 #include "aicpu/dep_gen_collector_aicpu.h"
 
 #include <cstring>
 
+#include "aicpu/device_time.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
@@ -41,6 +43,9 @@ static DepGenDataHeader *s_dep_gen_header = nullptr;
 static DepGenBufferState *s_dep_gen_state = nullptr;
 static int s_orch_thread_idx = -1;  // set via dep_gen_aicpu_set_orch_thread_idx
 
+static constexpr uint64_t kDepGenQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kDepGenQueueBackpressurePollMask = 1023;  // elapsed time is checked once per 1024 spins
+
 extern "C" void set_platform_dep_gen_base(uint64_t dep_gen_data_base) { g_platform_dep_gen_base = dep_gen_data_base; }
 
 extern "C" uint64_t get_platform_dep_gen_base() { return g_platform_dep_gen_base; }
@@ -56,26 +61,74 @@ void dep_gen_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thr
 // ---------------------------------------------------------------------------
 
 static int enqueue_dep_gen_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) {
-    if (s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+    if (s_dep_gen_header == nullptr || s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
         return -1;
     }
     int q = s_orch_thread_idx;
     uint32_t capacity = PLATFORM_DEP_GEN_READYQUEUE_SIZE;
-    uint32_t current_tail = s_dep_gen_header->queue_tails[q];
-    uint32_t current_head = s_dep_gen_header->queue_heads[q];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
+    const uint64_t start = get_sys_cnt_aicpu();  // reference point for the bounded wait below
+    uint32_t spins = 0;
+
+    do {
+        current_tail = s_dep_gen_header->queue_tails[q];
+        current_head = s_dep_gen_header->queue_heads[q];
+        uint32_t next_tail = (current_tail + 1) % capacity;
+        if (next_tail != current_head) {  // next_tail == current_head is the queue-full condition
+            break;
+        }
+        if ((++spins & kDepGenQueueBackpressurePollMask) == 0 &&  // read the clock every (mask + 1) spins
+            get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) {
+            return -1;  // still full after the wait budget: give up
+        }
+    } while (true);
 
     uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
-        return -1;  // Queue full
-    }
-
     s_dep_gen_header->queues[q][current_tail].instance_index = 0;
     s_dep_gen_header->queues[q][current_tail].buffer_ptr = buffer_ptr;
     s_dep_gen_header->queues[q][current_tail].buffer_seq = buffer_seq;
+    wmb();  // publish: entry fields visible before the tail advance
     s_dep_gen_header->queue_tails[q] = next_tail;
     return 0;
 }
 
+// Pop a replacement buffer from free_queue and install it as the current buffer, stamped with
+// next_seq. Spins while the queue is empty until kDepGenQueueBackpressureWaitCycles has elapsed.
+// Returns nullptr if no state is attached, the wait times out, or the popped slot holds 0.
+static DepGenBuffer *try_pop_dep_gen_buffer(uint32_t next_seq) {
+    if (s_dep_gen_state == nullptr) {
+        return nullptr;
+    }
+    const uint64_t start = get_sys_cnt_aicpu();
+    uint32_t spins = 0;
+    uint32_t head = 0;
+    while (true) {
+        head = s_dep_gen_state->free_queue.head;
+        if (head != s_dep_gen_state->free_queue.tail) {
+            break;
+        }
+        if ((++spins & kDepGenQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) {
+            return nullptr;
+        }
+    }
+    rmb();  // acquire: order the tail read before the buffer_ptrs read below
+    uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT];
+    rmb();  // slot read must complete before the head store hands the slot back to the host
+    s_dep_gen_state->free_queue.head = head + 1;
+    if (new_buf_ptr == 0) {
+        return nullptr;
+    }
+    DepGenBuffer *new_buf = reinterpret_cast<DepGenBuffer *>(new_buf_ptr);
+    new_buf->count = 0;
+    wmb();  // publish: count reset visible before the buffer becomes current
+    s_dep_gen_state->current_buf_ptr = new_buf_ptr;
+    s_dep_gen_state->current_buf_seq = next_seq;
+    wmb();
+    return new_buf;
+}
+
 // ---------------------------------------------------------------------------
 // Internal: switch the current buffer
 // ---------------------------------------------------------------------------
@@ -89,21 +142,6 @@ static void dep_gen_switch_buffer() {
         return;
     }
 
-    // Check free_queue before committing the full buffer
-    rmb();
-    uint32_t head = s_dep_gen_state->free_queue.head;
-    uint32_t tail = s_dep_gen_state->free_queue.tail;
-
-    if (head == tail) {
-        // No replacement buffer available — overwrite current buffer to keep
-        // the orch loop alive; account every record we drop.
-        LOG_WARN("dep_gen: no free buffer, overwriting current (dropped %u records)", full_buf->count);
-        s_dep_gen_state->dropped_record_count += full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
     uint32_t seq = s_dep_gen_state->current_buf_seq;
     int rc = enqueue_dep_gen_ready_buffer(s_dep_gen_state->current_buf_ptr, seq);
     if (rc != 0) {
@@ -114,16 +152,12 @@ static void dep_gen_switch_buffer() {
         return;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT];
-    rmb();
-    s_dep_gen_state->free_queue.head = head + 1;
-    s_dep_gen_state->current_buf_ptr = new_buf_ptr;
-    s_dep_gen_state->current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    s_dep_gen_state->current_buf_ptr = 0;
+    s_dep_gen_state->current_buf_seq = next_seq;
     wmb();
 
-    DepGenBuffer *new_buf = reinterpret_cast<DepGenBuffer *>(new_buf_ptr);
-    new_buf->count = 0;
+    (void)try_pop_dep_gen_buffer(next_seq);
 }
 
 // ---------------------------------------------------------------------------
@@ -144,14 +178,8 @@ void dep_gen_aicpu_init() {
     uint32_t tail = s_dep_gen_state->free_queue.tail;
 
     if (head != tail) {
-        uint64_t buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT];
-        rmb();
-        s_dep_gen_state->free_queue.head = head + 1;
-        s_dep_gen_state->current_buf_ptr = buf_ptr;
-        s_dep_gen_state->current_buf_seq = 0;
-        wmb();
-        DepGenBuffer *buf = reinterpret_cast<DepGenBuffer *>(buf_ptr);
-        buf->count = 0;
+        (void)try_pop_dep_gen_buffer(0);
+        uint64_t buf_ptr = s_dep_gen_state->current_buf_ptr;
         LOG_INFO_V0("dep_gen: popped initial buffer addr=0x%lx", buf_ptr);
     } else {
         LOG_ERROR("dep_gen: free_queue empty during init");
@@ -180,9 +208,13 @@ void dep_gen_aicpu_record_submit(
     rmb();
     uint64_t cur_ptr = s_dep_gen_state->current_buf_ptr;
     if (cur_ptr == 0) {
-        s_dep_gen_state->dropped_record_count += 1;
-        wmb();
-        return;
+        DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq);
+        if (recovered == nullptr) {
+            s_dep_gen_state->dropped_record_count += 1;
+            wmb();
+            return;
+        }
+        cur_ptr = s_dep_gen_state->current_buf_ptr;
     }
     DepGenBuffer *buf = reinterpret_cast<DepGenBuffer *>(cur_ptr);
 
@@ -205,9 +237,13 @@ void dep_gen_aicpu_record_submit(
         rmb();
         cur_ptr = s_dep_gen_state->current_buf_ptr;
         if (cur_ptr == 0) {
-            s_dep_gen_state->dropped_record_count += 1;
-            wmb();
-            return;
+            DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq);
+            if (recovered == nullptr) {
+                s_dep_gen_state->dropped_record_count += 1;
+                wmb();
+                return;
+            }
+            cur_ptr = s_dep_gen_state->current_buf_ptr;
         }
         buf = reinterpret_cast<DepGenBuffer *>(cur_ptr);
         local_count = buf->count;  // refresh after switch — new buffer starts at 0
diff --git a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
index 5ed92cd61..0d030eb2e 100644
--- a/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
+++ b/src/a2a3/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -108,6 +108,59 @@ extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table() {
 }
 L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; }
 
+static constexpr uint64_t kL2SwimlaneQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kL2SwimlaneQueueBackpressurePollMask = 1023;  // elapsed time is checked once per 1024 spins
+
+// Spin until the ready queue of thread_idx has a free slot or the back-pressure budget expires.
+// On success stores the observed tail/head into tail_out/head_out and returns true.
+static bool
+wait_for_ready_queue_space(L2SwimlaneDataHeader *header, int thread_idx, uint32_t *tail_out, uint32_t *head_out) {
+    const bool valid_thread = thread_idx >= 0 && thread_idx < PLATFORM_MAX_AICPU_THREADS;
+    if (header == nullptr || !valid_thread) {
+        return false;
+    }
+    const uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE;
+    const uint64_t wait_begin = get_sys_cnt_aicpu();
+    for (uint32_t polls = 1;; ++polls) {
+        const uint32_t tail_now = header->queue_tails[thread_idx];
+        const uint32_t head_now = header->queue_heads[thread_idx];
+        if ((tail_now + 1) % capacity != head_now) {
+            *tail_out = tail_now;
+            *head_out = head_now;
+            return true;
+        }
+        // Only consult the clock on every (mask + 1)-th poll.
+        const bool check_clock = (polls & kL2SwimlaneQueueBackpressurePollMask) == 0;
+        if (check_clock && get_sys_cnt_aicpu() - wait_begin >= kL2SwimlaneQueueBackpressureWaitCycles) {
+            return false;
+        }
+    }
+}
+
+// Spin until free_queue holds at least one entry or the back-pressure budget expires.
+// On success reports the observed head/tail through head_out/tail_out and returns true.
+static bool wait_for_free_queue_entry(L2SwimlaneFreeQueue *free_queue, uint32_t *head_out, uint32_t *tail_out) {
+    if (free_queue == nullptr) {
+        return false;
+    }
+    const uint64_t wait_begin = get_sys_cnt_aicpu();
+    for (uint32_t polls = 1;; ++polls) {
+        const uint32_t head_now = free_queue->head;
+        const uint32_t tail_now = free_queue->tail;
+        if (head_now != tail_now) {
+            *head_out = head_now;
+            *tail_out = tail_now;
+            rmb();  // acquire: order the tail read above before the caller's buffer_ptrs read
+            return true;
+        }
+        // Only consult the clock on every (mask + 1)-th poll.
+        const bool check_clock = (polls & kL2SwimlaneQueueBackpressurePollMask) == 0;
+        if (check_clock && get_sys_cnt_aicpu() - wait_begin >= kL2SwimlaneQueueBackpressureWaitCycles) {
+            return false;
+        }
+    }
+}
+
 /**
  * Enqueue ready buffer to per-thread queue
  *
@@ -124,24 +177,50 @@ static int enqueue_ready_buffer(
     L2SwimlaneBufferKind kind
 ) {
     uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE;
-    uint32_t current_tail = header->queue_tails[thread_idx];
-    uint32_t current_head = header->queue_heads[thread_idx];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
 
-    // Check if queue is full
-    uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
+    if (!wait_for_ready_queue_space(header, thread_idx, &current_tail, &current_head)) {
         return -1;
     }
+    uint32_t next_tail = (current_tail + 1) % capacity;
 
     header->queues[thread_idx][current_tail].core_index = core_index;
     header->queues[thread_idx][current_tail].kind = kind;
     header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
+    wmb();  // publish: entry fields visible before the tail advance
     header->queue_tails[thread_idx] = next_tail;
 
     return 0;
 }
 
+// Pop one buffer from state->free_queue and make it core_id's active records buffer, stamped
+// with next_seq. Returns nullptr when no entry arrives in time or the popped slot holds 0.
+static L2SwimlaneAicpuTaskBuffer *
+try_pop_records_buffer(int core_id, L2SwimlaneAicpuTaskPool *state, uint32_t next_seq) {
+    uint32_t pop_idx = 0;
+    uint32_t push_idx = 0;
+    const bool have_entry = wait_for_free_queue_entry(&state->free_queue, &pop_idx, &push_idx);
+    if (!have_entry) {
+        return nullptr;
+    }
+    const uint64_t fresh_ptr = state->free_queue.buffer_ptrs[pop_idx % PLATFORM_PROF_SLOT_COUNT];
+    rmb();
+    state->free_queue.head = pop_idx + 1;  // the slot is consumed even when it holds 0
+    if (fresh_ptr == 0) {
+        return nullptr;
+    }
+    auto *fresh = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(fresh_ptr);
+    fresh->count = 0;
+    wmb();  // count reset is ordered ahead of the pointer/seq publication below
+    state->head.current_buf_ptr = fresh_ptr;
+    state->head.current_buf_seq = next_seq;
+    s_current_aicpu_task_buffers[core_id] = fresh;
+    wmb();
+    return fresh;
+}
+
 void l2_swimlane_aicpu_init(int worker_count) {
     // Reset cross-launch state up front. AICPU statics persist across launches
     // on the same loaded .so; without this reset, an enabled→disabled launch
@@ -280,47 +359,34 @@ static void switch_records_buffer(int core_id, int thread_idx) {
 
     LOG_INFO_V0("Thread %d: Core %d buffer is full (count=%u)", thread_idx, core_id, full_buf->count);
 
-    // Check free_queue before committing the full buffer
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-
-    if (head == tail) {
-        // No replacement buffer available — overwrite current buffer to keep AICore alive
-        LOG_WARN("Thread %d: Core %d no free buffer, overwriting current buffer (data lost)", thread_idx, core_id);
-        state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
-    // Enqueue full buffer to ReadyQueue
     uint32_t seq = state->head.current_buf_seq;
+    uint64_t full_buf_ptr = state->head.current_buf_ptr;
     int rc = enqueue_ready_buffer(
-        s_l2_swimlane_header, thread_idx, core_id, state->head.current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
+        s_l2_swimlane_header, thread_idx, core_id, full_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
     );
     if (rc != 0) {
         LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id);
-        // Revert: discard data and keep writing
         state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count;
         full_buf->count = 0;
         wmb();
         return;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
-    rmb();
-    state->free_queue.head = head + 1;
-    state->head.current_buf_ptr = new_buf_ptr;
-    state->head.current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    state->head.current_buf_ptr = 0;
+    state->head.current_buf_seq = next_seq;
+    s_current_aicpu_task_buffers[core_id] = nullptr;
     wmb();
 
-    L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(new_buf_ptr);
-    new_buf->count = 0;
-    s_current_aicpu_task_buffers[core_id] = new_buf;
+    L2SwimlaneAicpuTaskBuffer *new_buf = try_pop_records_buffer(core_id, state, next_seq);
+    if (new_buf == nullptr) {
+        return;
+    }
 
-    LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr);
+    LOG_INFO_V0(
+        "Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id,
+        reinterpret_cast<uint64_t>(new_buf)
+    );
 }
 
 // Try to rotate the AICore buffer for `core_id`. Called from the completion
@@ -338,10 +404,9 @@ static void aicore_rotate(int core_id, int thread_idx) {
     uint64_t old_buf_ptr = ac_state->head.current_buf_ptr;
     uint32_t seq = ac_state->head.current_buf_seq;
 
-    rmb();
-    uint32_t head = ac_state->free_queue.head;
-    uint32_t tail = ac_state->free_queue.tail;
-    if (head == tail) {
+    uint32_t head = 0;
+    uint32_t tail = 0;
+    if (!wait_for_free_queue_entry(&ac_state->free_queue, &head, &tail)) {
         // No replacement available — AICore continues to write into the old
         // buffer; its slot counter will hit BUFFER_SIZE and the slot guard
         // silently drops further records. We deliberately do NOT bump
@@ -362,6 +427,16 @@ static void aicore_rotate(int core_id, int thread_idx) {
         return;
     }
 
+    uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
+    rmb();
+    if (new_buf_ptr == 0) {
+        LOG_WARN(
+            "Thread %d: Core %d AICore free_queue returned a null buffer at rotation; keeping old buffer active",
+            thread_idx, core_id
+        );
+        return;
+    }
+
     // Enqueue the just-filled AICore buffer with count = BUFFER_SIZE.
     if (old_buf_ptr != 0) {
         L2SwimlaneAicoreTaskBuffer *old_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(old_buf_ptr);
@@ -393,8 +468,6 @@ static void aicore_rotate(int core_id, int thread_idx) {
     // detect rotation, then reads head.current_buf_ptr. Write ptr first so
     // AICore can never see a new seq with a stale ptr. new_buf->count=0 must
     // also be visible before AICore's slot writes begin.
-    uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
-    rmb();
     ac_state->free_queue.head = head + 1;
     L2SwimlaneAicoreTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(new_buf_ptr);
     new_buf->count = 0;
@@ -461,10 +534,14 @@ int l2_swimlane_aicpu_complete_task(
 
     L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id];
     if (l2_swimlane_buf == nullptr) {
-        // No active records buffer (init ran out of free buffers); count as drop
-        // so host reconciliation stays consistent.
-        state->head.dropped_record_count += 1;
-        return -1;
+        l2_swimlane_buf = try_pop_records_buffer(core_id, state, state->head.current_buf_seq);
+        if (l2_swimlane_buf == nullptr) {
+            // No active records buffer (init ran out of free buffers or host has
+            // not refilled after the last published full buffer); count as drop
+            // so host reconciliation stays consistent.
+            state->head.dropped_record_count += 1;
+            return -1;
+        }
     }
     uint32_t count = l2_swimlane_buf->count;
     if (count >= PLATFORM_PROF_BUFFER_SIZE) {
@@ -721,19 +798,22 @@ static void switch_phase_buffer_kind(
         );
         state->head.dropped_record_count += full_buf->count;
         full_buf->count = 0;
-        *current_buf_out = nullptr;
-        state->head.current_buf_ptr = 0;
         wmb();
         return;
     }
 
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-    if (head != tail) {
+    uint32_t head = 0;
+    uint32_t tail = 0;
+    if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) {
         uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
         rmb();
         state->free_queue.head = head + 1;
+        if (new_buf_ptr == 0) {
+            *current_buf_out = nullptr;
+            state->head.current_buf_ptr = 0;
+            wmb();
+            return;
+        }
         state->head.current_buf_ptr = new_buf_ptr;
         state->head.current_buf_seq = seq + 1;
         wmb();
@@ -764,13 +844,15 @@ static Record *acquire_phase_slot(
 ) {
     Buffer *buf = *current_buf_out;
     if (buf == nullptr) {
-        rmb();
-        uint32_t head = state->free_queue.head;
-        uint32_t tail = state->free_queue.tail;
-        if (head != tail) {
+        uint32_t head = 0;
+        uint32_t tail = 0;
+        if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) {
             uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
             rmb();
             state->free_queue.head = head + 1;
+            if (buf_ptr == 0) {
+                return nullptr;
+            }
             state->head.current_buf_ptr = buf_ptr;
             state->head.current_buf_seq += 1;
             wmb();
diff --git a/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp b/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp
index 3633b9bba..b2592b1a9 100644
--- a/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp
+++ b/src/a2a3/platform/shared/aicpu/pmu_collector_aicpu.cpp
@@ -19,14 +19,16 @@
  * Buffer switching:
  *   - SPSC free_queue: Host pushes free PmuBuffers, AICPU pops when switching.
  *   - Per-thread ready_queue: AICPU enqueues full buffers for host collection.
- *   - On free_queue empty or ready_queue full: overwrite current buffer (data lost,
- *     avoids blocking the AICPU dispatch loop).
+ *   - Full buffers are published before AICPU tries to recover a replacement.
+ *     If recovery is delayed, later records are counted as dropped until host
+ *     replenishes free_queue.
  */
 
 #include "aicpu/pmu_collector_aicpu.h"
 
 #include <cstring>
 
+#include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
@@ -47,6 +49,9 @@ static PmuDataHeader *s_pmu_header = nullptr;
 // Populated by pmu_aicpu_init(); 0 means "no PMU for this core" (sim).
 static uint64_t s_pmu_reg_addrs[PLATFORM_MAX_CORES] = {0};
 
+static constexpr uint64_t kPmuQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kPmuQueueBackpressurePollMask = 1023;  // elapsed time is checked once per 1024 spins
+
 extern "C" void set_platform_pmu_base(uint64_t pmu_data_base) { g_platform_pmu_base = pmu_data_base; }
 
 extern "C" uint64_t get_platform_pmu_base() { return g_platform_pmu_base; }
@@ -101,22 +106,74 @@ static void pmu_read_counters(uint64_t reg_base, PmuRecord *out) {
 // ---------------------------------------------------------------------------
 
 static int enqueue_pmu_ready_buffer(int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq) {
+    if (s_pmu_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+        return -1;
+    }
     uint32_t capacity = PLATFORM_PMU_READYQUEUE_SIZE;
-    uint32_t current_tail = s_pmu_header->queue_tails[thread_idx];
-    uint32_t current_head = s_pmu_header->queue_heads[thread_idx];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
+    const uint64_t start = get_sys_cnt_aicpu();  // reference point for the bounded wait below
+    uint32_t spins = 0;
+
+    do {
+        current_tail = s_pmu_header->queue_tails[thread_idx];
+        current_head = s_pmu_header->queue_heads[thread_idx];
+        uint32_t next_tail = (current_tail + 1) % capacity;
+        if (next_tail != current_head) {  // next_tail == current_head is the queue-full condition
+            break;
+        }
+        if ((++spins & kPmuQueueBackpressurePollMask) == 0 &&  // read the clock every (mask + 1) spins
+            get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) {
+            return -1;  // still full after the wait budget: give up
+        }
+    } while (true);
 
     uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
-        return -1;  // Queue full
-    }
-
     s_pmu_header->queues[thread_idx][current_tail].core_index = core_index;
     s_pmu_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     s_pmu_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
+    wmb();  // publish: entry fields visible before the tail advance
     s_pmu_header->queue_tails[thread_idx] = next_tail;
     return 0;
 }
 
+// Pop a replacement buffer from state->free_queue and install it as the current buffer, stamped
+// with next_seq. Spins while the queue is empty until kPmuQueueBackpressureWaitCycles has elapsed.
+// Returns nullptr if state is null, the wait times out, or the popped slot holds 0.
+static PmuBuffer *try_pop_pmu_buffer(int core_id, PmuBufferState *state, uint32_t next_seq) {
+    (void)core_id;
+    if (state == nullptr) {
+        return nullptr;
+    }
+    const uint64_t start = get_sys_cnt_aicpu();
+    uint32_t spins = 0;
+    uint32_t head = 0;
+    while (true) {
+        head = state->free_queue.head;
+        if (head != state->free_queue.tail) {
+            break;
+        }
+        if ((++spins & kPmuQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) {
+            return nullptr;
+        }
+    }
+    rmb();  // acquire: order the tail read before the buffer_ptrs read below
+    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT];
+    rmb();  // slot read must complete before the head store hands the slot back to the host
+    state->free_queue.head = head + 1;
+    if (new_buf_ptr == 0) {
+        return nullptr;
+    }
+    PmuBuffer *new_buf = reinterpret_cast<PmuBuffer *>(new_buf_ptr);
+    new_buf->count = 0;
+    wmb();  // publish: count reset visible before the buffer becomes current
+    state->current_buf_ptr = new_buf_ptr;
+    state->current_buf_seq = next_seq;
+    wmb();
+    return new_buf;
+}
+
 // ---------------------------------------------------------------------------
 // Internal: switch the current buffer for one core
 // ---------------------------------------------------------------------------
@@ -132,20 +189,6 @@ static void pmu_switch_buffer(int core_id, int thread_idx) {
         return;
     }
 
-    // Check free_queue before committing the full buffer
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-
-    if (head == tail) {
-        // No replacement buffer available — overwrite current buffer to keep AICPU alive
-        LOG_WARN("Thread %d: Core %d no free PMU buffer, overwriting current buffer (data lost)", thread_idx, core_id);
-        state->dropped_record_count += full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
     // Enqueue full buffer to ready_queue
     uint32_t seq = state->current_buf_seq;
     int rc = enqueue_pmu_ready_buffer(thread_idx, static_cast<uint32_t>(core_id), state->current_buf_ptr, seq);
@@ -159,18 +202,19 @@ static void pmu_switch_buffer(int core_id, int thread_idx) {
         return;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT];
-    rmb();
-    state->free_queue.head = head + 1;
-    state->current_buf_ptr = new_buf_ptr;
-    state->current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    state->current_buf_ptr = 0;
+    state->current_buf_seq = next_seq;
     wmb();
 
-    PmuBuffer *new_buf = reinterpret_cast<PmuBuffer *>(new_buf_ptr);
-    new_buf->count = 0;
-
-    LOG_INFO_V0("Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr);
+    PmuBuffer *new_buf = try_pop_pmu_buffer(core_id, state, next_seq);
+    if (new_buf == nullptr) {
+        return;
+    }
+    LOG_INFO_V0(
+        "Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id,
+        reinterpret_cast<uint64_t>(new_buf)
+    );
 }
 
 // ---------------------------------------------------------------------------
@@ -225,16 +269,8 @@ void pmu_aicpu_init(const uint32_t *physical_core_ids, int num_cores) {
         uint32_t tail = state->free_queue.tail;
 
         if (head != tail) {
-            uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT];
-            rmb();
-            state->free_queue.head = head + 1;
-            state->current_buf_ptr = buf_ptr;
-            state->current_buf_seq = 0;
-            wmb();
-
-            PmuBuffer *buf = reinterpret_cast<PmuBuffer *>(buf_ptr);
-            buf->count = 0;
-
+            (void)try_pop_pmu_buffer(i, state, 0);
+            uint64_t buf_ptr = state->current_buf_ptr;
             LOG_DEBUG("Core %d: popped initial PMU buffer (addr=0x%lx)", i, buf_ptr);
         } else {
             LOG_ERROR("Core %d: PMU free_queue is empty during init!", i);
@@ -266,12 +302,18 @@ void pmu_aicpu_record_task(int core_id, int thread_idx, uint64_t task_id, uint32
 
     rmb();
     uint64_t cur_ptr = state->current_buf_ptr;
+    PmuBuffer *pmu_buf = nullptr;
     if (cur_ptr == 0) {
-        state->dropped_record_count += 1;
-        wmb();
-        return;
+        pmu_buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq);
+        if (pmu_buf == nullptr) {
+            state->dropped_record_count += 1;
+            wmb();
+            return;
+        }
+        cur_ptr = state->current_buf_ptr;
+    } else {
+        pmu_buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
     }
-    PmuBuffer *pmu_buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
 
     // Switch buffer if full
     if (pmu_buf->count >= static_cast<uint32_t>(PLATFORM_PMU_RECORDS_PER_BUFFER)) {
@@ -279,11 +321,16 @@ void pmu_aicpu_record_task(int core_id, int thread_idx, uint64_t task_id, uint32
         rmb();
         cur_ptr = state->current_buf_ptr;
         if (cur_ptr == 0) {
-            state->dropped_record_count += 1;
-            wmb();
-            return;
+            pmu_buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq);
+            if (pmu_buf == nullptr) {
+                state->dropped_record_count += 1;
+                wmb();
+                return;
+            }
+            cur_ptr = state->current_buf_ptr;
+        } else {
+            pmu_buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
         }
-        pmu_buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
     }
 
     uint32_t idx = pmu_buf->count;
diff --git a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp
index b0a9123b2..091b146b1 100644
--- a/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp
+++ b/src/a2a3/platform/shared/host/l2_swimlane_collector.cpp
@@ -20,7 +20,6 @@
 
 #include "host/l2_swimlane_collector.h"
 
-#include <chrono>
 #include <cinttypes>
 #include <cstdlib>
 #include <ctime>
@@ -94,9 +93,9 @@ int L2SwimlaneCollector::initialize(
     aicpu_thread_num_ = aicpu_thread_num;
     l2_swimlane_level_ = l2_swimlane_level;
     output_prefix_ = output_prefix;
-    total_perf_collected_ = 0;
-    total_sched_phase_collected_ = 0;
-    total_orch_phase_collected_ = 0;
+    total_perf_collected_.store(0, std::memory_order_relaxed);
+    total_sched_phase_collected_.store(0, std::memory_order_relaxed);
+    total_orch_phase_collected_.store(0, std::memory_order_relaxed);
 
     // Stash the memory context on the base up-front so alloc_single_buffer
     // sees consistent values during init. shm_host_ stays nullptr until the
@@ -179,7 +178,9 @@ int L2SwimlaneCollector::initialize(
     LOG_DEBUG("  buffer_capacity:        %d", PLATFORM_PROF_BUFFER_SIZE);
     LOG_DEBUG("  queue capacity:         %d", PLATFORM_PROF_READYQUEUE_SIZE);
 
-    // Step 5: Initialize L2SwimlaneAicpuTaskPools — 1 buffer per core in free_queue, rest to recycled pool
+    // Step 5: Initialize L2SwimlaneAicpuTaskPools. Seed as many buffers as
+    // the device-side free_queue can hold; any remaining buffers stay in the
+    // host recycled pool.
     for (int i = 0; i < num_aicore; i++) {
         L2SwimlaneAicpuTaskPool *state = get_perf_buffer_state(perf_host_ptr, i);
         memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool));
@@ -189,6 +190,9 @@ int L2SwimlaneCollector::initialize(
         state->head.current_buf_ptr = 0;
         state->head.current_buf_seq = 0;
 
+        const int initial_free_count = (PLATFORM_PROF_BUFFERS_PER_CORE < PLATFORM_PROF_SLOT_COUNT) ?
+                                           PLATFORM_PROF_BUFFERS_PER_CORE :
+                                           PLATFORM_PROF_SLOT_COUNT;
         for (int s = 0; s < PLATFORM_PROF_BUFFERS_PER_CORE; s++) {
             void *host_buf_ptr = nullptr;
             void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicpuTaskBuffer), &host_buf_ptr);
@@ -200,14 +204,14 @@ int L2SwimlaneCollector::initialize(
             memset(buf, 0, sizeof(L2SwimlaneAicpuTaskBuffer));
             buf->count = 0;
 
-            if (s == 0) {
-                state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
+            if (s < initial_free_count) {
+                state->free_queue.buffer_ptrs[s] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
                 manager_.push_recycled(static_cast<int>(ProfBufferType::AICPU_TASK), dev_buf_ptr);
             }
         }
         wmb();
-        state->free_queue.tail = 1;
+        state->free_queue.tail = static_cast<uint32_t>(initial_free_count);
         wmb();
     }
 
@@ -217,6 +221,9 @@ int L2SwimlaneCollector::initialize(
         L2SwimlaneAicoreTaskPool *ac_state = get_aicore_buffer_state(perf_host_ptr, num_aicore, i);
         memset(ac_state, 0, sizeof(L2SwimlaneAicoreTaskPool));
 
+        const int initial_free_count = (PLATFORM_AICORE_BUFFERS_PER_CORE < PLATFORM_PROF_SLOT_COUNT) ?
+                                           PLATFORM_AICORE_BUFFERS_PER_CORE :
+                                           PLATFORM_PROF_SLOT_COUNT;
         for (int s = 0; s < PLATFORM_AICORE_BUFFERS_PER_CORE; s++) {
             void *host_buf_ptr = nullptr;
             void *dev_buf_ptr = alloc_single_buffer(sizeof(L2SwimlaneAicoreTaskBuffer), &host_buf_ptr);
@@ -228,21 +235,20 @@ int L2SwimlaneCollector::initialize(
             memset(buf, 0, sizeof(L2SwimlaneAicoreTaskBuffer));
             buf->count = 0;
 
-            if (s == 0) {
-                ac_state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
+            if (s < initial_free_count) {
+                ac_state->free_queue.buffer_ptrs[s] = reinterpret_cast<uint64_t>(dev_buf_ptr);
             } else {
                 manager_.push_recycled(static_cast<int>(ProfBufferType::AICORE_TASK), dev_buf_ptr);
             }
         }
         wmb();
-        ac_state->free_queue.tail = 1;
+        ac_state->free_queue.tail = static_cast<uint32_t>(initial_free_count);
         wmb();
     }
     LOG_DEBUG(
-        "Initialized buffer pools: %d L2SwimlaneAicpuTaskBuffers/core + %d L2SwimlaneAicoreTaskBuffers/core (1 in "
-        "free_queue, "
-        "rest in recycled pool)",
-        PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE
+        "Initialized buffer pools: %d L2SwimlaneAicpuTaskBuffers/core + %d L2SwimlaneAicoreTaskBuffers/core (up to "
+        "%d in free_queue, rest in recycled pool)",
+        PLATFORM_PROF_BUFFERS_PER_CORE, PLATFORM_AICORE_BUFFERS_PER_CORE, PLATFORM_PROF_SLOT_COUNT
     );
 
     // Step 5c: Standalone uint64_t[num_aicore] table that will hold per-core
@@ -265,9 +271,10 @@ int L2SwimlaneCollector::initialize(
 
     // Step 6: Initialize per-thread phase pools — both sched and orch. Each
     // pool is sized to its own PLATFORM_PROF_{SCHED,ORCH}_BUFFERS_PER_THREAD
-    // (1 in free_queue, rest in the recycled pool tagged by kind). Templated on the
-    // concrete TypedBuffer so the `count` zero-store uses the matching layout
-    // — sched and orch buffers have DIFFERENT sizes (64B vs 32B records),
+    // (seeded into free_queue up to slot capacity, rest in the recycled pool
+    // tagged by kind). Templated on the concrete TypedBuffer so the `count`
+    // zero-store uses the matching layout — sched and orch buffers have
+    // DIFFERENT sizes (64B vs 32B records),
     // so a single cast type for both would land the count store past the end
     // of the orch allocation and corrupt the heap.
     // state_count pool states are zeroed (so the host's [0, PLATFORM_MAX)
@@ -284,6 +291,8 @@ int L2SwimlaneCollector::initialize(
             auto *state = get_state(perf_host_ptr, num_aicore, t);
             memset(state, 0, sizeof(L2SwimlaneAicpuTaskPool));
             if (t >= buffer_count) continue;  // zeroed state only; no buffers (unused slot)
+            const int initial_free_count =
+                (buffers_per_thread < PLATFORM_PROF_SLOT_COUNT) ? buffers_per_thread : PLATFORM_PROF_SLOT_COUNT;
             for (int s = 0; s < buffers_per_thread; s++) {
                 void *host_buf_ptr = nullptr;
                 void *dev_buf_ptr = alloc_single_buffer(buffer_bytes, &host_buf_ptr);
@@ -295,14 +304,14 @@ int L2SwimlaneCollector::initialize(
                 // matching Buffer type. The records payload is overwritten by
                 // AICPU on first use.
                 reinterpret_cast<Buffer *>(host_buf_ptr)->count = 0;
-                if (s == 0) {
-                    state->free_queue.buffer_ptrs[0] = reinterpret_cast<uint64_t>(dev_buf_ptr);
+                if (s < initial_free_count) {
+                    state->free_queue.buffer_ptrs[s] = reinterpret_cast<uint64_t>(dev_buf_ptr);
                 } else {
                     manager_.push_recycled(static_cast<int>(recycle_kind), dev_buf_ptr);
                 }
             }
             wmb();
-            state->free_queue.tail = 1;
+            state->free_queue.tail = static_cast<uint32_t>(initial_free_count);
             wmb();
         }
         return 0;
@@ -338,8 +347,10 @@ int L2SwimlaneCollector::initialize(
         return -1;
     }
     LOG_DEBUG(
-        "Initialized %d sched (%d buf/thread) + 1 orch (%d buf) PhaseBufferStates", num_phase_threads,
-        PLATFORM_PROF_SCHED_BUFFERS_PER_THREAD, PLATFORM_PROF_ORCH_BUFFERS_PER_THREAD
+        "Initialized %d sched (%d buf/thread) + 1 orch (%d buf) PhaseBufferStates (seeded up to %d free_queue "
+        "slots)",
+        num_phase_threads, PLATFORM_PROF_SCHED_BUFFERS_PER_THREAD, PLATFORM_PROF_ORCH_BUFFERS_PER_THREAD,
+        PLATFORM_PROF_SLOT_COUNT
     );
 
     wmb();
@@ -378,10 +389,11 @@ void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
     }
     uint32_t core_index = info.index;
     if (core_index < static_cast<uint32_t>(num_aicore_)) {
+        std::scoped_lock<std::mutex> lock(perf_record_mutexes_[core_index]);
         for (uint32_t i = 0; i < count; i++) {
             collected_perf_records_[core_index].push_back(buf->records[i]);
         }
-        total_perf_collected_ += count;
+        total_perf_collected_.fetch_add(count, std::memory_order_relaxed);
     }
 }
 
@@ -394,12 +406,13 @@ void L2SwimlaneCollector::copy_sched_phase_buffer(const ReadyBufferInfo &info) {
     }
     uint32_t tidx = info.index;
     if (tidx < collected_sched_phase_records_.size()) {
+        std::scoped_lock<std::mutex> lock(sched_phase_record_mutexes_[tidx]);
         for (uint32_t i = 0; i < count; i++) {
             collected_sched_phase_records_[tidx].push_back(buf->records[i]);
         }
-        total_sched_phase_collected_ += count;
+        total_sched_phase_collected_.fetch_add(count, std::memory_order_relaxed);
         if (count > 0) {
-            has_phase_data_ = true;
+            has_phase_data_.store(true, std::memory_order_relaxed);
         }
     }
 }
@@ -413,12 +426,13 @@ void L2SwimlaneCollector::copy_orch_phase_buffer(const ReadyBufferInfo &info) {
     }
     uint32_t tidx = info.index;
     if (tidx < collected_orch_phase_records_.size()) {
+        std::scoped_lock<std::mutex> lock(orch_phase_record_mutexes_[tidx]);
         for (uint32_t i = 0; i < count; i++) {
             collected_orch_phase_records_[tidx].push_back(buf->records[i]);
         }
-        total_orch_phase_collected_ += count;
+        total_orch_phase_collected_.fetch_add(count, std::memory_order_relaxed);
         if (count > 0) {
-            has_phase_data_ = true;
+            has_phase_data_.store(true, std::memory_order_relaxed);
         }
     }
 }
@@ -453,16 +467,19 @@ void L2SwimlaneCollector::copy_aicore_buffer(const ReadyBufferInfo &info) {
     if (count > static_cast<uint32_t>(PLATFORM_AICORE_BUFFER_SIZE)) {
         count = PLATFORM_AICORE_BUFFER_SIZE;
     }
-    auto &dst = collected_aicore_records_[core_index];
-    dst.reserve(dst.size() + count);
     uint32_t skipped = 0;
-    for (uint32_t i = 0; i < count; i++) {
-        const L2SwimlaneAicoreTaskRecord &r = buf->records[i];
-        if (r.start_time == 0) {
-            skipped++;
-            continue;
+    {
+        std::scoped_lock<std::mutex> lock(aicore_record_mutexes_[core_index]);
+        auto &dst = collected_aicore_records_[core_index];
+        dst.reserve(dst.size() + count);
+        for (uint32_t i = 0; i < count; i++) {
+            const L2SwimlaneAicoreTaskRecord &r = buf->records[i];
+            if (r.start_time == 0) {
+                skipped++;
+                continue;
+            }
+            dst.push_back(r);
         }
-        dst.push_back(r);
     }
     if (skipped > 0) {
         LOG_WARN(
@@ -554,8 +571,7 @@ void L2SwimlaneCollector::reconcile_counters() {
 
         if (dropped_device > 0) {
             LOG_WARN(
-                "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / "
-                "ready_queue full).",
+                "L2Swimlane reconcile: %lu %s records dropped on device side.",
                 static_cast<unsigned long>(dropped_device), kind
             );
         }
@@ -591,7 +607,7 @@ void L2SwimlaneCollector::reconcile_counters() {
         [](void *host_ptr) {
             return reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(host_ptr)->count;
         },
-        total_perf_collected_, /*optional=*/false
+        total_perf_collected_.load(std::memory_order_relaxed), /*optional=*/false
     );
 
     reconcile_one(
@@ -602,7 +618,7 @@ void L2SwimlaneCollector::reconcile_counters() {
         [](void *host_ptr) {
             return reinterpret_cast<L2SwimlaneAicpuSchedPhaseBuffer *>(host_ptr)->count;
         },
-        total_sched_phase_collected_, /*optional=*/true
+        total_sched_phase_collected_.load(std::memory_order_relaxed), /*optional=*/true
     );
 
     reconcile_one(
@@ -613,7 +629,7 @@ void L2SwimlaneCollector::reconcile_counters() {
         [](void *host_ptr) {
             return reinterpret_cast<L2SwimlaneAicpuOrchPhaseBuffer *>(host_ptr)->count;
         },
-        total_orch_phase_collected_, /*optional=*/true
+        total_orch_phase_collected_.load(std::memory_order_relaxed), /*optional=*/true
     );
 }
 
@@ -673,7 +689,10 @@ void L2SwimlaneCollector::read_phase_header_metadata() {
         LOG_INFO_V0("  Core-to-thread mapping: %d cores", num_phase_cores);
     }
 
-    LOG_INFO_V0("Phase metadata collection complete: has_phase_data=%s", has_phase_data_ ? "yes" : "no");
+    LOG_INFO_V0(
+        "Phase metadata collection complete: has_phase_data=%s",
+        has_phase_data_.load(std::memory_order_relaxed) ? "yes" : "no"
+    );
 }
 
 void L2SwimlaneCollector::set_core_types(const CoreType *types, int n) {
@@ -1008,10 +1027,10 @@ int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, co
     collected_sched_phase_records_.clear();
     collected_orch_phase_records_.clear();
     core_to_thread_.clear();
-    has_phase_data_ = false;
-    total_perf_collected_ = 0;
-    total_sched_phase_collected_ = 0;
-    total_orch_phase_collected_ = 0;
+    has_phase_data_.store(false, std::memory_order_relaxed);
+    total_perf_collected_.store(0, std::memory_order_relaxed);
+    total_sched_phase_collected_.store(0, std::memory_order_relaxed);
+    total_orch_phase_collected_.store(0, std::memory_order_relaxed);
     clear_memory_context();
 
     LOG_DEBUG("Performance profiling cleanup complete");
diff --git a/src/a5/platform/include/host/dep_gen_collector.h b/src/a5/platform/include/host/dep_gen_collector.h
index 96c1bcd9f..6b8f8cfb8 100644
--- a/src/a5/platform/include/host/dep_gen_collector.h
+++ b/src/a5/platform/include/host/dep_gen_collector.h
@@ -16,16 +16,17 @@
  *
  * Architecture:
  * - BufferPoolManager<DepGenModule>: shared mgmt-thread infrastructure that
- *   polls the per-thread ready queue, drains the done_queue, and replenishes
- *   the (single instance's) free_queue from a unified recycled pool.
- * - DepGenCollector: collector thread pops full DepGenBuffers from the manager
- *   and appends their DepGenRecords to a binary file (submit_trace.bin).
+ *   polls per-thread ready queues, drains done-queue shards, and replenishes
+ *   the single instance's free_queue from a unified recycled pool.
+ * - DepGenCollector: collector thread shards pop full DepGenBuffers from the
+ *   manager and append their DepGenRecords to a binary file
+ *   (submit_trace.bin).
  *
  * Lifecycle:
  *   init()                       — Allocate header + 1 BufferState + N DepGenBuffers
  *                                  (pre-fills free_queue; surplus → recycled pool).
  *                                  Calls set_memory_context() on the base.
- *   start(tf)                    — Inherited: launches mgmt + poll threads.
+ *   start(tf)                    — Inherited: launches mgmt + collector threads.
  *   [device execution]
  *   stop()                       — Inherited: drain queues, join threads.
  *   reconcile_counters()         — Sanity-check current_buf_ptr is cleared by
@@ -64,7 +65,7 @@
 // ---------------------------------------------------------------------------
 
 /**
- * Internal hand-off struct delivered from the mgmt thread to the collector.
+ * Internal hand-off struct delivered from a drain thread to a collector shard.
  * thread_index identifies the AICPU thread queue the entry was popped from
  * (always equal to the orchestrator thread index, since dep_gen is single-
  * instance — exposed for symmetry with PmuReadyBufferInfo).
@@ -87,6 +88,8 @@ struct DepGenModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_DEP_GEN_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_DEP_GEN_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "DepGenModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * Buffers grown by proactive_replenish are batch-allocated up to the
@@ -104,7 +107,18 @@ struct DepGenModule {
      * resets it itself on flush/drop/pop.
      */
     static std::optional<profiling_common::EntrySite<DepGenModule>>
-    resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) {
+    resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) {
+        if (shm == nullptr || header == nullptr) {
+            LOG_ERROR("DepGenModule: invalid shared memory/header while resolving ready entry");
+            return std::nullopt;
+        }
+        if (header->num_instances != 1 || entry.instance_index >= header->num_instances) {
+            LOG_ERROR(
+                "DepGenModule: invalid ready entry instance=%u (num_instances=%u)", entry.instance_index,
+                header->num_instances
+            );
+            return std::nullopt;
+        }
         DepGenBufferState *state = get_dep_gen_buffer_state(shm, static_cast<int>(entry.instance_index));
         profiling_common::EntrySite<DepGenModule> site;
         site.kind = 0;
diff --git a/src/a5/platform/include/host/l2_swimlane_collector.h b/src/a5/platform/include/host/l2_swimlane_collector.h
index 24d6a037a..44d755611 100644
--- a/src/a5/platform/include/host/l2_swimlane_collector.h
+++ b/src/a5/platform/include/host/l2_swimlane_collector.h
@@ -16,9 +16,9 @@
  * Architecture:
  * - BufferPoolManager<L2SwimlaneModule>: shared mgmt-thread infrastructure that polls
  *   the AICPU ready queue, replenishes per-core / per-thread free queues, and
- *   hands full buffers off to the collector thread.
- * - L2SwimlaneCollector: main thread copies records from the manager's ready queue
- *   into host vectors and exports the swimlane visualization.
+ *   hands full buffers off to collector thread shards.
+ * - L2SwimlaneCollector: collector thread shards copy records from manager ready queues
+ *   into host vectors; the owner thread exports the swimlane visualization after stop().
  *
  * Memory operations are injected through callbacks for sim/onboard portability.
  */
@@ -27,8 +27,11 @@
 #define SRC_A5_PLATFORM_INCLUDE_HOST_L2_SWIMLANE_COLLECTOR_H_
 
 #include <atomic>
+#include <array>
+#include <cstddef>
 #include <cstdint>
 #include <functional>
+#include <mutex>
 #include <string>
 #include <thread>
 #include <vector>
@@ -87,6 +90,8 @@ struct L2SwimlaneModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_PROF_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_PROF_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "L2SwimlaneModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * batch_size for proactive_replenish's alloc fallback. Sized so that a
@@ -121,6 +126,13 @@ struct L2SwimlaneModule {
 
     static DataHeader *header_from_shm(void *shm) { return get_l2_swimlane_header(shm); }
 
+    template <typename Mgr>
+    static void refresh_replenish_metadata(Mgr &mgr, DataHeader *header) {
+        mgr.read_range_from_device(&header->num_sched_phase_threads, sizeof(header->num_sched_phase_threads));
+        mgr.read_range_from_device(&header->num_orch_phase_threads, sizeof(header->num_orch_phase_threads));
+        rmb();  // NOTE(review): presumably orders the two refreshed counts before later header reads — confirm
+    }
+
     /**
      * Branch on entry.kind to pick the per-core task state, per-thread sched-
      * or orch-phase state, or per-core AICore state. Returns nullopt for
@@ -459,15 +471,20 @@ class L2SwimlaneCollector : public profiling_common::ProfilerBase<L2SwimlaneColl
     // orch records (kind-tagged at routing time; no parse-time discrimination).
     std::vector<std::vector<L2SwimlaneAicpuSchedPhaseRecord>> collected_sched_phase_records_;
     std::vector<std::vector<L2SwimlaneAicpuOrchPhaseRecord>> collected_orch_phase_records_;
-    bool has_phase_data_{false};
+    std::atomic<bool> has_phase_data_{false};
 
     // Core-to-thread mapping (core_id → scheduler thread index, -1 = unassigned)
     std::vector<int8_t> core_to_thread_;
 
     // Running totals used at reconcile time to cross-check device-side counters.
-    uint64_t total_perf_collected_{0};
-    uint64_t total_sched_phase_collected_{0};
-    uint64_t total_orch_phase_collected_{0};
+    std::atomic<uint64_t> total_perf_collected_{0};
+    std::atomic<uint64_t> total_sched_phase_collected_{0};
+    std::atomic<uint64_t> total_orch_phase_collected_{0};
+
+    std::array<std::mutex, PLATFORM_MAX_CORES> perf_record_mutexes_;
+    std::array<std::mutex, PLATFORM_MAX_CORES> aicore_record_mutexes_;
+    std::array<std::mutex, PLATFORM_MAX_AICPU_THREADS> sched_phase_record_mutexes_;
+    std::array<std::mutex, PLATFORM_MAX_AICPU_THREADS> orch_phase_record_mutexes_;
 
     // Per-buffer-kind handlers used by on_buffer_collected.
     void copy_perf_buffer(const ReadyBufferInfo &info);
diff --git a/src/a5/platform/include/host/pmu_collector.h b/src/a5/platform/include/host/pmu_collector.h
index 7a7cdc79a..b42467aa1 100644
--- a/src/a5/platform/include/host/pmu_collector.h
+++ b/src/a5/platform/include/host/pmu_collector.h
@@ -14,11 +14,11 @@
  * @brief Host-side PMU buffer allocation, streaming collection, and CSV export.
  *
  * Architecture:
- * - BufferPoolManager<PmuModule>: shared mgmt-thread infrastructure that
- *   polls per-thread PmuReadyQueues, drains the done_queue, and replenishes
- *   the per-core free_queues from a unified recycled pool.
- * - PmuCollector: collector thread pops full PmuBuffers from the manager
- *   and appends them to the CSV file.
+ * - BufferPoolManager<PmuModule>: shared split-mgmt infrastructure that polls
+ *   per-thread ready queues, drains done-queue shards, and replenishes the
+ *   per-core free_queues from a unified recycled pool.
+ * - PmuCollector: collector thread shards pop full PmuBuffers from the manager
+ *   and append them to the CSV file.
  *
  * a5 specifics: device↔host transfers go through profiling_copy.h. The
  * framework's mgmt loop mirrors the shm region per tick; per-buffer
@@ -32,12 +32,12 @@
  *                                  start(tf) can launch threads.
  *   start(tf)                    — Inherited from ProfilerBase: assembles
  *                                  MemoryOps from the stashed callbacks
- *                                  and launches the mgmt + poll threads.
+ *                                  and launches the mgmt + collector threads.
  *   [device execution]
- *   stop()                       — Stop mgmt → join mgmt → signal poll →
- *                                  drain L2 → join poll, in that order. On
- *                                  return both thread exits and queue
- *                                  drains are complete.
+ *   stop()                       — Stop mgmt → join mgmt → signal collectors →
+ *                                  drain ready shards → join collectors, in
+ *                                  that order. On return both thread exits and
+ *                                  queue drains are complete.
  *   reconcile_counters()         — Sanity-check PmuBufferState::current_buf_ptr
  *                                  (any non-zero pointer with records is a
  *                                  device-flush bug, logged as ERROR) and
@@ -84,9 +84,8 @@
  */
 
 /**
- * Internal hand-off struct delivered from the mgmt thread to the
- * collector. thread_index is the logical AICPU thread queue the entry was
- * popped from, passed through by ProfilerBase's mgmt loop.
+ * Internal hand-off struct delivered from a drain thread to a collector shard.
+ * thread_index is the logical AICPU thread queue the entry was popped from.
  */
 struct PmuReadyBufferInfo {
     uint32_t core_index;
@@ -106,6 +105,8 @@ struct PmuModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_PMU_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_PMU_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "PmuModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * Buffers grown by proactive_replenish are batch-allocated up to the
@@ -124,7 +125,18 @@ struct PmuModule {
      * and resets it itself when popping from free_queue.
      */
     static std::optional<profiling_common::EntrySite<PmuModule>>
-    resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) {
+    resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) {
+        if (shm == nullptr || header == nullptr) {
+            LOG_ERROR("PmuModule: invalid shared memory/header while resolving ready entry");
+            return std::nullopt;
+        }
+        if (entry.core_index >= header->num_cores || entry.core_index >= static_cast<uint32_t>(PLATFORM_MAX_CORES)) {
+            LOG_ERROR(
+                "PmuModule: invalid ready entry core=%u (num_cores=%u, max=%u)", entry.core_index, header->num_cores,
+                static_cast<uint32_t>(PLATFORM_MAX_CORES)
+            );
+            return std::nullopt;
+        }
         PmuBufferState *state = get_pmu_buffer_state(shm, static_cast<int>(entry.core_index));
         profiling_common::EntrySite<PmuModule> site;
         site.kind = 0;
diff --git a/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp
index 9b934a2f4..e2db5c4a1 100644
--- a/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp
+++ b/src/a5/platform/shared/aicpu/dep_gen_collector_aicpu.cpp
@@ -20,15 +20,17 @@
  *   - Host pushes free DepGenBuffers via free_queue.
  *   - AICPU pops when current buffer fills; pushes full buffer to per-thread
  *     ready_queue (indexed by orch_thread_idx).
- *   - On free_queue empty or ready_queue full: overwrite current buffer
- *     (record dropped_record_count, keep AICPU alive). Host reads dropped
- *     at finalize to decide whether to emit deps.json.
+ *   - Full buffers are published before AICPU tries to recover a replacement.
+ *     If recovery is delayed, later records are counted as dropped until host
+ *     replenishes free_queue. Host reads dropped at finalize to decide whether
+ *     to emit deps.json.
  */
 
 #include "aicpu/dep_gen_collector_aicpu.h"
 
 #include <cstring>
 
+#include "aicpu/device_time.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
@@ -41,6 +43,9 @@ static DepGenDataHeader *s_dep_gen_header = nullptr;
 static DepGenBufferState *s_dep_gen_state = nullptr;
 static int s_orch_thread_idx = -1;  // set via dep_gen_aicpu_set_orch_thread_idx
 
+static constexpr uint64_t kDepGenQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kDepGenQueueBackpressurePollMask = 1023;  // clock is read when (++spins & mask) == 0
+
 extern "C" void set_platform_dep_gen_base(uint64_t dep_gen_data_base) { g_platform_dep_gen_base = dep_gen_data_base; }
 
 extern "C" uint64_t get_platform_dep_gen_base() { return g_platform_dep_gen_base; }
@@ -56,26 +61,74 @@ void dep_gen_aicpu_set_orch_thread_idx(int thread_idx) { s_orch_thread_idx = thr
 // ---------------------------------------------------------------------------
 
 static int enqueue_dep_gen_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) {
-    if (s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+    if (s_dep_gen_header == nullptr || s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
         return -1;
     }
     int q = s_orch_thread_idx;
     uint32_t capacity = PLATFORM_DEP_GEN_READYQUEUE_SIZE;
-    uint32_t current_tail = s_dep_gen_header->queue_tails[q];
-    uint32_t current_head = s_dep_gen_header->queue_heads[q];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
+    const uint64_t start = get_sys_cnt_aicpu();  // reference point for the bounded wait below
+    uint32_t spins = 0;
+
+    do {  // spin until the ring has a free slot or the wait budget is spent
+        current_tail = s_dep_gen_header->queue_tails[q];
+        current_head = s_dep_gen_header->queue_heads[q];
+        uint32_t next_tail = (current_tail + 1) % capacity;
+        if (next_tail != current_head) {
+            break;  // advancing the tail would not collide with the head: room for one entry
+        }
+        if ((++spins & kDepGenQueueBackpressurePollMask) == 0 &&  // read the clock only every (mask + 1) spins
+            get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) {
+            return -1;  // queue still full after the wait budget elapsed
+        }
+    } while (true);
 
     uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
-        return -1;  // Queue full
-    }
-
     s_dep_gen_header->queues[q][current_tail].instance_index = 0;
     s_dep_gen_header->queues[q][current_tail].buffer_ptr = buffer_ptr;
     s_dep_gen_header->queues[q][current_tail].buffer_seq = buffer_seq;
+    wmb();  // publish: entry fields visible before the tail advance
     s_dep_gen_header->queue_tails[q] = next_tail;
     return 0;
 }
 
+static DepGenBuffer *try_pop_dep_gen_buffer(uint32_t next_seq) {
+    if (s_dep_gen_state == nullptr) {
+        return nullptr;  // no buffer state to pop from
+    }
+    const uint64_t start = get_sys_cnt_aicpu();  // reference point for the bounded wait below
+    uint32_t spins = 0;
+    uint32_t head = 0;
+    uint32_t tail = 0;
+
+    do {  // spin until free_queue is non-empty or the wait budget is spent
+        head = s_dep_gen_state->free_queue.head;
+        tail = s_dep_gen_state->free_queue.tail;
+        if (head != tail) {
+            rmb();  // acquire: order the tail read before the buffer_ptrs read below
+            break;
+        }
+        if ((++spins & kDepGenQueueBackpressurePollMask) == 0 &&  // read the clock only every (mask + 1) spins
+            get_sys_cnt_aicpu() - start >= kDepGenQueueBackpressureWaitCycles) {
+            return nullptr;  // free_queue still empty after the wait budget elapsed
+        }
+    } while (true);
+
+    uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT];
+    s_dep_gen_state->free_queue.head = head + 1;  // the slot is consumed even if it held a null pointer
+    if (new_buf_ptr == 0) {
+        return nullptr;  // current_buf_ptr / current_buf_seq are left untouched on this path
+    }
+
+    DepGenBuffer *new_buf = reinterpret_cast<DepGenBuffer *>(new_buf_ptr);
+    new_buf->count = 0;  // the replacement buffer starts empty
+    s_dep_gen_state->current_buf_ptr = new_buf_ptr;
+    s_dep_gen_state->current_buf_seq = next_seq;  // sequence number is supplied by the caller
+    wmb();  // NOTE(review): presumably makes the new current buffer visible to the host — confirm
+    return new_buf;
+}
+
 // ---------------------------------------------------------------------------
 // Internal: switch the current buffer
 // ---------------------------------------------------------------------------
@@ -89,21 +142,6 @@ static void dep_gen_switch_buffer() {
         return;
     }
 
-    // Check free_queue before committing the full buffer
-    rmb();
-    uint32_t head = s_dep_gen_state->free_queue.head;
-    uint32_t tail = s_dep_gen_state->free_queue.tail;
-
-    if (head == tail) {
-        // No replacement buffer available — overwrite current buffer to keep
-        // the orch loop alive; account every record we drop.
-        LOG_WARN("dep_gen: no free buffer, overwriting current (dropped %u records)", full_buf->count);
-        s_dep_gen_state->dropped_record_count += full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
     uint32_t seq = s_dep_gen_state->current_buf_seq;
     int rc = enqueue_dep_gen_ready_buffer(s_dep_gen_state->current_buf_ptr, seq);
     if (rc != 0) {
@@ -114,16 +152,12 @@ static void dep_gen_switch_buffer() {
         return;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT];
-    rmb();
-    s_dep_gen_state->free_queue.head = head + 1;
-    s_dep_gen_state->current_buf_ptr = new_buf_ptr;
-    s_dep_gen_state->current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    s_dep_gen_state->current_buf_ptr = 0;
+    s_dep_gen_state->current_buf_seq = next_seq;
     wmb();
 
-    DepGenBuffer *new_buf = reinterpret_cast<DepGenBuffer *>(new_buf_ptr);
-    new_buf->count = 0;
+    (void)try_pop_dep_gen_buffer(next_seq);
 }
 
 // ---------------------------------------------------------------------------
@@ -144,14 +178,8 @@ void dep_gen_aicpu_init() {
     uint32_t tail = s_dep_gen_state->free_queue.tail;
 
     if (head != tail) {
-        uint64_t buf_ptr = s_dep_gen_state->free_queue.buffer_ptrs[head % PLATFORM_DEP_GEN_SLOT_COUNT];
-        rmb();
-        s_dep_gen_state->free_queue.head = head + 1;
-        s_dep_gen_state->current_buf_ptr = buf_ptr;
-        s_dep_gen_state->current_buf_seq = 0;
-        wmb();
-        DepGenBuffer *buf = reinterpret_cast<DepGenBuffer *>(buf_ptr);
-        buf->count = 0;
+        (void)try_pop_dep_gen_buffer(0);
+        uint64_t buf_ptr = s_dep_gen_state->current_buf_ptr;
         LOG_INFO_V0("dep_gen: popped initial buffer addr=0x%lx", buf_ptr);
     } else {
         LOG_ERROR("dep_gen: free_queue empty during init");
@@ -180,9 +208,13 @@ void dep_gen_aicpu_record_submit(
     rmb();
     uint64_t cur_ptr = s_dep_gen_state->current_buf_ptr;
     if (cur_ptr == 0) {
-        s_dep_gen_state->dropped_record_count += 1;
-        wmb();
-        return;
+        DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq);
+        if (recovered == nullptr) {
+            s_dep_gen_state->dropped_record_count += 1;
+            wmb();
+            return;
+        }
+        cur_ptr = s_dep_gen_state->current_buf_ptr;
     }
     DepGenBuffer *buf = reinterpret_cast<DepGenBuffer *>(cur_ptr);
 
@@ -205,9 +237,13 @@ void dep_gen_aicpu_record_submit(
         rmb();
         cur_ptr = s_dep_gen_state->current_buf_ptr;
         if (cur_ptr == 0) {
-            s_dep_gen_state->dropped_record_count += 1;
-            wmb();
-            return;
+            DepGenBuffer *recovered = try_pop_dep_gen_buffer(s_dep_gen_state->current_buf_seq);
+            if (recovered == nullptr) {
+                s_dep_gen_state->dropped_record_count += 1;
+                wmb();
+                return;
+            }
+            cur_ptr = s_dep_gen_state->current_buf_ptr;
         }
         buf = reinterpret_cast<DepGenBuffer *>(cur_ptr);
         local_count = buf->count;  // refresh after switch — new buffer starts at 0
diff --git a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
index 5ed92cd61..0d030eb2e 100644
--- a/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
+++ b/src/a5/platform/shared/aicpu/l2_swimlane_collector_aicpu.cpp
@@ -108,6 +108,59 @@ extern "C" uint64_t get_platform_l2_swimlane_aicore_rotation_table() {
 }
 L2SwimlaneLevel get_l2_swimlane_level() { return g_l2_swimlane_level; }
 
+static constexpr uint64_t kL2SwimlaneQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kL2SwimlaneQueueBackpressurePollMask = 1023;
+
+static bool
+wait_for_ready_queue_space(L2SwimlaneDataHeader *header, int thread_idx, uint32_t *tail_out, uint32_t *head_out) {
+    // Reject a missing header or an out-of-range per-thread queue index.
+    if (header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+        return false;
+    }
+    const uint32_t ring_size = PLATFORM_PROF_READYQUEUE_SIZE;
+    const uint64_t wait_begin = get_sys_cnt_aicpu();
+
+    for (uint32_t polls = 1;; ++polls) {
+        const uint32_t tail_now = header->queue_tails[thread_idx];
+        const uint32_t head_now = header->queue_heads[thread_idx];
+        // The ring is full when advancing the tail would land on the head.
+        if ((tail_now + 1) % ring_size != head_now) {
+            *tail_out = tail_now;
+            *head_out = head_now;
+            return true;
+        }
+        // Sample the system counter only every (mask + 1) polls.
+        const bool sample_clock = (polls & kL2SwimlaneQueueBackpressurePollMask) == 0;
+        if (sample_clock && get_sys_cnt_aicpu() - wait_begin >= kL2SwimlaneQueueBackpressureWaitCycles) {
+            return false;
+        }
+    }
+}
+
+static bool wait_for_free_queue_entry(L2SwimlaneFreeQueue *free_queue, uint32_t *head_out, uint32_t *tail_out) {
+    if (free_queue == nullptr) {
+        return false;
+    }
+    const uint64_t wait_begin = get_sys_cnt_aicpu();
+
+    for (uint32_t polls = 1;; ++polls) {
+        const uint32_t head_now = free_queue->head;
+        const uint32_t tail_now = free_queue->tail;
+        // head != tail is the "entry available" condition; report the snapshot to the caller.
+        if (head_now != tail_now) {
+            *head_out = head_now;
+            *tail_out = tail_now;
+            rmb();  // acquire: order the tail read above before the caller's buffer_ptrs read
+            return true;
+        }
+        // Sample the system counter only every (mask + 1) polls.
+        const bool sample_clock = (polls & kL2SwimlaneQueueBackpressurePollMask) == 0;
+        if (sample_clock && get_sys_cnt_aicpu() - wait_begin >= kL2SwimlaneQueueBackpressureWaitCycles) {
+            return false;
+        }
+    }
+}
+
 /**
  * Enqueue ready buffer to per-thread queue
  *
@@ -124,24 +177,50 @@ static int enqueue_ready_buffer(
     L2SwimlaneBufferKind kind
 ) {
     uint32_t capacity = PLATFORM_PROF_READYQUEUE_SIZE;
-    uint32_t current_tail = header->queue_tails[thread_idx];
-    uint32_t current_head = header->queue_heads[thread_idx];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
 
-    // Check if queue is full
-    uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
+    if (!wait_for_ready_queue_space(header, thread_idx, &current_tail, &current_head)) {
         return -1;
     }
+    uint32_t next_tail = (current_tail + 1) % capacity;
 
     header->queues[thread_idx][current_tail].core_index = core_index;
     header->queues[thread_idx][current_tail].kind = kind;
     header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
+    wmb();  // publish: entry fields visible before the tail advance
     header->queue_tails[thread_idx] = next_tail;
 
     return 0;
 }
 
+static L2SwimlaneAicpuTaskBuffer *
+try_pop_records_buffer(int core_id, L2SwimlaneAicpuTaskPool *state, uint32_t next_seq) {
+    uint32_t head = 0;  // head/tail snapshot of the free queue, filled in by the wait below
+    uint32_t tail = 0;
+    if (!wait_for_free_queue_entry(&state->free_queue, &head, &tail)) {
+        return nullptr;  // no free-queue entry became available before the wait gave up
+    }
+    // Pop: read the slot, then advance head. A zero entry is consumed as well and reported as nullptr.
+    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
+    rmb();
+    state->free_queue.head = head + 1;
+    if (new_buf_ptr == 0) {
+        return nullptr;
+    }
+    // Reset the record count before the buffer is installed as the current one.
+    auto *new_buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(new_buf_ptr);
+    new_buf->count = 0;
+    wmb();
+    // Install: shared pointer + sequence (next_seq) and the per-core entry in s_current_aicpu_task_buffers.
+    state->head.current_buf_ptr = new_buf_ptr;
+    state->head.current_buf_seq = next_seq;
+    s_current_aicpu_task_buffers[core_id] = new_buf;
+    wmb();
+    return new_buf;
+}
+
 void l2_swimlane_aicpu_init(int worker_count) {
     // Reset cross-launch state up front. AICPU statics persist across launches
     // on the same loaded .so; without this reset, an enabled→disabled launch
@@ -280,47 +359,34 @@ static void switch_records_buffer(int core_id, int thread_idx) {
 
     LOG_INFO_V0("Thread %d: Core %d buffer is full (count=%u)", thread_idx, core_id, full_buf->count);
 
-    // Check free_queue before committing the full buffer
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-
-    if (head == tail) {
-        // No replacement buffer available — overwrite current buffer to keep AICore alive
-        LOG_WARN("Thread %d: Core %d no free buffer, overwriting current buffer (data lost)", thread_idx, core_id);
-        state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
-    // Enqueue full buffer to ReadyQueue
     uint32_t seq = state->head.current_buf_seq;
+    uint64_t full_buf_ptr = state->head.current_buf_ptr;
     int rc = enqueue_ready_buffer(
-        s_l2_swimlane_header, thread_idx, core_id, state->head.current_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
+        s_l2_swimlane_header, thread_idx, core_id, full_buf_ptr, seq, L2SwimlaneBufferKind::AicpuTask
     );
     if (rc != 0) {
         LOG_ERROR("Thread %d: Core %d failed to enqueue buffer (queue full), data lost!", thread_idx, core_id);
-        // Revert: discard data and keep writing
         state->head.dropped_record_count = state->head.dropped_record_count + full_buf->count;
         full_buf->count = 0;
         wmb();
         return;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
-    rmb();
-    state->free_queue.head = head + 1;
-    state->head.current_buf_ptr = new_buf_ptr;
-    state->head.current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    state->head.current_buf_ptr = 0;
+    state->head.current_buf_seq = next_seq;
+    s_current_aicpu_task_buffers[core_id] = nullptr;
     wmb();
 
-    L2SwimlaneAicpuTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(new_buf_ptr);
-    new_buf->count = 0;
-    s_current_aicpu_task_buffers[core_id] = new_buf;
+    L2SwimlaneAicpuTaskBuffer *new_buf = try_pop_records_buffer(core_id, state, next_seq);
+    if (new_buf == nullptr) {
+        return;
+    }
 
-    LOG_INFO_V0("Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr);
+    LOG_INFO_V0(
+        "Thread %d: Core %d switched to new buffer (addr=0x%lx)", thread_idx, core_id,
+        reinterpret_cast<uint64_t>(new_buf)
+    );
 }
 
 // Try to rotate the AICore buffer for `core_id`. Called from the completion
@@ -338,10 +404,9 @@ static void aicore_rotate(int core_id, int thread_idx) {
     uint64_t old_buf_ptr = ac_state->head.current_buf_ptr;
     uint32_t seq = ac_state->head.current_buf_seq;
 
-    rmb();
-    uint32_t head = ac_state->free_queue.head;
-    uint32_t tail = ac_state->free_queue.tail;
-    if (head == tail) {
+    uint32_t head = 0;
+    uint32_t tail = 0;
+    if (!wait_for_free_queue_entry(&ac_state->free_queue, &head, &tail)) {
         // No replacement available — AICore continues to write into the old
         // buffer; its slot counter will hit BUFFER_SIZE and the slot guard
         // silently drops further records. We deliberately do NOT bump
@@ -362,6 +427,16 @@ static void aicore_rotate(int core_id, int thread_idx) {
         return;
     }
 
+    uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
+    rmb();
+    if (new_buf_ptr == 0) {
+        LOG_WARN(
+            "Thread %d: Core %d AICore free_queue returned a null buffer at rotation; keeping old buffer active",
+            thread_idx, core_id
+        );
+        return;
+    }
+
     // Enqueue the just-filled AICore buffer with count = BUFFER_SIZE.
     if (old_buf_ptr != 0) {
         L2SwimlaneAicoreTaskBuffer *old_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(old_buf_ptr);
@@ -393,8 +468,6 @@ static void aicore_rotate(int core_id, int thread_idx) {
     // detect rotation, then reads head.current_buf_ptr. Write ptr first so
     // AICore can never see a new seq with a stale ptr. new_buf->count=0 must
     // also be visible before AICore's slot writes begin.
-    uint64_t new_buf_ptr = ac_state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
-    rmb();
     ac_state->free_queue.head = head + 1;
     L2SwimlaneAicoreTaskBuffer *new_buf = reinterpret_cast<L2SwimlaneAicoreTaskBuffer *>(new_buf_ptr);
     new_buf->count = 0;
@@ -461,10 +534,14 @@ int l2_swimlane_aicpu_complete_task(
 
     L2SwimlaneAicpuTaskBuffer *l2_swimlane_buf = s_current_aicpu_task_buffers[core_id];
     if (l2_swimlane_buf == nullptr) {
-        // No active records buffer (init ran out of free buffers); count as drop
-        // so host reconciliation stays consistent.
-        state->head.dropped_record_count += 1;
-        return -1;
+        l2_swimlane_buf = try_pop_records_buffer(core_id, state, state->head.current_buf_seq);
+        if (l2_swimlane_buf == nullptr) {
+            // No active records buffer (init ran out of free buffers or host has
+            // not refilled after the last published full buffer); count as drop
+            // so host reconciliation stays consistent.
+            state->head.dropped_record_count += 1;
+            return -1;
+        }
     }
     uint32_t count = l2_swimlane_buf->count;
     if (count >= PLATFORM_PROF_BUFFER_SIZE) {
@@ -721,19 +798,22 @@ static void switch_phase_buffer_kind(
         );
         state->head.dropped_record_count += full_buf->count;
         full_buf->count = 0;
-        *current_buf_out = nullptr;
-        state->head.current_buf_ptr = 0;
         wmb();
         return;
     }
 
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-    if (head != tail) {
+    uint32_t head = 0;
+    uint32_t tail = 0;
+    if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) {
         uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
         rmb();
         state->free_queue.head = head + 1;
+        if (new_buf_ptr == 0) {
+            *current_buf_out = nullptr;
+            state->head.current_buf_ptr = 0;
+            wmb();
+            return;
+        }
         state->head.current_buf_ptr = new_buf_ptr;
         state->head.current_buf_seq = seq + 1;
         wmb();
@@ -764,13 +844,15 @@ static Record *acquire_phase_slot(
 ) {
     Buffer *buf = *current_buf_out;
     if (buf == nullptr) {
-        rmb();
-        uint32_t head = state->free_queue.head;
-        uint32_t tail = state->free_queue.tail;
-        if (head != tail) {
+        uint32_t head = 0;
+        uint32_t tail = 0;
+        if (wait_for_free_queue_entry(&state->free_queue, &head, &tail)) {
             uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PROF_SLOT_COUNT];
             rmb();
             state->free_queue.head = head + 1;
+            if (buf_ptr == 0) {
+                return nullptr;
+            }
             state->head.current_buf_ptr = buf_ptr;
             state->head.current_buf_seq += 1;
             wmb();
diff --git a/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp b/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp
index b8477e9ff..6c6a215d3 100644
--- a/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp
+++ b/src/a5/platform/shared/aicpu/pmu_collector_aicpu.cpp
@@ -16,8 +16,9 @@
  * Buffer switching mirrors a2a3 pmu_collector_aicpu.cpp:
  *   - SPSC free_queue: Host pushes free PmuBuffers, AICPU pops when switching.
  *   - Per-thread ready_queue: AICPU enqueues full buffers for host collection.
- *   - On free_queue empty or ready_queue full: overwrite current buffer (data lost,
- *     same policy as a2a3 — avoids blocking the AICPU dispatch loop).
+ *   - Full buffers are published before AICPU tries to recover a replacement.
+ *     If recovery is delayed, later records are counted as dropped until host
+ *     replenishes free_queue.
  *
  * a5-specific: AICore reads PMU MMIO itself (via ld_dev) and writes the
  * snapshot into a per-core stable PmuAicoreRing
@@ -31,6 +32,7 @@
 
 #include <cstring>
 
+#include "aicpu/device_time.h"
 #include "aicpu/platform_regs.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
@@ -58,6 +60,9 @@ static PmuAicoreRing *s_pmu_aicore_rings[PLATFORM_MAX_CORES];
 // Populated by pmu_aicpu_init(); 0 means "no PMU for this core" (sim).
 static uint64_t s_pmu_reg_addrs[PLATFORM_MAX_CORES] = {0};
 
+static constexpr uint64_t kPmuQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kPmuQueueBackpressurePollMask = 1023;
+
 extern "C" void set_platform_pmu_base(uint64_t pmu_data_base) { g_platform_pmu_base = pmu_data_base; }
 
 extern "C" uint64_t get_platform_pmu_base() { return g_platform_pmu_base; }
@@ -107,22 +112,74 @@ static void pmu_stop(uint64_t reg_base, uint32_t saved_ctrl0, uint32_t saved_ctr
 // ---------------------------------------------------------------------------
 
 static int enqueue_pmu_ready_buffer(int thread_idx, uint32_t core_index, uint64_t buffer_ptr, uint32_t buffer_seq) {
+    if (s_pmu_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+        return -1;
+    }
     uint32_t capacity = PLATFORM_PMU_READYQUEUE_SIZE;
-    uint32_t current_tail = s_pmu_header->queue_tails[thread_idx];
-    uint32_t current_head = s_pmu_header->queue_heads[thread_idx];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
+    const uint64_t start = get_sys_cnt_aicpu();
+    uint32_t spins = 0;
+
+    do {
+        current_tail = s_pmu_header->queue_tails[thread_idx];
+        current_head = s_pmu_header->queue_heads[thread_idx];
+        uint32_t next_tail = (current_tail + 1) % capacity;
+        if (next_tail != current_head) {
+            break;
+        }
+        if ((++spins & kPmuQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) {
+            return -1;
+        }
+    } while (true);
 
     uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
-        return -1;  // Queue full
-    }
-
     s_pmu_header->queues[thread_idx][current_tail].core_index = core_index;
     s_pmu_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     s_pmu_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
+    wmb();  // publish: entry fields visible before the tail advance
     s_pmu_header->queue_tails[thread_idx] = next_tail;
     return 0;
 }
 
+static PmuBuffer *try_pop_pmu_buffer(int core_id, PmuBufferState *state, uint32_t next_seq) {
+    (void)core_id;  // not used by the pop itself
+    if (state == nullptr) {
+        return nullptr;
+    }
+    const uint64_t start = get_sys_cnt_aicpu();
+    uint32_t spins = 0;
+    uint32_t head = 0;
+    // Wait (bounded by kPmuQueueBackpressureWaitCycles) for the free queue to hold an entry; nullptr on timeout.
+    do {
+        head = state->free_queue.head;
+        const uint32_t tail = state->free_queue.tail;
+        if (head != tail) {
+            rmb();  // acquire: order the tail read before the buffer_ptrs read below
+            break;
+        }
+        if ((++spins & kPmuQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kPmuQueueBackpressureWaitCycles) {
+            return nullptr;
+        }
+    } while (true);
+
+    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT];
+    rmb();  // the slot must be read before head advances and the producer may reuse it
+    state->free_queue.head = head + 1;  // a zero entry is consumed too and reported as nullptr
+    if (new_buf_ptr == 0) {
+        return nullptr;
+    }
+    PmuBuffer *new_buf = reinterpret_cast<PmuBuffer *>(new_buf_ptr);
+    new_buf->count = 0;
+    wmb();  // count reset must be visible before the buffer is published as current
+    state->current_buf_ptr = new_buf_ptr;
+    state->current_buf_seq = next_seq;
+    wmb();
+    return new_buf;
+}
+
 // ---------------------------------------------------------------------------
 // Internal: switch the current buffer for one core (called from
 // complete_record when records[count] hits PLATFORM_PMU_RECORDS_PER_BUFFER)
@@ -139,20 +196,6 @@ static void pmu_switch_buffer(int core_id, int thread_idx) {
         return;
     }
 
-    // Check free_queue before committing the full buffer
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-
-    if (head == tail) {
-        // No replacement buffer available — overwrite current buffer to keep AICPU alive
-        LOG_WARN("Thread %d: Core %d no free PMU buffer, overwriting current buffer (data lost)", thread_idx, core_id);
-        state->dropped_record_count += full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
     // Enqueue full buffer to ready_queue
     uint32_t seq = state->current_buf_seq;
     int rc = enqueue_pmu_ready_buffer(thread_idx, static_cast<uint32_t>(core_id), state->current_buf_ptr, seq);
@@ -166,19 +209,20 @@ static void pmu_switch_buffer(int core_id, int thread_idx) {
         return;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT];
-    rmb();
-    state->free_queue.head = head + 1;
-    state->current_buf_ptr = new_buf_ptr;
-    state->current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    state->current_buf_ptr = 0;
+    state->current_buf_seq = next_seq;
     wmb();
 
-    PmuBuffer *new_buf = reinterpret_cast<PmuBuffer *>(new_buf_ptr);
-    new_buf->count = 0;
-    wmb();
+    PmuBuffer *new_buf = try_pop_pmu_buffer(core_id, state, next_seq);
+    if (new_buf == nullptr) {
+        return;
+    }
 
-    LOG_INFO_V0("Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id, new_buf_ptr);
+    LOG_INFO_V0(
+        "Thread %d: Core %d switched to new PMU buffer (addr=0x%lx)", thread_idx, core_id,
+        reinterpret_cast<uint64_t>(new_buf)
+    );
 }
 
 // ---------------------------------------------------------------------------
@@ -244,16 +288,8 @@ void pmu_aicpu_init(const uint32_t *physical_core_ids, int num_cores) {
         uint32_t tail = state->free_queue.tail;
 
         if (head != tail) {
-            uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_PMU_SLOT_COUNT];
-            rmb();
-            state->free_queue.head = head + 1;
-            state->current_buf_ptr = buf_ptr;
-            state->current_buf_seq = 0;
-            wmb();
-
-            PmuBuffer *buf = reinterpret_cast<PmuBuffer *>(buf_ptr);
-            buf->count = 0;
-
+            (void)try_pop_pmu_buffer(i, state, 0);
+            uint64_t buf_ptr = state->current_buf_ptr;
             LOG_DEBUG("Core %d: popped initial PMU buffer (addr=0x%lx)", i, buf_ptr);
         } else {
             LOG_ERROR("Core %d: PMU free_queue is empty during init!", i);
@@ -300,12 +336,18 @@ void pmu_aicpu_complete_record(
 
     rmb();
     uint64_t cur_ptr = state->current_buf_ptr;
+    PmuBuffer *buf = nullptr;
     if (cur_ptr == 0) {
-        state->dropped_record_count += 1;
-        wmb();
-        return;
+        buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq);
+        if (buf == nullptr) {
+            state->dropped_record_count += 1;
+            wmb();
+            return;
+        }
+        cur_ptr = state->current_buf_ptr;
+    } else {
+        buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
     }
-    PmuBuffer *buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
 
     // Switch buffer if full (internal — ring address is unchanged)
     if (buf->count >= static_cast<uint32_t>(PLATFORM_PMU_RECORDS_PER_BUFFER)) {
@@ -313,11 +355,16 @@ void pmu_aicpu_complete_record(
         rmb();
         cur_ptr = state->current_buf_ptr;
         if (cur_ptr == 0) {
-            state->dropped_record_count += 1;
-            wmb();
-            return;
+            buf = try_pop_pmu_buffer(core_id, state, state->current_buf_seq);
+            if (buf == nullptr) {
+                state->dropped_record_count += 1;
+                wmb();
+                return;
+            }
+            cur_ptr = state->current_buf_ptr;
+        } else {
+            buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
         }
-        buf = reinterpret_cast<PmuBuffer *>(cur_ptr);
     }
 
     uint32_t idx = buf->count;
diff --git a/src/a5/platform/shared/host/l2_swimlane_collector.cpp b/src/a5/platform/shared/host/l2_swimlane_collector.cpp
index 752d5fd18..148c881f5 100644
--- a/src/a5/platform/shared/host/l2_swimlane_collector.cpp
+++ b/src/a5/platform/shared/host/l2_swimlane_collector.cpp
@@ -75,9 +75,9 @@ int L2SwimlaneCollector::initialize(
     aicpu_thread_num_ = aicpu_thread_num;
     l2_swimlane_level_ = l2_swimlane_level;
     output_prefix_ = output_prefix;
-    total_perf_collected_ = 0;
-    total_sched_phase_collected_ = 0;
-    total_orch_phase_collected_ = 0;
+    total_perf_collected_.store(0, std::memory_order_relaxed);
+    total_sched_phase_collected_.store(0, std::memory_order_relaxed);
+    total_orch_phase_collected_.store(0, std::memory_order_relaxed);
 
     // Stash the memory context on the base up-front so alloc_paired_buffer
     // sees consistent values during init. shm_host_ stays nullptr until the
@@ -378,10 +378,11 @@ void L2SwimlaneCollector::copy_perf_buffer(const ReadyBufferInfo &info) {
     }
     uint32_t core_index = info.index;
     if (core_index < static_cast<uint32_t>(num_aicore_)) {
+        std::scoped_lock<std::mutex> lock(perf_record_mutexes_[core_index]);
         for (uint32_t i = 0; i < count; i++) {
             collected_perf_records_[core_index].push_back(buf->records[i]);
         }
-        total_perf_collected_ += count;
+        total_perf_collected_.fetch_add(count, std::memory_order_relaxed);
     }
 }
 
@@ -394,12 +395,13 @@ void L2SwimlaneCollector::copy_sched_phase_buffer(const ReadyBufferInfo &info) {
     }
     uint32_t tidx = info.index;
     if (tidx < collected_sched_phase_records_.size()) {
+        std::scoped_lock<std::mutex> lock(sched_phase_record_mutexes_[tidx]);
         for (uint32_t i = 0; i < count; i++) {
             collected_sched_phase_records_[tidx].push_back(buf->records[i]);
         }
-        total_sched_phase_collected_ += count;
+        total_sched_phase_collected_.fetch_add(count, std::memory_order_relaxed);
         if (count > 0) {
-            has_phase_data_ = true;
+            has_phase_data_.store(true, std::memory_order_relaxed);
         }
     }
 }
@@ -413,12 +415,13 @@ void L2SwimlaneCollector::copy_orch_phase_buffer(const ReadyBufferInfo &info) {
     }
     uint32_t tidx = info.index;
     if (tidx < collected_orch_phase_records_.size()) {
+        std::scoped_lock<std::mutex> lock(orch_phase_record_mutexes_[tidx]);
         for (uint32_t i = 0; i < count; i++) {
             collected_orch_phase_records_[tidx].push_back(buf->records[i]);
         }
-        total_orch_phase_collected_ += count;
+        total_orch_phase_collected_.fetch_add(count, std::memory_order_relaxed);
         if (count > 0) {
-            has_phase_data_ = true;
+            has_phase_data_.store(true, std::memory_order_relaxed);
         }
     }
 }
@@ -453,16 +456,19 @@ void L2SwimlaneCollector::copy_aicore_buffer(const ReadyBufferInfo &info) {
     if (count > static_cast<uint32_t>(PLATFORM_AICORE_BUFFER_SIZE)) {
         count = PLATFORM_AICORE_BUFFER_SIZE;
     }
-    auto &dst = collected_aicore_records_[core_index];
-    dst.reserve(dst.size() + count);
     uint32_t skipped = 0;
-    for (uint32_t i = 0; i < count; i++) {
-        const L2SwimlaneAicoreTaskRecord &r = buf->records[i];
-        if (r.start_time == 0) {
-            skipped++;
-            continue;
+    {
+        std::scoped_lock<std::mutex> lock(aicore_record_mutexes_[core_index]);
+        auto &dst = collected_aicore_records_[core_index];
+        dst.reserve(dst.size() + count);
+        for (uint32_t i = 0; i < count; i++) {
+            const L2SwimlaneAicoreTaskRecord &r = buf->records[i];
+            if (r.start_time == 0) {
+                skipped++;
+                continue;
+            }
+            dst.push_back(r);
         }
-        dst.push_back(r);
     }
     if (skipped > 0) {
         LOG_WARN(
@@ -566,8 +572,7 @@ void L2SwimlaneCollector::reconcile_counters() {
 
         if (dropped_device > 0) {
             LOG_WARN(
-                "L2Swimlane reconcile: %lu %s records dropped on device side (buffer full / "
-                "ready_queue full).",
+                "L2Swimlane reconcile: %lu %s records dropped on device side.",
                 static_cast<unsigned long>(dropped_device), kind
             );
         }
@@ -603,7 +608,7 @@ void L2SwimlaneCollector::reconcile_counters() {
         [](void *host_ptr) {
             return reinterpret_cast<L2SwimlaneAicpuTaskBuffer *>(host_ptr)->count;
         },
-        sizeof(L2SwimlaneAicpuTaskBuffer), total_perf_collected_, /*optional=*/false
+        sizeof(L2SwimlaneAicpuTaskBuffer), total_perf_collected_.load(std::memory_order_relaxed), /*optional=*/false
     );
 
     reconcile_one(
@@ -614,7 +619,8 @@ void L2SwimlaneCollector::reconcile_counters() {
         [](void *host_ptr) {
             return reinterpret_cast<L2SwimlaneAicpuSchedPhaseBuffer *>(host_ptr)->count;
         },
-        sizeof(L2SwimlaneAicpuSchedPhaseBuffer), total_sched_phase_collected_, /*optional=*/true
+        sizeof(L2SwimlaneAicpuSchedPhaseBuffer), total_sched_phase_collected_.load(std::memory_order_relaxed),
+        /*optional=*/true
     );
 
     reconcile_one(
@@ -625,7 +631,8 @@ void L2SwimlaneCollector::reconcile_counters() {
         [](void *host_ptr) {
             return reinterpret_cast<L2SwimlaneAicpuOrchPhaseBuffer *>(host_ptr)->count;
         },
-        sizeof(L2SwimlaneAicpuOrchPhaseBuffer), total_orch_phase_collected_, /*optional=*/true
+        sizeof(L2SwimlaneAicpuOrchPhaseBuffer), total_orch_phase_collected_.load(std::memory_order_relaxed),
+        /*optional=*/true
     );
 }
 
@@ -691,7 +698,10 @@ void L2SwimlaneCollector::read_phase_header_metadata() {
         LOG_INFO_V0("  Core-to-thread mapping: %d cores", num_phase_cores);
     }
 
-    LOG_INFO_V0("Phase metadata collection complete: has_phase_data=%s", has_phase_data_ ? "yes" : "no");
+    LOG_INFO_V0(
+        "Phase metadata collection complete: has_phase_data=%s",
+        has_phase_data_.load(std::memory_order_relaxed) ? "yes" : "no"
+    );
 }
 
 void L2SwimlaneCollector::set_core_types(const CoreType *types, int n) {
@@ -1036,10 +1046,10 @@ int L2SwimlaneCollector::finalize(L2SwimlaneUnregisterCallback unregister_cb, co
     collected_sched_phase_records_.clear();
     collected_orch_phase_records_.clear();
     core_to_thread_.clear();
-    has_phase_data_ = false;
-    total_perf_collected_ = 0;
-    total_sched_phase_collected_ = 0;
-    total_orch_phase_collected_ = 0;
+    has_phase_data_.store(false, std::memory_order_relaxed);
+    total_perf_collected_.store(0, std::memory_order_relaxed);
+    total_sched_phase_collected_.store(0, std::memory_order_relaxed);
+    total_orch_phase_collected_.store(0, std::memory_order_relaxed);
     clear_memory_context();
 
     LOG_DEBUG("Performance profiling cleanup complete");
diff --git a/src/common/platform/include/host/buffer_pool_manager.h b/src/common/platform/include/host/buffer_pool_manager.h
index 9eabb97ea..5156cb1e7 100644
--- a/src/common/platform/include/host/buffer_pool_manager.h
+++ b/src/common/platform/include/host/buffer_pool_manager.h
@@ -14,9 +14,9 @@
  * @brief Generic buffer-pool data structure shared by L2Swimlane, TensorDump,
  *        and PMU collectors. Owns:
  *
- *   - ready_queue (mgmt → collector) with mutex/cv,
- *   - done_queue (collector → mgmt) with mutex,
- *   - per-kind recycled-buffer pools,
+ *   - ready_queue shard(s) (mgmt → collector) with mutex/cv,
+ *   - done_queue shard(s) (collector → mgmt) with mutex,
+ *   - shard-local per-kind recycled-buffer pools,
  *   - dev↔host pointer mapping table,
  *   - alloc_and_register / free_buffer / resolve_host_ptr helpers.
  *
@@ -27,7 +27,7 @@
  * Defines the shared types used by the framework: ThreadFactory (for thread
  * creation with optional device-context binding), MemoryOps (type-erased
  * alloc/reg/free/copy callbacks), and DoneInfo (per-buffer ownership info
- * passed through done_queue).
+ * passed through done queues).
  *
  * SVM vs host-shadow (chosen at runtime by what the collector installs)
  * ---------------------------------------------------------------------
@@ -71,10 +71,12 @@
 #ifndef SRC_COMMON_PLATFORM_INCLUDE_HOST_BUFFER_POOL_MANAGER_H_
 #define SRC_COMMON_PLATFORM_INCLUDE_HOST_BUFFER_POOL_MANAGER_H_
 
+#include <array>
 #include <atomic>
 #include <chrono>
 #include <condition_variable>
 #include <cstddef>
+#include <cstdint>
 #include <cstdlib>
 #include <functional>
 #include <mutex>
@@ -128,8 +130,8 @@ struct MemoryOps {
 };
 
 /**
- * Per-buffer ownership info threaded through the done_queue so that the mgmt
- * thread, when it recycles a finished buffer, knows which per-kind pool it
+ * Per-buffer ownership info threaded through a done queue shard so that the
+ * mgmt thread, when it recycles a finished buffer, knows which per-kind pool it
  * came from.
  */
 struct DoneInfo {
@@ -137,6 +139,16 @@ struct DoneInfo {
     int kind;  // [0, Module::kBufferKinds)
 };
 
+template <typename Module, typename = void>
+struct ProfilerModuleCollectorThreadCount {
+    static constexpr int value = 1;  // primary template: used when the specialization below does not match
+};
+
+template <typename Module>
+struct ProfilerModuleCollectorThreadCount<Module, std::void_t<decltype(Module::kCollectorThreadCount)>> {
+    static constexpr int value = Module::kCollectorThreadCount;  // Module names kCollectorThreadCount: use it
+};
+
 template <typename Module>
 class BufferPoolManager {
     // Static checks for the Module concept. Required type aliases trigger
@@ -149,9 +161,12 @@ class BufferPoolManager {
 
 public:
     using ReadyBufferInfo = typename Module::ReadyBufferInfo;
+    static constexpr int kCollectorShardCount = ProfilerModuleCollectorThreadCount<Module>::value;
+    static_assert(kCollectorShardCount >= 1, "Module::kCollectorThreadCount must be >= 1");
 
     BufferPoolManager() :
-        recycled_(Module::kBufferKinds) {}
+        ready_shards_(kCollectorShardCount),
+        done_shards_(kCollectorShardCount) {}
     ~BufferPoolManager() = default;
 
     BufferPoolManager(const BufferPoolManager &) = delete;
@@ -181,7 +196,7 @@ class BufferPoolManager {
 
     /**
      * Release every device buffer the framework currently owns: recycled
-     * pools, done_queue, and ready_queue. Buffers still in the per-pool
+     * pools, done queues, and ready queues. Buffers still in the per-pool
      * free_queue or held as current_buf_ptr are NOT touched — those belong
      * to the collector and must be released by it (the AICPU may still be
      * referencing them via shared memory until execution ends).
@@ -215,23 +230,29 @@ class BufferPoolManager {
             }
         };
 
-        for (auto &pool : recycled_) {
-            for (void *p : pool)
-                release_once(p);
-            pool.clear();
+        for (auto &shard_pools : recycled_) {
+            for (auto &pool : shard_pools) {
+                for (void *p : pool)
+                    release_once(p);
+                pool.clear();
+            }
         }
         {
-            std::scoped_lock<std::mutex> lock(done_mutex_);
-            while (!done_queue_.empty()) {
-                release_once(done_queue_.front().dev_ptr);
-                done_queue_.pop();
+            for (auto &shard : done_shards_) {
+                std::scoped_lock<std::mutex> lock(shard.mutex);
+                while (!shard.queue.empty()) {
+                    release_once(shard.queue.front().dev_ptr);
+                    shard.queue.pop();
+                }
             }
         }
         {
-            std::scoped_lock<std::mutex> lock(ready_mutex_);
-            while (!ready_queue_.empty()) {
-                release_once(ready_queue_.front().dev_buffer_ptr);
-                ready_queue_.pop();
+            for (auto &shard : ready_shards_) {
+                std::scoped_lock<std::mutex> lock(shard.mutex);
+                while (!shard.queue.empty()) {
+                    release_once(shard.queue.front().dev_buffer_ptr);
+                    shard.queue.pop();
+                }
             }
         }
     }
@@ -269,15 +290,17 @@ class BufferPoolManager {
      */
     template <typename ReleaseFn>
     void release_all_owned(const ReleaseFn &release_fn) {
-        for (auto &pool : recycled_)
-            pool.clear();
-        {
-            std::scoped_lock<std::mutex> lock(done_mutex_);
-            std::queue<DoneInfo>().swap(done_queue_);
+        for (auto &shard_pools : recycled_) {
+            for (auto &pool : shard_pools)
+                pool.clear();
         }
-        {
-            std::scoped_lock<std::mutex> lock(ready_mutex_);
-            std::queue<ReadyBufferInfo>().swap(ready_queue_);
+        for (auto &shard : done_shards_) {
+            std::scoped_lock<std::mutex> lock(shard.mutex);
+            std::queue<DoneInfo>().swap(shard.queue);
+        }
+        for (auto &shard : ready_shards_) {
+            std::scoped_lock<std::mutex> lock(shard.mutex);
+            std::queue<ReadyBufferInfo>().swap(shard.queue);
         }
         for (auto &kv : dev_to_host_) {
             if (kv.first != nullptr) {
@@ -425,45 +448,50 @@ class BufferPoolManager {
     }
 
     // -------------------------------------------------------------------------
-    // ready_queue: mgmt thread pushes, collector thread pops
+    // ready_queue shards: mgmt threads push, collector threads pop
     // -------------------------------------------------------------------------
 
-    void push_to_ready(const ReadyBufferInfo &info) {
+    void push_to_ready(const ReadyBufferInfo &info, int shard_index = 0) {
+        auto &shard = ready_shards_[normalize_shard(shard_index)];
         {
-            std::scoped_lock<std::mutex> lock(ready_mutex_);
-            ready_queue_.push(info);
+            std::scoped_lock<std::mutex> lock(shard.mutex);
+            shard.queue.push(info);
         }
-        ready_cv_.notify_one();
+        shard.cv.notify_one();
     }
 
-    bool try_pop_ready(ReadyBufferInfo &out) {
-        std::scoped_lock<std::mutex> lock(ready_mutex_);
-        if (ready_queue_.empty()) return false;
-        out = ready_queue_.front();
-        ready_queue_.pop();
+    bool try_pop_ready(ReadyBufferInfo &out, int shard_index = 0) {
+        auto &shard = ready_shards_[normalize_shard(shard_index)];
+        std::scoped_lock<std::mutex> lock(shard.mutex);
+        if (shard.queue.empty()) return false;
+        out = shard.queue.front();
+        shard.queue.pop();
         return true;
     }
 
-    bool wait_pop_ready(ReadyBufferInfo &out, std::chrono::milliseconds timeout) {
-        std::unique_lock<std::mutex> lock(ready_mutex_);
-        if (!ready_cv_.wait_for(lock, timeout, [this] {
-                return !ready_queue_.empty();
+    bool wait_pop_ready(ReadyBufferInfo &out, std::chrono::milliseconds timeout, int shard_index = 0) {
+        auto &shard = ready_shards_[normalize_shard(shard_index)];
+        std::unique_lock<std::mutex> lock(shard.mutex);
+        if (!shard.cv.wait_for(lock, timeout, [&shard] {
+                return !shard.queue.empty();
             })) {
             return false;
         }
-        out = ready_queue_.front();
-        ready_queue_.pop();
+        out = shard.queue.front();
+        shard.queue.pop();
         return true;
     }
 
     // -------------------------------------------------------------------------
-    // done_queue: collector thread reports buffers it has finished copying;
-    // mgmt thread folds them back into the recycled pool of the right kind.
+    // done_queue shards: collector threads report buffers they have finished
+    // copying; mgmt folds them back into the same shard's recycled pool of the
+    // right kind.
     // -------------------------------------------------------------------------
 
-    void notify_copy_done(void *dev_ptr, int kind) {
-        std::scoped_lock<std::mutex> lock(done_mutex_);
-        done_queue_.push(DoneInfo{dev_ptr, kind});
+    void notify_copy_done(void *dev_ptr, int kind, int shard_index = 0) {
+        auto &shard = done_shards_[normalize_shard(shard_index)];
+        std::scoped_lock<std::mutex> lock(shard.mutex);
+        shard.queue.push(DoneInfo{dev_ptr, kind});
     }
 
     // -------------------------------------------------------------------------
@@ -497,7 +525,10 @@ class BufferPoolManager {
             return nullptr;
         }
         *host_ptr_out = host_ptr;
-        dev_to_host_[dev_ptr] = host_ptr;
+        {
+            std::scoped_lock<std::mutex> lock(mapping_mutex_);
+            dev_to_host_[dev_ptr] = host_ptr;
+        }
         return dev_ptr;
     }
 
@@ -508,15 +539,21 @@ class BufferPoolManager {
      */
     void free_buffer(void *dev_ptr) {
         if (dev_ptr == nullptr) return;
-        auto it = dev_to_host_.find(dev_ptr);
-        void *host_ptr = (it != dev_to_host_.end()) ? it->second : nullptr;
-        if (it != dev_to_host_.end()) {
-            dev_to_host_.erase(it);
+        void *host_ptr = nullptr;
+        bool free_host_shadow = false;
+        {
+            std::scoped_lock<std::mutex> lock(mapping_mutex_);
+            auto it = dev_to_host_.find(dev_ptr);
+            host_ptr = (it != dev_to_host_.end()) ? it->second : nullptr;
+            if (it != dev_to_host_.end()) {
+                dev_to_host_.erase(it);
+            }
+            free_host_shadow = (host_ptr != nullptr && malloc_shadows_.erase(host_ptr) > 0);
         }
         if (ops_.free_) {
             ops_.free_(dev_ptr);
         }
-        if (host_ptr != nullptr && malloc_shadows_.erase(host_ptr) > 0) {
+        if (free_host_shadow) {
             std::free(host_ptr);
         }
     }
@@ -526,6 +563,7 @@ class BufferPoolManager {
      * alloc_and_register / register_mapping time.
      */
     void *resolve_host_ptr(void *dev_ptr) {
+        std::scoped_lock<std::mutex> lock(mapping_mutex_);
         auto it = dev_to_host_.find(dev_ptr);
         if (it != dev_to_host_.end()) return it->second;
         LOG_ERROR("BufferPoolManager: no host mapping for dev_ptr=%p", dev_ptr);
@@ -537,7 +575,10 @@ class BufferPoolManager {
      * initialize() when it pre-allocates buffers and wants the mgmt thread
      * to be able to resolve them later.
      */
-    void register_mapping(void *dev_ptr, void *host_ptr) { dev_to_host_[dev_ptr] = host_ptr; }
+    void register_mapping(void *dev_ptr, void *host_ptr) {
+        std::scoped_lock<std::mutex> lock(mapping_mutex_);
+        dev_to_host_[dev_ptr] = host_ptr;
+    }
 
     /**
      * Claim ownership of a host shadow that the framework malloc'd. Only
@@ -547,6 +588,7 @@ class BufferPoolManager {
      */
     void add_malloc_shadow(void *host_ptr) {
         if (host_ptr != nullptr) {
+            std::scoped_lock<std::mutex> lock(mapping_mutex_);
             malloc_shadows_.insert(host_ptr);
         }
     }
@@ -556,36 +598,82 @@ class BufferPoolManager {
      * empty. Caller is responsible for resolving host_ptr (via
      * resolve_host_ptr) before handing the buffer back to AICPU.
      */
-    void *pop_recycled(int kind) {
-        auto &pool = recycled_[kind];
+    void *pop_recycled(int kind, int shard_index = 0) {
+        auto shard = normalize_shard(shard_index);
+        std::scoped_lock<std::mutex> lock(recycled_mutexes_[shard][kind]);
+        auto &pool = recycled_[shard][kind];
         if (pool.empty()) return nullptr;
         void *p = pool.back();
         pool.pop_back();
         return p;
     }
 
-    void push_recycled(int kind, void *dev_ptr) { recycled_[kind].push_back(dev_ptr); }
+    void *pop_recycled_any(int kind, int preferred_shard = 0) {
+        if (void *p = pop_recycled(kind, preferred_shard); p != nullptr) return p;
+        const auto preferred = normalize_shard(preferred_shard);
+        for (size_t s = 0; s < recycled_.size(); s++) {
+            if (s == preferred) continue;
+            if (void *p = pop_recycled(kind, static_cast<int>(s)); p != nullptr) return p;
+        }
+        return nullptr;
+    }
+
+    void push_recycled(int kind, void *dev_ptr, int shard_index = 0) {
+        auto shard = normalize_shard(shard_index);
+        std::scoped_lock<std::mutex> lock(recycled_mutexes_[shard][kind]);
+        recycled_[shard][kind].push_back(dev_ptr);
+    }
+
+    size_t recycled_count(int kind) const {
+        size_t total = 0;
+        for (size_t shard = 0; shard < recycled_.size(); shard++) {
+            std::scoped_lock<std::mutex> lock(recycled_mutexes_[shard][kind]);
+            total += recycled_[shard][kind].size();
+        }
+        return total;
+    }
 
     bool recycled_empty() const {
-        for (const auto &pool : recycled_) {
-            if (!pool.empty()) return false;
+        for (size_t shard = 0; shard < recycled_.size(); shard++) {
+            for (int kind = 0; kind < Module::kBufferKinds; kind++) {
+                std::scoped_lock<std::mutex> lock(recycled_mutexes_[shard][kind]);
+                if (!recycled_[shard][kind].empty()) return false;
+            }
         }
         return true;
     }
 
+    template <typename Fn>
+    decltype(auto) with_free_queue_writer(const void *queue_key, Fn &&fn) {
+        std::scoped_lock<std::mutex> lock(free_queue_mutexes_[free_queue_lock_index(queue_key)]);
+        return fn();
+    }
+
     /**
-     * Drain everything currently in done_queue back into the per-kind
+     * Drain the given done queue shard (every shard if no index is passed) into that shard's per-kind
      * recycled pool. May be called from Module::process_entry when its
      * primary recycled pool ran out, to harvest buffers the collector freed
      * in the meantime.
      */
-    void drain_done_into_recycled() {
-        std::scoped_lock<std::mutex> lock(done_mutex_);
-        while (!done_queue_.empty()) {
-            const DoneInfo &info = done_queue_.front();
-            recycled_[info.kind].push_back(info.dev_ptr);
-            done_queue_.pop();
+    size_t drain_done_into_recycled(int shard_index) {
+        auto &shard = done_shards_[normalize_shard(shard_index)];
+        size_t drained = 0;
+        std::scoped_lock<std::mutex> lock(shard.mutex);
+        while (!shard.queue.empty()) {
+            const DoneInfo &info = shard.queue.front();
+            push_recycled(info.kind, info.dev_ptr, shard_index);
+            shard.queue.pop();
+            drained++;
         }
+        return drained;
+    }
+
+    size_t drain_done_into_recycled() {
+        size_t drained = 0;
+        for (size_t shard = 0; shard < done_shards_.size(); shard++) {
+            drained += drain_done_into_recycled(static_cast<int>(shard));
+        }
+        return drained;
     }
 
     void *shared_mem_dev() const { return shared_mem_dev_; }
@@ -593,6 +681,22 @@ class BufferPoolManager {
     int device_id() const { return device_id_; }
 
 private:
+    struct ReadyQueueShard {
+        std::mutex mutex;
+        std::condition_variable cv;
+        std::queue<ReadyBufferInfo> queue;
+    };
+
+    struct DoneQueueShard {
+        std::mutex mutex;
+        std::queue<DoneInfo> queue;
+    };
+
+    static size_t normalize_shard(int shard_index) {
+        if (shard_index < 0) return 0;
+        return static_cast<size_t>(shard_index) % static_cast<size_t>(kCollectorShardCount);
+    }
+
     // Subsystem inputs (set by ProfilerBase::start via set_memory_context).
     void *shared_mem_dev_{nullptr};
     void *shared_mem_host_{nullptr};
@@ -601,13 +705,21 @@ class BufferPoolManager {
     MemoryOps ops_;
 
     // mgmt → collector
-    std::mutex ready_mutex_;
-    std::condition_variable ready_cv_;
-    std::queue<ReadyBufferInfo> ready_queue_;
+    std::vector<ReadyQueueShard> ready_shards_;
 
     // collector → mgmt
-    std::mutex done_mutex_;
-    std::queue<DoneInfo> done_queue_;
+    std::vector<DoneQueueShard> done_shards_;
+
+    // Host-side pointer mappings are shared across all collector shards.
+    mutable std::mutex mapping_mutex_;
+    static constexpr size_t kFreeQueueLockStripes = 64;
+
+    static size_t free_queue_lock_index(const void *queue_key) {
+        auto raw = reinterpret_cast<uintptr_t>(queue_key);
+        return (raw >> 6) % kFreeQueueLockStripes;
+    }
+
+    std::array<std::mutex, kFreeQueueLockStripes> free_queue_mutexes_;
 
     // dev → host mapping (single source of truth for resolve_host_ptr)
     std::unordered_map<void *, void *> dev_to_host_;
@@ -618,8 +730,9 @@ class BufferPoolManager {
     // HAL-managed mappings (halHostRegister) live outside this set.
     std::unordered_set<void *> malloc_shadows_;
 
-    // Per-kind recycled buffer pools (vector indexed by Module-defined kind id)
-    std::vector<std::vector<void *>> recycled_;
+    // Local recycled buffer pools indexed by collector shard, then Module-defined kind id.
+    std::array<std::array<std::vector<void *>, Module::kBufferKinds>, kCollectorShardCount> recycled_;
+    mutable std::array<std::array<std::mutex, Module::kBufferKinds>, kCollectorShardCount> recycled_mutexes_;
 };
 
 }  // namespace profiling_common
diff --git a/src/common/platform/include/host/profiler_base.h b/src/common/platform/include/host/profiler_base.h
index 7f46bc704..c0b22c51f 100644
--- a/src/common/platform/include/host/profiler_base.h
+++ b/src/common/platform/include/host/profiler_base.h
@@ -13,8 +13,8 @@
  * @file profiler_base.h
  * @brief CRTP scaffolding shared by L2Swimlane / Dump / PMU collectors.
  *
- * Owns the BufferPoolManager<Module>, the mgmt thread (which polls AICPU
- * ready queues and recycles buffers), and the collector poll thread.
+ * Owns the BufferPoolManager<Module>, the mgmt thread(s) that poll AICPU
+ * ready queues / recycle buffers, and the collector poll thread(s).
  *
  * Module concept contract
  * -----------------------
@@ -26,7 +26,7 @@
  *   // Types
  *   using DataHeader      = ...;   // Shared-memory header (e.g. L2SwimlaneDataHeader).
  *   using ReadyEntry      = ...;   // Per-AICPU-thread ready-queue entry.
- *   using ReadyBufferInfo = ...;   // Hand-off struct to the collector thread
+ *   using ReadyBufferInfo = ...;   // Hand-off struct to collector thread(s)
  *                                  // (carries dev/host ptrs, optional kind
  *                                  // discriminator, and the seq).
  *   using FreeQueue       = ...;   // Per-instance SPSC queue of free buffer
@@ -34,10 +34,17 @@
  *                                  // `buffer_ptrs[kSlotCount]`.
  *
  *   // Constants
- *   static constexpr int      kBufferKinds;    // L2Swimlane=2 (perf+phase), Dump=1, PMU=1.
+ *   static constexpr int      kBufferKinds;    // L2Swimlane=4, Dump=1, PMU=1.
  *   static constexpr uint32_t kReadyQueueSize; // Per-thread ready-queue depth.
  *   static constexpr uint32_t kSlotCount;      // FreeQueue::buffer_ptrs[] length.
  *   static constexpr const char* kSubsystemName; // "PMU" / "L2Swimlane" / "Dump".
+ *   // Optional: number of mgmt drain shards (defaults to 1).
+ *   static constexpr int      kMgmtDrainThreadCount;
+ *   // Optional: number of collector threads / host ready-queue shards (defaults to 1).
+ *   static constexpr int      kCollectorThreadCount;
+ *   // Optional: refresh cached queue metadata before a replenish pass.
+ *   template <typename Mgr>
+ *   static void refresh_replenish_metadata(Mgr&, DataHeader*);
  *
  *   // Header pointer cast (host_ptr → DataHeader*)
  *   static DataHeader* header_from_shm(void* shared_mem_host);
@@ -66,16 +73,15 @@
  * Alloc policy
  * ------------
  *
- *   process_entry          replenishes the originating free_queue with EXACTLY
- *                          one buffer per call, matching the 1-in / 1-out
- *                          ratio against the entry the AICPU just produced.
- *                          Single allocation when both recycled and done are
- *                          dry; bounds the per-tick latency.
+ *   process_entry          replenishes the originating free_queue from the
+ *                          current drain shard's local recycled pool until
+ *                          the free_queue is full or no buffer is available.
  *   proactive_replenish    fills to kSlotCount across all instances of every
  *                          kind. When recycled drains it batch-allocates
  *                          `batch_size(kind)` buffers at once to amortize the
- *                          allocator cost — recovery from a double-empty
- *                          condition takes one tick instead of N.
+ *                          allocator cost. Split-mgmt collectors use this
+ *                          only before threads start; runtime replenish only
+ *                          drains collector-done buffers into local pools.
  *
  * The above two algorithms live in ProfilerAlgorithms<Module>; Module only
  * supplies the data-access traits above. Implementors must NOT zero `count`
@@ -89,17 +95,16 @@
  *      start(tf) becomes a no-op (shm_host_ stays nullptr).
  *   2. start(tf) — atomically: (a) assembles a MemoryOps from the stashed
  *      callbacks, (b) hands it to the manager via set_memory_context,
- *      (c) launches the mgmt thread, (d) launches the poll thread. Mgmt is
- *      started before poll because mgmt is the only writer to L2 (the
- *      ready_queue) and poll is its sole consumer.
+ *      (c) launches the mgmt thread(s), (d) launches the collector thread(s).
+ *      Mgmt is started before collectors because mgmt is the only writer to
+ *      the host ready queue shard(s) and collectors are their consumers.
  *   3. ... device execution ...
  *   4. stop() — atomically:
- *        a) flips mgmt_running_, joins the mgmt thread; the mgmt thread's
+ *        a) flips mgmt_running_, joins the mgmt thread(s); the drain thread's
  *           final-drain pass pushes the last L1→L2 entries before exiting.
- *        b) execution_complete_ is set; the poll loop sees it on its next
- *           idle tick, drains L2 (which now contains mgmt's final-drain
- *           output), and exits.
- *        c) collector thread joined.
+ *        b) execution_complete_ is set; each collector loop sees it on its
+ *           next idle tick, drains its host ready queue shard, and exits.
+ *        c) collector thread(s) joined.
  *      Caller is then guaranteed L1 and L2 are both empty and all collected
  *      data has been delivered to Derived::on_buffer_collected.
  *
@@ -108,18 +113,17 @@
  *
  *   - Collectors on platforms without SVM (a5: no halHostRegister) install
  *     `copy_to_device` / `copy_from_device` in MemoryOps so every device
- *     read/write goes through rtMemcpy (onboard) or memcpy (sim). The
- *     mgmt_loop then pulls the device-side shared-memory region into the
- *     host shadow at the top of every tick (`mirror_shm_from_device`) and
- *     pushes the few host-modified fields (`queue_heads[q]` after pop,
- *     `free_queue.tail` + `buffer_ptrs[]` after refill) back as narrow
- *     `write_range_to_device` writes. The bulk `mirror_shm_to_device` is
- *     intentionally NOT called from mgmt_loop: it raced with AICPU writes
- *     to device-only fields (current_buf_ptr, total/dropped/mismatch
- *     counters, queue_tails, free_queue.head, and on a5
- *     L2SwimlaneAicpuPhaseHeader::magic) and rolled them back to the
- *     host-shadow values mirrored in at the top of the tick. Buffer
- *     contents are mirrored on demand inside ProfilerAlgorithms.
+ *     read/write goes through rtMemcpy (onboard) or memcpy (sim). The mgmt
+ *     drain threads refresh only their own queue indices and the popped
+ *     entry from device (narrow `read_range_from_device`) and push the few
+ *     host-modified fields (`queue_heads[q]` after pop, `free_queue.tail` +
+ *     `buffer_ptrs[]` after refill) back as narrow `write_range_to_device`
+ *     writes. The bulk `mirror_shm_to_device` is intentionally NOT called:
+ *     it would race with AICPU writes to device-only fields (current_buf_ptr,
+ *     total/dropped/mismatch counters, queue_tails, free_queue.head, and on
+ *     a5 L2SwimlaneAicpuPhaseHeader::magic) and roll them back to stale
+ *     host-shadow values. Buffer contents are mirrored on demand inside
+ *     ProfilerAlgorithms.
  *   - On these platforms `reg` always allocates a paired host shadow; the
  *     framework never falls back to identity-mapping (which would be wrong
  *     without SVM). Collectors pass nullptr-safe callbacks via
@@ -159,6 +163,7 @@
 #include <functional>
 #include <optional>
 #include <thread>
+#include <type_traits>
 #include <utility>
 #include <vector>
 
@@ -170,6 +175,16 @@
 
 namespace profiling_common {
 
+template <typename Module, typename = void>
+struct ProfilerModuleDrainThreadCount {
+    static constexpr int value = 1;
+};
+
+template <typename Module>
+struct ProfilerModuleDrainThreadCount<Module, std::void_t<decltype(Module::kMgmtDrainThreadCount)>> {
+    static constexpr int value = Module::kMgmtDrainThreadCount;
+};
+
 // Common subsystem callback signatures. All four collectors (PMU / TensorDump
 // / L2Swimlane / DepGen) used to declare their own typedefs with identical
 // shapes; these are the canonical types stashed in ProfilerBase via
@@ -307,7 +322,16 @@ struct ProfilerAlgorithms {
     // entry with `read_range_from_device` and skip the pop if the refreshed
     // entry still looks empty — try again next tick.
     template <typename Mgr>
-    static bool try_pop_aicpu_entry(Mgr &mgr, DataHeader *header, int q, ReadyEntry &out) {
+    static bool
+    try_pop_aicpu_entry(Mgr &mgr, DataHeader *header, int q, ReadyEntry &out, bool refresh_indices = false) {
+        if (refresh_indices) {
+            if (mgr.read_range_from_device(&header->queue_heads[q], sizeof(header->queue_heads[q])) != 0 ||
+                mgr.read_range_from_device(&header->queue_tails[q], sizeof(header->queue_tails[q])) != 0) {
+                LOG_ERROR("%s: failed to refresh ready_queue indices for thread %d", Module::kSubsystemName, q);
+                return false;
+            }
+            rmb();
+        }
         uint32_t head = header->queue_heads[q];
         uint32_t tail = header->queue_tails[q];
         if (head >= Module::kReadyQueueSize || tail >= Module::kReadyQueueSize) {
@@ -326,26 +350,33 @@ struct ProfilerAlgorithms {
         // race described above. If the entry's `buffer_ptr` is still 0 the
         // producer hasn't finished publishing — treat the queue as empty
         // for this tick.
-        mgr.read_range_from_device(&header->queues[q][head], sizeof(header->queues[q][head]));
+        if (mgr.read_range_from_device(&header->queues[q][head], sizeof(header->queues[q][head])) != 0) {
+            LOG_ERROR("%s: failed to refresh ready_queue entry for thread %d", Module::kSubsystemName, q);
+            return false;
+        }
         rmb();
         out = header->queues[q][head];
         if (out.buffer_ptr == 0) {
             return false;
         }
-        head = (head + 1) % Module::kReadyQueueSize;
-        header->queue_heads[q] = head;
+        uint32_t old_head = head;
+        uint32_t next_head = (head + 1) % Module::kReadyQueueSize;
+        header->queue_heads[q] = next_head;
         wmb();
         // Push the new head value back to device. The bulk mirror_shm_to_device
         // is intentionally not used here — see buffer_pool_manager.h.
-        mgr.write_range_to_device(&header->queue_heads[q], sizeof(header->queue_heads[q]));
+        if (mgr.write_range_to_device(&header->queue_heads[q], sizeof(header->queue_heads[q])) != 0) {
+            header->queue_heads[q] = old_head;
+            LOG_ERROR("%s: failed to advance ready_queue head for thread %d", Module::kSubsystemName, q);
+            return false;
+        }
         return true;
     }
 
-    // Refill the originating pool's free_queue with exactly one buffer
-    // (recycled → drain done → alloc), then push the popped buffer's
-    // ReadyBufferInfo to the collector LAST. Skips the push if host_ptr
-    // resolution fails — handing a null pointer to on_buffer_collected
-    // would crash the collector thread.
+    // Refill the originating pool's free_queue from this drain shard's local
+    // recycled pool, then push the popped buffer's ReadyBufferInfo to the
+    // collector LAST. Skips the push if host_ptr resolution fails — handing a
+    // null pointer to on_buffer_collected would crash the collector thread.
     //
     // a5 specifics: after resolving the popped buffer's host shadow, copy
     // the buffer contents from device to host before delivery. The host
@@ -356,11 +387,6 @@ struct ProfilerAlgorithms {
         if (!site_opt.has_value()) return;
         auto &site = *site_opt;
 
-        void *new_dev = obtain_buffer(mgr, site.kind, site.buffer_size);
-        if (new_dev != nullptr) {
-            push_to_free_queue(mgr, *site.free_queue, new_dev);
-        }
-
         site.info.host_buffer_ptr = mgr.resolve_host_ptr(site.info.dev_buffer_ptr);
         if (site.info.host_buffer_ptr == nullptr) {
             // resolve_host_ptr already logged. Drop rather than deliver null.
@@ -368,34 +394,66 @@ struct ProfilerAlgorithms {
         }
         // a5: pull buffer contents from device into the host shadow before
         // the collector reads `count` and `records[]`.
-        mgr.copy_buffer_from_device(site.info.host_buffer_ptr, site.info.dev_buffer_ptr, site.buffer_size);
+        if (mgr.copy_buffer_from_device(site.info.host_buffer_ptr, site.info.dev_buffer_ptr, site.buffer_size) != 0) {
+            LOG_ERROR(
+                "%s: failed to copy ready buffer from device (kind=%d, thread=%d)", Module::kSubsystemName, site.kind, q
+            );
+            return;
+        }
+
+        (void)top_up_free_queue(mgr, site.kind, *site.free_queue, site.buffer_size, q);
 
-        mgr.push_to_ready(site.info);
+        mgr.push_to_ready(site.info, q);
     }
 
-    // Drain done_queue into recycled, then top up every (kind, instance)
-    // free_queue to kSlotCount. When the recycled pool of a given kind drains
-    // mid-fill, batch-allocate `batch_size(kind)` buffers and continue.
+    // Drain done_queue into local recycled pools, then top up every (kind,
+    // instance) free_queue to kSlotCount. Split-mgmt collectors call this only
+    // before threads start; their runtime replenish loop only drains done.
     template <typename Mgr>
-    static void proactive_replenish(Mgr &mgr, DataHeader *header) {
+    static uint64_t proactive_replenish(Mgr &mgr, DataHeader *header) {
         mgr.drain_done_into_recycled();
+        return replenish_free_queues(mgr, header);
+    }
+
+    template <typename Mgr>
+    static uint64_t replenish_free_queues(Mgr &mgr, DataHeader *header) {
+        uint64_t pushed = 0;
+        refresh_replenish_metadata(mgr, header, 0);
         Module::for_each_instance(mgr.shared_mem_host(), header, [&](int kind, FreeQueue *fq, size_t buf_size) {
-            top_up_free_queue(mgr, kind, *fq, buf_size);
+            pushed += top_up_free_queue(mgr, kind, *fq, buf_size);
         });
+        return pushed;
     }
 
 private:
-    // Three-level fallback used by process_entry's 1-in/1-out replenish.
+    template <typename Mgr, typename M = Module>
+    static auto refresh_replenish_metadata(Mgr &mgr, DataHeader *header, int)
+        -> decltype(M::refresh_replenish_metadata(mgr, header), void()) {
+        M::refresh_replenish_metadata(mgr, header);
+    }
+
+    template <typename Mgr>
+    static void refresh_replenish_metadata(Mgr &, DataHeader *, long) {}
+
+    // Drain-shard top-up fallback: local recycled → shard done queue → any shard's recycled → batch alloc.
     template <typename Mgr>
-    static void *obtain_buffer(Mgr &mgr, int kind, size_t buf_size) {
-        void *p = mgr.pop_recycled(kind);
+    static void *obtain_buffer(Mgr &mgr, int kind, size_t buf_size, int shard_index) {
+        void *p = mgr.pop_recycled(kind, shard_index);
         if (p != nullptr) return p;
-        mgr.drain_done_into_recycled();
-        p = mgr.pop_recycled(kind);
+        mgr.drain_done_into_recycled(shard_index);
+        p = mgr.pop_recycled(kind, shard_index);
+        if (p != nullptr) return p;
+        p = mgr.pop_recycled_any(kind, shard_index);
         if (p != nullptr) return p;
 
-        void *host_ptr = nullptr;
-        p = mgr.alloc_and_register(buf_size, &host_ptr);
+        const int batch = Module::batch_size(kind);
+        for (int i = 0; i < batch; i++) {
+            void *host_ptr = nullptr;
+            void *dev = mgr.alloc_and_register(buf_size, &host_ptr);
+            if (dev == nullptr) break;
+            mgr.push_recycled(kind, dev, shard_index);
+        }
+        p = mgr.pop_recycled(kind, shard_index);
         if (p == nullptr) {
             LOG_WARN(
                 "%s: alloc failed for %zu bytes (kind=%d) — increase BUFFERS_PER_* to reduce drops",
@@ -405,9 +463,9 @@ struct ProfilerAlgorithms {
         return p;
     }
 
-    // Append one buffer pointer to a per-instance free_queue. Caller owns
-    // the "queue is not full" guarantee (process_entry: 1-in/1-out;
-    // top_up_free_queue: explicit fq_used < kSlotCount).
+    // Append one buffer pointer to a per-instance free_queue if it has
+    // capacity. The manager serializes host writers so split drain shards and
+    // non-split/proactive refill paths never race on free_queue.tail.
     //
     // a5: write the new slot and the advanced tail back to device via
     // `write_range_to_device` so AICPU sees the refill without us bulk
@@ -415,50 +473,67 @@ struct ProfilerAlgorithms {
     // written before the tail so AICPU never observes a tail update without
     // the corresponding pointer.
     template <typename Mgr>
-    static void push_to_free_queue(Mgr &mgr, FreeQueue &fq, void *dev_ptr) {
-        uint32_t fq_tail = fq.tail;
-        uint32_t slot_idx = fq_tail % Module::kSlotCount;
-        fq.buffer_ptrs[slot_idx] = reinterpret_cast<uint64_t>(dev_ptr);
-        wmb();
-        mgr.write_range_to_device(&fq.buffer_ptrs[slot_idx], sizeof(fq.buffer_ptrs[slot_idx]));
-        fq.tail = fq_tail + 1;
-        wmb();
-        mgr.write_range_to_device(&fq.tail, sizeof(fq.tail));
+    static bool try_push_to_free_queue(Mgr &mgr, FreeQueue &fq, void *dev_ptr) {
+        return mgr.with_free_queue_writer(&fq, [&]() {
+            if (mgr.read_range_from_device(&fq.head, sizeof(fq.head)) != 0) {
+                LOG_ERROR("%s: failed to refresh free_queue head", Module::kSubsystemName);
+                return false;
+            }
+            rmb();
+            uint32_t fq_head = fq.head;
+            uint32_t fq_tail = fq.tail;
+            if (fq_tail - fq_head >= Module::kSlotCount) {
+                return false;
+            }
+            uint32_t slot_idx = fq_tail % Module::kSlotCount;
+            uint64_t old_slot = fq.buffer_ptrs[slot_idx];
+            fq.buffer_ptrs[slot_idx] = reinterpret_cast<uint64_t>(dev_ptr);
+            wmb();
+            if (mgr.write_range_to_device(&fq.buffer_ptrs[slot_idx], sizeof(fq.buffer_ptrs[slot_idx])) != 0) {
+                fq.buffer_ptrs[slot_idx] = old_slot;
+                LOG_ERROR("%s: failed to publish free_queue slot", Module::kSubsystemName);
+                return false;
+            }
+            fq.tail = fq_tail + 1;
+            wmb();
+            if (mgr.write_range_to_device(&fq.tail, sizeof(fq.tail)) != 0) {
+                fq.tail = fq_tail;
+                fq.buffer_ptrs[slot_idx] = old_slot;
+                LOG_ERROR("%s: failed to publish free_queue tail", Module::kSubsystemName);
+                return false;
+            }
+            return true;
+        });
     }
 
-    // Fill one (kind, instance) free_queue to kSlotCount, batch-allocating
-    // when the recycled pool of this kind drains mid-fill.
     template <typename Mgr>
-    static void top_up_free_queue(Mgr &mgr, int kind, FreeQueue &fq, size_t buf_size) {
-        rmb();
-        uint32_t fq_head = fq.head;
-        uint32_t fq_tail = fq.tail;
-        uint32_t fq_used = fq_tail - fq_head;
-
-        while (fq_used < Module::kSlotCount) {
-            void *new_dev = mgr.pop_recycled(kind);
-            if (new_dev == nullptr) {
-                const int batch = Module::batch_size(kind);
-                for (int i = 0; i < batch; i++) {
-                    void *host_ptr = nullptr;
-                    void *dev = mgr.alloc_and_register(buf_size, &host_ptr);
-                    if (dev == nullptr) break;
-                    mgr.push_recycled(kind, dev);
-                }
-                new_dev = mgr.pop_recycled(kind);
+    static bool free_queue_has_space(Mgr &mgr, FreeQueue &fq) {
+        return mgr.with_free_queue_writer(&fq, [&]() {
+            if (mgr.read_range_from_device(&fq.head, sizeof(fq.head)) != 0) {
+                LOG_ERROR("%s: failed to refresh free_queue head", Module::kSubsystemName);
+                return false;
             }
-            if (new_dev == nullptr) return;
+            rmb();
+            return fq.tail - fq.head < Module::kSlotCount;
+        });
+    }
 
-            uint32_t slot_idx = fq_tail % Module::kSlotCount;
-            fq.buffer_ptrs[slot_idx] = reinterpret_cast<uint64_t>(new_dev);
-            wmb();
-            mgr.write_range_to_device(&fq.buffer_ptrs[slot_idx], sizeof(fq.buffer_ptrs[slot_idx]));
-            fq_tail++;
-            fq.tail = fq_tail;
-            wmb();
-            mgr.write_range_to_device(&fq.tail, sizeof(fq.tail));
-            fq_used++;
+    // Fill one (kind, instance) free_queue to kSlotCount from one drain
+    // shard's local recycled pool, batch-allocating when that shard is dry.
+    template <typename Mgr>
+    static uint64_t top_up_free_queue(Mgr &mgr, int kind, FreeQueue &fq, size_t buf_size, int shard_index = 0) {
+        uint64_t pushed = 0;
+
+        while (free_queue_has_space(mgr, fq)) {
+            void *new_dev = obtain_buffer(mgr, kind, buf_size, shard_index);
+            if (new_dev == nullptr) return pushed;
+            if (!try_push_to_free_queue(mgr, fq, new_dev)) {
+                mgr.push_recycled(kind, new_dev, shard_index);
+                return pushed;
+            }
+            pushed++;
         }
+        return pushed;
     }
 };
 
@@ -530,12 +605,12 @@ class ProfilerBase {
 
     /**
      * Assemble a MemoryOps from the callbacks stashed by set_memory_context()
-     * and launch the mgmt + poll threads. If shm_host_ is nullptr (Derived's
+     * and launch the mgmt + collector threads. If shm_host_ is nullptr (Derived's
      * init() aborted before set_memory_context, or finalize() has cleared
      * the context) this is a no-op.
      *
-     * Order matters: mgmt is started before poll because mgmt is the only
-     * writer to L2 (the ready_queue) and poll is its sole consumer. The
+     * Order matters: mgmt is started before collectors because mgmt is the
+     * only writer to L2 (the ready queues) and collectors are the consumers. The
      * register slot defaults to identity on the SVM path (copy_to_device_
      * is null) or to a host-shadow malloc lambda on the non-SVM path
      * (copy_to_device_ installed) — so BufferPoolManager always has a
@@ -588,29 +663,73 @@ class ProfilerBase {
         ops.copy_from_device = copy_from_device_;
         manager_.set_memory_context(std::move(ops), shm_dev_, shm_host_, shm_size_, device_id_);
 
+        execution_complete_.store(false, std::memory_order_release);
+        {
+            DataHeader *header = Module::header_from_shm(manager_.shared_mem_host());
+            (void)ProfilerAlgorithms<Module>::proactive_replenish(manager_, header);
+        }
+
         mgmt_running_.store(true, std::memory_order_release);
-        if (thread_factory) {
-            mgmt_thread_ = thread_factory([this]() {
-                mgmt_loop();
-            });
-        } else {
-            mgmt_thread_ = std::thread(&ProfilerBase::mgmt_loop, this);
+        {
+            constexpr int kDrainThreads = ProfilerModuleDrainThreadCount<Module>::value;
+            static_assert(kDrainThreads >= 1, "kMgmtDrainThreadCount must be >= 1");
+            if constexpr (kDrainThreads == 1) {
+                if (thread_factory) {
+                    mgmt_thread_ = thread_factory([this]() {
+                        mgmt_drain_loop(0, 1);
+                    });
+                } else {
+                    mgmt_thread_ = std::thread(&ProfilerBase::mgmt_drain_loop, this, 0, 1);
+                }
+            } else {
+                mgmt_drain_threads_.reserve(kDrainThreads);
+                for (int i = 0; i < kDrainThreads; i++) {
+                    if (thread_factory) {
+                        mgmt_drain_threads_.push_back(thread_factory([this, i]() {
+                            mgmt_drain_loop(i, kDrainThreads);
+                        }));
+                    } else {
+                        mgmt_drain_threads_.emplace_back(&ProfilerBase::mgmt_drain_loop, this, i, kDrainThreads);
+                    }
+                }
+            }
+            if (thread_factory) {
+                mgmt_replenish_thread_ = thread_factory([this]() {
+                    mgmt_replenish_loop();
+                });
+            } else {
+                mgmt_replenish_thread_ = std::thread(&ProfilerBase::mgmt_replenish_loop, this);
+            }
         }
 
-        execution_complete_.store(false, std::memory_order_release);
-        if (thread_factory) {
-            collector_thread_ = thread_factory([this]() {
-                poll_and_collect_loop();
-            });
+        constexpr int kCollectorThreads = ProfilerModuleCollectorThreadCount<Module>::value;
+        static_assert(kCollectorThreads >= 1, "kCollectorThreadCount must be >= 1");
+        if constexpr (kCollectorThreads == 1) {
+            if (thread_factory) {
+                collector_thread_ = thread_factory([this]() {
+                    poll_and_collect_loop(0, 1);
+                });
+            } else {
+                collector_thread_ = std::thread(&ProfilerBase::poll_and_collect_loop, this, 0, 1);
+            }
         } else {
-            collector_thread_ = std::thread(&ProfilerBase::poll_and_collect_loop, this);
+            collector_threads_.reserve(kCollectorThreads);
+            for (int i = 0; i < kCollectorThreads; i++) {
+                if (thread_factory) {
+                    collector_threads_.push_back(thread_factory([this, i]() {
+                        poll_and_collect_loop(i, kCollectorThreads);
+                    }));
+                } else {
+                    collector_threads_.emplace_back(&ProfilerBase::poll_and_collect_loop, this, i, kCollectorThreads);
+                }
+            }
         }
     }
 
     /**
      * Stop the mgmt thread, drain whatever it pushes during its final pass,
      * and join the collector. Idempotent. Caller is guaranteed on return
-     * that mgmt's L1 ringbuffer and the host-side L2 ready_queue are both
+     * that mgmt's L1 ringbuffer and the host-side ready queue shard(s) are
      * empty and Derived::on_buffer_collected has been called for every
      * entry that was in either queue. Framework-owned buffers are NOT freed
      * here — Derived's finalize() must do that.
@@ -624,10 +743,25 @@ class ProfilerBase {
         if (mgmt_thread_.joinable()) {
             mgmt_thread_.join();
         }
+        for (auto &thread : mgmt_drain_threads_) {
+            if (thread.joinable()) {
+                thread.join();
+            }
+        }
+        mgmt_drain_threads_.clear();
+        if (mgmt_replenish_thread_.joinable()) {
+            mgmt_replenish_thread_.join();
+        }
         execution_complete_.store(true, std::memory_order_release);
         if (collector_thread_.joinable()) {
             collector_thread_.join();
         }
+        for (auto &thread : collector_threads_) {
+            if (thread.joinable()) {
+                thread.join();
+            }
+        }
+        collector_threads_.clear();
     }
 
     Manager &manager() { return manager_; }
@@ -637,6 +771,7 @@ class ProfilerBase {
     Manager manager_;
     std::atomic<bool> execution_complete_{false};
     std::thread collector_thread_;
+    std::vector<std::thread> collector_threads_;
 
     // Memory context stashed by Derived::init() via set_memory_context().
     // Derived may read these from finalize() / alloc helpers via the
@@ -744,100 +879,89 @@ class ProfilerBase {
     }
 
 private:
-    /**
-     * mgmt thread main loop. Each tick:
-     *   0) Mirror the device-side shared-memory region (DataHeader + all
-     *      BufferStates) into the host shadow so subsequent reads see the
-     *      latest queue_tails / current_buf_ptr / per-state counters.
-     *   1) Drain done_queue into recycled pools.
-     *   2) Iterate AICPU per-thread ready queues (PLATFORM_MAX_AICPU_THREADS
-     *      upper bound; empty queues are O(1) head==tail checks) and call
-     *      Module::process_entry per entry. process_entry pulls each
-     *      popped buffer's contents from device on demand.
-     *      try_pop_aicpu_entry / push_to_free_queue write the few host-modified
-     *      fields (queue_heads[q], free_queue.tail/buffer_ptrs[]) back to
-     *      device immediately via `write_range_to_device`.
-     *   3) Call Module::proactive_replenish to top up any depleted free
-     *      queues.
-     *   4) Sleep 10 us if no work was done.
-     *
-     * The bulk `mirror_shm_to_device` deliberately is NOT called: it races
-     * with AICPU writes to device-only fields (current_buf_ptr, total/dropped/
-     * mismatch counters, queue_tails, free_queue.head, core_to_thread[],
-     * and on a5 L2SwimlaneAicpuPhaseHeader::magic) and rolls them back to
-     * whatever was mirrored in at the start of the tick. Each host-side
-     * modification is written back as a narrow field write inside Alg.
-     *
-     * On exit (mgmt_running_ → false) it does one final drain pass without
-     * sleeping to flush any straggler entries the device pushed before
-     * stopping.
-     */
-    void mgmt_loop() {
+    void mgmt_drain_loop(int queue_start, int queue_stride) {  // Polls every queue_stride-th queue from queue_start.
         DataHeader *header = Module::header_from_shm(manager_.shared_mem_host());
         using Alg = ProfilerAlgorithms<Module>;
+        constexpr int kIdleBusyPollLoops = 64;  // Empty passes to busy-poll before sleeping.
+        int idle_busy_polls = 0;  // Consecutive empty passes seen so far.
 
-        while (mgmt_running_.load(std::memory_order_acquire)) {
-            manager_.mirror_shm_from_device();
-
-            manager_.drain_done_into_recycled();
-
+        while (mgmt_running_.load(std::memory_order_relaxed)) {  // Loop until mgmt_running_ reads false.
             bool found_any = false;
-            for (int q = 0; q < PLATFORM_MAX_AICPU_THREADS; q++) {
+            for (int q = queue_start; q < PLATFORM_MAX_AICPU_THREADS; q += queue_stride) {
                 ReadyEntry entry;
-                while (Alg::try_pop_aicpu_entry(manager_, header, q, entry)) {
+                while (Alg::try_pop_aicpu_entry(manager_, header, q, entry, true)) {  // TODO confirm what `true` selects
                     Alg::process_entry(manager_, header, q, entry);
                     found_any = true;
                 }
             }
-
-            Alg::proactive_replenish(manager_, header);
+            if (found_any) {
+                idle_busy_polls = 0;  // Work arrived: restart the spin budget.
+            }
 
             if (!found_any) {
-                std::this_thread::sleep_for(std::chrono::microseconds(10));
+                if (idle_busy_polls < kIdleBusyPollLoops) {  // Still within the spin budget: retry immediately.
+                    idle_busy_polls++;
+                } else {
+                    std::this_thread::sleep_for(std::chrono::microseconds(10));  // Budget spent: back off.
+                }
             }
         }
 
-        // Final drain after mgmt_running_ flipped: don't sleep, don't
-        // replenish. try_pop_aicpu_entry still pushes the advanced
-        // queue_heads back to device per-pop.
-        manager_.mirror_shm_from_device();
-        for (int q = 0; q < PLATFORM_MAX_AICPU_THREADS; q++) {
+        for (int q = queue_start; q < PLATFORM_MAX_AICPU_THREADS; q += queue_stride) {
             ReadyEntry entry;
-            while (Alg::try_pop_aicpu_entry(manager_, header, q, entry)) {
+            while (Alg::try_pop_aicpu_entry(manager_, header, q, entry, true)) {  // Final drain after the flag flips.
                 Alg::process_entry(manager_, header, q, entry);
             }
         }
     }
 
+    void mgmt_replenish_loop() {
+        // Drain the done queue into the recycled pools until mgmt_running_ clears; nap 10 us after an empty pass.
+        while (mgmt_running_.load(std::memory_order_relaxed)) {
+            if (manager_.drain_done_into_recycled() != 0) {
+                continue;
+            }
+            std::this_thread::sleep_for(std::chrono::microseconds(10));
+        }
+    }
+
     /**
-     * Main collector loop. Blocks on the manager's ready_queue with a 100 ms
+     * Main collector loop. Blocks on one manager ready-queue shard with a 100 ms
      * cv-wait tick. On each hit it dispatches the buffer to Derived via
      * on_buffer_collected() and recycles the buffer. Exits in two cases:
      *
-     *   1. execution_complete_ was set (by stop()) and the ready_queue is
+     *   1. execution_complete_ was set (by stop()) and this ready_queue shard is
      *      empty, after a final non-blocking drain pass.
      *   2. No buffer arrived for `Derived::kIdleTimeoutSec` consecutive
      *      seconds AND execution_complete_ has not been signalled — this
-     *      is a hang detector that logs an error and bails out.
+     *      is a hang detector that logs an error and bails out. Multi-shard
+     *      collectors arm this only after a shard has seen traffic, because
+     *      an empty shard can be a valid run shape.
      */
-    void poll_and_collect_loop() {
+    void poll_and_collect_loop(int shard_index, int shard_count) {
         const auto wait_tick = std::chrono::milliseconds(100);
         const auto idle_timeout = std::chrono::seconds(Derived::kIdleTimeoutSec);
         std::optional<std::chrono::steady_clock::time_point> idle_start;
+        bool has_seen_buffer = false;
 
         while (true) {
             ReadyBufferInfo info;
-            if (manager_.wait_pop_ready(info, wait_tick)) {
-                consume(info);
+            if (manager_.wait_pop_ready(info, wait_tick, shard_index)) {
+                consume(info, shard_index);
+                has_seen_buffer = true;
                 idle_start.reset();
                 continue;
             }
             if (execution_complete_.load(std::memory_order_acquire)) {
-                while (manager_.try_pop_ready(info)) {
-                    consume(info);
+                while (manager_.try_pop_ready(info, shard_index)) {
+                    consume(info, shard_index);
+                    has_seen_buffer = true;
                 }
                 break;
             }
+            if (shard_count > 1 && !has_seen_buffer) {
+                continue;
+            }
             if (!idle_start.has_value()) {
                 idle_start = std::chrono::steady_clock::now();
             }
@@ -851,16 +975,18 @@ class ProfilerBase {
         }
     }
 
-    void consume(const ReadyBufferInfo &info) {
+    void consume(const ReadyBufferInfo &info, int shard_index) {  // Deliver one buffer to Derived, then ack it.
         static_cast<Derived *>(this)->on_buffer_collected(info);
         if constexpr (Module::kBufferKinds > 1) {
-            manager_.notify_copy_done(info.dev_buffer_ptr, Module::kind_of(info));
+            manager_.notify_copy_done(info.dev_buffer_ptr, Module::kind_of(info), shard_index);  // per-buffer kind
         } else {
-            manager_.notify_copy_done(info.dev_buffer_ptr, 0);
+            manager_.notify_copy_done(info.dev_buffer_ptr, 0, shard_index);  // single-kind module: kind is 0
         }
     }
 
     std::thread mgmt_thread_;
+    std::vector<std::thread> mgmt_drain_threads_;
+    std::thread mgmt_replenish_thread_;
     std::atomic<bool> mgmt_running_{false};
 };
 
diff --git a/src/common/platform/include/host/scope_stats_collector.h b/src/common/platform/include/host/scope_stats_collector.h
index 583830294..21041be29 100644
--- a/src/common/platform/include/host/scope_stats_collector.h
+++ b/src/common/platform/include/host/scope_stats_collector.h
@@ -14,10 +14,10 @@
  * @brief Host-side scope_stats streaming collector + NDJSON export.
  *
  * Architecture mirrors PmuCollector: BufferPoolManager<ScopeStatsModule> runs
- * the mgmt thread (polls the per-thread ready queue, recycles buffers, refills
- * the single instance's free_queue); ScopeStatsCollector's poll thread appends
- * each full buffer's ScopeStatsRecords to an in-memory vector. After stop(),
- * write_jsonl() renders them to
+ * split mgmt threads (poll per-thread ready queues, recycle buffers, refill the
+ * single instance's free_queue); ScopeStatsCollector's collector thread shards
+ * append each full buffer's ScopeStatsRecords to an in-memory vector. After
+ * stop(), write_jsonl() renders them to
  * <output_dir>/scope_stats/scope_stats.jsonl.
  *
  * Memory mirroring is handled by the framework via the MemoryOps installed
@@ -31,7 +31,7 @@
  * Lifecycle:
  *   init()               — Allocate header + 1 BufferState + N ScopeStatsBuffers
  *                          (pre-fills free_queue; surplus → recycled pool).
- *   start(tf)            — Inherited: launches mgmt + poll threads.
+ *   start(tf)            — Inherited: launches mgmt + collector threads.
  *   [device execution]
  *   stop()               — Inherited: drain queues, join threads.
  *   reconcile_counters() — Recover any un-flushed current buffer left by an
@@ -89,6 +89,8 @@ struct ScopeStatsModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_SCOPE_STATS_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_SCOPE_STATS_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "ScopeStatsModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     static constexpr int batch_size(int /*kind*/) {
         constexpr int kBatch = PLATFORM_SCOPE_STATS_BUFFERS_PER_INSTANCE - PLATFORM_SCOPE_STATS_SLOT_COUNT;
@@ -98,7 +100,18 @@ struct ScopeStatsModule {
     static DataHeader *header_from_shm(void *shm) { return get_scope_stats_header(shm); }
 
     static std::optional<profiling_common::EntrySite<ScopeStatsModule>>
-    resolve_entry(void *shm, DataHeader * /*header*/, int q, const ReadyEntry &entry) {
+    resolve_entry(void *shm, DataHeader *header, int q, const ReadyEntry &entry) {
+        if (shm == nullptr || header == nullptr) {
+            LOG_ERROR("ScopeStatsModule: invalid shared memory/header while resolving ready entry");
+            return std::nullopt;
+        }
+        if (header->num_instances != 1 || entry.instance_index >= header->num_instances) {
+            LOG_ERROR(
+                "ScopeStatsModule: invalid ready entry instance=%u (num_instances=%u)", entry.instance_index,
+                header->num_instances
+            );
+            return std::nullopt;
+        }
         ScopeStatsBufferState *state = get_scope_stats_buffer_state(shm, static_cast<int>(entry.instance_index));
         profiling_common::EntrySite<ScopeStatsModule> site;
         site.kind = 0;
diff --git a/src/common/platform/include/host/tensor_dump_collector.h b/src/common/platform/include/host/tensor_dump_collector.h
index 67343fa80..e8f649a00 100644
--- a/src/common/platform/include/host/tensor_dump_collector.h
+++ b/src/common/platform/include/host/tensor_dump_collector.h
@@ -14,9 +14,9 @@
  * @brief Host-side tensor dump collector with independent shared memory.
  *
  * Architecture:
- * - BufferPoolManager<DumpModule>: shared mgmt-thread infrastructure that
- *   polls per-thread DumpReadyQueues, replenishes free_queues, and hands
- *   full DumpMetaBuffers off to the collector thread.
+ * - BufferPoolManager<DumpModule>: shared split-mgmt infrastructure that
+ *   polls per-thread ready queues, replenishes free_queues, and hands
+ *   full DumpMetaBuffers off to collector thread shards.
  * - TensorDumpCollector: copies tensor metadata + arena bytes into host
  *   vectors and writes the result to disk (.bin + JSON).
  *
@@ -86,6 +86,8 @@ struct DumpModule {
     static constexpr uint32_t kReadyQueueSize = PLATFORM_DUMP_READYQUEUE_SIZE;
     static constexpr uint32_t kSlotCount = PLATFORM_DUMP_SLOT_COUNT;
     static constexpr const char *kSubsystemName = "DumpModule";
+    static constexpr int kMgmtDrainThreadCount = PLATFORM_MAX_AICPU_THREADS;
+    static constexpr int kCollectorThreadCount = PLATFORM_MAX_AICPU_THREADS;
 
     /**
      * Tensor-dump bursts can be very large; the batch is sized so a fully
@@ -100,7 +102,19 @@ struct DumpModule {
     static DataHeader *header_from_shm(void *shm) { return get_dump_header(shm); }
 
     static std::optional<profiling_common::EntrySite<DumpModule>>
-    resolve_entry(void *shm, DataHeader * /*header*/, int /*q*/, const ReadyEntry &entry) {
+    resolve_entry(void *shm, DataHeader *header, int /*q*/, const ReadyEntry &entry) {
+        if (shm == nullptr || header == nullptr) {
+            LOG_ERROR("DumpModule: invalid shared memory/header while resolving ready entry");
+            return std::nullopt;
+        }
+        if (entry.thread_index >= header->num_dump_threads ||
+            entry.thread_index >= static_cast<uint32_t>(PLATFORM_MAX_AICPU_THREADS)) {
+            LOG_ERROR(
+                "DumpModule: invalid ready entry thread=%u (num_dump_threads=%u, max=%u)", entry.thread_index,
+                header->num_dump_threads, static_cast<uint32_t>(PLATFORM_MAX_AICPU_THREADS)
+            );
+            return std::nullopt;
+        }
         DumpBufferState *state = get_dump_buffer_state(shm, static_cast<int>(entry.thread_index));
         profiling_common::EntrySite<DumpModule> site;
         site.kind = 0;
@@ -296,6 +310,7 @@ class TensorDumpCollector : public profiling_common::ProfilerBase<TensorDumpColl
 
     // Writer thread: streams arg payloads to a single args.bin
     std::thread writer_thread_;
+    std::mutex collector_state_mutex_;
     std::mutex write_mutex_;
     std::condition_variable write_cv_;
     std::queue<DumpedTensor> write_queue_;
diff --git a/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp b/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp
index c7a8410bc..496d2773e 100644
--- a/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp
+++ b/src/common/platform/shared/aicpu/scope_stats_collector_aicpu.cpp
@@ -21,6 +21,7 @@
 #include "aicpu/scope_stats_collector_aicpu.h"
 #include <cstring>
 
+#include "aicpu/device_time.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "common/scope_stats.h"
@@ -44,6 +45,9 @@ static int s_orch_thread_idx = -1;  // set via scope_stats_aicpu_set_orch_thread
 // unroll_heap_offset). Reset in set_platform_scope_stats_base.
 static uint64_t s_heap_wraps[PTO2_SCOPE_STATS_MAX_RING_DEPTH][2] = {};
 
+static constexpr uint64_t kScopeStatsQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kScopeStatsQueueBackpressurePollMask = 1023;
+
 namespace {
 
 const char *s_pending_site_file = nullptr;
@@ -89,19 +93,30 @@ inline void copy_basename(char (&dst)[32], const char *src) {
 // Enqueue a full buffer onto the orchestrator thread's ready_queue. Returns 0
 // on success, -1 if the queue is full or the orch thread index is unset.
 int enqueue_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) {
-    if (s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+    if (s_scope_stats_header == nullptr || s_orch_thread_idx < 0 || s_orch_thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
         return -1;
     }
     int q = s_orch_thread_idx;
     uint32_t capacity = PLATFORM_SCOPE_STATS_READYQUEUE_SIZE;
-    uint32_t current_tail = s_scope_stats_header->queue_tails[q];
-    uint32_t current_head = s_scope_stats_header->queue_heads[q];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
+    const uint64_t start = get_sys_cnt_aicpu();
+    uint32_t spins = 0;
+
+    do {
+        current_tail = s_scope_stats_header->queue_tails[q];
+        current_head = s_scope_stats_header->queue_heads[q];
+        uint32_t next_tail = (current_tail + 1) % capacity;
+        if (next_tail != current_head) {
+            break;
+        }
+        if ((++spins & kScopeStatsQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kScopeStatsQueueBackpressureWaitCycles) {
+            return -1;
+        }
+    } while (true);
 
     uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
-        return -1;  // Queue full
-    }
-
     s_scope_stats_header->queues[q][current_tail].instance_index = 0;
     s_scope_stats_header->queues[q][current_tail].buffer_ptr = buffer_ptr;
     s_scope_stats_header->queues[q][current_tail].buffer_seq = buffer_seq;
@@ -115,23 +130,42 @@ int enqueue_ready_buffer(uint64_t buffer_ptr, uint32_t buffer_seq) {
 
 // Pop a free buffer into current_buf_ptr. Returns true if one was available.
 bool pop_free_buffer() {
-    rmb();
-    uint32_t head = s_scope_stats_state->free_queue.head;
-    uint32_t tail = s_scope_stats_state->free_queue.tail;
-    if (head == tail) {
-        return false;
-    }
+    if (s_scope_stats_state == nullptr) return false;
+    const uint64_t start = get_sys_cnt_aicpu();
+    uint32_t spins = 0;
+    uint32_t head = 0;
+    uint32_t tail = 0;
+    // Bounded backpressure wait: spin until the free_queue is non-empty or the
+    // ~20 us budget (kScopeStatsQueueBackpressureWaitCycles) has elapsed.
+    do {
+        head = s_scope_stats_state->free_queue.head;
+        tail = s_scope_stats_state->free_queue.tail;
+        if (head != tail) {
+            rmb();  // acquire: order the tail read before the buffer_ptrs read below
+            break;
+        }
+        if ((++spins & kScopeStatsQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kScopeStatsQueueBackpressureWaitCycles) {
+            return false;
+        }
+    } while (true);
+
     uint64_t buf_ptr = s_scope_stats_state->free_queue.buffer_ptrs[head % PLATFORM_SCOPE_STATS_SLOT_COUNT];
-    rmb();
+    rmb();  // keep the slot read ordered before the head store that releases the slot
     s_scope_stats_state->free_queue.head = head + 1;
+    // The slot is consumed either way; a null pointer is reported as "no buffer".
+    if (buf_ptr == 0) {
+        return false;
+    }
     s_scope_stats_state->current_buf_ptr = buf_ptr;
     reinterpret_cast<ScopeStatsBuffer *>(buf_ptr)->count = 0;
     wmb();
     return true;
 }
 
-// Commit the full current buffer to the ready_queue and pop a replacement. On
-// no free buffer / ready_queue full, drop the buffer's records and reuse it.
+// Commit the full current buffer to the ready_queue before popping a
+// replacement. If no replacement is available, later records drop until host
+// replenishes free_queue.
 void switch_buffer() {
     if (s_scope_stats_state == nullptr) {
         return;
@@ -141,20 +171,6 @@ void switch_buffer() {
         return;
     }
 
-    rmb();
-    uint32_t head = s_scope_stats_state->free_queue.head;
-    uint32_t tail = s_scope_stats_state->free_queue.tail;
-    if (head == tail) {
-        // Host can't recycle buffers fast enough: drop silently (count only, no
-        // per-drop log). Logging here would make a slow host pay device-side
-        // hot-path cost — the device must not be coupled to host throughput. The
-        // total is surfaced via dropped_record_count in the finalize summary.
-        s_scope_stats_state->dropped_record_count += full_buf->count;
-        full_buf->count = 0;
-        wmb();
-        return;
-    }
-
     uint32_t seq = s_scope_stats_state->current_buf_seq;
     int rc = enqueue_ready_buffer(s_scope_stats_state->current_buf_ptr, seq);
     if (rc != 0) {
@@ -164,13 +180,10 @@ void switch_buffer() {
         return;
     }
 
-    uint64_t new_buf_ptr = s_scope_stats_state->free_queue.buffer_ptrs[head % PLATFORM_SCOPE_STATS_SLOT_COUNT];
-    rmb();
-    s_scope_stats_state->free_queue.head = head + 1;
-    s_scope_stats_state->current_buf_ptr = new_buf_ptr;
+    s_scope_stats_state->current_buf_ptr = 0;
     s_scope_stats_state->current_buf_seq = seq + 1;
-    reinterpret_cast<ScopeStatsBuffer *>(new_buf_ptr)->count = 0;
     wmb();
+    (void)pop_free_buffer();
 }
 
 // Unroll a wrapping heap byte offset into a monotonic value using the
diff --git a/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp b/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp
index e62888b92..e2623057b 100644
--- a/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp
+++ b/src/common/platform/shared/aicpu/tensor_dump_aicpu.cpp
@@ -23,6 +23,7 @@
 #include <cstdlib>
 #include <cstring>
 
+#include "aicpu/device_time.h"
 #include "common/memory_barrier.h"
 #include "common/platform_config.h"
 #include "common/unified_log.h"
@@ -49,15 +50,19 @@ static inline void account_dropped_records(DumpBufferState *state, uint32_t drop
     state->dropped_record_count = (next < prev) ? UINT32_MAX : next;
 }
 
-extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dump_base = dump_data_base; }
-
-extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; }
+static constexpr uint64_t kDumpQueueBackpressureWaitCycles = PLATFORM_PROF_SYS_CNT_FREQ / 50000;  // 20 us
+static constexpr uint32_t kDumpQueueBackpressurePollMask = 1023;
 
 static bool g_enable_dump_args = false;
 // Dump level latched from the header in dump_args_init(). The selective
 // (PARTIAL) and json-only (FULL_JSON_ONLY) modes are derived from it rather
 // than tracked as separate flags — mirrors g_l2_swimlane_level.
 static DumpTensorLevel g_dump_args_level = DumpTensorLevel::OFF;
+
+extern "C" void set_platform_dump_base(uint64_t dump_data_base) { g_platform_dump_base = dump_data_base; }
+
+extern "C" uint64_t get_platform_dump_base() { return g_platform_dump_base; }
+
 struct DumpTaskMaskEntry {
     uint64_t task_id;
     TensorDumpArgMask mask;
@@ -342,34 +347,79 @@ bool try_log_dump_args_layout_mismatch() {
  * Enqueue a full dump metadata buffer to the thread's ready queue.
  */
 static int enqueue_dump_ready_buffer(int thread_idx, uint64_t buffer_ptr, uint32_t buffer_seq) {
+    if (s_dump_header == nullptr || thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
+        return -1;  // header not set or thread index out of range
+    }
     uint32_t capacity = PLATFORM_DUMP_READYQUEUE_SIZE;
-    uint32_t current_tail = s_dump_header->queue_tails[thread_idx];
-    uint32_t current_head = s_dump_header->queue_heads[thread_idx];
+    uint32_t current_tail = 0;
+    uint32_t current_head = 0;
+    const uint64_t start = get_sys_cnt_aicpu();  // reference point for the bounded wait
+    uint32_t spins = 0;
+    // Spin until the ready queue has a free slot or the wait budget runs out.
+    do {
+        current_tail = s_dump_header->queue_tails[thread_idx];
+        current_head = s_dump_header->queue_heads[thread_idx];
+        uint32_t next_tail = (current_tail + 1) % capacity;
+        if (next_tail != current_head) {
+            break;  // the slot at current_tail can be written
+        }
+        if ((++spins & kDumpQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kDumpQueueBackpressureWaitCycles) {
+            return -1;  // queue still full once the wait budget elapsed
+        }
+    } while (true);
 
     uint32_t next_tail = (current_tail + 1) % capacity;
-    if (next_tail == current_head) {
-        return -1;  // Queue full
-    }
-
     s_dump_header->queues[thread_idx][current_tail].thread_index = static_cast<uint32_t>(thread_idx);
     s_dump_header->queues[thread_idx][current_tail].buffer_ptr = buffer_ptr;
     s_dump_header->queues[thread_idx][current_tail].buffer_seq = buffer_seq;
-    wmb();
+    wmb();  // publish: entry fields visible before the tail advance
     s_dump_header->queue_tails[thread_idx] = next_tail;
-    wmb();
 
     return 0;
 }
 
-/**
- * Maximum spin-wait iterations when free_queue or ready_queue is exhausted.
- * Gives host mgmt_loop time to replenish before falling back to buffer overwrite.
- */
-static constexpr uint32_t DUMP_SPIN_WAIT_LIMIT = 1000000;
+static DumpMetaBuffer *try_pop_dump_meta_buffer(int thread_idx, DumpBufferState *state, uint32_t next_seq) {
+    if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS || state == nullptr) {
+        return nullptr;
+    }
+    const uint64_t start = get_sys_cnt_aicpu();  // reference point for the bounded wait
+    uint32_t spins = 0;
+    uint32_t head = 0;
+    uint32_t tail = 0;
+    // Spin until free_queue is non-empty or the wait budget runs out.
+    do {
+        head = state->free_queue.head;
+        tail = state->free_queue.tail;
+        if (head != tail) {
+            rmb();  // acquire: order the tail read before the buffer_ptrs read below
+            break;
+        }
+        if ((++spins & kDumpQueueBackpressurePollMask) == 0 &&
+            get_sys_cnt_aicpu() - start >= kDumpQueueBackpressureWaitCycles) {
+            return nullptr;  // free_queue still empty once the wait budget elapsed
+        }
+    } while (true);
+
+    uint64_t new_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT];
+    state->free_queue.head = head + 1;  // the slot is consumed even when it holds a null pointer
+    if (new_ptr == 0) {
+        return nullptr;
+    }
+    // Install the popped buffer as this thread's current metadata buffer.
+    DumpMetaBuffer *new_buf = reinterpret_cast<DumpMetaBuffer *>(new_ptr);
+    new_buf->count = 0;  // the new buffer starts with no records
+    s_current_dump_buf[thread_idx] = new_buf;
+    state->current_buf_ptr = new_ptr;
+    state->current_buf_seq = next_seq;
+    wmb();  // barrier after the count reset and the current_buf_* stores
+    return new_buf;
+}
 
 /**
- * Switch metadata buffer: enqueue the full buffer, pop a new one.
- * Spin-waits briefly for host to replenish before falling back to overwrite.
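+ * Switch metadata buffer: publish the full buffer to the ready queue first, then pop a replacement.
+ * If the ready queue stays full, the current buffer's records are dropped and the buffer is reused.
+ * If no replacement is free, later records are dropped until the host replenishes free_queue.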
  */
 static int switch_dump_meta_buffer(int thread_idx) {
     if (thread_idx < 0 || thread_idx >= PLATFORM_MAX_AICPU_THREADS) {
@@ -381,77 +431,39 @@ static int switch_dump_meta_buffer(int thread_idx) {
         return -1;
     }
 
-    // Spin-wait for a free buffer, giving host mgmt_loop time to replenish
-    rmb();
-    uint32_t head = state->free_queue.head;
-    uint32_t tail = state->free_queue.tail;
-    if (head == tail) {
-        for (uint32_t spin = 0; spin < DUMP_SPIN_WAIT_LIMIT; spin++) {
-            rmb();
-            head = state->free_queue.head;
-            tail = state->free_queue.tail;
-            if (head != tail) {
-                break;
-            }
-        }
-    }
-    if (head == tail) {
-        // Still empty after spin — overwrite current buffer
-        account_dropped_records(state, cur->count);
-        cur->count = 0;
-        wmb();
-        if (!s_logged_no_free_meta_buffer[thread_idx]) {
-            s_logged_no_free_meta_buffer[thread_idx] = true;
-            LOG_WARN(
-                "Args dump ran out of free metadata buffers on thread %d after spin-wait, "
-                "overwriting current buffer. Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.",
-                thread_idx
-            );
-        }
-        return 0;
-    }
-
-    // Enqueue the full buffer (spin-wait if ready queue is full)
     uint64_t buf_addr = reinterpret_cast<uint64_t>(cur);
     uint32_t seq = state->current_buf_seq;
     int rc = enqueue_dump_ready_buffer(thread_idx, buf_addr, seq);
     if (rc != 0) {
-        for (uint32_t spin = 0; spin < DUMP_SPIN_WAIT_LIMIT; spin++) {
-            rmb();
-            rc = enqueue_dump_ready_buffer(thread_idx, buf_addr, seq);
-            if (rc == 0) {
-                break;
-            }
-        }
-    }
-    if (rc != 0) {
-        // Still full after spin — overwrite current buffer
         account_dropped_records(state, cur->count);
         cur->count = 0;
         wmb();
         if (!s_logged_ready_queue_full[thread_idx]) {
             s_logged_ready_queue_full[thread_idx] = true;
             LOG_WARN(
-                "Args dump ready queue full on thread %d after spin-wait, "
-                "overwriting current buffer. Increase PLATFORM_DUMP_READYQUEUE_SIZE.",
+                "Args dump ready queue full on thread %d after bounded wait, "
+                "dropping current metadata buffer. Increase PLATFORM_DUMP_READYQUEUE_SIZE.",
                 thread_idx
             );
         }
         return 0;
     }
 
-    // Pop next buffer from free_queue
-    uint64_t new_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT];
-    rmb();
-    state->free_queue.head = head + 1;
-
-    DumpMetaBuffer *new_buf = reinterpret_cast<DumpMetaBuffer *>(new_ptr);
-    new_buf->count = 0;
-    s_current_dump_buf[thread_idx] = new_buf;
-    state->current_buf_ptr = new_ptr;
-    state->current_buf_seq = seq + 1;
+    uint32_t next_seq = seq + 1;
+    s_current_dump_buf[thread_idx] = nullptr;
+    state->current_buf_ptr = 0;
+    state->current_buf_seq = next_seq;
     wmb();
 
+    if (try_pop_dump_meta_buffer(thread_idx, state, next_seq) == nullptr && !s_logged_no_free_meta_buffer[thread_idx]) {
+        s_logged_no_free_meta_buffer[thread_idx] = true;
+        LOG_WARN(
+            "Args dump published a full metadata buffer on thread %d but no replacement was available; "
+            "records will drop until recovery. Increase PLATFORM_DUMP_BUFFERS_PER_THREAD.",
+            thread_idx
+        );
+    }
+
     s_buffers_switched[thread_idx]++;
 
     return 0;
@@ -588,17 +600,8 @@ void dump_args_init(int num_dump_threads) {
         uint32_t head = state->free_queue.head;
         uint32_t tail = state->free_queue.tail;
         if (head != tail) {
-            uint64_t buf_ptr = state->free_queue.buffer_ptrs[head % PLATFORM_DUMP_SLOT_COUNT];
-            rmb();
-            state->free_queue.head = head + 1;
-            wmb();
-
-            DumpMetaBuffer *buf = reinterpret_cast<DumpMetaBuffer *>(buf_ptr);
-            buf->count = 0;
-            s_current_dump_buf[t] = buf;
-            state->current_buf_ptr = buf_ptr;
-            state->current_buf_seq = 0;
-            wmb();
+            (void)try_pop_dump_meta_buffer(t, state, 0);
+            uint64_t buf_ptr = state->current_buf_ptr;
             LOG_DEBUG("Thread %d: popped initial dump buffer (addr=0x%lx)", t, buf_ptr);
         } else {
             LOG_ERROR("Thread %d: dump free_queue is empty during init!", t);
@@ -625,7 +628,11 @@ int dump_arg_record(int thread_idx, const TensorDumpInfo &info) {
     DumpBufferState *state = s_dump_states[thread_idx];
     DumpMetaBuffer *buf = s_current_dump_buf[thread_idx];
     if (buf == nullptr) {
-        return -1;
+        buf = try_pop_dump_meta_buffer(thread_idx, state, state != nullptr ? state->current_buf_seq : 0);
+        if (buf == nullptr) {
+            account_dropped_records(state, 1);
+            return -1;
+        }
     }
 
     // Switch metadata buffer if full
@@ -635,7 +642,11 @@ int dump_arg_record(int thread_idx, const TensorDumpInfo &info) {
         }
         buf = s_current_dump_buf[thread_idx];
         if (buf == nullptr) {
-            return -1;
+            buf = try_pop_dump_meta_buffer(thread_idx, state, state != nullptr ? state->current_buf_seq : 0);
+            if (buf == nullptr) {
+                account_dropped_records(state, 1);
+                return -1;
+            }
         }
     }
 
diff --git a/src/common/platform/shared/host/tensor_dump_collector.cpp b/src/common/platform/shared/host/tensor_dump_collector.cpp
index 5af7c9cfa..fa186dcf2 100644
--- a/src/common/platform/shared/host/tensor_dump_collector.cpp
+++ b/src/common/platform/shared/host/tensor_dump_collector.cpp
@@ -306,6 +306,7 @@ void TensorDumpCollector::process_dump_buffer(const DumpReadyBufferInfo &info) {
 }
 
 void TensorDumpCollector::on_buffer_collected(const DumpReadyBufferInfo &info) {
+    std::scoped_lock<std::mutex> lock(collector_state_mutex_);
     start_writer_thread_once();
     process_dump_buffer(info);
     buffers_collected_++;
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index d1ffb4d2a..9b92322ed 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -368,6 +368,10 @@ set_tests_properties(test_chip_callable_upload_immutable PROPERTIES LABELS "no_h
 add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp)
 add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp)
 add_common_utils_test(test_device_arena common/test_device_arena.cpp)
+add_common_utils_test(test_buffer_pool_manager common/test_buffer_pool_manager.cpp)
+target_include_directories(test_buffer_pool_manager PRIVATE
+    ${CMAKE_SOURCE_DIR}/../../../src/common/log/include
+)
 add_common_utils_test(test_l3_l2_orch_comm common/test_l3_l2_orch_comm.cpp)
 add_executable(test_l3_l2_orch_endpoint
     common/test_l3_l2_orch_endpoint.cpp
diff --git a/tests/ut/cpp/common/test_buffer_pool_manager.cpp b/tests/ut/cpp/common/test_buffer_pool_manager.cpp
new file mode 100644
index 000000000..210706e53
--- /dev/null
+++ b/tests/ut/cpp/common/test_buffer_pool_manager.cpp
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include "host/buffer_pool_manager.h"
+
+#include <gtest/gtest.h>
+
+#include <cstdint>
+#include <set>
+#include <vector>
+
+namespace {
+
+struct TestHeader {};      // empty stand-in for TestModule::DataHeader
+struct TestReadyEntry {};  // empty stand-in for TestModule::ReadyEntry
+
+struct TestReadyBufferInfo {
+    void *dev_buffer_ptr{nullptr};
+    uint32_t shard_marker{0};  // tag the tests check to tell popped entries apart
+};
+
+struct TestModule {
+    using DataHeader = TestHeader;
+    using ReadyEntry = TestReadyEntry;
+    using ReadyBufferInfo = TestReadyBufferInfo;
+
+    static constexpr int kBufferKinds = 2;           // the tests use kinds 0 and 1
+    static constexpr int kCollectorThreadCount = 4;  // the first test asserts kCollectorShardCount == 4
+};
+
+void *ptr(uintptr_t value) { return reinterpret_cast<void *>(value); }  // fake buffer address for the tests
+
+}  // namespace
+
+TEST(BufferPoolManagerShardingTest, ReadyShardsAreIndependent) {
+    using Manager = profiling_common::BufferPoolManager<TestModule>;
+    static_assert(Manager::kCollectorShardCount == 4);
+
+    Manager manager;
+    manager.push_to_ready(TestReadyBufferInfo{ptr(0x1000), 0}, 0);
+    manager.push_to_ready(TestReadyBufferInfo{ptr(0x2000), 1}, 1);
+    manager.push_to_ready(TestReadyBufferInfo{ptr(0x5000), 5}, 5);  // normalizes to shard 1
+
+    TestReadyBufferInfo out;
+    EXPECT_FALSE(manager.try_pop_ready(out, 2));  // nothing was pushed to shard 2
+
+    ASSERT_TRUE(manager.try_pop_ready(out, 0));
+    EXPECT_EQ(out.dev_buffer_ptr, ptr(0x1000));
+    EXPECT_EQ(out.shard_marker, 0u);
+    EXPECT_FALSE(manager.try_pop_ready(out, 0));  // shard 0 held exactly one entry
+
+    ASSERT_TRUE(manager.try_pop_ready(out, 1));  // shard 1 is expected to yield its entries in push order
+    EXPECT_EQ(out.dev_buffer_ptr, ptr(0x2000));
+    EXPECT_EQ(out.shard_marker, 1u);
+    ASSERT_TRUE(manager.try_pop_ready(out, 1));
+    EXPECT_EQ(out.dev_buffer_ptr, ptr(0x5000));
+    EXPECT_EQ(out.shard_marker, 5u);  // the marker is returned as pushed, not rewritten to the shard index
+    EXPECT_FALSE(manager.try_pop_ready(out, 1));
+}
+
+TEST(BufferPoolManagerShardingTest, DoneShardsRecycleByKind) {
+    profiling_common::BufferPoolManager<TestModule> manager;
+
+    manager.notify_copy_done(ptr(0x1000), /*kind=*/0, /*shard_index=*/0);
+    manager.notify_copy_done(ptr(0x2000), /*kind=*/1, /*shard_index=*/1);
+    manager.notify_copy_done(ptr(0x5000), /*kind=*/1, /*shard_index=*/5);  // normalizes to shard 1
+
+    EXPECT_EQ(manager.drain_done_into_recycled(), 3u);  // all three notifications are drained in one call
+    EXPECT_EQ(manager.recycled_count(0), 1u);
+    EXPECT_EQ(manager.recycled_count(1), 2u);
+
+    EXPECT_EQ(manager.pop_recycled(0), ptr(0x1000));
+
+    std::set<void *> kind_one;  // compared as a set: the order of the two kind-1 buffers is not asserted
+    kind_one.insert(manager.pop_recycled(1, 1));
+    kind_one.insert(manager.pop_recycled(1, 1));
+    EXPECT_EQ(kind_one, (std::set<void *>{ptr(0x2000), ptr(0x5000)}));
+}
+
+TEST(BufferPoolManagerShardingTest, ReleaseOwnedBuffersVisitsAllShards) {
+    profiling_common::BufferPoolManager<TestModule> manager;
+    manager.push_recycled(/*kind=*/0, ptr(0x1000));
+    manager.push_to_ready(TestReadyBufferInfo{ptr(0x2000), 2}, /*shard_index=*/2);
+    manager.notify_copy_done(ptr(0x3000), /*kind=*/1, /*shard_index=*/3);
+    // Collect every pointer handed to the release callback.
+    std::vector<void *> released;
+    manager.release_owned_buffers([&](void *p) {
+        released.push_back(p);
+    });
+
+    EXPECT_EQ(
+        std::set<void *>(released.begin(), released.end()), (std::set<void *>{ptr(0x1000), ptr(0x2000), ptr(0x3000)})
+    );
+    EXPECT_TRUE(manager.recycled_empty());
+    // After the release, the ready shard and the done path must be empty as well.
+    TestReadyBufferInfo out;
+    EXPECT_FALSE(manager.try_pop_ready(out, 2));
+    EXPECT_EQ(manager.drain_done_into_recycled(), 0u);
+}