From 5962af6d248071f58658ba60654b1826497b2b6f Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Wed, 24 Jun 2026 15:45:02 +0800
Subject: [PATCH 1/7] Add: L3-L2 message queue design

- Define the staged base queue transport design and PR1/PR2 split.
- Add the base implementation plan for the queue stack.
---
 docs/l3-l2-message-queue-base-impl.md | 798 ++++++++++++++++++++++
 docs/l3-l2-message-queue-design.md    | 922 ++++++++++++++++++++++++++
 2 files changed, 1720 insertions(+)
 create mode 100644 docs/l3-l2-message-queue-base-impl.md
 create mode 100644 docs/l3-l2-message-queue-design.md

diff --git a/docs/l3-l2-message-queue-base-impl.md b/docs/l3-l2-message-queue-base-impl.md
new file mode 100644
index 000000000..d63f446cf
--- /dev/null
+++ b/docs/l3-l2-message-queue-base-impl.md
@@ -0,0 +1,798 @@
+# L3-L2 Message Queue Base Queue Two-PR Implementation Plan
+
+## 1. Scope And Platform Support
+
+This document covers a two-PR delivery of the base bidirectional SPSC message
+queue transport described in `l3-l2-message-queue-design.md`.
+
+PR1 implements the core queue transport and primitive-compatible fast-path API:
+
+- one input queue from L3 to L2;
+- one output queue from L2 to L3;
+- descriptor rings and payload arenas in one primitive L3-L2 region;
+- `DATA`, `ERROR`, and input-only `STOP` descriptors;
+- explicit output reserve/publish on L2;
+- explicit input peek/release on L2;
+- L3 enqueue, output ownership/dequeue, stop, and cleanup APIs;
+- non-zero L3 buffers limited to primitive-compatible registered
+  `orch.alloc(...)` host Tensors;
+- two single-writer abort flags for timeout disambiguation;
+- unit tests for ABI, layout, counters, zero-byte descriptors, queue
+  mechanics, and fast-path APIs.
+
+PR2 implements the usability and end-to-end layer:
+
+- lazy internal staging for ordinary L3 host buffers;
+- ordinary host-buffer enqueue and output read convenience paths;
+- one base queue example with a small message-local AICore task;
+- scene tests on supported platforms;
+- final user-facing documentation cleanup.
+
+Neither PR includes:
+
+- the L2 input window helper;
+- multiple active DATA input handles on L2;
+- out-of-order input release;
+- fragmented payload arenas;
+- multiple outstanding producer reservations per direction;
+- output-side STOP acknowledgement messages.
+
+Supported across the two PRs:
+
+- `a2a3` onboard;
+- `a2a3sim`;
+- `a5sim`.
+
+Not supported:
+
+- `a5` onboard.
+
+The exact Python and C++ class names may change during implementation, but the
+ABI, state transitions, and observable behavior in this document are base queue
+requirements. Scope tags below identify whether a requirement lands in PR1 or
+PR2.
+
+## 2. Expected User Flow
+
+The final base queue should be usable without exposing descriptor offsets,
+counter offsets, or payload arena cursors to application code. PR1 supports
+the same operation shape with primitive-compatible registered host Tensors for
+non-zero L3 buffers. PR2 relaxes that buffer requirement with lazy staging.
+
+Expected L3 shape:
+
+```python
+queue = orch.create_l3_l2_queue(
+    worker_id=0,
+    depth=8,
+    input_arena_bytes=1 << 20,
+    output_arena_bytes=1 << 20,
+)
+
+for payload in input_payloads:
+    queue.input.enqueue(payload.buffer, nbytes=payload.nbytes, timeout=timeout_s)
+
+queue.input.enqueue(None, nbytes=0, timeout=timeout_s)  # zero-byte DATA
+queue.request_stop(timeout=timeout_s)
+
+while not application_done:
+    message = queue.output.peek(timeout=timeout_s)
+    output_buffer = choose_buffer(message.payload_nbytes)
+    queue.output.read_into(message, output_buffer)
+    queue.output.release(message)
+    handle_application_output(message)
+
+queue.free()
+```
+
+If the application already owns a large enough output buffer, it may use the
+convenience path instead:
+
+```python
+message = queue.output.dequeue_into(max_sized_output_buffer, timeout=timeout_s)
+```
+
+Expected base L2 shape:
+
+```cpp
+L3L2QueueEndpoint queue(desc_scalars, queue_args);
+for (;;) {
+    auto in = queue.input().peek(timeout);
+    if (in.opcode == L3L2QueueOpcode::STOP) {
+        queue.input().release(in);
+        break;
+    }
+
+    auto out = queue.output().reserve(output_nbytes, timeout);
+    launch_message_local_aicore_work(in.payload_view, out.gm_addr);
+    wait_until_output_bytes_are_visible();
+    queue.output().publish(out, L3L2QueueOpcode::DATA);
+    queue.input().release(in);
+}
+```
+
+Application payload schema, request IDs, final-output markers, and output
+cardinality are application responsibilities. PR1 transport order does not
+imply request correlation beyond FIFO order within each queue direction.
+
+## 3. API Surface
+
+PR1 must expose the semantic operations below. PR2 keeps the same operation
+surface and only expands accepted L3 buffer types through lazy staging. Exact
+class and method names may change during implementation, but the
+implementation must not require users to manipulate descriptor slots, counter
+offsets, payload arena offsets, or head/tail reconstruction state directly.
+
+Required L3 Python surface:
+
+```text
+orch.create_l3_l2_queue(
+    worker_id,
+    depth,
+    input_arena_bytes,
+    output_arena_bytes,
+) -> queue
+
+queue.input.enqueue(buffer_or_none, nbytes, timeout)
+queue.input.try_enqueue(buffer_or_none, nbytes)
+
+queue.output.dequeue_into(buffer, timeout) -> message
+queue.output.try_dequeue_into(buffer) -> message or no-progress
+
+queue.request_stop(timeout)
+queue.try_request_stop()
+queue.free()
+```
+
+L3 message results must expose at least:
+
+```text
+seq
+opcode
+payload_nbytes
+```
+
+Convenience dequeue APIs may copy and release in one operation. PR1 must also
+expose explicit output ownership APIs with these semantics:
+
+```text
+queue.output.peek(timeout) -> message_handle
+queue.output.try_peek() -> message_handle or no-progress
+queue.output.read_into(message_handle, buffer)
+queue.output.release(message_handle)
+```
+
+Required L2 C++ surface:
+
+```text
+L3L2QueueEndpoint queue(desc_scalars, queue_args)
+
+queue.input().peek(timeout) -> input_handle
+queue.input().try_peek() -> input_handle or no-progress
+queue.input().release(input_handle)
+
+queue.output().reserve(nbytes, timeout) -> output_reservation
+queue.output().try_reserve(nbytes) -> output_reservation or no-progress
+queue.output().publish(output_reservation, opcode)
+```
+
+L2 input handles must expose at least:
+
+```text
+seq
+opcode
+payload_nbytes
+payload_view or empty payload marker
+```
+
+L2 output reservations must expose at least:
+
+```text
+seq or publish sequence context
+payload_offset
+payload_nbytes
+gm_addr for non-zero payload writes
+```
+
+The API must preserve these user-visible semantics:
+
+- finite timeouts are required for blocking operations;
+- `try_*` operations return no-progress without mutating shared state when the
+  queue cannot make progress;
+- ordinary timeout does not poison the queue unless peer abort is observed;
+- zero-byte messages may pass `buffer_or_none == None`;
+- PR1 non-zero L3 buffers must be primitive-compatible registered
+  `orch.alloc(...)` host Tensors;
+- PR2 L3 convenience APIs accept ordinary contiguous host byte spans and lazily
+  stage them when they are not primitive-compatible registered tensors;
+- primitive-compatible `orch.alloc(...)` host Tensors remain the fast path in
+  both PRs;
+- output ownership APIs are the recommended path for variable-size outputs,
+  while `dequeue_into` remains valid when the caller supplies a large enough
+  target buffer;
+- after successful `request_stop`, L3 input enqueue rejects later input
+  messages locally without poisoning;
+- `ERROR` is an application-level message, not a transport exception;
+- cleanup/free remains valid after local poison or remote-aborted terminal
+  state.
+
+## 4. L3 Host Buffer Contract And Lazy Staging
+
+The primitive L3 payload APIs require a registered, child-visible
+`orch.alloc(...)` host Tensor.
+
+PR1 buffer contract:
+
+- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte
+  descriptor path;
+- non-zero L3 input enqueue buffers must be primitive-compatible registered
+  `orch.alloc(...)` host Tensors;
+- non-zero L3 output read targets must be primitive-compatible registered
+  `orch.alloc(...)` host Tensors;
+- ordinary `bytes`, `bytearray`, `memoryview`, private tensors, and other
+  non-registered host buffers are rejected before shared-state mutation;
+- rejecting a non-registered buffer is a pre-mutation validation failure and
+  does not poison or set an abort flag.
+
+PR2 buffer contract:
+
+- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte
+  descriptor path;
+- if the input buffer is a primitive-compatible registered `orch.alloc(...)`
+  host Tensor, enqueue uses it directly as the zero-extra-host-copy fast path;
+- otherwise enqueue accepts an ordinary readable contiguous host byte span,
+  such as `bytes`, `bytearray`, `memoryview`, or a contiguous CPU tensor-like
+  object the implementation can view as bytes;
+- non-fast-path enqueue copies the user bytes into an internal registered
+  staging Tensor, then issues primitive `payload_write` from that staging
+  Tensor.
+
+For L3 output read:
+
+- if the output target is a primitive-compatible registered `orch.alloc(...)`
+  host Tensor, `read_into` or `dequeue_into` uses it directly as the fast path;
+- otherwise the target must be an ordinary writable contiguous host byte span;
+- non-fast-path read first issues primitive `payload_read` into an internal
+  registered staging Tensor, then copies from staging into the user target.
+
+The staging Tensor is allocated lazily and owned by the queue handle. It may
+grow when a later operation needs a larger staging span. The implementation
+must not expose staging offsets or staging Tensor ownership to users.
+
+If a payload is too large for the current staging Tensor, the queue should grow
+or allocate staging before issuing any primitive command. Failure to allocate
+staging is a pre-mutation validation/allocation failure: it rejects the
+operation, does not publish descriptors, does not release descriptors, does not
+poison, and does not set an abort flag.
+
+Staging may add one host-to-host copy. Users that need the lowest host overhead
+can pass primitive-compatible registered `orch.alloc(...)` host Tensors.
+
+## 5. PR1 ABI Surface
+
+The stable PR1 ABI is the L3/L2 shared contract. It is separate from exact
+Python or C++ method names.
+
+TaskArgs carry the primitive region descriptor followed by queue parameters:
+
+```text
+primitive desc[0..5]
+queue_magic_version
+depth
+input_arena_bytes
+output_arena_bytes
+```
+
+The queue ABI version covers:
+
+- descriptor slot size and field order;
+- opcode numeric values;
+- deterministic payload layout derivation;
+- counter offsets and meanings;
+- head/tail low32 reconstruction rules;
+- abort flag semantics;
+- zero-byte descriptor canonical form;
+- STOP and ERROR transport semantics.
+
+Descriptor slot ABI:
+
+```cpp
+struct L3L2QueueDescSlot {
+    uint64_t seq;
+    uint64_t opcode;
+    uint64_t payload_offset;
+    uint64_t payload_nbytes;
+};
+static_assert(sizeof(L3L2QueueDescSlot) == 32);
+```
+
+Opcode ABI:
+
+```text
+0      invalid / never published
+DATA   = 1
+STOP   = 2
+ERROR  = 3
+```
+
+Counter ABI:
+
+```text
+offset 0:   input_desc_tail       writer=L3
+offset 64:  input_desc_head       writer=L2
+offset 128: output_desc_tail      writer=L2
+offset 192: output_desc_head      writer=L3
+offset 256: l3_abort_flag         writer=L3
+offset 320: l2_abort_flag         writer=L2
+```
+
+Layout validation ABI:
+
+- `depth` must be a power of two and `depth <= 2^30`;
+- queue capacity is `depth`, not `depth - 1`;
+- descriptor slot size is 32 bytes;
+- descriptor rings are 8-byte aligned;
+- payload arena bases are 64-byte aligned;
+- arena byte sizes are positive 64-byte multiples;
+- `counter_bytes >= 384`.
+
+The following are not PR1 ABI:
+
+- exact Python class names;
+- exact C++ helper class names;
+- internal helper function names;
+- polling backoff strategy;
+- application payload schema;
+- example payload format.
+
+## 6. ABI And Layout
+
+The descriptor slot ABI is the existing 32-byte format:
+
+```cpp
+struct L3L2QueueDescSlot {
+    uint64_t seq;
+    uint64_t opcode;
+    uint64_t payload_offset;
+    uint64_t payload_nbytes;
+};
+static_assert(sizeof(L3L2QueueDescSlot) == 32);
+```
+
+`payload_offset` is relative to the primitive payload base. For non-zero
+message payloads, it points into the direction-local payload arena. It does not
+point to the descriptor slot itself.
+
+The layout helper must derive all payload and counter offsets. Python may
+mirror the calculation, but tests must keep the Python calculation and the C/C++
+helper in lockstep.
+
+PR1 counter layout:
+
+```text
+offset 0:   input_desc_tail       writer=L3
+offset 64:  input_desc_head       writer=L2
+offset 128: output_desc_tail      writer=L2
+offset 192: output_desc_head      writer=L3
+offset 256: l3_abort_flag         writer=L3
+offset 320: l2_abort_flag         writer=L2
+```
+
+`counter_bytes` must be at least 384. The abort flags are low-frequency
+diagnostic signals, but they still use the same 64-byte stride as the
+descriptor counters to preserve single-writer cache-line ownership.
+
+All six counters are initialized to zero before submitting the persistent L2
+run. Descriptor slots and payload bytes do not need to be zeroed for
+correctness.
+
+## 7. Primitive Command Mapping
+
+The queue is a wrapper over the existing L3-L2 primitive commands. PR1 must not
+add a new primitive command or bypass the primitive region lifetime model.
+
+Descriptor rings live in the primitive payload region. Descriptor slot access
+therefore uses the primitive payload APIs:
+
+- L3 writes input descriptor slots with `L3L2OrchRegion.payload_write`;
+- L3 reads output descriptor slots with `L3L2OrchRegion.payload_read`;
+- L2 reads input descriptor slots with `L3L2OrchEndpoint::payload_read`;
+- L2 writes output descriptor slots with `L3L2OrchEndpoint::payload_write`.
+
+Message payload arena access also uses the primitive payload APIs when the
+message payload is non-zero:
+
+- L3 input enqueue writes non-zero input payload bytes with
+  `L3L2OrchRegion.payload_write`;
+- L3 output dequeue reads non-zero output payload bytes with
+  `L3L2OrchRegion.payload_read`;
+- L2 input consume obtains a non-zero input payload GM view with
+  `L3L2OrchEndpoint::payload_read`;
+- L2 output reserve returns a GM span in the output arena; L2 application code
+  or AICore work writes that span before `publish`;
+- PR1 does not require a separate L2 message-payload copy API. If an
+  implementation uses `L3L2OrchEndpoint::payload_write` for a small L2-produced
+  output payload, it is only a helper for filling the reserved output arena
+  span before `publish`, not a separate transport path.
+
+Queue counters use the primitive signal APIs:
+
+- publishing descriptor tail, releasing descriptor head, and setting an abort
+  flag use `SIGNAL_NOTIFY` / `signal_notify`;
+- head/tail polling uses `SIGNAL_TEST` / `signal_test` snapshots;
+- timeout disambiguation samples the peer abort flag with `SIGNAL_TEST`, for
+  example `GE 1` against the peer flag address.
+
+Only a matched `SIGNAL_TEST` snapshot may drive head/tail reconstruction,
+descriptor replay, payload release, or payload reuse. An unmatched head/tail
+test does not establish acquire ordering, and its observed value must not
+update local queue state. For abort flags, a matched `GE 1` test reports remote
+abort; an unmatched test leaves the timeout as ordinary no-progress.
+
+PR1 queue correctness must not depend on primitive `SIGNAL_WAIT`. Blocking
+queue operations are wrapper-level bounded polling loops over `SIGNAL_TEST`
+plus local queue-state checks.
+
+## 8. Zero-Byte Message Rules
+
+Zero-byte `DATA`, `ERROR`, and `STOP` descriptors are valid queue messages.
+They still consume one descriptor slot and follow the normal descriptor
+publication sequence.
+
+For any descriptor with `payload_nbytes == 0`:
+
+- `payload_offset` must be `0`;
+- `payload_offset == 0` is a canonical sentinel, not a payload address;
+- the message consumes no payload arena bytes;
+- producer payload cursors do not advance;
+- consumer payload cursors do not advance;
+- payload wrap-padding replay is skipped for that descriptor;
+- no message-payload arena copy/read/view is issued.
+
+Descriptor-ring access is separate from message-payload arena access.
+Descriptor slots live in the primitive payload region, so publishing or reading
+a zero-byte message may still use primitive payload access for descriptor-ring
+metadata. The rule above skips only the message payload arena path.
+
+Consumer validation order must make the zero-byte path explicit:
+
+```text
+1. validate descriptor sequence;
+2. validate opcode and direction legality;
+3. if payload_nbytes == 0:
+     require payload_offset == 0;
+     skip direction-local arena range checks and payload replay;
+   else:
+     require payload_offset to be inside the direction-local arena;
+     validate contiguous span and payload cursor replay.
+```
+
+This ordering matters because the canonical `payload_offset == 0` of a
+zero-byte output descriptor usually lies outside the output arena. A consumer
+that runs arena range validation first would reject a valid descriptor.
+
+If a published descriptor has `payload_nbytes == 0` and `payload_offset != 0`,
+the descriptor is invalid published state. The observing endpoint transitions
+to `POISONED(local-infrastructure)` and sets its own abort flag.
+
+## 9. Queue State And Abort Flags
+
+PR1 uses two single-writer abort flags:
+
+```text
+l3_abort_flag: writer=L3, reader=L2
+l2_abort_flag: writer=L2, reader=L3
+```
+
+Each flag is initialized to `0`. On local infrastructure poison, the endpoint
+sets its owned flag to `1` with `NotifyOp.Set`. The flag never resets within a
+queue lifetime. It is a terminal boolean, not an epoch and not a poison count.
+
+Abort flags are for timeout disambiguation. PR1 does not require every wait
+loop iteration to poll both data progress and abort progress. A blocking queue
+operation that reaches its timeout samples the peer abort flag:
+
+```text
+peer abort_flag == 0:
+  return ordinary timeout/no-progress;
+  keep the local queue live;
+  do not set the local abort flag.
+
+peer abort_flag == 1:
+  return remote-aborted transport failure;
+  transition the local handle to a terminal remote-aborted state;
+  do not publish descriptors or advance queue state;
+  do not set the local abort flag solely because the peer flag was observed.
+```
+
+The implementation may represent terminal remote abort with the existing
+`POISONED` state, but the reason must remain distinct:
+
+```text
+POISONED(local-infrastructure): set own abort_flag = 1
+POISONED(remote-aborted):       do not set own abort_flag
+```
+
+This distinction prevents a peer abort observation from being amplified into a
+new local infrastructure poison report.
+
+## 10. Capacity, Counters, And Reconstruction
+
+`depth` is the user-visible queue capacity. A queue created with `depth=N` can
+hold `N` published, unreleased descriptors.
+
+Validation rules:
+
+- `depth` must be a power of two;
+- `depth <= 2^30`;
+- queue capacity is `depth`, not `depth - 1`.
+
+Full and empty checks must use monotonic local `uint64_t` head/tail values, not
+only masked ring indices:
+
+```text
+empty iff tail == head
+full  iff tail - head == depth
+invalid shared state iff tail - head > depth
+```
+
+The shared head/tail counters store only the low 32 bits. Each endpoint keeps
+local `uint64_t` copies and reconstructs observed progress with signed 32-bit
+delta semantics:
+
+```text
+delta = int32_t(observed_low32 - local_low32)
+valid progress: 0 <= delta <= depth
+```
+
+`delta == depth` is valid. A peer may legally move from empty to full between
+observations. Negative deltas or deltas larger than `depth` are inconsistent
+shared state and poison the observing endpoint.
+
+Descriptor slot validity does not depend on opcode or slot clearing. A
+published descriptor is valid only when:
+
+```text
+slot.seq == expected_seq
+expected_seq == local_head_or_tail + 1
+slot_index == (expected_seq - 1) & (depth - 1)
+```
+
+Equivalent index calculations are allowed, but the sequence check must use the
+full 64-bit `seq`. Descriptor slots do not need to be cleared before reuse.
+
+Before a producer reuses released descriptor slots or payload arena bytes, it
+must replay exactly the released FIFO prefix after observing head progress.
+Replay must happen before slot reuse. Zero-byte descriptors in replay advance
+descriptor state only and do not advance payload cursors.
+
+## 11. Producer And Consumer Operation Details
+
+Producer sequence:
+
+```text
+reserve -> fill/copy payload if payload_nbytes > 0 -> publish descriptor
+```
+
+Consumer sequence:
+
+```text
+peek/acquire descriptor -> read/view payload if payload_nbytes > 0
+-> release descriptor and payload
+```
+
+Descriptor publication order:
+
+1. reserve a descriptor slot and, for non-zero payloads, a contiguous payload
+   arena span;
+2. write or expose the payload bytes;
+3. write descriptor fields other than `seq`;
+4. write `seq` as the descriptor validity marker;
+5. release-publish the tail counter.
+
+Descriptor release order:
+
+1. finish all uses of the message payload;
+2. update local release and payload cursor state;
+3. release-publish the head counter.
+
+Each direction allows at most one outstanding producer reservation. Publishing
+an unknown, stale, already-published, already-canceled, or cross-queue
+reservation is a local ownership contradiction and poisons the queue.
+
+The base queue has no reservation cancel. If a producer has successfully
+reserved a non-zero payload span and later cannot safely publish either `DATA`
+or application `ERROR`, it must poison the queue. If the queue remains
+trustworthy, the application may publish an `ERROR` descriptor using the
+reservation.
+
+`STOP` is an input-queue descriptor. It consumes one input descriptor slot,
+uses `payload_nbytes == 0` and `payload_offset == 0`, and is terminal for L3
+input enqueue. After L3 successfully publishes `STOP`, later input `DATA`,
+`ERROR`, or `STOP` attempts are rejected locally without poisoning. If L2 has
+observed `STOP` and later observes another published input descriptor, the
+descriptor is invalid published state and poisons the queue.
+
+`ERROR` remains an application-level message. Receiving `ERROR` does not poison
+the queue, set an abort flag, stop either direction, or imply transport abort.
+
+## 12. Error Handling Rules
+
+The guiding rule remains:
+
+```text
+Before shared-state mutation: reject, no poison, no abort flag.
+After shared-state mutation or inconsistent shared-state observation:
+  poison local infrastructure, set own abort_flag.
+```
+
+None of the following outcomes poisons the queue or sets an abort flag:
+
+- `try_enqueue` sees no descriptor or payload space;
+- `try_request_stop` sees no input descriptor slot;
+- a blocking operation times out under ordinary backpressure;
+- payload size exceeds the arena before reservation mutates state;
+- queue creation rejects invalid layout or reconstruction parameters;
+- output buffer is too small before payload copy and before release;
+- invalid API arguments are caught before shared state is touched;
+- lazy staging allocation failure before primitive command issue;
+- enqueue is attempted after L3 has already published `STOP`;
+- application `ERROR` is sent or received normally.
+
+Infrastructure poison sets the endpoint's own abort flag:
+
+- descriptor sequence mismatch;
+- invalid opcode observed in a published descriptor;
+- `STOP` observed on the output queue;
+- zero-byte descriptor with non-zero `payload_offset`;
+- non-zero descriptor payload range outside its direction-local arena;
+- head/tail reconstruction observes impossible progress;
+- payload replay observes impossible state;
+- payload copy failure after command issue;
+- counter notify failure;
+- control-service response timeout after command issue;
+- L2 endpoint fatal error for this region;
+- reservation, publish, or release ownership state becomes contradictory.
+
+Ordinary timeout is ambiguous until the peer abort flag is sampled. A timeout
+with peer abort flag `0` is not poison. A timeout with peer abort flag `1`
+transitions the local handle to terminal `remote-aborted` without setting the
+local abort flag.
+
+Cleanup and `free()` remain valid and idempotent after both local
+infrastructure poison and remote-aborted terminal state.
+
+## 13. Example
+
+PR2 adds one base queue example:
+
+```text
+examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/
+```
+
+The example should demonstrate the intended user shape, not every edge case.
+It must show:
+
+- L3 creating a queue with `depth > 1`;
+- multiple variable-size input `DATA` messages;
+- one zero-byte `DATA` message;
+- a persistent L2 loop;
+- L2 processing at most one active DATA input at a time;
+- one small message-local AICore task;
+- L2 publishing one output `DATA` per input `DATA`;
+- L3 publishing `STOP`;
+- L3 continuing to dequeue outputs after `STOP` according to application final
+  output rules;
+- L2 releasing the `STOP` descriptor and returning from the persistent run.
+
+The example should not demonstrate:
+
+- the L2 input window;
+- multiple active input messages;
+- one input producing multiple outputs;
+- multiple inputs producing one output;
+- out-of-input-order output publish;
+- application `ERROR` protocol design;
+- abort flag failure paths.
+
+The zero-byte `DATA` message should exercise the descriptor-only message path.
+It should not require a child-visible zero-byte host buffer.
+
+## 14. Test Plan
+
+Both PRs require automated tests for their review-driven boundaries. A manual
+review checklist is not enough.
+
+PR1 test scope:
+
+- ABI and layout;
+- descriptor/counter protocol;
+- zero-byte descriptor handling;
+- capacity, full/empty, wrap, and low32 reconstruction;
+- abort flag semantics;
+- L2 endpoint API;
+- L3 fast-path API with primitive-compatible registered host Tensors.
+
+PR2 test scope:
+
+- lazy internal staging for ordinary L3 host buffers;
+- registered Tensor fast path remains no-staging;
+- staging allocation failure is pre-mutation and non-poisoning;
+- base queue example and scene coverage.
+
+Suggested C++ unit test category:
+
+```text
+tests/ut/cpp/common/test_l3_l2_message_queue.cpp
+```
+
+Suggested C++ unit tests:
+
+- `LayoutAssignsAbortFlagsAfterDescriptorCounters`
+- `LayoutRequiresCounterBytesForSixCounters`
+- `DescriptorSlotEncodingIsStable`
+- `ZeroByteDescriptorUsesCanonicalOffset`
+- `ZeroByteDescriptorWithNonZeroOffsetPoisons`
+- `CapacityEqualsDepthAllowsNPublishedDescriptors`
+- `CapacityEqualsDepthRejectsNthPlusOneDescriptor`
+- `FullAndEmptyUseMonotonicCountersNotMaskedIndices`
+- `Low32ReconstructionAcceptsDeltaEqualDepth`
+- `Low32ReconstructionHandlesCounterWrap`
+- `Low32ReconstructionRejectsNegativeDelta`
+- `Low32ReconstructionRejectsDeltaGreaterThanDepth`
+- `ReplaySkipsPayloadCursorAdvanceForZeroByteDescriptors`
+- `ReplayBeforeSlotReuseAfterFullQueueWrap`
+- `LocalInfrastructurePoisonSetsOwnAbortFlag`
+- `RemoteAbortObservationDoesNotSetOwnAbortFlag`
+- `OrdinaryTimeoutDoesNotSetAbortFlag`
+- `ApplicationErrorDoesNotSetAbortFlag`
+- `PreMutationValidationFailureDoesNotSetAbortFlag`
+
+Suggested Python unit test category:
+
+```text
+tests/ut/py/test_l3_l2_message_queue.py
+```
+
+Suggested Python unit tests:
+
+- `test_layout_matches_cpp_helper`
+- `test_counter_offsets_include_abort_flags`
+- `test_zero_byte_enqueue_skips_payload_arena_copy`
+- `test_zero_byte_dequeue_skips_payload_arena_read`
+- `test_enqueue_rejects_ordinary_host_bytes_before_pr2_staging`
+- `test_output_read_rejects_ordinary_buffer_before_pr2_staging`
+- `test_enqueue_accepts_ordinary_host_bytes_with_lazy_staging`
+- `test_enqueue_registered_tensor_uses_fast_path_without_staging`
+- `test_output_read_into_ordinary_buffer_uses_lazy_staging`
+- `test_staging_allocation_failure_does_not_poison`
+- `test_timeout_with_peer_abort_flag_reports_remote_aborted`
+- `test_timeout_without_peer_abort_flag_returns_timeout`
+- `test_remote_aborted_terminal_state_rejects_later_operations`
+
+Suggested scene/example tests:
+
+```text
+examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/
+```
+
+Suggested scene cases:
+
+- `variable_size_messages`: enqueue/dequeue several non-zero `DATA` messages;
+- `zero_byte_data`: send one zero-byte `DATA` and verify one corresponding
+  output is produced without payload arena bytes;
+- `depth_capacity`: with `depth=N`, publish `N` inputs before backpressure;
+- `fifo_stop`: publish `STOP`, drain outputs, and verify L2 exits;
+- `small_aicore_work`: each non-zero input launches message-local AICore work;
+- `l2_abort_flag_timeout_disambiguation`: force an L2 local infrastructure
+  poison, then verify L3 timeout reports remote-aborted instead of ordinary
+  timeout.
+
+The scene test matrix should include the supported simulation platforms where
+practical:
+
+- `a2a3sim`;
+- `a5sim`.
+
+Hardware execution should include `a2a3` onboard when device access is
+available through the repository's `task-submit` workflow.
diff --git a/docs/l3-l2-message-queue-design.md b/docs/l3-l2-message-queue-design.md
new file mode 100644
index 000000000..414b80d02
--- /dev/null
+++ b/docs/l3-l2-message-queue-design.md
@@ -0,0 +1,922 @@
+# L3-L2 SPSC Message Queue Design
+
+## 1. Goal
+
+This document proposes the functional shape of an L3-L2 SPSC message queue
+wrapper built on top of the existing `docs/l3-l2-orch-comm.md` primitives.
+
+The feature goal is to let one L3 orchestrator exchange a sequence of input
+and output messages with one persistent L2 orchestrator run. L3 can enqueue
+task inputs and dequeue task outputs while the L2 run stays alive. This avoids
+stopping the L2 run after every task and then paying host/device finish and
+init costs again for the next task.
+
+The target shape has two layers:
+
+- a base bidirectional queue transport with input and output queues;
+- an L2-side input window helper that lets L2 hold multiple input messages
+  concurrently without changing the L3 API or the transport ABI.
+
+The base transport should land first for reviewability. The input window can
+then be added as an L2 helper policy on top of the same descriptor ABI, region
+layout, counter layout, and L3 queue API.
+
+The queue wrapper does not change the primitive L3-L2 communication service.
+It uses the existing region descriptor, payload byte range, and `int32_t`
+signal counter primitives.
+
+## 2. Existing Primitive Constraints
+
+The primitive L3-L2 communication layer provides:
+
+- one region descriptor containing payload and counter base/size fields;
+- contiguous payload byte access through `PAYLOAD_READ` and `PAYLOAD_WRITE`;
+- address-based `int32_t` signal counters through `SIGNAL_NOTIFY`,
+  `SIGNAL_TEST`, and `SIGNAL_WAIT`;
+- region lifetime, release, and poison state handling.
+
+The primitive layer deliberately does not define queue layout, stream headers,
+opcodes, tensor schema, descriptor rings, STOP semantics, or typed tensor
+metadata. The message queue wrapper owns those protocol choices.
+
+The primitive layer requires only 4-byte alignment for counter addresses inside
+the registered counter range. The queue wrapper places high-frequency shared
+counter signals at 64-byte strides so counters written by different agents do
+not share a cache line.
+
+## 3. Public Functional Shape
+
+L3 creates one bidirectional queue object:
+
+```python
+queue = orch.create_l3_l2_queue(
+    worker_id=0,
+    depth=8,
+    input_arena_bytes=1 << 20,
+    output_arena_bytes=1 << 20,
+)
+```
+
+The L3-visible queue API exposes an input queue and an output queue. L3 sends
+ordinary application messages to L2 through the input queue and receives
+ordinary application messages from L2 through the output queue.
+
+The wrapper computes:
+
+- descriptor ring sizes;
+- payload section offsets;
+- counter offsets;
+- total region payload bytes;
+- total counter bytes.
+
+The user does not pass internal descriptor offsets, arena offsets, or counter
+offsets.
+
+The queue owns one `L3L2OrchRegion`. The L2 task receives the primitive region
+descriptor plus queue layout scalars through `TaskArgs`.
+
+The intended L3 API shape is illustrative, but the semantics are part of the
+transport contract:
+
+```python
+queue.input.enqueue(host_buffer, nbytes=None, timeout=timeout_s)
+message = queue.output.dequeue_into(host_buffer, timeout=timeout_s)
+handle = queue.output.peek(timeout=timeout_s)
+queue.output.read_into(handle, host_buffer)
+queue.output.release(handle)
+queue.request_stop(timeout=timeout_s)
+queue.free()
+```
+
+The output ownership APIs `peek`, `read_into`, and `release` are part of the
+base L3 API. They are the recommended path for variable-size outputs because
+the caller can inspect `payload_nbytes` before choosing or allocating a target
+buffer. Convenience APIs such as `dequeue_into` may copy and release in one
+operation when the caller already has a large enough target buffer. Core APIs
+that hand ownership to the caller require explicit release.
+
+`queue.free()` releases the L3 queue handle. It rejects later queue operations,
+but it does not synchronously free device memory. Physical cleanup follows the
+underlying region lifetime model.
+
+The L3 public queue API accepts ordinary contiguous host byte spans for
+convenience enqueue and output read operations. When the supplied buffer is
+already a primitive-compatible registered `orch.alloc(...)` Tensor, the queue
+uses it as the zero-extra-host-copy fast path. Otherwise the queue lazily
+stages through an internal registered host Tensor before issuing the primitive
+payload command, then copies between that staging Tensor and the user buffer.
+Callers may pass `None` as the buffer for zero-byte DATA and ERROR messages.
+Staging hides the primitive child-visible Tensor requirement from ordinary
+queue users, but may add one host-to-host copy.
+
+The L2 input window extension is not visible to L3. It is an L2 helper policy
+that controls how many DATA input messages L2 may hold concurrently before
+releasing them in FIFO-safe order.
+
+## 4. Non-Goals
+
+- Multiple L2 orchestrators.
+- Multi-producer or multi-consumer queues.
+- Shared input/output payload allocator.
+- Split payload spans across arena wrap.
+- Dtype, shape, stride, tensor rank, or tile layout interpretation.
+- Changes to `ALLOC_REGION`, `PAYLOAD_READ`, `PAYLOAD_WRITE`,
+  `SIGNAL_NOTIFY`, `SIGNAL_TEST`, or `SIGNAL_WAIT`.
+- Exposing the L2 input window configuration through the L3 API.
+- Out-of-order input payload release.
+- Fragmented or hole-filled input arena allocators.
+- Output-side STOP acknowledgement messages.
+
+## 5. Region Layout
+
+The physical L3-L2 region has one payload range and one counter range. The
+queue wrapper divides the payload range into four logical sections:
+
+```text
+payload region
+├─ input descriptor ring
+├─ output descriptor ring
+├─ input payload arena
+└─ output payload arena
+```
+
+The descriptor rings live in the payload region because they are structured
+byte metadata. The counter range stores only head/tail signals and abort flags.
+
+The input and output payload arenas are logically separate. This preserves SPSC
+ownership:
+
+```text
+input arena:
+  producer = L3
+  consumer = L2
+
+output arena:
+  producer = L2
+  consumer = L3
+```
+
+A shared payload allocator is intentionally out of scope because it would have
+two producers and two releasers.
+
+The queue layout is derived, not transmitted as internal offsets. `TaskArgs`
+carry the primitive region descriptor followed by four queue parameters:
+
+```text
+primitive desc[0..5]
+queue_magic_version
+depth
+input_arena_bytes
+output_arena_bytes
+```
+
+The queue magic/version belongs to the queue wrapper ABI, not to the primitive
+region ABI. It covers the descriptor slot format, opcode values, deterministic
+layout function, head/tail reconstruction rules, and STOP/ERROR transport
+semantics.
+
+A shared C/C++ layout helper is the source of truth for derived offsets and
+sizes. Python may mirror that calculation, but tests must keep the Python
+calculation and the C/C++ helper in lockstep. The helper derives:
+
+```text
+input_desc_offset
+output_desc_offset
+input_arena_offset
+output_arena_offset
+input_desc_tail = 0
+input_desc_head = 64
+output_desc_tail = 128
+output_desc_head = 192
+l3_abort_flag = 256
+l2_abort_flag = 320
+```
+
+Validation rules:
+
+- `depth` must be a power of two and `depth <= 2^30`.
+- Queue capacity is `depth` messages, not `depth - 1`.
+- Descriptor slot size is fixed at 32 bytes.
+- Descriptor rings are 8-byte aligned.
+- Payload arena bases are 64-byte aligned.
+- `input_arena_bytes` and `output_arena_bytes` must be positive 64-byte
+  multiples. They do not need to be powers of two.
+- `counter_bytes` must be at least 384.
+- `payload_bytes` must contain both descriptor rings and both payload arenas.
+- Unsupported `queue_magic_version` on L2 is a fatal queue decode error for
+  this region.
+
+The L3 queue creator initializes the four shared head/tail counters and the
+two abort flags to zero before submitting the persistent L2 run. Descriptor
+slots and payload bytes do not need to be zeroed for correctness.
+
+## 6. Descriptor ABI
+
+Each descriptor slot is 32 bytes and is encoded as four little-endian
+`uint64_t` values:
+
+```cpp
+struct L3L2QueueDescSlot {
+    uint64_t seq;
+    uint64_t opcode;
+    uint64_t payload_offset;
+    uint64_t payload_nbytes;
+};
+static_assert(sizeof(L3L2QueueDescSlot) == 32);
+```
+
+The queue uses 64-byte spacing for shared signal counters, not for descriptor
+slots. Each descriptor ring is SPSC, so the base descriptor ABI needs only the
+four transport fields above.
+
+`seq` is a full 64-bit infrastructure sequence number used for ring
+correctness, wrap detection, diagnostics, and input-window validation. It is
+not a user correlation ID. Applications that need request IDs, batch IDs,
+partial/final markers, or other correlation should put them in their own
+payload header.
+
+`payload_offset` is relative to the primitive region payload base, so L2 can
+call `endpoint.payload_read(payload_offset, payload_nbytes, &view)` directly.
+
+Future descriptor extensions should use an ABI version or application payload
+headers instead of reserving unused fields in every slot.
+
+## 7. Opcodes
+
+The queue transport defines these opcodes:
+
+```text
+0      invalid / never published
+DATA   = 1 ordinary application payload message
+STOP   = 2 graceful input-side shutdown request, input queue only
+ERROR  = 3 ordinary application-level error payload message, either direction
+```
+
+`ERROR` is a normal queue message. The queue layer does not interpret its
+payload, does not raise a transport exception for it, and does not poison the
+queue when it sees one. Applications define whether an `ERROR` payload
+correlates with a request, batch, stream, or other application state.
+
+Infrastructure errors are handled through poison state, not by trying to write
+an `ERROR` message into a potentially untrusted queue.
+
+`STOP` is valid only on the input queue. The output queue has no STOP message.
+L2 shutdown acknowledgement is provided by `Worker.run` drain, not by an
+output STOP. Observing STOP on the output queue is invalid published
+descriptor state and poisons the queue.
+
+DATA and ERROR may carry zero payload bytes. For any zero-byte message,
+`payload_offset` must be zero and the message consumes no payload arena bytes.
+STOP must also use `payload_nbytes == 0` and `payload_offset == 0`.
+
+## 8. Descriptor Counters And Derived Payload Cursors
+
+The queue shares descriptor head/tail values and two abort flags through the
+primitive layer's `int32_t` signal counters, each at a 64-byte stride:
+
+```text
+offset 0:   input_desc_tail       writer=L3
+offset 64:  input_desc_head       writer=L2
+offset 128: output_desc_tail      writer=L2
+offset 192: output_desc_head      writer=L3
+offset 256: l3_abort_flag         writer=L3
+offset 320: l2_abort_flag         writer=L2
+```
+
+`counter_bytes` must be at least 384.
+
+The abort flags are single-writer terminal booleans used to disambiguate
+operation timeouts from remote infrastructure abort. They are initialized to
+zero and set to one with `NotifyOp.Set` when the owning endpoint enters local
+infrastructure poison. They do not carry application `ERROR` semantics, do not
+count poison events, and do not reset within a queue lifetime.
+
+Blocking queue operations are not required to poll abort flags on every wait
+iteration. When a blocking operation times out, the implementation samples the
+peer abort flag. If the peer flag is zero, the timeout remains ordinary
+no-progress and does not poison the local queue. If the peer flag is one, the
+operation reports remote infrastructure abort and transitions the local handle
+to a terminal remote-aborted state. Observing a peer abort flag does not set
+the local endpoint's own abort flag.
+
+The shared descriptor counters store the low 32 bits of logical `uint64_t`
+head/tail values. These values are monotonic message counts. The primitive
+transports these bits through `int32_t` counters. Endpoints reconstruct local
+`uint64_t` head/tail values from sampled counter values using signed 32-bit
+delta semantics:
+
+```text
+delta = int32_t(observed_low32 - local_low32)
+valid progress: 0 <= delta <= depth
+```
+
+Negative deltas or deltas larger than `depth` are inconsistent shared state.
+Queue creation rejects descriptor depths that would make head/tail
+reconstruction ambiguous; that rejection is a validation error, not poison.
+
+Descriptor head/tail reconstruction is safe because unobserved descriptor
+progress is bounded by the descriptor ring depth. Payload byte cursors are not
+shared counters and are not reconstructed from low-32-bit signal values.
+
+Each endpoint maintains the payload cursors it needs as local `uint64_t`
+state:
+
+```text
+producer local:
+  payload_tail
+  inferred_payload_head
+
+consumer local:
+  payload_head
+```
+
+The producer infers reusable payload space by observing `desc_head`
+progress and replaying the released descriptors before reusing those descriptor
+slots. The consumer maintains its local `payload_head` while releasing
+descriptors.
+Because payload cursor progress is derived from descriptor FIFO history, payload
+arena size is not limited by 32-bit signal counter reconstruction.
+
+Queue correctness is based on reconstructed descriptor head/tail state plus
+descriptor replay, not on primitive `GE` / `LT` comparison over the 32-bit
+counter value. Blocking queue operations use bounded polling over `SIGNAL_TEST`
+snapshots plus local queue-state checks. The timeout belongs to the wrapper
+operation. The design does not require primitive `SIGNAL_WAIT` for queue
+correctness.
+
+Local queue state may advance only after a matched `SIGNAL_TEST` snapshot. A
+failed `SIGNAL_TEST` result does not establish acquire ordering, and its
+`observed` value must not drive descriptor head/tail reconstruction, descriptor
+replay, or payload release. Implementations should choose a comparison that
+matches when the sampled counter has changed, such as `NE` against the local
+low-32 value. The protocol does not prescribe a busy-poll, sleep, yield, or
+backoff strategy.
+
+If a live endpoint observes counter, head/tail, cursor, or descriptor state that
+contradicts the descriptor reconstruction or payload replay rules, that is
+inconsistent shared state and poisons the queue.
+
+Descriptor slots carry the full 64-bit per-message `seq`, so message-level
+validation does not depend on reconstructing sequence numbers from counters.
+Input and output queues have independent sequence spaces. In each direction,
+the first published message has `seq = 1`; head/tail counters start at zero and
+store the number of messages published or released. A published slot has
+`seq = tail_before_publish + 1`.
+
+## 9. Payload Arena
+
+Each direction has a variable-size SPSC byte arena.
+
+Rules:
+
+- `payload_tail` and `payload_head` are logical `uint64_t` byte cursors.
+- Actual arena offset is `cursor % arena_bytes`.
+- `arena_bytes` is limited by region allocation capacity, addressability, and
+  runtime memory budget, not by 32-bit signal counter reconstruction.
+- A single message payload must be one contiguous span.
+- A single message payload must be `<= arena_bytes`.
+- Split payloads across the arena wrap are not supported.
+- If remaining bytes at the arena end cannot hold the next payload, the
+  producer may insert invisible padding by advancing `payload_tail` to the next
+  arena cycle.
+- Padding has no descriptor. On release, the consumer compares
+  `payload_head % arena_bytes` with the descriptor's arena-relative payload
+  offset. If they differ, the only valid base-queue case is wrap padding: the
+  descriptor offset is the base offset of this direction's arena and the
+  releaser first advances `payload_head` to the next arena cycle. It then
+  advances `payload_head` by `payload_nbytes`. Any other mismatch is
+  inconsistent shared state and poisons the queue. The same replay rule is used
+  by the producer after observing `desc_head` progress, before it reuses
+  released descriptor slots.
+- Zero-byte messages do not participate in wrap-padding checks and do not
+  advance payload cursors.
+
+Backpressure must check both descriptor slots and payload arena bytes. A free
+descriptor slot is not enough if the payload arena lacks enough contiguous
+space.
+
+Payload validation is direction-local. DATA and ERROR payloads must lie wholly
+inside the input arena for input descriptors, and wholly inside the output
+arena for output descriptors. Being inside the primitive payload range is not
+enough.
+
+## 10. Core Operation Sequence
+
+The queue exposes direction-specific operations. Exact class names may change,
+but the operation set and ownership semantics are the transport contract.
+
+L3 owns the input producer and output consumer operations:
+
+```text
+input.enqueue(buffer, nbytes, timeout)
+input.try_enqueue(buffer, nbytes)
+output.dequeue_into(buffer, timeout)
+output.try_dequeue_into(buffer)
+output.peek(timeout) -> message handle
+output.try_peek() -> message handle or no-progress
+output.read_into(handle, buffer)
+output.release(handle)
+request_stop(timeout)
+try_request_stop()
+free()
+```
+
+`dequeue_into` is the convenience path for full-message copy and release.
+The `peek` / `read_into` / `release` path is the explicit-ownership path.
+`free` releases the L3 queue handle, not the physical region.
+
+L2 owns the input consumer and output producer operations:
+
+```text
+input.peek(timeout) -> input handle
+input.try_peek() -> input handle or no-progress
+input.release(handle)
+output.reserve(nbytes, timeout) -> reservation
+output.try_reserve(nbytes) -> reservation or no-progress
+output.publish(reservation, opcode)
+```
+
+The L2 input window extension wraps the input consumer with additional
+`complete(handle)` ownership; it does not change the base transport ABI. The
+base queue has no output dequeue operation on L2 and no input enqueue operation
+on L2.
+
+The producer sequence is:
+
+```text
+reserve -> fill/copy payload -> publish descriptor
+```
+
+The consumer sequence is:
+
+```text
+peek/acquire descriptor -> read/view payload -> release descriptor and payload
+```
+
+Convenience APIs are built from the core operation sequence:
+
+```text
+enqueue      = reserve + copy + publish
+dequeue_into = peek + read + release
+```
+
+L3 input enqueue can usually use the convenience path because the input payload
+already exists in a host-visible buffer.
+
+L2 output needs the core path because it often must reserve output arena space
+before launching AICore work:
+
+```cpp
+auto out = output_queue.reserve(output_nbytes, timeout);
+Tensor output = make_tensor_external(out.gm_addr, shape, rank, dtype);
+// submit AICore work that writes output
+// synchronize so output bytes are visible
+output_queue.publish(out, L3L2QueueOpcode::DATA);
+```
+
+Each queue direction allows at most one outstanding producer reservation.
+`publish` accepts only the current outstanding reservation for that direction.
+Publishing an unknown, stale, already-published, or cross-queue reservation is
+a local ownership contradiction and poisons the queue.
+
+The base queue does not support reservation cancel. A successful reserve must
+be published. If filling the reservation fails but the queue remains
+trustworthy, the application may publish an ERROR message using that
+reservation. If the reservation cannot be safely published, the producer
+poisons the queue.
+
+Descriptor publication is ordered. The producer writes payload bytes first,
+writes descriptor fields, writes `seq` as the descriptor validity marker after
+the other descriptor fields, and then release-publishes the tail counter. The
+consumer acquire-observes tail progress before reading the slot, and
+accepts the descriptor only when `slot.seq` equals the expected sequence.
+
+Descriptor slots do not need to be cleared before reuse. Sequence validation
+distinguishes old and new contents.
+
+Descriptor release is ordered in the opposite direction. The consumer must
+finish using the payload, update local release state, and release-publish the
+head counter. The producer may replay released descriptors and infer reusable
+payload space only after acquire-observing matched head progress.
+
+All blocking operations require finite timeouts. Nonblocking `try_*` variants
+return without changing shared state when no descriptor slot, message, or
+payload space is available. Timeout under ordinary backpressure does not
+poison the queue.
+
+The queue layer returns transport messages to the application:
+
+```text
+seq
+opcode
+payload bytes or payload view
+```
+
+The queue layer does not infer application request correlation from queue order
+or from transport `seq`.
+
+Queue ownership is per message, not per byte range. Release or complete always
+applies to the whole descriptor payload span.
+
+For L3 convenience dequeue, a too-small output buffer is a local validation
+failure. The descriptor remains at the queue head, no release is published, and
+the caller may retry with a larger target buffer.
+
+## 11. Base L2 Processing Contract
+
+After acquiring one input message, L2 application code may submit any number
+of message-local AICore tasks and use runtime dependencies, manual scopes,
+async notify, or other L2 orchestration features.
+
+The base helper and example do not overlap ownership of multiple input
+messages. They keep at most one active DATA input message at a time:
+
+```text
+peek input
+reserve output
+submit message-local AICore work
+wait or otherwise prove message-local work is safe
+publish output
+release input
+next message
+```
+
+L2 must not release an input message until AICore no longer reads that input
+payload and any corresponding output has been successfully published.
+
+After an input is released, L2 and any in-flight AICore work must not read its
+payload view again.
+
+The queue layer does not understand dtype, shape, stride, or tensor schema. It
+returns byte views. Applications build typed tensors with their own protocol
+metadata.
+
+## 12. L2 Input Window Extension
+
+The target feature shape includes an L2 input window helper. The helper lets L2
+hold multiple DATA input messages concurrently while preserving FIFO-safe input
+release. It enables application-defined output cardinality and output order:
+
+- one input may produce no output;
+- one input may produce multiple outputs;
+- several inputs may produce one output;
+- status or progress outputs may be published independently;
+- output publish order may differ from input acquire order.
+
+The L3-visible queue API is unchanged by the input window extension. L3 still
+observes an input queue and an output queue. L3 receives output messages in
+publish order and does not infer input/output correlation from queue order.
+Correlation, aggregation, partial/final markers, request IDs, and batch IDs
+belong in the application payload header.
+
+`max_l2_inflight` is a local L2 helper policy. It is not part of queue creation
+and does not affect region layout:
+
+```cpp
+L3L2QueueEndpoint queue(desc, layout);
+L3L2InputWindow input_window(
+    queue.input(),
+    L3L2InputWindowConfig{.max_l2_inflight = 4}
+);
+```
+
+The helper tracks input handles with these states:
+
+```text
+ACQUIRED
+  Descriptor has been read. Payload view is available to L2.
+
+COMPLETED
+  Application has declared the input payload is no longer needed.
+
+RELEASED
+  Helper has advanced the input descriptor and payload cursors past this input.
+```
+
+The state transition is:
+
+```text
+ACQUIRED -> COMPLETED -> RELEASED
+```
+
+The application owns the transition to `COMPLETED`; the helper owns the
+transition to `RELEASED`. Completing an input means no future L2 code or
+in-flight AICore task will read that input payload, and the payload is no
+longer needed to construct future output.
+
+Completion is explicit. The helper must not infer completion from C++ object
+destruction or lexical scope exit. A handle that is completed twice, released
+twice, or destroyed while still active is a local ownership error.
+
+The helper releases inputs through a FIFO watermark. If inputs 10, 11, and 12
+are acquired and inputs 10 and 12 are completed, the helper may release input
+10 only. It must not release input 12 until input 11 is also completed. This
+keeps the input payload arena monotonic and avoids holes.
+
+Output publish remains application-driven and independent of input handles:
+
+```cpp
+auto out = queue.output().reserve(nbytes, timeout);
+// fill output directly or submit AICore work that writes out.gm_addr
+queue.output().publish(out, L3L2QueueOpcode::DATA);
+```
+
+The input window extension does not add an output completion manager. The L2
+application owns completion tracking and decides when an output is ready to
+publish.
+
+Output reservation and publish remain single-outstanding per direction. The
+input window allows multiple active input handles; it does not introduce
+multiple concurrent output reservations.
+
+## 13. STOP Semantics
+
+`STOP` is an input queue descriptor message:
+
+```text
+seq + opcode=STOP + payload_nbytes=0
+```
+
+It follows normal FIFO ordering. STOP is a graceful shutdown request, not
+cancel and not an immediate no-more-output marker.
+
+Base helper behavior:
+
+- L2 exits only after processing every message queued ahead of the STOP.
+- L2 releases the STOP descriptor and returns from the persistent run.
+- `Worker.run` drain acts as the final acknowledgement.
+- No extra STOP ACK counter is required.
+
+Input-window behavior:
+
+- STOP can be acquired while earlier DATA inputs are still active.
+- STOP does not take effect ahead of earlier DATA inputs.
+- The helper stops acquiring further DATA inputs after STOP is observed.
+- Earlier active DATA inputs continue until the application completes them.
+- Outputs produced by earlier DATA inputs may still be published while the
+  helper drains.
+- The helper releases only the FIFO completed prefix.
+- Once all earlier DATA inputs are released, the helper releases STOP and the
+  persistent L2 run exits.
+
+STOP takes an input descriptor slot but does not count against
+`max_l2_inflight`, because `max_l2_inflight` controls only active DATA input
+ownership.
+
+STOP is terminal for the input queue. After L3 successfully publishes STOP,
+the input queue rejects further DATA, ERROR, or STOP enqueue attempts locally
+without poisoning. If L2 has observed STOP and later observes any further
+published input descriptor, including a second STOP, that is invalid published
+descriptor state and poisons the queue.
+
+STOP does not close the output queue. After publishing STOP, L3 may continue
+dequeueing DATA or ERROR messages from the output queue. The transport has no
+output-side terminal message and does not automatically know that the
+persistent L2 run has returned. Applications that need to know all business
+outputs have arrived must define that condition in their payload protocol, for
+example with expected counts or final markers.
+
+Publishing STOP and then immediately returning from the L3 orchestration
+function is transport-legal. It can still be an application error if L2 needs
+to publish final outputs: the output queue may fill and prevent L2 from
+finishing, causing `Worker.run` drain to fail or time out.
+
+Convenience APIs may expose:
+
+```text
+try_request_stop()
+request_stop(timeout)
+```
+
+`try_request_stop()` attempts to publish a STOP descriptor to the input queue
+and returns immediately if no input descriptor slot is available.
+
+`request_stop(timeout)` performs a bounded wait until a STOP descriptor can be
+published. The timeout covers only STOP enqueue/publish. It does not wait for
+L2 exit and does not drain outputs. If the timeout expires before STOP is
+published, the queue remains live and is not poisoned.
+
+## 14. Queue Lifetime And Cleanup
+
+A queue owns one primitive `L3L2OrchRegion`. Queue cleanup follows the
+underlying region cleanup path:
+
+```text
+optional request_stop() -> L2 persistent run exits
+L3 orchestration function returns
+Worker.run drains submitted L2 work
+runtime sends FREE_REGION for live L3-L2 regions
+queue/region handles expire
+```
+
+`request_stop()` and `queue.free()` are different operations. `request_stop()`
+is a protocol message that asks L2 to stop acquiring input. `queue.free()` is a
+local handle release that rejects later queue use. Neither operation
+synchronously releases the physical payload/counter region.
+
+Physical release is deferred until `Worker.run` has drained submitted L2 work.
+This keeps region memory live while an in-flight L2 task may still hold the
+primitive descriptor or payload views. If the L3 orchestration function exits
+with a live queue, runtime cleanup releases it through the same region cleanup
+path.
+
+Queue cleanup does not require the output queue to be empty. Once `Worker.run`
+has drained and the persistent L2 run has returned, freeing the region is
+memory-safe even if L3 left output messages unread. Those unread messages are
+discarded with the region. Applications that need every output must dequeue
+until their own final-output condition is satisfied before calling
+`queue.free()` or returning from the orchestration function.
+
+## 15. Error And Poison
+
+Application-level failure is represented by `opcode=ERROR` and optional
+application-defined payload bytes. `ERROR` is allowed in either direction and
+may be published during normal processing or while draining after STOP.
+Receiving `ERROR` does not poison the queue and does not change STOP
+semantics.
+
+Infrastructure poison is a queue/region state, not a descriptor message.
+
+The guiding rule is:
+
+```text
+Before shared-state mutation: reject, no poison.
+After shared-state mutation or inconsistent shared-state observation: poison.
+```
+
+Examples that do not poison:
+
+- `try_enqueue` sees no space.
+- `try_request_stop` sees no input descriptor slot.
+- Blocking enqueue/dequeue/request-stop times out under ordinary backpressure.
+- Payload is larger than the arena before reserve mutates state.
+- Queue creation rejects ambiguous descriptor head/tail reconstruction
+  parameters.
+- User buffer is too small before read copies payload bytes.
+- Invalid API arguments are caught before touching shared state.
+
+Examples that poison:
+
+- descriptor sequence mismatch;
+- invalid opcode observed in a published descriptor;
+- STOP observed on the output queue;
+- descriptor payload range outside its arena;
+- descriptor head/tail reconstruction or payload replay observes impossible
+  shared state;
+- payload copy failure after command issue;
+- counter notify failure;
+- control-service response timeout after command issue;
+- L2 endpoint fatal error for this region;
+- reservation, publish, or release state becomes self-contradictory.
+
+Ordinary queue operation timeout does not prove remote poison. After a
+blocking operation times out, the endpoint samples the peer abort flag. If the
+peer flag is still zero, the timeout remains ordinary no-progress and does not
+poison the local queue. If the peer flag is one, the endpoint reports remote
+infrastructure abort and transitions its local handle to a terminal
+remote-aborted state without setting its own abort flag. The peer may also
+observe primitive region fatal errors or `Worker.run` drain errors.
+
+Only local infrastructure poison sets the endpoint's own abort flag. Ordinary
+timeouts, application `ERROR` messages, pre-mutation validation failures, and
+observing the peer's abort flag do not set it.
+
+The L2 input window helper also poisons the queue when local ownership state
+becomes contradictory:
+
+- completing an input handle unknown to the helper;
+- completing or releasing a handle twice;
+- attempting to release a non-contiguous input while earlier inputs remain
+  incomplete;
+- acquiring DATA after STOP has put the helper into draining;
+- observing an acquired input sequence that contradicts the helper window.
+
+The Python queue object mirrors the existing region state model:
+
+```text
+LIVE
+RELEASED
+POISONED(local-infrastructure)
+POISONED(remote-aborted)
+EXPIRED
+```
+
+After poison, all reserve, enqueue, peek, read, release, publish, and
+stop-request operations are rejected. Cleanup/free remains idempotent and valid.
+
+L2 C++ helper poison reports a fatal error including the primitive region id,
+so existing Host-side parsing can poison the corresponding region.
+
+## 16. Implementation Staging
+
+The feature can be implemented in two review-friendly stages. This staging is
+not an API boundary: the base transport should intentionally leave room for
+the input window without later ABI or L3 API changes.
+
+```text
+Stage 1:
+  base SPSC message queue transport
+  input and output descriptor rings
+  input and output payload arenas
+  descriptor head/tail protocol over int32_t signal counters
+  single-writer abort flags for timeout disambiguation
+  derived uint64_t payload cursors via descriptor replay
+  DATA / ERROR / input-only STOP
+  one active DATA input in the L2 helper/example
+
+Stage 2:
+  L2 input window helper
+  max_l2_inflight
+  application-driven input complete
+  FIFO-safe release of completed input prefix
+  flexible output cardinality and out-of-input-order output publish
+  FIFO STOP drain with earlier DATA inputs still active
+```
+
+Stage 1 intentionally leaves room for Stage 2 through these hook points:
+
+- descriptor `seq` is explicit and 64-bit;
+- input release is explicit, not tied to dequeue;
+- output reserve and publish are separate;
+- each direction has at most one outstanding producer reservation;
+- application correlation is kept in payload, so queue transport does not
+  assume one input maps to one output;
+- L3 queue creation and output ownership/dequeue APIs do not depend on
+  `max_l2_inflight`.
+
+Expected implementation locations:
+
+```text
+python/simpler/l3_l2_message_queue.py
+src/common/platform/include/aicpu/l3_l2_message_queue.h
+docs/l3-l2-message-queue.md
+examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/
+examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue_input_window/
+```
+
+The exact Python module and public API names may change during implementation,
+but the transport contract should remain stable.
+
+## 17. Tests And Examples
+
+Base queue tests should cover:
+
+- layout calculation;
+- descriptor slot encoding;
+- counter offset assignment;
+- queue creation rejecting ambiguous descriptor head/tail reconstruction
+  parameters;
+- enqueue reserve failure for payload larger than arena;
+- backpressure when descriptor ring is full;
+- backpressure when payload arena is full;
+- arena wrap with invisible padding;
+- STOP descriptor handling;
+- `try_request_stop` and `request_stop(timeout)` behavior;
+- ERROR as a normal application message in either direction;
+- L3 ordinary host-buffer enqueue/read through lazy staging;
+- L3 primitive-compatible registered Tensor fast paths without staging;
+- staging allocation failure before any primitive command is issued, which
+  must not poison the queue;
+- abort flags distinguishing ordinary timeout from remote infrastructure
+  abort;
+- local infrastructure poison setting the local abort flag;
+- remote-aborted terminal state not setting the local abort flag;
+- poison on invalid published descriptor state;
+- poison on descriptor head/tail reconstruction or payload replay
+  inconsistency;
+- no poison on pre-mutation validation failure.
+
+The new example should be parallel to the existing primitive stream example,
+not a replacement for it. The primitive stream example should remain as the
+minimal demonstration of `docs/l3-l2-orch-comm.md`.
+
+The base queue example should demonstrate:
+
+- `depth > 1`;
+- variable-size input and output payloads;
+- input and output backpressure;
+- L2 persistent loop;
+- one input message containing message-local AICore work;
+- FIFO STOP shutdown;
+- L3 optionally dequeuing output after STOP according to application final
+  output rules.
+
+Input window tests and examples should cover:
+
+- `max_l2_inflight > 1`;
+- refusing to acquire new DATA input when the input window is full;
+- multiple input messages acquired before earlier inputs release;
+- application-driven input completion;
+- releasing only the FIFO completed prefix;
+- one input producing multiple outputs;
+- multiple inputs producing one output;
+- output publish order differing from input acquire order;
+- output correlation stored in the application payload header;
+- STOP entering draining while earlier DATA inputs remain active;
+- output DATA or ERROR publish during STOP drain;
+- local ownership errors poisoning the queue.
+
+Future work beyond the staged implementation is limited to out-of-order input
+payload release, fragmented payload arena allocation, abort reason/status
+metadata, low-latency abort polling, or concurrent output reservations, if
+those become necessary.

From 105de41b4e51a6759c07944d6d4ed0f783b51dab Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Fri, 26 Jun 2026 18:24:32 +0800
Subject: [PATCH 2/7] Add: L3-L2 message queue core transport

- Implement the PR1 L3 queue wrapper and L2 endpoint ABI on top of
  the primitive L3-L2 orchestration region transport.
- Wire Orchestrator.create_l3_l2_queue and cover descriptor layout,
  zero-byte messages, abort flags, capacity, and fast-path buffers in
  Python and C++ unit tests.
---
 python/simpler/l3_l2_message_queue.py         | 534 ++++++++++++++
 python/simpler/orchestrator.py                |  14 +
 .../include/aicpu/l3_l2_message_queue.h       | 659 +++++++++++++++++
 tests/ut/cpp/CMakeLists.txt                   |  17 +
 .../cpp/common/test_l3_l2_message_queue.cpp   | 495 +++++++++++++
 .../test_worker/test_l3_l2_message_queue.py   | 666 ++++++++++++++++++
 6 files changed, 2385 insertions(+)
 create mode 100644 python/simpler/l3_l2_message_queue.py
 create mode 100644 src/common/platform/include/aicpu/l3_l2_message_queue.h
 create mode 100644 tests/ut/cpp/common/test_l3_l2_message_queue.cpp
 create mode 100644 tests/ut/py/test_worker/test_l3_l2_message_queue.py

diff --git a/python/simpler/l3_l2_message_queue.py b/python/simpler/l3_l2_message_queue.py
new file mode 100644
index 000000000..462554650
--- /dev/null
+++ b/python/simpler/l3_l2_message_queue.py
@@ -0,0 +1,534 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+"""L3-side L3-L2 SPSC message queue wrapper."""
+
+from __future__ import annotations
+
+import ctypes
+import struct
+import time
+from dataclasses import dataclass
+from enum import IntEnum
+from typing import Any
+
+from .l3_l2_orch_comm import (
+    L3L2OrchCommCmd,
+    L3L2OrchCommRequest,
+    L3L2OrchRegion,
+    NotifyOp,
+    WaitCmp,
+)
+from .task_interface import DataType, Tensor
+
+# Queue ABI identification; combined into one scalar by l3_l2_queue_magic_version().
+# The C++ header added in the same patch declares the same values.
+L3L2_QUEUE_MAGIC = 0x4C335132
+L3L2_QUEUE_ABI_MAJOR = 1
+L3L2_QUEUE_ABI_MINOR = 0
+# One descriptor slot is four little-endian uint64 fields (see _DESC below).
+L3L2_QUEUE_DESC_SLOT_BYTES = 32
+# Arena sizes must be multiples of this value and arena offsets are rounded up to it.
+L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64
+# Byte distance between consecutive counters; the six offsets below are 0..5 times the stride.
+L3L2_QUEUE_COUNTER_STRIDE = 64
+L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET = 0
+L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET = 64
+L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET = 128
+L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET = 192
+L3L2_QUEUE_L3_ABORT_FLAG_OFFSET = 256
+L3L2_QUEUE_L2_ABORT_FLAG_OFFSET = 320
+# Size of the counter area requested from the region: six counters * 64-byte stride.
+L3L2_QUEUE_COUNTER_BYTES = 384
+# Upper bound accepted for the descriptor ring depth (2^30 slots).
+L3L2_QUEUE_MAX_DEPTH = 1 << 30
+
+# Descriptor wire format: seq, opcode, payload_offset, payload_nbytes (each a little-endian uint64).
+_DESC = struct.Struct("<4Q")
+# Sleep between retries in the blocking enqueue/peek loops (50 microseconds).
+_POLL_INTERVAL_S = 0.00005
+
+
+class L3L2QueueOpcode(IntEnum):
+    """Opcode stored in the second 64-bit field of a descriptor slot."""
+
+    # Zero value; never written by the L3 enqueue paths in this module.
+    INVALID = 0
+    # Application message; may carry a payload or be zero-byte.
+    DATA = 1
+    # Input-only shutdown marker; must be zero-byte and is rejected when seen on the output queue.
+    STOP = 2
+    # Application-level error message; handled by this module like any other non-STOP message.
+    ERROR = 3
+
+
+class _QueueState(IntEnum):
+    """Lifecycle state of the L3 queue handle; every state except LIVE rejects queue operations."""
+
+    # Normal operating state.
+    LIVE = 0
+    # Set by L3L2Queue.free().
+    RELEASED = 1
+    # Set by L3L2Queue._poison_local() after a local infrastructure failure.
+    POISONED_LOCAL = 2
+    # Set when the L2 abort flag is observed after a blocking operation timed out.
+    POISONED_REMOTE = 3
+    # Set when the backing region reports that it expired.
+    EXPIRED = 4
+
+
+@dataclass(frozen=True)
+class L3L2QueueLayout:
+    """Immutable byte layout of one queue inside an L3-L2 region.
+
+    Instances are produced by ``make_l3_l2_queue_layout``; this class performs
+    no validation of its own.
+    """
+
+    # Number of descriptor slots in each of the two rings.
+    depth: int
+    # Byte offsets, within the region payload, of the descriptor rings and payload arenas.
+    input_desc_offset: int
+    output_desc_offset: int
+    input_arena_offset: int
+    output_arena_offset: int
+    # Arena sizes in bytes.
+    input_arena_bytes: int
+    output_arena_bytes: int
+    # Total payload size requested from the region (end of the output arena).
+    payload_bytes: int
+    # Byte offsets, within the region counter area, of the head/tail counters and abort flags.
+    input_desc_tail_offset: int
+    input_desc_head_offset: int
+    output_desc_tail_offset: int
+    output_desc_head_offset: int
+    l3_abort_flag_offset: int
+    l2_abort_flag_offset: int
+    # Total counter area size requested from the region.
+    counter_bytes: int
+
+
+@dataclass(frozen=True)
+class L3L2QueueMessage:
+    """Decoded descriptor slot; also used as the handle for an active output message."""
+
+    # 1-based sequence number; readers expect it to equal the local head count plus one.
+    seq: int
+    opcode: L3L2QueueOpcode
+    # Byte offset of the payload within the region payload; 0 for zero-byte messages.
+    payload_offset: int
+    # Payload length in bytes; may be 0.
+    payload_nbytes: int
+
+
+def l3_l2_queue_magic_version() -> int:
+    return (L3L2_QUEUE_MAGIC << 32) | (L3L2_QUEUE_ABI_MAJOR << 16) | L3L2_QUEUE_ABI_MINOR
+
+
+def _align_up(value: int, align: int) -> int:
+    remainder = value % align
+    return value if remainder == 0 else value + (align - remainder)
+
+
+def make_l3_l2_queue_layout(depth: int, input_arena_bytes: int, output_arena_bytes: int) -> L3L2QueueLayout:
+    depth = int(depth)
+    input_arena_bytes = int(input_arena_bytes)
+    output_arena_bytes = int(output_arena_bytes)
+    if depth <= 0 or depth & (depth - 1) != 0 or depth > L3L2_QUEUE_MAX_DEPTH:
+        raise ValueError("L3-L2 queue depth must be a power of two and <= 2^30")
+    if input_arena_bytes <= 0 or input_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0:
+        raise ValueError("L3-L2 queue input_arena_bytes must be a positive 64-byte multiple")
+    if output_arena_bytes <= 0 or output_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0:
+        raise ValueError("L3-L2 queue output_arena_bytes must be a positive 64-byte multiple")
+
+    desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES
+    input_desc_offset = 0
+    output_desc_offset = input_desc_offset + desc_ring_bytes
+    input_arena_offset = _align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT)
+    output_arena_offset = _align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT)
+    payload_bytes = output_arena_offset + output_arena_bytes
+    return L3L2QueueLayout(
+        depth=depth,
+        input_desc_offset=input_desc_offset,
+        output_desc_offset=output_desc_offset,
+        input_arena_offset=input_arena_offset,
+        output_arena_offset=output_arena_offset,
+        input_arena_bytes=input_arena_bytes,
+        output_arena_bytes=output_arena_bytes,
+        payload_bytes=payload_bytes,
+        input_desc_tail_offset=L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET,
+        input_desc_head_offset=L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET,
+        output_desc_tail_offset=L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET,
+        output_desc_head_offset=L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET,
+        l3_abort_flag_offset=L3L2_QUEUE_L3_ABORT_FLAG_OFFSET,
+        l2_abort_flag_offset=L3L2_QUEUE_L2_ABORT_FLAG_OFFSET,
+        counter_bytes=L3L2_QUEUE_COUNTER_BYTES,
+    )
+
+
+def create_l3_l2_queue(
+    orch: Any,
+    *,
+    worker_id: int,
+    depth: int,
+    input_arena_bytes: int,
+    output_arena_bytes: int,
+) -> L3L2Queue:
+    layout = make_l3_l2_queue_layout(depth, input_arena_bytes, output_arena_bytes)
+    region = orch.create_l3_l2_region(
+        worker_id=int(worker_id),
+        payload_bytes=layout.payload_bytes,
+        counter_bytes=layout.counter_bytes,
+    )
+    desc_fields = orch.alloc([24], DataType.UINT8)
+    desc_seq = orch.alloc([8], DataType.UINT8)
+    desc_read = orch.alloc([L3L2_QUEUE_DESC_SLOT_BYTES], DataType.UINT8)
+    for offset in (
+        layout.input_desc_tail_offset,
+        layout.input_desc_head_offset,
+        layout.output_desc_tail_offset,
+        layout.output_desc_head_offset,
+        layout.l3_abort_flag_offset,
+        layout.l2_abort_flag_offset,
+    ):
+        region.counter(offset).notify(0, NotifyOp.Set)
+    return L3L2Queue(orch, region, layout, desc_fields, desc_seq, desc_read)
+
+
+class L3L2Queue:
+    """L3-side handle for one bidirectional L3-L2 message queue.
+
+    Wraps an ``L3L2OrchRegion`` and keeps local copies of the descriptor
+    head/tail counts and payload cursors. ``input`` is the enqueue side and
+    ``output`` is the peek/read/release side.
+    """
+
+    def __init__(
+        self,
+        orch: Any,
+        region: L3L2OrchRegion,
+        layout: L3L2QueueLayout,
+        desc_fields: Tensor,
+        desc_seq: Tensor,
+        desc_read: Tensor,
+    ) -> None:
+        """Store the region, layout and scratch Tensors and start in the LIVE state.
+
+        ``desc_fields`` (24 bytes), ``desc_seq`` (8 bytes) and ``desc_read``
+        (32 bytes) are host Tensors used to stage descriptor writes and reads.
+        """
+        self._orch = orch
+        self._region = region
+        self._layout = layout
+        self._desc_fields = desc_fields
+        self._desc_seq = desc_seq
+        self._desc_read = desc_read
+        self._state = _QueueState.LIVE
+        # Local descriptor counts; a slot index is the count masked by (depth - 1).
+        self._input_head = 0
+        self._input_tail = 0
+        self._output_head = 0
+        self._output_tail = 0
+        # Local payload cursors in bytes; the arena position is the cursor modulo the arena size.
+        self._input_payload_tail = 0
+        self._input_payload_head = 0
+        self._output_payload_head = 0
+        # Output message returned by try_peek() and not yet released, if any.
+        self._output_active: L3L2QueueMessage | None = None
+        # Set once a STOP descriptor has been published; later enqueues are refused.
+        self._stop_published = False
+        self.input = _L3InputQueue(self)
+        self.output = _L3OutputQueue(self)
+
+    @property
+    def region(self) -> L3L2OrchRegion:
+        """The backing L3-L2 region."""
+        return self._region
+
+    @property
+    def layout(self) -> L3L2QueueLayout:
+        """The byte layout this queue was created with."""
+        return self._layout
+
+    @property
+    def magic_version(self) -> int:
+        """Packed magic/ABI version scalar (see ``l3_l2_queue_magic_version``)."""
+        return l3_l2_queue_magic_version()
+
+    def l2_task_arg_scalars(self) -> list[int]:
+        """Return the region descriptor scalars followed by the queue parameters.
+
+        The appended values are, in order: magic/version, depth, input arena
+        bytes and output arena bytes. Presumably these are passed to the L2
+        task as scalar arguments (confirm against the L2 endpoint).
+
+        Raises:
+            RuntimeError: if the queue is not live.
+        """
+        self._ensure_live()
+        return [
+            *self._region.descriptor_scalars(),
+            self.magic_version,
+            self._layout.depth,
+            self._layout.input_arena_bytes,
+            self._layout.output_arena_bytes,
+        ]
+
+    def try_request_stop(self) -> bool:
+        """Try once to enqueue a zero-byte STOP; return False if it was not published."""
+        return self.input._try_enqueue(None, 0, L3L2QueueOpcode.STOP)
+
+    def request_stop(self, timeout: float) -> None:
+        """Enqueue a zero-byte STOP, retrying until ``timeout`` seconds have elapsed."""
+        self.input._enqueue(None, 0, L3L2QueueOpcode.STOP, timeout)
+
+    def free(self) -> None:
+        """Release the backing region; calling it again after release is a no-op."""
+        if self._state == _QueueState.RELEASED:
+            return
+        # The state changes first, so a failing region.free() still leaves the handle released.
+        self._state = _QueueState.RELEASED
+        self._region.free()
+
+    def _ensure_live(self) -> None:
+        """Raise RuntimeError unless the queue and its region are usable."""
+        if self._state == _QueueState.RELEASED:
+            raise RuntimeError("L3-L2 queue has been released")
+        if self._state == _QueueState.POISONED_REMOTE:
+            raise RuntimeError("L3-L2 queue is remote-aborted")
+        if self._state == _QueueState.POISONED_LOCAL:
+            raise RuntimeError("L3-L2 queue is poisoned")
+        if self._state == _QueueState.EXPIRED:
+            raise RuntimeError("L3-L2 queue expired after orchestration run")
+        # Region expiry is detected lazily and then latched into the queue state.
+        if getattr(self._region, "_expired", False):
+            self._state = _QueueState.EXPIRED
+            raise RuntimeError("L3-L2 queue expired after orchestration run")
+        self._region._ensure_live()
+
+    def _validate_registered_buffer(self, buffer: Any, nbytes: int) -> Tensor:
+        """Check that ``buffer`` is a Tensor accepted by the region owner and holds ``nbytes``.
+
+        Raises:
+            ValueError: if ``buffer`` is not a Tensor or is smaller than ``nbytes``.
+                The owner's own validation may raise as well.
+        """
+        if not isinstance(buffer, Tensor):
+            raise ValueError("L3-L2 queue PR1 requires a registered Tensor returned by orch.alloc(...)")
+        self._region._owner._validate_l3_l2_orch_comm_host_buffer(buffer)
+        if int(nbytes) > int(buffer.nbytes()):
+            raise ValueError(f"L3-L2 queue nbytes={nbytes} exceeds registered Tensor size {int(buffer.nbytes())}")
+        return buffer
+
+    def _refresh_counter(self, offset: int, local_value: int, depth: int) -> int:
+        """Return ``local_value`` advanced to match the shared counter at ``offset``.
+
+        Only the low 32 bits are compared. The difference is interpreted as a
+        signed 32-bit delta; a negative delta or one larger than ``depth``
+        poisons the queue and raises RuntimeError.
+        """
+        result = self._signal_test(offset, local_value & 0xFFFF_FFFF, WaitCmp.NE)
+        if not result.matched:
+            return local_value
+        observed = int(result.observed) & 0xFFFF_FFFF
+        local_low = local_value & 0xFFFF_FFFF
+        delta = ctypes.c_int32((observed - local_low) & 0xFFFF_FFFF).value
+        if delta < 0 or delta > depth:
+            self._poison_local()
+            raise RuntimeError("L3-L2 queue counter reconstruction failed")
+        return local_value + delta
+
+    def _sample_peer_abort_after_timeout(self) -> None:
+        """Classify a timeout; this method always raises.
+
+        Raises:
+            RuntimeError: if the L2 abort flag is >= 1; the handle becomes
+                POISONED_REMOTE and the L3 abort flag is not written.
+            TimeoutError: otherwise.
+        """
+        result = self._signal_test(self._layout.l2_abort_flag_offset, 1, WaitCmp.GE)
+        if result.matched:
+            self._state = _QueueState.POISONED_REMOTE
+            raise RuntimeError("L3-L2 queue remote abort observed")
+        raise TimeoutError("L3-L2 queue operation timed out")
+
+    def _poison_local(self) -> None:
+        """Move a LIVE queue to POISONED_LOCAL and try to set the L3 abort flag to 1."""
+        if self._state != _QueueState.LIVE:
+            return
+        self._state = _QueueState.POISONED_LOCAL
+        # Best effort: the queue is already poisoned locally, so a failure to
+        # publish the abort flag is deliberately ignored.
+        try:
+            self._region._owner._l3_l2_orch_comm_submit(
+                self._region._worker_id,
+                L3L2OrchCommRequest(
+                    cmd=L3L2OrchCommCmd.SIGNAL_NOTIFY,
+                    op=int(NotifyOp.Set),
+                    region_id=self._region.region_id,
+                    counter_addr=int(self._region.descriptor.counter_base) + self._layout.l3_abort_flag_offset,
+                    counter_operand=1,
+                ),
+                5.0,
+            )
+        except Exception:
+            pass
+
+    def _run_primitive(self, fn: Any, *args: Any, **kwargs: Any) -> Any:
+        """Call ``fn``; on any Exception poison the queue locally and re-raise."""
+        try:
+            return fn(*args, **kwargs)
+        except Exception:
+            self._poison_local()
+            raise
+
+    def _signal_test(self, offset: int, cmp_value: int, cmp: WaitCmp) -> Any:
+        """Test the counter at ``offset`` against ``cmp_value``; failures poison the queue."""
+        return self._run_primitive(lambda: self._region.counter(offset).test(cmp_value, cmp))
+
+    def _signal_notify(self, offset: int, value: int) -> None:
+        """Set the counter at ``offset`` to ``value``; failures poison the queue."""
+        self._run_primitive(lambda: self._region.counter(offset).notify(value, NotifyOp.Set))
+
+    def _write_descriptor(
+        self, offset: int, seq: int, opcode: L3L2QueueOpcode, payload_offset: int, nbytes: int
+    ) -> None:
+        """Write one descriptor slot at payload offset ``offset``.
+
+        The 24 bytes holding opcode, payload offset and length are written
+        first; the 8-byte ``seq`` field is written last.
+        NOTE(review): presumably so a reader never pairs a matching seq with
+        stale fields - confirm against the L2 reader.
+        """
+        fields_buf = (ctypes.c_uint8 * 24).from_address(int(self._desc_fields.data))
+        # Pack a full slot with a dummy seq, then drop the first 8 bytes to keep only the three trailing fields.
+        fields_buf[:] = _DESC.pack(0, int(opcode), int(payload_offset), int(nbytes))[8:]
+        seq_buf = (ctypes.c_uint8 * 8).from_address(int(self._desc_seq.data))
+        seq_buf[:] = struct.pack("<Q", int(seq))
+        self._run_primitive(self._region.payload_write, offset + 8, self._desc_fields, nbytes=24)
+        self._run_primitive(self._region.payload_write, offset, self._desc_seq, nbytes=8)
+
+    def _read_descriptor(self, offset: int) -> L3L2QueueMessage:
+        """Read and decode the descriptor slot at payload offset ``offset``.
+
+        An opcode value outside ``L3L2QueueOpcode`` poisons the queue and
+        raises RuntimeError. ``INVALID`` (0) is a member of the enum and is
+        therefore returned to the caller rather than rejected here.
+        """
+        self._run_primitive(self._region.payload_read, offset, self._desc_read, nbytes=L3L2_QUEUE_DESC_SLOT_BYTES)
+        raw = ctypes.string_at(int(self._desc_read.data), L3L2_QUEUE_DESC_SLOT_BYTES)
+        seq, opcode_value, payload_offset, payload_nbytes = _DESC.unpack(raw)
+        try:
+            opcode = L3L2QueueOpcode(opcode_value)
+        except ValueError:
+            self._poison_local()
+            raise RuntimeError("L3-L2 queue observed invalid descriptor opcode") from None
+        return L3L2QueueMessage(
+            seq=int(seq),
+            opcode=opcode,
+            payload_offset=int(payload_offset),
+            payload_nbytes=int(payload_nbytes),
+        )
+
+    def _advance_payload_head(
+        self,
+        cursor: int,
+        payload_offset: int,
+        payload_nbytes: int,
+        arena_offset: int,
+        arena_bytes: int,
+    ) -> int:
+        """Return the payload cursor after consuming one message.
+
+        Zero-byte messages leave the cursor unchanged. If the payload does not
+        start where the cursor points, it must start at the beginning of the
+        arena; the skipped tail is then counted as padding. Any other offset
+        poisons the queue and raises RuntimeError.
+        """
+        if payload_nbytes == 0:
+            return cursor
+        expected_offset = arena_offset + (cursor % arena_bytes)
+        if expected_offset != payload_offset:
+            if payload_offset != arena_offset:
+                self._poison_local()
+                raise RuntimeError("L3-L2 queue payload replay offset mismatch")
+            # The producer wrapped: skip the unused bytes at the end of the arena.
+            cursor += arena_bytes - (cursor % arena_bytes)
+        return cursor + payload_nbytes
+
+    def _replay_released_input_descriptors(self, old_head: int, new_head: int) -> None:
+        """Advance ``_input_payload_head`` over the input descriptors in ``[old_head, new_head)``.
+
+        Each slot is read back and must carry ``seq == index + 1``; a mismatch
+        poisons the queue and raises RuntimeError.
+        """
+        cursor = old_head
+        while cursor < new_head:
+            slot_index = cursor & (self._layout.depth - 1)
+            slot_offset = self._layout.input_desc_offset + slot_index * L3L2_QUEUE_DESC_SLOT_BYTES
+            message = self._read_descriptor(slot_offset)
+            if message.seq != cursor + 1:
+                self._poison_local()
+                raise RuntimeError("L3-L2 queue input release replay seq mismatch")
+            self._input_payload_head = self._advance_payload_head(
+                self._input_payload_head,
+                message.payload_offset,
+                message.payload_nbytes,
+                self._layout.input_arena_offset,
+                self._layout.input_arena_bytes,
+            )
+            cursor += 1
+
+
+class _L3InputQueue:
+    def __init__(self, queue: L3L2Queue) -> None:
+        self._queue = queue
+
+    def enqueue(self, buffer_or_none: Any, nbytes: int, timeout: float) -> None:
+        self._enqueue(buffer_or_none, nbytes, L3L2QueueOpcode.DATA, timeout)
+
+    def try_enqueue(self, buffer_or_none: Any, nbytes: int) -> bool:
+        return self._try_enqueue(buffer_or_none, nbytes, L3L2QueueOpcode.DATA)
+
+    def _enqueue(self, buffer_or_none: Any, nbytes: int, opcode: L3L2QueueOpcode, timeout: float) -> None:
+        if timeout is None or float(timeout) <= 0:
+            raise ValueError("L3-L2 queue blocking operations require a positive timeout")
+        deadline = time.monotonic() + float(timeout)
+        while True:
+            if self._try_enqueue(buffer_or_none, nbytes, opcode):
+                return
+            if self._queue._stop_published:
+                raise RuntimeError("L3-L2 queue input is stopped")
+            if time.monotonic() >= deadline:
+                self._queue._sample_peer_abort_after_timeout()
+            time.sleep(_POLL_INTERVAL_S)
+
+    def _try_enqueue(self, buffer_or_none: Any, nbytes: int, opcode: L3L2QueueOpcode) -> bool:
+        queue = self._queue
+        nbytes = int(nbytes)
+        if nbytes < 0:
+            raise ValueError("L3-L2 queue nbytes must be non-negative")
+        payload_tensor = None
+        if nbytes == 0:
+            if buffer_or_none is not None:
+                raise ValueError("L3-L2 queue zero-byte enqueue requires buffer_or_none == None")
+        else:
+            payload_tensor = queue._validate_registered_buffer(buffer_or_none, nbytes)
+
+        queue._ensure_live()
+        if queue._stop_published:
+            return False
+        if opcode == L3L2QueueOpcode.STOP and nbytes != 0:
+            raise ValueError("L3-L2 queue STOP must be zero-byte")
+
+        old_head = queue._input_head
+        queue._input_head = queue._refresh_counter(
+            queue._layout.input_desc_head_offset, queue._input_head, queue._layout.depth
+        )
+        if queue._input_head != old_head:
+            queue._replay_released_input_descriptors(old_head, queue._input_head)
+        if queue._input_tail - queue._input_head >= queue._layout.depth:
+            return False
+        if nbytes > queue._layout.input_arena_bytes:
+            return False
+
+        payload_offset = 0
+        if nbytes != 0:
+            arena_pos = queue._input_payload_tail % queue._layout.input_arena_bytes
+            if arena_pos + nbytes > queue._layout.input_arena_bytes:
+                queue._input_payload_tail += queue._layout.input_arena_bytes - arena_pos
+                arena_pos = 0
+            if queue._input_payload_tail + nbytes - queue._input_payload_head > queue._layout.input_arena_bytes:
+                return False
+            payload_offset = queue._layout.input_arena_offset + arena_pos
+            queue._run_primitive(queue._region.payload_write, payload_offset, payload_tensor, nbytes=nbytes)
+            queue._input_payload_tail += nbytes
+
+        seq = queue._input_tail + 1
+        slot_index = queue._input_tail & (queue._layout.depth - 1)
+        slot_offset = queue._layout.input_desc_offset + slot_index * L3L2_QUEUE_DESC_SLOT_BYTES
+        queue._write_descriptor(slot_offset, seq, opcode, payload_offset, nbytes)
+        queue._input_tail += 1
+        queue._signal_notify(queue._layout.input_desc_tail_offset, queue._input_tail)
+        if opcode == L3L2QueueOpcode.STOP:
+            queue._stop_published = True
+        return True
+
+
+class _L3OutputQueue:
+    def __init__(self, queue: L3L2Queue) -> None:
+        self._queue = queue
+
+    def try_peek(self) -> L3L2QueueMessage | None:
+        queue = self._queue
+        queue._ensure_live()
+        if queue._output_active is not None:
+            return queue._output_active
+        queue._output_tail = queue._refresh_counter(
+            queue._layout.output_desc_tail_offset, queue._output_tail, queue._layout.depth
+        )
+        if queue._output_tail == queue._output_head:
+            return None
+        slot_index = queue._output_head & (queue._layout.depth - 1)
+        slot_offset = queue._layout.output_desc_offset + slot_index * L3L2_QUEUE_DESC_SLOT_BYTES
+        message = queue._read_descriptor(slot_offset)
+        if message.seq != queue._output_head + 1:
+            queue._poison_local()
+            raise RuntimeError("L3-L2 queue output descriptor seq mismatch")
+        if message.opcode == L3L2QueueOpcode.STOP:
+            queue._poison_local()
+            raise RuntimeError("L3-L2 queue output descriptor cannot be STOP")
+        if message.payload_nbytes == 0:
+            if message.payload_offset != 0:
+                queue._poison_local()
+                raise RuntimeError("L3-L2 queue zero-byte output descriptor has nonzero offset")
+        else:
+            begin = queue._layout.output_arena_offset
+            end = begin + queue._layout.output_arena_bytes
+            if message.payload_offset < begin or message.payload_offset + message.payload_nbytes > end:
+                queue._poison_local()
+                raise RuntimeError("L3-L2 queue output payload outside output arena")
+            queue._advance_payload_head(
+                queue._output_payload_head,
+                message.payload_offset,
+                message.payload_nbytes,
+                queue._layout.output_arena_offset,
+                queue._layout.output_arena_bytes,
+            )
+        queue._output_active = message
+        return message
+
+    def peek(self, timeout: float) -> L3L2QueueMessage:
+        if timeout is None or float(timeout) <= 0:
+            raise ValueError("L3-L2 queue blocking operations require a positive timeout")
+        deadline = time.monotonic() + float(timeout)
+        while True:
+            message = self.try_peek()
+            if message is not None:
+                return message
+            if time.monotonic() >= deadline:
+                self._queue._sample_peer_abort_after_timeout()
+            time.sleep(_POLL_INTERVAL_S)
+
+    def read_into(self, handle: L3L2QueueMessage, buffer: Any) -> None:
+        queue = self._queue
+        queue._ensure_live()
+        if queue._output_active != handle:
+            raise RuntimeError("L3-L2 queue output handle is not active")
+        if handle.payload_nbytes == 0:
+            if buffer is not None:
+                raise ValueError("L3-L2 queue zero-byte output read requires buffer == None")
+            return
+        target = queue._validate_registered_buffer(buffer, handle.payload_nbytes)
+        queue._run_primitive(queue._region.payload_read, handle.payload_offset, target, nbytes=handle.payload_nbytes)
+
+    def release(self, handle: L3L2QueueMessage) -> None:
+        queue = self._queue
+        queue._ensure_live()
+        if queue._output_active != handle:
+            queue._poison_local()
+            raise RuntimeError("L3-L2 queue output handle is not active")
+        queue._output_payload_head = queue._advance_payload_head(
+            queue._output_payload_head,
+            handle.payload_offset,
+            handle.payload_nbytes,
+            queue._layout.output_arena_offset,
+            queue._layout.output_arena_bytes,
+        )
+        queue._output_head += 1
+        queue._output_active = None
+        queue._signal_notify(queue._layout.output_desc_head_offset, queue._output_head)
+
+    def dequeue_into(self, buffer: Any, timeout: float) -> L3L2QueueMessage:
+        handle = self.peek(timeout)
+        self.read_into(handle, buffer)
+        self.release(handle)
+        return handle
+
+    def try_dequeue_into(self, buffer: Any) -> L3L2QueueMessage | None:
+        handle = self.try_peek()
+        if handle is None:
+            return None
+        self.read_into(handle, buffer)
+        self.release(handle)
+        return handle
diff --git a/python/simpler/orchestrator.py b/python/simpler/orchestrator.py
index 87ec02e16..f998b48af 100644
--- a/python/simpler/orchestrator.py
+++ b/python/simpler/orchestrator.py
@@ -359,6 +359,20 @@ def create_l3_l2_region(self, *, worker_id: int, payload_bytes: int, counter_byt
             raise RuntimeError("create_l3_l2_region requires an Orchestrator bound to a Worker")
         return self._worker._create_l3_l2_region(int(worker_id), int(payload_bytes), int(counter_bytes))
 
+    def create_l3_l2_queue(self, *, worker_id: int, depth: int, input_arena_bytes: int, output_arena_bytes: int):
+        """Create an L3-L2 message queue backed by one L3-L2 communication region.
+
+        All arguments are converted with ``int()`` and forwarded to
+        ``l3_l2_message_queue.create_l3_l2_queue``, whose result is returned.
+
+        Raises:
+            RuntimeError: if this Orchestrator is not bound to a Worker.
+        """
+        if self._worker is None:
+            raise RuntimeError("create_l3_l2_queue requires an Orchestrator bound to a Worker")
+        # Imported inside the method; the PLC0415 suppression marks this as deliberate.
+        from .l3_l2_message_queue import create_l3_l2_queue  # noqa: PLC0415
+
+        return create_l3_l2_queue(
+            self,
+            worker_id=int(worker_id),
+            depth=int(depth),
+            input_arena_bytes=int(input_arena_bytes),
+            output_arena_bytes=int(output_arena_bytes),
+        )
+
     # ------------------------------------------------------------------
     # Nested scope (Strict-1 per-scope rings)
     # ------------------------------------------------------------------
diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h
new file mode 100644
index 000000000..383785c54
--- /dev/null
+++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h
@@ -0,0 +1,659 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#ifndef SRC_COMMON_PLATFORM_INCLUDE_AICPU_L3_L2_MESSAGE_QUEUE_H_
+#define SRC_COMMON_PLATFORM_INCLUDE_AICPU_L3_L2_MESSAGE_QUEUE_H_
+
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+
+#include "aicpu/l3_l2_orch_endpoint.h"
+
+static constexpr uint32_t L3L2_QUEUE_MAGIC = 0x4C335132u;  // ASCII "L3Q2"
+static constexpr uint16_t L3L2_QUEUE_ABI_MAJOR = 1;
+static constexpr uint16_t L3L2_QUEUE_ABI_MINOR = 0;
+static constexpr uint64_t L3L2_QUEUE_DESC_SLOT_BYTES = 4 * sizeof(uint64_t);  // four 64-bit descriptor words
+static constexpr uint64_t L3L2_QUEUE_DESC_RING_ALIGNMENT = sizeof(uint64_t);
+static constexpr uint64_t L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64;
+static constexpr uint64_t L3L2_QUEUE_COUNTER_STRIDE = 64;  // every counter below sits on its own stride
+static constexpr uint64_t L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET = 0 * L3L2_QUEUE_COUNTER_STRIDE;
+static constexpr uint64_t L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET = 1 * L3L2_QUEUE_COUNTER_STRIDE;
+static constexpr uint64_t L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET = 2 * L3L2_QUEUE_COUNTER_STRIDE;
+static constexpr uint64_t L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET = 3 * L3L2_QUEUE_COUNTER_STRIDE;
+static constexpr uint64_t L3L2_QUEUE_L3_ABORT_FLAG_OFFSET = 4 * L3L2_QUEUE_COUNTER_STRIDE;
+static constexpr uint64_t L3L2_QUEUE_L2_ABORT_FLAG_OFFSET = 5 * L3L2_QUEUE_COUNTER_STRIDE;
+static constexpr uint64_t L3L2_QUEUE_COUNTER_BYTES = 6 * L3L2_QUEUE_COUNTER_STRIDE;  // six counters in total
+static constexpr uint64_t L3L2_QUEUE_MAX_DEPTH = uint64_t{1} << 30;
+
+struct L3L2QueueDescSlot {
+    uint64_t seq;             // consumers require head + 1 here (see try_peek / replay_output_releases)
+    uint64_t opcode;          // an L3L2QueueOpcode stored as a raw 64-bit word
+    uint64_t payload_offset;  // must be 0 when payload_nbytes is 0, otherwise inside the matching arena
+    uint64_t payload_nbytes;  // payload length in bytes; 0 means the descriptor carries no payload
+};
+
+enum class L3L2QueueOpcode : uint64_t {
+    INVALID = 0,  // rejected by l3_l2_queue_valid_opcode()
+    DATA = 1,
+    STOP = 2,  // refused by OutputQueue::publish(); InputQueue yields nothing more once a STOP is released
+    ERROR = 3,
+};
+
+enum class L3L2QueueErrorKind : uint32_t {
+    NONE = 0,                // endpoint is live
+    BAD_ARGUMENT = 1,        // not raised anywhere in this header
+    BAD_DESCRIPTOR = 2,      // constructor: endpoint error or region/args failed validation
+    INVALID_DESCRIPTOR = 3,  // a descriptor, counter or payload offset violated the protocol at runtime
+    OUT_OF_SPACE = 4,        // not raised anywhere in this header
+    OWNERSHIP = 5,           // handle/reservation misuse (double peek/reserve, unknown handle)
+    REMOTE_ABORTED = 6,      // latched by disambiguate_timeout() when the L3 abort flag reads >= 1
+    ENDPOINT_ERROR = 7,      // an underlying L3L2OrchEndpoint operation failed
+};
+
+enum class L3L2QueueTimeoutStatus : uint32_t {
+    ORDINARY_TIMEOUT = 0,  // abort flag not observed (also returned when another error is already latched)
+    REMOTE_ABORTED = 1,    // the L3 abort flag was observed set
+};
+
+struct L3L2QueueError {
+    L3L2QueueErrorKind kind;
+    const char *op;  // name of the failing operation, e.g. "input.try_peek"; the pointer is stored, not copied
+    uint64_t region_id;
+    const char *message;  // stored pointer as well; callers pass literals or the endpoint's own message
+};
+
+struct L3L2QueueLayout {
+    uint64_t depth;                // descriptor slots per ring; a power of two
+    uint64_t input_desc_offset;    // payload-region offset of the input descriptor ring (0 as computed)
+    uint64_t output_desc_offset;   // directly after the input ring
+    uint64_t input_arena_offset;   // first arena-aligned offset after both rings
+    uint64_t output_arena_offset;  // directly after the input arena
+    uint64_t input_arena_bytes;
+    uint64_t output_arena_bytes;
+    uint64_t payload_bytes;  // minimum payload-region size: output_arena_offset + output_arena_bytes
+    uint64_t input_desc_tail_offset;  // this and the following offsets are handed to counter_addr()
+    uint64_t input_desc_head_offset;
+    uint64_t output_desc_tail_offset;
+    uint64_t output_desc_head_offset;
+    uint64_t l3_abort_flag_offset;  // tested by disambiguate_timeout()
+    uint64_t l2_abort_flag_offset;  // set to 1 by poison()
+    uint64_t counter_bytes;         // minimum counter-region size required by this layout
+};
+
+struct L3L2QueueArgs {
+    uint64_t magic_version;  // must equal l3_l2_queue_magic_version()
+    uint64_t depth;          // forwarded to l3_l2_queue_make_layout()
+    uint64_t input_arena_bytes;
+    uint64_t output_arena_bytes;
+};
+
+struct L3L2QueueInputHandle {
+    uint64_t seq;  // descriptor sequence number; release() checks it against the active handle
+    L3L2QueueOpcode opcode;
+    uint64_t payload_offset;  // copied from the descriptor slot
+    uint64_t payload_nbytes;
+    L3L2OrchPayloadView payload;  // {0, 0} for zero-byte descriptors, else the view from payload_read()
+};
+
+struct L3L2QueueOutputReservation {
+    uint64_t seq;             // sequence number the descriptor will carry once published
+    uint64_t payload_offset;  // 0 for zero-byte reservations
+    uint64_t payload_nbytes;
+    L3L2OrchPayloadView payload;  // payload_base + payload_offset, nbytes long
+    bool valid;                   // false in the cleared value written when a reserve attempt fails
+};
+
+static inline uint64_t l3_l2_queue_magic_version() {  // queue magic + ABI major/minor packed into one word
+    return l3_l2_orch_comm_pack_magic_version(L3L2_QUEUE_MAGIC, L3L2_QUEUE_ABI_MAJOR, L3L2_QUEUE_ABI_MINOR);
+}
+
+static inline bool l3_l2_queue_is_power_of_two(uint64_t value) { return value > 0 && ((value - 1) & value) == 0; }
+
+// Rounds `value` up to the next multiple of `align`; `align == 0` returns `value` as is. No overflow check.
+static inline uint64_t l3_l2_queue_align_up(uint64_t value, uint64_t align) {
+    if (align == 0) return value;
+    const uint64_t excess = value % align;
+    if (excess != 0) value += align - excess;
+    return value;
+}
+
+static inline bool l3_l2_queue_valid_opcode(L3L2QueueOpcode opcode) {  // DATA, STOP and ERROR only
+    return !(opcode != L3L2QueueOpcode::DATA && opcode != L3L2QueueOpcode::STOP && opcode != L3L2QueueOpcode::ERROR);
+}
+
+// Lays out one payload region as [input desc ring][output desc ring][pad][input arena][output arena].
+// Fails for a null `out`, a depth that is zero, not a power of two or above L3L2_QUEUE_MAX_DEPTH, an arena
+// size that is zero or not a multiple of the arena alignment, or an arena end offset that wraps uint64_t.
+// `*out` is written only after every range check has passed.
+static inline bool
+l3_l2_queue_make_layout(uint64_t depth, uint64_t input_arena_bytes, uint64_t output_arena_bytes, L3L2QueueLayout *out) {
+    if (out == nullptr || !l3_l2_queue_is_power_of_two(depth) || depth > L3L2_QUEUE_MAX_DEPTH ||
+        input_arena_bytes == 0 || output_arena_bytes == 0 ||
+        input_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0 ||
+        output_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0) {
+        return false;
+    }
+
+    // depth <= 2^30, so both descriptor rings together stay far below 2^64.
+    uint64_t desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES;
+    uint64_t input_desc_offset = 0;
+    uint64_t output_desc_offset = input_desc_offset + desc_ring_bytes;
+    uint64_t input_arena_offset =
+        l3_l2_queue_align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT);
+    // Arena sizes are caller-supplied: a wrapped sum here would place the output arena over the rings.
+    if (l3_l2_orch_comm_add_overflows(input_arena_offset, input_arena_bytes)) {
+        return false;
+    }
+    uint64_t output_arena_offset =
+        l3_l2_queue_align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT);
+    if (l3_l2_orch_comm_add_overflows(output_arena_offset, output_arena_bytes)) {
+        return false;
+    }
+
+    *out = L3L2QueueLayout{
+        depth, input_desc_offset, output_desc_offset, input_arena_offset, output_arena_offset,
+        input_arena_bytes, output_arena_bytes, output_arena_offset + output_arena_bytes,
+        L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET, L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET,
+        L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET, L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET,
+        L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, L3L2_QUEUE_L2_ABORT_FLAG_OFFSET, L3L2_QUEUE_COUNTER_BYTES,
+    };
+    // Always true for the offsets computed above; kept as a cheap self-check of the alignment contract.
+    return output_desc_offset % L3L2_QUEUE_DESC_RING_ALIGNMENT == 0 &&
+           input_arena_offset % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT == 0 &&
+           output_arena_offset % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT == 0;
+}
+
+// Accepts `desc`/`args` only when the queue magic matches, the region descriptor validates, a layout can be
+// built and the region is large enough for it. `*out_layout` (optional) is filled only on success.
+static inline bool
+l3_l2_queue_validate_region(const L3L2OrchRegionDesc &desc, const L3L2QueueArgs &args, L3L2QueueLayout *out_layout) {
+    if (args.magic_version != l3_l2_queue_magic_version() ||
+        l3_l2_orch_comm_validate_desc(desc) != L3L2OrchCommValidationError::OK) {
+        return false;
+    }
+    L3L2QueueLayout candidate{};
+    if (!l3_l2_queue_make_layout(args.depth, args.input_arena_bytes, args.output_arena_bytes, &candidate) ||
+        desc.payload_bytes < candidate.payload_bytes || desc.counter_bytes < candidate.counter_bytes) {
+        return false;
+    }
+    if (out_layout != nullptr) *out_layout = candidate;
+    return true;
+}
+
+// Fills `*slot` with the four descriptor words (the opcode is stored as its raw 64-bit value).
+// A null `slot` is ignored.
+static inline void l3_l2_queue_encode_desc(
+    L3L2QueueDescSlot *slot, uint64_t seq, L3L2QueueOpcode opcode, uint64_t payload_offset, uint64_t payload_nbytes
+) {
+    if (slot == nullptr) {
+        return;
+    }
+    const L3L2QueueDescSlot encoded{seq, static_cast<uint64_t>(opcode), payload_offset, payload_nbytes};
+    *slot = encoded;
+}
+
+// Widens a low-32-bit counter observation to 64 bits; fails unless it is 0..depth steps ahead of *local_value.
+static inline bool l3_l2_queue_reconstruct_counter(int32_t observed_low32, uint64_t depth, uint64_t *local_value) {
+    if (local_value == nullptr || depth > L3L2_QUEUE_MAX_DEPTH) {
+        return false;
+    }
+    const uint32_t advance = static_cast<uint32_t>(observed_low32) - static_cast<uint32_t>(*local_value);
+    if (advance > depth) {  // a backwards step wraps to >= 2^31, above any depth <= L3L2_QUEUE_MAX_DEPTH
+        return false;
+    }
+    *local_value += advance;
+    return true;
+}
+
+class L3L2QueueEndpoint {
+public:
+    class InputQueue {  // consumer side of the input ring: peek one descriptor, then release it
+    public:
+        explicit InputQueue(L3L2QueueEndpoint *parent) :
+            parent_(parent) {}
+
+        bool peek(uint64_t timeout_ns, L3L2QueueInputHandle *out) {  // polls try_peek(); false on error/timeout
+            if (out == nullptr) {
+                return false;
+            }
+            uint64_t start = l3_l2_orch_endpoint_now();
+            uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz();
+            while (true) {  // timeout_ns == 0 gives exactly one attempt
+                if (try_peek(out)) {
+                    return true;
+                }
+                if (parent_->error_.kind != L3L2QueueErrorKind::NONE) {
+                    return false;
+                }
+                uint64_t now = l3_l2_orch_endpoint_now();
+                if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) {
+                    parent_->disambiguate_timeout();  // latches REMOTE_ABORTED if the L3 abort flag is set
+                    return false;
+                }
+            }
+        }
+
+        bool try_peek(L3L2QueueInputHandle *out) {  // *out is cleared first and filled only on success
+            if (out != nullptr) {
+                *out = L3L2QueueInputHandle{0, L3L2QueueOpcode::INVALID, 0, 0, L3L2OrchPayloadView{0, 0}};
+            }
+            if (!parent_->ensure_live("input.try_peek") || out == nullptr) {
+                return false;
+            }
+            if (active_) {  // only one handle may be outstanding
+                parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "input.try_peek", "input handle already active");
+                return false;
+            }
+            if (!parent_->refresh_counter(
+                    parent_->layout_.input_desc_tail_offset, parent_->input_tail_, parent_->layout_.depth,
+                    "input.try_peek"
+                )) {
+                return false;
+            }
+            if (stopped_) {  // a STOP was released: any further descriptor is a protocol error
+                if (parent_->input_tail_ != parent_->input_head_) {
+                    parent_->poison(
+                        L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek",
+                        "input descriptor published after STOP"
+                    );
+                }
+                return false;
+            }
+            if (parent_->input_tail_ == parent_->input_head_) {  // ring is empty
+                return false;
+            }
+            if (parent_->input_tail_ - parent_->input_head_ > parent_->layout_.depth) {
+                parent_->poison(
+                    L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input descriptor state invalid"
+                );
+                return false;
+            }
+
+            L3L2QueueDescSlot slot{};
+            uint64_t slot_index = parent_->input_head_ & (parent_->layout_.depth - 1);  // depth is a power of two
+            uint64_t slot_offset = parent_->layout_.input_desc_offset + slot_index * sizeof(L3L2QueueDescSlot);
+            if (!parent_->read_desc_slot(slot_offset, &slot, "input.try_peek")) {
+                return false;
+            }
+            uint64_t expected_seq = parent_->input_head_ + 1;  // the slot consumed at head h must carry h + 1
+            if (slot.seq != expected_seq) {
+                parent_->poison(
+                    L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input descriptor seq mismatch"
+                );
+                return false;
+            }
+            L3L2QueueOpcode opcode = static_cast<L3L2QueueOpcode>(slot.opcode);
+            if (!l3_l2_queue_valid_opcode(opcode)) {
+                parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "invalid input opcode");
+                return false;
+            }
+
+            L3L2OrchPayloadView view{0, 0};
+            if (slot.payload_nbytes == 0) {  // zero-byte descriptors must not reference the arena
+                if (slot.payload_offset != 0) {
+                    parent_->poison(
+                        L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek",
+                        "zero-byte descriptor uses nonzero payload offset"
+                    );
+                    return false;
+                }
+            } else if (!parent_->payload_in_arena(
+                           slot.payload_offset, slot.payload_nbytes, parent_->layout_.input_arena_offset,
+                           parent_->layout_.input_arena_bytes
+                       )) {
+                parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input payload out of arena");
+                return false;
+            } else if (!parent_->endpoint_.payload_read(slot.payload_offset, slot.payload_nbytes, &view)) {
+                parent_->poison(
+                    L3L2QueueErrorKind::ENDPOINT_ERROR, "input.try_peek", parent_->endpoint_.error().message
+                );
+                return false;
+            }
+
+            *out = L3L2QueueInputHandle{slot.seq, opcode, slot.payload_offset, slot.payload_nbytes, view};
+            active_ = true;  // the caller owns the slot until release()
+            active_seq_ = slot.seq;
+            return true;
+        }
+
+        bool release(const L3L2QueueInputHandle &handle) {  // must be the handle from the last successful peek
+            if (!parent_->ensure_live("input.release")) {
+                return false;
+            }
+            if (!active_ || handle.seq != active_seq_ || handle.seq != parent_->input_head_ + 1) {
+                parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "input.release", "input handle is not active");
+                return false;
+            }
+            if (handle.payload_nbytes != 0) {
+                parent_->advance_payload_head(
+                    parent_->input_payload_head_, handle.payload_offset, handle.payload_nbytes,
+                    parent_->layout_.input_arena_offset, parent_->layout_.input_arena_bytes, "input.release"
+                );
+                if (parent_->error_.kind != L3L2QueueErrorKind::NONE) {
+                    return false;
+                }
+            }
+            parent_->input_head_ += 1;
+            if (handle.opcode == L3L2QueueOpcode::STOP) {  // latch end-of-stream
+                stopped_ = true;
+            }
+            active_ = false;
+            active_seq_ = 0;
+            return parent_->notify_counter(  // publishes the low 32 bits of the new head
+                parent_->layout_.input_desc_head_offset, static_cast<int32_t>(parent_->input_head_), "input.release"
+            );
+        }
+
+    private:
+        L3L2QueueEndpoint *parent_;
+        bool active_{false};      // true between a successful try_peek() and the matching release()
+        uint64_t active_seq_{0};  // seq of the outstanding handle, 0 when none
+        bool stopped_{false};     // set once a STOP descriptor has been released
+    };
+
+    class OutputQueue {  // producer side of the output ring: reserve payload space, fill it, then publish
+    public:
+        explicit OutputQueue(L3L2QueueEndpoint *parent) :
+            parent_(parent) {}
+
+        bool reserve(uint64_t nbytes, uint64_t timeout_ns, L3L2QueueOutputReservation *out) {  // polls try_reserve()
+            if (out == nullptr) {
+                return false;
+            }
+            uint64_t start = l3_l2_orch_endpoint_now();
+            uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz();
+            while (true) {  // timeout_ns == 0 gives exactly one attempt
+                if (try_reserve(nbytes, out)) {
+                    return true;
+                }
+                if (parent_->error_.kind != L3L2QueueErrorKind::NONE) {
+                    return false;
+                }
+                uint64_t now = l3_l2_orch_endpoint_now();
+                if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) {
+                    parent_->disambiguate_timeout();
+                    return false;
+                }
+            }
+        }
+
+        bool try_reserve(uint64_t nbytes, L3L2QueueOutputReservation *out) {  // false (no error) while full
+            const char *op = "output.try_reserve";
+            const L3L2QueueLayout &layout = parent_->layout_;
+            if (out != nullptr) {
+                *out = L3L2QueueOutputReservation{0, 0, 0, L3L2OrchPayloadView{0, 0}, false};
+            }
+            if (!parent_->ensure_live(op) || out == nullptr) {
+                return false;
+            }
+            if (reservation_active_) {
+                parent_->poison(L3L2QueueErrorKind::OWNERSHIP, op, "output reservation already active");
+                return false;
+            }
+            if (nbytes > layout.output_arena_bytes) {
+                return false;
+            }
+            uint64_t old_head = parent_->output_head_;  // replay whatever the consumer released since
+            if (!parent_->refresh_counter(layout.output_desc_head_offset, parent_->output_head_, layout.depth, op)) {
+                return false;
+            }
+            if (parent_->output_head_ != old_head &&
+                !parent_->replay_output_releases(old_head, parent_->output_head_, op)) {
+                return false;
+            }
+            if (parent_->output_tail_ - parent_->output_head_ >= layout.depth) {
+                return false;
+            }
+            uint64_t payload_offset = 0;
+            L3L2OrchPayloadView view{0, 0};
+            if (nbytes != 0) {
+                uint64_t arena_bytes = layout.output_arena_bytes;
+                uint64_t tail = parent_->output_payload_tail_;  // committed only if the reservation succeeds
+                uint64_t arena_pos = tail % arena_bytes;
+                if (arena_pos + nbytes > arena_bytes) {
+                    // Payloads never straddle the arena end: skip the remainder and restart at the base.
+                    tail += arena_bytes - arena_pos;
+                    arena_pos = 0;
+                    if (parent_->output_payload_head_ == parent_->output_payload_tail_) {
+                        // Empty arena: the skipped bytes are not live. Without this an empty arena could
+                        // refuse a wrapping reservation forever, since nothing is left to be released.
+                        parent_->output_payload_head_ = tail;
+                    }
+                }
+                if (tail + nbytes - parent_->output_payload_head_ > arena_bytes) {
+                    return false;
+                }
+                payload_offset = layout.output_arena_offset + arena_pos;
+                view = L3L2OrchPayloadView{parent_->endpoint_.descriptor().payload_base + payload_offset, nbytes};
+                parent_->output_payload_tail_ = tail + nbytes;
+            }
+            reservation_active_ = true;
+            reservation_seq_ = parent_->output_tail_ + 1;
+            reservation_offset_ = payload_offset;
+            reservation_nbytes_ = nbytes;
+            *out = L3L2QueueOutputReservation{reservation_seq_, payload_offset, nbytes, view, true};
+            return true;
+        }
+
+        // Publishes the active reservation as DATA or ERROR (STOP is refused), then bumps the tail counter.
+        bool publish(const L3L2QueueOutputReservation &reservation, L3L2QueueOpcode opcode) {
+            if (!parent_->ensure_live("output.publish")) {
+                return false;
+            }
+            if (!reservation_active_ || !reservation.valid || reservation.seq != reservation_seq_ ||
+                reservation.payload_offset != reservation_offset_ ||
+                reservation.payload_nbytes != reservation_nbytes_) {
+                parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "output.publish", "unknown output reservation");
+                return false;
+            }
+            if (opcode == L3L2QueueOpcode::STOP || !l3_l2_queue_valid_opcode(opcode)) {
+                parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "output.publish", "invalid output opcode");
+                return false;
+            }
+            L3L2QueueDescSlot slot{};
+            l3_l2_queue_encode_desc(&slot, 0, opcode, reservation.payload_offset, reservation.payload_nbytes);
+            uint64_t slot_index = parent_->output_tail_ & (parent_->layout_.depth - 1);
+            uint64_t slot_offset = parent_->layout_.output_desc_offset + slot_index * sizeof(L3L2QueueDescSlot);
+            if (!parent_->write_desc_slot(slot_offset, slot, reservation.seq, "output.publish")) {
+                return false;
+            }
+            parent_->output_tail_ += 1;
+            reservation_active_ = false;
+            reservation_seq_ = reservation_offset_ = reservation_nbytes_ = 0;
+            return parent_->notify_counter(
+                parent_->layout_.output_desc_tail_offset, static_cast<int32_t>(parent_->output_tail_), "output.publish"
+            );
+        }
+
+    private:
+        L3L2QueueEndpoint *parent_;
+        bool reservation_active_{false};  // true between a successful try_reserve() and publish()
+        uint64_t reservation_seq_{0};
+        uint64_t reservation_offset_{0};
+        uint64_t reservation_nbytes_{0};
+    };
+
+    L3L2QueueEndpoint(const L3L2OrchRegionDesc &desc, const L3L2QueueArgs &args) :
+        endpoint_(desc),
+        input_queue_(this),    // both nested queues store a back-pointer to this endpoint
+        output_queue_(this) {  // NOTE(review): copy/move is not disabled and would keep the old pointers
+        if (endpoint_.error().kind != L3L2EndpointErrorKind::NONE ||  // never throws: failure is latched
+            !l3_l2_queue_validate_region(desc, args, &layout_)) {     // layout_ is filled only on success
+            set_error(L3L2QueueErrorKind::BAD_DESCRIPTOR, "init", desc.region_id, "invalid queue descriptor");
+        }
+    }
+
+    const L3L2QueueError &error() const { return error_; }  // first latched error; kind NONE while healthy
+    const L3L2QueueLayout &layout() const { return layout_; }  // value-initialized if construction failed
+    InputQueue &input() { return input_queue_; }
+    OutputQueue &output() { return output_queue_; }
+
+    L3L2QueueTimeoutStatus disambiguate_timeout() {  // REMOTE_ABORTED only if the L3 abort flag reads >= 1
+        if (error_.kind == L3L2QueueErrorKind::REMOTE_ABORTED) {
+            return L3L2QueueTimeoutStatus::REMOTE_ABORTED;
+        }
+        if (error_.kind != L3L2QueueErrorKind::NONE) {
+            return L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT;  // some other error is already latched
+        }
+        uint64_t flag_addr = 0;
+        L3L2OrchSignalTestResult probe{};
+        if (!endpoint_.counter_addr(layout_.l3_abort_flag_offset, &flag_addr) ||
+            !endpoint_.signal_test(flag_addr, 1, L3L2OrchWaitCmp::GE, &probe)) {
+            poison(L3L2QueueErrorKind::ENDPOINT_ERROR, "timeout", endpoint_.error().message);
+        } else if (probe.matched) {
+            set_error(L3L2QueueErrorKind::REMOTE_ABORTED, "timeout", endpoint_.descriptor().region_id, "remote abort");
+            return L3L2QueueTimeoutStatus::REMOTE_ABORTED;
+        }
+        return L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT;
+    }
+
+private:
+    // True while no error has been latched on this endpoint.
+    // `op` is accepted but currently unused.
+    bool ensure_live(const char *op) {
+        (void)op;
+        const bool live = error_.kind == L3L2QueueErrorKind::NONE;
+        return live;
+    }
+
+    void set_error(L3L2QueueErrorKind kind, const char *op, uint64_t region_id, const char *message) {
+        // First error wins: an already latched error is never overwritten.
+        if (error_.kind == L3L2QueueErrorKind::NONE) {
+            error_ = L3L2QueueError{kind, op, region_id, message};
+        }
+    }
+
+    void poison(L3L2QueueErrorKind kind, const char *op, const char *message) {  // latch + raise L2 abort flag
+        set_error(kind, op, endpoint_.descriptor().region_id, message);
+        uint64_t flag_addr = 0;
+        const bool raise_flag = kind != L3L2QueueErrorKind::REMOTE_ABORTED;
+        // Best effort: the result of the notify is deliberately not inspected.
+        if (raise_flag && endpoint_.counter_addr(layout_.l2_abort_flag_offset, &flag_addr)) {
+            endpoint_.signal_notify(flag_addr, 1, L3L2OrchNotifyOp::Set);
+        }
+    }
+
+    bool notify_counter(uint64_t offset, int32_t value, const char *op) {  // sets the counter at `offset`
+        uint64_t addr = 0;
+        if (endpoint_.counter_addr(offset, &addr) && endpoint_.signal_notify(addr, value, L3L2OrchNotifyOp::Set)) {
+            return true;
+        }
+        poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message);
+        return false;
+    }
+
+    // Re-reads the remote counter at `offset`; when it differs from the low 32 bits of `local`, widens the
+    // observation into `local`. Poisons on endpoint failure or when the observation is out of window.
+    bool refresh_counter(uint64_t offset, uint64_t &local, uint64_t depth, const char *op) {
+        uint64_t counter = 0;
+        L3L2OrchSignalTestResult probe{};
+        const int32_t local_low32 = static_cast<int32_t>(local);
+        if (!endpoint_.counter_addr(offset, &counter) ||
+            !endpoint_.signal_test(counter, local_low32, L3L2OrchWaitCmp::NE, &probe)) {
+            poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message);
+            return false;
+        }
+        if (probe.matched && !l3_l2_queue_reconstruct_counter(probe.observed, depth, &local)) {
+            poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "counter reconstruction failed");
+            return false;
+        }
+        return true;
+    }
+
+    bool read_desc_slot(uint64_t slot_offset, L3L2QueueDescSlot *slot, const char *op) {  // copies into *slot
+        L3L2OrchPayloadView window{};
+        if (endpoint_.payload_read(slot_offset, sizeof(L3L2QueueDescSlot), &window)) {
+            memcpy(slot, reinterpret_cast<const void *>(static_cast<uintptr_t>(window.gm_addr)), sizeof(*slot));
+            return true;
+        }
+        poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message);
+        return false;
+    }
+
+    // Writes a descriptor in two steps: the opcode/offset/nbytes words first and `seq` last, so a reader
+    // that matches on `seq` finds the body already written (relies on payload_write ordering - TODO confirm).
+    bool write_desc_slot(uint64_t slot_offset, const L3L2QueueDescSlot &slot, uint64_t seq, const char *op) {
+        L3L2QueueDescSlot fields = slot;  // local copy, as before, so payload_write gets a non-const source
+        constexpr size_t body_offset = offsetof(L3L2QueueDescSlot, opcode);
+        constexpr size_t body_bytes = sizeof(L3L2QueueDescSlot) - body_offset;  // replaces the literal 24
+        if (!endpoint_.payload_write(slot_offset + body_offset, &fields.opcode, body_bytes) ||
+            !endpoint_.payload_write(slot_offset + offsetof(L3L2QueueDescSlot, seq), &seq, sizeof(seq))) {
+            poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message);
+            return false;
+        }
+        return true;
+    }
+
+    bool payload_in_arena(uint64_t offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes) const {
+        // Accept only a non-empty, non-wrapping range that starts and ends inside the arena.
+        const bool well_formed = nbytes != 0 && !l3_l2_orch_comm_add_overflows(offset, nbytes);
+        const bool contained = offset >= arena_offset && offset + nbytes <= arena_offset + arena_bytes;
+        return well_formed && contained;
+    }
+
+    // Advances a payload cursor past one released message. A message found at the arena base while the
+    // cursor sits elsewhere means the producer wrapped: the unused tail is skipped first. Else it poisons.
+    void advance_payload_head(
+        uint64_t &cursor, uint64_t payload_offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes,
+        const char *op
+    ) {
+        const uint64_t position = cursor % arena_bytes;
+        const bool in_sequence = payload_offset == arena_offset + position;
+        if (!in_sequence && payload_offset != arena_offset) {
+            poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch");
+            return;
+        }
+        cursor += (in_sequence ? 0 : arena_bytes - position) + nbytes;
+    }
+
+    // Re-reads output descriptors [old_head, new_head) and advances output_payload_head_ past their payloads.
+    bool replay_output_releases(uint64_t old_head, uint64_t new_head, const char *op) {
+        const uint64_t index_mask = layout_.depth - 1;
+        for (uint64_t released = old_head; released < new_head; ++released) {
+            L3L2QueueDescSlot desc{};
+            uint64_t at = layout_.output_desc_offset + (released & index_mask) * sizeof(L3L2QueueDescSlot);
+            if (!read_desc_slot(at, &desc, op)) {
+                return false;
+            }
+            if (desc.seq != released + 1) {
+                poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "output release replay seq mismatch");
+                return false;
+            }
+            if (desc.payload_nbytes == 0) {
+                continue;
+            }
+            advance_payload_head(
+                output_payload_head_, desc.payload_offset, desc.payload_nbytes, layout_.output_arena_offset,
+                layout_.output_arena_bytes, op
+            );
+            if (error_.kind != L3L2QueueErrorKind::NONE) {
+                return false;
+            }
+        }
+        return true;
+    }
+
+    L3L2OrchEndpoint endpoint_;
+    L3L2QueueLayout layout_{};  // stays value-initialized when construction fails validation
+    L3L2QueueError error_{L3L2QueueErrorKind::NONE, "", 0, ""};  // first latched error; never overwritten
+    uint64_t input_head_{0};           // advanced locally by InputQueue::release()
+    uint64_t input_tail_{0};           // reconstructed from the remote input tail counter
+    uint64_t output_head_{0};          // reconstructed from the remote output head counter
+    uint64_t output_tail_{0};          // advanced locally by OutputQueue::publish()
+    uint64_t input_payload_head_{0};   // running byte cursor; arena position is value % input_arena_bytes
+    uint64_t output_payload_head_{0};  // running byte cursor advanced while replaying released outputs
+    uint64_t output_payload_tail_{0};  // running byte cursor advanced by OutputQueue::try_reserve()
+    InputQueue input_queue_;    // constructed with `this`
+    OutputQueue output_queue_;  // constructed with `this`
+};
+
+#endif  // SRC_COMMON_PLATFORM_INCLUDE_AICPU_L3_L2_MESSAGE_QUEUE_H_
diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt
index 5fe6dd186..d4fcc497f 100644
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -369,6 +369,23 @@ add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp)
 add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp)
 add_common_utils_test(test_device_arena common/test_device_arena.cpp)
 add_common_utils_test(test_l3_l2_orch_comm common/test_l3_l2_orch_comm.cpp)
+add_executable(test_l3_l2_message_queue  # declared by hand here, not through add_common_utils_test()
+    common/test_l3_l2_message_queue.cpp
+    stubs/test_stubs.cpp
+)
+target_include_directories(test_l3_l2_message_queue PRIVATE
+    ${GTEST_INCLUDE_DIRS}
+    ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include
+    ${CMAKE_SOURCE_DIR}/../../../src/common/platform/include  # provides aicpu/l3_l2_message_queue.h
+)
+target_link_libraries(test_l3_l2_message_queue PRIVATE
+    ${GTEST_MAIN_LIB}
+    ${GTEST_LIB}
+    pthread
+)
+add_test(NAME test_l3_l2_message_queue COMMAND test_l3_l2_message_queue)
+set_tests_properties(test_l3_l2_message_queue PROPERTIES LABELS "no_hardware")  # tagged as hardware-free
+
 add_executable(test_l3_l2_orch_endpoint
     common/test_l3_l2_orch_endpoint.cpp
     stubs/test_stubs.cpp
diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
new file mode 100644
index 000000000..409da4763
--- /dev/null
+++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
@@ -0,0 +1,495 @@
+/*
+ * Copyright (c) PyPTO Contributors.
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+ * CANN Open Software License Agreement Version 2.0 (the "License").
+ * Please refer to the License for details. You may not use this file except in compliance with the License.
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+ * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+ * See LICENSE in the root of the software repository for the full text of the License.
+ * -----------------------------------------------------------------------------------------------------------
+ */
+
+#include <array>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <type_traits>
+
+#include <gtest/gtest.h>
+
+#include "aicpu/l3_l2_message_queue.h"
+
+namespace {
+
+struct RegionStorage {  // Host arrays that stand in for one L3-L2 region's payload and counter memory.
+    alignas(64) std::array<uint8_t, 512> payload{};   // descriptor rings followed by the payload arenas
+    alignas(64) std::array<int32_t, 128> counters{};  // 128 * 4 = 512 bytes of 32-bit counter slots
+};
+
+// Wraps `storage` in a region descriptor carrying the orch-comm magic/version and region id 19.
+// `payload_bytes` / `counter_bytes` default to the full 512-byte backing arrays; a test may pass
+// smaller values to advertise a region that is too small for the queue layout.
+L3L2OrchRegionDesc make_desc(RegionStorage *storage, uint64_t payload_bytes = 512, uint64_t counter_bytes = 512) {
+    const auto payload_base = reinterpret_cast<uint64_t>(storage->payload.data());
+    const auto counter_base = reinterpret_cast<uint64_t>(storage->counters.data());
+    return L3L2OrchRegionDesc{
+        l3_l2_orch_comm_magic_version(), 19, payload_base, payload_bytes, counter_base, counter_bytes,
+    };
+}
+
+size_t counter_index(uint64_t offset) { return static_cast<size_t>(offset / sizeof(int32_t)); }  // bytes -> slot index
+
+void publish_input_desc(  // Test stand-in for the producer: writes input descriptor `seq`, then sets the tail.
+    RegionStorage *storage, const L3L2QueueLayout &layout, uint64_t seq, L3L2QueueOpcode opcode,
+    uint64_t payload_offset = 0, uint64_t payload_nbytes = 0
+) {
+    L3L2QueueDescSlot slot{};  // seq is 1-based: descriptor `seq` goes to ring slot (seq - 1) & (depth - 1)
+    l3_l2_queue_encode_desc(&slot, seq, opcode, payload_offset, payload_nbytes);
+    uint64_t desc_offset = layout.input_desc_offset + ((seq - 1) & (layout.depth - 1)) * sizeof(L3L2QueueDescSlot);
+    std::memcpy(storage->payload.data() + desc_offset, &slot, sizeof(slot));
+    storage->counters[counter_index(layout.input_desc_tail_offset)] = static_cast<int32_t>(seq);  // tail = seq
+}
+
+TEST(L3L2MessageQueueTest, LayoutAssignsPayloadAndAbortCounterOffsets) {
+    L3L2QueueLayout layout{};
+
+    ASSERT_TRUE(l3_l2_queue_make_layout(4, 128, 192, &layout));  // depth 4, 128 B input / 192 B output arena
+
+    EXPECT_EQ(layout.input_desc_offset, 0u);
+    EXPECT_EQ(layout.output_desc_offset, 4u * sizeof(L3L2QueueDescSlot));  // directly after the 4 input slots
+    EXPECT_EQ(layout.input_arena_offset % 64u, 0u);                        // both arenas start 64-byte aligned
+    EXPECT_EQ(layout.output_arena_offset % 64u, 0u);
+    EXPECT_EQ(layout.input_desc_tail_offset, 0u);  // the six counters are spaced 64 bytes apart
+    EXPECT_EQ(layout.input_desc_head_offset, 64u);
+    EXPECT_EQ(layout.output_desc_tail_offset, 128u);
+    EXPECT_EQ(layout.output_desc_head_offset, 192u);
+    EXPECT_EQ(layout.l3_abort_flag_offset, 256u);
+    EXPECT_EQ(layout.l2_abort_flag_offset, 320u);
+    EXPECT_EQ(layout.counter_bytes, 384u);                               // 6 counters * 64 bytes
+    EXPECT_GE(layout.payload_bytes, layout.output_arena_offset + 192u);  // the output arena must fit
+}
+
+TEST(L3L2MessageQueueTest, LayoutLockstepCasesMatchPythonMirrorExpectations) {
+    struct LayoutCase {  // first three fields are inputs, the remaining four are expected layout values
+        uint64_t depth;
+        uint64_t input_arena_bytes;
+        uint64_t output_arena_bytes;
+        uint64_t output_desc_offset;
+        uint64_t input_arena_offset;
+        uint64_t output_arena_offset;
+        uint64_t payload_bytes;
+    };
+
+    const std::array<LayoutCase, 3> cases{{
+        {1, 64, 64, 32, 64, 128, 192},  // the same three cases appear in the Python layout test
+        {4, 128, 192, 128, 256, 384, 576},
+        {8, 192, 64, 256, 512, 704, 768},
+    }};
+
+    for (const auto &test_case : cases) {
+        L3L2QueueLayout layout{};
+        ASSERT_TRUE(
+            l3_l2_queue_make_layout(test_case.depth, test_case.input_arena_bytes, test_case.output_arena_bytes, &layout)
+        );
+
+        EXPECT_EQ(layout.input_desc_offset, 0u);
+        EXPECT_EQ(layout.output_desc_offset, test_case.output_desc_offset);
+        EXPECT_EQ(layout.output_desc_offset, test_case.depth * sizeof(L3L2QueueDescSlot));
+        EXPECT_EQ(layout.input_arena_offset, test_case.input_arena_offset);
+        EXPECT_EQ(layout.output_arena_offset, test_case.output_arena_offset);
+        EXPECT_EQ(layout.payload_bytes, test_case.payload_bytes);
+        EXPECT_EQ(layout.input_desc_tail_offset, 0u);  // counter offsets are the same for every case
+        EXPECT_EQ(layout.input_desc_head_offset, 64u);
+        EXPECT_EQ(layout.output_desc_tail_offset, 128u);
+        EXPECT_EQ(layout.output_desc_head_offset, 192u);
+        EXPECT_EQ(layout.l3_abort_flag_offset, 256u);
+        EXPECT_EQ(layout.l2_abort_flag_offset, 320u);
+        EXPECT_EQ(layout.counter_bytes, 384u);
+    }
+}
+
+TEST(L3L2MessageQueueTest, LayoutRejectsInvalidDepthArenaAndCounterBytes) {
+    L3L2QueueLayout layout{};
+
+    EXPECT_FALSE(l3_l2_queue_make_layout(3, 64, 64, &layout));  // presumably: depth must be a power of two
+    EXPECT_FALSE(l3_l2_queue_make_layout((1ull << 30) + 1, 64, 64, &layout));  // NOTE(review): max-depth case?
+    EXPECT_FALSE(l3_l2_queue_make_layout(2, 0, 64, &layout));   // zero-sized input arena
+    EXPECT_FALSE(l3_l2_queue_make_layout(2, 65, 64, &layout));  // presumably: arena not a multiple of 64
+
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    EXPECT_FALSE(l3_l2_queue_validate_region(make_desc(&storage, 256, 320), args, &layout));  // 320 counter bytes
+    EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout));   // 384 counter bytes
+}
+
+TEST(L3L2MessageQueueTest, DescriptorSlotEncodingIsStable) {
+    static_assert(std::is_standard_layout<L3L2QueueDescSlot>::value, "descriptor must be POD-like");
+    static_assert(std::is_trivially_copyable<L3L2QueueDescSlot>::value, "descriptor must be fixed-size");
+
+    EXPECT_EQ(sizeof(L3L2QueueDescSlot), 32u);  // four fields, each at an 8-byte step
+    EXPECT_EQ(offsetof(L3L2QueueDescSlot, seq), 0u);
+    EXPECT_EQ(offsetof(L3L2QueueDescSlot, opcode), 8u);
+    EXPECT_EQ(offsetof(L3L2QueueDescSlot, payload_offset), 16u);
+    EXPECT_EQ(offsetof(L3L2QueueDescSlot, payload_nbytes), 24u);
+
+    L3L2QueueDescSlot slot{};
+    l3_l2_queue_encode_desc(&slot, 7, L3L2QueueOpcode::ERROR, 128, 16);  // seq, opcode, offset, nbytes
+    EXPECT_EQ(slot.seq, 7u);
+    EXPECT_EQ(slot.opcode, 3u);  // ERROR is expected to encode as 3
+    EXPECT_EQ(slot.payload_offset, 128u);
+    EXPECT_EQ(slot.payload_nbytes, 16u);
+}
+
+TEST(L3L2MessageQueueTest, Low32ReconstructionAcceptsWrapAndRejectsImpossibleDeltas) {
+    uint64_t value = 0xFFFF'FFFFull;  // `value` is both the previous 64-bit counter and the output
+
+    EXPECT_TRUE(l3_l2_queue_reconstruct_counter(0, 4, &value));  // observed low 32 bits wrapped to 0
+    EXPECT_EQ(value, 0x1'0000'0000ull);                          // the wrap carries into bit 32
+
+    value = 100;
+    EXPECT_TRUE(l3_l2_queue_reconstruct_counter(104, 4, &value));  // forward by 4 is accepted
+    EXPECT_EQ(value, 104u);
+
+    value = 100;
+    EXPECT_FALSE(l3_l2_queue_reconstruct_counter(99, 4, &value));  // observed value is behind the previous one
+
+    value = 100;
+    EXPECT_FALSE(l3_l2_queue_reconstruct_counter(105, 4, &value));  // forward by 5; presumably 4 is the bound
+}
+
+TEST(L3L2MessageQueueTest, L2InputPeekHandlesZeroByteDescriptorBeforeArenaValidation) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    // Publish a zero-byte DATA descriptor (payload offset 0, nbytes 0) as seq 1. The shared helper
+    // writes the ring slot and the input tail counter through the layout offsets, so nothing in
+    // this test hard-codes where the ring or the counters live.
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, 0, 0);
+
+    L3L2QueueInputHandle handle{};
+    ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message;
+
+    EXPECT_EQ(handle.seq, 1u);
+    EXPECT_EQ(handle.opcode, L3L2QueueOpcode::DATA);
+    EXPECT_EQ(handle.payload_nbytes, 0u);
+    EXPECT_EQ(handle.payload.gm_addr, 0u);  // a zero-byte message exposes no payload address
+    EXPECT_TRUE(queue.input().release(handle)) << queue.error().message;
+    EXPECT_EQ(storage.counters[counter_index(queue.layout().input_desc_head_offset)], 1);  // release moves head
+}
+
+TEST(L3L2MessageQueueTest, L2InputPeekPoisonsZeroByteDescriptorWithNonzeroOffset) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    // Malformed input: a zero-byte DATA descriptor that still carries payload offset 8. The shared
+    // helper writes the ring slot and the input tail counter through the layout offsets instead of
+    // hard-coded positions.
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, 8, 0);
+
+    L3L2QueueInputHandle handle{};
+    EXPECT_FALSE(queue.input().try_peek(&handle));
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR);
+    EXPECT_EQ(storage.counters[counter_index(queue.layout().l2_abort_flag_offset)], 1);  // own abort flag raised
+}
+
+TEST(L3L2MessageQueueTest, L2OutputReservePublishWritesDescriptorAndTail) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    L3L2QueueOutputReservation reservation{};
+    ASSERT_TRUE(queue.output().try_reserve(16, &reservation)) << queue.error().message;
+    EXPECT_EQ(reservation.payload_nbytes, 16u);
+    EXPECT_NE(reservation.payload.gm_addr, 0u);  // a non-empty reservation exposes a payload address
+
+    ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message;
+
+    L3L2QueueDescSlot slot{};  // read back the first output ring slot
+    std::memcpy(&slot, storage.payload.data() + queue.layout().output_desc_offset, sizeof(slot));
+    EXPECT_EQ(slot.seq, 1u);
+    EXPECT_EQ(slot.opcode, 1u);  // DATA is expected to encode as 1
+    EXPECT_EQ(slot.payload_nbytes, 16u);
+    EXPECT_EQ(storage.counters[counter_index(queue.layout().output_desc_tail_offset)], 1);  // tail published
+}
+
+TEST(L3L2MessageQueueTest, L2OutputReserveReplaysReleasedDescriptorsBeforeReusingArena) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        4,
+        64,
+        128,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    L3L2QueueOutputReservation first{};
+    ASSERT_TRUE(queue.output().try_reserve(80, &first)) << queue.error().message;  // 80 of 128 arena bytes
+    ASSERT_EQ(first.payload_offset, queue.layout().output_arena_offset);
+    ASSERT_TRUE(queue.output().publish(first, L3L2QueueOpcode::DATA)) << queue.error().message;
+
+    storage.counters[counter_index(queue.layout().output_desc_head_offset)] = 1;  // consumer released seq 1
+    L3L2QueueOutputReservation second{};
+    ASSERT_TRUE(queue.output().try_reserve(80, &second)) << queue.error().message;
+
+    EXPECT_EQ(second.payload_offset, queue.layout().output_arena_offset);  // the freed bytes are reused
+}
+
+TEST(L3L2MessageQueueTest, RemoteAbortObservationDoesNotSetOwnAbortFlag) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    storage.counters[counter_index(queue.layout().l3_abort_flag_offset)] = 1;  // the peer raises its abort flag
+
+    EXPECT_EQ(queue.disambiguate_timeout(), L3L2QueueTimeoutStatus::REMOTE_ABORTED);
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::REMOTE_ABORTED);
+    EXPECT_EQ(storage.counters[counter_index(queue.layout().l2_abort_flag_offset)], 0);  // own flag untouched
+}
+
+TEST(L3L2MessageQueueTest, OrdinaryTimeoutDoesNotSetOwnAbortFlag) {
+    RegionStorage storage{};  // value-initialized: the test sets no counter or abort flag
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    EXPECT_EQ(queue.disambiguate_timeout(), L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT);
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);  // an ordinary timeout records no error
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, OutputCapacityEqualsDepthAndFullIsNoProgressWithoutAbort) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    for (int i = 0; i < 2; ++i) {  // publish as many zero-byte outputs as the configured depth (2)
+        L3L2QueueOutputReservation reservation{};
+        ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message;
+        ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message;
+    }
+    L3L2QueueOutputReservation third{};
+    EXPECT_FALSE(queue.output().try_reserve(0, &third));  // nothing released yet, so the ring is full
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);  // a full ring is not an error
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET)], 2);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, FullAndEmptyUseMonotonicCountersNotMaskedIndices) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    for (int i = 0; i < 2; ++i) {  // fill the depth-2 output ring
+        L3L2QueueOutputReservation reservation{};
+        ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message;
+        ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message;
+    }
+    storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET)] = 1;  // simulate one output released
+
+    L3L2QueueOutputReservation third{};
+    ASSERT_TRUE(queue.output().try_reserve(0, &third)) << queue.error().message;
+    ASSERT_TRUE(queue.output().publish(third, L3L2QueueOpcode::DATA)) << queue.error().message;
+
+    EXPECT_EQ(third.seq, 3u);  // sequence numbers keep counting past the depth
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET)], 3);
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, OutputReserveTooLargeIsPreMutationNoProgressWithoutAbort) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    L3L2QueueOutputReservation reservation{};
+    EXPECT_FALSE(queue.output().try_reserve(65, &reservation));  // 65 bytes exceeds the 64-byte output arena
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);  // rejected without recording an error
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET)], 0);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, OutputPublishApplicationErrorDoesNotSetAbortFlag) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    L3L2QueueOutputReservation reservation{};
+    ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message;
+    ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::ERROR)) << queue.error().message;
+
+    L3L2QueueDescSlot slot{};  // read back the first output ring slot
+    std::memcpy(&slot, storage.payload.data() + queue.layout().output_desc_offset, sizeof(slot));
+    EXPECT_EQ(slot.opcode, static_cast<uint64_t>(L3L2QueueOpcode::ERROR));
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);  // an ERROR message is not a queue error
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, OutputPublishStaleReservationPoisonsAndSetsOwnAbortFlag) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    L3L2QueueOutputReservation reservation{};
+    ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message;
+    ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message;
+    EXPECT_FALSE(queue.output().publish(reservation, L3L2QueueOpcode::DATA));  // same reservation, second time
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::OWNERSHIP);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);  // own abort flag raised
+}
+
+TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbortFlag) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::ERROR);  // zero-byte ERROR as seq 1
+
+    L3L2QueueInputHandle handle{};
+    ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message;
+    EXPECT_EQ(handle.opcode, L3L2QueueOpcode::ERROR);
+    ASSERT_TRUE(queue.input().release(handle)) << queue.error().message;
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);  // peek and release both succeed without an error
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidState) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::STOP);  // STOP arrives as seq 1
+
+    L3L2QueueInputHandle stop{};
+    ASSERT_TRUE(queue.input().try_peek(&stop)) << queue.error().message;
+    ASSERT_TRUE(queue.input().release(stop)) << queue.error().message;
+
+    publish_input_desc(&storage, queue.layout(), 2, L3L2QueueOpcode::DATA);  // input published after STOP
+    L3L2QueueInputHandle later{};
+    EXPECT_FALSE(queue.input().try_peek(&later));
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);  // own abort flag raised
+}
+
+TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    EXPECT_FALSE(queue.input().try_peek(nullptr));  // a null output handle is rejected
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);  // the rejection records no error
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
+}
+
+TEST(L3L2MessageQueueTest, InputSecondPeekBeforeReleasePoisonsOwnershipAndSetsOwnAbortFlag) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA);  // one zero-byte DATA input
+
+    L3L2QueueInputHandle handle{};
+    ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message;
+    L3L2QueueInputHandle second{};
+    EXPECT_FALSE(queue.input().try_peek(&second));  // the first handle has not been released yet
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::OWNERSHIP);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);  // own abort flag raised
+}
+
+}  // namespace
diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
new file mode 100644
index 000000000..64b39f8cb
--- /dev/null
+++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
@@ -0,0 +1,666 @@
+# Copyright (c) PyPTO Contributors.
+# This program is free software, you can redistribute it and/or modify it under the terms and conditions of
+# CANN Open Software License Agreement Version 2.0 (the "License").
+# Please refer to the License for details. You may not use this file except in compliance with the License.
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED,
+# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE.
+# See LICENSE in the root of the software repository for the full text of the License.
+# -----------------------------------------------------------------------------------------------------------
+
+import ctypes
+import math
+import struct
+from multiprocessing.shared_memory import SharedMemory
+
+import pytest
+from simpler.l3_l2_message_queue import (
+    L3L2_QUEUE_COUNTER_BYTES,
+    L3L2_QUEUE_DESC_SLOT_BYTES,
+    L3L2_QUEUE_L2_ABORT_FLAG_OFFSET,
+    L3L2_QUEUE_L3_ABORT_FLAG_OFFSET,
+    L3L2QueueMessage,
+    L3L2QueueOpcode,
+    make_l3_l2_queue_layout,
+)
+from simpler.l3_l2_orch_comm import (
+    L3L2OrchCommCmd,
+    L3L2OrchCommRequest,
+    L3L2OrchCommResponse,
+    L3L2OrchRegionDesc,
+    NotifyOp,
+    WaitCmp,
+)
+from simpler.orchestrator import Orchestrator
+from simpler.task_interface import DataType, Tensor, get_element_size
+from simpler.worker import _IDLE, _OFF_STATE, Worker, _buffer_field_addr, _mailbox_store_i32
+
+
+class _FakeCWorker:  # Test double that records orch-comm bootstrap calls instead of performing them.
+    def __init__(self):
+        self.bootstrap_calls: list[tuple[int, str]] = []  # (worker_id, control_shm_name) per call
+
+    def control_l3_l2_orch_comm_init(self, worker_id: int, control_shm_name: str) -> None:
+        self.bootstrap_calls.append((int(worker_id), str(control_shm_name)))
+
+
+class _FakeCOrch:  # Test double whose alloc() backs each Tensor with a ctypes byte array.
+    def __init__(self):
+        self._buffers = []  # every ctypes array handed out by alloc()
+
+    def alloc(self, shape, dtype):
+        nbytes = math.prod(int(x) for x in shape) * int(get_element_size(dtype))  # element count * element size
+        storage_t = ctypes.c_uint8 * nbytes
+        storage = storage_t()
+        self._buffers.append(storage)  # presumably kept so the memory outlives the returned Tensor
+        return Tensor.make(ctypes.addressof(storage), tuple(int(x) for x in shape), dtype)
+
+
+class _FakeClient:  # In-memory stand-in for the L3-L2 orch-comm client; records and serves requests.
+    def __init__(self):
+        self.requests: list[tuple[L3L2OrchCommRequest, float]] = []  # every (request, timeout_s) submitted
+        self.payload_writes: list[tuple[int, bytes]] = []  # (payload_offset, data) per PAYLOAD_WRITE
+        self.next_region_id = 1
+        self.payload_base = 0x1000_0000  # fake payload address reported in the region descriptor
+        self.counter_base = 0x2000_0000  # fake counter address; counter offsets are relative to it
+        self.payload = bytearray()  # emulated payload memory, sized on ALLOC_REGION
+        self.counters: dict[int, int] = {}  # emulated counters keyed by byte offset
+        self.peer_abort = False  # when True, SIGNAL_TEST reports the L2 abort flag as 1
+        self.fail_next_cmd: L3L2OrchCommCmd | None = None  # one-shot failure injection
+
+    def submit(self, request, timeout_s: float):  # emulates one request and returns its response
+        self.requests.append((request, timeout_s))
+        if self.fail_next_cmd == request.cmd:
+            self.fail_next_cmd = None  # the injected failure fires once
+            raise RuntimeError(f"injected failure for {request.cmd.name}")
+        if request.cmd == L3L2OrchCommCmd.ALLOC_REGION:
+            region_id = self.next_region_id
+            self.next_region_id += 1
+            self.payload = bytearray(int(request.payload_bytes))  # fresh zeroed payload memory
+            self.counters = {}  # a new region starts with no counters recorded
+            return L3L2OrchCommResponse(
+                status=0,
+                error_kind=0,
+                region_id=region_id,
+                observed_counter=0,
+                matched=False,
+                desc=L3L2OrchRegionDesc(
+                    magic_version=0x4C334C3200020000,
+                    region_id=region_id,
+                    payload_base=self.payload_base,
+                    payload_bytes=request.payload_bytes,
+                    counter_base=self.counter_base,
+                    counter_bytes=request.counter_bytes,
+                ),
+                message="",
+            )
+        if request.cmd == L3L2OrchCommCmd.PAYLOAD_WRITE:
+            data = ctypes.string_at(int(request.host_ptr), int(request.payload_bytes))  # copy out of host memory
+            self.payload_writes.append(
+                (
+                    int(request.payload_offset),
+                    data,
+                )
+            )
+            begin = int(request.payload_offset)
+            self.payload[begin : begin + int(request.payload_bytes)] = data
+        if request.cmd == L3L2OrchCommCmd.PAYLOAD_READ:
+            begin = int(request.payload_offset)
+            data = bytes(self.payload[begin : begin + int(request.payload_bytes)])
+            ctypes.memmove(int(request.host_ptr), data, len(data))  # copy into the caller's host buffer
+        if request.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY:
+            offset = int(request.counter_addr) - self.counter_base  # address -> counter byte offset
+            if int(request.op) == int(NotifyOp.Add):
+                self.counters[offset] = int(self.counters.get(offset, 0)) + int(request.counter_operand)
+            else:  # any op other than Add overwrites the counter
+                self.counters[offset] = int(request.counter_operand)
+        if request.cmd == L3L2OrchCommCmd.SIGNAL_TEST:
+            offset = int(request.counter_addr) - self.counter_base
+            observed = (
+                1 if self.peer_abort and offset == L3L2_QUEUE_L2_ABORT_FLAG_OFFSET else self.counters.get(offset, 0)
+            )
+            matched = _compare_counter(observed, int(request.counter_operand), int(request.op))
+            return L3L2OrchCommResponse(
+                status=0,
+                error_kind=0,
+                region_id=request.region_id,
+                observed_counter=observed,
+                matched=matched,
+                desc=None,
+                message="",
+            )
+        return L3L2OrchCommResponse(  # generic success for every command that did not return above
+            status=0,
+            error_kind=0,
+            region_id=request.region_id,
+            observed_counter=request.counter_operand,
+            matched=True,
+            desc=None,
+            message="",
+        )
+
+
+def _compare_counter(observed: int, operand: int, cmp: int) -> bool:
+    if cmp == int(WaitCmp.EQ):
+        return observed == operand
+    if cmp == int(WaitCmp.NE):
+        return observed != operand
+    if cmp == int(WaitCmp.GT):
+        return observed > operand
+    if cmp == int(WaitCmp.GE):
+        return observed >= operand
+    if cmp == int(WaitCmp.LT):
+        return observed < operand
+    if cmp == int(WaitCmp.LE):
+        return observed <= operand
+    return False
+
+
+def _make_orchestrator() -> tuple[Orchestrator, Worker, SharedMemory, _FakeClient]:  # wires fakes into a Worker
+    worker = Worker(level=3, device_ids=[0], platform="a2a3", runtime="tensormap_and_ringbuffer")
+    shm = SharedMemory(create=True, size=4096)  # the caller closes and unlinks this (see _close)
+    assert shm.buf is not None
+    _mailbox_store_i32(_buffer_field_addr(shm.buf, _OFF_STATE), _IDLE)  # mark the mailbox state field idle
+    fake_client = _FakeClient()
+    worker._initialized = True  # set the private started flags directly rather than starting the worker
+    worker._hierarchical_started = True
+    worker._worker = _FakeCWorker()
+    worker._chip_shms = [shm]
+    worker._make_l3_l2_orch_comm_client = lambda _shm: fake_client  # every client lookup yields the fake
+    return Orchestrator(_FakeCOrch(), worker), worker, shm, fake_client
+
+
+def _close(worker: Worker, shm: SharedMemory) -> None:  # teardown counterpart of _make_orchestrator
+    worker._close_l3_l2_orch_comm()
+    shm.close()
+    shm.unlink()  # NOTE(review): skipped if the close call above raises
+
+
+def _publish_output(  # plays the producer side: writes one output descriptor and publishes the tail
+    fake_client: _FakeClient,
+    queue,
+    *,
+    seq: int = 1,
+    payload: bytes = b"",
+    opcode: int = int(L3L2QueueOpcode.DATA),
+    payload_offset: int | None = None,
+) -> None:
+    if payload_offset is None:  # default: start of the output arena, or 0 for an empty payload
+        payload_offset = queue.layout.output_arena_offset if payload else 0
+    if payload:
+        fake_client.payload[payload_offset : payload_offset + len(payload)] = payload
+    desc = struct.pack("<4Q", seq, int(opcode), payload_offset, len(payload))  # four little-endian u64 fields
+    desc_offset = queue.layout.output_desc_offset + ((seq - 1) & (queue.layout.depth - 1)) * L3L2_QUEUE_DESC_SLOT_BYTES
+    fake_client.payload[desc_offset : desc_offset + L3L2_QUEUE_DESC_SLOT_BYTES] = desc
+    fake_client.counters[queue.layout.output_desc_tail_offset] = seq  # tail counter = latest published seq
+
+
+def test_layout_rejects_invalid_pr1_parameters():
+    invalid_args = [  # (depth, input_arena_bytes, output_arena_bytes)
+        (3, 128, 128),  # presumably: depth must be a power of two
+        ((1 << 30) + 1, 128, 128),  # NOTE(review): looks like the max-depth case; confirm
+        (4, 0, 128),  # zero-sized input arena
+        (4, 127, 128),  # presumably: input arena not a multiple of 64
+        (4, 128, 0),  # zero-sized output arena
+        (4, 128, 127),  # presumably: output arena not a multiple of 64
+    ]
+
+    for depth, input_arena_bytes, output_arena_bytes in invalid_args:
+        with pytest.raises(ValueError):
+            make_l3_l2_queue_layout(depth, input_arena_bytes, output_arena_bytes)
+
+
+@pytest.mark.parametrize(
+    ("depth", "input_arena_bytes", "output_arena_bytes", "expected"),
+    [  # the same three cases and expected offsets as the C++ lockstep layout test
+        (
+            1,
+            64,
+            64,
+            {
+                "output_desc_offset": 32,
+                "input_arena_offset": 64,
+                "output_arena_offset": 128,
+                "payload_bytes": 192,
+            },
+        ),
+        (
+            4,
+            128,
+            192,
+            {
+                "output_desc_offset": 128,
+                "input_arena_offset": 256,
+                "output_arena_offset": 384,
+                "payload_bytes": 576,
+            },
+        ),
+        (
+            8,
+            192,
+            64,
+            {
+                "output_desc_offset": 256,
+                "input_arena_offset": 512,
+                "output_arena_offset": 704,
+                "payload_bytes": 768,
+            },
+        ),
+    ],
+)
+def test_layout_lockstep_cases_match_cpp_helper_expectations(depth, input_arena_bytes, output_arena_bytes, expected):
+    layout = make_l3_l2_queue_layout(
+        depth=depth,
+        input_arena_bytes=input_arena_bytes,
+        output_arena_bytes=output_arena_bytes,
+    )
+
+    assert layout.input_desc_offset == 0
+    assert layout.output_desc_offset == expected["output_desc_offset"]
+    assert layout.output_desc_offset == depth * L3L2_QUEUE_DESC_SLOT_BYTES  # output ring follows the input ring
+    assert layout.input_arena_offset == expected["input_arena_offset"]
+    assert layout.output_arena_offset == expected["output_arena_offset"]
+    assert layout.payload_bytes == expected["payload_bytes"]
+    assert layout.input_arena_offset % 64 == 0  # both arenas start 64-byte aligned
+    assert layout.output_arena_offset % 64 == 0
+    assert layout.input_desc_tail_offset == 0  # counter offsets are the same for every case
+    assert layout.input_desc_head_offset == 64
+    assert layout.output_desc_tail_offset == 128
+    assert layout.output_desc_head_offset == 192
+    assert layout.l3_abort_flag_offset == L3L2_QUEUE_L3_ABORT_FLAG_OFFSET
+    assert layout.l2_abort_flag_offset == L3L2_QUEUE_L2_ABORT_FLAG_OFFSET
+    assert layout.counter_bytes == L3L2_QUEUE_COUNTER_BYTES
+
+
+def test_create_l3_l2_queue_allocates_region_and_exposes_l2_task_scalars():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=192)
+
+        alloc_req = fake_client.requests[0][0]  # the first request issued must be the region allocation
+        assert alloc_req.cmd == L3L2OrchCommCmd.ALLOC_REGION
+        assert alloc_req.payload_bytes == queue.layout.payload_bytes
+        assert alloc_req.counter_bytes == L3L2_QUEUE_COUNTER_BYTES
+        assert queue.l2_task_arg_scalars() == [  # region descriptor scalars, then magic, depth, arena sizes
+            *queue.region.descriptor_scalars(),
+            queue.magic_version,
+            4,
+            128,
+            192,
+        ]
+        assert fake_client.counters == {  # all six counters are explicitly set to zero at creation
+            queue.layout.input_desc_tail_offset: 0,
+            queue.layout.input_desc_head_offset: 0,
+            queue.layout.output_desc_tail_offset: 0,
+            queue.layout.output_desc_head_offset: 0,
+            queue.layout.l3_abort_flag_offset: 0,
+            queue.layout.l2_abort_flag_offset: 0,
+        }
+    finally:
+        _close(worker, shm)
+
+
+def test_zero_byte_enqueue_skips_message_payload_write_and_publishes_descriptor():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        fake_client.requests.clear()  # drop the requests recorded during queue creation
+        fake_client.payload_writes.clear()
+
+        queue.input.enqueue(None, nbytes=0, timeout=0.001)
+
+        payload_write_offsets = [offset for offset, _data in fake_client.payload_writes]
+        assert queue.layout.input_arena_offset not in payload_write_offsets  # no write into the input arena
+        assert queue.layout.input_desc_offset in payload_write_offsets  # the descriptor slot is still written
+        notify_req = fake_client.requests[-1][0]  # the last request publishes the input tail
+        assert notify_req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY
+        assert notify_req.op == int(NotifyOp.Set)
+        assert notify_req.counter_addr == queue.region.descriptor.counter_base + queue.layout.input_desc_tail_offset
+        assert notify_req.counter_operand == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_enqueue_registered_tensor_uses_fast_path_without_staging():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        host = orch.alloc([16], DataType.UINT8)
+        fake_client.requests.clear()
+        fake_client.payload_writes.clear()
+
+        queue.input.enqueue(host, nbytes=16, timeout=0.001)
+
+        payload_write_offsets = [offset for offset, _data in fake_client.payload_writes]
+        assert queue.layout.input_arena_offset in payload_write_offsets
+        assert queue.layout.input_desc_offset in payload_write_offsets
+        assert all(req.cmd != L3L2OrchCommCmd.ALLOC_REGION for req, _timeout in fake_client.requests)
+    finally:
+        _close(worker, shm)
+
+
+def test_enqueue_replays_released_descriptors_before_reusing_input_arena():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        first = orch.alloc([80], DataType.UINT8)
+        second = orch.alloc([80], DataType.UINT8)
+
+        queue.input.enqueue(first, nbytes=80, timeout=0.001)
+        fake_client.counters[queue.layout.input_desc_head_offset] = 1
+        queue.input.enqueue(second, nbytes=80, timeout=0.001)
+
+        payload_offsets = [
+            offset for offset, data in fake_client.payload_writes if len(data) == 80
+        ]
+        assert payload_offsets == [queue.layout.input_arena_offset, queue.layout.input_arena_offset]
+    finally:
+        _close(worker, shm)
+
+
+def test_enqueue_rejects_ordinary_host_bytes_before_shared_mutation():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        fake_client.requests.clear()
+
+        with pytest.raises(ValueError, match="registered.*orch.alloc"):
+            queue.input.enqueue(b"ordinary", nbytes=8, timeout=0.001)
+
+        assert fake_client.requests == []
+        assert queue.region.descriptor_scalars()[1] == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_output_read_into_registered_tensor_uses_fast_path_and_release_notifies_head():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload=b"abcdefghijklmnop")
+        output = orch.alloc([16], DataType.UINT8)
+
+        handle = queue.output.peek(timeout=0.001)
+        queue.output.read_into(handle, output)
+        queue.output.release(handle)
+
+        assert ctypes.string_at(int(output.data), 16) == b"abcdefghijklmnop"
+        assert fake_client.counters[queue.layout.output_desc_head_offset] == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_dequeue_into_reads_and_releases_output():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload=b"abcdefghijklmnop")
+        output = orch.alloc([16], DataType.UINT8)
+
+        message = queue.output.dequeue_into(output, timeout=0.001)
+
+        assert message.seq == 1
+        assert message.opcode == L3L2QueueOpcode.DATA
+        assert ctypes.string_at(int(output.data), 16) == b"abcdefghijklmnop"
+        assert fake_client.counters[queue.layout.output_desc_head_offset] == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_try_dequeue_into_empty_returns_none_without_abort():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        output = orch.alloc([16], DataType.UINT8)
+        fake_client.requests.clear()
+
+        assert queue.output.try_dequeue_into(output) is None
+
+        assert fake_client.counters.get(queue.layout.output_desc_head_offset, 0) == 0
+        assert all(
+            not (
+                req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY
+                and req.counter_addr == queue.region.descriptor.counter_base + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET
+            )
+            for req, _timeout in fake_client.requests
+        )
+    finally:
+        _close(worker, shm)
+
+
+def test_output_read_rejects_ordinary_buffer_before_shared_mutation():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload=b"abcdefghijklmnop")
+        handle = queue.output.peek(timeout=0.001)
+        fake_client.requests.clear()
+
+        with pytest.raises(ValueError, match="registered.*orch.alloc"):
+            queue.output.read_into(handle, bytearray(16))
+
+        assert fake_client.requests == []
+        assert fake_client.counters.get(queue.layout.output_desc_head_offset, 0) == 0
+    finally:
+        _close(worker, shm)
+
+
+def test_output_release_inactive_handle_poisons_and_sets_l3_abort_flag():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload=b"abcdefghijklmnop")
+        handle = queue.output.peek(timeout=0.001)
+        wrong = L3L2QueueMessage(handle.seq + 1, handle.opcode, handle.payload_offset, handle.payload_nbytes)
+        fake_client.requests.clear()
+
+        with pytest.raises(RuntimeError, match="not active"):
+            queue.output.release(wrong)
+
+        assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1
+        with pytest.raises(RuntimeError, match="poisoned"):
+            queue.output.try_peek()
+    finally:
+        _close(worker, shm)
+
+
+def test_output_stop_descriptor_poisons_and_sets_l3_abort_flag():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, opcode=int(L3L2QueueOpcode.STOP))
+
+        with pytest.raises(RuntimeError, match="cannot be STOP"):
+            queue.output.peek(timeout=0.001)
+
+        assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_zero_byte_output_descriptor_with_nonzero_offset_poisons_and_sets_l3_abort_flag():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload_offset=queue.layout.output_arena_offset)
+
+        with pytest.raises(RuntimeError, match="zero-byte.*nonzero"):
+            queue.output.peek(timeout=0.001)
+
+        assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_zero_byte_output_read_accepts_none_and_skips_payload_read():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload=b"")
+        handle = queue.output.peek(timeout=0.001)
+        fake_client.requests.clear()
+
+        queue.output.read_into(handle, None)
+        queue.output.release(handle)
+
+        assert all(req.cmd != L3L2OrchCommCmd.PAYLOAD_READ for req, _timeout in fake_client.requests)
+        assert fake_client.counters[queue.layout.output_desc_head_offset] == 1
+    finally:
+        _close(worker, shm)
+
+
+def test_try_enqueue_full_queue_returns_false_without_poison_or_publish():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=2, input_arena_bytes=128, output_arena_bytes=128)
+        queue.input.enqueue(None, nbytes=0, timeout=0.001)
+        queue.input.enqueue(None, nbytes=0, timeout=0.001)
+        fake_client.requests.clear()
+        fake_client.payload_writes.clear()
+
+        assert queue.input.try_enqueue(None, nbytes=0) is False
+
+        assert fake_client.payload_writes == []
+        assert fake_client.counters[queue.layout.input_desc_tail_offset] == 2
+        assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0
+    finally:
+        _close(worker, shm)
+
+
+def test_enqueue_after_stop_rejects_locally_without_polling_or_abort():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        queue.request_stop(timeout=0.001)
+        fake_client.requests.clear()
+
+        assert queue.input.try_enqueue(None, nbytes=0) is False
+        with pytest.raises(RuntimeError, match="stopped"):
+            queue.input.enqueue(None, nbytes=0, timeout=0.001)
+
+        assert fake_client.requests == []
+        assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0
+    finally:
+        _close(worker, shm)
+
+
+def test_try_enqueue_payload_larger_than_arena_returns_false_without_poison_or_publish():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        host = orch.alloc([256], DataType.UINT8)
+        fake_client.requests.clear()
+        fake_client.payload_writes.clear()
+
+        assert queue.input.try_enqueue(host, nbytes=256) is False
+
+        assert fake_client.payload_writes == []
+        assert fake_client.counters.get(queue.layout.input_desc_tail_offset, 0) == 0
+        assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0
+    finally:
+        _close(worker, shm)
+
+
+def test_output_payload_offset_mismatch_poisons_before_payload_read():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(
+            fake_client,
+            queue,
+            payload=b"abcdefghijklmnop",
+            payload_offset=queue.layout.output_arena_offset + 16,
+        )
+        fake_client.requests.clear()
+
+        with pytest.raises(RuntimeError, match="payload.*mismatch"):
+            queue.output.peek(timeout=0.001)
+
+        assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1
+        assert all(
+            not (
+                req.cmd == L3L2OrchCommCmd.PAYLOAD_READ
+                and req.payload_offset == queue.layout.output_arena_offset + 16
+            )
+            for req, _timeout in fake_client.requests
+        )
+    finally:
+        _close(worker, shm)
+
+
+def test_enqueue_payload_write_failure_sets_l3_abort_flag():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        host = orch.alloc([16], DataType.UINT8)
+        fake_client.fail_next_cmd = L3L2OrchCommCmd.PAYLOAD_WRITE
+
+        with pytest.raises(RuntimeError, match="injected failure"):
+            queue.input.enqueue(host, nbytes=16, timeout=0.001)
+
+        assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1
+        with pytest.raises(RuntimeError, match="poisoned"):
+            queue.input.try_enqueue(None, nbytes=0)
+    finally:
+        _close(worker, shm)
+
+
+def test_timeout_without_peer_abort_flag_returns_timeout_and_keeps_queue_live():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        fake_client.requests.clear()
+
+        with pytest.raises(TimeoutError, match="timed out"):
+            queue.output.peek(timeout=0.0001)
+
+        assert queue.region.descriptor_scalars()[1] == 1
+        assert all(
+            not (
+                req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY
+                and req.counter_addr == queue.region.descriptor.counter_base + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET
+            )
+            for req, _timeout in fake_client.requests
+        )
+    finally:
+        _close(worker, shm)
+
+
+def test_timeout_with_peer_abort_flag_reports_remote_aborted_without_setting_own_flag():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        fake_client.peer_abort = True
+        fake_client.requests.clear()
+
+        with pytest.raises(RuntimeError, match="remote.*abort"):
+            queue.output.peek(timeout=0.0001)
+
+        with pytest.raises(RuntimeError, match="remote.*abort"):
+            queue.input.try_enqueue(None, nbytes=0)
+        assert all(
+            not (
+                req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY
+                and req.counter_addr == queue.region.descriptor.counter_base + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET
+            )
+            for req, _timeout in fake_client.requests
+        )
+    finally:
+        _close(worker, shm)
+
+
+def test_expired_queue_rejects_later_operations_without_abort_flag():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        queue.region._expire()
+        fake_client.requests.clear()
+
+        with pytest.raises(RuntimeError, match="expired"):
+            queue.input.try_enqueue(None, nbytes=0)
+        with pytest.raises(RuntimeError, match="expired"):
+            queue.output.try_peek()
+
+        assert fake_client.requests == []
+        assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0
+    finally:
+        _close(worker, shm)

From 04e3a4c524a6df89bb32c34279e704aa2b99e8c6 Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Mon, 29 Jun 2026 09:15:15 +0800
Subject: [PATCH 3/7] Update: clean up L3 L2 queue PR1

- Drop the base implementation guide from tracked PR1 files while keeping
  it available locally for PR2 planning.
- Keep the L3-L2 queue Python tests compatible with the pyright target and
  ruff formatting used by CI.
---
 docs/l3-l2-message-queue-base-impl.md         | 798 ------------------
 .../test_worker/test_l3_l2_message_queue.py   |  12 +-
 2 files changed, 5 insertions(+), 805 deletions(-)
 delete mode 100644 docs/l3-l2-message-queue-base-impl.md

diff --git a/docs/l3-l2-message-queue-base-impl.md b/docs/l3-l2-message-queue-base-impl.md
deleted file mode 100644
index d63f446cf..000000000
--- a/docs/l3-l2-message-queue-base-impl.md
+++ /dev/null
@@ -1,798 +0,0 @@
-# L3-L2 Message Queue Base Queue Two-PR Implementation Plan
-
-## 1. Scope And Platform Support
-
-This document covers a two-PR delivery of the base bidirectional SPSC message
-queue transport described in `l3-l2-message-queue-design.md`.
-
-PR1 implements the core queue transport and primitive-compatible fast-path API:
-
-- one input queue from L3 to L2;
-- one output queue from L2 to L3;
-- descriptor rings and payload arenas in one primitive L3-L2 region;
-- `DATA`, `ERROR`, and input-only `STOP` descriptors;
-- explicit output reserve/publish on L2;
-- explicit input peek/release on L2;
-- L3 enqueue, output ownership/dequeue, stop, and cleanup APIs;
-- non-zero L3 buffers limited to primitive-compatible registered
-  `orch.alloc(...)` host Tensors;
-- two single-writer abort flags for timeout disambiguation;
-- unit tests for ABI, layout, counters, zero-byte descriptors, queue
-  mechanics, and fast-path APIs.
-
-PR2 implements the usability and end-to-end layer:
-
-- lazy internal staging for ordinary L3 host buffers;
-- ordinary host-buffer enqueue and output read convenience paths;
-- one base queue example with a small message-local AICore task.
-- scene tests on supported platforms;
-- final user-facing documentation cleanup.
-
-Neither PR includes:
-
-- the L2 input window helper;
-- multiple active DATA input handles on L2;
-- out-of-order input release;
-- fragmented payload arenas;
-- multiple outstanding producer reservations per direction;
-- output-side STOP acknowledgement messages.
-
-Supported across the two PRs:
-
-- `a2a3` onboard;
-- `a2a3sim`;
-- `a5sim`.
-
-Not supported:
-
-- `a5` onboard.
-
-The exact Python and C++ class names may change during implementation, but the
-ABI, state transitions, and observable behavior in this document are base queue
-requirements. Scope tags below identify whether a requirement lands in PR1 or
-PR2.
-
-## 2. Expected User Flow
-
-The final base queue should be usable without exposing descriptor offsets,
-counter offsets, or payload arena cursors to application code. PR1 supports
-the same operation shape with primitive-compatible registered host Tensors for
-non-zero L3 buffers. PR2 relaxes that buffer requirement with lazy staging.
-
-Expected L3 shape:
-
-```python
-queue = orch.create_l3_l2_queue(
-    worker_id=0,
-    depth=8,
-    input_arena_bytes=1 << 20,
-    output_arena_bytes=1 << 20,
-)
-
-for payload in input_payloads:
-    queue.input.enqueue(payload.buffer, nbytes=payload.nbytes, timeout=timeout_s)
-
-queue.input.enqueue(None, nbytes=0, timeout=timeout_s)  # zero-byte DATA
-queue.request_stop(timeout=timeout_s)
-
-while not application_done:
-    message = queue.output.peek(timeout=timeout_s)
-    output_buffer = choose_buffer(message.payload_nbytes)
-    queue.output.read_into(message, output_buffer)
-    queue.output.release(message)
-    handle_application_output(message)
-
-queue.free()
-```
-
-If the application already owns a large enough output buffer, it may use the
-convenience path instead:
-
-```python
-message = queue.output.dequeue_into(max_sized_output_buffer, timeout=timeout_s)
-```
-
-Expected base L2 shape:
-
-```cpp
-L3L2QueueEndpoint queue(desc_scalars, queue_args);
-for (;;) {
-    auto in = queue.input().peek(timeout);
-    if (in.opcode == L3L2QueueOpcode::STOP) {
-        queue.input().release(in);
-        break;
-    }
-
-    auto out = queue.output().reserve(output_nbytes, timeout);
-    launch_message_local_aicore_work(in.payload_view, out.gm_addr);
-    wait_until_output_bytes_are_visible();
-    queue.output().publish(out, L3L2QueueOpcode::DATA);
-    queue.input().release(in);
-}
-```
-
-Application payload schema, request IDs, final-output markers, and output
-cardinality are application responsibilities. PR1 transport order does not
-imply request correlation beyond FIFO order within each queue direction.
-
-## 3. API Surface
-
-PR1 must expose the semantic operations below. PR2 keeps the same operation
-surface and only expands accepted L3 buffer types through lazy staging. Exact
-class and method names may change during implementation, but the
-implementation must not require users to manipulate descriptor slots, counter
-offsets, payload arena offsets, or head/tail reconstruction state directly.
-
-Required L3 Python surface:
-
-```text
-orch.create_l3_l2_queue(
-    worker_id,
-    depth,
-    input_arena_bytes,
-    output_arena_bytes,
-) -> queue
-
-queue.input.enqueue(buffer_or_none, nbytes, timeout)
-queue.input.try_enqueue(buffer_or_none, nbytes)
-
-queue.output.dequeue_into(buffer, timeout) -> message
-queue.output.try_dequeue_into(buffer) -> message or no-progress
-
-queue.request_stop(timeout)
-queue.try_request_stop()
-queue.free()
-```
-
-L3 message results must expose at least:
-
-```text
-seq
-opcode
-payload_nbytes
-```
-
-Convenience dequeue APIs may copy and release in one operation. PR1 must also
-expose explicit output ownership APIs with these semantics:
-
-```text
-queue.output.peek(timeout) -> message_handle
-queue.output.try_peek() -> message_handle or no-progress
-queue.output.read_into(message_handle, buffer)
-queue.output.release(message_handle)
-```
-
-Required L2 C++ surface:
-
-```text
-L3L2QueueEndpoint queue(desc_scalars, queue_args)
-
-queue.input().peek(timeout) -> input_handle
-queue.input().try_peek() -> input_handle or no-progress
-queue.input().release(input_handle)
-
-queue.output().reserve(nbytes, timeout) -> output_reservation
-queue.output().try_reserve(nbytes) -> output_reservation or no-progress
-queue.output().publish(output_reservation, opcode)
-```
-
-L2 input handles must expose at least:
-
-```text
-seq
-opcode
-payload_nbytes
-payload_view or empty payload marker
-```
-
-L2 output reservations must expose at least:
-
-```text
-seq or publish sequence context
-payload_offset
-payload_nbytes
-gm_addr for non-zero payload writes
-```
-
-The API must preserve these user-visible semantics:
-
-- finite timeouts are required for blocking operations;
-- `try_*` operations return no-progress without mutating shared state when the
-  queue cannot make progress;
-- ordinary timeout does not poison the queue unless peer abort is observed;
-- zero-byte messages may pass `buffer_or_none == None`;
-- PR1 non-zero L3 buffers must be primitive-compatible registered
-  `orch.alloc(...)` host Tensors;
-- PR2 L3 convenience APIs accept ordinary contiguous host byte spans and lazily
-  stage them when they are not primitive-compatible registered tensors;
-- primitive-compatible `orch.alloc(...)` host Tensors remain the fast path in
-  both PRs;
-- output ownership APIs are the recommended path for variable-size outputs,
-  while `dequeue_into` remains valid when the caller supplies a large enough
-  target buffer;
-- after successful `request_stop`, L3 input enqueue rejects later input
-  messages locally without poisoning;
-- `ERROR` is an application-level message, not a transport exception;
-- cleanup/free remains valid after local poison or remote-aborted terminal
-  state.
-
-## 4. L3 Host Buffer Contract And Lazy Staging
-
-The primitive L3 payload APIs require a registered, child-visible
-`orch.alloc(...)` host Tensor.
-
-PR1 buffer contract:
-
-- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte
-  descriptor path;
-- non-zero L3 input enqueue buffers must be primitive-compatible registered
-  `orch.alloc(...)` host Tensors;
-- non-zero L3 output read targets must be primitive-compatible registered
-  `orch.alloc(...)` host Tensors;
-- ordinary `bytes`, `bytearray`, `memoryview`, private tensors, and other
-  non-registered host buffers are rejected before shared-state mutation;
-- rejecting a non-registered buffer is a pre-mutation validation failure and
-  does not poison or set an abort flag.
-
-PR2 buffer contract:
-
-- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte
-  descriptor path;
-- if the input buffer is a primitive-compatible registered `orch.alloc(...)`
-  host Tensor, enqueue uses it directly as the zero-extra-host-copy fast path;
-- otherwise enqueue accepts an ordinary readable contiguous host byte span,
-  such as `bytes`, `bytearray`, `memoryview`, or a contiguous CPU tensor-like
-  object the implementation can view as bytes;
-- non-fast-path enqueue copies the user bytes into an internal registered
-  staging Tensor, then issues primitive `payload_write` from that staging
-  Tensor.
-
-For L3 output read:
-
-- if the output target is a primitive-compatible registered `orch.alloc(...)`
-  host Tensor, `read_into` or `dequeue_into` uses it directly as the fast path;
-- otherwise the target must be an ordinary writable contiguous host byte span;
-- non-fast-path read first issues primitive `payload_read` into an internal
-  registered staging Tensor, then copies from staging into the user target.
-
-The staging Tensor is allocated lazily and owned by the queue handle. It may
-grow when a later operation needs a larger staging span. The implementation
-must not expose staging offsets or staging Tensor ownership to users.
-
-If a payload is too large for the current staging Tensor, the queue should grow
-or allocate staging before issuing any primitive command. Failure to allocate
-staging is a pre-mutation validation/allocation failure: it rejects the
-operation, does not publish descriptors, does not release descriptors, does not
-poison, and does not set an abort flag.
-
-Staging may add one host-to-host copy. Users that need the lowest host overhead
-can pass primitive-compatible registered `orch.alloc(...)` host Tensors.
-
-## 5. PR1 ABI Surface
-
-The stable PR1 ABI is the L3/L2 shared contract. It is separate from exact
-Python or C++ method names.
-
-TaskArgs carry the primitive region descriptor followed by queue parameters:
-
-```text
-primitive desc[0..5]
-queue_magic_version
-depth
-input_arena_bytes
-output_arena_bytes
-```
-
-The queue ABI version covers:
-
-- descriptor slot size and field order;
-- opcode numeric values;
-- deterministic payload layout derivation;
-- counter offsets and meanings;
-- head/tail low32 reconstruction rules;
-- abort flag semantics;
-- zero-byte descriptor canonical form;
-- STOP and ERROR transport semantics.
-
-Descriptor slot ABI:
-
-```cpp
-struct L3L2QueueDescSlot {
-    uint64_t seq;
-    uint64_t opcode;
-    uint64_t payload_offset;
-    uint64_t payload_nbytes;
-};
-static_assert(sizeof(L3L2QueueDescSlot) == 32);
-```
-
-Opcode ABI:
-
-```text
-0      invalid / never published
-DATA   = 1
-STOP   = 2
-ERROR  = 3
-```
-
-Counter ABI:
-
-```text
-offset 0:   input_desc_tail       writer=L3
-offset 64:  input_desc_head       writer=L2
-offset 128: output_desc_tail      writer=L2
-offset 192: output_desc_head      writer=L3
-offset 256: l3_abort_flag         writer=L3
-offset 320: l2_abort_flag         writer=L2
-```
-
-Layout validation ABI:
-
-- `depth` must be a power of two and `depth <= 2^30`;
-- queue capacity is `depth`, not `depth - 1`;
-- descriptor slot size is 32 bytes;
-- descriptor rings are 8-byte aligned;
-- payload arena bases are 64-byte aligned;
-- arena byte sizes are positive 64-byte multiples;
-- `counter_bytes >= 384`.
-
-The following are not PR1 ABI:
-
-- exact Python class names;
-- exact C++ helper class names;
-- internal helper function names;
-- polling backoff strategy;
-- application payload schema;
-- example payload format.
-
-## 6. ABI And Layout
-
-The descriptor slot ABI is the existing 32-byte format:
-
-```cpp
-struct L3L2QueueDescSlot {
-    uint64_t seq;
-    uint64_t opcode;
-    uint64_t payload_offset;
-    uint64_t payload_nbytes;
-};
-static_assert(sizeof(L3L2QueueDescSlot) == 32);
-```
-
-`payload_offset` is relative to the primitive payload base. For non-zero
-message payloads, it points into the direction-local payload arena. It does not
-point to the descriptor slot itself.
-
-The layout helper must derive all payload and counter offsets. Python may
-mirror the calculation, but tests must keep the Python calculation and the C/C++
-helper in lockstep.
-
-PR1 counter layout:
-
-```text
-offset 0:   input_desc_tail       writer=L3
-offset 64:  input_desc_head       writer=L2
-offset 128: output_desc_tail      writer=L2
-offset 192: output_desc_head      writer=L3
-offset 256: l3_abort_flag         writer=L3
-offset 320: l2_abort_flag         writer=L2
-```
-
-`counter_bytes` must be at least 384. The abort flags are low-frequency
-diagnostic signals, but they still use the same 64-byte stride as the
-descriptor counters to preserve single-writer cache-line ownership.
-
-All six counters are initialized to zero before submitting the persistent L2
-run. Descriptor slots and payload bytes do not need to be zeroed for
-correctness.
-
-## 7. Primitive Command Mapping
-
-The queue is a wrapper over the existing L3-L2 primitive commands. PR1 must not
-add a new primitive command or bypass the primitive region lifetime model.
-
-Descriptor rings live in the primitive payload region. Descriptor slot access
-therefore uses the primitive payload APIs:
-
-- L3 writes input descriptor slots with `L3L2OrchRegion.payload_write`;
-- L3 reads output descriptor slots with `L3L2OrchRegion.payload_read`;
-- L2 reads input descriptor slots with `L3L2OrchEndpoint::payload_read`;
-- L2 writes output descriptor slots with `L3L2OrchEndpoint::payload_write`.
-
-Message payload arena access also uses the primitive payload APIs when the
-message payload is non-zero:
-
-- L3 input enqueue writes non-zero input payload bytes with
-  `L3L2OrchRegion.payload_write`;
-- L3 output dequeue reads non-zero output payload bytes with
-  `L3L2OrchRegion.payload_read`;
-- L2 input consume obtains a non-zero input payload GM view with
-  `L3L2OrchEndpoint::payload_read`;
-- L2 output reserve returns a GM span in the output arena; L2 application code
-  or AICore work writes that span before `publish`;
-- PR1 does not require a separate L2 message-payload copy API. If an
-  implementation uses `L3L2OrchEndpoint::payload_write` for a small L2-produced
-  output payload, it is only a helper for filling the reserved output arena
-  span before `publish`, not a separate transport path.
-
-Queue counters use the primitive signal APIs:
-
-- publishing descriptor tail, releasing descriptor head, and setting an abort
-  flag use `SIGNAL_NOTIFY` / `signal_notify`;
-- head/tail polling uses `SIGNAL_TEST` / `signal_test` snapshots;
-- timeout disambiguation samples the peer abort flag with `SIGNAL_TEST`, for
-  example `GE 1` against the peer flag address.
-
-Only a matched `SIGNAL_TEST` snapshot may drive head/tail reconstruction,
-descriptor replay, payload release, or payload reuse. A failed head/tail test
-does not establish acquire ordering and its observed value must not update
-local queue state. For abort flags, a matched `GE 1` test reports remote abort;
-an unmatched test leaves the timeout as ordinary no-progress.
-
-PR1 queue correctness must not depend on primitive `SIGNAL_WAIT`. Blocking
-queue operations are wrapper-level bounded polling loops over `SIGNAL_TEST`
-plus local queue-state checks.
-
-## 8. Zero-Byte Message Rules
-
-Zero-byte `DATA`, `ERROR`, and `STOP` descriptors are valid queue messages.
-They still consume one descriptor slot and follow the normal descriptor
-publication sequence.
-
-For any descriptor with `payload_nbytes == 0`:
-
-- `payload_offset` must be `0`;
-- `payload_offset == 0` is a canonical sentinel, not a payload address;
-- the message consumes no payload arena bytes;
-- producer payload cursors do not advance;
-- consumer payload cursors do not advance;
-- payload wrap-padding replay is skipped for that descriptor;
-- no message-payload arena copy/read/view is issued.
-
-Descriptor-ring access is separate from message-payload arena access.
-Descriptor slots live in the primitive payload region, so publishing or reading
-a zero-byte message may still use primitive payload access for descriptor-ring
-metadata. The rule above skips only the message payload arena path.
-
-Consumer validation order must make the zero-byte path explicit:
-
-```text
-1. validate descriptor sequence;
-2. validate opcode and direction legality;
-3. if payload_nbytes == 0:
-     require payload_offset == 0;
-     skip direction-local arena range checks and payload replay;
-   else:
-     require payload_offset to be inside the direction-local arena;
-     validate contiguous span and payload cursor replay.
-```
-
-This ordering matters because `payload_offset == 0` for a zero-byte output
-descriptor usually is not inside the output arena. A consumer that runs arena
-range validation before the zero-byte branch would reject a valid descriptor.
-
-If a published descriptor has `payload_nbytes == 0` and `payload_offset != 0`,
-the descriptor is invalid published state. The observing endpoint transitions
-to `POISONED(local-infrastructure)` and sets its own abort flag.
-
-## 9. Queue State And Abort Flags
-
-PR1 uses two single-writer abort flags:
-
-```text
-l3_abort_flag: writer=L3, reader=L2
-l2_abort_flag: writer=L2, reader=L3
-```
-
-Each flag is initialized to `0`. On local infrastructure poison, the endpoint
-sets its owned flag to `1` with `NotifyOp.Set`. The flag never resets within a
-queue lifetime. It is a terminal boolean, not an epoch and not a poison count.
-
-Abort flags are for timeout disambiguation. PR1 does not require every wait
-loop iteration to poll both data progress and abort progress. A blocking queue
-operation that reaches its timeout samples the peer abort flag:
-
-```text
-peer abort_flag == 0:
-  return ordinary timeout/no-progress;
-  keep the local queue live;
-  do not set the local abort flag.
-
-peer abort_flag == 1:
-  return remote-aborted transport failure;
-  transition the local handle to a terminal remote-aborted state;
-  do not publish descriptors or advance queue state;
-  do not set the local abort flag solely because the peer flag was observed.
-```
-
-The implementation may represent terminal remote abort with the existing
-`POISONED` state, but the reason must remain distinct:
-
-```text
-POISONED(local-infrastructure): set own abort_flag = 1
-POISONED(remote-aborted):       do not set own abort_flag
-```
-
-This distinction prevents a peer abort observation from being amplified into a
-new local infrastructure poison report.
-
-## 10. Capacity, Counters, And Reconstruction
-
-`depth` is the user-visible queue capacity. A queue created with `depth=N` can
-hold `N` published, unreleased descriptors.
-
-Validation rules:
-
-- `depth` must be a power of two;
-- `depth <= 2^30`;
-- queue capacity is `depth`, not `depth - 1`.
-
-Full and empty checks must use monotonic local `uint64_t` head/tail values, not
-only masked ring indices:
-
-```text
-empty iff tail == head
-full  iff tail - head == depth
-invalid shared state iff tail - head > depth
-```
-
-The shared head/tail counters store only the low 32 bits. Each endpoint keeps
-local `uint64_t` copies and reconstructs observed progress with signed 32-bit
-delta semantics:
-
-```text
-delta = int32_t(observed_low32 - local_low32)
-valid progress: 0 <= delta <= depth
-```
-
-`delta == depth` is valid. A peer may legally move from empty to full between
-observations. Negative deltas or deltas larger than `depth` are inconsistent
-shared state and poison the observing endpoint.
-
-Descriptor slot validity does not depend on opcode or slot clearing. A
-published descriptor is valid only when:
-
-```text
-slot.seq == expected_seq
-expected_seq == local_head_or_tail + 1
-slot_index == (expected_seq - 1) & (depth - 1)
-```
-
-Equivalent index calculations are allowed, but the sequence check must use the
-full 64-bit `seq`. Descriptor slots do not need to be cleared before reuse.
-
-Before a producer reuses released descriptor slots or payload arena bytes, it
-must replay exactly the released FIFO prefix after observing head progress.
-Replay must happen before slot reuse. Zero-byte descriptors in replay advance
-descriptor state only and do not advance payload cursors.
-
-## 11. Producer And Consumer Operation Details
-
-Producer sequence:
-
-```text
-reserve -> fill/copy payload if payload_nbytes > 0 -> publish descriptor
-```
-
-Consumer sequence:
-
-```text
-peek/acquire descriptor -> read/view payload if payload_nbytes > 0
--> release descriptor and payload
-```
-
-Descriptor publication order:
-
-1. reserve a descriptor slot and, for non-zero payloads, a contiguous payload
-   arena span;
-2. write or expose the payload bytes;
-3. write descriptor fields other than `seq`;
-4. write `seq` as the descriptor validity marker;
-5. release-publish the tail counter.
-
-Descriptor release order:
-
-1. finish all uses of the message payload;
-2. update local release and payload cursor state;
-3. release-publish the head counter.
-
-Each direction allows at most one outstanding producer reservation. Publishing
-an unknown, stale, already-published, already-canceled, or cross-queue
-reservation is a local ownership contradiction and poisons the queue.
-
-The base queue has no reservation cancel. If a producer has successfully
-reserved a non-zero payload span and later cannot safely publish either `DATA`
-or application `ERROR`, it must poison the queue. If the queue remains
-trustworthy, the application may publish an `ERROR` descriptor using the
-reservation.
-
-`STOP` is an input-queue descriptor. It consumes one input descriptor slot,
-uses `payload_nbytes == 0` and `payload_offset == 0`, and is terminal for L3
-input enqueue. After L3 successfully publishes `STOP`, later input `DATA`,
-`ERROR`, or `STOP` attempts are rejected locally without poisoning. If L2 has
-observed `STOP` and later observes another published input descriptor, the
-descriptor is invalid published state and poisons the queue.
-
-`ERROR` remains an application-level message. Receiving `ERROR` does not poison
-the queue, set an abort flag, stop either direction, or imply transport abort.
-
-## 12. Error Handling Rules
-
-The guiding rule remains:
-
-```text
-Before shared-state mutation: reject, no poison, no abort flag.
-After shared-state mutation or inconsistent shared-state observation:
-  poison local infrastructure, set own abort_flag.
-```
-
-Pre-mutation validation failures do not poison and do not set abort flags:
-
-- `try_enqueue` sees no descriptor or payload space;
-- `try_request_stop` sees no input descriptor slot;
-- a blocking operation times out under ordinary backpressure;
-- payload size exceeds the arena before reservation mutates state;
-- queue creation rejects invalid layout or reconstruction parameters;
-- output buffer is too small before payload copy and before release;
-- invalid API arguments are caught before shared state is touched;
-- lazy staging allocation failure before primitive command issue;
-- enqueue is attempted after L3 has already published `STOP`;
-- application `ERROR` is sent or received normally.
-
-Infrastructure poison sets the endpoint's own abort flag:
-
-- descriptor sequence mismatch;
-- invalid opcode observed in a published descriptor;
-- `STOP` observed on the output queue;
-- zero-byte descriptor with non-zero `payload_offset`;
-- non-zero descriptor payload range outside its direction-local arena;
-- head/tail reconstruction observes impossible progress;
-- payload replay observes impossible state;
-- payload copy failure after command issue;
-- counter notify failure;
-- control-service response timeout after command issue;
-- L2 endpoint fatal error for this region;
-- reservation, publish, or release ownership state becomes contradictory.
-
-Ordinary timeout is ambiguous until the peer abort flag is sampled. A timeout
-with peer abort flag `0` is not poison. A timeout with peer abort flag `1`
-transitions the local handle to terminal `remote-aborted` without setting the
-local abort flag.
-
-Cleanup and `free()` remain valid and idempotent after both local
-infrastructure poison and remote-aborted terminal state.
-
-## 13. Example
-
-PR2 adds one base queue example:
-
-```text
-examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/
-```
-
-The example should demonstrate the intended user shape, not every edge case.
-It must show:
-
-- L3 creating a queue with `depth > 1`;
-- multiple variable-size input `DATA` messages;
-- one zero-byte `DATA` message;
-- a persistent L2 loop;
-- L2 processing at most one active DATA input at a time;
-- one small message-local AICore task;
-- L2 publishing one output `DATA` per input `DATA`;
-- L3 publishing `STOP`;
-- L3 continuing to dequeue outputs after `STOP` according to application final
-  output rules;
-- L2 releasing the `STOP` descriptor and returning from the persistent run.
-
-The example should not demonstrate:
-
-- the L2 input window;
-- multiple active input messages;
-- one input producing multiple outputs;
-- multiple inputs producing one output;
-- out-of-input-order output publish;
-- application `ERROR` protocol design;
-- abort flag failure paths.
-
-The zero-byte `DATA` message should exercise the descriptor-only message path.
-It should not require a child-visible zero-byte host buffer.
-
-## 14. Test Plan
-
-Both PRs require automated tests for their review-driven boundaries. A manual
-review checklist is not enough.
-
-PR1 test scope:
-
-- ABI and layout;
-- descriptor/counter protocol;
-- zero-byte descriptor handling;
-- capacity, full/empty, wrap, and low32 reconstruction;
-- abort flag semantics;
-- L2 endpoint API;
-- L3 fast-path API with primitive-compatible registered host Tensors.
-
-PR2 test scope:
-
-- lazy internal staging for ordinary L3 host buffers;
-- registered Tensor fast path remains no-staging;
-- staging allocation failure is pre-mutation and non-poisoning;
-- base queue example and scene coverage.
-
-Suggested C++ unit test category:
-
-```text
-tests/ut/cpp/common/test_l3_l2_message_queue.cpp
-```
-
-Suggested C++ unit tests:
-
-- `LayoutAssignsAbortFlagsAfterDescriptorCounters`
-- `LayoutRequiresCounterBytesForSixCounters`
-- `DescriptorSlotEncodingIsStable`
-- `ZeroByteDescriptorUsesCanonicalOffset`
-- `ZeroByteDescriptorWithNonZeroOffsetPoisons`
-- `CapacityEqualsDepthAllowsNPublishedDescriptors`
-- `CapacityEqualsDepthRejectsNthPlusOneDescriptor`
-- `FullAndEmptyUseMonotonicCountersNotMaskedIndices`
-- `Low32ReconstructionAcceptsDeltaEqualDepth`
-- `Low32ReconstructionHandlesCounterWrap`
-- `Low32ReconstructionRejectsNegativeDelta`
-- `Low32ReconstructionRejectsDeltaGreaterThanDepth`
-- `ReplaySkipsPayloadCursorAdvanceForZeroByteDescriptors`
-- `ReplayBeforeSlotReuseAfterFullQueueWrap`
-- `LocalInfrastructurePoisonSetsOwnAbortFlag`
-- `RemoteAbortObservationDoesNotSetOwnAbortFlag`
-- `OrdinaryTimeoutDoesNotSetAbortFlag`
-- `ApplicationErrorDoesNotSetAbortFlag`
-- `PreMutationValidationFailureDoesNotSetAbortFlag`
-
-Suggested Python unit test category:
-
-```text
-tests/ut/py/test_l3_l2_message_queue.py
-```
-
-Suggested Python unit tests:
-
-- `test_layout_matches_cpp_helper`
-- `test_counter_offsets_include_abort_flags`
-- `test_zero_byte_enqueue_skips_payload_arena_copy`
-- `test_zero_byte_dequeue_skips_payload_arena_read`
-- `test_enqueue_rejects_ordinary_host_bytes_before_pr2_staging`
-- `test_output_read_rejects_ordinary_buffer_before_pr2_staging`
-- `test_enqueue_accepts_ordinary_host_bytes_with_lazy_staging`
-- `test_enqueue_registered_tensor_uses_fast_path_without_staging`
-- `test_output_read_into_ordinary_buffer_uses_lazy_staging`
-- `test_staging_allocation_failure_does_not_poison`
-- `test_timeout_with_peer_abort_flag_reports_remote_aborted`
-- `test_timeout_without_peer_abort_flag_returns_timeout`
-- `test_remote_aborted_terminal_state_rejects_later_operations`
-
-Suggested scene/example tests:
-
-```text
-examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/
-```
-
-Suggested scene cases:
-
-- `variable_size_messages`: enqueue/dequeue several non-zero `DATA` messages;
-- `zero_byte_data`: send one zero-byte `DATA` and verify one corresponding
-  output is produced without payload arena bytes;
-- `depth_capacity`: with `depth=N`, publish `N` inputs before backpressure;
-- `fifo_stop`: publish `STOP`, drain outputs, and verify L2 exits;
-- `small_aicore_work`: each non-zero input launches message-local AICore work;
-- `l2_abort_flag_timeout_disambiguation`: force an L2 local infrastructure
-  poison, then verify L3 timeout reports remote-aborted instead of ordinary
-  timeout.
-
-The scene test matrix should include the PR1 supported simulation platforms
-where practical:
-
-- `a2a3sim`;
-- `a5sim`.
-
-Hardware execution should include `a2a3` onboard when device access is
-available through the repository's `task-submit` workflow.
diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
index 64b39f8cb..6f5230766 100644
--- a/tests/ut/py/test_worker/test_l3_l2_message_queue.py
+++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
@@ -11,6 +11,7 @@
 import math
 import struct
 from multiprocessing.shared_memory import SharedMemory
+from typing import Optional
 
 import pytest
 from simpler.l3_l2_message_queue import (
@@ -65,7 +66,7 @@ def __init__(self):
         self.payload = bytearray()
         self.counters: dict[int, int] = {}
         self.peer_abort = False
-        self.fail_next_cmd: L3L2OrchCommCmd | None = None
+        self.fail_next_cmd: Optional[L3L2OrchCommCmd] = None
 
     def submit(self, request, timeout_s: float):
         self.requests.append((request, timeout_s))
@@ -182,7 +183,7 @@ def _publish_output(
     seq: int = 1,
     payload: bytes = b"",
     opcode: int = int(L3L2QueueOpcode.DATA),
-    payload_offset: int | None = None,
+    payload_offset: Optional[int] = None,
 ) -> None:
     if payload_offset is None:
         payload_offset = queue.layout.output_arena_offset if payload else 0
@@ -349,9 +350,7 @@ def test_enqueue_replays_released_descriptors_before_reusing_input_arena():
         fake_client.counters[queue.layout.input_desc_head_offset] = 1
         queue.input.enqueue(second, nbytes=80, timeout=0.001)
 
-        payload_offsets = [
-            offset for offset, data in fake_client.payload_writes if len(data) == 80
-        ]
+        payload_offsets = [offset for offset, data in fake_client.payload_writes if len(data) == 80]
         assert payload_offsets == [queue.layout.input_arena_offset, queue.layout.input_arena_offset]
     finally:
         _close(worker, shm)
@@ -578,8 +577,7 @@ def test_output_payload_offset_mismatch_poisons_before_payload_read():
         assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1
         assert all(
             not (
-                req.cmd == L3L2OrchCommCmd.PAYLOAD_READ
-                and req.payload_offset == queue.layout.output_arena_offset + 16
+                req.cmd == L3L2OrchCommCmd.PAYLOAD_READ and req.payload_offset == queue.layout.output_arena_offset + 16
             )
             for req, _timeout in fake_client.requests
         )

From 6107c8135158a3abf937fd33096d1c75704e63b6 Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Tue, 30 Jun 2026 11:03:12 +0800
Subject: [PATCH 4/7] Fix: harden L3-L2 queue edge cases

- Fail closed on queue layout uint64 overflow in C++ and Python mirror calculations

- Validate cached L2 input handle metadata before release and use cached descriptor state

- Gate C++ spin-loop timer reads and clean up Python regions on partial construction failure
---
 python/simpler/l3_l2_message_queue.py         | 59 +++++++++----
 .../include/aicpu/l3_l2_message_queue.h       | 82 +++++++++++++++----
 .../cpp/common/test_l3_l2_message_queue.cpp   | 43 ++++++++++
 .../test_worker/test_l3_l2_message_queue.py   | 24 ++++++
 4 files changed, 174 insertions(+), 34 deletions(-)

diff --git a/python/simpler/l3_l2_message_queue.py b/python/simpler/l3_l2_message_queue.py
index 462554650..38f6b845b 100644
--- a/python/simpler/l3_l2_message_queue.py
+++ b/python/simpler/l3_l2_message_queue.py
@@ -40,6 +40,7 @@
 L3L2_QUEUE_L2_ABORT_FLAG_OFFSET = 320
 L3L2_QUEUE_COUNTER_BYTES = 384
 L3L2_QUEUE_MAX_DEPTH = 1 << 30
+_UINT64_MAX = (1 << 64) - 1
 
 _DESC = struct.Struct("<4Q")
 _POLL_INTERVAL_S = 0.00005
@@ -92,8 +93,21 @@ def l3_l2_queue_magic_version() -> int:
 
 
 def _align_up(value: int, align: int) -> int:
+    if value < 0 or value > _UINT64_MAX:
+        raise ValueError("L3-L2 queue layout calculation overflowed uint64")
     remainder = value % align
-    return value if remainder == 0 else value + (align - remainder)
+    bump = 0 if remainder == 0 else align - remainder
+    result = value + bump
+    if result > _UINT64_MAX:
+        raise ValueError("L3-L2 queue layout calculation overflowed uint64")
+    return result
+
+
+def _checked_add_u64(lhs: int, rhs: int) -> int:
+    """Return lhs + rhs; raise ValueError if an operand is negative or the sum exceeds uint64."""
+    if lhs < 0 or rhs < 0 or lhs + rhs > _UINT64_MAX:
+        raise ValueError("L3-L2 queue layout calculation overflowed uint64")
+    return lhs + rhs
 
 
 def make_l3_l2_queue_layout(depth: int, input_arena_bytes: int, output_arena_bytes: int) -> L3L2QueueLayout:
@@ -108,11 +122,15 @@ def make_l3_l2_queue_layout(depth: int, input_arena_bytes: int, output_arena_byt
         raise ValueError("L3-L2 queue output_arena_bytes must be a positive 64-byte multiple")
 
     desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES
+    if desc_ring_bytes > _UINT64_MAX:
+        raise ValueError("L3-L2 queue layout calculation overflowed uint64")
     input_desc_offset = 0
-    output_desc_offset = input_desc_offset + desc_ring_bytes
-    input_arena_offset = _align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT)
-    output_arena_offset = _align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT)
-    payload_bytes = output_arena_offset + output_arena_bytes
+    output_desc_offset = _checked_add_u64(input_desc_offset, desc_ring_bytes)
+    desc_end = _checked_add_u64(output_desc_offset, desc_ring_bytes)
+    input_arena_offset = _align_up(desc_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT)
+    input_arena_end = _checked_add_u64(input_arena_offset, input_arena_bytes)
+    output_arena_offset = _align_up(input_arena_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT)
+    payload_bytes = _checked_add_u64(output_arena_offset, output_arena_bytes)
     return L3L2QueueLayout(
         depth=depth,
         input_desc_offset=input_desc_offset,
@@ -146,18 +164,25 @@ def create_l3_l2_queue(
         payload_bytes=layout.payload_bytes,
         counter_bytes=layout.counter_bytes,
     )
-    desc_fields = orch.alloc([24], DataType.UINT8)
-    desc_seq = orch.alloc([8], DataType.UINT8)
-    desc_read = orch.alloc([L3L2_QUEUE_DESC_SLOT_BYTES], DataType.UINT8)
-    for offset in (
-        layout.input_desc_tail_offset,
-        layout.input_desc_head_offset,
-        layout.output_desc_tail_offset,
-        layout.output_desc_head_offset,
-        layout.l3_abort_flag_offset,
-        layout.l2_abort_flag_offset,
-    ):
-        region.counter(offset).notify(0, NotifyOp.Set)
+    try:
+        desc_fields = orch.alloc([24], DataType.UINT8)
+        desc_seq = orch.alloc([8], DataType.UINT8)
+        desc_read = orch.alloc([L3L2_QUEUE_DESC_SLOT_BYTES], DataType.UINT8)
+        for offset in (
+            layout.input_desc_tail_offset,
+            layout.input_desc_head_offset,
+            layout.output_desc_tail_offset,
+            layout.output_desc_head_offset,
+            layout.l3_abort_flag_offset,
+            layout.l2_abort_flag_offset,
+        ):
+            region.counter(offset).notify(0, NotifyOp.Set)
+    except Exception:
+        try:
+            region.free()
+        except Exception:
+            pass
+        raise
     return L3L2Queue(orch, region, layout, desc_fields, desc_seq, desc_read)
 
 
diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h
index 383785c54..96dad5a40 100644
--- a/src/common/platform/include/aicpu/l3_l2_message_queue.h
+++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h
@@ -126,6 +126,19 @@ static inline uint64_t l3_l2_queue_align_up(uint64_t value, uint64_t align) {
     return remainder == 0 ? value : value + (align - remainder);
 }
 
+// Rounds value up to the next multiple of align into *out; returns false and leaves *out untouched on failure.
+static inline bool l3_l2_queue_align_up_checked(uint64_t value, uint64_t align, uint64_t *out) {
+    if (out == nullptr || align == 0) {  // align == 0 would divide by zero below
+        return false;
+    }
+    const uint64_t padding = (align - value % align) % align;  // 0 when value is already aligned
+    if (l3_l2_orch_comm_add_overflows(value, padding)) {
+        return false;
+    }
+    *out = value + padding;
+    return true;
+}
+
 static inline bool l3_l2_queue_valid_opcode(L3L2QueueOpcode opcode) {
     return opcode == L3L2QueueOpcode::DATA || opcode == L3L2QueueOpcode::STOP || opcode == L3L2QueueOpcode::ERROR;
 }
@@ -141,14 +154,30 @@ l3_l2_queue_make_layout(uint64_t depth, uint64_t input_arena_bytes, uint64_t out
 
     uint64_t desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES;
     uint64_t input_desc_offset = 0;
+    if (l3_l2_orch_comm_add_overflows(input_desc_offset, desc_ring_bytes)) {
+        return false;
+    }
     uint64_t output_desc_offset = input_desc_offset + desc_ring_bytes;
-    uint64_t input_arena_offset =
-        l3_l2_queue_align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT);
-    uint64_t output_arena_offset =
-        l3_l2_queue_align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT);
+    if (l3_l2_orch_comm_add_overflows(output_desc_offset, desc_ring_bytes)) {
+        return false;
+    }
+    uint64_t desc_end = output_desc_offset + desc_ring_bytes;
+    uint64_t input_arena_offset = 0;
+    if (!l3_l2_queue_align_up_checked(desc_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT, &input_arena_offset)) {
+        return false;
+    }
+    if (l3_l2_orch_comm_add_overflows(input_arena_offset, input_arena_bytes)) {
+        return false;
+    }
+    uint64_t input_arena_end = input_arena_offset + input_arena_bytes;
+    uint64_t output_arena_offset = 0;
+    if (!l3_l2_queue_align_up_checked(input_arena_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT, &output_arena_offset)) {
+        return false;
+    }
     if (l3_l2_orch_comm_add_overflows(output_arena_offset, output_arena_bytes)) {
         return false;
     }
+    uint64_t payload_bytes = output_arena_offset + output_arena_bytes;
 
     *out = L3L2QueueLayout{
         depth,
@@ -158,7 +187,7 @@ l3_l2_queue_make_layout(uint64_t depth, uint64_t input_arena_bytes, uint64_t out
         output_arena_offset,
         input_arena_bytes,
         output_arena_bytes,
-        output_arena_offset + output_arena_bytes,
+        payload_bytes,
         L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET,
         L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET,
         L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET,
@@ -227,6 +256,7 @@ class L3L2QueueEndpoint {
             }
             uint64_t start = l3_l2_orch_endpoint_now();
             uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz();
+            uint64_t spins = 0;
             while (true) {
                 if (try_peek(out)) {
                     return true;
@@ -234,10 +264,13 @@ class L3L2QueueEndpoint {
                 if (parent_->error_.kind != L3L2QueueErrorKind::NONE) {
                     return false;
                 }
-                uint64_t now = l3_l2_orch_endpoint_now();
-                if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) {
-                    parent_->disambiguate_timeout();
-                    return false;
+                spins += 1;
+                if (timeout_ns == 0 || (spins & 1023ull) == 0) {
+                    uint64_t now = l3_l2_orch_endpoint_now();
+                    if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) {
+                        parent_->disambiguate_timeout();
+                        return false;
+                    }
                 }
             }
         }
@@ -322,6 +355,9 @@ class L3L2QueueEndpoint {
             *out = L3L2QueueInputHandle{slot.seq, opcode, slot.payload_offset, slot.payload_nbytes, view};
             active_ = true;
             active_seq_ = slot.seq;
+            active_opcode_ = opcode;
+            active_payload_offset_ = slot.payload_offset;
+            active_payload_nbytes_ = slot.payload_nbytes;
             return true;
         }
 
@@ -329,13 +365,15 @@ class L3L2QueueEndpoint {
             if (!parent_->ensure_live("input.release")) {
                 return false;
             }
-            if (!active_ || handle.seq != active_seq_ || handle.seq != parent_->input_head_ + 1) {
+            if (!active_ || handle.seq != active_seq_ || handle.seq != parent_->input_head_ + 1 ||
+                handle.opcode != active_opcode_ || handle.payload_offset != active_payload_offset_ ||
+                handle.payload_nbytes != active_payload_nbytes_) {
                 parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "input.release", "input handle is not active");
                 return false;
             }
-            if (handle.payload_nbytes != 0) {
+            if (active_payload_nbytes_ != 0) {
                 parent_->advance_payload_head(
-                    parent_->input_payload_head_, handle.payload_offset, handle.payload_nbytes,
+                    parent_->input_payload_head_, active_payload_offset_, active_payload_nbytes_,
                     parent_->layout_.input_arena_offset, parent_->layout_.input_arena_bytes, "input.release"
                 );
                 if (parent_->error_.kind != L3L2QueueErrorKind::NONE) {
@@ -343,11 +381,14 @@ class L3L2QueueEndpoint {
                 }
             }
             parent_->input_head_ += 1;
-            if (handle.opcode == L3L2QueueOpcode::STOP) {
+            if (active_opcode_ == L3L2QueueOpcode::STOP) {
                 stopped_ = true;
             }
             active_ = false;
             active_seq_ = 0;
+            active_opcode_ = L3L2QueueOpcode::INVALID;
+            active_payload_offset_ = 0;
+            active_payload_nbytes_ = 0;
             return parent_->notify_counter(
                 parent_->layout_.input_desc_head_offset, static_cast<int32_t>(parent_->input_head_), "input.release"
             );
@@ -357,6 +398,9 @@ class L3L2QueueEndpoint {
         L3L2QueueEndpoint *parent_;
         bool active_{false};
         uint64_t active_seq_{0};
+        L3L2QueueOpcode active_opcode_{L3L2QueueOpcode::INVALID};
+        uint64_t active_payload_offset_{0};
+        uint64_t active_payload_nbytes_{0};
         bool stopped_{false};
     };
 
@@ -371,6 +415,7 @@ class L3L2QueueEndpoint {
             }
             uint64_t start = l3_l2_orch_endpoint_now();
             uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz();
+            uint64_t spins = 0;
             while (true) {
                 if (try_reserve(nbytes, out)) {
                     return true;
@@ -378,10 +423,13 @@ class L3L2QueueEndpoint {
                 if (parent_->error_.kind != L3L2QueueErrorKind::NONE) {
                     return false;
                 }
-                uint64_t now = l3_l2_orch_endpoint_now();
-                if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) {
-                    parent_->disambiguate_timeout();
-                    return false;
+                spins += 1;
+                if (timeout_ns == 0 || (spins & 1023ull) == 0) {
+                    uint64_t now = l3_l2_orch_endpoint_now();
+                    if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) {
+                        parent_->disambiguate_timeout();
+                        return false;
+                    }
                 }
             }
         }
diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
index 409da4763..e2761c426 100644
--- a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
+++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
@@ -13,6 +13,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <cstring>
+#include <limits>
 #include <type_traits>
 
 #include <gtest/gtest.h>
@@ -127,6 +128,25 @@ TEST(L3L2MessageQueueTest, LayoutRejectsInvalidDepthArenaAndCounterBytes) {
     EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout));
 }
 
+TEST(L3L2MessageQueueTest, LayoutOverflowFailsClosedWithoutModifyingOutput) {
+    L3L2QueueLayout layout{
+        7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61,
+    };
+    const L3L2QueueLayout original = layout;  // snapshot used to show the output is untouched on failure
+
+    // UINT64_MAX - 63 is a 64-byte multiple; input_arena_offset + input_arena_bytes then wraps uint64.
+    EXPECT_FALSE(l3_l2_queue_make_layout(2, std::numeric_limits<uint64_t>::max() - 63, 64, &layout));
+    EXPECT_EQ(layout.depth, original.depth);
+    EXPECT_EQ(layout.input_desc_offset, original.input_desc_offset);
+    EXPECT_EQ(layout.output_desc_offset, original.output_desc_offset);
+    EXPECT_EQ(layout.input_arena_offset, original.input_arena_offset);
+    EXPECT_EQ(layout.output_arena_offset, original.output_arena_offset);
+    EXPECT_EQ(layout.input_arena_bytes, original.input_arena_bytes);
+    EXPECT_EQ(layout.output_arena_bytes, original.output_arena_bytes);
+    EXPECT_EQ(layout.payload_bytes, original.payload_bytes);
+    EXPECT_EQ(layout.counter_bytes, original.counter_bytes);
+}
+
 TEST(L3L2MessageQueueTest, DescriptorSlotEncodingIsStable) {
     static_assert(std::is_standard_layout<L3L2QueueDescSlot>::value, "descriptor must be POD-like");
     static_assert(std::is_trivially_copyable<L3L2QueueDescSlot>::value, "descriptor must be fixed-size");
@@ -430,6 +450,29 @@ TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbor
     EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0);
 }
 
+TEST(L3L2MessageQueueTest, InputReleaseRejectsCallerMutatedHandleMetadata) {
+    RegionStorage storage{};
+    L3L2QueueArgs args{
+        l3_l2_queue_magic_version(),
+        2,
+        64,
+        64,
+    };
+    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 16);
+
+    L3L2QueueInputHandle handle{};
+    ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message;
+    handle.payload_nbytes = 0;  // tamper after peek so the handle no longer matches the cached descriptor
+
+    EXPECT_FALSE(queue.input().release(handle));
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::OWNERSHIP);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET)], 0);  // head not published
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);  // L2 abort flag raised
+}
+
 TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidState) {
     RegionStorage storage{};
     L3L2QueueArgs args{
diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
index 6f5230766..04573a62b 100644
--- a/tests/ut/py/test_worker/test_l3_l2_message_queue.py
+++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
@@ -210,6 +210,11 @@ def test_layout_rejects_invalid_pr1_parameters():
             make_l3_l2_queue_layout(depth, input_arena_bytes, output_arena_bytes)
 
 
+def test_layout_rejects_uint64_overflow_to_match_cpp_helper():
+    with pytest.raises(ValueError, match="overflowed uint64"):
+        make_l3_l2_queue_layout(2, (1 << 64) - 64, 64)  # 64-byte-multiple size whose arena end exceeds uint64
+
+
 @pytest.mark.parametrize(
     ("depth", "input_arena_bytes", "output_arena_bytes", "expected"),
     [
@@ -300,6 +305,25 @@ def test_create_l3_l2_queue_allocates_region_and_exposes_l2_task_scalars():
         _close(worker, shm)
 
 
+def test_create_l3_l2_queue_frees_region_on_post_region_alloc_failure():
+    orch, worker, shm, _fake_client = _make_orchestrator()
+    original_alloc = orch._o.alloc
+
+    def fail_alloc(_shape, _dtype):
+        raise RuntimeError("injected alloc failure")
+
+    orch._o.alloc = fail_alloc
+    try:
+        with pytest.raises(RuntimeError, match="injected alloc failure"):
+            orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+
+        assert len(worker._live_l3_l2_regions) == 1
+        assert worker._live_l3_l2_regions[0]._released is True
+    finally:
+        orch._o.alloc = original_alloc
+        _close(worker, shm)
+
+
 def test_zero_byte_enqueue_skips_message_payload_write_and_publishes_descriptor():
     orch, worker, shm, fake_client = _make_orchestrator()
     try:

From 9f33653b43b23ac4bcc0b8bb8bfd973c3894311e Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Tue, 30 Jun 2026 15:59:27 +0800
Subject: [PATCH 5/7] Update: replace L3-L2 queue design doc

- Add the user-facing L3-L2 message queue documentation.

- Link the primitive L3-L2 orchestration communication doc to the queue wrapper doc.

- Remove the design-stage document from the branch while leaving the local copy available for follow-up work.
---
 docs/l3-l2-message-queue-design.md | 922 -----------------------------
 docs/l3-l2-message-queue.md        | 352 +++++++++++
 docs/l3-l2-orch-comm.md            |   4 +
 3 files changed, 356 insertions(+), 922 deletions(-)
 delete mode 100644 docs/l3-l2-message-queue-design.md
 create mode 100644 docs/l3-l2-message-queue.md

diff --git a/docs/l3-l2-message-queue-design.md b/docs/l3-l2-message-queue-design.md
deleted file mode 100644
index 414b80d02..000000000
--- a/docs/l3-l2-message-queue-design.md
+++ /dev/null
@@ -1,922 +0,0 @@
-# L3-L2 SPSC Message Queue Design
-
-## 1. Goal
-
-This document proposes the functional shape of an L3-L2 SPSC message queue
-wrapper built on top of the existing `docs/l3-l2-orch-comm.md` primitives.
-
-The feature goal is to let one L3 orchestrator exchange a sequence of input
-and output messages with one persistent L2 orchestrator run. L3 can enqueue
-task inputs and dequeue task outputs while the L2 run stays alive. This avoids
-stopping the L2 run after every task and then paying host/device finish and
-init costs again for the next task.
-
-The target shape has two layers:
-
-- a base bidirectional queue transport with input and output queues;
-- an L2-side input window helper that lets L2 hold multiple input messages
-  concurrently without changing the L3 API or the transport ABI.
-
-The base transport should land first for reviewability. The input window can
-then be added as an L2 helper policy on top of the same descriptor ABI, region
-layout, counter layout, and L3 queue API.
-
-The queue wrapper does not change the primitive L3-L2 communication service.
-It uses the existing region descriptor, payload byte range, and `int32_t`
-signal counter primitives.
-
-## 2. Existing Primitive Constraints
-
-The primitive L3-L2 communication layer provides:
-
-- one region descriptor containing payload and counter base/size fields;
-- contiguous payload byte access through `PAYLOAD_READ` and `PAYLOAD_WRITE`;
-- address-based `int32_t` signal counters through `SIGNAL_NOTIFY`,
-  `SIGNAL_TEST`, and `SIGNAL_WAIT`;
-- region lifetime, release, and poison state handling.
-
-The primitive layer deliberately does not define queue layout, stream headers,
-opcodes, tensor schema, descriptor rings, STOP semantics, or typed tensor
-metadata. The message queue wrapper owns those protocol choices.
-
-The primitive layer requires only 4-byte alignment for counter addresses inside
-the registered counter range. The queue wrapper places high-frequency shared
-counter signals at 64-byte strides so counters written by different agents do
-not share a cache line.
-
-## 3. Public Functional Shape
-
-L3 creates one bidirectional queue object:
-
-```python
-queue = orch.create_l3_l2_queue(
-    worker_id=0,
-    depth=8,
-    input_arena_bytes=1 << 20,
-    output_arena_bytes=1 << 20,
-)
-```
-
-The L3-visible queue API exposes an input queue and an output queue. L3 sends
-ordinary application messages to L2 through the input queue and receives
-ordinary application messages from L2 through the output queue.
-
-The wrapper computes:
-
-- descriptor ring sizes;
-- payload section offsets;
-- counter offsets;
-- total region payload bytes;
-- total counter bytes.
-
-The user does not pass internal descriptor offsets, arena offsets, or counter
-offsets.
-
-The queue owns one `L3L2OrchRegion`. The L2 task receives the primitive region
-descriptor plus queue layout scalars through `TaskArgs`.
-
-The intended L3 API shape is illustrative, but the semantics are part of the
-transport contract:
-
-```python
-queue.input.enqueue(host_buffer, nbytes=None, timeout=timeout_s)
-message = queue.output.dequeue_into(host_buffer, timeout=timeout_s)
-handle = queue.output.peek(timeout=timeout_s)
-queue.output.read_into(handle, host_buffer)
-queue.output.release(handle)
-queue.request_stop(timeout=timeout_s)
-queue.free()
-```
-
-The output ownership APIs `peek`, `read_into`, and `release` are part of the
-base L3 API. They are the recommended path for variable-size outputs because
-the caller can inspect `payload_nbytes` before choosing or allocating a target
-buffer. Convenience APIs such as `dequeue_into` may copy and release in one
-operation when the caller already has a large enough target buffer. Core APIs
-that hand ownership to the caller require explicit release.
-
-`queue.free()` releases the L3 queue handle. It rejects later queue operations,
-but it does not synchronously free device memory. Physical cleanup follows the
-underlying region lifetime model.
-
-The L3 public queue API accepts ordinary contiguous host byte spans for
-convenience enqueue and output read operations. When the supplied buffer is
-already a primitive-compatible registered `orch.alloc(...)` Tensor, the queue
-uses it as the zero-extra-host-copy fast path. Otherwise the queue lazily
-stages through an internal registered host Tensor before issuing the primitive
-payload command, then copies between that staging Tensor and the user buffer.
-Zero-byte DATA and ERROR messages may pass `None` as the buffer. Staging hides
-the primitive child-visible Tensor requirement from ordinary queue users, but
-may add one host-to-host copy.
-
-The L2 input window extension is not visible to L3. It is an L2 helper policy
-that controls how many DATA input messages L2 may hold concurrently before
-releasing them in FIFO-safe order.
-
-## 4. Non-Goals
-
-- Multiple L2 orchestrators.
-- Multi-producer or multi-consumer queues.
-- Shared input/output payload allocator.
-- Split payload spans across arena wrap.
-- Dtype, shape, stride, tensor rank, or tile layout interpretation.
-- Changes to `ALLOC_REGION`, `PAYLOAD_READ`, `PAYLOAD_WRITE`,
-  `SIGNAL_NOTIFY`, `SIGNAL_TEST`, or `SIGNAL_WAIT`.
-- Exposing the L2 input window configuration through the L3 API.
-- Out-of-order input payload release.
-- Fragmented or hole-filled input arena allocators.
-- Output-side STOP acknowledgement messages.
-
-## 5. Region Layout
-
-The physical L3-L2 region has one payload range and one counter range. The
-queue wrapper divides the payload range into four logical sections:
-
-```text
-payload region
-├─ input descriptor ring
-├─ output descriptor ring
-├─ input payload arena
-└─ output payload arena
-```
-
-The descriptor rings live in the payload region because they are structured
-byte metadata. The counter range stores only shared head/tail signals.
-
-The input and output payload arenas are logically separate. This preserves SPSC
-ownership:
-
-```text
-input arena:
-  producer = L3
-  consumer = L2
-
-output arena:
-  producer = L2
-  consumer = L3
-```
-
-A shared payload allocator is intentionally out of scope because it would have
-two producers and two releasers.
-
-The queue layout is derived, not transmitted as internal offsets. `TaskArgs`
-carry the primitive region descriptor followed by four queue parameters:
-
-```text
-primitive desc[0..5]
-queue_magic_version
-depth
-input_arena_bytes
-output_arena_bytes
-```
-
-The queue magic/version belongs to the queue wrapper ABI, not to the primitive
-region ABI. It covers the descriptor slot format, opcode values, deterministic
-layout function, head/tail reconstruction rules, and STOP/ERROR transport
-semantics.
-
-A shared C/C++ layout helper is the source of truth for derived offsets and
-sizes. Python may mirror that calculation, but tests must keep the Python
-calculation and the C/C++ helper in lockstep. The helper derives:
-
-```text
-input_desc_offset
-output_desc_offset
-input_arena_offset
-output_arena_offset
-input_desc_tail = 0
-input_desc_head = 64
-output_desc_tail = 128
-output_desc_head = 192
-l3_abort_flag = 256
-l2_abort_flag = 320
-```
-
-Validation rules:
-
-- `depth` must be a power of two and `depth <= 2^30`.
-- Queue capacity is `depth` messages, not `depth - 1`.
-- Descriptor slot size is fixed at 32 bytes.
-- Descriptor rings are 8-byte aligned.
-- Payload arena bases are 64-byte aligned.
-- `input_arena_bytes` and `output_arena_bytes` must be positive 64-byte
-  multiples. They do not need to be powers of two.
-- `counter_bytes` must be at least 384.
-- `payload_bytes` must contain both descriptor rings and both payload arenas.
-- Unsupported `queue_magic_version` on L2 is a fatal queue decode error for
-  this region.
-
-The L3 queue creator initializes the four shared head/tail counters and the
-two abort flags to zero before submitting the persistent L2 run. Descriptor
-slots and payload bytes do not need to be zeroed for correctness.
-
-## 6. Descriptor ABI
-
-Each descriptor slot is 32 bytes and is encoded as four little-endian
-`uint64_t` values:
-
-```cpp
-struct L3L2QueueDescSlot {
-    uint64_t seq;
-    uint64_t opcode;
-    uint64_t payload_offset;
-    uint64_t payload_nbytes;
-};
-static_assert(sizeof(L3L2QueueDescSlot) == 32);
-```
-
-The queue uses 64-byte spacing for shared signal counters, not for descriptor
-slots. Each descriptor ring is SPSC, so the base descriptor ABI needs only the
-four transport fields above.
-
-`seq` is a full 64-bit infrastructure sequence number used for ring
-correctness, wrap detection, diagnostics, and input-window validation. It is
-not a user correlation ID. Applications that need request IDs, batch IDs,
-partial/final markers, or other correlation should put them in their own
-payload header.
-
-`payload_offset` is relative to the primitive region payload base, so L2 can
-call `endpoint.payload_read(payload_offset, payload_nbytes, &view)` directly.
-
-Future descriptor extensions should use an ABI version or application payload
-headers instead of reserving unused fields in every slot.
-
-## 7. Opcodes
-
-The queue transport defines these opcodes:
-
-```text
-0      invalid / never published
-DATA   = 1 ordinary application payload message
-STOP   = 2 graceful input-side shutdown request, input queue only
-ERROR  = 3 ordinary application-level error payload message, either direction
-```
-
-`ERROR` is a normal queue message. The queue layer does not interpret its
-payload, does not raise a transport exception for it, and does not poison the
-queue when it sees one. Applications define whether an `ERROR` payload
-correlates with a request, batch, stream, or other application state.
-
-Infrastructure errors are handled through poison state, not by trying to write
-an `ERROR` message into a potentially untrusted queue.
-
-`STOP` is valid only on the input queue. The output queue has no STOP message.
-L2 shutdown acknowledgement is provided by `Worker.run` drain, not by an
-output STOP. Observing STOP on the output queue is invalid published
-descriptor state and poisons the queue.
-
-DATA and ERROR may carry zero payload bytes. For any zero-byte message,
-`payload_offset` must be zero and the message consumes no payload arena bytes.
-STOP must also use `payload_nbytes == 0` and `payload_offset == 0`.
-
-## 8. Descriptor Counters And Derived Payload Cursors
-
-The queue shares only descriptor head/tail values through the primitive layer's
-`int32_t` signal counters. Each shared head/tail uses a 64-byte stride:
-
-```text
-offset 0:   input_desc_tail       writer=L3
-offset 64:  input_desc_head       writer=L2
-offset 128: output_desc_tail      writer=L2
-offset 192: output_desc_head      writer=L3
-offset 256: l3_abort_flag         writer=L3
-offset 320: l2_abort_flag         writer=L2
-```
-
-`counter_bytes` must be at least 384.
-
-The abort flags are single-writer terminal booleans used to disambiguate
-operation timeouts from remote infrastructure abort. They are initialized to
-zero and set to one with `NotifyOp.Set` when the owning endpoint enters local
-infrastructure poison. They do not carry application `ERROR` semantics, do not
-count poison events, and do not reset within a queue lifetime.
-
-Blocking queue operations are not required to poll abort flags on every wait
-iteration. When a blocking operation times out, the implementation samples the
-peer abort flag. If the peer flag is zero, the timeout remains ordinary
-no-progress and does not poison the local queue. If the peer flag is one, the
-operation reports remote infrastructure abort and transitions the local handle
-to a terminal remote-aborted state. Observing a peer abort flag does not set
-the local endpoint's own abort flag.
-
-The shared descriptor counters store the low 32 bits of logical `uint64_t`
-head/tail values. These values are monotonic message counts. The primitive
-transports these bits through `int32_t` counters. Endpoints reconstruct local
-`uint64_t` head/tail values from sampled counter values using signed 32-bit
-delta semantics:
-
-```text
-delta = int32_t(observed_low32 - local_low32)
-valid progress: 0 <= delta <= depth
-```
-
-Negative deltas or deltas larger than `depth` are inconsistent shared state.
-Queue creation rejects descriptor depths that would make head/tail
-reconstruction ambiguous. This is a validation error, not a poison condition.
-
-Descriptor head/tail reconstruction is safe because unobserved descriptor
-progress is bounded by the descriptor ring depth. Payload byte cursors are not
-shared counters and are not reconstructed from low-32-bit signal values.
-
-Each endpoint maintains the payload cursors it needs as local `uint64_t`
-state:
-
-```text
-producer local:
-  payload_tail
-  inferred_payload_head
-
-consumer local:
-  payload_head
-```
-
-The producer infers reusable payload space by observing `desc_head`
-progress and replaying the released descriptors before reusing those descriptor
-slots. The consumer maintains its local `payload_head` while releasing
-descriptors.
-Because payload cursor progress is derived from descriptor FIFO history, payload
-arena size is not limited by 32-bit signal counter reconstruction.
-
-Queue correctness is based on reconstructed descriptor head/tail state plus
-descriptor replay, not on primitive `GE` / `LT` comparison over the 32-bit
-counter value. Blocking queue operations use bounded polling over `SIGNAL_TEST`
-snapshots plus local queue-state checks. The timeout belongs to the wrapper
-operation. The design does not require primitive `SIGNAL_WAIT` for queue
-correctness.
-
-Local queue state may advance only after a matched `SIGNAL_TEST` snapshot. A
-failed `SIGNAL_TEST` result does not establish acquire ordering, and its
-`observed` value must not drive descriptor head/tail reconstruction, descriptor
-replay, or payload release. Implementations should choose a comparison that
-matches when the sampled counter has changed, such as `NE` against the local
-low-32 value. The protocol does not prescribe a busy-poll, sleep, yield, or
-backoff strategy.
-
-If a live endpoint observes counter, head/tail, cursor, or descriptor state that
-contradicts the descriptor reconstruction or payload replay rules, that is
-inconsistent shared state and poisons the queue.
-
-Descriptor slots carry the full 64-bit per-message `seq`, so message-level
-validation does not depend on reconstructing sequence numbers from counters.
-Input and output queues have independent sequence spaces. In each direction,
-the first published message has `seq = 1`; head/tail counters start at zero and
-store the number of messages published or released. A published slot has
-`seq = tail_before_publish + 1`.
-
-## 9. Payload Arena
-
-Each direction has a variable-size SPSC byte arena.
-
-Rules:
-
-- `payload_tail` and `payload_head` are logical `uint64_t` byte cursors.
-- Actual arena offset is `cursor % arena_bytes`.
-- `arena_bytes` is limited by region allocation capacity, addressability, and
-  runtime memory budget, not by 32-bit signal counter reconstruction.
-- A single message payload must be one contiguous span.
-- A single message payload must be `<= arena_bytes`.
-- Split payloads across the arena wrap are not supported.
-- If remaining bytes at the arena end cannot hold the next payload, the
-  producer may insert invisible padding by advancing `payload_tail` to the next
-  arena cycle.
-- Padding has no descriptor. On release, the consumer compares
-  `payload_head % arena_bytes` with the descriptor's arena-relative payload
-  offset. If they differ, the only valid base-queue case is wrap padding: the
-  descriptor offset is the base offset of this direction's arena and the
-  releaser first advances `payload_head` to the next arena cycle. It then
-  advances `payload_head` by `payload_nbytes`. Any other mismatch is
-  inconsistent shared state and poisons the queue. The same replay rule is used
-  by the producer after observing `desc_head` progress, before it reuses
-  released descriptor slots.
-- Zero-byte messages do not participate in wrap-padding checks and do not
-  advance payload cursors.
-
-Backpressure must check both descriptor slots and payload arena bytes. A free
-descriptor slot is not enough if the payload arena lacks enough contiguous
-space.
-
-Payload validation is direction-local. DATA and ERROR payloads must lie wholly
-inside the input arena for input descriptors, and wholly inside the output
-arena for output descriptors. Being inside the primitive payload range is not
-enough.
-
-## 10. Core Operation Sequence
-
-The queue exposes direction-specific operations. Exact class names may change,
-but the operation set and ownership semantics are the transport contract.
-
-L3 owns the input producer and output consumer operations:
-
-```text
-input.enqueue(buffer, nbytes, timeout)
-input.try_enqueue(buffer, nbytes)
-output.dequeue_into(buffer, timeout)
-output.try_dequeue_into(buffer)
-output.peek(timeout) -> message handle
-output.try_peek() -> message handle or no-progress
-output.read_into(handle, buffer)
-output.release(handle)
-request_stop(timeout)
-try_request_stop()
-free()
-```
-
-`dequeue_into` is the convenience path for full-message copy and release.
-The `peek` / `read_into` / `release` path is the explicit-ownership path.
-`free` releases the L3 queue handle, not the physical region.
-
-L2 owns the input consumer and output producer operations:
-
-```text
-input.peek(timeout) -> input handle
-input.try_peek() -> input handle or no-progress
-input.release(handle)
-output.reserve(nbytes, timeout) -> reservation
-output.try_reserve(nbytes) -> reservation or no-progress
-output.publish(reservation, opcode)
-```
-
-The L2 input window extension wraps the input consumer with additional
-`complete(handle)` ownership; it does not change the base transport ABI. The
-base queue has no output dequeue operation on L2 and no input enqueue operation
-on L2.
-
-The producer sequence is:
-
-```text
-reserve -> fill/copy payload -> publish descriptor
-```
-
-The consumer sequence is:
-
-```text
-peek/acquire descriptor -> read/view payload -> release descriptor and payload
-```
-
-Convenience APIs are built from the core operation sequence:
-
-```text
-enqueue      = reserve + copy + publish
-dequeue_into = peek + read + release
-```
-
-L3 input enqueue can usually use the convenience path because the input payload
-already exists in a host-visible buffer.
-
-L2 output needs the core path because it often must reserve output arena space
-before launching AICore work:
-
-```cpp
-auto out = output_queue.reserve(output_nbytes, timeout);
-Tensor output = make_tensor_external(out.gm_addr, shape, rank, dtype);
-// submit AICore work that writes output
-// synchronize so output bytes are visible
-output_queue.publish(out, L3L2QueueOpcode::DATA);
-```
-
-Each queue direction allows at most one outstanding producer reservation.
-`publish` accepts only the current outstanding reservation for that direction.
-Publishing an unknown, stale, already-published, or cross-queue reservation is
-a local ownership contradiction and poisons the queue.
-
-The base queue does not support reservation cancel. A successful reserve must
-be published. If filling the reservation fails but the queue remains
-trustworthy, the application may publish an ERROR message using that
-reservation. If the reservation cannot be safely published, the producer
-poisons the queue.
-
-Descriptor publication is ordered. The producer writes payload bytes first,
-writes descriptor fields, writes `seq` as the descriptor validity marker after
-the other descriptor fields, and then release-publishes the tail counter. The
-consumer acquire-observes tail progress before reading the slot, and
-accepts the descriptor only when `slot.seq` equals the expected sequence.
-
-Descriptor slots do not need to be cleared before reuse. Sequence validation
-distinguishes old and new contents.
-
-Descriptor release is ordered in the opposite direction. The consumer must
-finish using the payload, update local release state, and release-publish the
-head counter. The producer may replay released descriptors and infer reusable
-payload space only after acquire-observing matched head progress.
-
-All blocking operations require finite timeouts. Nonblocking `try_*` variants
-return without changing shared state when no descriptor slot, message, or
-payload space is available. Timeout under ordinary backpressure does not
-poison the queue.
-
-The queue layer returns transport messages to the application:
-
-```text
-seq
-opcode
-payload bytes or payload view
-```
-
-The queue layer does not infer application request correlation from queue order
-or from transport `seq`.
-
-Queue ownership is per message, not per byte range. Release or complete always
-applies to the whole descriptor payload span.
-
-For L3 convenience dequeue, a too-small output buffer is a local validation
-failure. The descriptor remains at the queue head, no release is published, and
-the caller may retry with a larger child-visible buffer.
-
-## 11. Base L2 Processing Contract
-
-After dequeuing one input message, L2 application code may submit any number
-of message-local AICore tasks and use runtime dependencies, manual scopes,
-async notify, or other L2 orchestration features.
-
-The base helper and example do not overlap ownership of multiple input
-messages. They keep at most one active DATA input message at a time:
-
-```text
-peek input
-reserve output
-submit message-local AICore work
-wait or otherwise prove message-local work is safe
-publish output
-release input
-next message
-```
-
-L2 must not release an input message until AICore no longer reads that input
-payload and any corresponding output has been successfully published.
-
-After an input is released, L2 and any in-flight AICore work must not read its
-payload view again.
-
-The queue layer does not understand dtype, shape, stride, or tensor schema. It
-returns byte views. Applications build typed tensors with their own protocol
-metadata.
-
-## 12. L2 Input Window Extension
-
-The target feature shape includes an L2 input window helper. The helper lets L2
-hold multiple DATA input messages concurrently while preserving FIFO-safe input
-release. It enables application-defined output cardinality and output order:
-
-- one input may produce no output;
-- one input may produce multiple outputs;
-- several inputs may produce one output;
-- status or progress outputs may be published independently;
-- output publish order may differ from input acquire order.
-
-The L3-visible queue API is unchanged by the input window extension. L3 still
-observes an input queue and an output queue. L3 receives output messages in
-publish order and does not infer input/output correlation from queue order.
-Correlation, aggregation, partial/final markers, request IDs, and batch IDs
-belong in the application payload header.
-
-`max_l2_inflight` is a local L2 helper policy. It is not part of queue creation
-and does not affect region layout:
-
-```cpp
-L3L2QueueEndpoint queue(desc, layout);
-L3L2InputWindow input_window(
-    queue.input(),
-    L3L2InputWindowConfig{.max_l2_inflight = 4}
-);
-```
-
-The helper tracks input handles with these states:
-
-```text
-ACQUIRED
-  Descriptor has been read. Payload view is available to L2.
-
-COMPLETED
-  Application has declared the input payload is no longer needed.
-
-RELEASED
-  Helper has advanced the input descriptor and payload cursors past this input.
-```
-
-The state transition is:
-
-```text
-ACQUIRED -> COMPLETED -> RELEASED
-```
-
-The application owns the transition to `COMPLETED`; the helper owns the
-transition to `RELEASED`. Completing an input means no future L2 code or
-in-flight AICore task will read that input payload, and the payload is no
-longer needed to construct future output.
-
-Completion is explicit. The helper must not infer completion from C++ object
-destruction or lexical scope exit. A handle that is completed twice, released
-twice, or destroyed while still active is a local ownership error.
-
-The helper releases inputs through a FIFO watermark. If inputs 10, 11, and 12
-are acquired and inputs 10 and 12 are completed, the helper may release input
-10 only. It must not release input 12 until input 11 is also completed. This
-keeps the input payload arena monotonic and avoids holes.
-
-Output publish remains application-driven and independent of input handles:
-
-```cpp
-auto out = queue.output().reserve(nbytes, timeout);
-// fill output directly or submit AICore work that writes out.gm_addr
-queue.output().publish(out, L3L2QueueOpcode::DATA);
-```
-
-The input window extension does not add an output completion manager. The L2
-application owns completion tracking and decides when an output is ready to
-publish.
-
-Output reservation and publish remain single-outstanding per direction. The
-input window allows multiple active input handles; it does not introduce
-multiple concurrent output reservations.
-
-## 13. STOP Semantics
-
-`STOP` is an input queue descriptor message:
-
-```text
-seq + opcode=STOP + payload_nbytes=0
-```
-
-It follows normal FIFO ordering. STOP is a graceful shutdown request, not
-cancel and not an immediate no-more-output marker.
-
-Base helper behavior:
-
-- L2 exits only after processing messages before the STOP.
-- L2 releases the STOP descriptor and returns from the persistent run.
-- `Worker.run` drain acts as the final acknowledgement.
-- No extra STOP ACK counter is required.
-
-Input-window behavior:
-
-- STOP can be acquired while earlier DATA inputs are still active.
-- STOP does not take effect ahead of earlier DATA inputs.
-- The helper stops acquiring further DATA inputs after STOP is observed.
-- Earlier active DATA inputs continue until the application completes them.
-- Outputs produced by earlier DATA inputs may still be published while the
-  helper drains.
-- The helper releases only the FIFO completed prefix.
-- Once all earlier DATA inputs are released, the helper releases STOP and the
-  persistent L2 run exits.
-
-STOP takes an input descriptor slot but does not count against
-`max_l2_inflight`, because `max_l2_inflight` controls only active DATA input
-ownership.
-
-STOP is terminal for the input queue. After L3 successfully publishes STOP,
-the input queue rejects further DATA, ERROR, or STOP enqueue attempts locally
-without poisoning. If L2 has observed STOP and later observes any further
-published input descriptor, including a second STOP, that is invalid published
-descriptor state and poisons the queue.
-
-STOP does not close the output queue. After publishing STOP, L3 may continue
-dequeueing DATA or ERROR messages from the output queue. The transport has no
-output-side terminal message and does not automatically know that the
-persistent L2 run has returned. Applications that need to know all business
-outputs have arrived must define that condition in their payload protocol, for
-example with expected counts or final markers.
-
-Publishing STOP and then immediately returning from the L3 orchestration
-function is transport-legal. It can still be an application error if L2 needs
-to publish final outputs: the output queue may fill and prevent L2 from
-finishing, causing `Worker.run` drain to fail or time out.
-
-Convenience APIs may expose:
-
-```text
-try_request_stop()
-request_stop(timeout)
-```
-
-`try_request_stop()` attempts to publish a STOP descriptor to the input queue
-and returns immediately if no input descriptor slot is available.
-
-`request_stop(timeout)` performs a bounded wait until a STOP descriptor can be
-published. The timeout covers only STOP enqueue/publish. It does not wait for
-L2 exit and does not drain outputs. If the timeout expires before STOP is
-published, the queue remains live and is not poisoned.
-
-## 14. Queue Lifetime And Cleanup
-
-A queue owns one primitive `L3L2OrchRegion`. Queue cleanup follows the
-underlying region cleanup path:
-
-```text
-optional request_stop() -> L2 persistent run exits
-L3 orchestration function returns
-Worker.run drains submitted L2 work
-runtime sends FREE_REGION for live L3-L2 regions
-queue/region handles expire
-```
-
-`request_stop()` and `queue.free()` are different operations. `request_stop()`
-is a protocol message that asks L2 to stop acquiring input. `queue.free()` is a
-local handle release that rejects later queue use. Neither operation
-synchronously releases the physical payload/counter region.
-
-Physical release is deferred until `Worker.run` has drained submitted L2 work.
-This keeps region memory live while an in-flight L2 task may still hold the
-primitive descriptor or payload views. If the L3 orchestration function exits
-with a live queue, runtime cleanup releases it through the same region cleanup
-path.
-
-Queue cleanup does not require the output queue to be empty. Once `Worker.run`
-has drained and the persistent L2 run has returned, freeing the region is
-memory-safe even if L3 left output messages unread. Those unread messages are
-discarded with the region. Applications that need every output must dequeue
-until their own final-output condition is satisfied before calling
-`queue.free()` or returning from the orchestration function.
-
-## 15. Error And Poison
-
-Application-level failure is represented by `opcode=ERROR` and optional
-application-defined payload bytes. `ERROR` is allowed in either direction and
-may be published during normal processing or while draining after STOP.
-Receiving `ERROR` does not poison the queue and does not change STOP
-semantics.
-
-Infrastructure poison is a queue/region state, not a descriptor message.
-
-The guiding rule is:
-
-```text
-Before shared-state mutation: reject, no poison.
-After shared-state mutation or inconsistent shared-state observation: poison.
-```
-
-Examples that do not poison:
-
-- `try_enqueue` sees no space.
-- `try_request_stop` sees no input descriptor slot.
-- Blocking enqueue/dequeue/request-stop times out under ordinary backpressure.
-- Payload is larger than the arena before reserve mutates state.
-- Queue creation rejects ambiguous descriptor head/tail reconstruction
-  parameters.
-- User buffer is too small before read copies payload bytes.
-- Invalid API arguments are caught before touching shared state.
-
-Examples that poison:
-
-- descriptor sequence mismatch;
-- invalid opcode observed in a published descriptor;
-- STOP observed on the output queue;
-- descriptor payload range outside its arena;
-- descriptor head/tail reconstruction or payload replay observes impossible
-  shared state;
-- payload copy failure after command issue;
-- counter notify failure;
-- control-service response timeout after command issue;
-- L2 endpoint fatal error for this region;
-- reservation, publish, or release state becomes self-contradictory.
-
-Ordinary queue operation timeout does not prove remote poison. After a
-blocking operation times out, the endpoint samples the peer abort flag. If the
-peer flag is still zero, the timeout remains ordinary no-progress and does not
-poison the local queue. If the peer flag is one, the endpoint reports remote
-infrastructure abort and transitions its local handle to a terminal
-remote-aborted state without setting its own abort flag. The peer may also
-observe primitive region fatal errors or `Worker.run` drain errors.
-
-Only local infrastructure poison sets the endpoint's own abort flag. Ordinary
-timeouts, application `ERROR` messages, pre-mutation validation failures, and
-observing the peer's abort flag do not set it.
-
-The L2 input window helper also poisons the queue when local ownership state
-becomes contradictory:
-
-- completing an input handle unknown to the helper;
-- completing or releasing a handle twice;
-- attempting to release a non-contiguous input while earlier inputs remain
-  incomplete;
-- acquiring DATA after STOP has put the helper into draining;
-- observing an acquired input sequence that contradicts the helper window.
-
-The Python queue object mirrors the existing region state model:
-
-```text
-LIVE
-RELEASED
-POISONED(local-infrastructure)
-POISONED(remote-aborted)
-EXPIRED
-```
-
-After poison, reserve, enqueue, peek, read, release, publish, and stop-request
-operations reject. Cleanup/free remains idempotent and valid.
-
-L2 C++ helper poison reports a fatal error including the primitive region id,
-so existing Host-side parsing can poison the corresponding region.
-
-## 16. Implementation Staging
-
-The feature can be implemented in two review-friendly stages. This staging is
-not an API boundary: the base transport should intentionally leave room for
-the input window without later ABI or L3 API changes.
-
-```text
-Stage 1:
-  base SPSC message queue transport
-  input and output descriptor rings
-  input and output payload arenas
-  descriptor head/tail protocol over int32_t signal counters
-  single-writer abort flags for timeout disambiguation
-  derived uint64_t payload cursors via descriptor replay
-  DATA / ERROR / input-only STOP
-  one active DATA input in the L2 helper/example
-
-Stage 2:
-  L2 input window helper
-  max_l2_inflight
-  application-driven input complete
-  FIFO-safe release of completed input prefix
-  flexible output cardinality and out-of-input-order output publish
-  FIFO STOP drain with earlier DATA inputs still active
-```
-
-Stage 1 intentionally leaves room for Stage 2 through these hook points:
-
-- descriptor `seq` is explicit and 64-bit;
-- input release is explicit, not tied to dequeue;
-- output reserve and publish are separate;
-- each direction has at most one outstanding producer reservation;
-- application correlation is kept in payload, so queue transport does not
-  assume one input maps to one output;
-- L3 queue creation and output ownership/dequeue APIs do not depend on
-  `max_l2_inflight`.
-
-Expected implementation locations:
-
-```text
-python/simpler/l3_l2_message_queue.py
-src/common/platform/include/aicpu/l3_l2_message_queue.h
-docs/l3-l2-message-queue.md
-examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/
-examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue_input_window/
-```
-
-The exact Python module and public API names may change during implementation,
-but the transport contract should remain stable.
-
-## 17. Tests And Examples
-
-Base queue tests should cover:
-
-- layout calculation;
-- descriptor slot encoding;
-- counter offset assignment;
-- queue creation rejecting ambiguous descriptor head/tail reconstruction
-  parameters;
-- enqueue reserve failure for payload larger than arena;
-- backpressure when descriptor ring is full;
-- backpressure when payload arena is full;
-- arena wrap with invisible padding;
-- STOP descriptor handling;
-- `try_request_stop` and `request_stop(timeout)` behavior;
-- ERROR as a normal application message in either direction;
-- L3 ordinary host-buffer enqueue/read through lazy staging;
-- L3 primitive-compatible registered Tensor fast paths without staging;
-- staging allocation failure before primitive command issue not poisoning the
-  queue;
-- abort flags distinguishing ordinary timeout from remote infrastructure
-  abort;
-- local infrastructure poison setting the local abort flag;
-- remote-aborted terminal state not setting the local abort flag;
-- poison on invalid published descriptor state;
-- poison on descriptor head/tail reconstruction or payload replay
-  inconsistency;
-- no poison on pre-mutation validation failure.
-
-The new example should be parallel to the existing primitive stream example,
-not a replacement for it. The primitive stream example should remain as the
-minimal demonstration of `docs/l3-l2-orch-comm.md`.
-
-The base queue example should demonstrate:
-
-- `depth > 1`;
-- variable-size input and output payloads;
-- input and output backpressure;
-- L2 persistent loop;
-- one input message containing message-local AICore work;
-- FIFO STOP shutdown;
-- L3 optionally dequeuing output after STOP according to application final
-  output rules.
-
-Input window tests and examples should cover:
-
-- `max_l2_inflight > 1`;
-- refusing to acquire new DATA input when the input window is full;
-- multiple input messages acquired before earlier inputs release;
-- application-driven input completion;
-- releasing only the FIFO completed prefix;
-- one input producing multiple outputs;
-- multiple inputs producing one output;
-- output publish order differing from input acquire order;
-- output correlation stored in the application payload header;
-- STOP entering draining while earlier DATA inputs remain active;
-- output DATA or ERROR publish during STOP drain;
-- local ownership errors poisoning the queue.
-
-Future work beyond the staged implementation is limited to out-of-order input
-payload release, fragmented payload arena allocation, abort reason/status
-metadata, low-latency abort polling, or concurrent output reservations, if
-those become necessary.
diff --git a/docs/l3-l2-message-queue.md b/docs/l3-l2-message-queue.md
new file mode 100644
index 000000000..a77003537
--- /dev/null
+++ b/docs/l3-l2-message-queue.md
@@ -0,0 +1,352 @@
+# L3-L2 Message Queue
+
+L3-L2 Message Queue lets an L3 Host Orchestrator exchange ordered messages
+with one persistent L2 AICPU Orchestrator task.
+
+The intended use case is repeated in-flight work: L3 enqueues input messages,
+L2 consumes them while the L2 task stays alive, L2 publishes output messages,
+and L3 dequeues those outputs. The queue is built on top of the lower-level
+L3-L2 orchestration communication primitives described in
+[l3-l2-orch-comm.md](l3-l2-orch-comm.md). For where L3 and L2 sit in
+the runtime stack, see
+[hierarchical_level_runtime.md](hierarchical_level_runtime.md).
+
+## 1. API
+
+L3 creates one queue for one chip worker:
+
+```python
+queue = orch.create_l3_l2_queue(
+    worker_id=0,
+    depth=4,
+    input_arena_bytes=1 << 20,
+    output_arena_bytes=1 << 20,
+)
+```
+
+The queue owns one underlying `L3L2OrchRegion`. Its payload range is split into
+input/output descriptor rings and input/output payload arenas. Its counter
+range stores descriptor head/tail signals and abort flags.
+
+L3 passes the primitive region descriptor and queue layout arguments to L2:
+
+```python
+l2_args = TaskArgs()
+for value in queue.l2_task_arg_scalars():
+    l2_args.add_scalar(value)
+
+orch.submit_next_level(l2_handle, l2_args, cfg, worker=0)
+```
+
+`l2_task_arg_scalars()` returns:
+
+```text
+primitive region descriptor scalars[0..5]
+queue_magic_version
+depth
+input_arena_bytes
+output_arena_bytes
+```
+
+L3 sends input messages through `queue.input`:
+
+```python
+host_input = orch.alloc([nbytes], DataType.UINT8)
+fill_input(host_input)
+
+queue.input.enqueue(host_input, nbytes=nbytes, timeout=timeout_s)
+```
+
+`try_enqueue(buffer, nbytes)` is the non-blocking form. It returns `False`
+when the input descriptor ring or payload arena has no space. That result is
+ordinary backpressure and does not poison the queue.
+
+L3 receives output messages through `queue.output`:
+
+```python
+host_output = orch.alloc([max_output_nbytes], DataType.UINT8)
+
+message = queue.output.peek(timeout=timeout_s)
+queue.output.read_into(message, host_output)
+queue.output.release(message)
+```
+
+The convenience form reads and releases in one operation:
+
+```python
+message = queue.output.dequeue_into(host_output, timeout=timeout_s)
+```
+
+`try_peek()` and `try_dequeue_into(buffer)` are the non-blocking forms. They
+return `None` when no output message is available.
+
+The L3 buffer arguments currently must be runtime-managed tensors returned by
+`orch.alloc(...)`. Ordinary Python `bytes`, `bytearray`, and private tensors
+are rejected before shared queue state is modified. Zero-byte messages pass
+`None` as the buffer argument together with `nbytes=0`.
+
+L3 requests graceful shutdown by publishing an input-side `STOP` descriptor:
+
+```python
+queue.request_stop(timeout=timeout_s)
+queue.free()
+```
+
+`try_request_stop()` is the non-blocking form. `queue.free()` releases the L3
+handle. It does not synchronously free device memory; physical cleanup follows
+the underlying region lifetime model after submitted L2 work has drained.
+
+On L2, orchestration code receives the primitive descriptor and queue args,
+then constructs an endpoint:
+
+```cpp
+L3L2OrchRegionDesc desc{/* scalars from TaskArgs */};
+L3L2QueueArgs queue_args{
+    magic_version,
+    depth,
+    input_arena_bytes,
+    output_arena_bytes,
+};
+
+L3L2QueueEndpoint queue(desc, queue_args);
+if (queue.error().kind != L3L2QueueErrorKind::NONE) {
+    return;
+}
+```
+
+L2 consumes input messages from `queue.input()` and publishes outputs through
+`queue.output()`:
+
+```cpp
+while (true) {
+    L3L2QueueInputHandle input{};
+    if (!queue.input().peek(timeout_ns, &input)) {
+        return;
+    }
+
+    if (input.opcode == L3L2QueueOpcode::STOP) {
+        queue.input().release(input);
+        return;
+    }
+
+    L3L2QueueOutputReservation output{};
+    if (!queue.output().reserve(input.payload_nbytes, timeout_ns, &output)) {
+        return;
+    }
+
+    launch_aicore(input.payload, output.payload);
+    wait_aicore_done();
+
+    queue.output().publish(output, L3L2QueueOpcode::DATA);
+    queue.input().release(input);
+}
+```
+
+`queue.input().try_peek(&input)` and
+`queue.output().try_reserve(nbytes, &reservation)` are non-blocking. A `false`
+return can mean no progress, timeout (blocking forms only), validation failure,
+or poison; `queue.error().kind` distinguishes no-progress from terminal error.
+
+## 2. Layout
+
+The physical region has one payload range:
+
+```text
+payload region
+|-- input descriptor ring
+|-- output descriptor ring
+|-- input payload arena
+`-- output payload arena
+```
+
+The two payload arenas are separate:
+
+```text
+input arena:  producer = L3, consumer = L2
+output arena: producer = L2, consumer = L3
+```
+
+`depth` is the descriptor-ring capacity in each direction. It must be a power
+of two and at most `2^30`. Queue capacity is exactly `depth` messages, not
+`depth - 1`.
+
+`input_arena_bytes` and `output_arena_bytes` must be positive 64-byte
+multiples. They do not need to be powers of two. A single message payload must
+fit as one contiguous span inside its direction's arena. Payloads are not split
+across arena wrap.
+
+The queue layout helper is shared by Python and C++:
+
+```text
+input_desc_offset
+output_desc_offset
+input_arena_offset
+output_arena_offset
+payload_bytes
+counter_bytes
+```
+
+Python exposes this as `queue.layout`; L2 exposes it as `queue.layout()`.
+
+## 3. Descriptor ABI
+
+Each descriptor slot is 32 bytes:
+
+```cpp
+struct L3L2QueueDescSlot {
+    uint64_t seq;
+    uint64_t opcode;
+    uint64_t payload_offset;
+    uint64_t payload_nbytes;
+};
+```
+
+`seq` is the transport sequence number for ring validation, wrap detection, and
+diagnostics. It is not a user request ID. Applications that need request IDs,
+batch IDs, final markers, or correlation fields should put them in their own
+payload header.
+
+`payload_offset` is relative to the primitive region payload base. A non-empty
+payload must be wholly inside the matching direction's arena. Zero-byte
+messages instead use `payload_offset == 0` and `payload_nbytes == 0`.
+
+The queue currently defines these opcodes:
+
+| Opcode | Meaning |
+| ------ | ------- |
+| `DATA` | Ordinary application payload message. |
+| `STOP` | Graceful input-side shutdown request. |
+| `ERROR` | Ordinary application-level error payload message. |
+
+`STOP` is valid only on the input queue. The output queue has no `STOP`
+message; L2 exit is observed through normal `Worker.run` drain.
+
+`ERROR` is a normal queue message. The queue layer does not interpret its
+payload and does not poison the queue when an `ERROR` message is received.
+Infrastructure failures use poison state instead.
+
+## 4. Signals And Ordering
+
+The queue uses the primitive signal counters as descriptor head/tail values.
+Each shared signal is placed on a 64-byte stride:
+
+```text
+offset 0:   input_desc_tail       writer=L3
+offset 64:  input_desc_head       writer=L2
+offset 128: output_desc_tail      writer=L2
+offset 192: output_desc_head      writer=L3
+offset 256: l3_abort_flag         writer=L3
+offset 320: l2_abort_flag         writer=L2
+```
+
+Descriptor counters store the low 32 bits of monotonic logical head/tail
+values. Each endpoint reconstructs its local 64-bit value from observed
+progress. The unobserved progress must be between zero and `depth`; anything
+else is inconsistent shared state and poisons the queue.
+
+The producer sequence is:
+
+```text
+reserve payload space
+write payload bytes
+write descriptor fields
+write descriptor seq
+publish descriptor tail counter
+```
+
+The consumer sequence is:
+
+```text
+observe descriptor tail progress
+read and validate descriptor
+use payload bytes or payload view
+release descriptor and payload
+publish descriptor head counter
+```
+
+All blocking queue operations require finite timeouts. Timeout under ordinary
+backpressure is not poison. After timeout, an endpoint samples the peer abort
+flag; if the peer flag is set, the local endpoint reports remote abort.
+
+## 5. Ownership
+
+Queue ownership is per message.
+
+On L3 output, `peek()` returns a handle that remains active until
+`release(handle)`. While a handle is active, repeated `try_peek()` returns the
+same handle. The caller may read the payload with `read_into(handle, buffer)`
+before releasing it. Releasing the wrong handle is an ownership error and
+poisons the queue.
+
+On L2 input, `peek()` returns one active input handle. L2 must not call
+`peek()` again before releasing that handle. L2 must not release an input until
+all AICore work that reads the input payload has completed.
+
+On L2 output, `reserve()` returns one active output reservation. L2 fills the
+reserved payload span, then calls `publish(reservation, opcode)`. Publishing an
+unknown, stale, already-published, or cross-queue reservation is an ownership
+error and poisons the queue.
+
+The base queue supports at most one active L2 input handle and one active L2
+output reservation. It does not provide a multi-input L2 window.
+
+## 6. STOP Semantics
+
+`STOP` is an input descriptor with no payload. It follows normal FIFO ordering:
+L2 observes and releases messages before `STOP`, then releases `STOP` and
+returns from the persistent run.
+
+After L3 successfully publishes `STOP`, the input queue rejects further input
+messages locally without poisoning. L3 may still dequeue output messages that
+L2 publishes before returning.
+
+`request_stop(timeout)` waits only until the `STOP` descriptor is published.
+It does not wait for L2 exit and does not drain outputs. Applications that need
+all outputs must keep dequeuing until their own protocol-level final condition
+is satisfied before returning from the L3 orchestration function.
+
+## 7. Error Handling
+
+The queue distinguishes no-progress, application errors, and infrastructure
+poison.
+
+No-progress is non-terminal:
+
+- descriptor ring full;
+- payload arena full;
+- empty output queue;
+- blocking operation timeout with no peer abort flag.
+
+Application-level error is represented by `opcode=ERROR`. It is delivered to
+the peer as a normal message and does not set an abort flag.
+
+Infrastructure poison is terminal for the local queue handle:
+
+- descriptor sequence mismatch;
+- invalid opcode in a published descriptor;
+- output-side `STOP`;
+- descriptor payload outside its direction's arena;
+- impossible counter reconstruction or payload replay;
+- payload command failure after shared mutation begins;
+- counter notify failure;
+- stale or invalid handle/reservation ownership.
+
+When an endpoint enters local infrastructure poison, it sets its own abort flag
+for the peer. Observing the peer abort flag reports remote abort but does not
+set the local abort flag.
+
+After poison, normal queue operations reject. Cleanup remains valid.
+
+## 8. Platform Support
+
+The message queue uses the existing L3-L2 orchestration communication region,
+payload, and counter primitives.
+
+- `a2a3sim`: supported.
+- `a5sim`: supported.
+- `a2a3` onboard: supported where the underlying L3-L2 communication
+  primitives are supported.
+- `a5` onboard: follows the underlying L3-L2 communication support status.
+
+Simulation backends preserve the same API, ordering, timeout, and error
+semantics as onboard backends.
diff --git a/docs/l3-l2-orch-comm.md b/docs/l3-l2-orch-comm.md
index 6c541dfe5..256babbf5 100644
--- a/docs/l3-l2-orch-comm.md
+++ b/docs/l3-l2-orch-comm.md
@@ -3,6 +3,10 @@
 L3-L2 Orchestrator Communication lets an L3 Host Orchestrator exchange payload
 bytes and signal counters with a running L2 AICPU Orchestrator task.
 
+This page documents the low-level region, payload, and counter primitives. For
+the ordered SPSC message queue wrapper built on these primitives, see
+[l3-l2-message-queue.md](l3-l2-message-queue.md).
+
 The intended use case is in-flight interaction: L3 can write input payload,
 publish a data-ready counter, wait for L2/AICore completion, and read output
 payload without ending the L2 orchestration task. For where L3 and L2 sit in

From bf505cbfdb997feca0160b93c6dc390596de60ad Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Wed, 1 Jul 2026 09:56:08 +0800
Subject: [PATCH 6/7] Fix: harden L3-L2 queue review feedback

- Add strict payload and counter size checks to the L3-L2 queue task args.

- Validate L2 input payload offsets before exposing payload views.

- Document timeout, layout, and queue free semantics, and expand no-hardware tests.
---
 docs/l3-l2-message-queue.md                   |  32 ++-
 python/simpler/l3_l2_message_queue.py         |   4 +-
 .../include/aicpu/l3_l2_message_queue.h       |  41 +++-
 .../cpp/common/test_l3_l2_message_queue.cpp   | 228 +++++++++---------
 .../test_worker/test_l3_l2_message_queue.py   |  19 ++
 5 files changed, 191 insertions(+), 133 deletions(-)

diff --git a/docs/l3-l2-message-queue.md b/docs/l3-l2-message-queue.md
index a77003537..12a0034ad 100644
--- a/docs/l3-l2-message-queue.md
+++ b/docs/l3-l2-message-queue.md
@@ -46,6 +46,8 @@ queue_magic_version
 depth
 input_arena_bytes
 output_arena_bytes
+payload_bytes
+counter_bytes
 ```
 
 L3 sends input messages through `queue.input`:
@@ -93,8 +95,11 @@ queue.free()
 ```
 
 `try_request_stop()` is the non-blocking form. `queue.free()` releases the L3
-handle. It does not synchronously free device memory; physical cleanup follows
-the underlying region lifetime model after submitted L2 work has drained.
+queue handle and marks the underlying `L3L2OrchRegion` handle released. It does
+not synchronously free device memory; physical cleanup follows the underlying
+region lifetime model after submitted L2 work has drained. Small Python wrapper
+scratch tensors used for descriptor staging are owned by the queue object and
+follow normal Python object lifetime.
 
 On L2, orchestration code receives the primitive descriptor and queue args,
 then constructs an endpoint:
@@ -106,6 +111,8 @@ L3L2QueueArgs queue_args{
     depth,
     input_arena_bytes,
     output_arena_bytes,
+    payload_bytes,
+    counter_bytes,
 };
 
 L3L2QueueEndpoint queue(desc, queue_args);
@@ -175,7 +182,7 @@ multiples. They do not need to be powers of two. A single message payload must
 fit as one contiguous span inside its direction's arena. Payloads are not split
 across arena wrap.
 
-The queue layout helper is shared by Python and C++:
+Python and C++ mirror the same deterministic queue layout calculation:
 
 ```text
 input_desc_offset
@@ -187,6 +194,10 @@ counter_bytes
 ```
 
 Python exposes this as `queue.layout`; L2 exposes it as `queue.layout()`.
+L3 passes the derived `payload_bytes` and `counter_bytes` to L2. L2 rejects
+initialization unless those values match both its local layout calculation and
+the primitive region descriptor sizes. Lockstep tests cover representative
+layout cases for the mirrored Python and C++ calculations.
 
 ## 3. Descriptor ABI
 
@@ -264,9 +275,18 @@ release descriptor and payload
 publish descriptor head counter
 ```
 
-All blocking queue operations require finite timeouts. Timeout under ordinary
-backpressure is not poison. After timeout, an endpoint samples the peer abort
-flag; if the peer flag is set, the local endpoint reports remote abort.
+All Python blocking queue operations require finite positive timeouts; passing
+`timeout <= 0` is a caller error and raises `ValueError`. Python `try_*` APIs
+are non-blocking and return `False` or `None` for ordinary no-progress.
+
+C++ blocking queue operations take `timeout_ns`; `timeout_ns == 0` is an
+immediate timeout probe. They return `false` on no-progress, timeout,
+validation failure, or poison. C++ `try_*` APIs are non-blocking and also
+return `false` for ordinary no-progress.
+
+Timeout under ordinary backpressure is not poison. After timeout, an endpoint
+samples the peer abort flag; if the peer flag is set, the local endpoint
+reports remote abort.
 
 ## 5. Ownership
 
diff --git a/python/simpler/l3_l2_message_queue.py b/python/simpler/l3_l2_message_queue.py
index 38f6b845b..91236fe5e 100644
--- a/python/simpler/l3_l2_message_queue.py
+++ b/python/simpler/l3_l2_message_queue.py
@@ -28,7 +28,7 @@
 
 L3L2_QUEUE_MAGIC = 0x4C335132
 L3L2_QUEUE_ABI_MAJOR = 1
-L3L2_QUEUE_ABI_MINOR = 0
+L3L2_QUEUE_ABI_MINOR = 1
 L3L2_QUEUE_DESC_SLOT_BYTES = 32
 L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64
 L3L2_QUEUE_COUNTER_STRIDE = 64
@@ -235,6 +235,8 @@ def l2_task_arg_scalars(self) -> list[int]:
             self._layout.depth,
             self._layout.input_arena_bytes,
             self._layout.output_arena_bytes,
+            self._layout.payload_bytes,
+            self._layout.counter_bytes,
         ]
 
     def try_request_stop(self) -> bool:
diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h
index 96dad5a40..4c149ba7e 100644
--- a/src/common/platform/include/aicpu/l3_l2_message_queue.h
+++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h
@@ -20,7 +20,7 @@
 
 static constexpr uint32_t L3L2_QUEUE_MAGIC = 0x4C335132u;  // "L3Q2"
 static constexpr uint16_t L3L2_QUEUE_ABI_MAJOR = 1;
-static constexpr uint16_t L3L2_QUEUE_ABI_MINOR = 0;
+static constexpr uint16_t L3L2_QUEUE_ABI_MINOR = 1;
 static constexpr uint64_t L3L2_QUEUE_DESC_SLOT_BYTES = 32;
 static constexpr uint64_t L3L2_QUEUE_DESC_RING_ALIGNMENT = 8;
 static constexpr uint64_t L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64;
@@ -94,6 +94,8 @@ struct L3L2QueueArgs {
     uint64_t depth;
     uint64_t input_arena_bytes;
     uint64_t output_arena_bytes;
+    uint64_t payload_bytes;
+    uint64_t counter_bytes;
 };
 
 struct L3L2QueueInputHandle {
@@ -209,7 +211,8 @@ l3_l2_queue_validate_region(const L3L2OrchRegionDesc &desc, const L3L2QueueArgs
         !l3_l2_queue_make_layout(args.depth, args.input_arena_bytes, args.output_arena_bytes, &layout)) {
         return false;
     }
-    if (desc.payload_bytes < layout.payload_bytes || desc.counter_bytes < layout.counter_bytes) {
+    if (args.payload_bytes != layout.payload_bytes || args.counter_bytes != layout.counter_bytes ||
+        desc.payload_bytes != layout.payload_bytes || desc.counter_bytes != layout.counter_bytes) {
         return false;
     }
     if (out_layout != nullptr) {
@@ -345,6 +348,11 @@ class L3L2QueueEndpoint {
                        )) {
                 parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input payload out of arena");
                 return false;
+            } else if (!parent_->payload_matches_head(
+                           parent_->input_payload_head_, slot.payload_offset, slot.payload_nbytes,
+                           parent_->layout_.input_arena_offset, parent_->layout_.input_arena_bytes, "input.try_peek"
+                       )) {
+                return false;
             } else if (!parent_->endpoint_.payload_read(slot.payload_offset, slot.payload_nbytes, &view)) {
                 parent_->poison(
                     L3L2QueueErrorKind::ENDPOINT_ERROR, "input.try_peek", parent_->endpoint_.error().message
@@ -472,6 +480,8 @@ class L3L2QueueEndpoint {
                 uint64_t arena_bytes = parent_->layout_.output_arena_bytes;
                 uint64_t arena_pos = parent_->output_payload_tail_ % arena_bytes;
                 if (arena_pos + nbytes > arena_bytes) {
+                    // Payloads are never split across arena wrap. The skipped tail bytes are retired in the
+                    // monotonic virtual cursor even if this reservation later finds the arena full.
                     parent_->output_payload_tail_ += arena_bytes - arena_pos;
                     arena_pos = 0;
                 }
@@ -648,16 +658,33 @@ class L3L2QueueEndpoint {
         return offset >= arena_offset && offset + nbytes <= arena_offset + arena_bytes;
     }
 
+    bool payload_matches_head(
+        uint64_t cursor, uint64_t payload_offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes,
+        const char *op
+    ) {
+        if (nbytes == 0) {
+            return true;
+        }
+        uint64_t arena_pos = cursor % arena_bytes;
+        uint64_t expected_offset = arena_pos + nbytes > arena_bytes ? arena_offset : arena_offset + arena_pos;
+        if (payload_offset != expected_offset) {
+            poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch");
+            return false;
+        }
+        return true;
+    }
+
     void advance_payload_head(
         uint64_t &cursor, uint64_t payload_offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes,
         const char *op
     ) {
-        uint64_t expected_offset = arena_offset + (cursor % arena_bytes);
+        uint64_t arena_pos = cursor % arena_bytes;
+        uint64_t expected_offset = arena_pos + nbytes > arena_bytes ? arena_offset : arena_offset + arena_pos;
         if (expected_offset != payload_offset) {
-            if (payload_offset != arena_offset) {
-                poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch");
-                return;
-            }
+            poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch");
+            return;
+        }
+        if (arena_pos + nbytes > arena_bytes) {
             cursor += arena_bytes - (cursor % arena_bytes);
         }
         cursor += nbytes;
diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
index e2761c426..e7db495d4 100644
--- a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
+++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
@@ -40,6 +40,19 @@ L3L2OrchRegionDesc make_desc(RegionStorage *storage, uint64_t payload_bytes = 51
 
 size_t counter_index(uint64_t offset) { return static_cast<size_t>(offset / sizeof(int32_t)); }
 
+L3L2QueueArgs make_args(uint64_t depth, uint64_t input_arena_bytes, uint64_t output_arena_bytes) {
+    L3L2QueueLayout layout{};
+    EXPECT_TRUE(l3_l2_queue_make_layout(depth, input_arena_bytes, output_arena_bytes, &layout));
+    return L3L2QueueArgs{
+        l3_l2_queue_magic_version(), depth, input_arena_bytes, output_arena_bytes, layout.payload_bytes,
+        layout.counter_bytes,
+    };
+}
+
+L3L2OrchRegionDesc make_desc(RegionStorage *storage, const L3L2QueueArgs &args) {
+    return make_desc(storage, args.payload_bytes, args.counter_bytes);
+}
+
 void publish_input_desc(
     RegionStorage *storage, const L3L2QueueLayout &layout, uint64_t seq, L3L2QueueOpcode opcode,
     uint64_t payload_offset = 0, uint64_t payload_nbytes = 0
@@ -118,14 +131,10 @@ TEST(L3L2MessageQueueTest, LayoutRejectsInvalidDepthArenaAndCounterBytes) {
     EXPECT_FALSE(l3_l2_queue_make_layout(2, 65, 64, &layout));
 
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
+    L3L2QueueArgs args = make_args(2, 64, 64);
     EXPECT_FALSE(l3_l2_queue_validate_region(make_desc(&storage, 256, 320), args, &layout));
-    EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout));
+    EXPECT_FALSE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout));
+    EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, args), args, &layout));
 }
 
 TEST(L3L2MessageQueueTest, LayoutOverflowFailsClosedWithoutModifyingOutput) {
@@ -171,6 +180,10 @@ TEST(L3L2MessageQueueTest, Low32ReconstructionAcceptsWrapAndRejectsImpossibleDel
     EXPECT_TRUE(l3_l2_queue_reconstruct_counter(0, 4, &value));
     EXPECT_EQ(value, 0x1'0000'0000ull);
 
+    value = (1ull << 31) - 2;
+    EXPECT_TRUE(l3_l2_queue_reconstruct_counter(static_cast<int32_t>(0x8000'0001u), 4, &value));
+    EXPECT_EQ(value, (1ull << 31) + 1);
+
     value = 100;
     EXPECT_TRUE(l3_l2_queue_reconstruct_counter(104, 4, &value));
     EXPECT_EQ(value, 104u);
@@ -184,13 +197,8 @@ TEST(L3L2MessageQueueTest, Low32ReconstructionAcceptsWrapAndRejectsImpossibleDel
 
 TEST(L3L2MessageQueueTest, L2InputPeekHandlesZeroByteDescriptorBeforeArenaValidation) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueDescSlot slot{};
@@ -211,13 +219,8 @@ TEST(L3L2MessageQueueTest, L2InputPeekHandlesZeroByteDescriptorBeforeArenaValida
 
 TEST(L3L2MessageQueueTest, L2InputPeekPoisonsZeroByteDescriptorWithNonzeroOffset) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueDescSlot slot{};
@@ -232,15 +235,67 @@ TEST(L3L2MessageQueueTest, L2InputPeekPoisonsZeroByteDescriptorWithNonzeroOffset
     EXPECT_EQ(storage.counters[80], 1);
 }
 
+TEST(L3L2MessageQueueTest, L2InputPeekExposesNonzeroPayloadBytes) {
+    RegionStorage storage{};
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    const std::array<uint8_t, 4> payload{{0x11, 0x22, 0x33, 0x44}};
+    std::memcpy(storage.payload.data() + queue.layout().input_arena_offset, payload.data(), payload.size());
+    publish_input_desc(
+        &storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, payload.size()
+    );
+
+    L3L2QueueInputHandle handle{};
+    ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message;
+
+    ASSERT_EQ(handle.payload_nbytes, payload.size());
+    const auto *observed = reinterpret_cast<const uint8_t *>(static_cast<uintptr_t>(handle.payload.gm_addr));
+    EXPECT_EQ(std::memcmp(observed, payload.data(), payload.size()), 0);
+    ASSERT_TRUE(queue.input().release(handle)) << queue.error().message;
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);
+}
+
+TEST(L3L2MessageQueueTest, L2InputPeekAllowsArenaWrapAtExpectedPayloadHead) {
+    RegionStorage storage{};
+    L3L2QueueArgs args = make_args(2, 128, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 80);
+    L3L2QueueInputHandle first{};
+    ASSERT_TRUE(queue.input().try_peek(&first)) << queue.error().message;
+    ASSERT_TRUE(queue.input().release(first)) << queue.error().message;
+
+    publish_input_desc(&storage, queue.layout(), 2, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 64);
+    L3L2QueueInputHandle second{};
+    ASSERT_TRUE(queue.input().try_peek(&second)) << queue.error().message;
+
+    EXPECT_EQ(second.payload_offset, queue.layout().input_arena_offset);
+    EXPECT_EQ(second.payload_nbytes, 64u);
+    ASSERT_TRUE(queue.input().release(second)) << queue.error().message;
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE);
+}
+
+TEST(L3L2MessageQueueTest, L2InputPeekRejectsPayloadOffsetMismatchBeforeRelease) {
+    RegionStorage storage{};
+    L3L2QueueArgs args = make_args(2, 128, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset + 64, 16);
+
+    L3L2QueueInputHandle handle{};
+    EXPECT_FALSE(queue.input().try_peek(&handle));
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET)], 0);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);
+}
+
 TEST(L3L2MessageQueueTest, L2OutputReservePublishWritesDescriptorAndTail) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueOutputReservation reservation{};
@@ -260,13 +315,8 @@ TEST(L3L2MessageQueueTest, L2OutputReservePublishWritesDescriptorAndTail) {
 
 TEST(L3L2MessageQueueTest, L2OutputReserveReplaysReleasedDescriptorsBeforeReusingArena) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        4,
-        64,
-        128,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(4, 64, 128);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueOutputReservation first{};
@@ -283,13 +333,8 @@ TEST(L3L2MessageQueueTest, L2OutputReserveReplaysReleasedDescriptorsBeforeReusin
 
 TEST(L3L2MessageQueueTest, RemoteAbortObservationDoesNotSetOwnAbortFlag) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
     storage.counters[64] = 1;
 
@@ -301,13 +346,8 @@ TEST(L3L2MessageQueueTest, RemoteAbortObservationDoesNotSetOwnAbortFlag) {
 
 TEST(L3L2MessageQueueTest, OrdinaryTimeoutDoesNotSetOwnAbortFlag) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     EXPECT_EQ(queue.disambiguate_timeout(), L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT);
@@ -318,13 +358,8 @@ TEST(L3L2MessageQueueTest, OrdinaryTimeoutDoesNotSetOwnAbortFlag) {
 
 TEST(L3L2MessageQueueTest, OutputCapacityEqualsDepthAndFullIsNoProgressWithoutAbort) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     for (int i = 0; i < 2; ++i) {
@@ -342,13 +377,8 @@ TEST(L3L2MessageQueueTest, OutputCapacityEqualsDepthAndFullIsNoProgressWithoutAb
 
 TEST(L3L2MessageQueueTest, FullAndEmptyUseMonotonicCountersNotMaskedIndices) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     for (int i = 0; i < 2; ++i) {
@@ -370,13 +400,8 @@ TEST(L3L2MessageQueueTest, FullAndEmptyUseMonotonicCountersNotMaskedIndices) {
 
 TEST(L3L2MessageQueueTest, OutputReserveTooLargeIsPreMutationNoProgressWithoutAbort) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueOutputReservation reservation{};
@@ -389,13 +414,8 @@ TEST(L3L2MessageQueueTest, OutputReserveTooLargeIsPreMutationNoProgressWithoutAb
 
 TEST(L3L2MessageQueueTest, OutputPublishApplicationErrorDoesNotSetAbortFlag) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueOutputReservation reservation{};
@@ -411,13 +431,8 @@ TEST(L3L2MessageQueueTest, OutputPublishApplicationErrorDoesNotSetAbortFlag) {
 
 TEST(L3L2MessageQueueTest, OutputPublishStaleReservationPoisonsAndSetsOwnAbortFlag) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     L3L2QueueOutputReservation reservation{};
@@ -431,13 +446,8 @@ TEST(L3L2MessageQueueTest, OutputPublishStaleReservationPoisonsAndSetsOwnAbortFl
 
 TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbortFlag) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
     publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::ERROR);
 
@@ -452,13 +462,8 @@ TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbor
 
 TEST(L3L2MessageQueueTest, InputReleaseRejectsCallerMutatedHandleMetadata) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
     publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 16);
 
@@ -475,13 +480,8 @@ TEST(L3L2MessageQueueTest, InputReleaseRejectsCallerMutatedHandleMetadata) {
 
 TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidState) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
     publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::STOP);
 
@@ -499,13 +499,8 @@ TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidSt
 
 TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
 
     EXPECT_FALSE(queue.input().try_peek(nullptr));
@@ -516,13 +511,8 @@ TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort
 
 TEST(L3L2MessageQueueTest, InputSecondPeekBeforeReleasePoisonsOwnershipAndSetsOwnAbortFlag) {
     RegionStorage storage{};
-    L3L2QueueArgs args{
-        l3_l2_queue_magic_version(),
-        2,
-        64,
-        64,
-    };
-    L3L2QueueEndpoint queue(make_desc(&storage), args);
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
     ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
     publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA);
 
diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
index 04573a62b..2a83b04a9 100644
--- a/tests/ut/py/test_worker/test_l3_l2_message_queue.py
+++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py
@@ -292,6 +292,8 @@ def test_create_l3_l2_queue_allocates_region_and_exposes_l2_task_scalars():
             4,
             128,
             192,
+            queue.layout.payload_bytes,
+            queue.layout.counter_bytes,
         ]
         assert fake_client.counters == {
             queue.layout.input_desc_tail_offset: 0,
@@ -429,6 +431,23 @@ def test_dequeue_into_reads_and_releases_output():
         _close(worker, shm)
 
 
+def test_output_error_opcode_is_delivered_without_poison():
+    orch, worker, shm, fake_client = _make_orchestrator()
+    try:
+        queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128)
+        _publish_output(fake_client, queue, payload=b"error-detail", opcode=int(L3L2QueueOpcode.ERROR))
+        output = orch.alloc([12], DataType.UINT8)
+
+        message = queue.output.dequeue_into(output, timeout=0.001)
+
+        assert message.opcode == L3L2QueueOpcode.ERROR
+        assert ctypes.string_at(int(output.data), 12) == b"error-detail"
+        assert fake_client.counters[queue.layout.output_desc_head_offset] == 1
+        assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0
+    finally:
+        _close(worker, shm)
+
+
 def test_try_dequeue_into_empty_returns_none_without_abort():
     orch, worker, shm, fake_client = _make_orchestrator()
     try:

From cb937e6281910ff5cffe5300759097e9e09a5da2 Mon Sep 17 00:00:00 2001
From: ccyywwen <75376396+ccyywwen@users.noreply.github.com>
Date: Thu, 2 Jul 2026 10:26:31 +0800
Subject: [PATCH 7/7] Fix: harden L3-L2 queue descriptor validation

- Pin descriptor slot ABI with compile-time layout checks.
- Reject STOP descriptors that carry payload metadata.
- Clarify non-blocking C++ try API return semantics.
---
 docs/l3-l2-message-queue.md                        |  4 ++--
 .../platform/include/aicpu/l3_l2_message_queue.h   | 12 ++++++++++++
 tests/ut/cpp/common/test_l3_l2_message_queue.cpp   | 14 ++++++++++++++
 3 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/docs/l3-l2-message-queue.md b/docs/l3-l2-message-queue.md
index 12a0034ad..2cd2ee366 100644
--- a/docs/l3-l2-message-queue.md
+++ b/docs/l3-l2-message-queue.md
@@ -151,7 +151,7 @@ while (true) {
 
 `queue.input().try_peek(&input)` and
 `queue.output().try_reserve(nbytes, &reservation)` are non-blocking. A `false`
-return can mean no progress, timeout, validation failure, or poison; check
+return can mean ordinary no-progress, validation failure, or poison; check
 `queue.error().kind` to distinguish ordinary no-progress from terminal error.
 
 ## 2. Layout
@@ -282,7 +282,7 @@ are non-blocking and return `False` or `None` for ordinary no-progress.
 C++ blocking queue operations take `timeout_ns`; `timeout_ns == 0` is an
 immediate timeout probe. They return `false` on no-progress, timeout,
 validation failure, or poison. C++ `try_*` APIs are non-blocking and also
-return `false` for ordinary no-progress.
+return `false` for ordinary no-progress, validation failure, or poison.
 
 Timeout under ordinary backpressure is not poison. After timeout, an endpoint
 samples the peer abort flag; if the peer flag is set, the local endpoint
diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h
index 4c149ba7e..1caff863d 100644
--- a/src/common/platform/include/aicpu/l3_l2_message_queue.h
+++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h
@@ -41,6 +41,12 @@ struct L3L2QueueDescSlot {
     uint64_t payload_nbytes;
 };
 
+static_assert(sizeof(L3L2QueueDescSlot) == L3L2_QUEUE_DESC_SLOT_BYTES, "L3L2QueueDescSlot ABI size changed");
+static_assert(offsetof(L3L2QueueDescSlot, seq) == 0, "L3L2QueueDescSlot::seq offset changed");
+static_assert(offsetof(L3L2QueueDescSlot, opcode) == 8, "L3L2QueueDescSlot::opcode offset changed");
+static_assert(offsetof(L3L2QueueDescSlot, payload_offset) == 16, "L3L2QueueDescSlot::payload_offset changed");
+static_assert(offsetof(L3L2QueueDescSlot, payload_nbytes) == 24, "L3L2QueueDescSlot::payload_nbytes changed");
+
 enum class L3L2QueueOpcode : uint64_t {
     INVALID = 0,
     DATA = 1,
@@ -332,6 +338,12 @@ class L3L2QueueEndpoint {
                 parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "invalid input opcode");
                 return false;
             }
+            if (opcode == L3L2QueueOpcode::STOP && (slot.payload_offset != 0 || slot.payload_nbytes != 0)) {
+                parent_->poison(
+                    L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "STOP descriptor must be zero-byte"
+                );
+                return false;
+            }
 
             L3L2OrchPayloadView view{0, 0};
             if (slot.payload_nbytes == 0) {
diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
index e7db495d4..26e27a0f2 100644
--- a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
+++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp
@@ -497,6 +497,20 @@ TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidSt
     EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);
 }
 
+TEST(L3L2MessageQueueTest, InputStopWithPayloadMetadataPoisonsAndSetsOwnAbortFlag) {
+    RegionStorage storage{};
+    L3L2QueueArgs args = make_args(2, 64, 64);
+    L3L2QueueEndpoint queue(make_desc(&storage, args), args);
+    ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message;
+    publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::STOP, queue.layout().input_arena_offset, 8);
+
+    L3L2QueueInputHandle handle{};
+    EXPECT_FALSE(queue.input().try_peek(&handle));
+
+    EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR);
+    EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1);
+}
+
 TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort) {
     RegionStorage storage{};
     L3L2QueueArgs args = make_args(2, 64, 64);