From 5962af6d248071f58658ba60654b1826497b2b6f Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Wed, 24 Jun 2026 15:45:02 +0800 Subject: [PATCH 1/7] Add: L3-L2 message queue design - Define the staged base queue transport design and PR1/PR2 split. - Add the base implementation plan for the queue stack. --- docs/l3-l2-message-queue-base-impl.md | 798 ++++++++++++++++++++++ docs/l3-l2-message-queue-design.md | 922 ++++++++++++++++++++++++++ 2 files changed, 1720 insertions(+) create mode 100644 docs/l3-l2-message-queue-base-impl.md create mode 100644 docs/l3-l2-message-queue-design.md diff --git a/docs/l3-l2-message-queue-base-impl.md b/docs/l3-l2-message-queue-base-impl.md new file mode 100644 index 000000000..d63f446cf --- /dev/null +++ b/docs/l3-l2-message-queue-base-impl.md @@ -0,0 +1,798 @@ +# L3-L2 Message Queue Base Queue Two-PR Implementation Plan + +## 1. Scope And Platform Support + +This document covers a two-PR delivery of the base bidirectional SPSC message +queue transport described in `l3-l2-message-queue-design.md`. + +PR1 implements the core queue transport and primitive-compatible fast-path API: + +- one input queue from L3 to L2; +- one output queue from L2 to L3; +- descriptor rings and payload arenas in one primitive L3-L2 region; +- `DATA`, `ERROR`, and input-only `STOP` descriptors; +- explicit output reserve/publish on L2; +- explicit input peek/release on L2; +- L3 enqueue, output ownership/dequeue, stop, and cleanup APIs; +- non-zero L3 buffers limited to primitive-compatible registered + `orch.alloc(...)` host Tensors; +- two single-writer abort flags for timeout disambiguation; +- unit tests for ABI, layout, counters, zero-byte descriptors, queue + mechanics, and fast-path APIs. 
+ +PR2 implements the usability and end-to-end layer: + +- lazy internal staging for ordinary L3 host buffers; +- ordinary host-buffer enqueue and output read convenience paths; +- one base queue example with a small message-local AICore task; +- scene tests on supported platforms; +- final user-facing documentation cleanup. + +Neither PR includes: + +- the L2 input window helper; +- multiple active DATA input handles on L2; +- out-of-order input release; +- fragmented payload arenas; +- multiple outstanding producer reservations per direction; +- output-side STOP acknowledgement messages. + +Supported across the two PRs: + +- `a2a3` onboard; +- `a2a3sim`; +- `a5sim`. + +Not supported: + +- `a5` onboard. + +The exact Python and C++ class names may change during implementation, but the +ABI, state transitions, and observable behavior in this document are base queue +requirements. Scope tags below identify whether a requirement lands in PR1 or +PR2. + +## 2. Expected User Flow + +The final base queue should be usable without exposing descriptor offsets, +counter offsets, or payload arena cursors to application code. PR1 supports +the same operation shape with primitive-compatible registered host Tensors for +non-zero L3 buffers. PR2 relaxes that buffer requirement with lazy staging. 
+ +Expected L3 shape: + +```python +queue = orch.create_l3_l2_queue( + worker_id=0, + depth=8, + input_arena_bytes=1 << 20, + output_arena_bytes=1 << 20, +) + +for payload in input_payloads: + queue.input.enqueue(payload.buffer, nbytes=payload.nbytes, timeout=timeout_s) + +queue.input.enqueue(None, nbytes=0, timeout=timeout_s) # zero-byte DATA +queue.request_stop(timeout=timeout_s) + +while not application_done: + message = queue.output.peek(timeout=timeout_s) + output_buffer = choose_buffer(message.payload_nbytes) + queue.output.read_into(message, output_buffer) + queue.output.release(message) + handle_application_output(message) + +queue.free() +``` + +If the application already owns a large enough output buffer, it may use the +convenience path instead: + +```python +message = queue.output.dequeue_into(max_sized_output_buffer, timeout=timeout_s) +``` + +Expected base L2 shape: + +```cpp +L3L2QueueEndpoint queue(desc_scalars, queue_args); +for (;;) { + auto in = queue.input().peek(timeout); + if (in.opcode == L3L2QueueOpcode::STOP) { + queue.input().release(in); + break; + } + + auto out = queue.output().reserve(output_nbytes, timeout); + launch_message_local_aicore_work(in.payload_view, out.gm_addr); + wait_until_output_bytes_are_visible(); + queue.output().publish(out, L3L2QueueOpcode::DATA); + queue.input().release(in); +} +``` + +Application payload schema, request IDs, final-output markers, and output +cardinality are application responsibilities. PR1 transport order does not +imply request correlation beyond FIFO order within each queue direction. + +## 3. API Surface + +PR1 must expose the semantic operations below. PR2 keeps the same operation +surface and only expands accepted L3 buffer types through lazy staging. Exact +class and method names may change during implementation, but the +implementation must not require users to manipulate descriptor slots, counter +offsets, payload arena offsets, or head/tail reconstruction state directly. 
+ +Required L3 Python surface: + +```text +orch.create_l3_l2_queue( + worker_id, + depth, + input_arena_bytes, + output_arena_bytes, +) -> queue + +queue.input.enqueue(buffer_or_none, nbytes, timeout) +queue.input.try_enqueue(buffer_or_none, nbytes) + +queue.output.dequeue_into(buffer, timeout) -> message +queue.output.try_dequeue_into(buffer) -> message or no-progress + +queue.request_stop(timeout) +queue.try_request_stop() +queue.free() +``` + +L3 message results must expose at least: + +```text +seq +opcode +payload_nbytes +``` + +Convenience dequeue APIs may copy and release in one operation. PR1 must also +expose explicit output ownership APIs with these semantics: + +```text +queue.output.peek(timeout) -> message_handle +queue.output.try_peek() -> message_handle or no-progress +queue.output.read_into(message_handle, buffer) +queue.output.release(message_handle) +``` + +Required L2 C++ surface: + +```text +L3L2QueueEndpoint queue(desc_scalars, queue_args) + +queue.input().peek(timeout) -> input_handle +queue.input().try_peek() -> input_handle or no-progress +queue.input().release(input_handle) + +queue.output().reserve(nbytes, timeout) -> output_reservation +queue.output().try_reserve(nbytes) -> output_reservation or no-progress +queue.output().publish(output_reservation, opcode) +``` + +L2 input handles must expose at least: + +```text +seq +opcode +payload_nbytes +payload_view or empty payload marker +``` + +L2 output reservations must expose at least: + +```text +seq or publish sequence context +payload_offset +payload_nbytes +gm_addr for non-zero payload writes +``` + +The API must preserve these user-visible semantics: + +- finite timeouts are required for blocking operations; +- `try_*` operations return no-progress without mutating shared state when the + queue cannot make progress; +- ordinary timeout does not poison the queue unless peer abort is observed; +- zero-byte messages may pass `buffer_or_none == None`; +- PR1 non-zero L3 buffers must be 
primitive-compatible registered + `orch.alloc(...)` host Tensors; +- PR2 L3 convenience APIs accept ordinary contiguous host byte spans and lazily + stage them when they are not primitive-compatible registered tensors; +- primitive-compatible `orch.alloc(...)` host Tensors remain the fast path in + both PRs; +- output ownership APIs are the recommended path for variable-size outputs, + while `dequeue_into` remains valid when the caller supplies a large enough + target buffer; +- after successful `request_stop`, L3 input enqueue rejects later input + messages locally without poisoning; +- `ERROR` is an application-level message, not a transport exception; +- cleanup/free remains valid after local poison or remote-aborted terminal + state. + +## 4. L3 Host Buffer Contract And Lazy Staging + +The primitive L3 payload APIs require a registered, child-visible +`orch.alloc(...)` host Tensor. + +PR1 buffer contract: + +- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte + descriptor path; +- non-zero L3 input enqueue buffers must be primitive-compatible registered + `orch.alloc(...)` host Tensors; +- non-zero L3 output read targets must be primitive-compatible registered + `orch.alloc(...)` host Tensors; +- ordinary `bytes`, `bytearray`, `memoryview`, private tensors, and other + non-registered host buffers are rejected before shared-state mutation; +- rejecting a non-registered buffer is a pre-mutation validation failure and + does not poison or set an abort flag. 
+ +PR2 buffer contract: + +- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte + descriptor path; +- if the input buffer is a primitive-compatible registered `orch.alloc(...)` + host Tensor, enqueue uses it directly as the zero-extra-host-copy fast path; +- otherwise enqueue accepts an ordinary readable contiguous host byte span, + such as `bytes`, `bytearray`, `memoryview`, or a contiguous CPU tensor-like + object the implementation can view as bytes; +- non-fast-path enqueue copies the user bytes into an internal registered + staging Tensor, then issues primitive `payload_write` from that staging + Tensor. + +For L3 output read: + +- if the output target is a primitive-compatible registered `orch.alloc(...)` + host Tensor, `read_into` or `dequeue_into` uses it directly as the fast path; +- otherwise the target must be an ordinary writable contiguous host byte span; +- non-fast-path read first issues primitive `payload_read` into an internal + registered staging Tensor, then copies from staging into the user target. + +The staging Tensor is allocated lazily and owned by the queue handle. It may +grow when a later operation needs a larger staging span. The implementation +must not expose staging offsets or staging Tensor ownership to users. + +If a payload is too large for the current staging Tensor, the queue should grow +or allocate staging before issuing any primitive command. Failure to allocate +staging is a pre-mutation validation/allocation failure: it rejects the +operation, does not publish descriptors, does not release descriptors, does not +poison, and does not set an abort flag. + +Staging may add one host-to-host copy. Users that need the lowest host overhead +can pass primitive-compatible registered `orch.alloc(...)` host Tensors. + +## 5. PR1 ABI Surface + +The stable PR1 ABI is the L3/L2 shared contract. It is separate from exact +Python or C++ method names. 
+ +TaskArgs carry the primitive region descriptor followed by queue parameters: + +```text +primitive desc[0..5] +queue_magic_version +depth +input_arena_bytes +output_arena_bytes +``` + +The queue ABI version covers: + +- descriptor slot size and field order; +- opcode numeric values; +- deterministic payload layout derivation; +- counter offsets and meanings; +- head/tail low32 reconstruction rules; +- abort flag semantics; +- zero-byte descriptor canonical form; +- STOP and ERROR transport semantics. + +Descriptor slot ABI: + +```cpp +struct L3L2QueueDescSlot { + uint64_t seq; + uint64_t opcode; + uint64_t payload_offset; + uint64_t payload_nbytes; +}; +static_assert(sizeof(L3L2QueueDescSlot) == 32); +``` + +Opcode ABI: + +```text +0 invalid / never published +DATA = 1 +STOP = 2 +ERROR = 3 +``` + +Counter ABI: + +```text +offset 0: input_desc_tail writer=L3 +offset 64: input_desc_head writer=L2 +offset 128: output_desc_tail writer=L2 +offset 192: output_desc_head writer=L3 +offset 256: l3_abort_flag writer=L3 +offset 320: l2_abort_flag writer=L2 +``` + +Layout validation ABI: + +- `depth` must be a power of two and `depth <= 2^30`; +- queue capacity is `depth`, not `depth - 1`; +- descriptor slot size is 32 bytes; +- descriptor rings are 8-byte aligned; +- payload arena bases are 64-byte aligned; +- arena byte sizes are positive 64-byte multiples; +- `counter_bytes >= 384`. + +The following are not PR1 ABI: + +- exact Python class names; +- exact C++ helper class names; +- internal helper function names; +- polling backoff strategy; +- application payload schema; +- example payload format. + +## 6. ABI And Layout + +The descriptor slot ABI is the existing 32-byte format: + +```cpp +struct L3L2QueueDescSlot { + uint64_t seq; + uint64_t opcode; + uint64_t payload_offset; + uint64_t payload_nbytes; +}; +static_assert(sizeof(L3L2QueueDescSlot) == 32); +``` + +`payload_offset` is relative to the primitive payload base. 
For non-zero +message payloads, it points into the direction-local payload arena. It does not +point to the descriptor slot itself. + +The layout helper must derive all payload and counter offsets. Python may +mirror the calculation, but tests must keep the Python calculation and the C/C++ +helper in lockstep. + +PR1 counter layout: + +```text +offset 0: input_desc_tail writer=L3 +offset 64: input_desc_head writer=L2 +offset 128: output_desc_tail writer=L2 +offset 192: output_desc_head writer=L3 +offset 256: l3_abort_flag writer=L3 +offset 320: l2_abort_flag writer=L2 +``` + +`counter_bytes` must be at least 384. The abort flags are low-frequency +diagnostic signals, but they still use the same 64-byte stride as the +descriptor counters to preserve single-writer cache-line ownership. + +All six counters are initialized to zero before submitting the persistent L2 +run. Descriptor slots and payload bytes do not need to be zeroed for +correctness. + +## 7. Primitive Command Mapping + +The queue is a wrapper over the existing L3-L2 primitive commands. PR1 must not +add a new primitive command or bypass the primitive region lifetime model. + +Descriptor rings live in the primitive payload region. Descriptor slot access +therefore uses the primitive payload APIs: + +- L3 writes input descriptor slots with `L3L2OrchRegion.payload_write`; +- L3 reads output descriptor slots with `L3L2OrchRegion.payload_read`; +- L2 reads input descriptor slots with `L3L2OrchEndpoint::payload_read`; +- L2 writes output descriptor slots with `L3L2OrchEndpoint::payload_write`. 
+ +Message payload arena access also uses the primitive payload APIs when the +message payload is non-zero: + +- L3 input enqueue writes non-zero input payload bytes with + `L3L2OrchRegion.payload_write`; +- L3 output dequeue reads non-zero output payload bytes with + `L3L2OrchRegion.payload_read`; +- L2 input consume obtains a non-zero input payload GM view with + `L3L2OrchEndpoint::payload_read`; +- L2 output reserve returns a GM span in the output arena; L2 application code + or AICore work writes that span before `publish`; +- PR1 does not require a separate L2 message-payload copy API. If an + implementation uses `L3L2OrchEndpoint::payload_write` for a small L2-produced + output payload, it is only a helper for filling the reserved output arena + span before `publish`, not a separate transport path. + +Queue counters use the primitive signal APIs: + +- publishing descriptor tail, releasing descriptor head, and setting an abort + flag use `SIGNAL_NOTIFY` / `signal_notify`; +- head/tail polling uses `SIGNAL_TEST` / `signal_test` snapshots; +- timeout disambiguation samples the peer abort flag with `SIGNAL_TEST`, for + example `GE 1` against the peer flag address. + +Only a matched `SIGNAL_TEST` snapshot may drive head/tail reconstruction, +descriptor replay, payload release, or payload reuse. A failed head/tail test +does not establish acquire ordering and its observed value must not update +local queue state. For abort flags, a matched `GE 1` test reports remote abort; +an unmatched test leaves the timeout as ordinary no-progress. + +PR1 queue correctness must not depend on primitive `SIGNAL_WAIT`. Blocking +queue operations are wrapper-level bounded polling loops over `SIGNAL_TEST` +plus local queue-state checks. + +## 8. Zero-Byte Message Rules + +Zero-byte `DATA`, `ERROR`, and `STOP` descriptors are valid queue messages. +They still consume one descriptor slot and follow the normal descriptor +publication sequence. 
+ +For any descriptor with `payload_nbytes == 0`: + +- `payload_offset` must be `0`; +- `payload_offset == 0` is a canonical sentinel, not a payload address; +- the message consumes no payload arena bytes; +- producer payload cursors do not advance; +- consumer payload cursors do not advance; +- payload wrap-padding replay is skipped for that descriptor; +- no message-payload arena copy/read/view is issued. + +Descriptor-ring access is separate from message-payload arena access. +Descriptor slots live in the primitive payload region, so publishing or reading +a zero-byte message may still use primitive payload access for descriptor-ring +metadata. The rule above skips only the message payload arena path. + +Consumer validation order must make the zero-byte path explicit: + +```text +1. validate descriptor sequence; +2. validate opcode and direction legality; +3. if payload_nbytes == 0: + require payload_offset == 0; + skip direction-local arena range checks and payload replay; + else: + require payload_offset to be inside the direction-local arena; + validate contiguous span and payload cursor replay. +``` + +This ordering matters because `payload_offset == 0` for a zero-byte output +descriptor usually is not inside the output arena. A consumer that runs arena +range validation before the zero-byte branch would reject a valid descriptor. + +If a published descriptor has `payload_nbytes == 0` and `payload_offset != 0`, +the descriptor is invalid published state. The observing endpoint transitions +to `POISONED(local-infrastructure)` and sets its own abort flag. + +## 9. Queue State And Abort Flags + +PR1 uses two single-writer abort flags: + +```text +l3_abort_flag: writer=L3, reader=L2 +l2_abort_flag: writer=L2, reader=L3 +``` + +Each flag is initialized to `0`. On local infrastructure poison, the endpoint +sets its owned flag to `1` with `NotifyOp.Set`. The flag never resets within a +queue lifetime. It is a terminal boolean, not an epoch and not a poison count. 
+ +Abort flags are for timeout disambiguation. PR1 does not require every wait +loop iteration to poll both data progress and abort progress. A blocking queue +operation that reaches its timeout samples the peer abort flag: + +```text +peer abort_flag == 0: + return ordinary timeout/no-progress; + keep the local queue live; + do not set the local abort flag. + +peer abort_flag == 1: + return remote-aborted transport failure; + transition the local handle to a terminal remote-aborted state; + do not publish descriptors or advance queue state; + do not set the local abort flag solely because the peer flag was observed. +``` + +The implementation may represent terminal remote abort with the existing +`POISONED` state, but the reason must remain distinct: + +```text +POISONED(local-infrastructure): set own abort_flag = 1 +POISONED(remote-aborted): do not set own abort_flag +``` + +This distinction prevents a peer abort observation from being amplified into a +new local infrastructure poison report. + +## 10. Capacity, Counters, And Reconstruction + +`depth` is the user-visible queue capacity. A queue created with `depth=N` can +hold `N` published, unreleased descriptors. + +Validation rules: + +- `depth` must be a power of two; +- `depth <= 2^30`; +- queue capacity is `depth`, not `depth - 1`. + +Full and empty checks must use monotonic local `uint64_t` head/tail values, not +only masked ring indices: + +```text +empty iff tail == head +full iff tail - head == depth +invalid shared state iff tail - head > depth +``` + +The shared head/tail counters store only the low 32 bits. Each endpoint keeps +local `uint64_t` copies and reconstructs observed progress with signed 32-bit +delta semantics: + +```text +delta = int32_t(observed_low32 - local_low32) +valid progress: 0 <= delta <= depth +``` + +`delta == depth` is valid. A peer may legally move from empty to full between +observations. 
Negative deltas or deltas larger than `depth` are inconsistent +shared state and poison the observing endpoint. + +Descriptor slot validity does not depend on opcode or slot clearing. A +published descriptor is valid only when: + +```text +slot.seq == expected_seq +expected_seq == local_head_or_tail + 1 +slot_index == (expected_seq - 1) & (depth - 1) +``` + +Equivalent index calculations are allowed, but the sequence check must use the +full 64-bit `seq`. Descriptor slots do not need to be cleared before reuse. + +Before a producer reuses released descriptor slots or payload arena bytes, it +must replay exactly the released FIFO prefix after observing head progress. +Replay must happen before slot reuse. Zero-byte descriptors in replay advance +descriptor state only and do not advance payload cursors. + +## 11. Producer And Consumer Operation Details + +Producer sequence: + +```text +reserve -> fill/copy payload if payload_nbytes > 0 -> publish descriptor +``` + +Consumer sequence: + +```text +peek/acquire descriptor -> read/view payload if payload_nbytes > 0 +-> release descriptor and payload +``` + +Descriptor publication order: + +1. reserve a descriptor slot and, for non-zero payloads, a contiguous payload + arena span; +2. write or expose the payload bytes; +3. write descriptor fields other than `seq`; +4. write `seq` as the descriptor validity marker; +5. release-publish the tail counter. + +Descriptor release order: + +1. finish all uses of the message payload; +2. update local release and payload cursor state; +3. release-publish the head counter. + +Each direction allows at most one outstanding producer reservation. Publishing +an unknown, stale, already-published, or cross-queue +reservation is a local ownership contradiction and poisons the queue. + +The base queue has no reservation cancel. 
If a producer has successfully +reserved a non-zero payload span and later cannot safely publish either `DATA` +or application `ERROR`, it must poison the queue. If the queue remains +trustworthy, the application may publish an `ERROR` descriptor using the +reservation. + +`STOP` is an input-queue descriptor. It consumes one input descriptor slot, +uses `payload_nbytes == 0` and `payload_offset == 0`, and is terminal for L3 +input enqueue. After L3 successfully publishes `STOP`, later input `DATA`, +`ERROR`, or `STOP` attempts are rejected locally without poisoning. If L2 has +observed `STOP` and later observes another published input descriptor, the +descriptor is invalid published state and poisons the queue. + +`ERROR` remains an application-level message. Receiving `ERROR` does not poison +the queue, set an abort flag, stop either direction, or imply transport abort. + +## 12. Error Handling Rules + +The guiding rule remains: + +```text +Before shared-state mutation: reject, no poison, no abort flag. +After shared-state mutation or inconsistent shared-state observation: + poison local infrastructure, set own abort_flag. +``` + +Pre-mutation validation failures do not poison and do not set abort flags: + +- `try_enqueue` sees no descriptor or payload space; +- `try_request_stop` sees no input descriptor slot; +- a blocking operation times out under ordinary backpressure; +- payload size exceeds the arena before reservation mutates state; +- queue creation rejects invalid layout or reconstruction parameters; +- output buffer is too small before payload copy and before release; +- invalid API arguments are caught before shared state is touched; +- lazy staging allocation failure before primitive command issue; +- enqueue is attempted after L3 has already published `STOP`; +- application `ERROR` is sent or received normally. 
+ +Infrastructure poison sets the endpoint's own abort flag: + +- descriptor sequence mismatch; +- invalid opcode observed in a published descriptor; +- `STOP` observed on the output queue; +- zero-byte descriptor with non-zero `payload_offset`; +- non-zero descriptor payload range outside its direction-local arena; +- head/tail reconstruction observes impossible progress; +- payload replay observes impossible state; +- payload copy failure after command issue; +- counter notify failure; +- control-service response timeout after command issue; +- L2 endpoint fatal error for this region; +- reservation, publish, or release ownership state becomes contradictory. + +Ordinary timeout is ambiguous until the peer abort flag is sampled. A timeout +with peer abort flag `0` is not poison. A timeout with peer abort flag `1` +transitions the local handle to terminal `remote-aborted` without setting the +local abort flag. + +Cleanup and `free()` remain valid and idempotent after both local +infrastructure poison and remote-aborted terminal state. + +## 13. Example + +PR2 adds one base queue example: + +```text +examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/ +``` + +The example should demonstrate the intended user shape, not every edge case. +It must show: + +- L3 creating a queue with `depth > 1`; +- multiple variable-size input `DATA` messages; +- one zero-byte `DATA` message; +- a persistent L2 loop; +- L2 processing at most one active DATA input at a time; +- one small message-local AICore task; +- L2 publishing one output `DATA` per input `DATA`; +- L3 publishing `STOP`; +- L3 continuing to dequeue outputs after `STOP` according to application final + output rules; +- L2 releasing the `STOP` descriptor and returning from the persistent run. 
+ +The example should not demonstrate: + +- the L2 input window; +- multiple active input messages; +- one input producing multiple outputs; +- multiple inputs producing one output; +- out-of-input-order output publish; +- application `ERROR` protocol design; +- abort flag failure paths. + +The zero-byte `DATA` message should exercise the descriptor-only message path. +It should not require a child-visible zero-byte host buffer. + +## 14. Test Plan + +Both PRs require automated tests for their review-driven boundaries. A manual +review checklist is not enough. + +PR1 test scope: + +- ABI and layout; +- descriptor/counter protocol; +- zero-byte descriptor handling; +- capacity, full/empty, wrap, and low32 reconstruction; +- abort flag semantics; +- L2 endpoint API; +- L3 fast-path API with primitive-compatible registered host Tensors. + +PR2 test scope: + +- lazy internal staging for ordinary L3 host buffers; +- registered Tensor fast path remains no-staging; +- staging allocation failure is pre-mutation and non-poisoning; +- base queue example and scene coverage. 
+ +Suggested C++ unit test category: + +```text +tests/ut/cpp/common/test_l3_l2_message_queue.cpp +``` + +Suggested C++ unit tests: + +- `LayoutAssignsAbortFlagsAfterDescriptorCounters` +- `LayoutRequiresCounterBytesForSixCounters` +- `DescriptorSlotEncodingIsStable` +- `ZeroByteDescriptorUsesCanonicalOffset` +- `ZeroByteDescriptorWithNonZeroOffsetPoisons` +- `CapacityEqualsDepthAllowsNPublishedDescriptors` +- `CapacityEqualsDepthRejectsNthPlusOneDescriptor` +- `FullAndEmptyUseMonotonicCountersNotMaskedIndices` +- `Low32ReconstructionAcceptsDeltaEqualDepth` +- `Low32ReconstructionHandlesCounterWrap` +- `Low32ReconstructionRejectsNegativeDelta` +- `Low32ReconstructionRejectsDeltaGreaterThanDepth` +- `ReplaySkipsPayloadCursorAdvanceForZeroByteDescriptors` +- `ReplayBeforeSlotReuseAfterFullQueueWrap` +- `LocalInfrastructurePoisonSetsOwnAbortFlag` +- `RemoteAbortObservationDoesNotSetOwnAbortFlag` +- `OrdinaryTimeoutDoesNotSetAbortFlag` +- `ApplicationErrorDoesNotSetAbortFlag` +- `PreMutationValidationFailureDoesNotSetAbortFlag` + +Suggested Python unit test category: + +```text +tests/ut/py/test_l3_l2_message_queue.py +``` + +Suggested Python unit tests: + +- `test_layout_matches_cpp_helper` +- `test_counter_offsets_include_abort_flags` +- `test_zero_byte_enqueue_skips_payload_arena_copy` +- `test_zero_byte_dequeue_skips_payload_arena_read` +- `test_enqueue_rejects_ordinary_host_bytes_before_pr2_staging` +- `test_output_read_rejects_ordinary_buffer_before_pr2_staging` +- `test_enqueue_accepts_ordinary_host_bytes_with_lazy_staging` +- `test_enqueue_registered_tensor_uses_fast_path_without_staging` +- `test_output_read_into_ordinary_buffer_uses_lazy_staging` +- `test_staging_allocation_failure_does_not_poison` +- `test_timeout_with_peer_abort_flag_reports_remote_aborted` +- `test_timeout_without_peer_abort_flag_returns_timeout` +- `test_remote_aborted_terminal_state_rejects_later_operations` + +Suggested scene/example tests: + +```text 
+examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/ +``` + +Suggested scene cases: + +- `variable_size_messages`: enqueue/dequeue several non-zero `DATA` messages; +- `zero_byte_data`: send one zero-byte `DATA` and verify one corresponding + output is produced without payload arena bytes; +- `depth_capacity`: with `depth=N`, publish `N` inputs before backpressure; +- `fifo_stop`: publish `STOP`, drain outputs, and verify L2 exits; +- `small_aicore_work`: each non-zero input launches message-local AICore work; +- `l2_abort_flag_timeout_disambiguation`: force an L2 local infrastructure + poison, then verify L3 timeout reports remote-aborted instead of ordinary + timeout. + +The scene test matrix should include the supported simulation platforms +where practical: + +- `a2a3sim`; +- `a5sim`. + +Hardware execution should include `a2a3` onboard when device access is +available through the repository's `task-submit` workflow. diff --git a/docs/l3-l2-message-queue-design.md b/docs/l3-l2-message-queue-design.md new file mode 100644 index 000000000..414b80d02 --- /dev/null +++ b/docs/l3-l2-message-queue-design.md @@ -0,0 +1,922 @@ +# L3-L2 SPSC Message Queue Design + +## 1. Goal + +This document proposes the functional shape of an L3-L2 SPSC message queue +wrapper built on top of the existing `docs/l3-l2-orch-comm.md` primitives. + +The feature goal is to let one L3 orchestrator exchange a sequence of input +and output messages with one persistent L2 orchestrator run. L3 can enqueue +task inputs and dequeue task outputs while the L2 run stays alive. This avoids +stopping the L2 run after every task and then paying host/device finish and +init costs again for the next task. + +The target shape has two layers: + +- a base bidirectional queue transport with input and output queues; +- an L2-side input window helper that lets L2 hold multiple input messages + concurrently without changing the L3 API or the transport ABI. 
+ +The base transport should land first for reviewability. The input window can +then be added as an L2 helper policy on top of the same descriptor ABI, region +layout, counter layout, and L3 queue API. + +The queue wrapper does not change the primitive L3-L2 communication service. +It uses the existing region descriptor, payload byte range, and `int32_t` +signal counter primitives. + +## 2. Existing Primitive Constraints + +The primitive L3-L2 communication layer provides: + +- one region descriptor containing payload and counter base/size fields; +- contiguous payload byte access through `PAYLOAD_READ` and `PAYLOAD_WRITE`; +- address-based `int32_t` signal counters through `SIGNAL_NOTIFY`, + `SIGNAL_TEST`, and `SIGNAL_WAIT`; +- region lifetime, release, and poison state handling. + +The primitive layer deliberately does not define queue layout, stream headers, +opcodes, tensor schema, descriptor rings, STOP semantics, or typed tensor +metadata. The message queue wrapper owns those protocol choices. + +The primitive layer requires only 4-byte alignment for counter addresses inside +the registered counter range. The queue wrapper places high-frequency shared +counter signals at 64-byte strides so counters written by different agents do +not share a cache line. + +## 3. Public Functional Shape + +L3 creates one bidirectional queue object: + +```python +queue = orch.create_l3_l2_queue( + worker_id=0, + depth=8, + input_arena_bytes=1 << 20, + output_arena_bytes=1 << 20, +) +``` + +The L3-visible queue API exposes an input queue and an output queue. L3 sends +ordinary application messages to L2 through the input queue and receives +ordinary application messages from L2 through the output queue. + +The wrapper computes: + +- descriptor ring sizes; +- payload section offsets; +- counter offsets; +- total region payload bytes; +- total counter bytes. + +The user does not pass internal descriptor offsets, arena offsets, or counter +offsets. 
+ +The queue owns one `L3L2OrchRegion`. The L2 task receives the primitive region +descriptor plus queue layout scalars through `TaskArgs`. + +The intended L3 API shape is illustrative, but the semantics are part of the +transport contract: + +```python +queue.input.enqueue(host_buffer, nbytes=None, timeout=timeout_s) +message = queue.output.dequeue_into(host_buffer, timeout=timeout_s) +handle = queue.output.peek(timeout=timeout_s) +queue.output.read_into(handle, host_buffer) +queue.output.release(handle) +queue.request_stop(timeout=timeout_s) +queue.free() +``` + +The output ownership APIs `peek`, `read_into`, and `release` are part of the +base L3 API. They are the recommended path for variable-size outputs because +the caller can inspect `payload_nbytes` before choosing or allocating a target +buffer. Convenience APIs such as `dequeue_into` may copy and release in one +operation when the caller already has a large enough target buffer. Core APIs +that hand ownership to the caller require explicit release. + +`queue.free()` releases the L3 queue handle. It rejects later queue operations, +but it does not synchronously free device memory. Physical cleanup follows the +underlying region lifetime model. + +The L3 public queue API accepts ordinary contiguous host byte spans for +convenience enqueue and output read operations. When the supplied buffer is +already a primitive-compatible registered `orch.alloc(...)` Tensor, the queue +uses it as the zero-extra-host-copy fast path. Otherwise the queue lazily +stages through an internal registered host Tensor before issuing the primitive +payload command, then copies between that staging Tensor and the user buffer. +Zero-byte DATA and ERROR messages may pass `None` as the buffer. Staging hides +the primitive child-visible Tensor requirement from ordinary queue users, but +may add one host-to-host copy. + +The L2 input window extension is not visible to L3. 
It is an L2 helper policy +that controls how many DATA input messages L2 may hold concurrently before +releasing them in FIFO-safe order. + +## 4. Non-Goals + +- Multiple L2 orchestrators. +- Multi-producer or multi-consumer queues. +- Shared input/output payload allocator. +- Split payload spans across arena wrap. +- Dtype, shape, stride, tensor rank, or tile layout interpretation. +- Changes to `ALLOC_REGION`, `PAYLOAD_READ`, `PAYLOAD_WRITE`, + `SIGNAL_NOTIFY`, `SIGNAL_TEST`, or `SIGNAL_WAIT`. +- Exposing the L2 input window configuration through the L3 API. +- Out-of-order input payload release. +- Fragmented or hole-filled input arena allocators. +- Output-side STOP acknowledgement messages. + +## 5. Region Layout + +The physical L3-L2 region has one payload range and one counter range. The +queue wrapper divides the payload range into four logical sections: + +```text +payload region +├─ input descriptor ring +├─ output descriptor ring +├─ input payload arena +└─ output payload arena +``` + +The descriptor rings live in the payload region because they are structured +byte metadata. The counter range stores only shared head/tail signals. + +The input and output payload arenas are logically separate. This preserves SPSC +ownership: + +```text +input arena: + producer = L3 + consumer = L2 + +output arena: + producer = L2 + consumer = L3 +``` + +A shared payload allocator is intentionally out of scope because it would have +two producers and two releasers. + +The queue layout is derived, not transmitted as internal offsets. `TaskArgs` +carry the primitive region descriptor followed by four queue parameters: + +```text +primitive desc[0..5] +queue_magic_version +depth +input_arena_bytes +output_arena_bytes +``` + +The queue magic/version belongs to the queue wrapper ABI, not to the primitive +region ABI. It covers the descriptor slot format, opcode values, deterministic +layout function, head/tail reconstruction rules, and STOP/ERROR transport +semantics. 
+ +A shared C/C++ layout helper is the source of truth for derived offsets and +sizes. Python may mirror that calculation, but tests must keep the Python +calculation and the C/C++ helper in lockstep. The helper derives: + +```text +input_desc_offset +output_desc_offset +input_arena_offset +output_arena_offset +input_desc_tail = 0 +input_desc_head = 64 +output_desc_tail = 128 +output_desc_head = 192 +l3_abort_flag = 256 +l2_abort_flag = 320 +``` + +Validation rules: + +- `depth` must be a power of two and `depth <= 2^30`. +- Queue capacity is `depth` messages, not `depth - 1`. +- Descriptor slot size is fixed at 32 bytes. +- Descriptor rings are 8-byte aligned. +- Payload arena bases are 64-byte aligned. +- `input_arena_bytes` and `output_arena_bytes` must be positive 64-byte + multiples. They do not need to be powers of two. +- `counter_bytes` must be at least 384. +- `payload_bytes` must contain both descriptor rings and both payload arenas. +- Unsupported `queue_magic_version` on L2 is a fatal queue decode error for + this region. + +The L3 queue creator initializes the four shared head/tail counters and the +two abort flags to zero before submitting the persistent L2 run. Descriptor +slots and payload bytes do not need to be zeroed for correctness. + +## 6. Descriptor ABI + +Each descriptor slot is 32 bytes and is encoded as four little-endian +`uint64_t` values: + +```cpp +struct L3L2QueueDescSlot { + uint64_t seq; + uint64_t opcode; + uint64_t payload_offset; + uint64_t payload_nbytes; +}; +static_assert(sizeof(L3L2QueueDescSlot) == 32); +``` + +The queue uses 64-byte spacing for shared signal counters, not for descriptor +slots. Each descriptor ring is SPSC, so the base descriptor ABI needs only the +four transport fields above. + +`seq` is a full 64-bit infrastructure sequence number used for ring +correctness, wrap detection, diagnostics, and input-window validation. It is +not a user correlation ID. 
Applications that need request IDs, batch IDs, +partial/final markers, or other correlation should put them in their own +payload header. + +`payload_offset` is relative to the primitive region payload base, so L2 can +call `endpoint.payload_read(payload_offset, payload_nbytes, &view)` directly. + +Future descriptor extensions should use an ABI version or application payload +headers instead of reserving unused fields in every slot. + +## 7. Opcodes + +The queue transport defines these opcodes: + +```text +0 invalid / never published +DATA = 1 ordinary application payload message +STOP = 2 graceful input-side shutdown request, input queue only +ERROR = 3 ordinary application-level error payload message, either direction +``` + +`ERROR` is a normal queue message. The queue layer does not interpret its +payload, does not raise a transport exception for it, and does not poison the +queue when it sees one. Applications define whether an `ERROR` payload +correlates with a request, batch, stream, or other application state. + +Infrastructure errors are handled through poison state, not by trying to write +an `ERROR` message into a potentially untrusted queue. + +`STOP` is valid only on the input queue. The output queue has no STOP message. +L2 shutdown acknowledgement is provided by `Worker.run` drain, not by an +output STOP. Observing STOP on the output queue is invalid published +descriptor state and poisons the queue. + +DATA and ERROR may carry zero payload bytes. For any zero-byte message, +`payload_offset` must be zero and the message consumes no payload arena bytes. +STOP must also use `payload_nbytes == 0` and `payload_offset == 0`. + +## 8. Descriptor Counters And Derived Payload Cursors + +The queue shares only descriptor head/tail values through the primitive layer's +`int32_t` signal counters. 
Each shared head/tail uses a 64-byte stride: + +```text +offset 0: input_desc_tail writer=L3 +offset 64: input_desc_head writer=L2 +offset 128: output_desc_tail writer=L2 +offset 192: output_desc_head writer=L3 +offset 256: l3_abort_flag writer=L3 +offset 320: l2_abort_flag writer=L2 +``` + +`counter_bytes` must be at least 384. + +The abort flags are single-writer terminal booleans used to disambiguate +operation timeouts from remote infrastructure abort. They are initialized to +zero and set to one with `NotifyOp.Set` when the owning endpoint enters local +infrastructure poison. They do not carry application `ERROR` semantics, do not +count poison events, and do not reset within a queue lifetime. + +Blocking queue operations are not required to poll abort flags on every wait +iteration. When a blocking operation times out, the implementation samples the +peer abort flag. If the peer flag is zero, the timeout remains ordinary +no-progress and does not poison the local queue. If the peer flag is one, the +operation reports remote infrastructure abort and transitions the local handle +to a terminal remote-aborted state. Observing a peer abort flag does not set +the local endpoint's own abort flag. + +The shared descriptor counters store the low 32 bits of logical `uint64_t` +head/tail values. These values are monotonic message counts. The primitive +transports these bits through `int32_t` counters. Endpoints reconstruct local +`uint64_t` head/tail values from sampled counter values using signed 32-bit +delta semantics: + +```text +delta = int32_t(observed_low32 - local_low32) +valid progress: 0 <= delta <= depth +``` + +Negative deltas or deltas larger than `depth` are inconsistent shared state. +Queue creation rejects descriptor depths that would make head/tail +reconstruction ambiguous; that creation-time rejection is a validation error, +not a poison condition. + +Descriptor head/tail reconstruction is safe because unobserved descriptor +progress is bounded by the descriptor ring depth.
Payload byte cursors are not +shared counters and are not reconstructed from low-32-bit signal values. + +Each endpoint maintains the payload cursors it needs as local `uint64_t` +state: + +```text +producer local: + payload_tail + inferred_payload_head + +consumer local: + payload_head +``` + +The producer infers reusable payload space by observing `desc_head` +progress and replaying the released descriptors before reusing those descriptor +slots. The consumer maintains its local `payload_head` while releasing +descriptors. +Because payload cursor progress is derived from descriptor FIFO history, payload +arena size is not limited by 32-bit signal counter reconstruction. + +Queue correctness is based on reconstructed descriptor head/tail state plus +descriptor replay, not on primitive `GE` / `LT` comparison over the 32-bit +counter value. Blocking queue operations use bounded polling over `SIGNAL_TEST` +snapshots plus local queue-state checks. The timeout belongs to the wrapper +operation. The design does not require primitive `SIGNAL_WAIT` for queue +correctness. + +Local queue state may advance only after a matched `SIGNAL_TEST` snapshot. A +failed `SIGNAL_TEST` result does not establish acquire ordering, and its +`observed` value must not drive descriptor head/tail reconstruction, descriptor +replay, or payload release. Implementations should choose a comparison that +matches when the sampled counter has changed, such as `NE` against the local +low-32 value. The protocol does not prescribe a busy-poll, sleep, yield, or +backoff strategy. + +If a live endpoint observes counter, head/tail, cursor, or descriptor state that +contradicts the descriptor reconstruction or payload replay rules, that is +inconsistent shared state and poisons the queue. + +Descriptor slots carry the full 64-bit per-message `seq`, so message-level +validation does not depend on reconstructing sequence numbers from counters. +Input and output queues have independent sequence spaces. 
In each direction, +the first published message has `seq = 1`; head/tail counters start at zero and +store the number of messages published or released. A published slot has +`seq = tail_before_publish + 1`. + +## 9. Payload Arena + +Each direction has a variable-size SPSC byte arena. + +Rules: + +- `payload_tail` and `payload_head` are logical `uint64_t` byte cursors. +- Actual arena offset is `cursor % arena_bytes`. +- `arena_bytes` is limited by region allocation capacity, addressability, and + runtime memory budget, not by 32-bit signal counter reconstruction. +- A single message payload must be one contiguous span. +- A single message payload must be `<= arena_bytes`. +- Split payloads across the arena wrap are not supported. +- If remaining bytes at the arena end cannot hold the next payload, the + producer may insert invisible padding by advancing `payload_tail` to the next + arena cycle. +- Padding has no descriptor. On release, the consumer compares + `payload_head % arena_bytes` with the descriptor's arena-relative payload + offset. If they differ, the only valid base-queue case is wrap padding: the + descriptor offset is the base offset of this direction's arena and the + releaser first advances `payload_head` to the next arena cycle. It then + advances `payload_head` by `payload_nbytes`. Any other mismatch is + inconsistent shared state and poisons the queue. The same replay rule is used + by the producer after observing `desc_head` progress, before it reuses + released descriptor slots. +- Zero-byte messages do not participate in wrap-padding checks and do not + advance payload cursors. + +Backpressure must check both descriptor slots and payload arena bytes. A free +descriptor slot is not enough if the payload arena lacks enough contiguous +space. + +Payload validation is direction-local. DATA and ERROR payloads must lie wholly +inside the input arena for input descriptors, and wholly inside the output +arena for output descriptors. 
Being inside the primitive payload range is not +enough. + +## 10. Core Operation Sequence + +The queue exposes direction-specific operations. Exact class names may change, +but the operation set and ownership semantics are the transport contract. + +L3 owns the input producer and output consumer operations: + +```text +input.enqueue(buffer, nbytes, timeout) +input.try_enqueue(buffer, nbytes) +output.dequeue_into(buffer, timeout) +output.try_dequeue_into(buffer) +output.peek(timeout) -> message handle +output.try_peek() -> message handle or no-progress +output.read_into(handle, buffer) +output.release(handle) +request_stop(timeout) +try_request_stop() +free() +``` + +`dequeue_into` is the convenience path for full-message copy and release. +The `peek` / `read_into` / `release` path is the explicit-ownership path. +`free` releases the L3 queue handle, not the physical region. + +L2 owns the input consumer and output producer operations: + +```text +input.peek(timeout) -> input handle +input.try_peek() -> input handle or no-progress +input.release(handle) +output.reserve(nbytes, timeout) -> reservation +output.try_reserve(nbytes) -> reservation or no-progress +output.publish(reservation, opcode) +``` + +The L2 input window extension wraps the input consumer with additional +`complete(handle)` ownership; it does not change the base transport ABI. The +base queue has no output dequeue operation on L2 and no input enqueue operation +on L2. + +The producer sequence is: + +```text +reserve -> fill/copy payload -> publish descriptor +``` + +The consumer sequence is: + +```text +peek/acquire descriptor -> read/view payload -> release descriptor and payload +``` + +Convenience APIs are built from the core operation sequence: + +```text +enqueue = reserve + copy + publish +dequeue_into = peek + read + release +``` + +L3 input enqueue can usually use the convenience path because the input payload +already exists in a host-visible buffer. 
+ +L2 output needs the core path because it often must reserve output arena space +before launching AICore work: + +```cpp +auto out = output_queue.reserve(output_nbytes, timeout); +Tensor output = make_tensor_external(out.gm_addr, shape, rank, dtype); +// submit AICore work that writes output +// synchronize so output bytes are visible +output_queue.publish(out, L3L2QueueOpcode::DATA); +``` + +Each queue direction allows at most one outstanding producer reservation. +`publish` accepts only the current outstanding reservation for that direction. +Publishing an unknown, stale, already-published, or cross-queue reservation is +a local ownership contradiction and poisons the queue. + +The base queue does not support reservation cancel. A successful reserve must +be published. If filling the reservation fails but the queue remains +trustworthy, the application may publish an ERROR message using that +reservation. If the reservation cannot be safely published, the producer +poisons the queue. + +Descriptor publication is ordered. The producer writes payload bytes first, +writes descriptor fields, writes `seq` as the descriptor validity marker after +the other descriptor fields, and then release-publishes the tail counter. The +consumer acquire-observes tail progress before reading the slot, and +accepts the descriptor only when `slot.seq` equals the expected sequence. + +Descriptor slots do not need to be cleared before reuse. Sequence validation +distinguishes old and new contents. + +Descriptor release is ordered in the opposite direction. The consumer must +finish using the payload, update local release state, and release-publish the +head counter. The producer may replay released descriptors and infer reusable +payload space only after acquire-observing matched head progress. + +All blocking operations require finite timeouts. Nonblocking `try_*` variants +return without changing shared state when no descriptor slot, message, or +payload space is available. 
Timeout under ordinary backpressure does not +poison the queue. + +The queue layer returns transport messages to the application: + +```text +seq +opcode +payload bytes or payload view +``` + +The queue layer does not infer application request correlation from queue order +or from transport `seq`. + +Queue ownership is per message, not per byte range. Release or complete always +applies to the whole descriptor payload span. + +For L3 convenience dequeue, a too-small output buffer is a local validation +failure. The descriptor remains at the queue head, no release is published, and +the caller may retry with a larger child-visible buffer. + +## 11. Base L2 Processing Contract + +After peeking one input message, L2 application code may submit any number +of message-local AICore tasks and use runtime dependencies, manual scopes, +async notify, or other L2 orchestration features. + +The base helper and example do not overlap ownership of multiple input +messages. They keep at most one active DATA input message at a time: + +```text +peek input +reserve output +submit message-local AICore work +wait or otherwise prove message-local work is safe +publish output +release input +next message +``` + +L2 must not release an input message until AICore no longer reads that input +payload and any corresponding output has been successfully published. + +After an input is released, L2 and any in-flight AICore work must not read its +payload view again. + +The queue layer does not understand dtype, shape, stride, or tensor schema. It +returns byte views. Applications build typed tensors with their own protocol +metadata. + +## 12. L2 Input Window Extension + +The target feature shape includes an L2 input window helper. The helper lets L2 +hold multiple DATA input messages concurrently while preserving FIFO-safe input +release.
It enables application-defined output cardinality and output order: + +- one input may produce no output; +- one input may produce multiple outputs; +- several inputs may produce one output; +- status or progress outputs may be published independently; +- output publish order may differ from input acquire order. + +The L3-visible queue API is unchanged by the input window extension. L3 still +observes an input queue and an output queue. L3 receives output messages in +publish order and does not infer input/output correlation from queue order. +Correlation, aggregation, partial/final markers, request IDs, and batch IDs +belong in the application payload header. + +`max_l2_inflight` is a local L2 helper policy. It is not part of queue creation +and does not affect region layout: + +```cpp +L3L2QueueEndpoint queue(desc, layout); +L3L2InputWindow input_window( + queue.input(), + L3L2InputWindowConfig{.max_l2_inflight = 4} +); +``` + +The helper tracks input handles with these states: + +```text +ACQUIRED + Descriptor has been read. Payload view is available to L2. + +COMPLETED + Application has declared the input payload is no longer needed. + +RELEASED + Helper has advanced the input descriptor and payload cursors past this input. +``` + +The state transition is: + +```text +ACQUIRED -> COMPLETED -> RELEASED +``` + +The application owns the transition to `COMPLETED`; the helper owns the +transition to `RELEASED`. Completing an input means no future L2 code or +in-flight AICore task will read that input payload, and the payload is no +longer needed to construct future output. + +Completion is explicit. The helper must not infer completion from C++ object +destruction or lexical scope exit. A handle that is completed twice, released +twice, or destroyed while still active is a local ownership error. + +The helper releases inputs through a FIFO watermark. If inputs 10, 11, and 12 +are acquired and inputs 10 and 12 are completed, the helper may release input +10 only. 
It must not release input 12 until input 11 is also completed. This +keeps the input payload arena monotonic and avoids holes. + +Output publish remains application-driven and independent of input handles: + +```cpp +auto out = queue.output().reserve(nbytes, timeout); +// fill output directly or submit AICore work that writes out.gm_addr +queue.output().publish(out, L3L2QueueOpcode::DATA); +``` + +The input window extension does not add an output completion manager. The L2 +application owns completion tracking and decides when an output is ready to +publish. + +Output reservation and publish remain single-outstanding per direction. The +input window allows multiple active input handles; it does not introduce +multiple concurrent output reservations. + +## 13. STOP Semantics + +`STOP` is an input queue descriptor message: + +```text +seq + opcode=STOP + payload_nbytes=0 +``` + +It follows normal FIFO ordering. STOP is a graceful shutdown request, not +cancel and not an immediate no-more-output marker. + +Base helper behavior: + +- L2 exits only after processing messages before the STOP. +- L2 releases the STOP descriptor and returns from the persistent run. +- `Worker.run` drain acts as the final acknowledgement. +- No extra STOP ACK counter is required. + +Input-window behavior: + +- STOP can be acquired while earlier DATA inputs are still active. +- STOP does not take effect ahead of earlier DATA inputs. +- The helper stops acquiring further DATA inputs after STOP is observed. +- Earlier active DATA inputs continue until the application completes them. +- Outputs produced by earlier DATA inputs may still be published while the + helper drains. +- The helper releases only the FIFO completed prefix. +- Once all earlier DATA inputs are released, the helper releases STOP and the + persistent L2 run exits. + +STOP takes an input descriptor slot but does not count against +`max_l2_inflight`, because `max_l2_inflight` controls only active DATA input +ownership. 
+ +STOP is terminal for the input queue. After L3 successfully publishes STOP, +the input queue rejects further DATA, ERROR, or STOP enqueue attempts locally +without poisoning. If L2 has observed STOP and later observes any further +published input descriptor, including a second STOP, that is invalid published +descriptor state and poisons the queue. + +STOP does not close the output queue. After publishing STOP, L3 may continue +dequeueing DATA or ERROR messages from the output queue. The transport has no +output-side terminal message and does not automatically know that the +persistent L2 run has returned. Applications that need to know all business +outputs have arrived must define that condition in their payload protocol, for +example with expected counts or final markers. + +Publishing STOP and then immediately returning from the L3 orchestration +function is transport-legal. It can still be an application error if L2 needs +to publish final outputs: the output queue may fill and prevent L2 from +finishing, causing `Worker.run` drain to fail or time out. + +Convenience APIs may expose: + +```text +try_request_stop() +request_stop(timeout) +``` + +`try_request_stop()` attempts to publish a STOP descriptor to the input queue +and returns immediately if no input descriptor slot is available. + +`request_stop(timeout)` performs a bounded wait until a STOP descriptor can be +published. The timeout covers only STOP enqueue/publish. It does not wait for +L2 exit and does not drain outputs. If the timeout expires before STOP is +published, the queue remains live and is not poisoned. + +## 14. Queue Lifetime And Cleanup + +A queue owns one primitive `L3L2OrchRegion`. 
Queue cleanup follows the +underlying region cleanup path: + +```text +optional request_stop() -> L2 persistent run exits +L3 orchestration function returns +Worker.run drains submitted L2 work +runtime sends FREE_REGION for live L3-L2 regions +queue/region handles expire +``` + +`request_stop()` and `queue.free()` are different operations. `request_stop()` +is a protocol message that asks L2 to stop acquiring input. `queue.free()` is a +local handle release that rejects later queue use. Neither operation +synchronously releases the physical payload/counter region. + +Physical release is deferred until `Worker.run` has drained submitted L2 work. +This keeps region memory live while an in-flight L2 task may still hold the +primitive descriptor or payload views. If the L3 orchestration function exits +with a live queue, runtime cleanup releases it through the same region cleanup +path. + +Queue cleanup does not require the output queue to be empty. Once `Worker.run` +has drained and the persistent L2 run has returned, freeing the region is +memory-safe even if L3 left output messages unread. Those unread messages are +discarded with the region. Applications that need every output must dequeue +until their own final-output condition is satisfied before calling +`queue.free()` or returning from the orchestration function. + +## 15. Error And Poison + +Application-level failure is represented by `opcode=ERROR` and optional +application-defined payload bytes. `ERROR` is allowed in either direction and +may be published during normal processing or while draining after STOP. +Receiving `ERROR` does not poison the queue and does not change STOP +semantics. + +Infrastructure poison is a queue/region state, not a descriptor message. + +The guiding rule is: + +```text +Before shared-state mutation: reject, no poison. +After shared-state mutation or inconsistent shared-state observation: poison. +``` + +Examples that do not poison: + +- `try_enqueue` sees no space. 
+- `try_request_stop` sees no input descriptor slot. +- Blocking enqueue/dequeue/request-stop times out under ordinary backpressure. +- Payload is larger than the arena before reserve mutates state. +- Queue creation rejects ambiguous descriptor head/tail reconstruction + parameters. +- User buffer is too small before read copies payload bytes. +- Invalid API arguments are caught before touching shared state. + +Examples that poison: + +- descriptor sequence mismatch; +- invalid opcode observed in a published descriptor; +- STOP observed on the output queue; +- descriptor payload range outside its arena; +- descriptor head/tail reconstruction or payload replay observes impossible + shared state; +- payload copy failure after command issue; +- counter notify failure; +- control-service response timeout after command issue; +- L2 endpoint fatal error for this region; +- reservation, publish, or release state becomes self-contradictory. + +Ordinary queue operation timeout does not prove remote poison. After a +blocking operation times out, the endpoint samples the peer abort flag. If the +peer flag is still zero, the timeout remains ordinary no-progress and does not +poison the local queue. If the peer flag is one, the endpoint reports remote +infrastructure abort and transitions its local handle to a terminal +remote-aborted state without setting its own abort flag. The peer may also +observe primitive region fatal errors or `Worker.run` drain errors. + +Only local infrastructure poison sets the endpoint's own abort flag. Ordinary +timeouts, application `ERROR` messages, pre-mutation validation failures, and +observing the peer's abort flag do not set it. 
+ +The L2 input window helper also poisons the queue when local ownership state +becomes contradictory: + +- completing an input handle unknown to the helper; +- completing or releasing a handle twice; +- attempting to release a non-contiguous input while earlier inputs remain + incomplete; +- acquiring DATA after STOP has put the helper into draining; +- observing an acquired input sequence that contradicts the helper window. + +The Python queue object mirrors the existing region state model: + +```text +LIVE +RELEASED +POISONED(local-infrastructure) +POISONED(remote-aborted) +EXPIRED +``` + +After poison, all reserve, enqueue, peek, read, release, publish, and stop-request +operations are rejected. Cleanup/free remains idempotent and valid. + +L2 C++ helper poison reports a fatal error including the primitive region id, +so existing Host-side parsing can poison the corresponding region. + +## 16. Implementation Staging + +The feature can be implemented in two review-friendly stages. This staging is +not an API boundary: the base transport should intentionally leave room for +the input window without later ABI or L3 API changes.
+ +```text +Stage 1: + base SPSC message queue transport + input and output descriptor rings + input and output payload arenas + descriptor head/tail protocol over int32_t signal counters + single-writer abort flags for timeout disambiguation + derived uint64_t payload cursors via descriptor replay + DATA / ERROR / input-only STOP + one active DATA input in the L2 helper/example + +Stage 2: + L2 input window helper + max_l2_inflight + application-driven input complete + FIFO-safe release of completed input prefix + flexible output cardinality and out-of-input-order output publish + FIFO STOP drain with earlier DATA inputs still active +``` + +Stage 1 intentionally leaves room for Stage 2 through these hook points: + +- descriptor `seq` is explicit and 64-bit; +- input release is explicit, not tied to dequeue; +- output reserve and publish are separate; +- each direction has at most one outstanding producer reservation; +- application correlation is kept in payload, so queue transport does not + assume one input maps to one output; +- L3 queue creation and output ownership/dequeue APIs do not depend on + `max_l2_inflight`. + +Expected implementation locations: + +```text +python/simpler/l3_l2_message_queue.py +src/common/platform/include/aicpu/l3_l2_message_queue.h +docs/l3-l2-message-queue.md +examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/ +examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue_input_window/ +``` + +The exact Python module and public API names may change during implementation, +but the transport contract should remain stable. + +## 17. 
Tests And Examples + +Base queue tests should cover: + +- layout calculation; +- descriptor slot encoding; +- counter offset assignment; +- queue creation rejecting ambiguous descriptor head/tail reconstruction + parameters; +- enqueue reserve failure for payload larger than arena; +- backpressure when descriptor ring is full; +- backpressure when payload arena is full; +- arena wrap with invisible padding; +- STOP descriptor handling; +- `try_request_stop` and `request_stop(timeout)` behavior; +- ERROR as a normal application message in either direction; +- L3 ordinary host-buffer enqueue/read through lazy staging; +- L3 primitive-compatible registered Tensor fast paths without staging; +- staging allocation failure before primitive command issue not poisoning the + queue; +- abort flags distinguishing ordinary timeout from remote infrastructure + abort; +- local infrastructure poison setting the local abort flag; +- remote-aborted terminal state not setting the local abort flag; +- poison on invalid published descriptor state; +- poison on descriptor head/tail reconstruction or payload replay + inconsistency; +- no poison on pre-mutation validation failure. + +The new example should be parallel to the existing primitive stream example, +not a replacement for it. The primitive stream example should remain as the +minimal demonstration of `docs/l3-l2-orch-comm.md`. + +The base queue example should demonstrate: + +- `depth > 1`; +- variable-size input and output payloads; +- input and output backpressure; +- L2 persistent loop; +- one input message containing message-local AICore work; +- FIFO STOP shutdown; +- L3 optionally dequeuing output after STOP according to application final + output rules. 
+ +Input window tests and examples should cover: + +- `max_l2_inflight > 1`; +- refusing to acquire new DATA input when the input window is full; +- multiple input messages acquired before earlier inputs release; +- application-driven input completion; +- releasing only the FIFO completed prefix; +- one input producing multiple outputs; +- multiple inputs producing one output; +- output publish order differing from input acquire order; +- output correlation stored in the application payload header; +- STOP entering draining while earlier DATA inputs remain active; +- output DATA or ERROR publish during STOP drain; +- local ownership errors poisoning the queue. + +Future work beyond the staged implementation is limited to out-of-order input +payload release, fragmented payload arena allocation, abort reason/status +metadata, low-latency abort polling, or concurrent output reservations, if +those become necessary. From 105de41b4e51a6759c07944d6d4ed0f783b51dab Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Fri, 26 Jun 2026 18:24:32 +0800 Subject: [PATCH 2/7] Add: L3-L2 message queue core transport - Implement the PR1 L3 queue wrapper and L2 endpoint ABI on top of the primitive L3-L2 orchestration region transport. - Wire Orchestrator.create_l3_l2_queue and cover descriptor layout, zero-byte messages, abort flags, capacity, and fast-path buffers in Python and C++ unit tests. 
--- python/simpler/l3_l2_message_queue.py | 534 ++++++++++++++ python/simpler/orchestrator.py | 14 + .../include/aicpu/l3_l2_message_queue.h | 659 +++++++++++++++++ tests/ut/cpp/CMakeLists.txt | 17 + .../cpp/common/test_l3_l2_message_queue.cpp | 495 +++++++++++++ .../test_worker/test_l3_l2_message_queue.py | 666 ++++++++++++++++++ 6 files changed, 2385 insertions(+) create mode 100644 python/simpler/l3_l2_message_queue.py create mode 100644 src/common/platform/include/aicpu/l3_l2_message_queue.h create mode 100644 tests/ut/cpp/common/test_l3_l2_message_queue.cpp create mode 100644 tests/ut/py/test_worker/test_l3_l2_message_queue.py diff --git a/python/simpler/l3_l2_message_queue.py b/python/simpler/l3_l2_message_queue.py new file mode 100644 index 000000000..462554650 --- /dev/null +++ b/python/simpler/l3_l2_message_queue.py @@ -0,0 +1,534 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ----------------------------------------------------------------------------------------------------------- +"""L3-side L3-L2 SPSC message queue wrapper.""" + +from __future__ import annotations + +import ctypes +import struct +import time +from dataclasses import dataclass +from enum import IntEnum +from typing import Any + +from .l3_l2_orch_comm import ( + L3L2OrchCommCmd, + L3L2OrchCommRequest, + L3L2OrchRegion, + NotifyOp, + WaitCmp, +) +from .task_interface import DataType, Tensor + +L3L2_QUEUE_MAGIC = 0x4C335132 +L3L2_QUEUE_ABI_MAJOR = 1 +L3L2_QUEUE_ABI_MINOR = 0 +L3L2_QUEUE_DESC_SLOT_BYTES = 32 +L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64 +L3L2_QUEUE_COUNTER_STRIDE = 64 +L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET = 0 +L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET = 64 +L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET = 128 +L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET = 192 +L3L2_QUEUE_L3_ABORT_FLAG_OFFSET = 256 +L3L2_QUEUE_L2_ABORT_FLAG_OFFSET = 320 +L3L2_QUEUE_COUNTER_BYTES = 384 +L3L2_QUEUE_MAX_DEPTH = 1 << 30 + +_DESC = struct.Struct("<4Q") +_POLL_INTERVAL_S = 0.00005 + + +class L3L2QueueOpcode(IntEnum): + INVALID = 0 + DATA = 1 + STOP = 2 + ERROR = 3 + + +class _QueueState(IntEnum): + LIVE = 0 + RELEASED = 1 + POISONED_LOCAL = 2 + POISONED_REMOTE = 3 + EXPIRED = 4 + + +@dataclass(frozen=True) +class L3L2QueueLayout: + depth: int + input_desc_offset: int + output_desc_offset: int + input_arena_offset: int + output_arena_offset: int + input_arena_bytes: int + output_arena_bytes: int + payload_bytes: int + input_desc_tail_offset: int + input_desc_head_offset: int + output_desc_tail_offset: int + output_desc_head_offset: int + l3_abort_flag_offset: int + l2_abort_flag_offset: int + counter_bytes: int + + +@dataclass(frozen=True) +class L3L2QueueMessage: + seq: int + opcode: L3L2QueueOpcode + payload_offset: int + payload_nbytes: int + + +def l3_l2_queue_magic_version() -> int: + return (L3L2_QUEUE_MAGIC << 32) | (L3L2_QUEUE_ABI_MAJOR << 16) | L3L2_QUEUE_ABI_MINOR + + +def _align_up(value: int, align: 
int) -> int: + remainder = value % align + return value if remainder == 0 else value + (align - remainder) + + +def make_l3_l2_queue_layout(depth: int, input_arena_bytes: int, output_arena_bytes: int) -> L3L2QueueLayout: + depth = int(depth) + input_arena_bytes = int(input_arena_bytes) + output_arena_bytes = int(output_arena_bytes) + if depth <= 0 or depth & (depth - 1) != 0 or depth > L3L2_QUEUE_MAX_DEPTH: + raise ValueError("L3-L2 queue depth must be a power of two and <= 2^30") + if input_arena_bytes <= 0 or input_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0: + raise ValueError("L3-L2 queue input_arena_bytes must be a positive 64-byte multiple") + if output_arena_bytes <= 0 or output_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0: + raise ValueError("L3-L2 queue output_arena_bytes must be a positive 64-byte multiple") + + desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES + input_desc_offset = 0 + output_desc_offset = input_desc_offset + desc_ring_bytes + input_arena_offset = _align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT) + output_arena_offset = _align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT) + payload_bytes = output_arena_offset + output_arena_bytes + return L3L2QueueLayout( + depth=depth, + input_desc_offset=input_desc_offset, + output_desc_offset=output_desc_offset, + input_arena_offset=input_arena_offset, + output_arena_offset=output_arena_offset, + input_arena_bytes=input_arena_bytes, + output_arena_bytes=output_arena_bytes, + payload_bytes=payload_bytes, + input_desc_tail_offset=L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET, + input_desc_head_offset=L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET, + output_desc_tail_offset=L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET, + output_desc_head_offset=L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET, + l3_abort_flag_offset=L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, + l2_abort_flag_offset=L3L2_QUEUE_L2_ABORT_FLAG_OFFSET, + counter_bytes=L3L2_QUEUE_COUNTER_BYTES, + ) + + +def 
create_l3_l2_queue( + orch: Any, + *, + worker_id: int, + depth: int, + input_arena_bytes: int, + output_arena_bytes: int, +) -> L3L2Queue: + layout = make_l3_l2_queue_layout(depth, input_arena_bytes, output_arena_bytes) + region = orch.create_l3_l2_region( + worker_id=int(worker_id), + payload_bytes=layout.payload_bytes, + counter_bytes=layout.counter_bytes, + ) + desc_fields = orch.alloc([24], DataType.UINT8) + desc_seq = orch.alloc([8], DataType.UINT8) + desc_read = orch.alloc([L3L2_QUEUE_DESC_SLOT_BYTES], DataType.UINT8) + for offset in ( + layout.input_desc_tail_offset, + layout.input_desc_head_offset, + layout.output_desc_tail_offset, + layout.output_desc_head_offset, + layout.l3_abort_flag_offset, + layout.l2_abort_flag_offset, + ): + region.counter(offset).notify(0, NotifyOp.Set) + return L3L2Queue(orch, region, layout, desc_fields, desc_seq, desc_read) + + +class L3L2Queue: + def __init__( + self, + orch: Any, + region: L3L2OrchRegion, + layout: L3L2QueueLayout, + desc_fields: Tensor, + desc_seq: Tensor, + desc_read: Tensor, + ) -> None: + self._orch = orch + self._region = region + self._layout = layout + self._desc_fields = desc_fields + self._desc_seq = desc_seq + self._desc_read = desc_read + self._state = _QueueState.LIVE + self._input_head = 0 + self._input_tail = 0 + self._output_head = 0 + self._output_tail = 0 + self._input_payload_tail = 0 + self._input_payload_head = 0 + self._output_payload_head = 0 + self._output_active: L3L2QueueMessage | None = None + self._stop_published = False + self.input = _L3InputQueue(self) + self.output = _L3OutputQueue(self) + + @property + def region(self) -> L3L2OrchRegion: + return self._region + + @property + def layout(self) -> L3L2QueueLayout: + return self._layout + + @property + def magic_version(self) -> int: + return l3_l2_queue_magic_version() + + def l2_task_arg_scalars(self) -> list[int]: + self._ensure_live() + return [ + *self._region.descriptor_scalars(), + self.magic_version, + self._layout.depth, + 
self._layout.input_arena_bytes, + self._layout.output_arena_bytes, + ] + + def try_request_stop(self) -> bool: + return self.input._try_enqueue(None, 0, L3L2QueueOpcode.STOP) + + def request_stop(self, timeout: float) -> None: + self.input._enqueue(None, 0, L3L2QueueOpcode.STOP, timeout) + + def free(self) -> None: + if self._state == _QueueState.RELEASED: + return + self._state = _QueueState.RELEASED + self._region.free() + + def _ensure_live(self) -> None: + if self._state == _QueueState.RELEASED: + raise RuntimeError("L3-L2 queue has been released") + if self._state == _QueueState.POISONED_REMOTE: + raise RuntimeError("L3-L2 queue is remote-aborted") + if self._state == _QueueState.POISONED_LOCAL: + raise RuntimeError("L3-L2 queue is poisoned") + if self._state == _QueueState.EXPIRED: + raise RuntimeError("L3-L2 queue expired after orchestration run") + if getattr(self._region, "_expired", False): + self._state = _QueueState.EXPIRED + raise RuntimeError("L3-L2 queue expired after orchestration run") + self._region._ensure_live() + + def _validate_registered_buffer(self, buffer: Any, nbytes: int) -> Tensor: + if not isinstance(buffer, Tensor): + raise ValueError("L3-L2 queue PR1 requires a registered Tensor returned by orch.alloc(...)") + self._region._owner._validate_l3_l2_orch_comm_host_buffer(buffer) + if int(nbytes) > int(buffer.nbytes()): + raise ValueError(f"L3-L2 queue nbytes={nbytes} exceeds registered Tensor size {int(buffer.nbytes())}") + return buffer + + def _refresh_counter(self, offset: int, local_value: int, depth: int) -> int: + result = self._signal_test(offset, local_value & 0xFFFF_FFFF, WaitCmp.NE) + if not result.matched: + return local_value + observed = int(result.observed) & 0xFFFF_FFFF + local_low = local_value & 0xFFFF_FFFF + delta = ctypes.c_int32((observed - local_low) & 0xFFFF_FFFF).value + if delta < 0 or delta > depth: + self._poison_local() + raise RuntimeError("L3-L2 queue counter reconstruction failed") + return local_value + 
delta + + def _sample_peer_abort_after_timeout(self) -> None: + result = self._signal_test(self._layout.l2_abort_flag_offset, 1, WaitCmp.GE) + if result.matched: + self._state = _QueueState.POISONED_REMOTE + raise RuntimeError("L3-L2 queue remote abort observed") + raise TimeoutError("L3-L2 queue operation timed out") + + def _poison_local(self) -> None: + if self._state != _QueueState.LIVE: + return + self._state = _QueueState.POISONED_LOCAL + try: + self._region._owner._l3_l2_orch_comm_submit( + self._region._worker_id, + L3L2OrchCommRequest( + cmd=L3L2OrchCommCmd.SIGNAL_NOTIFY, + op=int(NotifyOp.Set), + region_id=self._region.region_id, + counter_addr=int(self._region.descriptor.counter_base) + self._layout.l3_abort_flag_offset, + counter_operand=1, + ), + 5.0, + ) + except Exception: + pass + + def _run_primitive(self, fn: Any, *args: Any, **kwargs: Any) -> Any: + try: + return fn(*args, **kwargs) + except Exception: + self._poison_local() + raise + + def _signal_test(self, offset: int, cmp_value: int, cmp: WaitCmp) -> Any: + return self._run_primitive(lambda: self._region.counter(offset).test(cmp_value, cmp)) + + def _signal_notify(self, offset: int, value: int) -> None: + self._run_primitive(lambda: self._region.counter(offset).notify(value, NotifyOp.Set)) + + def _write_descriptor( + self, offset: int, seq: int, opcode: L3L2QueueOpcode, payload_offset: int, nbytes: int + ) -> None: + fields_buf = (ctypes.c_uint8 * 24).from_address(int(self._desc_fields.data)) + fields_buf[:] = _DESC.pack(0, int(opcode), int(payload_offset), int(nbytes))[8:] + seq_buf = (ctypes.c_uint8 * 8).from_address(int(self._desc_seq.data)) + seq_buf[:] = struct.pack(" L3L2QueueMessage: + self._run_primitive(self._region.payload_read, offset, self._desc_read, nbytes=L3L2_QUEUE_DESC_SLOT_BYTES) + raw = ctypes.string_at(int(self._desc_read.data), L3L2_QUEUE_DESC_SLOT_BYTES) + seq, opcode_value, payload_offset, payload_nbytes = _DESC.unpack(raw) + try: + opcode = 
L3L2QueueOpcode(opcode_value) + except ValueError: + self._poison_local() + raise RuntimeError("L3-L2 queue observed invalid descriptor opcode") from None + return L3L2QueueMessage( + seq=int(seq), + opcode=opcode, + payload_offset=int(payload_offset), + payload_nbytes=int(payload_nbytes), + ) + + def _advance_payload_head( + self, + cursor: int, + payload_offset: int, + payload_nbytes: int, + arena_offset: int, + arena_bytes: int, + ) -> int: + if payload_nbytes == 0: + return cursor + expected_offset = arena_offset + (cursor % arena_bytes) + if expected_offset != payload_offset: + if payload_offset != arena_offset: + self._poison_local() + raise RuntimeError("L3-L2 queue payload replay offset mismatch") + cursor += arena_bytes - (cursor % arena_bytes) + return cursor + payload_nbytes + + def _replay_released_input_descriptors(self, old_head: int, new_head: int) -> None: + cursor = old_head + while cursor < new_head: + slot_index = cursor & (self._layout.depth - 1) + slot_offset = self._layout.input_desc_offset + slot_index * L3L2_QUEUE_DESC_SLOT_BYTES + message = self._read_descriptor(slot_offset) + if message.seq != cursor + 1: + self._poison_local() + raise RuntimeError("L3-L2 queue input release replay seq mismatch") + self._input_payload_head = self._advance_payload_head( + self._input_payload_head, + message.payload_offset, + message.payload_nbytes, + self._layout.input_arena_offset, + self._layout.input_arena_bytes, + ) + cursor += 1 + + +class _L3InputQueue: + def __init__(self, queue: L3L2Queue) -> None: + self._queue = queue + + def enqueue(self, buffer_or_none: Any, nbytes: int, timeout: float) -> None: + self._enqueue(buffer_or_none, nbytes, L3L2QueueOpcode.DATA, timeout) + + def try_enqueue(self, buffer_or_none: Any, nbytes: int) -> bool: + return self._try_enqueue(buffer_or_none, nbytes, L3L2QueueOpcode.DATA) + + def _enqueue(self, buffer_or_none: Any, nbytes: int, opcode: L3L2QueueOpcode, timeout: float) -> None: + if timeout is None or 
float(timeout) <= 0: + raise ValueError("L3-L2 queue blocking operations require a positive timeout") + deadline = time.monotonic() + float(timeout) + while True: + if self._try_enqueue(buffer_or_none, nbytes, opcode): + return + if self._queue._stop_published: + raise RuntimeError("L3-L2 queue input is stopped") + if time.monotonic() >= deadline: + self._queue._sample_peer_abort_after_timeout() + time.sleep(_POLL_INTERVAL_S) + + def _try_enqueue(self, buffer_or_none: Any, nbytes: int, opcode: L3L2QueueOpcode) -> bool: + queue = self._queue + nbytes = int(nbytes) + if nbytes < 0: + raise ValueError("L3-L2 queue nbytes must be non-negative") + payload_tensor = None + if nbytes == 0: + if buffer_or_none is not None: + raise ValueError("L3-L2 queue zero-byte enqueue requires buffer_or_none == None") + else: + payload_tensor = queue._validate_registered_buffer(buffer_or_none, nbytes) + + queue._ensure_live() + if queue._stop_published: + return False + if opcode == L3L2QueueOpcode.STOP and nbytes != 0: + raise ValueError("L3-L2 queue STOP must be zero-byte") + + old_head = queue._input_head + queue._input_head = queue._refresh_counter( + queue._layout.input_desc_head_offset, queue._input_head, queue._layout.depth + ) + if queue._input_head != old_head: + queue._replay_released_input_descriptors(old_head, queue._input_head) + if queue._input_tail - queue._input_head >= queue._layout.depth: + return False + if nbytes > queue._layout.input_arena_bytes: + return False + + payload_offset = 0 + if nbytes != 0: + arena_pos = queue._input_payload_tail % queue._layout.input_arena_bytes + if arena_pos + nbytes > queue._layout.input_arena_bytes: + queue._input_payload_tail += queue._layout.input_arena_bytes - arena_pos + arena_pos = 0 + if queue._input_payload_tail + nbytes - queue._input_payload_head > queue._layout.input_arena_bytes: + return False + payload_offset = queue._layout.input_arena_offset + arena_pos + queue._run_primitive(queue._region.payload_write, payload_offset, 
payload_tensor, nbytes=nbytes) + queue._input_payload_tail += nbytes + + seq = queue._input_tail + 1 + slot_index = queue._input_tail & (queue._layout.depth - 1) + slot_offset = queue._layout.input_desc_offset + slot_index * L3L2_QUEUE_DESC_SLOT_BYTES + queue._write_descriptor(slot_offset, seq, opcode, payload_offset, nbytes) + queue._input_tail += 1 + queue._signal_notify(queue._layout.input_desc_tail_offset, queue._input_tail) + if opcode == L3L2QueueOpcode.STOP: + queue._stop_published = True + return True + + +class _L3OutputQueue: + def __init__(self, queue: L3L2Queue) -> None: + self._queue = queue + + def try_peek(self) -> L3L2QueueMessage | None: + queue = self._queue + queue._ensure_live() + if queue._output_active is not None: + return queue._output_active + queue._output_tail = queue._refresh_counter( + queue._layout.output_desc_tail_offset, queue._output_tail, queue._layout.depth + ) + if queue._output_tail == queue._output_head: + return None + slot_index = queue._output_head & (queue._layout.depth - 1) + slot_offset = queue._layout.output_desc_offset + slot_index * L3L2_QUEUE_DESC_SLOT_BYTES + message = queue._read_descriptor(slot_offset) + if message.seq != queue._output_head + 1: + queue._poison_local() + raise RuntimeError("L3-L2 queue output descriptor seq mismatch") + if message.opcode == L3L2QueueOpcode.STOP: + queue._poison_local() + raise RuntimeError("L3-L2 queue output descriptor cannot be STOP") + if message.payload_nbytes == 0: + if message.payload_offset != 0: + queue._poison_local() + raise RuntimeError("L3-L2 queue zero-byte output descriptor has nonzero offset") + else: + begin = queue._layout.output_arena_offset + end = begin + queue._layout.output_arena_bytes + if message.payload_offset < begin or message.payload_offset + message.payload_nbytes > end: + queue._poison_local() + raise RuntimeError("L3-L2 queue output payload outside output arena") + queue._advance_payload_head( + queue._output_payload_head, + message.payload_offset, + 
message.payload_nbytes, + queue._layout.output_arena_offset, + queue._layout.output_arena_bytes, + ) + queue._output_active = message + return message + + def peek(self, timeout: float) -> L3L2QueueMessage: + if timeout is None or float(timeout) <= 0: + raise ValueError("L3-L2 queue blocking operations require a positive timeout") + deadline = time.monotonic() + float(timeout) + while True: + message = self.try_peek() + if message is not None: + return message + if time.monotonic() >= deadline: + self._queue._sample_peer_abort_after_timeout() + time.sleep(_POLL_INTERVAL_S) + + def read_into(self, handle: L3L2QueueMessage, buffer: Any) -> None: + queue = self._queue + queue._ensure_live() + if queue._output_active != handle: + raise RuntimeError("L3-L2 queue output handle is not active") + if handle.payload_nbytes == 0: + if buffer is not None: + raise ValueError("L3-L2 queue zero-byte output read requires buffer == None") + return + target = queue._validate_registered_buffer(buffer, handle.payload_nbytes) + queue._run_primitive(queue._region.payload_read, handle.payload_offset, target, nbytes=handle.payload_nbytes) + + def release(self, handle: L3L2QueueMessage) -> None: + queue = self._queue + queue._ensure_live() + if queue._output_active != handle: + queue._poison_local() + raise RuntimeError("L3-L2 queue output handle is not active") + queue._output_payload_head = queue._advance_payload_head( + queue._output_payload_head, + handle.payload_offset, + handle.payload_nbytes, + queue._layout.output_arena_offset, + queue._layout.output_arena_bytes, + ) + queue._output_head += 1 + queue._output_active = None + queue._signal_notify(queue._layout.output_desc_head_offset, queue._output_head) + + def dequeue_into(self, buffer: Any, timeout: float) -> L3L2QueueMessage: + handle = self.peek(timeout) + self.read_into(handle, buffer) + self.release(handle) + return handle + + def try_dequeue_into(self, buffer: Any) -> L3L2QueueMessage | None: + handle = self.try_peek() + if 
handle is None: + return None + self.read_into(handle, buffer) + self.release(handle) + return handle diff --git a/python/simpler/orchestrator.py b/python/simpler/orchestrator.py index 87ec02e16..f998b48af 100644 --- a/python/simpler/orchestrator.py +++ b/python/simpler/orchestrator.py @@ -359,6 +359,20 @@ def create_l3_l2_region(self, *, worker_id: int, payload_bytes: int, counter_byt raise RuntimeError("create_l3_l2_region requires an Orchestrator bound to a Worker") return self._worker._create_l3_l2_region(int(worker_id), int(payload_bytes), int(counter_bytes)) + def create_l3_l2_queue(self, *, worker_id: int, depth: int, input_arena_bytes: int, output_arena_bytes: int): + """Create an L3-L2 message queue backed by one L3-L2 communication region.""" + if self._worker is None: + raise RuntimeError("create_l3_l2_queue requires an Orchestrator bound to a Worker") + from .l3_l2_message_queue import create_l3_l2_queue # noqa: PLC0415 + + return create_l3_l2_queue( + self, + worker_id=int(worker_id), + depth=int(depth), + input_arena_bytes=int(input_arena_bytes), + output_arena_bytes=int(output_arena_bytes), + ) + # ------------------------------------------------------------------ # Nested scope (Strict-1 per-scope rings) # ------------------------------------------------------------------ diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h new file mode 100644 index 000000000..383785c54 --- /dev/null +++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h @@ -0,0 +1,659 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#ifndef SRC_COMMON_PLATFORM_INCLUDE_AICPU_L3_L2_MESSAGE_QUEUE_H_ +#define SRC_COMMON_PLATFORM_INCLUDE_AICPU_L3_L2_MESSAGE_QUEUE_H_ + +#include +#include +#include + +#include "aicpu/l3_l2_orch_endpoint.h" + +static constexpr uint32_t L3L2_QUEUE_MAGIC = 0x4C335132u; // "L3Q2" +static constexpr uint16_t L3L2_QUEUE_ABI_MAJOR = 1; +static constexpr uint16_t L3L2_QUEUE_ABI_MINOR = 0; +static constexpr uint64_t L3L2_QUEUE_DESC_SLOT_BYTES = 32; +static constexpr uint64_t L3L2_QUEUE_DESC_RING_ALIGNMENT = 8; +static constexpr uint64_t L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64; +static constexpr uint64_t L3L2_QUEUE_COUNTER_STRIDE = 64; +static constexpr uint64_t L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET = 0; +static constexpr uint64_t L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET = 64; +static constexpr uint64_t L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET = 128; +static constexpr uint64_t L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET = 192; +static constexpr uint64_t L3L2_QUEUE_L3_ABORT_FLAG_OFFSET = 256; +static constexpr uint64_t L3L2_QUEUE_L2_ABORT_FLAG_OFFSET = 320; +static constexpr uint64_t L3L2_QUEUE_COUNTER_BYTES = 384; +static constexpr uint64_t L3L2_QUEUE_MAX_DEPTH = 1ull << 30; + +struct L3L2QueueDescSlot { + uint64_t seq; + uint64_t opcode; + uint64_t payload_offset; + uint64_t payload_nbytes; +}; + +enum class L3L2QueueOpcode : uint64_t { + INVALID = 0, + DATA = 1, + STOP = 2, + ERROR = 3, +}; + +enum class L3L2QueueErrorKind : uint32_t { + NONE = 0, + BAD_ARGUMENT = 1, + BAD_DESCRIPTOR = 2, + INVALID_DESCRIPTOR = 3, + OUT_OF_SPACE = 4, + OWNERSHIP = 5, + REMOTE_ABORTED = 6, + 
ENDPOINT_ERROR = 7, +}; + +enum class L3L2QueueTimeoutStatus : uint32_t { + ORDINARY_TIMEOUT = 0, + REMOTE_ABORTED = 1, +}; + +struct L3L2QueueError { + L3L2QueueErrorKind kind; + const char *op; + uint64_t region_id; + const char *message; +}; + +struct L3L2QueueLayout { + uint64_t depth; + uint64_t input_desc_offset; + uint64_t output_desc_offset; + uint64_t input_arena_offset; + uint64_t output_arena_offset; + uint64_t input_arena_bytes; + uint64_t output_arena_bytes; + uint64_t payload_bytes; + uint64_t input_desc_tail_offset; + uint64_t input_desc_head_offset; + uint64_t output_desc_tail_offset; + uint64_t output_desc_head_offset; + uint64_t l3_abort_flag_offset; + uint64_t l2_abort_flag_offset; + uint64_t counter_bytes; +}; + +struct L3L2QueueArgs { + uint64_t magic_version; + uint64_t depth; + uint64_t input_arena_bytes; + uint64_t output_arena_bytes; +}; + +struct L3L2QueueInputHandle { + uint64_t seq; + L3L2QueueOpcode opcode; + uint64_t payload_offset; + uint64_t payload_nbytes; + L3L2OrchPayloadView payload; +}; + +struct L3L2QueueOutputReservation { + uint64_t seq; + uint64_t payload_offset; + uint64_t payload_nbytes; + L3L2OrchPayloadView payload; + bool valid; +}; + +static inline uint64_t l3_l2_queue_magic_version() { + return l3_l2_orch_comm_pack_magic_version(L3L2_QUEUE_MAGIC, L3L2_QUEUE_ABI_MAJOR, L3L2_QUEUE_ABI_MINOR); +} + +static inline bool l3_l2_queue_is_power_of_two(uint64_t value) { return value != 0 && (value & (value - 1)) == 0; } + +static inline uint64_t l3_l2_queue_align_up(uint64_t value, uint64_t align) { + if (align == 0) { + return value; + } + uint64_t remainder = value % align; + return remainder == 0 ? 
value : value + (align - remainder); +} + +static inline bool l3_l2_queue_valid_opcode(L3L2QueueOpcode opcode) { + return opcode == L3L2QueueOpcode::DATA || opcode == L3L2QueueOpcode::STOP || opcode == L3L2QueueOpcode::ERROR; +} + +static inline bool +l3_l2_queue_make_layout(uint64_t depth, uint64_t input_arena_bytes, uint64_t output_arena_bytes, L3L2QueueLayout *out) { + if (out == nullptr || !l3_l2_queue_is_power_of_two(depth) || depth > L3L2_QUEUE_MAX_DEPTH || + input_arena_bytes == 0 || output_arena_bytes == 0 || + input_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0 || + output_arena_bytes % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT != 0) { + return false; + } + + uint64_t desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES; + uint64_t input_desc_offset = 0; + uint64_t output_desc_offset = input_desc_offset + desc_ring_bytes; + uint64_t input_arena_offset = + l3_l2_queue_align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT); + uint64_t output_arena_offset = + l3_l2_queue_align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT); + if (l3_l2_orch_comm_add_overflows(output_arena_offset, output_arena_bytes)) { + return false; + } + + *out = L3L2QueueLayout{ + depth, + input_desc_offset, + output_desc_offset, + input_arena_offset, + output_arena_offset, + input_arena_bytes, + output_arena_bytes, + output_arena_offset + output_arena_bytes, + L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET, + L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET, + L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET, + L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET, + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, + L3L2_QUEUE_L2_ABORT_FLAG_OFFSET, + L3L2_QUEUE_COUNTER_BYTES, + }; + return output_desc_offset % L3L2_QUEUE_DESC_RING_ALIGNMENT == 0 && + input_arena_offset % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT == 0 && + output_arena_offset % L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT == 0; +} + +static inline bool +l3_l2_queue_validate_region(const L3L2OrchRegionDesc &desc, const L3L2QueueArgs &args, 
L3L2QueueLayout *out_layout) { + L3L2QueueLayout layout{}; + if (args.magic_version != l3_l2_queue_magic_version() || + l3_l2_orch_comm_validate_desc(desc) != L3L2OrchCommValidationError::OK || + !l3_l2_queue_make_layout(args.depth, args.input_arena_bytes, args.output_arena_bytes, &layout)) { + return false; + } + if (desc.payload_bytes < layout.payload_bytes || desc.counter_bytes < layout.counter_bytes) { + return false; + } + if (out_layout != nullptr) { + *out_layout = layout; + } + return true; +} + +static inline void l3_l2_queue_encode_desc( + L3L2QueueDescSlot *slot, uint64_t seq, L3L2QueueOpcode opcode, uint64_t payload_offset, uint64_t payload_nbytes +) { + if (slot == nullptr) { + return; + } + slot->seq = seq; + slot->opcode = static_cast<uint64_t>(opcode); + slot->payload_offset = payload_offset; + slot->payload_nbytes = payload_nbytes; +} + +static inline bool l3_l2_queue_reconstruct_counter(int32_t observed_low32, uint64_t depth, uint64_t *local_value) { + if (local_value == nullptr || depth > L3L2_QUEUE_MAX_DEPTH) { + return false; + } + uint32_t local_low32 = static_cast<uint32_t>(*local_value); + int32_t delta = static_cast<int32_t>(static_cast<uint32_t>(observed_low32) - local_low32); + if (delta < 0 || static_cast<uint64_t>(delta) > depth) { + return false; + } + *local_value += static_cast<uint64_t>(delta); + return true; +} + +class L3L2QueueEndpoint { +public: + class InputQueue { + public: + explicit InputQueue(L3L2QueueEndpoint *parent) : + parent_(parent) {} + + bool peek(uint64_t timeout_ns, L3L2QueueInputHandle *out) { + if (out == nullptr) { + return false; + } + uint64_t start = l3_l2_orch_endpoint_now(); + uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz(); + while (true) { + if (try_peek(out)) { + return true; + } + if (parent_->error_.kind != L3L2QueueErrorKind::NONE) { + return false; + } + uint64_t now = l3_l2_orch_endpoint_now(); + if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) { + parent_->disambiguate_timeout(); + return 
false; + } + } + } + + bool try_peek(L3L2QueueInputHandle *out) { + if (out != nullptr) { + *out = L3L2QueueInputHandle{0, L3L2QueueOpcode::INVALID, 0, 0, L3L2OrchPayloadView{0, 0}}; + } + if (!parent_->ensure_live("input.try_peek") || out == nullptr) { + return false; + } + if (active_) { + parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "input.try_peek", "input handle already active"); + return false; + } + if (!parent_->refresh_counter( + parent_->layout_.input_desc_tail_offset, parent_->input_tail_, parent_->layout_.depth, + "input.try_peek" + )) { + return false; + } + if (stopped_) { + if (parent_->input_tail_ != parent_->input_head_) { + parent_->poison( + L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", + "input descriptor published after STOP" + ); + } + return false; + } + if (parent_->input_tail_ == parent_->input_head_) { + return false; + } + if (parent_->input_tail_ - parent_->input_head_ > parent_->layout_.depth) { + parent_->poison( + L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input descriptor state invalid" + ); + return false; + } + + L3L2QueueDescSlot slot{}; + uint64_t slot_index = parent_->input_head_ & (parent_->layout_.depth - 1); + uint64_t slot_offset = parent_->layout_.input_desc_offset + slot_index * sizeof(L3L2QueueDescSlot); + if (!parent_->read_desc_slot(slot_offset, &slot, "input.try_peek")) { + return false; + } + uint64_t expected_seq = parent_->input_head_ + 1; + if (slot.seq != expected_seq) { + parent_->poison( + L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input descriptor seq mismatch" + ); + return false; + } + L3L2QueueOpcode opcode = static_cast<L3L2QueueOpcode>(slot.opcode); + if (!l3_l2_queue_valid_opcode(opcode)) { + parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "invalid input opcode"); + return false; + } + + L3L2OrchPayloadView view{0, 0}; + if (slot.payload_nbytes == 0) { + if (slot.payload_offset != 0) { + parent_->poison( + L3L2QueueErrorKind::INVALID_DESCRIPTOR, 
"input.try_peek", + "zero-byte descriptor uses nonzero payload offset" + ); + return false; + } + } else if (!parent_->payload_in_arena( + slot.payload_offset, slot.payload_nbytes, parent_->layout_.input_arena_offset, + parent_->layout_.input_arena_bytes + )) { + parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input payload out of arena"); + return false; + } else if (!parent_->endpoint_.payload_read(slot.payload_offset, slot.payload_nbytes, &view)) { + parent_->poison( + L3L2QueueErrorKind::ENDPOINT_ERROR, "input.try_peek", parent_->endpoint_.error().message + ); + return false; + } + + *out = L3L2QueueInputHandle{slot.seq, opcode, slot.payload_offset, slot.payload_nbytes, view}; + active_ = true; + active_seq_ = slot.seq; + return true; + } + + bool release(const L3L2QueueInputHandle &handle) { + if (!parent_->ensure_live("input.release")) { + return false; + } + if (!active_ || handle.seq != active_seq_ || handle.seq != parent_->input_head_ + 1) { + parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "input.release", "input handle is not active"); + return false; + } + if (handle.payload_nbytes != 0) { + parent_->advance_payload_head( + parent_->input_payload_head_, handle.payload_offset, handle.payload_nbytes, + parent_->layout_.input_arena_offset, parent_->layout_.input_arena_bytes, "input.release" + ); + if (parent_->error_.kind != L3L2QueueErrorKind::NONE) { + return false; + } + } + parent_->input_head_ += 1; + if (handle.opcode == L3L2QueueOpcode::STOP) { + stopped_ = true; + } + active_ = false; + active_seq_ = 0; + return parent_->notify_counter( + parent_->layout_.input_desc_head_offset, static_cast(parent_->input_head_), "input.release" + ); + } + + private: + L3L2QueueEndpoint *parent_; + bool active_{false}; + uint64_t active_seq_{0}; + bool stopped_{false}; + }; + + class OutputQueue { + public: + explicit OutputQueue(L3L2QueueEndpoint *parent) : + parent_(parent) {} + + bool reserve(uint64_t nbytes, uint64_t timeout_ns, 
L3L2QueueOutputReservation *out) { + if (out == nullptr) { + return false; + } + uint64_t start = l3_l2_orch_endpoint_now(); + uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz(); + while (true) { + if (try_reserve(nbytes, out)) { + return true; + } + if (parent_->error_.kind != L3L2QueueErrorKind::NONE) { + return false; + } + uint64_t now = l3_l2_orch_endpoint_now(); + if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) { + parent_->disambiguate_timeout(); + return false; + } + } + } + + bool try_reserve(uint64_t nbytes, L3L2QueueOutputReservation *out) { + if (out != nullptr) { + *out = L3L2QueueOutputReservation{0, 0, 0, L3L2OrchPayloadView{0, 0}, false}; + } + if (!parent_->ensure_live("output.try_reserve") || out == nullptr) { + return false; + } + if (reservation_active_) { + parent_->poison( + L3L2QueueErrorKind::OWNERSHIP, "output.try_reserve", "output reservation already active" + ); + return false; + } + if (nbytes > parent_->layout_.output_arena_bytes) { + return false; + } + uint64_t old_head = parent_->output_head_; + if (!parent_->refresh_counter( + parent_->layout_.output_desc_head_offset, parent_->output_head_, parent_->layout_.depth, + "output.try_reserve" + )) { + return false; + } + if (parent_->output_head_ != old_head && + !parent_->replay_output_releases(old_head, parent_->output_head_, "output.try_reserve")) { + return false; + } + if (parent_->output_tail_ - parent_->output_head_ >= parent_->layout_.depth) { + return false; + } + + uint64_t payload_offset = 0; + L3L2OrchPayloadView view{0, 0}; + if (nbytes != 0) { + uint64_t arena_base = parent_->layout_.output_arena_offset; + uint64_t arena_bytes = parent_->layout_.output_arena_bytes; + uint64_t arena_pos = parent_->output_payload_tail_ % arena_bytes; + if (arena_pos + nbytes > arena_bytes) { + parent_->output_payload_tail_ += arena_bytes - arena_pos; + arena_pos = 0; + } + if (parent_->output_payload_tail_ + nbytes - 
parent_->output_payload_head_ > arena_bytes) { + return false; + } + payload_offset = arena_base + arena_pos; + view = L3L2OrchPayloadView{parent_->endpoint_.descriptor().payload_base + payload_offset, nbytes}; + parent_->output_payload_tail_ += nbytes; + } + + reservation_active_ = true; + reservation_seq_ = parent_->output_tail_ + 1; + reservation_offset_ = payload_offset; + reservation_nbytes_ = nbytes; + *out = L3L2QueueOutputReservation{reservation_seq_, payload_offset, nbytes, view, true}; + return true; + } + + bool publish(const L3L2QueueOutputReservation &reservation, L3L2QueueOpcode opcode) { + if (!parent_->ensure_live("output.publish")) { + return false; + } + if (!reservation_active_ || !reservation.valid || reservation.seq != reservation_seq_ || + reservation.payload_offset != reservation_offset_ || + reservation.payload_nbytes != reservation_nbytes_) { + parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "output.publish", "unknown output reservation"); + return false; + } + if (opcode == L3L2QueueOpcode::STOP || !l3_l2_queue_valid_opcode(opcode)) { + parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "output.publish", "invalid output opcode"); + return false; + } + L3L2QueueDescSlot slot{}; + l3_l2_queue_encode_desc(&slot, 0, opcode, reservation.payload_offset, reservation.payload_nbytes); + uint64_t slot_index = parent_->output_tail_ & (parent_->layout_.depth - 1); + uint64_t slot_offset = parent_->layout_.output_desc_offset + slot_index * sizeof(L3L2QueueDescSlot); + if (!parent_->write_desc_slot(slot_offset, slot, reservation.seq, "output.publish")) { + return false; + } + parent_->output_tail_ += 1; + reservation_active_ = false; + reservation_seq_ = 0; + reservation_offset_ = 0; + reservation_nbytes_ = 0; + return parent_->notify_counter( + parent_->layout_.output_desc_tail_offset, static_cast(parent_->output_tail_), "output.publish" + ); + } + + private: + L3L2QueueEndpoint *parent_; + bool reservation_active_{false}; + uint64_t 
reservation_seq_{0}; + uint64_t reservation_offset_{0}; + uint64_t reservation_nbytes_{0}; + }; + + L3L2QueueEndpoint(const L3L2OrchRegionDesc &desc, const L3L2QueueArgs &args) : + endpoint_(desc), + input_queue_(this), + output_queue_(this) { + if (endpoint_.error().kind != L3L2EndpointErrorKind::NONE || + !l3_l2_queue_validate_region(desc, args, &layout_)) { + set_error(L3L2QueueErrorKind::BAD_DESCRIPTOR, "init", desc.region_id, "invalid queue descriptor"); + } + } + + const L3L2QueueError &error() const { return error_; } + const L3L2QueueLayout &layout() const { return layout_; } + InputQueue &input() { return input_queue_; } + OutputQueue &output() { return output_queue_; } + + L3L2QueueTimeoutStatus disambiguate_timeout() { + if (error_.kind != L3L2QueueErrorKind::NONE) { + return error_.kind == L3L2QueueErrorKind::REMOTE_ABORTED ? L3L2QueueTimeoutStatus::REMOTE_ABORTED : + L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT; + } + L3L2OrchSignalTestResult result{}; + uint64_t addr = 0; + if (!endpoint_.counter_addr(layout_.l3_abort_flag_offset, &addr) || + !endpoint_.signal_test(addr, 1, L3L2OrchWaitCmp::GE, &result)) { + poison(L3L2QueueErrorKind::ENDPOINT_ERROR, "timeout", endpoint_.error().message); + return L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT; + } + if (result.matched) { + set_error(L3L2QueueErrorKind::REMOTE_ABORTED, "timeout", endpoint_.descriptor().region_id, "remote abort"); + return L3L2QueueTimeoutStatus::REMOTE_ABORTED; + } + return L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT; + } + +private: + bool ensure_live(const char *op) { + if (error_.kind == L3L2QueueErrorKind::NONE) { + return true; + } + (void)op; + return false; + } + + void set_error(L3L2QueueErrorKind kind, const char *op, uint64_t region_id, const char *message) { + if (error_.kind != L3L2QueueErrorKind::NONE) { + return; + } + error_ = L3L2QueueError{kind, op, region_id, message}; + } + + void poison(L3L2QueueErrorKind kind, const char *op, const char *message) { + set_error(kind, op, 
endpoint_.descriptor().region_id, message); + if (kind != L3L2QueueErrorKind::REMOTE_ABORTED) { + uint64_t addr = 0; + if (endpoint_.counter_addr(layout_.l2_abort_flag_offset, &addr)) { + endpoint_.signal_notify(addr, 1, L3L2OrchNotifyOp::Set); + } + } + } + + bool notify_counter(uint64_t offset, int32_t value, const char *op) { + uint64_t addr = 0; + if (!endpoint_.counter_addr(offset, &addr) || !endpoint_.signal_notify(addr, value, L3L2OrchNotifyOp::Set)) { + poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message); + return false; + } + return true; + } + + bool refresh_counter(uint64_t offset, uint64_t &local, uint64_t depth, const char *op) { + uint64_t addr = 0; + L3L2OrchSignalTestResult result{}; + if (!endpoint_.counter_addr(offset, &addr) || + !endpoint_.signal_test(addr, static_cast(local), L3L2OrchWaitCmp::NE, &result)) { + poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message); + return false; + } + if (!result.matched) { + return true; + } + if (!l3_l2_queue_reconstruct_counter(result.observed, depth, &local)) { + poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "counter reconstruction failed"); + return false; + } + return true; + } + + bool read_desc_slot(uint64_t slot_offset, L3L2QueueDescSlot *slot, const char *op) { + L3L2OrchPayloadView view{}; + if (!endpoint_.payload_read(slot_offset, sizeof(L3L2QueueDescSlot), &view)) { + poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message); + return false; + } + memcpy(slot, reinterpret_cast(static_cast(view.gm_addr)), sizeof(L3L2QueueDescSlot)); + return true; + } + + bool write_desc_slot(uint64_t slot_offset, const L3L2QueueDescSlot &slot, uint64_t seq, const char *op) { + L3L2QueueDescSlot fields = slot; + fields.seq = 0; + if (!endpoint_.payload_write(slot_offset + offsetof(L3L2QueueDescSlot, opcode), &fields.opcode, 24)) { + poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message); + return false; + } + if 
(!endpoint_.payload_write(slot_offset + offsetof(L3L2QueueDescSlot, seq), &seq, sizeof(seq))) { + poison(L3L2QueueErrorKind::ENDPOINT_ERROR, op, endpoint_.error().message); + return false; + } + return true; + } + + bool payload_in_arena(uint64_t offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes) const { + if (nbytes == 0 || l3_l2_orch_comm_add_overflows(offset, nbytes)) { + return false; + } + return offset >= arena_offset && offset + nbytes <= arena_offset + arena_bytes; + } + + void advance_payload_head( + uint64_t &cursor, uint64_t payload_offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes, + const char *op + ) { + uint64_t expected_offset = arena_offset + (cursor % arena_bytes); + if (expected_offset != payload_offset) { + if (payload_offset != arena_offset) { + poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch"); + return; + } + cursor += arena_bytes - (cursor % arena_bytes); + } + cursor += nbytes; + } + + bool replay_output_releases(uint64_t old_head, uint64_t new_head, const char *op) { + uint64_t cursor = old_head; + while (cursor < new_head) { + L3L2QueueDescSlot slot{}; + uint64_t slot_index = cursor & (layout_.depth - 1); + uint64_t slot_offset = layout_.output_desc_offset + slot_index * sizeof(L3L2QueueDescSlot); + if (!read_desc_slot(slot_offset, &slot, op)) { + return false; + } + if (slot.seq != cursor + 1) { + poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "output release replay seq mismatch"); + return false; + } + if (slot.payload_nbytes != 0) { + advance_payload_head( + output_payload_head_, slot.payload_offset, slot.payload_nbytes, layout_.output_arena_offset, + layout_.output_arena_bytes, op + ); + if (error_.kind != L3L2QueueErrorKind::NONE) { + return false; + } + } + cursor += 1; + } + return true; + } + + L3L2OrchEndpoint endpoint_; + L3L2QueueLayout layout_{}; + L3L2QueueError error_{L3L2QueueErrorKind::NONE, "", 0, ""}; + uint64_t input_head_{0}; + uint64_t 
input_tail_{0}; + uint64_t output_head_{0}; + uint64_t output_tail_{0}; + uint64_t input_payload_head_{0}; + uint64_t output_payload_head_{0}; + uint64_t output_payload_tail_{0}; + InputQueue input_queue_; + OutputQueue output_queue_; +}; + +#endif // SRC_COMMON_PLATFORM_INCLUDE_AICPU_L3_L2_MESSAGE_QUEUE_H_ diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 5fe6dd186..d4fcc497f 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -369,6 +369,23 @@ add_common_utils_test(test_elf_build_id common/test_elf_build_id.cpp) add_common_utils_test(test_runtime_orch_so common/test_runtime_orch_so.cpp) add_common_utils_test(test_device_arena common/test_device_arena.cpp) add_common_utils_test(test_l3_l2_orch_comm common/test_l3_l2_orch_comm.cpp) +add_executable(test_l3_l2_message_queue + common/test_l3_l2_message_queue.cpp + stubs/test_stubs.cpp +) +target_include_directories(test_l3_l2_message_queue PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/platform/include +) +target_link_libraries(test_l3_l2_message_queue PRIVATE + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_l3_l2_message_queue COMMAND test_l3_l2_message_queue) +set_tests_properties(test_l3_l2_message_queue PROPERTIES LABELS "no_hardware") + add_executable(test_l3_l2_orch_endpoint common/test_l3_l2_orch_endpoint.cpp stubs/test_stubs.cpp diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp new file mode 100644 index 000000000..409da4763 --- /dev/null +++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp @@ -0,0 +1,495 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ + +#include +#include +#include +#include +#include + +#include + +#include "aicpu/l3_l2_message_queue.h" + +namespace { + +struct RegionStorage { + alignas(64) std::array payload{}; + alignas(64) std::array counters{}; +}; + +L3L2OrchRegionDesc make_desc(RegionStorage *storage, uint64_t payload_bytes = 512, uint64_t counter_bytes = 512) { + return L3L2OrchRegionDesc{ + l3_l2_orch_comm_magic_version(), + 19, + reinterpret_cast(storage->payload.data()), + payload_bytes, + reinterpret_cast(storage->counters.data()), + counter_bytes, + }; +} + +size_t counter_index(uint64_t offset) { return static_cast(offset / sizeof(int32_t)); } + +void publish_input_desc( + RegionStorage *storage, const L3L2QueueLayout &layout, uint64_t seq, L3L2QueueOpcode opcode, + uint64_t payload_offset = 0, uint64_t payload_nbytes = 0 +) { + L3L2QueueDescSlot slot{}; + l3_l2_queue_encode_desc(&slot, seq, opcode, payload_offset, payload_nbytes); + uint64_t desc_offset = layout.input_desc_offset + ((seq - 1) & (layout.depth - 1)) * sizeof(L3L2QueueDescSlot); + std::memcpy(storage->payload.data() + desc_offset, &slot, sizeof(slot)); + storage->counters[counter_index(layout.input_desc_tail_offset)] = static_cast(seq); +} + +TEST(L3L2MessageQueueTest, LayoutAssignsPayloadAndAbortCounterOffsets) { + L3L2QueueLayout layout{}; + + ASSERT_TRUE(l3_l2_queue_make_layout(4, 128, 192, &layout)); + + EXPECT_EQ(layout.input_desc_offset, 0u); + EXPECT_EQ(layout.output_desc_offset, 4u * sizeof(L3L2QueueDescSlot)); + 
EXPECT_EQ(layout.input_arena_offset % 64u, 0u); + EXPECT_EQ(layout.output_arena_offset % 64u, 0u); + EXPECT_EQ(layout.input_desc_tail_offset, 0u); + EXPECT_EQ(layout.input_desc_head_offset, 64u); + EXPECT_EQ(layout.output_desc_tail_offset, 128u); + EXPECT_EQ(layout.output_desc_head_offset, 192u); + EXPECT_EQ(layout.l3_abort_flag_offset, 256u); + EXPECT_EQ(layout.l2_abort_flag_offset, 320u); + EXPECT_EQ(layout.counter_bytes, 384u); + EXPECT_GE(layout.payload_bytes, layout.output_arena_offset + 192u); +} + +TEST(L3L2MessageQueueTest, LayoutLockstepCasesMatchPythonMirrorExpectations) { + struct LayoutCase { + uint64_t depth; + uint64_t input_arena_bytes; + uint64_t output_arena_bytes; + uint64_t output_desc_offset; + uint64_t input_arena_offset; + uint64_t output_arena_offset; + uint64_t payload_bytes; + }; + + const std::array cases{{ + {1, 64, 64, 32, 64, 128, 192}, + {4, 128, 192, 128, 256, 384, 576}, + {8, 192, 64, 256, 512, 704, 768}, + }}; + + for (const auto &test_case : cases) { + L3L2QueueLayout layout{}; + ASSERT_TRUE( + l3_l2_queue_make_layout(test_case.depth, test_case.input_arena_bytes, test_case.output_arena_bytes, &layout) + ); + + EXPECT_EQ(layout.input_desc_offset, 0u); + EXPECT_EQ(layout.output_desc_offset, test_case.output_desc_offset); + EXPECT_EQ(layout.output_desc_offset, test_case.depth * sizeof(L3L2QueueDescSlot)); + EXPECT_EQ(layout.input_arena_offset, test_case.input_arena_offset); + EXPECT_EQ(layout.output_arena_offset, test_case.output_arena_offset); + EXPECT_EQ(layout.payload_bytes, test_case.payload_bytes); + EXPECT_EQ(layout.input_desc_tail_offset, 0u); + EXPECT_EQ(layout.input_desc_head_offset, 64u); + EXPECT_EQ(layout.output_desc_tail_offset, 128u); + EXPECT_EQ(layout.output_desc_head_offset, 192u); + EXPECT_EQ(layout.l3_abort_flag_offset, 256u); + EXPECT_EQ(layout.l2_abort_flag_offset, 320u); + EXPECT_EQ(layout.counter_bytes, 384u); + } +} + +TEST(L3L2MessageQueueTest, LayoutRejectsInvalidDepthArenaAndCounterBytes) { + L3L2QueueLayout 
layout{}; + + EXPECT_FALSE(l3_l2_queue_make_layout(3, 64, 64, &layout)); + EXPECT_FALSE(l3_l2_queue_make_layout((1ull << 30) + 1, 64, 64, &layout)); + EXPECT_FALSE(l3_l2_queue_make_layout(2, 0, 64, &layout)); + EXPECT_FALSE(l3_l2_queue_make_layout(2, 65, 64, &layout)); + + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + EXPECT_FALSE(l3_l2_queue_validate_region(make_desc(&storage, 256, 320), args, &layout)); + EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout)); +} + +TEST(L3L2MessageQueueTest, DescriptorSlotEncodingIsStable) { + static_assert(std::is_standard_layout::value, "descriptor must be POD-like"); + static_assert(std::is_trivially_copyable::value, "descriptor must be fixed-size"); + + EXPECT_EQ(sizeof(L3L2QueueDescSlot), 32u); + EXPECT_EQ(offsetof(L3L2QueueDescSlot, seq), 0u); + EXPECT_EQ(offsetof(L3L2QueueDescSlot, opcode), 8u); + EXPECT_EQ(offsetof(L3L2QueueDescSlot, payload_offset), 16u); + EXPECT_EQ(offsetof(L3L2QueueDescSlot, payload_nbytes), 24u); + + L3L2QueueDescSlot slot{}; + l3_l2_queue_encode_desc(&slot, 7, L3L2QueueOpcode::ERROR, 128, 16); + EXPECT_EQ(slot.seq, 7u); + EXPECT_EQ(slot.opcode, 3u); + EXPECT_EQ(slot.payload_offset, 128u); + EXPECT_EQ(slot.payload_nbytes, 16u); +} + +TEST(L3L2MessageQueueTest, Low32ReconstructionAcceptsWrapAndRejectsImpossibleDeltas) { + uint64_t value = 0xFFFF'FFFFull; + + EXPECT_TRUE(l3_l2_queue_reconstruct_counter(0, 4, &value)); + EXPECT_EQ(value, 0x1'0000'0000ull); + + value = 100; + EXPECT_TRUE(l3_l2_queue_reconstruct_counter(104, 4, &value)); + EXPECT_EQ(value, 104u); + + value = 100; + EXPECT_FALSE(l3_l2_queue_reconstruct_counter(99, 4, &value)); + + value = 100; + EXPECT_FALSE(l3_l2_queue_reconstruct_counter(105, 4, &value)); +} + +TEST(L3L2MessageQueueTest, L2InputPeekHandlesZeroByteDescriptorBeforeArenaValidation) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + 
L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueDescSlot slot{}; + l3_l2_queue_encode_desc(&slot, 1, L3L2QueueOpcode::DATA, 0, 0); + std::memcpy(storage.payload.data() + queue.layout().input_desc_offset, &slot, sizeof(slot)); + storage.counters[0] = 1; + + L3L2QueueInputHandle handle{}; + ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message; + + EXPECT_EQ(handle.seq, 1u); + EXPECT_EQ(handle.opcode, L3L2QueueOpcode::DATA); + EXPECT_EQ(handle.payload_nbytes, 0u); + EXPECT_EQ(handle.payload.gm_addr, 0u); + EXPECT_TRUE(queue.input().release(handle)) << queue.error().message; + EXPECT_EQ(storage.counters[16], 1); +} + +TEST(L3L2MessageQueueTest, L2InputPeekPoisonsZeroByteDescriptorWithNonzeroOffset) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueDescSlot slot{}; + l3_l2_queue_encode_desc(&slot, 1, L3L2QueueOpcode::DATA, 8, 0); + std::memcpy(storage.payload.data() + queue.layout().input_desc_offset, &slot, sizeof(slot)); + storage.counters[0] = 1; + + L3L2QueueInputHandle handle{}; + EXPECT_FALSE(queue.input().try_peek(&handle)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR); + EXPECT_EQ(storage.counters[80], 1); +} + +TEST(L3L2MessageQueueTest, L2OutputReservePublishWritesDescriptorAndTail) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueOutputReservation reservation{}; + ASSERT_TRUE(queue.output().try_reserve(16, &reservation)) << queue.error().message; + EXPECT_EQ(reservation.payload_nbytes, 16u); + 
EXPECT_NE(reservation.payload.gm_addr, 0u); + + ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message; + + L3L2QueueDescSlot slot{}; + std::memcpy(&slot, storage.payload.data() + queue.layout().output_desc_offset, sizeof(slot)); + EXPECT_EQ(slot.seq, 1u); + EXPECT_EQ(slot.opcode, 1u); + EXPECT_EQ(slot.payload_nbytes, 16u); + EXPECT_EQ(storage.counters[32], 1); +} + +TEST(L3L2MessageQueueTest, L2OutputReserveReplaysReleasedDescriptorsBeforeReusingArena) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 4, + 64, + 128, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueOutputReservation first{}; + ASSERT_TRUE(queue.output().try_reserve(80, &first)) << queue.error().message; + ASSERT_EQ(first.payload_offset, queue.layout().output_arena_offset); + ASSERT_TRUE(queue.output().publish(first, L3L2QueueOpcode::DATA)) << queue.error().message; + + storage.counters[48] = 1; + L3L2QueueOutputReservation second{}; + ASSERT_TRUE(queue.output().try_reserve(80, &second)) << queue.error().message; + + EXPECT_EQ(second.payload_offset, queue.layout().output_arena_offset); +} + +TEST(L3L2MessageQueueTest, RemoteAbortObservationDoesNotSetOwnAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + storage.counters[64] = 1; + + EXPECT_EQ(queue.disambiguate_timeout(), L3L2QueueTimeoutStatus::REMOTE_ABORTED); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::REMOTE_ABORTED); + EXPECT_EQ(storage.counters[80], 0); +} + +TEST(L3L2MessageQueueTest, OrdinaryTimeoutDoesNotSetOwnAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint 
queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + EXPECT_EQ(queue.disambiguate_timeout(), L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + +TEST(L3L2MessageQueueTest, OutputCapacityEqualsDepthAndFullIsNoProgressWithoutAbort) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + for (int i = 0; i < 2; ++i) { + L3L2QueueOutputReservation reservation{}; + ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message; + ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message; + } + L3L2QueueOutputReservation third{}; + EXPECT_FALSE(queue.output().try_reserve(0, &third)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET)], 2); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + +TEST(L3L2MessageQueueTest, FullAndEmptyUseMonotonicCountersNotMaskedIndices) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + for (int i = 0; i < 2; ++i) { + L3L2QueueOutputReservation reservation{}; + ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message; + ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message; + } + storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_HEAD_OFFSET)] = 1; + + L3L2QueueOutputReservation third{}; + 
ASSERT_TRUE(queue.output().try_reserve(0, &third)) << queue.error().message; + ASSERT_TRUE(queue.output().publish(third, L3L2QueueOpcode::DATA)) << queue.error().message; + + EXPECT_EQ(third.seq, 3u); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET)], 3); + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + +TEST(L3L2MessageQueueTest, OutputReserveTooLargeIsPreMutationNoProgressWithoutAbort) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueOutputReservation reservation{}; + EXPECT_FALSE(queue.output().try_reserve(65, &reservation)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET)], 0); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + +TEST(L3L2MessageQueueTest, OutputPublishApplicationErrorDoesNotSetAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueOutputReservation reservation{}; + ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message; + ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::ERROR)) << queue.error().message; + + L3L2QueueDescSlot slot{}; + std::memcpy(&slot, storage.payload.data() + queue.layout().output_desc_offset, sizeof(slot)); + EXPECT_EQ(slot.opcode, static_cast(L3L2QueueOpcode::ERROR)); + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + 
+TEST(L3L2MessageQueueTest, OutputPublishStaleReservationPoisonsAndSetsOwnAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + L3L2QueueOutputReservation reservation{}; + ASSERT_TRUE(queue.output().try_reserve(0, &reservation)) << queue.error().message; + ASSERT_TRUE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)) << queue.error().message; + EXPECT_FALSE(queue.output().publish(reservation, L3L2QueueOpcode::DATA)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::OWNERSHIP); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); +} + +TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::ERROR); + + L3L2QueueInputHandle handle{}; + ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message; + EXPECT_EQ(handle.opcode, L3L2QueueOpcode::ERROR); + ASSERT_TRUE(queue.input().release(handle)) << queue.error().message; + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + +TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidState) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::STOP); + + L3L2QueueInputHandle 
stop{}; + ASSERT_TRUE(queue.input().try_peek(&stop)) << queue.error().message; + ASSERT_TRUE(queue.input().release(stop)) << queue.error().message; + + publish_input_desc(&storage, queue.layout(), 2, L3L2QueueOpcode::DATA); + L3L2QueueInputHandle later{}; + EXPECT_FALSE(queue.input().try_peek(&later)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); +} + +TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + EXPECT_FALSE(queue.input().try_peek(nullptr)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); +} + +TEST(L3L2MessageQueueTest, InputSecondPeekBeforeReleasePoisonsOwnershipAndSetsOwnAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA); + + L3L2QueueInputHandle handle{}; + ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message; + L3L2QueueInputHandle second{}; + EXPECT_FALSE(queue.input().try_peek(&second)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::OWNERSHIP); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); +} + +} // namespace diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py new file mode 100644 index 000000000..64b39f8cb --- /dev/null +++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py @@ 
-0,0 +1,666 @@ +# Copyright (c) PyPTO Contributors. +# This program is free software, you can redistribute it and/or modify it under the terms and conditions of +# CANN Open Software License Agreement Version 2.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ----------------------------------------------------------------------------------------------------------- + +import ctypes +import math +import struct +from multiprocessing.shared_memory import SharedMemory + +import pytest +from simpler.l3_l2_message_queue import ( + L3L2_QUEUE_COUNTER_BYTES, + L3L2_QUEUE_DESC_SLOT_BYTES, + L3L2_QUEUE_L2_ABORT_FLAG_OFFSET, + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, + L3L2QueueMessage, + L3L2QueueOpcode, + make_l3_l2_queue_layout, +) +from simpler.l3_l2_orch_comm import ( + L3L2OrchCommCmd, + L3L2OrchCommRequest, + L3L2OrchCommResponse, + L3L2OrchRegionDesc, + NotifyOp, + WaitCmp, +) +from simpler.orchestrator import Orchestrator +from simpler.task_interface import DataType, Tensor, get_element_size +from simpler.worker import _IDLE, _OFF_STATE, Worker, _buffer_field_addr, _mailbox_store_i32 + + +class _FakeCWorker: + def __init__(self): + self.bootstrap_calls: list[tuple[int, str]] = [] + + def control_l3_l2_orch_comm_init(self, worker_id: int, control_shm_name: str) -> None: + self.bootstrap_calls.append((int(worker_id), str(control_shm_name))) + + +class _FakeCOrch: + def __init__(self): + self._buffers = [] + + def alloc(self, shape, dtype): + nbytes = math.prod(int(x) for x in shape) * int(get_element_size(dtype)) + storage_t = ctypes.c_uint8 * nbytes + storage = storage_t() + 
self._buffers.append(storage) + return Tensor.make(ctypes.addressof(storage), tuple(int(x) for x in shape), dtype) + + +class _FakeClient: + def __init__(self): + self.requests: list[tuple[L3L2OrchCommRequest, float]] = [] + self.payload_writes: list[tuple[int, bytes]] = [] + self.next_region_id = 1 + self.payload_base = 0x1000_0000 + self.counter_base = 0x2000_0000 + self.payload = bytearray() + self.counters: dict[int, int] = {} + self.peer_abort = False + self.fail_next_cmd: L3L2OrchCommCmd | None = None + + def submit(self, request, timeout_s: float): + self.requests.append((request, timeout_s)) + if self.fail_next_cmd == request.cmd: + self.fail_next_cmd = None + raise RuntimeError(f"injected failure for {request.cmd.name}") + if request.cmd == L3L2OrchCommCmd.ALLOC_REGION: + region_id = self.next_region_id + self.next_region_id += 1 + self.payload = bytearray(int(request.payload_bytes)) + self.counters = {} + return L3L2OrchCommResponse( + status=0, + error_kind=0, + region_id=region_id, + observed_counter=0, + matched=False, + desc=L3L2OrchRegionDesc( + magic_version=0x4C334C3200020000, + region_id=region_id, + payload_base=self.payload_base, + payload_bytes=request.payload_bytes, + counter_base=self.counter_base, + counter_bytes=request.counter_bytes, + ), + message="", + ) + if request.cmd == L3L2OrchCommCmd.PAYLOAD_WRITE: + data = ctypes.string_at(int(request.host_ptr), int(request.payload_bytes)) + self.payload_writes.append( + ( + int(request.payload_offset), + data, + ) + ) + begin = int(request.payload_offset) + self.payload[begin : begin + int(request.payload_bytes)] = data + if request.cmd == L3L2OrchCommCmd.PAYLOAD_READ: + begin = int(request.payload_offset) + data = bytes(self.payload[begin : begin + int(request.payload_bytes)]) + ctypes.memmove(int(request.host_ptr), data, len(data)) + if request.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY: + offset = int(request.counter_addr) - self.counter_base + if int(request.op) == int(NotifyOp.Add): + 
self.counters[offset] = int(self.counters.get(offset, 0)) + int(request.counter_operand) + else: + self.counters[offset] = int(request.counter_operand) + if request.cmd == L3L2OrchCommCmd.SIGNAL_TEST: + offset = int(request.counter_addr) - self.counter_base + observed = ( + 1 if self.peer_abort and offset == L3L2_QUEUE_L2_ABORT_FLAG_OFFSET else self.counters.get(offset, 0) + ) + matched = _compare_counter(observed, int(request.counter_operand), int(request.op)) + return L3L2OrchCommResponse( + status=0, + error_kind=0, + region_id=request.region_id, + observed_counter=observed, + matched=matched, + desc=None, + message="", + ) + return L3L2OrchCommResponse( + status=0, + error_kind=0, + region_id=request.region_id, + observed_counter=request.counter_operand, + matched=True, + desc=None, + message="", + ) + + +def _compare_counter(observed: int, operand: int, cmp: int) -> bool: + if cmp == int(WaitCmp.EQ): + return observed == operand + if cmp == int(WaitCmp.NE): + return observed != operand + if cmp == int(WaitCmp.GT): + return observed > operand + if cmp == int(WaitCmp.GE): + return observed >= operand + if cmp == int(WaitCmp.LT): + return observed < operand + if cmp == int(WaitCmp.LE): + return observed <= operand + return False + + +def _make_orchestrator() -> tuple[Orchestrator, Worker, SharedMemory, _FakeClient]: + worker = Worker(level=3, device_ids=[0], platform="a2a3", runtime="tensormap_and_ringbuffer") + shm = SharedMemory(create=True, size=4096) + assert shm.buf is not None + _mailbox_store_i32(_buffer_field_addr(shm.buf, _OFF_STATE), _IDLE) + fake_client = _FakeClient() + worker._initialized = True + worker._hierarchical_started = True + worker._worker = _FakeCWorker() + worker._chip_shms = [shm] + worker._make_l3_l2_orch_comm_client = lambda _shm: fake_client + return Orchestrator(_FakeCOrch(), worker), worker, shm, fake_client + + +def _close(worker: Worker, shm: SharedMemory) -> None: + worker._close_l3_l2_orch_comm() + shm.close() + shm.unlink() + + 
+def _publish_output( + fake_client: _FakeClient, + queue, + *, + seq: int = 1, + payload: bytes = b"", + opcode: int = int(L3L2QueueOpcode.DATA), + payload_offset: int | None = None, +) -> None: + if payload_offset is None: + payload_offset = queue.layout.output_arena_offset if payload else 0 + if payload: + fake_client.payload[payload_offset : payload_offset + len(payload)] = payload + desc = struct.pack("<4Q", seq, int(opcode), payload_offset, len(payload)) + desc_offset = queue.layout.output_desc_offset + ((seq - 1) & (queue.layout.depth - 1)) * L3L2_QUEUE_DESC_SLOT_BYTES + fake_client.payload[desc_offset : desc_offset + L3L2_QUEUE_DESC_SLOT_BYTES] = desc + fake_client.counters[queue.layout.output_desc_tail_offset] = seq + + +def test_layout_rejects_invalid_pr1_parameters(): + invalid_args = [ + (3, 128, 128), + ((1 << 30) + 1, 128, 128), + (4, 0, 128), + (4, 127, 128), + (4, 128, 0), + (4, 128, 127), + ] + + for depth, input_arena_bytes, output_arena_bytes in invalid_args: + with pytest.raises(ValueError): + make_l3_l2_queue_layout(depth, input_arena_bytes, output_arena_bytes) + + +@pytest.mark.parametrize( + ("depth", "input_arena_bytes", "output_arena_bytes", "expected"), + [ + ( + 1, + 64, + 64, + { + "output_desc_offset": 32, + "input_arena_offset": 64, + "output_arena_offset": 128, + "payload_bytes": 192, + }, + ), + ( + 4, + 128, + 192, + { + "output_desc_offset": 128, + "input_arena_offset": 256, + "output_arena_offset": 384, + "payload_bytes": 576, + }, + ), + ( + 8, + 192, + 64, + { + "output_desc_offset": 256, + "input_arena_offset": 512, + "output_arena_offset": 704, + "payload_bytes": 768, + }, + ), + ], +) +def test_layout_lockstep_cases_match_cpp_helper_expectations(depth, input_arena_bytes, output_arena_bytes, expected): + layout = make_l3_l2_queue_layout( + depth=depth, + input_arena_bytes=input_arena_bytes, + output_arena_bytes=output_arena_bytes, + ) + + assert layout.input_desc_offset == 0 + assert layout.output_desc_offset == 
expected["output_desc_offset"] + assert layout.output_desc_offset == depth * L3L2_QUEUE_DESC_SLOT_BYTES + assert layout.input_arena_offset == expected["input_arena_offset"] + assert layout.output_arena_offset == expected["output_arena_offset"] + assert layout.payload_bytes == expected["payload_bytes"] + assert layout.input_arena_offset % 64 == 0 + assert layout.output_arena_offset % 64 == 0 + assert layout.input_desc_tail_offset == 0 + assert layout.input_desc_head_offset == 64 + assert layout.output_desc_tail_offset == 128 + assert layout.output_desc_head_offset == 192 + assert layout.l3_abort_flag_offset == L3L2_QUEUE_L3_ABORT_FLAG_OFFSET + assert layout.l2_abort_flag_offset == L3L2_QUEUE_L2_ABORT_FLAG_OFFSET + assert layout.counter_bytes == L3L2_QUEUE_COUNTER_BYTES + + +def test_create_l3_l2_queue_allocates_region_and_exposes_l2_task_scalars(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=192) + + alloc_req = fake_client.requests[0][0] + assert alloc_req.cmd == L3L2OrchCommCmd.ALLOC_REGION + assert alloc_req.payload_bytes == queue.layout.payload_bytes + assert alloc_req.counter_bytes == L3L2_QUEUE_COUNTER_BYTES + assert queue.l2_task_arg_scalars() == [ + *queue.region.descriptor_scalars(), + queue.magic_version, + 4, + 128, + 192, + ] + assert fake_client.counters == { + queue.layout.input_desc_tail_offset: 0, + queue.layout.input_desc_head_offset: 0, + queue.layout.output_desc_tail_offset: 0, + queue.layout.output_desc_head_offset: 0, + queue.layout.l3_abort_flag_offset: 0, + queue.layout.l2_abort_flag_offset: 0, + } + finally: + _close(worker, shm) + + +def test_zero_byte_enqueue_skips_message_payload_write_and_publishes_descriptor(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + fake_client.requests.clear() + 
fake_client.payload_writes.clear() + + queue.input.enqueue(None, nbytes=0, timeout=0.001) + + payload_write_offsets = [offset for offset, _data in fake_client.payload_writes] + assert queue.layout.input_arena_offset not in payload_write_offsets + assert queue.layout.input_desc_offset in payload_write_offsets + notify_req = fake_client.requests[-1][0] + assert notify_req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY + assert notify_req.op == int(NotifyOp.Set) + assert notify_req.counter_addr == queue.region.descriptor.counter_base + queue.layout.input_desc_tail_offset + assert notify_req.counter_operand == 1 + finally: + _close(worker, shm) + + +def test_enqueue_registered_tensor_uses_fast_path_without_staging(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + host = orch.alloc([16], DataType.UINT8) + fake_client.requests.clear() + fake_client.payload_writes.clear() + + queue.input.enqueue(host, nbytes=16, timeout=0.001) + + payload_write_offsets = [offset for offset, _data in fake_client.payload_writes] + assert queue.layout.input_arena_offset in payload_write_offsets + assert queue.layout.input_desc_offset in payload_write_offsets + assert all(req.cmd != L3L2OrchCommCmd.ALLOC_REGION for req, _timeout in fake_client.requests) + finally: + _close(worker, shm) + + +def test_enqueue_replays_released_descriptors_before_reusing_input_arena(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + first = orch.alloc([80], DataType.UINT8) + second = orch.alloc([80], DataType.UINT8) + + queue.input.enqueue(first, nbytes=80, timeout=0.001) + fake_client.counters[queue.layout.input_desc_head_offset] = 1 + queue.input.enqueue(second, nbytes=80, timeout=0.001) + + payload_offsets = [ + offset for offset, data in fake_client.payload_writes if len(data) == 80 
+ ] + assert payload_offsets == [queue.layout.input_arena_offset, queue.layout.input_arena_offset] + finally: + _close(worker, shm) + + +def test_enqueue_rejects_ordinary_host_bytes_before_shared_mutation(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + fake_client.requests.clear() + + with pytest.raises(ValueError, match="registered.*orch.alloc"): + queue.input.enqueue(b"ordinary", nbytes=8, timeout=0.001) + + assert fake_client.requests == [] + assert queue.region.descriptor_scalars()[1] == 1 + finally: + _close(worker, shm) + + +def test_output_read_into_registered_tensor_uses_fast_path_and_release_notifies_head(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload=b"abcdefghijklmnop") + output = orch.alloc([16], DataType.UINT8) + + handle = queue.output.peek(timeout=0.001) + queue.output.read_into(handle, output) + queue.output.release(handle) + + assert ctypes.string_at(int(output.data), 16) == b"abcdefghijklmnop" + assert fake_client.counters[queue.layout.output_desc_head_offset] == 1 + finally: + _close(worker, shm) + + +def test_dequeue_into_reads_and_releases_output(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload=b"abcdefghijklmnop") + output = orch.alloc([16], DataType.UINT8) + + message = queue.output.dequeue_into(output, timeout=0.001) + + assert message.seq == 1 + assert message.opcode == L3L2QueueOpcode.DATA + assert ctypes.string_at(int(output.data), 16) == b"abcdefghijklmnop" + assert fake_client.counters[queue.layout.output_desc_head_offset] == 1 + finally: + _close(worker, shm) + + +def 
test_try_dequeue_into_empty_returns_none_without_abort(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + output = orch.alloc([16], DataType.UINT8) + fake_client.requests.clear() + + assert queue.output.try_dequeue_into(output) is None + + assert fake_client.counters.get(queue.layout.output_desc_head_offset, 0) == 0 + assert all( + not ( + req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY + and req.counter_addr == queue.region.descriptor.counter_base + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET + ) + for req, _timeout in fake_client.requests + ) + finally: + _close(worker, shm) + + +def test_output_read_rejects_ordinary_buffer_before_shared_mutation(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload=b"abcdefghijklmnop") + handle = queue.output.peek(timeout=0.001) + fake_client.requests.clear() + + with pytest.raises(ValueError, match="registered.*orch.alloc"): + queue.output.read_into(handle, bytearray(16)) + + assert fake_client.requests == [] + assert fake_client.counters.get(queue.layout.output_desc_head_offset, 0) == 0 + finally: + _close(worker, shm) + + +def test_output_release_inactive_handle_poisons_and_sets_l3_abort_flag(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload=b"abcdefghijklmnop") + handle = queue.output.peek(timeout=0.001) + wrong = L3L2QueueMessage(handle.seq + 1, handle.opcode, handle.payload_offset, handle.payload_nbytes) + fake_client.requests.clear() + + with pytest.raises(RuntimeError, match="not active"): + queue.output.release(wrong) + + assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1 + with 
pytest.raises(RuntimeError, match="poisoned"): + queue.output.try_peek() + finally: + _close(worker, shm) + + +def test_output_stop_descriptor_poisons_and_sets_l3_abort_flag(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, opcode=int(L3L2QueueOpcode.STOP)) + + with pytest.raises(RuntimeError, match="cannot be STOP"): + queue.output.peek(timeout=0.001) + + assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1 + finally: + _close(worker, shm) + + +def test_zero_byte_output_descriptor_with_nonzero_offset_poisons_and_sets_l3_abort_flag(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload_offset=queue.layout.output_arena_offset) + + with pytest.raises(RuntimeError, match="zero-byte.*nonzero"): + queue.output.peek(timeout=0.001) + + assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1 + finally: + _close(worker, shm) + + +def test_zero_byte_output_read_accepts_none_and_skips_payload_read(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload=b"") + handle = queue.output.peek(timeout=0.001) + fake_client.requests.clear() + + queue.output.read_into(handle, None) + queue.output.release(handle) + + assert all(req.cmd != L3L2OrchCommCmd.PAYLOAD_READ for req, _timeout in fake_client.requests) + assert fake_client.counters[queue.layout.output_desc_head_offset] == 1 + finally: + _close(worker, shm) + + +def test_try_enqueue_full_queue_returns_false_without_poison_or_publish(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = 
orch.create_l3_l2_queue(worker_id=0, depth=2, input_arena_bytes=128, output_arena_bytes=128) + queue.input.enqueue(None, nbytes=0, timeout=0.001) + queue.input.enqueue(None, nbytes=0, timeout=0.001) + fake_client.requests.clear() + fake_client.payload_writes.clear() + + assert queue.input.try_enqueue(None, nbytes=0) is False + + assert fake_client.payload_writes == [] + assert fake_client.counters[queue.layout.input_desc_tail_offset] == 2 + assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0 + finally: + _close(worker, shm) + + +def test_enqueue_after_stop_rejects_locally_without_polling_or_abort(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + queue.request_stop(timeout=0.001) + fake_client.requests.clear() + + assert queue.input.try_enqueue(None, nbytes=0) is False + with pytest.raises(RuntimeError, match="stopped"): + queue.input.enqueue(None, nbytes=0, timeout=0.001) + + assert fake_client.requests == [] + assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0 + finally: + _close(worker, shm) + + +def test_try_enqueue_payload_larger_than_arena_returns_false_without_poison_or_publish(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + host = orch.alloc([256], DataType.UINT8) + fake_client.requests.clear() + fake_client.payload_writes.clear() + + assert queue.input.try_enqueue(host, nbytes=256) is False + + assert fake_client.payload_writes == [] + assert fake_client.counters.get(queue.layout.input_desc_tail_offset, 0) == 0 + assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0 + finally: + _close(worker, shm) + + +def test_output_payload_offset_mismatch_poisons_before_payload_read(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = 
orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output( + fake_client, + queue, + payload=b"abcdefghijklmnop", + payload_offset=queue.layout.output_arena_offset + 16, + ) + fake_client.requests.clear() + + with pytest.raises(RuntimeError, match="payload.*mismatch"): + queue.output.peek(timeout=0.001) + + assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1 + assert all( + not ( + req.cmd == L3L2OrchCommCmd.PAYLOAD_READ + and req.payload_offset == queue.layout.output_arena_offset + 16 + ) + for req, _timeout in fake_client.requests + ) + finally: + _close(worker, shm) + + +def test_enqueue_payload_write_failure_sets_l3_abort_flag(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + host = orch.alloc([16], DataType.UINT8) + fake_client.fail_next_cmd = L3L2OrchCommCmd.PAYLOAD_WRITE + + with pytest.raises(RuntimeError, match="injected failure"): + queue.input.enqueue(host, nbytes=16, timeout=0.001) + + assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1 + with pytest.raises(RuntimeError, match="poisoned"): + queue.input.try_enqueue(None, nbytes=0) + finally: + _close(worker, shm) + + +def test_timeout_without_peer_abort_flag_returns_timeout_and_keeps_queue_live(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + fake_client.requests.clear() + + with pytest.raises(TimeoutError, match="timed out"): + queue.output.peek(timeout=0.0001) + + assert queue.region.descriptor_scalars()[1] == 1 + assert all( + not ( + req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY + and req.counter_addr == queue.region.descriptor.counter_base + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET + ) + for req, _timeout in fake_client.requests + ) + finally: + _close(worker, shm) + + +def 
test_timeout_with_peer_abort_flag_reports_remote_aborted_without_setting_own_flag(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + fake_client.peer_abort = True + fake_client.requests.clear() + + with pytest.raises(RuntimeError, match="remote.*abort"): + queue.output.peek(timeout=0.0001) + + with pytest.raises(RuntimeError, match="remote.*abort"): + queue.input.try_enqueue(None, nbytes=0) + assert all( + not ( + req.cmd == L3L2OrchCommCmd.SIGNAL_NOTIFY + and req.counter_addr == queue.region.descriptor.counter_base + L3L2_QUEUE_L3_ABORT_FLAG_OFFSET + ) + for req, _timeout in fake_client.requests + ) + finally: + _close(worker, shm) + + +def test_expired_queue_rejects_later_operations_without_abort_flag(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + queue.region._expire() + fake_client.requests.clear() + + with pytest.raises(RuntimeError, match="expired"): + queue.input.try_enqueue(None, nbytes=0) + with pytest.raises(RuntimeError, match="expired"): + queue.output.try_peek() + + assert fake_client.requests == [] + assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0 + finally: + _close(worker, shm) From 04e3a4c524a6df89bb32c34279e704aa2b99e8c6 Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Mon, 29 Jun 2026 09:15:15 +0800 Subject: [PATCH 3/7] Update: clean up L3 L2 queue PR1 - Drop the base implementation guide from tracked PR1 files while keeping it available locally for PR2 planning. - Keep the L3-L2 queue Python tests compatible with the pyright target and ruff formatting used by CI. 
--- docs/l3-l2-message-queue-base-impl.md | 798 ------------------ .../test_worker/test_l3_l2_message_queue.py | 12 +- 2 files changed, 5 insertions(+), 805 deletions(-) delete mode 100644 docs/l3-l2-message-queue-base-impl.md diff --git a/docs/l3-l2-message-queue-base-impl.md b/docs/l3-l2-message-queue-base-impl.md deleted file mode 100644 index d63f446cf..000000000 --- a/docs/l3-l2-message-queue-base-impl.md +++ /dev/null @@ -1,798 +0,0 @@ -# L3-L2 Message Queue Base Queue Two-PR Implementation Plan - -## 1. Scope And Platform Support - -This document covers a two-PR delivery of the base bidirectional SPSC message -queue transport described in `l3-l2-message-queue-design.md`. - -PR1 implements the core queue transport and primitive-compatible fast-path API: - -- one input queue from L3 to L2; -- one output queue from L2 to L3; -- descriptor rings and payload arenas in one primitive L3-L2 region; -- `DATA`, `ERROR`, and input-only `STOP` descriptors; -- explicit output reserve/publish on L2; -- explicit input peek/release on L2; -- L3 enqueue, output ownership/dequeue, stop, and cleanup APIs; -- non-zero L3 buffers limited to primitive-compatible registered - `orch.alloc(...)` host Tensors; -- two single-writer abort flags for timeout disambiguation; -- unit tests for ABI, layout, counters, zero-byte descriptors, queue - mechanics, and fast-path APIs. - -PR2 implements the usability and end-to-end layer: - -- lazy internal staging for ordinary L3 host buffers; -- ordinary host-buffer enqueue and output read convenience paths; -- one base queue example with a small message-local AICore task. -- scene tests on supported platforms; -- final user-facing documentation cleanup. - -Neither PR includes: - -- the L2 input window helper; -- multiple active DATA input handles on L2; -- out-of-order input release; -- fragmented payload arenas; -- multiple outstanding producer reservations per direction; -- output-side STOP acknowledgement messages. 
- -Supported across the two PRs: - -- `a2a3` onboard; -- `a2a3sim`; -- `a5sim`. - -Not supported: - -- `a5` onboard. - -The exact Python and C++ class names may change during implementation, but the -ABI, state transitions, and observable behavior in this document are base queue -requirements. Scope tags below identify whether a requirement lands in PR1 or -PR2. - -## 2. Expected User Flow - -The final base queue should be usable without exposing descriptor offsets, -counter offsets, or payload arena cursors to application code. PR1 supports -the same operation shape with primitive-compatible registered host Tensors for -non-zero L3 buffers. PR2 relaxes that buffer requirement with lazy staging. - -Expected L3 shape: - -```python -queue = orch.create_l3_l2_queue( - worker_id=0, - depth=8, - input_arena_bytes=1 << 20, - output_arena_bytes=1 << 20, -) - -for payload in input_payloads: - queue.input.enqueue(payload.buffer, nbytes=payload.nbytes, timeout=timeout_s) - -queue.input.enqueue(None, nbytes=0, timeout=timeout_s) # zero-byte DATA -queue.request_stop(timeout=timeout_s) - -while not application_done: - message = queue.output.peek(timeout=timeout_s) - output_buffer = choose_buffer(message.payload_nbytes) - queue.output.read_into(message, output_buffer) - queue.output.release(message) - handle_application_output(message) - -queue.free() -``` - -If the application already owns a large enough output buffer, it may use the -convenience path instead: - -```python -message = queue.output.dequeue_into(max_sized_output_buffer, timeout=timeout_s) -``` - -Expected base L2 shape: - -```cpp -L3L2QueueEndpoint queue(desc_scalars, queue_args); -for (;;) { - auto in = queue.input().peek(timeout); - if (in.opcode == L3L2QueueOpcode::STOP) { - queue.input().release(in); - break; - } - - auto out = queue.output().reserve(output_nbytes, timeout); - launch_message_local_aicore_work(in.payload_view, out.gm_addr); - wait_until_output_bytes_are_visible(); - queue.output().publish(out, 
L3L2QueueOpcode::DATA); - queue.input().release(in); -} -``` - -Application payload schema, request IDs, final-output markers, and output -cardinality are application responsibilities. PR1 transport order does not -imply request correlation beyond FIFO order within each queue direction. - -## 3. API Surface - -PR1 must expose the semantic operations below. PR2 keeps the same operation -surface and only expands accepted L3 buffer types through lazy staging. Exact -class and method names may change during implementation, but the -implementation must not require users to manipulate descriptor slots, counter -offsets, payload arena offsets, or head/tail reconstruction state directly. - -Required L3 Python surface: - -```text -orch.create_l3_l2_queue( - worker_id, - depth, - input_arena_bytes, - output_arena_bytes, -) -> queue - -queue.input.enqueue(buffer_or_none, nbytes, timeout) -queue.input.try_enqueue(buffer_or_none, nbytes) - -queue.output.dequeue_into(buffer, timeout) -> message -queue.output.try_dequeue_into(buffer) -> message or no-progress - -queue.request_stop(timeout) -queue.try_request_stop() -queue.free() -``` - -L3 message results must expose at least: - -```text -seq -opcode -payload_nbytes -``` - -Convenience dequeue APIs may copy and release in one operation. 
PR1 must also -expose explicit output ownership APIs with these semantics: - -```text -queue.output.peek(timeout) -> message_handle -queue.output.try_peek() -> message_handle or no-progress -queue.output.read_into(message_handle, buffer) -queue.output.release(message_handle) -``` - -Required L2 C++ surface: - -```text -L3L2QueueEndpoint queue(desc_scalars, queue_args) - -queue.input().peek(timeout) -> input_handle -queue.input().try_peek() -> input_handle or no-progress -queue.input().release(input_handle) - -queue.output().reserve(nbytes, timeout) -> output_reservation -queue.output().try_reserve(nbytes) -> output_reservation or no-progress -queue.output().publish(output_reservation, opcode) -``` - -L2 input handles must expose at least: - -```text -seq -opcode -payload_nbytes -payload_view or empty payload marker -``` - -L2 output reservations must expose at least: - -```text -seq or publish sequence context -payload_offset -payload_nbytes -gm_addr for non-zero payload writes -``` - -The API must preserve these user-visible semantics: - -- finite timeouts are required for blocking operations; -- `try_*` operations return no-progress without mutating shared state when the - queue cannot make progress; -- ordinary timeout does not poison the queue unless peer abort is observed; -- zero-byte messages may pass `buffer_or_none == None`; -- PR1 non-zero L3 buffers must be primitive-compatible registered - `orch.alloc(...)` host Tensors; -- PR2 L3 convenience APIs accept ordinary contiguous host byte spans and lazily - stage them when they are not primitive-compatible registered tensors; -- primitive-compatible `orch.alloc(...)` host Tensors remain the fast path in - both PRs; -- output ownership APIs are the recommended path for variable-size outputs, - while `dequeue_into` remains valid when the caller supplies a large enough - target buffer; -- after successful `request_stop`, L3 input enqueue rejects later input - messages locally without poisoning; -- `ERROR` is an 
application-level message, not a transport exception; -- cleanup/free remains valid after local poison or remote-aborted terminal - state. - -## 4. L3 Host Buffer Contract And Lazy Staging - -The primitive L3 payload APIs require a registered, child-visible -`orch.alloc(...)` host Tensor. - -PR1 buffer contract: - -- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte - descriptor path; -- non-zero L3 input enqueue buffers must be primitive-compatible registered - `orch.alloc(...)` host Tensors; -- non-zero L3 output read targets must be primitive-compatible registered - `orch.alloc(...)` host Tensors; -- ordinary `bytes`, `bytearray`, `memoryview`, private tensors, and other - non-registered host buffers are rejected before shared-state mutation; -- rejecting a non-registered buffer is a pre-mutation validation failure and - does not poison or set an abort flag. - -PR2 buffer contract: - -- `nbytes == 0` accepts `buffer_or_none == None` and uses the zero-byte - descriptor path; -- if the input buffer is a primitive-compatible registered `orch.alloc(...)` - host Tensor, enqueue uses it directly as the zero-extra-host-copy fast path; -- otherwise enqueue accepts an ordinary readable contiguous host byte span, - such as `bytes`, `bytearray`, `memoryview`, or a contiguous CPU tensor-like - object the implementation can view as bytes; -- non-fast-path enqueue copies the user bytes into an internal registered - staging Tensor, then issues primitive `payload_write` from that staging - Tensor. - -For L3 output read: - -- if the output target is a primitive-compatible registered `orch.alloc(...)` - host Tensor, `read_into` or `dequeue_into` uses it directly as the fast path; -- otherwise the target must be an ordinary writable contiguous host byte span; -- non-fast-path read first issues primitive `payload_read` into an internal - registered staging Tensor, then copies from staging into the user target. 
- -The staging Tensor is allocated lazily and owned by the queue handle. It may -grow when a later operation needs a larger staging span. The implementation -must not expose staging offsets or staging Tensor ownership to users. - -If a payload is too large for the current staging Tensor, the queue should grow -or allocate staging before issuing any primitive command. Failure to allocate -staging is a pre-mutation validation/allocation failure: it rejects the -operation, does not publish descriptors, does not release descriptors, does not -poison, and does not set an abort flag. - -Staging may add one host-to-host copy. Users that need the lowest host overhead -can pass primitive-compatible registered `orch.alloc(...)` host Tensors. - -## 5. PR1 ABI Surface - -The stable PR1 ABI is the L3/L2 shared contract. It is separate from exact -Python or C++ method names. - -TaskArgs carry the primitive region descriptor followed by queue parameters: - -```text -primitive desc[0..5] -queue_magic_version -depth -input_arena_bytes -output_arena_bytes -``` - -The queue ABI version covers: - -- descriptor slot size and field order; -- opcode numeric values; -- deterministic payload layout derivation; -- counter offsets and meanings; -- head/tail low32 reconstruction rules; -- abort flag semantics; -- zero-byte descriptor canonical form; -- STOP and ERROR transport semantics. 
- -Descriptor slot ABI: - -```cpp -struct L3L2QueueDescSlot { - uint64_t seq; - uint64_t opcode; - uint64_t payload_offset; - uint64_t payload_nbytes; -}; -static_assert(sizeof(L3L2QueueDescSlot) == 32); -``` - -Opcode ABI: - -```text -0 invalid / never published -DATA = 1 -STOP = 2 -ERROR = 3 -``` - -Counter ABI: - -```text -offset 0: input_desc_tail writer=L3 -offset 64: input_desc_head writer=L2 -offset 128: output_desc_tail writer=L2 -offset 192: output_desc_head writer=L3 -offset 256: l3_abort_flag writer=L3 -offset 320: l2_abort_flag writer=L2 -``` - -Layout validation ABI: - -- `depth` must be a power of two and `depth <= 2^30`; -- queue capacity is `depth`, not `depth - 1`; -- descriptor slot size is 32 bytes; -- descriptor rings are 8-byte aligned; -- payload arena bases are 64-byte aligned; -- arena byte sizes are positive 64-byte multiples; -- `counter_bytes >= 384`. - -The following are not PR1 ABI: - -- exact Python class names; -- exact C++ helper class names; -- internal helper function names; -- polling backoff strategy; -- application payload schema; -- example payload format. - -## 6. ABI And Layout - -The descriptor slot ABI is the existing 32-byte format: - -```cpp -struct L3L2QueueDescSlot { - uint64_t seq; - uint64_t opcode; - uint64_t payload_offset; - uint64_t payload_nbytes; -}; -static_assert(sizeof(L3L2QueueDescSlot) == 32); -``` - -`payload_offset` is relative to the primitive payload base. For non-zero -message payloads, it points into the direction-local payload arena. It does not -point to the descriptor slot itself. - -The layout helper must derive all payload and counter offsets. Python may -mirror the calculation, but tests must keep the Python calculation and the C/C++ -helper in lockstep. 
- -PR1 counter layout: - -```text -offset 0: input_desc_tail writer=L3 -offset 64: input_desc_head writer=L2 -offset 128: output_desc_tail writer=L2 -offset 192: output_desc_head writer=L3 -offset 256: l3_abort_flag writer=L3 -offset 320: l2_abort_flag writer=L2 -``` - -`counter_bytes` must be at least 384. The abort flags are low-frequency -diagnostic signals, but they still use the same 64-byte stride as the -descriptor counters to preserve single-writer cache-line ownership. - -All six counters are initialized to zero before submitting the persistent L2 -run. Descriptor slots and payload bytes do not need to be zeroed for -correctness. - -## 7. Primitive Command Mapping - -The queue is a wrapper over the existing L3-L2 primitive commands. PR1 must not -add a new primitive command or bypass the primitive region lifetime model. - -Descriptor rings live in the primitive payload region. Descriptor slot access -therefore uses the primitive payload APIs: - -- L3 writes input descriptor slots with `L3L2OrchRegion.payload_write`; -- L3 reads output descriptor slots with `L3L2OrchRegion.payload_read`; -- L2 reads input descriptor slots with `L3L2OrchEndpoint::payload_read`; -- L2 writes output descriptor slots with `L3L2OrchEndpoint::payload_write`. - -Message payload arena access also uses the primitive payload APIs when the -message payload is non-zero: - -- L3 input enqueue writes non-zero input payload bytes with - `L3L2OrchRegion.payload_write`; -- L3 output dequeue reads non-zero output payload bytes with - `L3L2OrchRegion.payload_read`; -- L2 input consume obtains a non-zero input payload GM view with - `L3L2OrchEndpoint::payload_read`; -- L2 output reserve returns a GM span in the output arena; L2 application code - or AICore work writes that span before `publish`; -- PR1 does not require a separate L2 message-payload copy API. 
If an - implementation uses `L3L2OrchEndpoint::payload_write` for a small L2-produced - output payload, it is only a helper for filling the reserved output arena - span before `publish`, not a separate transport path. - -Queue counters use the primitive signal APIs: - -- publishing descriptor tail, releasing descriptor head, and setting an abort - flag use `SIGNAL_NOTIFY` / `signal_notify`; -- head/tail polling uses `SIGNAL_TEST` / `signal_test` snapshots; -- timeout disambiguation samples the peer abort flag with `SIGNAL_TEST`, for - example `GE 1` against the peer flag address. - -Only a matched `SIGNAL_TEST` snapshot may drive head/tail reconstruction, -descriptor replay, payload release, or payload reuse. A failed head/tail test -does not establish acquire ordering and its observed value must not update -local queue state. For abort flags, a matched `GE 1` test reports remote abort; -an unmatched test leaves the timeout as ordinary no-progress. - -PR1 queue correctness must not depend on primitive `SIGNAL_WAIT`. Blocking -queue operations are wrapper-level bounded polling loops over `SIGNAL_TEST` -plus local queue-state checks. - -## 8. Zero-Byte Message Rules - -Zero-byte `DATA`, `ERROR`, and `STOP` descriptors are valid queue messages. -They still consume one descriptor slot and follow the normal descriptor -publication sequence. - -For any descriptor with `payload_nbytes == 0`: - -- `payload_offset` must be `0`; -- `payload_offset == 0` is a canonical sentinel, not a payload address; -- the message consumes no payload arena bytes; -- producer payload cursors do not advance; -- consumer payload cursors do not advance; -- payload wrap-padding replay is skipped for that descriptor; -- no message-payload arena copy/read/view is issued. - -Descriptor-ring access is separate from message-payload arena access. 
-Descriptor slots live in the primitive payload region, so publishing or reading -a zero-byte message may still use primitive payload access for descriptor-ring -metadata. The rule above skips only the message payload arena path. - -Consumer validation order must make the zero-byte path explicit: - -```text -1. validate descriptor sequence; -2. validate opcode and direction legality; -3. if payload_nbytes == 0: - require payload_offset == 0; - skip direction-local arena range checks and payload replay; - else: - require payload_offset to be inside the direction-local arena; - validate contiguous span and payload cursor replay. -``` - -This ordering matters because `payload_offset == 0` for a zero-byte output -descriptor usually is not inside the output arena. A consumer that runs arena -range validation before the zero-byte branch would reject a valid descriptor. - -If a published descriptor has `payload_nbytes == 0` and `payload_offset != 0`, -the descriptor is invalid published state. The observing endpoint transitions -to `POISONED(local-infrastructure)` and sets its own abort flag. - -## 9. Queue State And Abort Flags - -PR1 uses two single-writer abort flags: - -```text -l3_abort_flag: writer=L3, reader=L2 -l2_abort_flag: writer=L2, reader=L3 -``` - -Each flag is initialized to `0`. On local infrastructure poison, the endpoint -sets its owned flag to `1` with `NotifyOp.Set`. The flag never resets within a -queue lifetime. It is a terminal boolean, not an epoch and not a poison count. - -Abort flags are for timeout disambiguation. PR1 does not require every wait -loop iteration to poll both data progress and abort progress. A blocking queue -operation that reaches its timeout samples the peer abort flag: - -```text -peer abort_flag == 0: - return ordinary timeout/no-progress; - keep the local queue live; - do not set the local abort flag. 
- -peer abort_flag == 1: - return remote-aborted transport failure; - transition the local handle to a terminal remote-aborted state; - do not publish descriptors or advance queue state; - do not set the local abort flag solely because the peer flag was observed. -``` - -The implementation may represent terminal remote abort with the existing -`POISONED` state, but the reason must remain distinct: - -```text -POISONED(local-infrastructure): set own abort_flag = 1 -POISONED(remote-aborted): do not set own abort_flag -``` - -This distinction prevents a peer abort observation from being amplified into a -new local infrastructure poison report. - -## 10. Capacity, Counters, And Reconstruction - -`depth` is the user-visible queue capacity. A queue created with `depth=N` can -hold `N` published, unreleased descriptors. - -Validation rules: - -- `depth` must be a power of two; -- `depth <= 2^30`; -- queue capacity is `depth`, not `depth - 1`. - -Full and empty checks must use monotonic local `uint64_t` head/tail values, not -only masked ring indices: - -```text -empty iff tail == head -full iff tail - head == depth -invalid shared state iff tail - head > depth -``` - -The shared head/tail counters store only the low 32 bits. Each endpoint keeps -local `uint64_t` copies and reconstructs observed progress with signed 32-bit -delta semantics: - -```text -delta = int32_t(observed_low32 - local_low32) -valid progress: 0 <= delta <= depth -``` - -`delta == depth` is valid. A peer may legally move from empty to full between -observations. Negative deltas or deltas larger than `depth` are inconsistent -shared state and poison the observing endpoint. - -Descriptor slot validity does not depend on opcode or slot clearing. 
A -published descriptor is valid only when: - -```text -slot.seq == expected_seq -expected_seq == local_head_or_tail + 1 -slot_index == (expected_seq - 1) & (depth - 1) -``` - -Equivalent index calculations are allowed, but the sequence check must use the -full 64-bit `seq`. Descriptor slots do not need to be cleared before reuse. - -Before a producer reuses released descriptor slots or payload arena bytes, it -must replay exactly the released FIFO prefix after observing head progress. -Replay must happen before slot reuse. Zero-byte descriptors in replay advance -descriptor state only and do not advance payload cursors. - -## 11. Producer And Consumer Operation Details - -Producer sequence: - -```text -reserve -> fill/copy payload if payload_nbytes > 0 -> publish descriptor -``` - -Consumer sequence: - -```text -peek/acquire descriptor -> read/view payload if payload_nbytes > 0 --> release descriptor and payload -``` - -Descriptor publication order: - -1. reserve a descriptor slot and, for non-zero payloads, a contiguous payload - arena span; -2. write or expose the payload bytes; -3. write descriptor fields other than `seq`; -4. write `seq` as the descriptor validity marker; -5. release-publish the tail counter. - -Descriptor release order: - -1. finish all uses of the message payload; -2. update local release and payload cursor state; -3. release-publish the head counter. - -Each direction allows at most one outstanding producer reservation. Publishing -an unknown, stale, already-published, already-canceled, or cross-queue -reservation is a local ownership contradiction and poisons the queue. - -The base queue has no reservation cancel. If a producer has successfully -reserved a non-zero payload span and later cannot safely publish either `DATA` -or application `ERROR`, it must poison the queue. If the queue remains -trustworthy, the application may publish an `ERROR` descriptor using the -reservation. - -`STOP` is an input-queue descriptor. 
It consumes one input descriptor slot, -uses `payload_nbytes == 0` and `payload_offset == 0`, and is terminal for L3 -input enqueue. After L3 successfully publishes `STOP`, later input `DATA`, -`ERROR`, or `STOP` attempts are rejected locally without poisoning. If L2 has -observed `STOP` and later observes another published input descriptor, the -descriptor is invalid published state and poisons the queue. - -`ERROR` remains an application-level message. Receiving `ERROR` does not poison -the queue, set an abort flag, stop either direction, or imply transport abort. - -## 12. Error Handling Rules - -The guiding rule remains: - -```text -Before shared-state mutation: reject, no poison, no abort flag. -After shared-state mutation or inconsistent shared-state observation: - poison local infrastructure, set own abort_flag. -``` - -Pre-mutation validation failures do not poison and do not set abort flags: - -- `try_enqueue` sees no descriptor or payload space; -- `try_request_stop` sees no input descriptor slot; -- a blocking operation times out under ordinary backpressure; -- payload size exceeds the arena before reservation mutates state; -- queue creation rejects invalid layout or reconstruction parameters; -- output buffer is too small before payload copy and before release; -- invalid API arguments are caught before shared state is touched; -- lazy staging allocation failure before primitive command issue; -- enqueue is attempted after L3 has already published `STOP`; -- application `ERROR` is sent or received normally. 
- -Infrastructure poison sets the endpoint's own abort flag: - -- descriptor sequence mismatch; -- invalid opcode observed in a published descriptor; -- `STOP` observed on the output queue; -- zero-byte descriptor with non-zero `payload_offset`; -- non-zero descriptor payload range outside its direction-local arena; -- head/tail reconstruction observes impossible progress; -- payload replay observes impossible state; -- payload copy failure after command issue; -- counter notify failure; -- control-service response timeout after command issue; -- L2 endpoint fatal error for this region; -- reservation, publish, or release ownership state becomes contradictory. - -Ordinary timeout is ambiguous until the peer abort flag is sampled. A timeout -with peer abort flag `0` is not poison. A timeout with peer abort flag `1` -transitions the local handle to terminal `remote-aborted` without setting the -local abort flag. - -Cleanup and `free()` remain valid and idempotent after both local -infrastructure poison and remote-aborted terminal state. - -## 13. Example - -PR2 adds one base queue example: - -```text -examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/ -``` - -The example should demonstrate the intended user shape, not every edge case. -It must show: - -- L3 creating a queue with `depth > 1`; -- multiple variable-size input `DATA` messages; -- one zero-byte `DATA` message; -- a persistent L2 loop; -- L2 processing at most one active DATA input at a time; -- one small message-local AICore task; -- L2 publishing one output `DATA` per input `DATA`; -- L3 publishing `STOP`; -- L3 continuing to dequeue outputs after `STOP` according to application final - output rules; -- L2 releasing the `STOP` descriptor and returning from the persistent run. 
- -The example should not demonstrate: - -- the L2 input window; -- multiple active input messages; -- one input producing multiple outputs; -- multiple inputs producing one output; -- out-of-input-order output publish; -- application `ERROR` protocol design; -- abort flag failure paths. - -The zero-byte `DATA` message should exercise the descriptor-only message path. -It should not require a child-visible zero-byte host buffer. - -## 14. Test Plan - -Both PRs require automated tests for their review-driven boundaries. A manual -review checklist is not enough. - -PR1 test scope: - -- ABI and layout; -- descriptor/counter protocol; -- zero-byte descriptor handling; -- capacity, full/empty, wrap, and low32 reconstruction; -- abort flag semantics; -- L2 endpoint API; -- L3 fast-path API with primitive-compatible registered host Tensors. - -PR2 test scope: - -- lazy internal staging for ordinary L3 host buffers; -- registered Tensor fast path remains no-staging; -- staging allocation failure is pre-mutation and non-poisoning; -- base queue example and scene coverage. 
- -Suggested C++ unit test category: - -```text -tests/ut/cpp/common/test_l3_l2_message_queue.cpp -``` - -Suggested C++ unit tests: - -- `LayoutAssignsAbortFlagsAfterDescriptorCounters` -- `LayoutRequiresCounterBytesForSixCounters` -- `DescriptorSlotEncodingIsStable` -- `ZeroByteDescriptorUsesCanonicalOffset` -- `ZeroByteDescriptorWithNonZeroOffsetPoisons` -- `CapacityEqualsDepthAllowsNPublishedDescriptors` -- `CapacityEqualsDepthRejectsNthPlusOneDescriptor` -- `FullAndEmptyUseMonotonicCountersNotMaskedIndices` -- `Low32ReconstructionAcceptsDeltaEqualDepth` -- `Low32ReconstructionHandlesCounterWrap` -- `Low32ReconstructionRejectsNegativeDelta` -- `Low32ReconstructionRejectsDeltaGreaterThanDepth` -- `ReplaySkipsPayloadCursorAdvanceForZeroByteDescriptors` -- `ReplayBeforeSlotReuseAfterFullQueueWrap` -- `LocalInfrastructurePoisonSetsOwnAbortFlag` -- `RemoteAbortObservationDoesNotSetOwnAbortFlag` -- `OrdinaryTimeoutDoesNotSetAbortFlag` -- `ApplicationErrorDoesNotSetAbortFlag` -- `PreMutationValidationFailureDoesNotSetAbortFlag` - -Suggested Python unit test category: - -```text -tests/ut/py/test_l3_l2_message_queue.py -``` - -Suggested Python unit tests: - -- `test_layout_matches_cpp_helper` -- `test_counter_offsets_include_abort_flags` -- `test_zero_byte_enqueue_skips_payload_arena_copy` -- `test_zero_byte_dequeue_skips_payload_arena_read` -- `test_enqueue_rejects_ordinary_host_bytes_before_pr2_staging` -- `test_output_read_rejects_ordinary_buffer_before_pr2_staging` -- `test_enqueue_accepts_ordinary_host_bytes_with_lazy_staging` -- `test_enqueue_registered_tensor_uses_fast_path_without_staging` -- `test_output_read_into_ordinary_buffer_uses_lazy_staging` -- `test_staging_allocation_failure_does_not_poison` -- `test_timeout_with_peer_abort_flag_reports_remote_aborted` -- `test_timeout_without_peer_abort_flag_returns_timeout` -- `test_remote_aborted_terminal_state_rejects_later_operations` - -Suggested scene/example tests: - -```text 
-examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/ -``` - -Suggested scene cases: - -- `variable_size_messages`: enqueue/dequeue several non-zero `DATA` messages; -- `zero_byte_data`: send one zero-byte `DATA` and verify one corresponding - output is produced without payload arena bytes; -- `depth_capacity`: with `depth=N`, publish `N` inputs before backpressure; -- `fifo_stop`: publish `STOP`, drain outputs, and verify L2 exits; -- `small_aicore_work`: each non-zero input launches message-local AICore work; -- `l2_abort_flag_timeout_disambiguation`: force an L2 local infrastructure - poison, then verify L3 timeout reports remote-aborted instead of ordinary - timeout. - -The scene test matrix should include the PR1 supported simulation platforms -where practical: - -- `a2a3sim`; -- `a5sim`. - -Hardware execution should include `a2a3` onboard when device access is -available through the repository's `task-submit` workflow. diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py index 64b39f8cb..6f5230766 100644 --- a/tests/ut/py/test_worker/test_l3_l2_message_queue.py +++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py @@ -11,6 +11,7 @@ import math import struct from multiprocessing.shared_memory import SharedMemory +from typing import Optional import pytest from simpler.l3_l2_message_queue import ( @@ -65,7 +66,7 @@ def __init__(self): self.payload = bytearray() self.counters: dict[int, int] = {} self.peer_abort = False - self.fail_next_cmd: L3L2OrchCommCmd | None = None + self.fail_next_cmd: Optional[L3L2OrchCommCmd] = None def submit(self, request, timeout_s: float): self.requests.append((request, timeout_s)) @@ -182,7 +183,7 @@ def _publish_output( seq: int = 1, payload: bytes = b"", opcode: int = int(L3L2QueueOpcode.DATA), - payload_offset: int | None = None, + payload_offset: Optional[int] = None, ) -> None: if payload_offset is None: payload_offset = 
queue.layout.output_arena_offset if payload else 0 @@ -349,9 +350,7 @@ def test_enqueue_replays_released_descriptors_before_reusing_input_arena(): fake_client.counters[queue.layout.input_desc_head_offset] = 1 queue.input.enqueue(second, nbytes=80, timeout=0.001) - payload_offsets = [ - offset for offset, data in fake_client.payload_writes if len(data) == 80 - ] + payload_offsets = [offset for offset, data in fake_client.payload_writes if len(data) == 80] assert payload_offsets == [queue.layout.input_arena_offset, queue.layout.input_arena_offset] finally: _close(worker, shm) @@ -578,8 +577,7 @@ def test_output_payload_offset_mismatch_poisons_before_payload_read(): assert fake_client.counters[L3L2_QUEUE_L3_ABORT_FLAG_OFFSET] == 1 assert all( not ( - req.cmd == L3L2OrchCommCmd.PAYLOAD_READ - and req.payload_offset == queue.layout.output_arena_offset + 16 + req.cmd == L3L2OrchCommCmd.PAYLOAD_READ and req.payload_offset == queue.layout.output_arena_offset + 16 ) for req, _timeout in fake_client.requests ) From 6107c8135158a3abf937fd33096d1c75704e63b6 Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Tue, 30 Jun 2026 11:03:12 +0800 Subject: [PATCH 4/7] Fix: harden L3-L2 queue edge cases - Fail closed on queue layout uint64 overflow in C++ and Python mirror calculations - Validate cached L2 input handle metadata before release and use cached descriptor state - Gate C++ spin-loop timer reads and clean up Python regions on partial construction failure --- python/simpler/l3_l2_message_queue.py | 59 +++++++++---- .../include/aicpu/l3_l2_message_queue.h | 82 +++++++++++++++---- .../cpp/common/test_l3_l2_message_queue.cpp | 43 ++++++++++ .../test_worker/test_l3_l2_message_queue.py | 24 ++++++ 4 files changed, 174 insertions(+), 34 deletions(-) diff --git a/python/simpler/l3_l2_message_queue.py b/python/simpler/l3_l2_message_queue.py index 462554650..38f6b845b 100644 --- a/python/simpler/l3_l2_message_queue.py +++ 
b/python/simpler/l3_l2_message_queue.py @@ -40,6 +40,7 @@ L3L2_QUEUE_L2_ABORT_FLAG_OFFSET = 320 L3L2_QUEUE_COUNTER_BYTES = 384 L3L2_QUEUE_MAX_DEPTH = 1 << 30 +_UINT64_MAX = (1 << 64) - 1 _DESC = struct.Struct("<4Q") _POLL_INTERVAL_S = 0.00005 @@ -92,8 +93,21 @@ def l3_l2_queue_magic_version() -> int: def _align_up(value: int, align: int) -> int: + if value < 0 or value > _UINT64_MAX: + raise ValueError("L3-L2 queue layout calculation overflowed uint64") remainder = value % align - return value if remainder == 0 else value + (align - remainder) + bump = 0 if remainder == 0 else align - remainder + result = value + bump + if result > _UINT64_MAX: + raise ValueError("L3-L2 queue layout calculation overflowed uint64") + return result + + +def _checked_add_u64(lhs: int, rhs: int) -> int: + result = lhs + rhs + if lhs < 0 or rhs < 0 or result > _UINT64_MAX: + raise ValueError("L3-L2 queue layout calculation overflowed uint64") + return result def make_l3_l2_queue_layout(depth: int, input_arena_bytes: int, output_arena_bytes: int) -> L3L2QueueLayout: @@ -108,11 +122,15 @@ def make_l3_l2_queue_layout(depth: int, input_arena_bytes: int, output_arena_byt raise ValueError("L3-L2 queue output_arena_bytes must be a positive 64-byte multiple") desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES + if desc_ring_bytes > _UINT64_MAX: + raise ValueError("L3-L2 queue layout calculation overflowed uint64") input_desc_offset = 0 - output_desc_offset = input_desc_offset + desc_ring_bytes - input_arena_offset = _align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT) - output_arena_offset = _align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT) - payload_bytes = output_arena_offset + output_arena_bytes + output_desc_offset = _checked_add_u64(input_desc_offset, desc_ring_bytes) + desc_end = _checked_add_u64(output_desc_offset, desc_ring_bytes) + input_arena_offset = _align_up(desc_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT) + 
input_arena_end = _checked_add_u64(input_arena_offset, input_arena_bytes) + output_arena_offset = _align_up(input_arena_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT) + payload_bytes = _checked_add_u64(output_arena_offset, output_arena_bytes) return L3L2QueueLayout( depth=depth, input_desc_offset=input_desc_offset, @@ -146,18 +164,25 @@ def create_l3_l2_queue( payload_bytes=layout.payload_bytes, counter_bytes=layout.counter_bytes, ) - desc_fields = orch.alloc([24], DataType.UINT8) - desc_seq = orch.alloc([8], DataType.UINT8) - desc_read = orch.alloc([L3L2_QUEUE_DESC_SLOT_BYTES], DataType.UINT8) - for offset in ( - layout.input_desc_tail_offset, - layout.input_desc_head_offset, - layout.output_desc_tail_offset, - layout.output_desc_head_offset, - layout.l3_abort_flag_offset, - layout.l2_abort_flag_offset, - ): - region.counter(offset).notify(0, NotifyOp.Set) + try: + desc_fields = orch.alloc([24], DataType.UINT8) + desc_seq = orch.alloc([8], DataType.UINT8) + desc_read = orch.alloc([L3L2_QUEUE_DESC_SLOT_BYTES], DataType.UINT8) + for offset in ( + layout.input_desc_tail_offset, + layout.input_desc_head_offset, + layout.output_desc_tail_offset, + layout.output_desc_head_offset, + layout.l3_abort_flag_offset, + layout.l2_abort_flag_offset, + ): + region.counter(offset).notify(0, NotifyOp.Set) + except Exception: + try: + region.free() + except Exception: + pass + raise return L3L2Queue(orch, region, layout, desc_fields, desc_seq, desc_read) diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h index 383785c54..96dad5a40 100644 --- a/src/common/platform/include/aicpu/l3_l2_message_queue.h +++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h @@ -126,6 +126,19 @@ static inline uint64_t l3_l2_queue_align_up(uint64_t value, uint64_t align) { return remainder == 0 ? 
value : value + (align - remainder); } +static inline bool l3_l2_queue_align_up_checked(uint64_t value, uint64_t align, uint64_t *out) { + if (out == nullptr || align == 0) { + return false; + } + uint64_t remainder = value % align; + uint64_t bump = remainder == 0 ? 0 : align - remainder; + if (l3_l2_orch_comm_add_overflows(value, bump)) { + return false; + } + *out = value + bump; + return true; +} + static inline bool l3_l2_queue_valid_opcode(L3L2QueueOpcode opcode) { return opcode == L3L2QueueOpcode::DATA || opcode == L3L2QueueOpcode::STOP || opcode == L3L2QueueOpcode::ERROR; } @@ -141,14 +154,30 @@ l3_l2_queue_make_layout(uint64_t depth, uint64_t input_arena_bytes, uint64_t out uint64_t desc_ring_bytes = depth * L3L2_QUEUE_DESC_SLOT_BYTES; uint64_t input_desc_offset = 0; + if (l3_l2_orch_comm_add_overflows(input_desc_offset, desc_ring_bytes)) { + return false; + } uint64_t output_desc_offset = input_desc_offset + desc_ring_bytes; - uint64_t input_arena_offset = - l3_l2_queue_align_up(output_desc_offset + desc_ring_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT); - uint64_t output_arena_offset = - l3_l2_queue_align_up(input_arena_offset + input_arena_bytes, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT); + if (l3_l2_orch_comm_add_overflows(output_desc_offset, desc_ring_bytes)) { + return false; + } + uint64_t desc_end = output_desc_offset + desc_ring_bytes; + uint64_t input_arena_offset = 0; + if (!l3_l2_queue_align_up_checked(desc_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT, &input_arena_offset)) { + return false; + } + if (l3_l2_orch_comm_add_overflows(input_arena_offset, input_arena_bytes)) { + return false; + } + uint64_t input_arena_end = input_arena_offset + input_arena_bytes; + uint64_t output_arena_offset = 0; + if (!l3_l2_queue_align_up_checked(input_arena_end, L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT, &output_arena_offset)) { + return false; + } if (l3_l2_orch_comm_add_overflows(output_arena_offset, output_arena_bytes)) { return false; } + uint64_t payload_bytes = 
output_arena_offset + output_arena_bytes; *out = L3L2QueueLayout{ depth, @@ -158,7 +187,7 @@ l3_l2_queue_make_layout(uint64_t depth, uint64_t input_arena_bytes, uint64_t out output_arena_offset, input_arena_bytes, output_arena_bytes, - output_arena_offset + output_arena_bytes, + payload_bytes, L3L2_QUEUE_INPUT_DESC_TAIL_OFFSET, L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET, L3L2_QUEUE_OUTPUT_DESC_TAIL_OFFSET, @@ -227,6 +256,7 @@ class L3L2QueueEndpoint { } uint64_t start = l3_l2_orch_endpoint_now(); uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz(); + uint64_t spins = 0; while (true) { if (try_peek(out)) { return true; @@ -234,10 +264,13 @@ class L3L2QueueEndpoint { if (parent_->error_.kind != L3L2QueueErrorKind::NONE) { return false; } - uint64_t now = l3_l2_orch_endpoint_now(); - if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) { - parent_->disambiguate_timeout(); - return false; + spins += 1; + if (timeout_ns == 0 || (spins & 1023ull) == 0) { + uint64_t now = l3_l2_orch_endpoint_now(); + if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) { + parent_->disambiguate_timeout(); + return false; + } } } } @@ -322,6 +355,9 @@ class L3L2QueueEndpoint { *out = L3L2QueueInputHandle{slot.seq, opcode, slot.payload_offset, slot.payload_nbytes, view}; active_ = true; active_seq_ = slot.seq; + active_opcode_ = opcode; + active_payload_offset_ = slot.payload_offset; + active_payload_nbytes_ = slot.payload_nbytes; return true; } @@ -329,13 +365,15 @@ class L3L2QueueEndpoint { if (!parent_->ensure_live("input.release")) { return false; } - if (!active_ || handle.seq != active_seq_ || handle.seq != parent_->input_head_ + 1) { + if (!active_ || handle.seq != active_seq_ || handle.seq != parent_->input_head_ + 1 || + handle.opcode != active_opcode_ || handle.payload_offset != active_payload_offset_ || + handle.payload_nbytes != active_payload_nbytes_) { 
parent_->poison(L3L2QueueErrorKind::OWNERSHIP, "input.release", "input handle is not active"); return false; } - if (handle.payload_nbytes != 0) { + if (active_payload_nbytes_ != 0) { parent_->advance_payload_head( - parent_->input_payload_head_, handle.payload_offset, handle.payload_nbytes, + parent_->input_payload_head_, active_payload_offset_, active_payload_nbytes_, parent_->layout_.input_arena_offset, parent_->layout_.input_arena_bytes, "input.release" ); if (parent_->error_.kind != L3L2QueueErrorKind::NONE) { @@ -343,11 +381,14 @@ class L3L2QueueEndpoint { } } parent_->input_head_ += 1; - if (handle.opcode == L3L2QueueOpcode::STOP) { + if (active_opcode_ == L3L2QueueOpcode::STOP) { stopped_ = true; } active_ = false; active_seq_ = 0; + active_opcode_ = L3L2QueueOpcode::INVALID; + active_payload_offset_ = 0; + active_payload_nbytes_ = 0; return parent_->notify_counter( parent_->layout_.input_desc_head_offset, static_cast(parent_->input_head_), "input.release" ); @@ -357,6 +398,9 @@ class L3L2QueueEndpoint { L3L2QueueEndpoint *parent_; bool active_{false}; uint64_t active_seq_{0}; + L3L2QueueOpcode active_opcode_{L3L2QueueOpcode::INVALID}; + uint64_t active_payload_offset_{0}; + uint64_t active_payload_nbytes_{0}; bool stopped_{false}; }; @@ -371,6 +415,7 @@ class L3L2QueueEndpoint { } uint64_t start = l3_l2_orch_endpoint_now(); uint64_t frequency_hz = l3_l2_orch_endpoint_timer_frequency_hz(); + uint64_t spins = 0; while (true) { if (try_reserve(nbytes, out)) { return true; @@ -378,10 +423,13 @@ class L3L2QueueEndpoint { if (parent_->error_.kind != L3L2QueueErrorKind::NONE) { return false; } - uint64_t now = l3_l2_orch_endpoint_now(); - if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, frequency_hz) >= timeout_ns) { - parent_->disambiguate_timeout(); - return false; + spins += 1; + if (timeout_ns == 0 || (spins & 1023ull) == 0) { + uint64_t now = l3_l2_orch_endpoint_now(); + if (timeout_ns == 0 || l3_l2_orch_endpoint_elapsed_ns(start, now, 
frequency_hz) >= timeout_ns) { + parent_->disambiguate_timeout(); + return false; + } } } } diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp index 409da4763..e2761c426 100644 --- a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp +++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp @@ -13,6 +13,7 @@ #include #include #include +#include <limits> #include #include @@ -127,6 +128,25 @@ TEST(L3L2MessageQueueTest, LayoutRejectsInvalidDepthArenaAndCounterBytes) { EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout)); } +TEST(L3L2MessageQueueTest, LayoutOverflowFailsClosedWithoutModifyingOutput) { + L3L2QueueLayout layout{ + 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, + }; + const L3L2QueueLayout original = layout; + + EXPECT_FALSE(l3_l2_queue_make_layout(2, std::numeric_limits<uint64_t>::max() - 63, 64, &layout)); + + EXPECT_EQ(layout.depth, original.depth); + EXPECT_EQ(layout.input_desc_offset, original.input_desc_offset); + EXPECT_EQ(layout.output_desc_offset, original.output_desc_offset); + EXPECT_EQ(layout.input_arena_offset, original.input_arena_offset); + EXPECT_EQ(layout.output_arena_offset, original.output_arena_offset); + EXPECT_EQ(layout.input_arena_bytes, original.input_arena_bytes); + EXPECT_EQ(layout.output_arena_bytes, original.output_arena_bytes); + EXPECT_EQ(layout.payload_bytes, original.payload_bytes); + EXPECT_EQ(layout.counter_bytes, original.counter_bytes); +} + TEST(L3L2MessageQueueTest, DescriptorSlotEncodingIsStable) { static_assert(std::is_standard_layout<L3L2QueueDescSlot>::value, "descriptor must be POD-like"); static_assert(std::is_trivially_copyable<L3L2QueueDescSlot>::value, "descriptor must be fixed-size"); @@ -430,6 +450,29 @@ TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbor EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 0); } +TEST(L3L2MessageQueueTest, InputReleaseRejectsCallerMutatedHandleMetadata) { + RegionStorage
storage{}; + L3L2QueueArgs args{ + l3_l2_queue_magic_version(), + 2, + 64, + 64, + }; + L3L2QueueEndpoint queue(make_desc(&storage), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 16); + + L3L2QueueInputHandle handle{}; + ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message; + handle.payload_nbytes = 0; + + EXPECT_FALSE(queue.input().release(handle)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::OWNERSHIP); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET)], 0); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); +} + TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidState) { RegionStorage storage{}; L3L2QueueArgs args{ diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py index 6f5230766..04573a62b 100644 --- a/tests/ut/py/test_worker/test_l3_l2_message_queue.py +++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py @@ -210,6 +210,11 @@ def test_layout_rejects_invalid_pr1_parameters(): make_l3_l2_queue_layout(depth, input_arena_bytes, output_arena_bytes) +def test_layout_rejects_uint64_overflow_to_match_cpp_helper(): + with pytest.raises(ValueError, match="overflowed uint64"): + make_l3_l2_queue_layout(2, (1 << 64) - 64, 64) + + @pytest.mark.parametrize( ("depth", "input_arena_bytes", "output_arena_bytes", "expected"), [ @@ -300,6 +305,25 @@ def test_create_l3_l2_queue_allocates_region_and_exposes_l2_task_scalars(): _close(worker, shm) +def test_create_l3_l2_queue_frees_region_on_post_region_alloc_failure(): + orch, worker, shm, _fake_client = _make_orchestrator() + original_alloc = orch._o.alloc + + def fail_alloc(_shape, _dtype): + raise RuntimeError("injected alloc failure") + + orch._o.alloc = fail_alloc + try: + with 
pytest.raises(RuntimeError, match="injected alloc failure"): + orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + + assert len(worker._live_l3_l2_regions) == 1 + assert worker._live_l3_l2_regions[0]._released is True + finally: + orch._o.alloc = original_alloc + _close(worker, shm) + + def test_zero_byte_enqueue_skips_message_payload_write_and_publishes_descriptor(): orch, worker, shm, fake_client = _make_orchestrator() try: From 9f33653b43b23ac4bcc0b8bb8bfd973c3894311e Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Tue, 30 Jun 2026 15:59:27 +0800 Subject: [PATCH 5/7] Update: replace L3-L2 queue design doc - Add the user-facing L3-L2 message queue documentation. - Link the primitive L3-L2 orchestration communication doc to the queue wrapper doc. - Remove the design-stage document from the branch while leaving the local copy available for follow-up work. --- docs/l3-l2-message-queue-design.md | 922 ----------------------------- docs/l3-l2-message-queue.md | 352 +++++++++++ docs/l3-l2-orch-comm.md | 4 + 3 files changed, 356 insertions(+), 922 deletions(-) delete mode 100644 docs/l3-l2-message-queue-design.md create mode 100644 docs/l3-l2-message-queue.md diff --git a/docs/l3-l2-message-queue-design.md b/docs/l3-l2-message-queue-design.md deleted file mode 100644 index 414b80d02..000000000 --- a/docs/l3-l2-message-queue-design.md +++ /dev/null @@ -1,922 +0,0 @@ -# L3-L2 SPSC Message Queue Design - -## 1. Goal - -This document proposes the functional shape of an L3-L2 SPSC message queue -wrapper built on top of the existing `docs/l3-l2-orch-comm.md` primitives. - -The feature goal is to let one L3 orchestrator exchange a sequence of input -and output messages with one persistent L2 orchestrator run. L3 can enqueue -task inputs and dequeue task outputs while the L2 run stays alive. 
This avoids -stopping the L2 run after every task and then paying host/device finish and -init costs again for the next task. - -The target shape has two layers: - -- a base bidirectional queue transport with input and output queues; -- an L2-side input window helper that lets L2 hold multiple input messages - concurrently without changing the L3 API or the transport ABI. - -The base transport should land first for reviewability. The input window can -then be added as an L2 helper policy on top of the same descriptor ABI, region -layout, counter layout, and L3 queue API. - -The queue wrapper does not change the primitive L3-L2 communication service. -It uses the existing region descriptor, payload byte range, and `int32_t` -signal counter primitives. - -## 2. Existing Primitive Constraints - -The primitive L3-L2 communication layer provides: - -- one region descriptor containing payload and counter base/size fields; -- contiguous payload byte access through `PAYLOAD_READ` and `PAYLOAD_WRITE`; -- address-based `int32_t` signal counters through `SIGNAL_NOTIFY`, - `SIGNAL_TEST`, and `SIGNAL_WAIT`; -- region lifetime, release, and poison state handling. - -The primitive layer deliberately does not define queue layout, stream headers, -opcodes, tensor schema, descriptor rings, STOP semantics, or typed tensor -metadata. The message queue wrapper owns those protocol choices. - -The primitive layer requires only 4-byte alignment for counter addresses inside -the registered counter range. The queue wrapper places high-frequency shared -counter signals at 64-byte strides so counters written by different agents do -not share a cache line. - -## 3. Public Functional Shape - -L3 creates one bidirectional queue object: - -```python -queue = orch.create_l3_l2_queue( - worker_id=0, - depth=8, - input_arena_bytes=1 << 20, - output_arena_bytes=1 << 20, -) -``` - -The L3-visible queue API exposes an input queue and an output queue. 
L3 sends -ordinary application messages to L2 through the input queue and receives -ordinary application messages from L2 through the output queue. - -The wrapper computes: - -- descriptor ring sizes; -- payload section offsets; -- counter offsets; -- total region payload bytes; -- total counter bytes. - -The user does not pass internal descriptor offsets, arena offsets, or counter -offsets. - -The queue owns one `L3L2OrchRegion`. The L2 task receives the primitive region -descriptor plus queue layout scalars through `TaskArgs`. - -The intended L3 API shape is illustrative, but the semantics are part of the -transport contract: - -```python -queue.input.enqueue(host_buffer, nbytes=None, timeout=timeout_s) -message = queue.output.dequeue_into(host_buffer, timeout=timeout_s) -handle = queue.output.peek(timeout=timeout_s) -queue.output.read_into(handle, host_buffer) -queue.output.release(handle) -queue.request_stop(timeout=timeout_s) -queue.free() -``` - -The output ownership APIs `peek`, `read_into`, and `release` are part of the -base L3 API. They are the recommended path for variable-size outputs because -the caller can inspect `payload_nbytes` before choosing or allocating a target -buffer. Convenience APIs such as `dequeue_into` may copy and release in one -operation when the caller already has a large enough target buffer. Core APIs -that hand ownership to the caller require explicit release. - -`queue.free()` releases the L3 queue handle. It rejects later queue operations, -but it does not synchronously free device memory. Physical cleanup follows the -underlying region lifetime model. - -The L3 public queue API accepts ordinary contiguous host byte spans for -convenience enqueue and output read operations. When the supplied buffer is -already a primitive-compatible registered `orch.alloc(...)` Tensor, the queue -uses it as the zero-extra-host-copy fast path. 
Otherwise the queue lazily -stages through an internal registered host Tensor before issuing the primitive -payload command, then copies between that staging Tensor and the user buffer. -Zero-byte DATA and ERROR messages may pass `None` as the buffer. Staging hides -the primitive child-visible Tensor requirement from ordinary queue users, but -may add one host-to-host copy. - -The L2 input window extension is not visible to L3. It is an L2 helper policy -that controls how many DATA input messages L2 may hold concurrently before -releasing them in FIFO-safe order. - -## 4. Non-Goals - -- Multiple L2 orchestrators. -- Multi-producer or multi-consumer queues. -- Shared input/output payload allocator. -- Split payload spans across arena wrap. -- Dtype, shape, stride, tensor rank, or tile layout interpretation. -- Changes to `ALLOC_REGION`, `PAYLOAD_READ`, `PAYLOAD_WRITE`, - `SIGNAL_NOTIFY`, `SIGNAL_TEST`, or `SIGNAL_WAIT`. -- Exposing the L2 input window configuration through the L3 API. -- Out-of-order input payload release. -- Fragmented or hole-filled input arena allocators. -- Output-side STOP acknowledgement messages. - -## 5. Region Layout - -The physical L3-L2 region has one payload range and one counter range. The -queue wrapper divides the payload range into four logical sections: - -```text -payload region -├─ input descriptor ring -├─ output descriptor ring -├─ input payload arena -└─ output payload arena -``` - -The descriptor rings live in the payload region because they are structured -byte metadata. The counter range stores only shared head/tail signals. - -The input and output payload arenas are logically separate. This preserves SPSC -ownership: - -```text -input arena: - producer = L3 - consumer = L2 - -output arena: - producer = L2 - consumer = L3 -``` - -A shared payload allocator is intentionally out of scope because it would have -two producers and two releasers. - -The queue layout is derived, not transmitted as internal offsets. 
`TaskArgs` -carry the primitive region descriptor followed by four queue parameters: - -```text -primitive desc[0..5] -queue_magic_version -depth -input_arena_bytes -output_arena_bytes -``` - -The queue magic/version belongs to the queue wrapper ABI, not to the primitive -region ABI. It covers the descriptor slot format, opcode values, deterministic -layout function, head/tail reconstruction rules, and STOP/ERROR transport -semantics. - -A shared C/C++ layout helper is the source of truth for derived offsets and -sizes. Python may mirror that calculation, but tests must keep the Python -calculation and the C/C++ helper in lockstep. The helper derives: - -```text -input_desc_offset -output_desc_offset -input_arena_offset -output_arena_offset -input_desc_tail = 0 -input_desc_head = 64 -output_desc_tail = 128 -output_desc_head = 192 -l3_abort_flag = 256 -l2_abort_flag = 320 -``` - -Validation rules: - -- `depth` must be a power of two and `depth <= 2^30`. -- Queue capacity is `depth` messages, not `depth - 1`. -- Descriptor slot size is fixed at 32 bytes. -- Descriptor rings are 8-byte aligned. -- Payload arena bases are 64-byte aligned. -- `input_arena_bytes` and `output_arena_bytes` must be positive 64-byte - multiples. They do not need to be powers of two. -- `counter_bytes` must be at least 384. -- `payload_bytes` must contain both descriptor rings and both payload arenas. -- Unsupported `queue_magic_version` on L2 is a fatal queue decode error for - this region. - -The L3 queue creator initializes the four shared head/tail counters and the -two abort flags to zero before submitting the persistent L2 run. Descriptor -slots and payload bytes do not need to be zeroed for correctness. - -## 6. 
Descriptor ABI - -Each descriptor slot is 32 bytes and is encoded as four little-endian -`uint64_t` values: - -```cpp -struct L3L2QueueDescSlot { - uint64_t seq; - uint64_t opcode; - uint64_t payload_offset; - uint64_t payload_nbytes; -}; -static_assert(sizeof(L3L2QueueDescSlot) == 32); -``` - -The queue uses 64-byte spacing for shared signal counters, not for descriptor -slots. Each descriptor ring is SPSC, so the base descriptor ABI needs only the -four transport fields above. - -`seq` is a full 64-bit infrastructure sequence number used for ring -correctness, wrap detection, diagnostics, and input-window validation. It is -not a user correlation ID. Applications that need request IDs, batch IDs, -partial/final markers, or other correlation should put them in their own -payload header. - -`payload_offset` is relative to the primitive region payload base, so L2 can -call `endpoint.payload_read(payload_offset, payload_nbytes, &view)` directly. - -Future descriptor extensions should use an ABI version or application payload -headers instead of reserving unused fields in every slot. - -## 7. Opcodes - -The queue transport defines these opcodes: - -```text -0 invalid / never published -DATA = 1 ordinary application payload message -STOP = 2 graceful input-side shutdown request, input queue only -ERROR = 3 ordinary application-level error payload message, either direction -``` - -`ERROR` is a normal queue message. The queue layer does not interpret its -payload, does not raise a transport exception for it, and does not poison the -queue when it sees one. Applications define whether an `ERROR` payload -correlates with a request, batch, stream, or other application state. - -Infrastructure errors are handled through poison state, not by trying to write -an `ERROR` message into a potentially untrusted queue. - -`STOP` is valid only on the input queue. The output queue has no STOP message. 
-L2 shutdown acknowledgement is provided by `Worker.run` drain, not by an -output STOP. Observing STOP on the output queue is invalid published -descriptor state and poisons the queue. - -DATA and ERROR may carry zero payload bytes. For any zero-byte message, -`payload_offset` must be zero and the message consumes no payload arena bytes. -STOP must also use `payload_nbytes == 0` and `payload_offset == 0`. - -## 8. Descriptor Counters And Derived Payload Cursors - -The queue shares only descriptor head/tail values through the primitive layer's -`int32_t` signal counters. Each shared head/tail uses a 64-byte stride: - -```text -offset 0: input_desc_tail writer=L3 -offset 64: input_desc_head writer=L2 -offset 128: output_desc_tail writer=L2 -offset 192: output_desc_head writer=L3 -offset 256: l3_abort_flag writer=L3 -offset 320: l2_abort_flag writer=L2 -``` - -`counter_bytes` must be at least 384. - -The abort flags are single-writer terminal booleans used to disambiguate -operation timeouts from remote infrastructure abort. They are initialized to -zero and set to one with `NotifyOp.Set` when the owning endpoint enters local -infrastructure poison. They do not carry application `ERROR` semantics, do not -count poison events, and do not reset within a queue lifetime. - -Blocking queue operations are not required to poll abort flags on every wait -iteration. When a blocking operation times out, the implementation samples the -peer abort flag. If the peer flag is zero, the timeout remains ordinary -no-progress and does not poison the local queue. If the peer flag is one, the -operation reports remote infrastructure abort and transitions the local handle -to a terminal remote-aborted state. Observing a peer abort flag does not set -the local endpoint's own abort flag. - -The shared descriptor counters store the low 32 bits of logical `uint64_t` -head/tail values. These values are monotonic message counts. The primitive -transports these bits through `int32_t` counters. 
Endpoints reconstruct local -`uint64_t` head/tail values from sampled counter values using signed 32-bit -delta semantics: - -```text -delta = int32_t(observed_low32 - local_low32) -valid progress: 0 <= delta <= depth -``` - -Negative deltas or deltas larger than `depth` are inconsistent shared state. -Queue creation rejects descriptor depths that would make head/tail -reconstruction ambiguous. This is a validation error, not a poison condition. - -Descriptor head/tail reconstruction is safe because unobserved descriptor -progress is bounded by the descriptor ring depth. Payload byte cursors are not -shared counters and are not reconstructed from low-32-bit signal values. - -Each endpoint maintains the payload cursors it needs as local `uint64_t` -state: - -```text -producer local: - payload_tail - inferred_payload_head - -consumer local: - payload_head -``` - -The producer infers reusable payload space by observing `desc_head` -progress and replaying the released descriptors before reusing those descriptor -slots. The consumer maintains its local `payload_head` while releasing -descriptors. -Because payload cursor progress is derived from descriptor FIFO history, payload -arena size is not limited by 32-bit signal counter reconstruction. - -Queue correctness is based on reconstructed descriptor head/tail state plus -descriptor replay, not on primitive `GE` / `LT` comparison over the 32-bit -counter value. Blocking queue operations use bounded polling over `SIGNAL_TEST` -snapshots plus local queue-state checks. The timeout belongs to the wrapper -operation. The design does not require primitive `SIGNAL_WAIT` for queue -correctness. - -Local queue state may advance only after a matched `SIGNAL_TEST` snapshot. A -failed `SIGNAL_TEST` result does not establish acquire ordering, and its -`observed` value must not drive descriptor head/tail reconstruction, descriptor -replay, or payload release. 
Implementations should choose a comparison that -matches when the sampled counter has changed, such as `NE` against the local -low-32 value. The protocol does not prescribe a busy-poll, sleep, yield, or -backoff strategy. - -If a live endpoint observes counter, head/tail, cursor, or descriptor state that -contradicts the descriptor reconstruction or payload replay rules, that is -inconsistent shared state and poisons the queue. - -Descriptor slots carry the full 64-bit per-message `seq`, so message-level -validation does not depend on reconstructing sequence numbers from counters. -Input and output queues have independent sequence spaces. In each direction, -the first published message has `seq = 1`; head/tail counters start at zero and -store the number of messages published or released. A published slot has -`seq = tail_before_publish + 1`. - -## 9. Payload Arena - -Each direction has a variable-size SPSC byte arena. - -Rules: - -- `payload_tail` and `payload_head` are logical `uint64_t` byte cursors. -- Actual arena offset is `cursor % arena_bytes`. -- `arena_bytes` is limited by region allocation capacity, addressability, and - runtime memory budget, not by 32-bit signal counter reconstruction. -- A single message payload must be one contiguous span. -- A single message payload must be `<= arena_bytes`. -- Split payloads across the arena wrap are not supported. -- If remaining bytes at the arena end cannot hold the next payload, the - producer may insert invisible padding by advancing `payload_tail` to the next - arena cycle. -- Padding has no descriptor. On release, the consumer compares - `payload_head % arena_bytes` with the descriptor's arena-relative payload - offset. If they differ, the only valid base-queue case is wrap padding: the - descriptor offset is the base offset of this direction's arena and the - releaser first advances `payload_head` to the next arena cycle. It then - advances `payload_head` by `payload_nbytes`. 
Any other mismatch is - inconsistent shared state and poisons the queue. The same replay rule is used - by the producer after observing `desc_head` progress, before it reuses - released descriptor slots. -- Zero-byte messages do not participate in wrap-padding checks and do not - advance payload cursors. - -Backpressure must check both descriptor slots and payload arena bytes. A free -descriptor slot is not enough if the payload arena lacks enough contiguous -space. - -Payload validation is direction-local. DATA and ERROR payloads must lie wholly -inside the input arena for input descriptors, and wholly inside the output -arena for output descriptors. Being inside the primitive payload range is not -enough. - -## 10. Core Operation Sequence - -The queue exposes direction-specific operations. Exact class names may change, -but the operation set and ownership semantics are the transport contract. - -L3 owns the input producer and output consumer operations: - -```text -input.enqueue(buffer, nbytes, timeout) -input.try_enqueue(buffer, nbytes) -output.dequeue_into(buffer, timeout) -output.try_dequeue_into(buffer) -output.peek(timeout) -> message handle -output.try_peek() -> message handle or no-progress -output.read_into(handle, buffer) -output.release(handle) -request_stop(timeout) -try_request_stop() -free() -``` - -`dequeue_into` is the convenience path for full-message copy and release. -The `peek` / `read_into` / `release` path is the explicit-ownership path. -`free` releases the L3 queue handle, not the physical region. 
- -L2 owns the input consumer and output producer operations: - -```text -input.peek(timeout) -> input handle -input.try_peek() -> input handle or no-progress -input.release(handle) -output.reserve(nbytes, timeout) -> reservation -output.try_reserve(nbytes) -> reservation or no-progress -output.publish(reservation, opcode) -``` - -The L2 input window extension wraps the input consumer with additional -`complete(handle)` ownership; it does not change the base transport ABI. The -base queue has no output dequeue operation on L2 and no input enqueue operation -on L2. - -The producer sequence is: - -```text -reserve -> fill/copy payload -> publish descriptor -``` - -The consumer sequence is: - -```text -peek/acquire descriptor -> read/view payload -> release descriptor and payload -``` - -Convenience APIs are built from the core operation sequence: - -```text -enqueue = reserve + copy + publish -dequeue_into = peek + read + release -``` - -L3 input enqueue can usually use the convenience path because the input payload -already exists in a host-visible buffer. - -L2 output needs the core path because it often must reserve output arena space -before launching AICore work: - -```cpp -auto out = output_queue.reserve(output_nbytes, timeout); -Tensor output = make_tensor_external(out.gm_addr, shape, rank, dtype); -// submit AICore work that writes output -// synchronize so output bytes are visible -output_queue.publish(out, L3L2QueueOpcode::DATA); -``` - -Each queue direction allows at most one outstanding producer reservation. -`publish` accepts only the current outstanding reservation for that direction. -Publishing an unknown, stale, already-published, or cross-queue reservation is -a local ownership contradiction and poisons the queue. - -The base queue does not support reservation cancel. A successful reserve must -be published. If filling the reservation fails but the queue remains -trustworthy, the application may publish an ERROR message using that -reservation. 
If the reservation cannot be safely published, the producer -poisons the queue. - -Descriptor publication is ordered. The producer writes payload bytes first, -writes descriptor fields, writes `seq` as the descriptor validity marker after -the other descriptor fields, and then release-publishes the tail counter. The -consumer acquire-observes tail progress before reading the slot, and -accepts the descriptor only when `slot.seq` equals the expected sequence. - -Descriptor slots do not need to be cleared before reuse. Sequence validation -distinguishes old and new contents. - -Descriptor release is ordered in the opposite direction. The consumer must -finish using the payload, update local release state, and release-publish the -head counter. The producer may replay released descriptors and infer reusable -payload space only after acquire-observing matched head progress. - -All blocking operations require finite timeouts. Nonblocking `try_*` variants -return without changing shared state when no descriptor slot, message, or -payload space is available. Timeout under ordinary backpressure does not -poison the queue. - -The queue layer returns transport messages to the application: - -```text -seq -opcode -payload bytes or payload view -``` - -The queue layer does not infer application request correlation from queue order -or from transport `seq`. - -Queue ownership is per message, not per byte range. Release or complete always -applies to the whole descriptor payload span. - -For L3 convenience dequeue, a too-small output buffer is a local validation -failure. The descriptor remains at the queue head, no release is published, and -the caller may retry with a larger child-visible buffer. - -## 11. Base L2 Processing Contract - -After dequeuing one input message, L2 application code may submit any number -of message-local AICore tasks and use runtime dependencies, manual scopes, -async notify, or other L2 orchestration features. 
- -The base helper and example do not overlap ownership of multiple input -messages. They keep at most one active DATA input message at a time: - -```text -peek input -reserve output -submit message-local AICore work -wait or otherwise prove message-local work is safe -publish output -release input -next message -``` - -L2 must not release an input message until AICore no longer reads that input -payload and any corresponding output has been successfully published. - -After an input is released, L2 and any in-flight AICore work must not read its -payload view again. - -The queue layer does not understand dtype, shape, stride, or tensor schema. It -returns byte views. Applications build typed tensors with their own protocol -metadata. - -## 12. L2 Input Window Extension - -The target feature shape includes an L2 input window helper. The helper lets L2 -hold multiple DATA input messages concurrently while preserving FIFO-safe input -release. It enables application-defined output cardinality and output order: - -- one input may produce no output; -- one input may produce multiple outputs; -- several inputs may produce one output; -- status or progress outputs may be published independently; -- output publish order may differ from input acquire order. - -The L3-visible queue API is unchanged by the input window extension. L3 still -observes an input queue and an output queue. L3 receives output messages in -publish order and does not infer input/output correlation from queue order. -Correlation, aggregation, partial/final markers, request IDs, and batch IDs -belong in the application payload header. - -`max_l2_inflight` is a local L2 helper policy. It is not part of queue creation -and does not affect region layout: - -```cpp -L3L2QueueEndpoint queue(desc, layout); -L3L2InputWindow input_window( - queue.input(), - L3L2InputWindowConfig{.max_l2_inflight = 4} -); -``` - -The helper tracks input handles with these states: - -```text -ACQUIRED - Descriptor has been read. 
Payload view is available to L2. - -COMPLETED - Application has declared the input payload is no longer needed. - -RELEASED - Helper has advanced the input descriptor and payload cursors past this input. -``` - -The state transition is: - -```text -ACQUIRED -> COMPLETED -> RELEASED -``` - -The application owns the transition to `COMPLETED`; the helper owns the -transition to `RELEASED`. Completing an input means no future L2 code or -in-flight AICore task will read that input payload, and the payload is no -longer needed to construct future output. - -Completion is explicit. The helper must not infer completion from C++ object -destruction or lexical scope exit. A handle that is completed twice, released -twice, or destroyed while still active is a local ownership error. - -The helper releases inputs through a FIFO watermark. If inputs 10, 11, and 12 -are acquired and inputs 10 and 12 are completed, the helper may release input -10 only. It must not release input 12 until input 11 is also completed. This -keeps the input payload arena monotonic and avoids holes. - -Output publish remains application-driven and independent of input handles: - -```cpp -auto out = queue.output().reserve(nbytes, timeout); -// fill output directly or submit AICore work that writes out.gm_addr -queue.output().publish(out, L3L2QueueOpcode::DATA); -``` - -The input window extension does not add an output completion manager. The L2 -application owns completion tracking and decides when an output is ready to -publish. - -Output reservation and publish remain single-outstanding per direction. The -input window allows multiple active input handles; it does not introduce -multiple concurrent output reservations. - -## 13. STOP Semantics - -`STOP` is an input queue descriptor message: - -```text -seq + opcode=STOP + payload_nbytes=0 -``` - -It follows normal FIFO ordering. STOP is a graceful shutdown request, not -cancel and not an immediate no-more-output marker. 
- -Base helper behavior: - -- L2 exits only after processing messages before the STOP. -- L2 releases the STOP descriptor and returns from the persistent run. -- `Worker.run` drain acts as the final acknowledgement. -- No extra STOP ACK counter is required. - -Input-window behavior: - -- STOP can be acquired while earlier DATA inputs are still active. -- STOP does not take effect ahead of earlier DATA inputs. -- The helper stops acquiring further DATA inputs after STOP is observed. -- Earlier active DATA inputs continue until the application completes them. -- Outputs produced by earlier DATA inputs may still be published while the - helper drains. -- The helper releases only the FIFO completed prefix. -- Once all earlier DATA inputs are released, the helper releases STOP and the - persistent L2 run exits. - -STOP takes an input descriptor slot but does not count against -`max_l2_inflight`, because `max_l2_inflight` controls only active DATA input -ownership. - -STOP is terminal for the input queue. After L3 successfully publishes STOP, -the input queue rejects further DATA, ERROR, or STOP enqueue attempts locally -without poisoning. If L2 has observed STOP and later observes any further -published input descriptor, including a second STOP, that is invalid published -descriptor state and poisons the queue. - -STOP does not close the output queue. After publishing STOP, L3 may continue -dequeueing DATA or ERROR messages from the output queue. The transport has no -output-side terminal message and does not automatically know that the -persistent L2 run has returned. Applications that need to know all business -outputs have arrived must define that condition in their payload protocol, for -example with expected counts or final markers. - -Publishing STOP and then immediately returning from the L3 orchestration -function is transport-legal. 
It can still be an application error if L2 needs -to publish final outputs: the output queue may fill and prevent L2 from -finishing, causing `Worker.run` drain to fail or time out. - -Convenience APIs may expose: - -```text -try_request_stop() -request_stop(timeout) -``` - -`try_request_stop()` attempts to publish a STOP descriptor to the input queue -and returns immediately if no input descriptor slot is available. - -`request_stop(timeout)` performs a bounded wait until a STOP descriptor can be -published. The timeout covers only STOP enqueue/publish. It does not wait for -L2 exit and does not drain outputs. If the timeout expires before STOP is -published, the queue remains live and is not poisoned. - -## 14. Queue Lifetime And Cleanup - -A queue owns one primitive `L3L2OrchRegion`. Queue cleanup follows the -underlying region cleanup path: - -```text -optional request_stop() -> L2 persistent run exits -L3 orchestration function returns -Worker.run drains submitted L2 work -runtime sends FREE_REGION for live L3-L2 regions -queue/region handles expire -``` - -`request_stop()` and `queue.free()` are different operations. `request_stop()` -is a protocol message that asks L2 to stop acquiring input. `queue.free()` is a -local handle release that rejects later queue use. Neither operation -synchronously releases the physical payload/counter region. - -Physical release is deferred until `Worker.run` has drained submitted L2 work. -This keeps region memory live while an in-flight L2 task may still hold the -primitive descriptor or payload views. If the L3 orchestration function exits -with a live queue, runtime cleanup releases it through the same region cleanup -path. - -Queue cleanup does not require the output queue to be empty. Once `Worker.run` -has drained and the persistent L2 run has returned, freeing the region is -memory-safe even if L3 left output messages unread. Those unread messages are -discarded with the region. 
Applications that need every output must dequeue -until their own final-output condition is satisfied before calling -`queue.free()` or returning from the orchestration function. - -## 15. Error And Poison - -Application-level failure is represented by `opcode=ERROR` and optional -application-defined payload bytes. `ERROR` is allowed in either direction and -may be published during normal processing or while draining after STOP. -Receiving `ERROR` does not poison the queue and does not change STOP -semantics. - -Infrastructure poison is a queue/region state, not a descriptor message. - -The guiding rule is: - -```text -Before shared-state mutation: reject, no poison. -After shared-state mutation or inconsistent shared-state observation: poison. -``` - -Examples that do not poison: - -- `try_enqueue` sees no space. -- `try_request_stop` sees no input descriptor slot. -- Blocking enqueue/dequeue/request-stop times out under ordinary backpressure. -- Payload is larger than the arena before reserve mutates state. -- Queue creation rejects ambiguous descriptor head/tail reconstruction - parameters. -- User buffer is too small before read copies payload bytes. -- Invalid API arguments are caught before touching shared state. - -Examples that poison: - -- descriptor sequence mismatch; -- invalid opcode observed in a published descriptor; -- STOP observed on the output queue; -- descriptor payload range outside its arena; -- descriptor head/tail reconstruction or payload replay observes impossible - shared state; -- payload copy failure after command issue; -- counter notify failure; -- control-service response timeout after command issue; -- L2 endpoint fatal error for this region; -- reservation, publish, or release state becomes self-contradictory. - -Ordinary queue operation timeout does not prove remote poison. After a -blocking operation times out, the endpoint samples the peer abort flag. 
If the -peer flag is still zero, the timeout remains ordinary no-progress and does not -poison the local queue. If the peer flag is one, the endpoint reports remote -infrastructure abort and transitions its local handle to a terminal -remote-aborted state without setting its own abort flag. The peer may also -observe primitive region fatal errors or `Worker.run` drain errors. - -Only local infrastructure poison sets the endpoint's own abort flag. Ordinary -timeouts, application `ERROR` messages, pre-mutation validation failures, and -observing the peer's abort flag do not set it. - -The L2 input window helper also poisons the queue when local ownership state -becomes contradictory: - -- completing an input handle unknown to the helper; -- completing or releasing a handle twice; -- attempting to release a non-contiguous input while earlier inputs remain - incomplete; -- acquiring DATA after STOP has put the helper into draining; -- observing an acquired input sequence that contradicts the helper window. - -The Python queue object mirrors the existing region state model: - -```text -LIVE -RELEASED -POISONED(local-infrastructure) -POISONED(remote-aborted) -EXPIRED -``` - -After poison, reserve, enqueue, peek, read, release, publish, and stop-request -operations reject. Cleanup/free remains idempotent and valid. - -L2 C++ helper poison reports a fatal error including the primitive region id, -so existing Host-side parsing can poison the corresponding region. - -## 16. Implementation Staging - -The feature can be implemented in two review-friendly stages. This staging is -not an API boundary: the base transport should intentionally leave room for -the input window without later ABI or L3 API changes. 
- -```text -Stage 1: - base SPSC message queue transport - input and output descriptor rings - input and output payload arenas - descriptor head/tail protocol over int32_t signal counters - single-writer abort flags for timeout disambiguation - derived uint64_t payload cursors via descriptor replay - DATA / ERROR / input-only STOP - one active DATA input in the L2 helper/example - -Stage 2: - L2 input window helper - max_l2_inflight - application-driven input complete - FIFO-safe release of completed input prefix - flexible output cardinality and out-of-input-order output publish - FIFO STOP drain with earlier DATA inputs still active -``` - -Stage 1 intentionally leaves room for Stage 2 through these hook points: - -- descriptor `seq` is explicit and 64-bit; -- input release is explicit, not tied to dequeue; -- output reserve and publish are separate; -- each direction has at most one outstanding producer reservation; -- application correlation is kept in payload, so queue transport does not - assume one input maps to one output; -- L3 queue creation and output ownership/dequeue APIs do not depend on - `max_l2_inflight`. - -Expected implementation locations: - -```text -python/simpler/l3_l2_message_queue.py -src/common/platform/include/aicpu/l3_l2_message_queue.h -docs/l3-l2-message-queue.md -examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue/ -examples/a2a3/tensormap_and_ringbuffer/l3_l2_message_queue_input_window/ -``` - -The exact Python module and public API names may change during implementation, -but the transport contract should remain stable. - -## 17. 
Tests And Examples - -Base queue tests should cover: - -- layout calculation; -- descriptor slot encoding; -- counter offset assignment; -- queue creation rejecting ambiguous descriptor head/tail reconstruction - parameters; -- enqueue reserve failure for payload larger than arena; -- backpressure when descriptor ring is full; -- backpressure when payload arena is full; -- arena wrap with invisible padding; -- STOP descriptor handling; -- `try_request_stop` and `request_stop(timeout)` behavior; -- ERROR as a normal application message in either direction; -- L3 ordinary host-buffer enqueue/read through lazy staging; -- L3 primitive-compatible registered Tensor fast paths without staging; -- staging allocation failure before primitive command issue not poisoning the - queue; -- abort flags distinguishing ordinary timeout from remote infrastructure - abort; -- local infrastructure poison setting the local abort flag; -- remote-aborted terminal state not setting the local abort flag; -- poison on invalid published descriptor state; -- poison on descriptor head/tail reconstruction or payload replay - inconsistency; -- no poison on pre-mutation validation failure. - -The new example should be parallel to the existing primitive stream example, -not a replacement for it. The primitive stream example should remain as the -minimal demonstration of `docs/l3-l2-orch-comm.md`. - -The base queue example should demonstrate: - -- `depth > 1`; -- variable-size input and output payloads; -- input and output backpressure; -- L2 persistent loop; -- one input message containing message-local AICore work; -- FIFO STOP shutdown; -- L3 optionally dequeuing output after STOP according to application final - output rules. 
- -Input window tests and examples should cover: - -- `max_l2_inflight > 1`; -- refusing to acquire new DATA input when the input window is full; -- multiple input messages acquired before earlier inputs release; -- application-driven input completion; -- releasing only the FIFO completed prefix; -- one input producing multiple outputs; -- multiple inputs producing one output; -- output publish order differing from input acquire order; -- output correlation stored in the application payload header; -- STOP entering draining while earlier DATA inputs remain active; -- output DATA or ERROR publish during STOP drain; -- local ownership errors poisoning the queue. - -Future work beyond the staged implementation is limited to out-of-order input -payload release, fragmented payload arena allocation, abort reason/status -metadata, low-latency abort polling, or concurrent output reservations, if -those become necessary. diff --git a/docs/l3-l2-message-queue.md b/docs/l3-l2-message-queue.md new file mode 100644 index 000000000..a77003537 --- /dev/null +++ b/docs/l3-l2-message-queue.md @@ -0,0 +1,352 @@ +# L3-L2 Message Queue + +L3-L2 Message Queue lets an L3 Host Orchestrator exchange ordered messages +with one persistent L2 AICPU Orchestrator task. + +The intended use case is repeated in-flight work: L3 enqueues input messages, +L2 consumes them while the L2 task stays alive, L2 publishes output messages, +and L3 dequeues those outputs. The queue is built on top of the lower-level +L3-L2 orchestration communication primitives described in +[l3-l2-orch-comm.md](l3-l2-orch-comm.md). For where L3 and L2 sit in +the runtime stack, see +[hierarchical_level_runtime.md](hierarchical_level_runtime.md). + +## 1. API + +L3 creates one queue for one chip worker: + +```python +queue = orch.create_l3_l2_queue( + worker_id=0, + depth=4, + input_arena_bytes=1 << 20, + output_arena_bytes=1 << 20, +) +``` + +The queue owns one underlying `L3L2OrchRegion`. 
Its payload range is split into +input/output descriptor rings and input/output payload arenas. Its counter +range stores descriptor head/tail signals and abort flags. + +L3 passes the primitive region descriptor and queue layout arguments to L2: + +```python +l2_args = TaskArgs() +for value in queue.l2_task_arg_scalars(): + l2_args.add_scalar(value) + +orch.submit_next_level(l2_handle, l2_args, cfg, worker=0) +``` + +`l2_task_arg_scalars()` returns: + +```text +primitive region descriptor scalars[0..5] +queue_magic_version +depth +input_arena_bytes +output_arena_bytes +``` + +L3 sends input messages through `queue.input`: + +```python +host_input = orch.alloc([nbytes], DataType.UINT8) +fill_input(host_input) + +queue.input.enqueue(host_input, nbytes=nbytes, timeout=timeout_s) +``` + +`try_enqueue(buffer, nbytes)` is the non-blocking form. It returns `False` +when the input descriptor ring or payload arena has no space. That result is +ordinary backpressure and does not poison the queue. + +L3 receives output messages through `queue.output`: + +```python +host_output = orch.alloc([max_output_nbytes], DataType.UINT8) + +message = queue.output.peek(timeout=timeout_s) +queue.output.read_into(message, host_output) +queue.output.release(message) +``` + +The convenience form reads and releases in one operation: + +```python +message = queue.output.dequeue_into(host_output, timeout=timeout_s) +``` + +`try_peek()` and `try_dequeue_into(buffer)` are the non-blocking forms. They +return `None` when no output message is available. + +The L3 buffer arguments currently must be runtime-managed tensors returned by +`orch.alloc(...)`. Ordinary Python `bytes`, `bytearray`, and private tensors +are rejected before shared queue state is modified. Zero-byte messages use +`buffer_or_none=None` and `nbytes=0`. 
+ +L3 requests graceful shutdown by publishing an input-side `STOP` descriptor: + +```python +queue.request_stop(timeout=timeout_s) +queue.free() +``` + +`try_request_stop()` is the non-blocking form. `queue.free()` releases the L3 +handle. It does not synchronously free device memory; physical cleanup follows +the underlying region lifetime model after submitted L2 work has drained. + +On L2, orchestration code receives the primitive descriptor and queue args, +then constructs an endpoint: + +```cpp +L3L2OrchRegionDesc desc{/* scalars from TaskArgs */}; +L3L2QueueArgs queue_args{ + magic_version, + depth, + input_arena_bytes, + output_arena_bytes, +}; + +L3L2QueueEndpoint queue(desc, queue_args); +if (queue.error().kind != L3L2QueueErrorKind::NONE) { + return; +} +``` + +L2 consumes input messages from `queue.input()` and publishes outputs through +`queue.output()`: + +```cpp +while (true) { + L3L2QueueInputHandle input{}; + if (!queue.input().peek(timeout_ns, &input)) { + return; + } + + if (input.opcode == L3L2QueueOpcode::STOP) { + queue.input().release(input); + return; + } + + L3L2QueueOutputReservation output{}; + if (!queue.output().reserve(input.payload_nbytes, timeout_ns, &output)) { + return; + } + + launch_aicore(input.payload, output.payload); + wait_aicore_done(); + + queue.output().publish(output, L3L2QueueOpcode::DATA); + queue.input().release(input); +} +``` + +`queue.input().try_peek(&input)` and +`queue.output().try_reserve(nbytes, &reservation)` are non-blocking. A `false` +return can mean no progress, timeout, validation failure, or poison; check +`queue.error().kind` to distinguish ordinary no-progress from terminal error. + +## 2. 
Layout + +The physical region has one payload range: + +```text +payload region +|-- input descriptor ring +|-- output descriptor ring +|-- input payload arena +`-- output payload arena +``` + +The two payload arenas are separate: + +```text +input arena: producer = L3, consumer = L2 +output arena: producer = L2, consumer = L3 +``` + +`depth` is the descriptor-ring capacity in each direction. It must be a power +of two and at most `2^30`. Queue capacity is exactly `depth` messages, not +`depth - 1`. + +`input_arena_bytes` and `output_arena_bytes` must be positive 64-byte +multiples. They do not need to be powers of two. A single message payload must +fit as one contiguous span inside its direction's arena. Payloads are not split +across arena wrap. + +The queue layout helper is shared by Python and C++: + +```text +input_desc_offset +output_desc_offset +input_arena_offset +output_arena_offset +payload_bytes +counter_bytes +``` + +Python exposes this as `queue.layout`; L2 exposes it as `queue.layout()`. + +## 3. Descriptor ABI + +Each descriptor slot is 32 bytes: + +```cpp +struct L3L2QueueDescSlot { + uint64_t seq; + uint64_t opcode; + uint64_t payload_offset; + uint64_t payload_nbytes; +}; +``` + +`seq` is the transport sequence number for ring validation, wrap detection, and +diagnostics. It is not a user request ID. Applications that need request IDs, +batch IDs, final markers, or correlation fields should put them in their own +payload header. + +`payload_offset` is relative to the primitive region payload base. The payload +must be wholly inside the matching direction's arena. Zero-byte messages use +`payload_offset == 0` and `payload_nbytes == 0`. + +The queue currently defines these opcodes: + +| Opcode | Meaning | +| ------ | ------- | +| `DATA` | Ordinary application payload message. | +| `STOP` | Graceful input-side shutdown request. | +| `ERROR` | Ordinary application-level error payload message. | + +`STOP` is valid only on the input queue. 
The output queue has no `STOP` +message; L2 exit is observed through normal `Worker.run` drain. + +`ERROR` is a normal queue message. The queue layer does not interpret its +payload and does not poison the queue when an `ERROR` message is received. +Infrastructure failures use poison state instead. + +## 4. Signals And Ordering + +The queue uses the primitive signal counters as descriptor head/tail values. +Each shared signal is placed on a 64-byte stride: + +```text +offset 0: input_desc_tail writer=L3 +offset 64: input_desc_head writer=L2 +offset 128: output_desc_tail writer=L2 +offset 192: output_desc_head writer=L3 +offset 256: l3_abort_flag writer=L3 +offset 320: l2_abort_flag writer=L2 +``` + +Descriptor counters store the low 32 bits of monotonic logical head/tail +values. Each endpoint reconstructs its local 64-bit value from observed +progress. The unobserved progress must be between zero and `depth`; anything +else is inconsistent shared state and poisons the queue. + +The producer sequence is: + +```text +reserve payload space +write payload bytes +write descriptor fields +write descriptor seq +publish descriptor tail counter +``` + +The consumer sequence is: + +```text +observe descriptor tail progress +read and validate descriptor +use payload bytes or payload view +release descriptor and payload +publish descriptor head counter +``` + +All blocking queue operations require finite timeouts. Timeout under ordinary +backpressure is not poison. After timeout, an endpoint samples the peer abort +flag; if the peer flag is set, the local endpoint reports remote abort. + +## 5. Ownership + +Queue ownership is per message. + +On L3 output, `peek()` returns a handle that remains active until +`release(handle)`. While a handle is active, repeated `try_peek()` returns the +same handle. The caller may read the payload with `read_into(handle, buffer)` +before releasing it. Releasing the wrong handle is an ownership error and +poisons the queue. 
+ +On L2 input, `peek()` returns one active input handle. L2 must not call +`peek()` again before releasing that handle. L2 must not release an input until +all AICore work that reads the input payload has completed. + +On L2 output, `reserve()` returns one active output reservation. L2 fills the +reserved payload span, then calls `publish(reservation, opcode)`. Publishing an +unknown, stale, already-published, or cross-queue reservation is an ownership +error and poisons the queue. + +The base queue supports at most one active L2 input handle and one active L2 +output reservation. It does not provide a multi-input L2 window. + +## 6. STOP Semantics + +`STOP` is an input descriptor with no payload. It follows normal FIFO ordering: +L2 observes and releases messages before `STOP`, then releases `STOP` and +returns from the persistent run. + +After L3 successfully publishes `STOP`, the input queue rejects further input +messages locally without poisoning. L3 may still dequeue output messages that +L2 publishes before returning. + +`request_stop(timeout)` waits only until the `STOP` descriptor is published. +It does not wait for L2 exit and does not drain outputs. Applications that need +all outputs must keep dequeuing until their own protocol-level final condition +is satisfied before returning from the L3 orchestration function. + +## 7. Error Handling + +The queue distinguishes no-progress, application errors, and infrastructure +poison. + +No-progress is non-terminal: + +- descriptor ring full; +- payload arena full; +- empty output queue; +- blocking operation timeout with no peer abort flag. + +Application-level error is represented by `opcode=ERROR`. It is delivered to +the peer as a normal message and does not set an abort flag. 
+ +Infrastructure poison is terminal for the local queue handle: + +- descriptor sequence mismatch; +- invalid opcode in a published descriptor; +- output-side `STOP`; +- descriptor payload outside its direction's arena; +- impossible counter reconstruction or payload replay; +- payload command failure after shared mutation begins; +- counter notify failure; +- stale or invalid handle/reservation ownership. + +When an endpoint enters local infrastructure poison, it sets its own abort flag +for the peer. Observing the peer abort flag reports remote abort but does not +set the local abort flag. + +After poison, normal queue operations reject. Cleanup remains valid. + +## 8. Platform Support + +The message queue uses the existing L3-L2 orchestration communication region, +payload, and counter primitives. + +- `a2a3sim`: supported. +- `a5sim`: supported. +- `a2a3` onboard: supported where the underlying L3-L2 communication + primitives are supported. +- `a5` onboard: follows the underlying L3-L2 communication support status. + +Simulation backends preserve the same API, ordering, timeout, and error +semantics as onboard backends. diff --git a/docs/l3-l2-orch-comm.md b/docs/l3-l2-orch-comm.md index 6c541dfe5..256babbf5 100644 --- a/docs/l3-l2-orch-comm.md +++ b/docs/l3-l2-orch-comm.md @@ -3,6 +3,10 @@ L3-L2 Orchestrator Communication lets an L3 Host Orchestrator exchange payload bytes and signal counters with a running L2 AICPU Orchestrator task. +This page documents the low-level region, payload, and counter primitives. For +the ordered SPSC message queue wrapper built on these primitives, see +[l3-l2-message-queue.md](l3-l2-message-queue.md). + The intended use case is in-flight interaction: L3 can write input payload, publish a data-ready counter, wait for L2/AICore completion, and read output payload without ending the L2 orchestration task. 
For where L3 and L2 sit in From bf505cbfdb997feca0160b93c6dc390596de60ad Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Wed, 1 Jul 2026 09:56:08 +0800 Subject: [PATCH 6/7] Fix: harden L3-L2 queue review feedback - Add strict payload and counter size checks to the L3-L2 queue task args. - Validate L2 input payload offsets before exposing payload views. - Document timeout, layout, and queue free semantics, and expand no-hardware tests. --- docs/l3-l2-message-queue.md | 32 ++- python/simpler/l3_l2_message_queue.py | 4 +- .../include/aicpu/l3_l2_message_queue.h | 41 +++- .../cpp/common/test_l3_l2_message_queue.cpp | 228 +++++++++--------- .../test_worker/test_l3_l2_message_queue.py | 19 ++ 5 files changed, 191 insertions(+), 133 deletions(-) diff --git a/docs/l3-l2-message-queue.md b/docs/l3-l2-message-queue.md index a77003537..12a0034ad 100644 --- a/docs/l3-l2-message-queue.md +++ b/docs/l3-l2-message-queue.md @@ -46,6 +46,8 @@ queue_magic_version depth input_arena_bytes output_arena_bytes +payload_bytes +counter_bytes ``` L3 sends input messages through `queue.input`: @@ -93,8 +95,11 @@ queue.free() ``` `try_request_stop()` is the non-blocking form. `queue.free()` releases the L3 -handle. It does not synchronously free device memory; physical cleanup follows -the underlying region lifetime model after submitted L2 work has drained. +queue handle and marks the underlying `L3L2OrchRegion` handle released. It does +not synchronously free device memory; physical cleanup follows the underlying +region lifetime model after submitted L2 work has drained. Small Python wrapper +scratch tensors used for descriptor staging are owned by the queue object and +follow normal Python object lifetime. 
On L2, orchestration code receives the primitive descriptor and queue args, then constructs an endpoint: @@ -106,6 +111,8 @@ L3L2QueueArgs queue_args{ depth, input_arena_bytes, output_arena_bytes, + payload_bytes, + counter_bytes, }; L3L2QueueEndpoint queue(desc, queue_args); @@ -175,7 +182,7 @@ multiples. They do not need to be powers of two. A single message payload must fit as one contiguous span inside its direction's arena. Payloads are not split across arena wrap. -The queue layout helper is shared by Python and C++: +Python and C++ mirror the same deterministic queue layout calculation: ```text input_desc_offset @@ -187,6 +194,10 @@ counter_bytes ``` Python exposes this as `queue.layout`; L2 exposes it as `queue.layout()`. +L3 passes the derived `payload_bytes` and `counter_bytes` to L2. L2 rejects +initialization unless those values match both its local layout calculation and +the primitive region descriptor sizes. Lockstep tests cover representative +layout cases for the mirrored Python and C++ calculations. ## 3. Descriptor ABI @@ -264,9 +275,18 @@ release descriptor and payload publish descriptor head counter ``` -All blocking queue operations require finite timeouts. Timeout under ordinary -backpressure is not poison. After timeout, an endpoint samples the peer abort -flag; if the peer flag is set, the local endpoint reports remote abort. +All Python blocking queue operations require finite positive timeouts; passing +`timeout <= 0` is a caller error and raises `ValueError`. Python `try_*` APIs +are non-blocking and return `False` or `None` for ordinary no-progress. + +C++ blocking queue operations take `timeout_ns`; `timeout_ns == 0` is an +immediate timeout probe. They return `false` on no-progress, timeout, +validation failure, or poison. C++ `try_*` APIs are non-blocking and also +return `false` for ordinary no-progress. + +Timeout under ordinary backpressure is not poison. 
After timeout, an endpoint +samples the peer abort flag; if the peer flag is set, the local endpoint +reports remote abort. ## 5. Ownership diff --git a/python/simpler/l3_l2_message_queue.py b/python/simpler/l3_l2_message_queue.py index 38f6b845b..91236fe5e 100644 --- a/python/simpler/l3_l2_message_queue.py +++ b/python/simpler/l3_l2_message_queue.py @@ -28,7 +28,7 @@ L3L2_QUEUE_MAGIC = 0x4C335132 L3L2_QUEUE_ABI_MAJOR = 1 -L3L2_QUEUE_ABI_MINOR = 0 +L3L2_QUEUE_ABI_MINOR = 1 L3L2_QUEUE_DESC_SLOT_BYTES = 32 L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64 L3L2_QUEUE_COUNTER_STRIDE = 64 @@ -235,6 +235,8 @@ def l2_task_arg_scalars(self) -> list[int]: self._layout.depth, self._layout.input_arena_bytes, self._layout.output_arena_bytes, + self._layout.payload_bytes, + self._layout.counter_bytes, ] def try_request_stop(self) -> bool: diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h index 96dad5a40..4c149ba7e 100644 --- a/src/common/platform/include/aicpu/l3_l2_message_queue.h +++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h @@ -20,7 +20,7 @@ static constexpr uint32_t L3L2_QUEUE_MAGIC = 0x4C335132u; // "L3Q2" static constexpr uint16_t L3L2_QUEUE_ABI_MAJOR = 1; -static constexpr uint16_t L3L2_QUEUE_ABI_MINOR = 0; +static constexpr uint16_t L3L2_QUEUE_ABI_MINOR = 1; static constexpr uint64_t L3L2_QUEUE_DESC_SLOT_BYTES = 32; static constexpr uint64_t L3L2_QUEUE_DESC_RING_ALIGNMENT = 8; static constexpr uint64_t L3L2_QUEUE_PAYLOAD_ARENA_ALIGNMENT = 64; @@ -94,6 +94,8 @@ struct L3L2QueueArgs { uint64_t depth; uint64_t input_arena_bytes; uint64_t output_arena_bytes; + uint64_t payload_bytes; + uint64_t counter_bytes; }; struct L3L2QueueInputHandle { @@ -209,7 +211,8 @@ l3_l2_queue_validate_region(const L3L2OrchRegionDesc &desc, const L3L2QueueArgs !l3_l2_queue_make_layout(args.depth, args.input_arena_bytes, args.output_arena_bytes, &layout)) { return false; } - if (desc.payload_bytes < 
layout.payload_bytes || desc.counter_bytes < layout.counter_bytes) { + if (args.payload_bytes != layout.payload_bytes || args.counter_bytes != layout.counter_bytes || + desc.payload_bytes != layout.payload_bytes || desc.counter_bytes != layout.counter_bytes) { return false; } if (out_layout != nullptr) { @@ -345,6 +348,11 @@ class L3L2QueueEndpoint { )) { parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "input payload out of arena"); return false; + } else if (!parent_->payload_matches_head( + parent_->input_payload_head_, slot.payload_offset, slot.payload_nbytes, + parent_->layout_.input_arena_offset, parent_->layout_.input_arena_bytes, "input.try_peek" + )) { + return false; } else if (!parent_->endpoint_.payload_read(slot.payload_offset, slot.payload_nbytes, &view)) { parent_->poison( L3L2QueueErrorKind::ENDPOINT_ERROR, "input.try_peek", parent_->endpoint_.error().message @@ -472,6 +480,8 @@ class L3L2QueueEndpoint { uint64_t arena_bytes = parent_->layout_.output_arena_bytes; uint64_t arena_pos = parent_->output_payload_tail_ % arena_bytes; if (arena_pos + nbytes > arena_bytes) { + // Payloads are never split across arena wrap. The skipped tail bytes are retired in the + // monotonic virtual cursor even if this reservation later finds the arena full. parent_->output_payload_tail_ += arena_bytes - arena_pos; arena_pos = 0; } @@ -648,16 +658,33 @@ class L3L2QueueEndpoint { return offset >= arena_offset && offset + nbytes <= arena_offset + arena_bytes; } + bool payload_matches_head( + uint64_t cursor, uint64_t payload_offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes, + const char *op + ) { + if (nbytes == 0) { + return true; + } + uint64_t arena_pos = cursor % arena_bytes; + uint64_t expected_offset = arena_pos + nbytes > arena_bytes ? 
arena_offset : arena_offset + arena_pos; + if (payload_offset != expected_offset) { + poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch"); + return false; + } + return true; + } + void advance_payload_head( uint64_t &cursor, uint64_t payload_offset, uint64_t nbytes, uint64_t arena_offset, uint64_t arena_bytes, const char *op ) { - uint64_t expected_offset = arena_offset + (cursor % arena_bytes); + uint64_t arena_pos = cursor % arena_bytes; + uint64_t expected_offset = arena_pos + nbytes > arena_bytes ? arena_offset : arena_offset + arena_pos; if (expected_offset != payload_offset) { - if (payload_offset != arena_offset) { - poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch"); - return; - } + poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, op, "payload replay offset mismatch"); + return; + } + if (arena_pos + nbytes > arena_bytes) { cursor += arena_bytes - (cursor % arena_bytes); } cursor += nbytes; diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp index e2761c426..e7db495d4 100644 --- a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp +++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp @@ -40,6 +40,19 @@ L3L2OrchRegionDesc make_desc(RegionStorage *storage, uint64_t payload_bytes = 51 size_t counter_index(uint64_t offset) { return static_cast<size_t>(offset / sizeof(int32_t)); } +L3L2QueueArgs make_args(uint64_t depth, uint64_t input_arena_bytes, uint64_t output_arena_bytes) { + L3L2QueueLayout layout{}; + EXPECT_TRUE(l3_l2_queue_make_layout(depth, input_arena_bytes, output_arena_bytes, &layout)); + return L3L2QueueArgs{ + l3_l2_queue_magic_version(), depth, input_arena_bytes, output_arena_bytes, layout.payload_bytes, + layout.counter_bytes, + }; +} + +L3L2OrchRegionDesc make_desc(RegionStorage *storage, const L3L2QueueArgs &args) { + return make_desc(storage, args.payload_bytes, args.counter_bytes); +} + void publish_input_desc( RegionStorage
*storage, const L3L2QueueLayout &layout, uint64_t seq, L3L2QueueOpcode opcode, uint64_t payload_offset = 0, uint64_t payload_nbytes = 0 @@ -118,14 +131,10 @@ TEST(L3L2MessageQueueTest, LayoutRejectsInvalidDepthArenaAndCounterBytes) { EXPECT_FALSE(l3_l2_queue_make_layout(2, 65, 64, &layout)); RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; + L3L2QueueArgs args = make_args(2, 64, 64); EXPECT_FALSE(l3_l2_queue_validate_region(make_desc(&storage, 256, 320), args, &layout)); - EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout)); + EXPECT_FALSE(l3_l2_queue_validate_region(make_desc(&storage, 512, 384), args, &layout)); + EXPECT_TRUE(l3_l2_queue_validate_region(make_desc(&storage, args), args, &layout)); } TEST(L3L2MessageQueueTest, LayoutOverflowFailsClosedWithoutModifyingOutput) { @@ -171,6 +180,10 @@ TEST(L3L2MessageQueueTest, Low32ReconstructionAcceptsWrapAndRejectsImpossibleDel EXPECT_TRUE(l3_l2_queue_reconstruct_counter(0, 4, &value)); EXPECT_EQ(value, 0x1'0000'0000ull); + value = (1ull << 31) - 2; + EXPECT_TRUE(l3_l2_queue_reconstruct_counter(static_cast<int32_t>(0x8000'0001u), 4, &value)); + EXPECT_EQ(value, (1ull << 31) + 1); + value = 100; EXPECT_TRUE(l3_l2_queue_reconstruct_counter(104, 4, &value)); EXPECT_EQ(value, 104u); @@ -184,13 +197,8 @@ TEST(L3L2MessageQueueTest, Low32ReconstructionAcceptsWrapAndRejectsImpossibleDel TEST(L3L2MessageQueueTest, L2InputPeekHandlesZeroByteDescriptorBeforeArenaValidation) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueDescSlot slot{}; @@ -211,13 +219,8 @@ TEST(L3L2MessageQueueTest, L2InputPeekHandlesZeroByteDescriptorBeforeArenaValida
TEST(L3L2MessageQueueTest, L2InputPeekPoisonsZeroByteDescriptorWithNonzeroOffset) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueDescSlot slot{}; @@ -232,15 +235,67 @@ TEST(L3L2MessageQueueTest, L2InputPeekPoisonsZeroByteDescriptorWithNonzeroOffset EXPECT_EQ(storage.counters[80], 1); } +TEST(L3L2MessageQueueTest, L2InputPeekExposesNonzeroPayloadBytes) { + RegionStorage storage{}; + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + const std::array<uint8_t, 4> payload{{0x11, 0x22, 0x33, 0x44}}; + std::memcpy(storage.payload.data() + queue.layout().input_arena_offset, payload.data(), payload.size()); + publish_input_desc( + &storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, payload.size() + ); + + L3L2QueueInputHandle handle{}; + ASSERT_TRUE(queue.input().try_peek(&handle)) << queue.error().message; + + ASSERT_EQ(handle.payload_nbytes, payload.size()); + const auto *observed = reinterpret_cast<const uint8_t *>(static_cast<uintptr_t>(handle.payload.gm_addr)); + EXPECT_EQ(std::memcmp(observed, payload.data(), payload.size()), 0); + ASSERT_TRUE(queue.input().release(handle)) << queue.error().message; + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); +} + +TEST(L3L2MessageQueueTest, L2InputPeekAllowsArenaWrapAtExpectedPayloadHead) { + RegionStorage storage{}; + L3L2QueueArgs args = make_args(2, 128, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA,
queue.layout().input_arena_offset, 80); + L3L2QueueInputHandle first{}; + ASSERT_TRUE(queue.input().try_peek(&first)) << queue.error().message; + ASSERT_TRUE(queue.input().release(first)) << queue.error().message; + + publish_input_desc(&storage, queue.layout(), 2, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 64); + L3L2QueueInputHandle second{}; + ASSERT_TRUE(queue.input().try_peek(&second)) << queue.error().message; + + EXPECT_EQ(second.payload_offset, queue.layout().input_arena_offset); + EXPECT_EQ(second.payload_nbytes, 64u); + ASSERT_TRUE(queue.input().release(second)) << queue.error().message; + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE); +} + +TEST(L3L2MessageQueueTest, L2InputPeekRejectsPayloadOffsetMismatchBeforeRelease) { + RegionStorage storage{}; + L3L2QueueArgs args = make_args(2, 128, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset + 64, 16); + + L3L2QueueInputHandle handle{}; + EXPECT_FALSE(queue.input().try_peek(&handle)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_INPUT_DESC_HEAD_OFFSET)], 0); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); +} + TEST(L3L2MessageQueueTest, L2OutputReservePublishWritesDescriptorAndTail) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueOutputReservation reservation{}; @@ -260,13 +315,8 @@ TEST(L3L2MessageQueueTest, L2OutputReservePublishWritesDescriptorAndTail) { 
TEST(L3L2MessageQueueTest, L2OutputReserveReplaysReleasedDescriptorsBeforeReusingArena) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 4, - 64, - 128, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(4, 64, 128); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueOutputReservation first{}; @@ -283,13 +333,8 @@ TEST(L3L2MessageQueueTest, L2OutputReserveReplaysReleasedDescriptorsBeforeReusin TEST(L3L2MessageQueueTest, RemoteAbortObservationDoesNotSetOwnAbortFlag) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; storage.counters[64] = 1; @@ -301,13 +346,8 @@ TEST(L3L2MessageQueueTest, RemoteAbortObservationDoesNotSetOwnAbortFlag) { TEST(L3L2MessageQueueTest, OrdinaryTimeoutDoesNotSetOwnAbortFlag) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; EXPECT_EQ(queue.disambiguate_timeout(), L3L2QueueTimeoutStatus::ORDINARY_TIMEOUT); @@ -318,13 +358,8 @@ TEST(L3L2MessageQueueTest, OrdinaryTimeoutDoesNotSetOwnAbortFlag) { TEST(L3L2MessageQueueTest, OutputCapacityEqualsDepthAndFullIsNoProgressWithoutAbort) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint 
queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; for (int i = 0; i < 2; ++i) { @@ -342,13 +377,8 @@ TEST(L3L2MessageQueueTest, OutputCapacityEqualsDepthAndFullIsNoProgressWithoutAb TEST(L3L2MessageQueueTest, FullAndEmptyUseMonotonicCountersNotMaskedIndices) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; for (int i = 0; i < 2; ++i) { @@ -370,13 +400,8 @@ TEST(L3L2MessageQueueTest, FullAndEmptyUseMonotonicCountersNotMaskedIndices) { TEST(L3L2MessageQueueTest, OutputReserveTooLargeIsPreMutationNoProgressWithoutAbort) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueOutputReservation reservation{}; @@ -389,13 +414,8 @@ TEST(L3L2MessageQueueTest, OutputReserveTooLargeIsPreMutationNoProgressWithoutAb TEST(L3L2MessageQueueTest, OutputPublishApplicationErrorDoesNotSetAbortFlag) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueOutputReservation reservation{}; @@ -411,13 +431,8 @@ TEST(L3L2MessageQueueTest, OutputPublishApplicationErrorDoesNotSetAbortFlag) { TEST(L3L2MessageQueueTest, 
OutputPublishStaleReservationPoisonsAndSetsOwnAbortFlag) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; L3L2QueueOutputReservation reservation{}; @@ -431,13 +446,8 @@ TEST(L3L2MessageQueueTest, OutputPublishStaleReservationPoisonsAndSetsOwnAbortFl TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbortFlag) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::ERROR); @@ -452,13 +462,8 @@ TEST(L3L2MessageQueueTest, InputApplicationErrorIsNormalMessageAndDoesNotSetAbor TEST(L3L2MessageQueueTest, InputReleaseRejectsCallerMutatedHandleMetadata) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA, queue.layout().input_arena_offset, 16); @@ -475,13 +480,8 @@ TEST(L3L2MessageQueueTest, InputReleaseRejectsCallerMutatedHandleMetadata) { TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidState) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint 
queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::STOP); @@ -499,13 +499,8 @@ TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidSt TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; EXPECT_FALSE(queue.input().try_peek(nullptr)); @@ -516,13 +511,8 @@ TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort TEST(L3L2MessageQueueTest, InputSecondPeekBeforeReleasePoisonsOwnershipAndSetsOwnAbortFlag) { RegionStorage storage{}; - L3L2QueueArgs args{ - l3_l2_queue_magic_version(), - 2, - 64, - 64, - }; - L3L2QueueEndpoint queue(make_desc(&storage), args); + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::DATA); diff --git a/tests/ut/py/test_worker/test_l3_l2_message_queue.py b/tests/ut/py/test_worker/test_l3_l2_message_queue.py index 04573a62b..2a83b04a9 100644 --- a/tests/ut/py/test_worker/test_l3_l2_message_queue.py +++ b/tests/ut/py/test_worker/test_l3_l2_message_queue.py @@ -292,6 +292,8 @@ def test_create_l3_l2_queue_allocates_region_and_exposes_l2_task_scalars(): 4, 128, 192, + queue.layout.payload_bytes, + queue.layout.counter_bytes, ] assert fake_client.counters == { queue.layout.input_desc_tail_offset: 0, @@ -429,6 
+431,23 @@ def test_dequeue_into_reads_and_releases_output(): _close(worker, shm) +def test_output_error_opcode_is_delivered_without_poison(): + orch, worker, shm, fake_client = _make_orchestrator() + try: + queue = orch.create_l3_l2_queue(worker_id=0, depth=4, input_arena_bytes=128, output_arena_bytes=128) + _publish_output(fake_client, queue, payload=b"error-detail", opcode=int(L3L2QueueOpcode.ERROR)) + output = orch.alloc([12], DataType.UINT8) + + message = queue.output.dequeue_into(output, timeout=0.001) + + assert message.opcode == L3L2QueueOpcode.ERROR + assert ctypes.string_at(int(output.data), 12) == b"error-detail" + assert fake_client.counters[queue.layout.output_desc_head_offset] == 1 + assert fake_client.counters.get(L3L2_QUEUE_L3_ABORT_FLAG_OFFSET, 0) == 0 + finally: + _close(worker, shm) + + def test_try_dequeue_into_empty_returns_none_without_abort(): orch, worker, shm, fake_client = _make_orchestrator() try: From cb937e6281910ff5cffe5300759097e9e09a5da2 Mon Sep 17 00:00:00 2001 From: ccyywwen <75376396+ccyywwen@users.noreply.github.com> Date: Thu, 2 Jul 2026 10:26:31 +0800 Subject: [PATCH 7/7] Fix: harden L3-L2 queue descriptor validation - Pin descriptor slot ABI with compile-time layout checks. - Reject STOP descriptors that carry payload metadata. - Clarify non-blocking C++ try API return semantics. --- docs/l3-l2-message-queue.md | 4 ++-- .../platform/include/aicpu/l3_l2_message_queue.h | 12 ++++++++++++ tests/ut/cpp/common/test_l3_l2_message_queue.cpp | 14 ++++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/docs/l3-l2-message-queue.md b/docs/l3-l2-message-queue.md index 12a0034ad..2cd2ee366 100644 --- a/docs/l3-l2-message-queue.md +++ b/docs/l3-l2-message-queue.md @@ -151,7 +151,7 @@ while (true) { `queue.input().try_peek(&input)` and `queue.output().try_reserve(nbytes, &reservation)` are non-blocking. 
A `false` -return can mean no progress, timeout, validation failure, or poison; check +return can mean ordinary no-progress, validation failure, or poison; check `queue.error().kind` to distinguish ordinary no-progress from terminal error. ## 2. Layout @@ -282,7 +282,7 @@ are non-blocking and return `False` or `None` for ordinary no-progress. C++ blocking queue operations take `timeout_ns`; `timeout_ns == 0` is an immediate timeout probe. They return `false` on no-progress, timeout, validation failure, or poison. C++ `try_*` APIs are non-blocking and also -return `false` for ordinary no-progress. +return `false` for ordinary no-progress, validation failure, or poison. Timeout under ordinary backpressure is not poison. After timeout, an endpoint samples the peer abort flag; if the peer flag is set, the local endpoint diff --git a/src/common/platform/include/aicpu/l3_l2_message_queue.h b/src/common/platform/include/aicpu/l3_l2_message_queue.h index 4c149ba7e..1caff863d 100644 --- a/src/common/platform/include/aicpu/l3_l2_message_queue.h +++ b/src/common/platform/include/aicpu/l3_l2_message_queue.h @@ -41,6 +41,12 @@ struct L3L2QueueDescSlot { uint64_t payload_nbytes; }; +static_assert(sizeof(L3L2QueueDescSlot) == L3L2_QUEUE_DESC_SLOT_BYTES, "L3L2QueueDescSlot ABI size changed"); +static_assert(offsetof(L3L2QueueDescSlot, seq) == 0, "L3L2QueueDescSlot::seq offset changed"); +static_assert(offsetof(L3L2QueueDescSlot, opcode) == 8, "L3L2QueueDescSlot::opcode offset changed"); +static_assert(offsetof(L3L2QueueDescSlot, payload_offset) == 16, "L3L2QueueDescSlot::payload_offset changed"); +static_assert(offsetof(L3L2QueueDescSlot, payload_nbytes) == 24, "L3L2QueueDescSlot::payload_nbytes changed"); + enum class L3L2QueueOpcode : uint64_t { INVALID = 0, DATA = 1, @@ -332,6 +338,12 @@ class L3L2QueueEndpoint { parent_->poison(L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "invalid input opcode"); return false; } + if (opcode == L3L2QueueOpcode::STOP && 
(slot.payload_offset != 0 || slot.payload_nbytes != 0)) { + parent_->poison( + L3L2QueueErrorKind::INVALID_DESCRIPTOR, "input.try_peek", "STOP descriptor must be zero-byte" + ); + return false; + } L3L2OrchPayloadView view{0, 0}; if (slot.payload_nbytes == 0) { diff --git a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp index e7db495d4..26e27a0f2 100644 --- a/tests/ut/cpp/common/test_l3_l2_message_queue.cpp +++ b/tests/ut/cpp/common/test_l3_l2_message_queue.cpp @@ -497,6 +497,20 @@ TEST(L3L2MessageQueueTest, InputStopReleaseRejectsLaterPublishedInputAsInvalidSt EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); } +TEST(L3L2MessageQueueTest, InputStopWithPayloadMetadataPoisonsAndSetsOwnAbortFlag) { + RegionStorage storage{}; + L3L2QueueArgs args = make_args(2, 64, 64); + L3L2QueueEndpoint queue(make_desc(&storage, args), args); + ASSERT_EQ(queue.error().kind, L3L2QueueErrorKind::NONE) << queue.error().message; + publish_input_desc(&storage, queue.layout(), 1, L3L2QueueOpcode::STOP, queue.layout().input_arena_offset, 8); + + L3L2QueueInputHandle handle{}; + EXPECT_FALSE(queue.input().try_peek(&handle)); + + EXPECT_EQ(queue.error().kind, L3L2QueueErrorKind::INVALID_DESCRIPTOR); + EXPECT_EQ(storage.counters[counter_index(L3L2_QUEUE_L2_ABORT_FLAG_OFFSET)], 1); +} + TEST(L3L2MessageQueueTest, NullInputPeekOutputIsPreMutationRejectionWithoutAbort) { RegionStorage storage{}; L3L2QueueArgs args = make_args(2, 64, 64);