diff --git a/docs/trb-auto-realloc-temporary-buffer-modification-plan.md b/docs/trb-auto-realloc-temporary-buffer-modification-plan.md new file mode 100644 index 000000000..7208f9c97 --- /dev/null +++ b/docs/trb-auto-realloc-temporary-buffer-modification-plan.md @@ -0,0 +1,335 @@ +# TRB AUTO Realloc Temporary Buffer Modification Plan + +**Date**: 2026-07-01 +**Status**: replacement modification plan + +## Purpose + +This document replaces the previous chunk-growth AUTO plan for PR 1198 while +keeping that older plan file intact for review history. + +The new target is simpler: + +- no multi-chunk automatic growth mechanism; +- one retained temporary-buffer allocation per runner; +- at each run, compute the whole temporary-buffer requirement before staging; +- if the retained buffer is too small, free it first and allocate one new + buffer for the current run; +- use 1024-byte address alignment for the temporary buffer. + +The previous plan remains in +`docs/trb-auto-temporary-buffer-modification-plan.md`. + +## Target Behavior + +Temporary buffering still has two modes: + +- `off`: use the existing per-run `device_malloc()` / `device_free()` path. +- `auto`: use one runner-scoped retained temporary buffer. + +The default mode is `off`. + +AUTO mode does not take a caller-provided byte budget. The retained buffer +starts empty. On each TRB bind, the host builds a run plan for all ordinary +non-child tensors that would use temporary storage. The plan is packed with +1024-byte alignment. If the current retained buffer is large enough, the run +reuses it. If it is not large enough, the implementation frees the old +retained buffer and allocates one new retained buffer for this run. + +There is no incremental chunk growth and no per-acquire allocation. After +`begin_temporary_buffer_run(plan)` succeeds, every +`acquire_temporary_buffer_slice()` must be satisfied from the retained buffer. 
+A miss after a successful begin is a bug in plan/acquire consistency and must +fail clearly. It must not fall back to ordinary `device_malloc()`. + +## Alignment Contract + +Use a single alignment constant for this feature: + +```cpp +static constexpr size_t kTemporaryBufferAlignment = 1024; +``` + +Apply it to both: + +- the retained buffer base address exposed to tensor slices; +- every tensor slice offset allocated from that retained buffer. + +If the platform allocator does not guarantee 1024-byte alignment directly, the +temporary buffer must over-allocate and store both addresses: + +```cpp +struct Buffer { + void *raw_base; + void *base; // 1024-byte aligned address used by slices + size_t capacity; // usable bytes from base + size_t offset; +}; +``` + +Only `raw_base` is passed to the platform free callback. The usable +`capacity` is the bytes available from aligned `base`. + +The required capacity for a run is computed with the same 1024-byte alignment +rule as real acquire: + +```text +offset = 0 +for item in plan: + offset = align_up(offset, 1024) + offset += item.bytes +required = offset +``` + +The implementation may round `required` up to 1024 bytes before storing it as +capacity, but it must not use a coarse fixed MiB chunk granularity. + +## Run Planning + +Before staging tensors in TRB bind, build a plan using the same filtering and +ordering as real acquire: + +```text +for tensor in orch_args, in real bind order: + if tensor.is_child_memory(): + skip + else: + append {bytes=tensor.nbytes(), alignment=1024} +``` + +The plan includes ordinary non-child input, INOUT, and output tensors. Child +memory stays pass-through and is not included. + +Zero-byte tensors should not force a retained-buffer allocation. The plan and +real acquire path must handle them consistently. The preferred behavior is to +skip zero-byte tensors in the temporary-buffer plan and avoid consuming buffer +capacity for them. 
+ +## Host API Shape + +Use plan-based AUTO callbacks, not a byte-budget API: + +```cpp +struct TemporaryBufferPlanItem { + size_t bytes; + size_t alignment; +}; + +bool (*temporary_buffer_enabled)(); +bool (*begin_temporary_buffer_run)( + const TemporaryBufferPlanItem *items, size_t item_count); +void *(*acquire_temporary_buffer_slice)(size_t bytes, size_t alignment); +void (*end_temporary_buffer_run)(); +``` + +`begin_temporary_buffer_run()` computes the packed required size and ensures +the retained buffer is large enough for the whole run. + +## Buffer State + +The implementation should store a single retained buffer, not a vector of +chunks: + +```cpp +Buffer buffer_; +size_t retained_bytes_; +size_t current_run_used_bytes_; +size_t high_water_used_bytes_; +bool enabled_; +bool active_; +``` + +Maintain these invariants: + +- `retained_bytes_ == buffer_.capacity`; +- `retained_bytes_ == 0` when `buffer_.raw_base == nullptr`; +- `buffer_.base` is 1024-byte aligned when non-null; +- `buffer_.offset` is reset to zero only after begin succeeds; +- `current_run_used_bytes_` is reset to zero only after begin succeeds; +- real acquire increments `current_run_used_bytes_` by padding plus bytes; +- `end_temporary_buffer_run()` updates `high_water_used_bytes_`; +- clear/finalize releases `raw_base` and resets all retained-buffer state. + +Useful diagnostics are: + +- `retained_bytes`; +- `high_water_used_bytes`; +- `realloc_count`; +- `realloc_failed_count`; +- `buffer_backed_allocation_count`. + +Do not expose a public budget getter. 
+ +## Begin-Run Resize Logic + +`begin_temporary_buffer_run(plan)` owns the resize decision: + +```text +if AUTO is disabled: + return false + +if active_ is true: + fail clearly; do not reset offset + return false + +required = packed_size(plan, alignment=1024) + +if retained_bytes_ >= required: + buffer_.offset = 0 + current_run_used_bytes_ = 0 + active_ = true + return true + +free existing retained buffer +retained_bytes_ = 0 + +if required == 0: + buffer_.offset = 0 + current_run_used_bytes_ = 0 + active_ = true + return true + +allocate one new retained buffer with usable capacity >= required +if allocation fails: + active_ = false + return false + +buffer_.offset = 0 +retained_bytes_ = new usable capacity +current_run_used_bytes_ = 0 +active_ = true +return true +``` + +This is intentionally not transactional with respect to the old retained +buffer. If a larger run requires resize and the new allocation fails, the old +retained buffer has already been released. That follows the required +free-then-allocate behavior and avoids keeping two large temporary buffers +alive at once. + +## Real Acquire Logic + +After begin succeeds, real acquire is a single-buffer bump allocator: + +```text +if not active: + fail + +alignment = max(requested_alignment, 1024) +aligned = align_up(buffer_.offset, alignment) + +if bytes does not fit in buffer_.capacity - aligned: + fail clearly + +ptr = buffer_.base + aligned +buffer_.offset = aligned + bytes +current_run_used_bytes_ += aligned - old_offset + bytes +return ptr +``` + +The caller must pass 1024 for temporary tensor slices. The implementation +should still validate that any requested alignment is a power of two and use +at least 1024. + +## Cleanup And Lifetime + +Release the retained buffer when: + +- AUTO is disabled; +- an explicit clear path is called; +- runner/device context finalizes; +- `begin_temporary_buffer_run(plan)` needs a larger buffer. + +Do not shrink merely because a later run is smaller. 
Smaller later runs reuse +the larger retained buffer until one of the release events above occurs. + +If finalize sees an active temporary-buffer run, log a programming error and +still release the retained buffer before allocator teardown. + +## Implementation Steps + +1. Update `TemporaryVariableBuffer`. + - Replace chunk-vector state with a single retained buffer. + - Remove suffix growth and repeated simulation. + - Add 1024-byte alignment for base and slices. + - Add packed-size computation for the whole run plan. + - Implement free-then-allocate resize in begin-run. + +2. Update onboard and sim `DeviceRunnerBase`. + - Keep AUTO enable/disable APIs. + - Remove chunk-specific diagnostics. + - Report retained bytes, high-water, realloc count, and realloc failures. + +3. Update common `HostApi`. + - Keep `TemporaryBufferPlanItem`. + - Keep `temporary_buffer_enabled()`. + - Keep plan-based `begin_temporary_buffer_run(items, item_count)`. + - Do not restore `temporary_buffer_budget()`. + +4. Update TRB bind path for a2a3 and a5. + - Build the plan from ordinary non-child tensors before staging. + - Use 1024-byte alignment in the plan and real acquire. + - Begin AUTO run before staging. + - Fail clearly if begin or acquire fails. + - Keep child-memory, H2D, memset, and copy-back semantics unchanged. + +5. Update Python/C++ public API. + - Keep mode-based configuration, for example + `configure_temporary_buffer_auto(bool enabled)`. + - Keep `temporary_buffer_mode = "off" | "auto"`. + - Do not reintroduce caller-provided byte budgets. + +6. Update tests. + - Cover initial empty AUTO begin and allocation. + - Cover same-shape reuse with no realloc. + - Cover larger later run freeing old buffer and allocating one new buffer. + - Cover smaller later run not shrinking. + - Cover allocation failure after old buffer is freed. + - Cover 1024-byte base and slice alignment. + - Keep TRB child-memory, OUT memset, and error-cleanup regressions. 
+ +## Test Plan + +Run focused unit tests first: + +```text +tests/ut/cpp/common/test_temporary_variable_buffer.cpp +tests/ut/cpp/common/test_trb_runtime_temp_buffer.cpp +tests/ut/py/test_chip_worker.py +tests/ut/py/test_worker/test_host_worker.py +``` + +Then run TRB prepared-callable coverage for both architectures where +available: + +```text +a2a3 TRB prepared-callable ST +a5 TRB prepared-callable ST +``` + +Hardware tests must use `task-submit`. + +For performance validation, use Qwen3 Path A with the same matrix already +requested for PR 1198: + +- skill-default model/input/output setting; +- batch size 1 and 16; +- short input and 256-token input; +- output length 20, 256, and 512; +- compare AUTO enabled vs disabled on the same NPU where possible. + +## Acceptance Criteria + +- No public caller-provided temporary-buffer byte budget remains. +- AUTO starts empty and does not allocate until the first planned run. +- The retained temporary buffer is a single allocation, not retained chunks. +- All temporary-buffer slice addresses are 1024-byte aligned. +- Same-shape repeated runs reuse the retained buffer without reallocating. +- A larger later run frees the old retained buffer before allocating a new + one. +- A smaller later run does not shrink the retained buffer. +- Allocation failure during resize leaves no retained old buffer behind. +- Acquire failure after successful begin fails clearly and never falls back to + ordinary malloc. +- Child-memory pass-through, OUT memset, and copy-back semantics are + unchanged. 
diff --git a/docs/trb-auto-temporary-buffer-modification-plan.md b/docs/trb-auto-temporary-buffer-modification-plan.md new file mode 100644 index 000000000..58b5ed625 --- /dev/null +++ b/docs/trb-auto-temporary-buffer-modification-plan.md @@ -0,0 +1,340 @@ +# TRB AUTO Temporary Buffer Modification Plan + +**Date**: 2026-07-01 +**Status**: implementation modification plan + +## Purpose + +This document describes how to modify the existing PR 1198 temporary-buffer +implementation from an explicit byte-budget design to an AUTO self-sizing +design. + +The existing design plan remains in +`docs/trb-serial-tensor-buffer-pool-plan.md`. This document is the concrete +change plan for updating code, tests, and public API surface. + +## Target Behavior + +Temporary buffering has two modes: + +- `off`: keep the current per-run `device_malloc()` / `device_free()` path. +- `auto`: enable retained temporary chunks that grow from observed run plans. + +The default mode is `off`, preserving existing behavior unless the caller +explicitly enables AUTO mode. AUTO mode must not require callers to provide +`max_temporary_buffer_bytes`. The buffer starts empty, grows when a run cannot +fit in retained chunks, and does not automatically shrink. + +After steady decode shapes converge, AUTO should perform no temporary-tensor +device allocation or free on repeated same-shape runs. + +## Design Changes + +### Remove Explicit Budget Semantics + +Remove public and internal behavior that treats a numeric byte budget as the +configuration contract: + +- public `max_temporary_buffer_bytes` worker config; +- public `configure_temporary_buffer(bytes)` budget API; +- public `temporary_buffer_budget` getter, unless replaced by diagnostic-only + retained/high-water reporting; +- fail-fast `"required X / configured Y"` budget-exceeded path. + +Do not replace the budget with a hidden large number. AUTO must be represented +as AUTO, not as a disguised explicit budget. 
+ +### Add AUTO Mode Configuration + +Expose mode configuration instead of byte sizing. Acceptable shapes are: + +```text +temporary_buffer_mode = "off" | "auto" +``` + +or an equivalent bool/enum API: + +```cpp +configure_temporary_buffer_auto(bool enabled); +``` + +Rules: + +- enabling AUTO does not allocate retained HBM immediately; +- disabling AUTO clears retained chunks when no run is active; +- reconfiguration while a temporary-buffer run is active fails clearly; +- default configuration is `off`; +- `worker.malloc()` and `worker.free()` semantics stay unchanged. + +### Add Run Planning + +Before staging tensors in TRB bind, build a plan using the exact same tensor +filtering and ordering as real acquire: + +```text +for tensor in orch_args, in real bind order: + if tensor.is_child_memory(): + skip + else: + append {bytes=tensor.nbytes(), alignment=default_alignment} +``` + +The plan includes ordinary non-child input, INOUT, and output tensors. Child +memory stays pass-through and is not included. + +### Change Host API Shape + +Replace budget-based HostApi usage with plan-based AUTO callbacks: + +```cpp +struct TemporaryBufferPlanItem { + size_t bytes; + size_t alignment; +}; + +bool (*temporary_buffer_enabled)(); +bool (*begin_temporary_buffer_run)( + const TemporaryBufferPlanItem *items, size_t item_count); +void *(*acquire_temporary_buffer_slice)(size_t bytes, size_t alignment); +void (*end_temporary_buffer_run)(); +``` + +`begin_temporary_buffer_run()` performs simulation and any required growth. +After it succeeds, real `acquire_temporary_buffer_slice()` should only perform +first-fit bump allocation over retained chunks. 
+ +### Implement Simulation-Based Growth + +The buffer owns retained chunks: + +```cpp +struct Chunk { + void *raw_base; + void *base; + size_t capacity; + size_t offset; +}; + +std::vector<Chunk> chunks_; +size_t retained_bytes_; +size_t current_run_used_bytes_; +size_t high_water_used_bytes_; +``` + +Maintain this invariant: + +```text +retained_bytes_ == sum(chunk.capacity for chunk in chunks_) +``` + +Counter lifecycle: + +- `retained_bytes_` is updated only when a retained chunk allocation succeeds + or when chunks are cleared; +- `current_run_used_bytes_` is reset to zero when begin succeeds; +- real acquire adds consumed bytes, including alignment padding, to + `current_run_used_bytes_`; +- `end_temporary_buffer_run()` updates `high_water_used_bytes_` from + `current_run_used_bytes_`; +- clear/finalize resets both run counters. + +Growth happens in `begin_temporary_buffer_run(plan)`: + +```text +if AUTO is disabled: + return false + +if active_ is true: + fail clearly; do not reset offsets + return false + +checkpoint chunk count and retained_bytes_ +simulate plan against retained chunks + +if simulation succeeds: + reset real chunk offsets + current_run_used_bytes_ = 0 + mark active + return true + +if simulation fails at item i: + remaining = packed_size_in_empty_chunk(plan[i:]) + retained = retained_bytes_ + if retained == 0: + new_chunk_size = remaining + else: + new_chunk_size = max(retained, remaining) + allocate one new retained chunk + retained_bytes_ += new_chunk_size + repeat simulation from the beginning + +if allocating a new chunk fails: + free chunks allocated after the checkpoint + restore chunk count and retained_bytes_ + active_ remains false + return false +``` + +`packed_size_in_empty_chunk()` uses the same alignment rule as real acquire: + +```text +offset = 0 +for item in suffix: + offset = align_up(offset, item.alignment) + offset += item.bytes +return offset +``` + +This rule avoids per-tensor grow in the real bind path, handles repeated 
+large tensors, and lets retained capacity approximately double when it already +exists. + +Growth is transactional at begin-run granularity. Newly allocated chunks are +committed only if `begin_temporary_buffer_run(plan)` succeeds. If growth fails, +only chunks allocated during that begin attempt are released; older retained +chunks remain available for later runs. + +Do not add a fixed MiB-size chunk granularity. Correctness comes from tensor +offset alignment. The implementation may align `new_chunk_size` to the default +slice alignment for simpler arithmetic. + +This plan assumes every tensor `nbytes()` value and aggregate temporary-buffer +plan size fits in `size_t`; handling values outside the `size_t` range is out +of scope for this modification. + +### Preserve Real Acquire Semantics + +After planning succeeds, real acquire uses the same first-fit bump rule: + +```text +for chunk in chunks: + aligned = align_up(chunk.offset, alignment) + if bytes fits in chunk.capacity - aligned: + return chunk.base + aligned +return nullptr +``` + +A null return after successful planning is a plan/acquire mismatch. It must +fail clearly and run normal cleanup. It must not silently fall back to +ordinary `device_malloc()`. + +### Cleanup And Lifetime + +AUTO chunks are retained across runs and are not automatically shrunk. + +Release retained chunks only when: + +- AUTO is disabled; +- an explicit clear path is called; +- runner/device context finalizes. + +If finalize sees an active temporary-buffer run, log a programming error and +still release retained chunks before allocator teardown. + +## Implementation Steps + +1. Update C++ `TemporaryVariableBuffer`. + - Replace budget configuration with AUTO enable/disable. + - Add plan-item simulation. + - Add suffix-size growth. + - Store and maintain `retained_bytes_`. + - Remove budget-exceeded error state. + +2. Update onboard and sim `DeviceRunnerBase`. + - Rename budget methods to AUTO-mode methods. 
+ - Keep clear/finalize behavior. + - Expose diagnostics for retained bytes, high-water, grow count, and grow + failure count. + +3. Update common `HostApi`. + - Add `TemporaryBufferPlanItem`. + - Replace `temporary_buffer_budget()` usage with + `temporary_buffer_enabled()`. + - Change `begin_temporary_buffer_run()` to accept plan items. + - Wire both onboard and sim c-api shared implementations. + +4. Update TRB bind path for a2a3 and a5. + - Build the plan from ordinary non-child tensors before staging. + - Begin AUTO run with that plan. + - Acquire slices in the same order used by the plan. + - Keep child-memory pass-through unchanged. + - Keep H2D, memset, and copy-back semantics unchanged. + +5. Update Python/C++ public API. + - Remove byte-budget config paths in this PR. + - Add mode-based config for Worker and ChipWorker. + - Ensure level-3 child process setup forwards AUTO mode, not bytes. + +6. Update tests. + - Convert budget tests into AUTO growth tests. + - Add repeated max-size tensor simulation coverage. + - Add no-shrink coverage. + - Add grow-failure no-fallback coverage. + - Keep child-memory, OUT memset, and error-cleanup regressions. + +7. Update docs and PR metadata. + - Keep the main plan consistent with AUTO semantics. + - Remove references that present explicit byte budget as the target API. + - Update PR title/body from docs-only to implementation feature work. + +## Test Plan + +Run focused unit tests first: + +```text +tests/ut/cpp/common/test_temporary_variable_buffer.cpp +tests/ut/cpp/common/test_trb_runtime_temp_buffer.cpp +tests/ut/py/test_chip_worker.py +tests/ut/py/test_worker/test_host_worker.py +``` + +Then run TRB prepared-callable coverage for both architectures where +available: + +```text +a2a3 TRB prepared-callable ST +a5 TRB prepared-callable ST +``` + +Hardware tests must use `task-submit`. 
+ +For performance validation, use Qwen3 Path A: + +- steady decode same-shape run, to confirm warmup then zero temporary-tensor + allocation/free; +- short input and 256-token context prefill cases, to quantify AUTO grow + jitter and p99 impact; +- output lengths that include short and long decode, so growth effects are not + confused with decode kernel timing. + +## Acceptance Criteria + +- No public caller-provided temporary-buffer byte budget remains. +- AUTO starts empty and does not allocate until the first planned run. +- Same-shape repeated runs reuse retained chunks without additional growth. +- Larger later runs grow and then stabilize. +- Smaller later runs do not shrink retained chunks. +- A failed begin rolls back chunks allocated during that begin attempt. +- Grow failure fails clearly and does not fall back to ordinary malloc. +- `retained_bytes_` remains equal to the sum of retained chunk capacities. +- `current_run_used_bytes_` and `high_water_used_bytes_` follow the documented + lifecycle. +- H2D/D2H bytes remain unchanged and explainable. +- Child-memory semantics remain unchanged. +- `worker.malloc()` / `worker.free()` semantics remain unchanged. +- Steady decode allocation/free count drops materially after warmup. +- Prefill growth jitter is measured and reported separately. + +## Risks + +AUTO gives up the explicit-budget fail-fast diagnostic. A true memory-pressure +failure becomes an allocator grow failure, so the error must include requested +tensor bytes, remaining suffix bytes, retained bytes, chunk count, and the +underlying allocator status. + +AUTO can allocate during prefill when sequence length grows. That may add +latency jitter in the same phase where timeout-related failures have been +observed. Prefill must be measured separately from steady decode. + +AUTO does not provide a bounded-memory serving contract. If serving needs a +hard memory cap, that is a separate design from this PR. 
diff --git a/docs/trb-serial-tensor-buffer-pool-plan.md b/docs/trb-serial-tensor-buffer-pool-plan.md new file mode 100644 index 000000000..7eee7bf1c --- /dev/null +++ b/docs/trb-serial-tensor-buffer-pool-plan.md @@ -0,0 +1,592 @@ +# TRB Temporary Variable Buffer Implementation Plan + +**Date**: 2026-06-29 +**Status**: implementation plan + +## Decision + +The target optimization is a runtime-side temporary variable buffer for +ordinary non-child tensors in the `tensormap_and_ringbuffer` path. +This plan uses "temporary variable buffer" for the same concept as +temporary tensor storage. + +The serving constraint is important: + +- Do not change Qwen3 model code or kernel signatures. +- Keep the current hidden input boundary: hidden is still produced on host and + copied H2D for each run. +- Preserve existing child-memory behavior. Device-resident weights, RoPE + tables, LM head, and KV cache are already handled by the caller as + child-memory / `DeviceTensor` style inputs. +- Do not try to infer model-specific maximum tensor sizes inside the runtime. + The serving or runner owner must provide the temporary-buffer memory budget + when enabling this optimization. + +Therefore the near-term optimization is: + +```text +ordinary non-child host tensor + -> acquire device slice from the temporary variable buffer + -> H2D or device memset + -> run + -> D2H if the host still needs the output + -> end the run and make the temporary variable buffer reusable +``` + +This reduces repeated `device_malloc()` / `device_free()` in the hot path. It +does not remove required H2D or D2H copies. + +## Non-Goals + +- Do not convert every non-child tensor into user-visible child-memory. +- Do not add full-bind cross-run overlap. +- Do not allocate a second per-run tensor-buffer set by default. +- Do not add output double buffering for logits. +- Do not add dirty/version tracking for Qwen3 hidden or small metadata tensors. 
+- Do not skip hidden H2D while model/kernel code still expects hidden input. +- Do not add a new env var or macro gate without explicit approval. +- Do not change `worker.malloc()` / `worker.free()` public semantics. +- Do not change copy-back policy in this plan. + +## Current Code Shape + +Current TRB bind does this for every non-child tensor: + +- `src/{a2a3,a5}/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp` + `bind_callable_to_runtime_impl()` +- allocate a fresh device buffer; +- copy input / INOUT tensors H2D; +- memset pure OUT tensors; +- record `TensorPair`. + +Current validate does this: + +- read PTO2 status/header; +- copy OUTPUT / INOUT tensors D2H when `needs_copy_back` is true; +- call `device_free()` for every recorded device pointer; +- clear `tensor_pairs_`. + +Existing child-memory does not need changes: + +- child-memory tensors are passed through directly; +- runtime does not H2D, D2H, or free them; +- ownership stays with the caller. + +## Target Ownership Model + +Replace the current implicit `TensorPair` ownership assumption with an explicit +lease model. + +Current implicit assumption: + +```text +every TensorPair.dev_ptr was allocated by this run +validate must device_free() every TensorPair.dev_ptr +``` + +Target explicit model: + +```cpp +enum class TensorReleaseKind { + Free, + BufferNoop, + ExternalNoop, +}; + +struct TensorLease { + void *host_ptr; + void *dev_ptr; + size_t size; + bool needs_copy_back; + TensorReleaseKind release_kind; +}; +``` + +Expected use: + +- `BufferNoop`: normal non-child TRB temporary-buffer slice. The per-tensor + release is a no-op; the whole buffer becomes reusable at run end. +- `Free`: existing per-run allocation path when the temporary-buffer + optimization is disabled. +- `ExternalNoop`: only for future explicit external device tensors, not needed + for current child-memory pass-through because those tensors are not recorded. 
+ +It is acceptable to keep the member name `tensor_pairs_` temporarily if that +keeps the diff smaller, but the recorded object must carry release ownership. +The preferred cleanup is to rename it to `tensor_leases_` in both TRB runtime +headers. + +## Temporary Buffer Location + +Add the temporary variable buffer to `DeviceRunnerBase`, not to `Runtime`. + +Reason: + +- `DeviceRunnerBase` already owns `MemoryAllocator`. +- it already serializes device allocation/free through `device_mem_mu_`; +- the buffer lifetime should match the worker/device context; +- `Runtime` is per invocation and should only record leases for one run. + +Affected platform bases: + +- `src/common/platform/onboard/host/device_runner_base.{h,cpp}` +- `src/common/platform/sim/host/device_runner_base.{h,cpp}` + +The sim and onboard implementations should expose the same internal methods: + +```cpp +bool configure_temporary_buffer(size_t max_temporary_buffer_bytes); +bool begin_temporary_buffer_run(); +void *acquire_temporary_buffer_slice(size_t bytes, size_t alignment); +void end_temporary_buffer_run(); +void clear_temporary_buffer(); +``` + +The `alignment` argument is internal to the runtime/platform layer. It must +preserve the same or stricter alignment guarantee that callers previously got +from allocating each tensor with `device_malloc()`. The caller does not provide +a new model-specific alignment value. + +The exact error plumbing should follow local runtime style. The logical +contract is that configuration, begin, and acquire failures are observable by +the caller and are not silently ignored. + +`finalize_common()` / sim finalize must clear all retained temporary-buffer +chunks before final allocator teardown. If finalize sees an active +temporary-buffer run, that is a programming error: log it, release retained +chunks, and make the behavior explicit in the implementation contract. 
+ +## Budget Contract + +The runtime should not compute Qwen3-specific maximum temporary-buffer size. +The serving or runner owner that enables this optimization must provide: + +```text +max_temporary_buffer_bytes +``` + +Definition: + +```text +maximum total aligned device bytes required by all ordinary non-child tensor +temporary-buffer allocations in one run_prepared() invocation for this runner +``` + +Rules: + +- The value is an aggregate byte budget, not a tensor count. +- The value must include alignment padding and a safety margin. +- The value covers hidden, small metadata tensors, and output buffers + that are not child-memory. +- Child-memory tensors are not counted. +- If the budget is zero or missing, keep the existing per-run malloc/free path + unless a compatibility rollout explicitly chooses to fail fast. +- If a run needs more than the configured budget, fail with a clear error that + reports required bytes and configured bytes. +- If a positive budget is configured, budget exhaustion is a configuration + error. Do not silently fall back to per-run `device_malloc()` for that tensor + or run. + +This keeps model-shape knowledge in serving code, where `max_batch_size`, +`max_token_num`, hidden size, vocab padding, and metadata shapes are already +known. + +This does not require every application caller to pass a budget on every run. +It means the component that integrates a model runner must provide a positive +runner-scoped budget if it wants this optimization. Without that budget, the +runtime stays on the current allocation/free behavior. + +## Configuration Ingress + +The budget is configured once per runner before the first +temporary-buffer-backed `run_prepared()` call on that runner. The expected +owner is serving or model-runner setup code, after maximum shapes are known. + +Preferred call path: + +```text +Qwen3 serving / runner setup + -> Worker(level=2, max_temporary_buffer_bytes=...) 
+ or Worker.configure_temporary_buffer(bytes) + or ChipWorker.configure_temporary_buffer(bytes) + -> DeviceRunnerBase::configure_temporary_buffer(bytes) +``` + +The implementation must add a concrete configuration entrypoint above +`DeviceRunnerBase`. The `HostApi` begin/acquire/end callbacks consume the +already configured runner buffer; they must not receive or infer the budget +per tensor or per `run_prepared()` call. + +Rules: + +- missing or zero budget disables the temporary-buffer optimization; +- positive budget enables temporary-buffer-backed allocation for TRB + non-child tensors; +- positive budget configuration must allocate retained device chunks before + the first temporary-buffer-backed run; +- reconfiguration is allowed only when no temporary-buffer run is active; +- repeated configuration with the same positive budget should be a no-op; +- `clear()` should run only when disabling the optimization or when the + configured budget changes; +- invalid reconfiguration must fail with a clear error; +- diagnostics must expose the configured budget. + +## Buffer Behavior + +Implement the temporary variable buffer as a per-run bump allocator over +retained device chunks. 
+ +Basic lifecycle: + +```text +configure(max_temporary_buffer_bytes) + if unchanged, keep existing retained chunks + if disabling or changing the budget, clear existing retained chunks + record the budget + allocate retained device chunks up to the configured budget + +begin_run() + reset chunk offsets to zero + caller must ensure no other temporary-buffer-backed run is active + +acquire(bytes, alignment) + align the chunk offset to the device allocation alignment + return the next aligned slice from existing chunks + fail if the slice would exceed max_temporary_buffer_bytes + +end_run() + mark the temporary variable buffer inactive + keep chunks retained for the next run + +clear() + free all retained chunks +``` + +Pseudocode: + +```cpp +void *TemporaryVariableBuffer::acquire(size_t bytes, size_t alignment) { + if (!active_) { + return nullptr; // temporary buffer is disabled or not in a run + } + + if (void *ptr = try_allocate_from_existing_chunks(bytes, alignment)) { + return ptr; + } + + return nullptr; // configured budget or chunk layout is insufficient +} +``` + +`acquire()` must not call `device_malloc()` on the run hot path. Device chunk +allocation happens during configuration, not lazily while binding a run. + +This is "serial" reuse: + +```text +run N uses temporary-buffer slices +validate for run N copies back required outputs +validate ends the temporary-buffer run +run N+1 may reuse the same temporary-buffer memory +``` + +The temporary variable buffer is not a cross-run overlap mechanism. It is +acquired only inside the active `run_prepared()` call. A later run may use it +only after the previous `run_prepared()` reaches validate and ends the buffer +run. + +## Concurrency Assumption + +This plan assumes one active `run_prepared()` lifecycle per runner for +temporary-buffer-backed tensors. 
+ +The implementation does not add: + +- locking around the full bind/run/validate lifecycle; +- active-run guards; +- fallback malloc/free for concurrent binds; +- double buffering. + +If two host threads call `run_prepared()` concurrently on the same runner while +temporary buffering is enabled, behavior is unsupported. The caller or serving +scheduler is responsible for serializing same-runner runs. + +Future same-runner concurrency must add a run-lifecycle mutex, active-run +guard, fallback-to-malloc behavior, or true double buffering. That work is +outside this implementation plan. + +## Segmented Chunks + +The configured budget is an aggregate limit. It should not require one huge +contiguous HBM allocation. + +Preferred implementation: + +```cpp +struct Chunk { + void *base; + size_t capacity; + size_t offset; +}; + +std::vector<Chunk> chunks_; +size_t max_temporary_buffer_bytes_; +``` + +Allocation policy: + +- Support multiple chunks so the implementation does not depend on the largest + contiguous allocatable HBM block. +- Allocate retained chunks during positive-budget configuration. Do not add + chunks lazily from `acquire()` during bind. +- Never let total retained chunk capacity exceed + `max_temporary_buffer_bytes`. +- A tensor slice must be contiguous within one chunk. If a single tensor is + larger than every retained chunk, configuration must create a large-enough + chunk within the same aggregate budget or fail before the run. + +This keeps the hot path deterministic after warmup while avoiding the fragility +of one large allocation when HBM is fragmented by weights, KV cache, runtime +control buffers, or driver allocations. + +## Host API Wiring + +Do not change the public `device_malloc_ctx()` / `device_free_ctx()` APIs. +Those are used by explicit caller-owned device memory and must keep real +malloc/free semantics. 
+ +Instead, expose internal temporary-buffer callbacks through the runtime +`HostApi`: + +```cpp +bool (*begin_temporary_buffer_run)(); +void *(*acquire_temporary_buffer_slice)(size_t size, size_t alignment); +void (*end_temporary_buffer_run)(); +``` + +Wire these callbacks in: + +- `src/common/platform/onboard/host/c_api_shared.cpp` +- `src/common/platform/sim/host/c_api_shared.cpp` + +HostApi compatibility matters. If common platform code initializes HostApi +fields for multiple runtime variants, add the fields consistently to those +HostApi definitions or guard the wiring so non-TRB runtimes compile cleanly. +Only TRB should use the temporary-buffer callbacks in this plan. + +## Bind Path Changes + +In TRB `bind_callable_to_runtime_impl()`: + +1. Begin a temporary-buffer run before processing non-child tensors when the + optimization is enabled. +2. Keep the child-memory branch unchanged. +3. For every non-child tensor, acquire a temporary-buffer slice: + + ```text + if temporary variable buffer is enabled: + dev_ptr = acquire_temporary_buffer_slice(size, alignment) + fail clearly if the configured budget is insufficient + do not fall back to device_malloc() + release_kind = BufferNoop + else: + dev_ptr = device_malloc(size) + release_kind = Free + ``` + +4. Record a `TensorLease` immediately after a device pointer is acquired. + This lets the failure path release every acquired buffer. +5. Preserve current copy behavior: + + - `ArgDirection::OUT`: device memset when available; + - otherwise: H2D copy from host. + +6. Set `needs_copy_back` from the current signature logic. +7. On failure before bind succeeds, release through the recorded release kind + and end the temporary-buffer run exactly once. + +Do not add dirty/version skip logic. Hidden and small metadata still need the +same copy semantics as today. + +## Validate Path Changes + +In TRB `validate_runtime_impl()`: + +1. Keep PTO2 status/header readback behavior. +2. 
Keep D2H copy when `needs_copy_back` is true. +3. Replace unconditional `device_free()` with release dispatch: + + ```text + Free -> device_free(dev_ptr) + BufferNoop -> do nothing for this tensor + ExternalNoop -> do nothing + ``` + +4. Clear the per-run lease vector at the end. +5. End the temporary-buffer run after all copy-back and cleanup decisions. +6. On runtime failure, skip tensor copy-back as today, but still release every + `Free` allocation and end the temporary-buffer run correctly. + +For Qwen3 with host sampling, logits copy-back remains required. The temporary +variable buffer only removes repeated allocation/free around that output +buffer. + +## Bind / Validate Cleanup Contract + +Cleanup ownership must be explicit: + +```text +bind owns cleanup until bind succeeds +validate owns cleanup after bind succeeds +``` + +If `begin_temporary_buffer_run()` succeeds, exactly one matching +`end_temporary_buffer_run()` must run. This applies whether bind, H2D, memset, +run, status readback, D2H, or validation fails. + +Bind should use a local cleanup guard: + +```text +temp_run_active = false + +if temporary buffer is enabled: + begin_temporary_buffer_run() + temp_run_active = true + +for tensor in tensors: + acquire or malloc dev_ptr + record TensorLease immediately + copy or memset + +runtime.temporary_buffer_run_active = temp_run_active +release bind cleanup guard +``` + +Before the cleanup guard is released, bind failure cleanup must: + +- release all recorded `Free` leases with `device_free()`; +- leave `BufferNoop` and `ExternalNoop` tensor leases as per-tensor no-ops; +- end the temporary-buffer run if `temp_run_active` is true; +- clear recorded leases. + +After bind succeeds, validate cleanup must perform the same release dispatch +and end the temporary-buffer run if +`runtime.temporary_buffer_run_active` is true. 
+ +## Copy Behavior + +This plan intentionally keeps data movement behavior unchanged: + +- hidden remains H2D every decode step; +- seq / chunk / block metadata remains H2D when passed as host tensors; +- logits remains D2H for host sampling; +- OUTPUT / INOUT copy-back still follows existing `needs_copy_back` logic; +- PTO2 status/header D2H remains unchanged. + +Any future H2D/D2H avoidance would need a separate correctness contract for +which tensor content is device-resident, host-visible, dirty, or final. That is +not needed for the allocation/free optimization in this plan. + +## Qwen3 Tensor Classification + +With the current model/kernel boundary, treat Qwen3 tensors as follows: + +- weights / RoPE / LM head: existing child-memory / `DeviceTensor`; + do not touch. +- KV cache: existing child-memory / `DeviceTensor`; do not touch. +- hidden: ordinary non-child tensor; temporary-buffer slice; keep H2D. +- seq / chunk metadata: ordinary non-child tensor; temporary-buffer slice; + keep H2D. +- block_table / slot_mapping: ordinary non-child tensor; temporary-buffer + slice; keep H2D. +- logits: ordinary output tensor; temporary-buffer slice; keep D2H. + +The hidden H2D copy cannot be removed without changing the model/kernel +boundary to accept token ids and perform embedding lookup on device. That is +out of scope for this plan. + +## Logging And Metrics + +Add lightweight temporary-buffer counters, preferably exposed only through +debug logs or existing diagnostics: + +- configured temporary-buffer budget; +- retained chunk count; +- retained chunk bytes; +- current run used bytes; +- high-water used bytes; +- buffer-backed allocation count; +- `Free` allocation count; +- budget-exceeded count. + +Do not add a new behavior env var for this. If rollout needs a gate, ask for +explicit approval and document the default before adding it. + +Keep hot-path per-tensor logs out of `LOG_INFO_V0`. 
Use debug or aggregate +summary logs so performance runs are not perturbed. + +## Tests + +Minimum focused tests: + +- buffer unit test: begin, allocate several aligned slices, end, and next run + reuses the same base memory; +- buffer unit test: configured budget is enforced with a clear error; +- buffer unit test: segmented chunks work when one chunk cannot satisfy the + aggregate budget; +- buffer unit test: finalize frees retained chunks exactly once; +- runtime bind/validate test with a fake `HostApi`: repeated run records fewer + allocator calls while preserving H2D/D2H counts; +- child-memory regression: child-memory tensor is still pass-through and is not + recorded for temporary-buffer release; +- OUT tensor regression: pure OUT still receives device memset before run; +- error-path regression: failed copy or failed run releases every `Free` + allocation and ends the temporary-buffer run exactly once. + +Recommended integration checks: + +- existing prepared-callable ST for a2a3 TRB; +- existing prepared-callable ST for a5 TRB; +- qwen3 steady decode benchmark before/after with quiet logs. + +Benchmark and correctness validation must use the supported single-active-run +usage model. Hardware runs must still use `task-submit` when available. 
+ +## Acceptance Criteria + +Accept the implementation only if all of these hold: + +- correctness tests are unchanged; +- no change to public child-memory semantics; +- no change to public `worker.malloc/free` semantics; +- the optimization requires a caller-provided `max_temporary_buffer_bytes` + and does not compute Qwen3 shape maxima inside the runtime; +- if a run exceeds the configured temporary-buffer budget, the error reports + required bytes and configured bytes; +- after warmup, steady decode `device_malloc` / `device_free` calls for + non-child temporary-buffer allocation drop materially; +- H2D/D2H bytes remain explainable and do not silently disappear; +- retained temporary-buffer HBM is bounded by `max_temporary_buffer_bytes`; +- live allocation count does not grow across repeated steady decode; +- `host_wall` improves by at least 1 ms or 5 percent on the target workload, or + allocator timing shows the expected reduction even if end-to-end impact is + smaller; +- `device_wall` and device-log `Total` do not regress by more than 1 percent; +- p99 latency does not regress by more than 5 percent. + +## Deferred Work + +These ideas are not part of this implementation: + +- dirty/version contracts for ordinary host tensors; +- skipping hidden H2D; +- device-side embedding lookup; +- device-side sampling; +- output copy-back elimination for logits; +- cross-run full-bind overlap; +- full tensor double buffering; +- runtime arena double buffering; +- AICPU init/teardown overlap. + +They can be revisited only if measurements show the temporary variable buffer +no longer addresses the dominant host-side overhead. diff --git a/python/bindings/task_interface.cpp b/python/bindings/task_interface.cpp index 43c2e6e9c..15a240058 100644 --- a/python/bindings/task_interface.cpp +++ b/python/bindings/task_interface.cpp @@ -917,6 +917,10 @@ NB_MODULE(_task_interface, m) { "host_build_graph variants. 
Mirrors aicpu_dlopen_count for the " "host-orchestration path; 0 on device-orch variants." ) + .def( + "configure_temporary_buffer_auto", &ChipWorker::configure_temporary_buffer_auto, nb::arg("enabled") = true, + "Enable or disable the runner-scoped TRB AUTO temporary variable buffer." + ) .def("malloc", &ChipWorker::malloc, nb::arg("size")) .def("free", &ChipWorker::free, nb::arg("ptr")) .def("copy_to", &ChipWorker::copy_to, nb::arg("dst"), nb::arg("src"), nb::arg("size")) diff --git a/python/simpler/task_interface.py b/python/simpler/task_interface.py index b29e24911..1673f08ca 100644 --- a/python/simpler/task_interface.py +++ b/python/simpler/task_interface.py @@ -1192,6 +1192,10 @@ def host_dlopen_count(self): """Number of host-side orch SO dlopens (host_build_graph variants).""" return self._impl.host_dlopen_count + def configure_temporary_buffer_auto(self, enabled: bool = True) -> None: + """Enable or disable the runner-scoped TRB AUTO temporary variable buffer.""" + self._impl.configure_temporary_buffer_auto(bool(enabled)) + def malloc(self, size): """Allocate memory. Returns a pointer (uint64).""" return int(self._impl.malloc(int(size))) diff --git a/python/simpler/worker.py b/python/simpler/worker.py index a8c56ae82..fceb4582b 100644 --- a/python/simpler/worker.py +++ b/python/simpler/worker.py @@ -267,6 +267,22 @@ class _CallableRegistration: eligible_worker_ids: tuple[int, ...] 
= () +@dataclass(frozen=True) +class _ChipProcessConfig: + log_level: int = 1 + log_info_v: int = 5 + platform: str = "" + runtime: str = "" + temporary_buffer_mode: str = "off" + + +def _normalize_temporary_buffer_mode(mode: Any) -> str: + normalized = str(mode).lower() + if normalized not in ("off", "auto"): + raise ValueError("temporary_buffer_mode must be 'off' or 'auto'") + return normalized + + @dataclass(frozen=True) class RemoteCallable: """Import-path descriptor for a parent-facing remote L3 callable.""" @@ -1148,10 +1164,7 @@ def _chip_process_loop( registry: dict[int, Any], identity_table: dict[bytes, int], identity_refs: dict[bytes, int], - log_level: int = 1, - log_info_v: int = 5, - platform: str = "", - runtime: str = "", + config: _ChipProcessConfig | None = None, ) -> None: """Runs in forked child process. Loads host_runtime.so in own address space. @@ -1164,9 +1177,14 @@ def _chip_process_loop( """ import traceback as _tb # noqa: PLC0415 + if config is None: + config = _ChipProcessConfig() + try: cw = ChipWorker() - cw.init(device_id, bins, log_level=log_level, log_info_v=log_info_v) + cw.init(device_id, bins, log_level=config.log_level, log_info_v=config.log_info_v) + if _normalize_temporary_buffer_mode(config.temporary_buffer_mode) == "auto": + cw.configure_temporary_buffer_auto(True) except Exception as e: _tb.print_exc() # Write the message so any parent reader that *does* inspect this @@ -1196,8 +1214,8 @@ def _chip_process_loop( registry, identity_table, identity_refs, - chip_platform=platform, - chip_runtime=runtime, + chip_platform=config.platform, + chip_runtime=config.runtime, ) finally: cw.finalize() @@ -1360,6 +1378,9 @@ def __init__( **config, ) -> None: self.level = level + if "max_temporary_buffer_bytes" in config: + raise ValueError("max_temporary_buffer_bytes has been removed; use temporary_buffer_mode='auto'") + config["temporary_buffer_mode"] = _normalize_temporary_buffer_mode(config.get("temporary_buffer_mode", "off")) 
self._config = config self._callable_registry: dict[int, Any] = {} self._identity_registry: dict[bytes, _CallableIdentityState] = {} @@ -2875,6 +2896,8 @@ def _init_level2(self) -> None: self._chip_worker = ChipWorker() self._chip_worker.init(device_id, binaries) + if _normalize_temporary_buffer_mode(self._config.get("temporary_buffer_mode", "off")) == "auto": + self._chip_worker.configure_temporary_buffer_auto(True) # Pre-warm any registered ChipCallable so the first run(handle, …) # does not pay the H2D upload cost. @@ -2887,6 +2910,7 @@ def _init_hierarchical(self) -> None: device_ids = self._config.get("device_ids", []) n_sub = self._config.get("num_sub_workers", 0) heap_ring_size = self._config.get("heap_ring_size", None) + _normalize_temporary_buffer_mode(self._config.get("temporary_buffer_mode", "off")) if self.level >= 4 and device_ids: raise RuntimeError("Worker level >= 4 must use add_worker(); device_ids are only supported on L3 Workers") @@ -2970,6 +2994,7 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l """Fork child processes and start C++ scheduler. Called on first run().""" device_ids = self._config.get("device_ids", []) n_sub = self._config.get("num_sub_workers", 0) + temporary_buffer_mode = _normalize_temporary_buffer_mode(self._config.get("temporary_buffer_mode", "off")) try: # Fork children from an immutable snapshot. 
The state transition @@ -3025,10 +3050,13 @@ def _start_hierarchical(self) -> None: # noqa: PLR0912 -- three parallel fork l callable_kind="CHIP_CALLABLE", target_namespace="LOCAL_CHIP", ), - chip_log_level, - chip_log_info_v, - str(self._config["platform"]), - str(self._config["runtime"]), + _ChipProcessConfig( + log_level=chip_log_level, + log_info_v=chip_log_info_v, + platform=str(self._config["platform"]), + runtime=str(self._config["runtime"]), + temporary_buffer_mode=temporary_buffer_mode, + ), ) os._exit(0) else: @@ -3786,6 +3814,18 @@ def copy_from(self, dst: int, src: int, size: int, worker_id: int = 0) -> None: assert self._orch is not None self._orch.copy_from(worker_id, dst, src, size) + def configure_temporary_buffer_auto(self, enabled: bool = True) -> None: + """Enable or disable the TRB AUTO temporary variable buffer.""" + if self.level not in (2, 3): + raise NotImplementedError("Worker.configure_temporary_buffer_auto supports level 2 and level 3 only") + if self.level == 3 and self._hierarchical_start_state == "started": + raise RuntimeError( + "Worker.configure_temporary_buffer_auto for level 3 must be called before hierarchy startup" + ) + self._config["temporary_buffer_mode"] = "auto" if enabled else "off" + if self._chip_worker is not None: + self._chip_worker.configure_temporary_buffer_auto(enabled) + # ------------------------------------------------------------------ # run — uniform entry point # ------------------------------------------------------------------ @@ -3891,6 +3931,11 @@ def host_dlopen_count(self) -> int: return 0 return self._chip_worker.host_dlopen_count + @property + def temporary_buffer_mode(self) -> str: + """Configured TRB temporary-buffer mode: ``off`` or ``auto``.""" + return _normalize_temporary_buffer_mode(self._config.get("temporary_buffer_mode", "off")) + # ------------------------------------------------------------------ # close # ------------------------------------------------------------------ diff --git 
a/src/a2a3/platform/sim/host/device_runner.cpp b/src/a2a3/platform/sim/host/device_runner.cpp index c8e0929b6..cb85663c2 100644 --- a/src/a2a3/platform/sim/host/device_runner.cpp +++ b/src/a2a3/platform/sim/host/device_runner.cpp @@ -683,6 +683,7 @@ int DeviceRunner::finalize() { gm_heap_arena_.release(); gm_sm_arena_.release(); runtime_arena_pool_.release(); + clear_temporary_buffer(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index b22cfcfaa..27b8c1972 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -51,6 +51,8 @@ #include "common/strace.h" #include "common/unified_log.h" #include "host/platform_compile_info.h" +#include "host/raii_scope_guard.h" +#include "host/temporary_variable_buffer.h" #include "utils/device_arena.h" #include "prepare_callable_common.h" @@ -267,6 +269,43 @@ static int32_t pto2_read_runtime_status(Runtime *runtime, const HostApi *api, PT return runtime_status_from_error_codes(orch_error_code, sched_error_code); } +static void release_tensor_leases(Runtime *runtime, const HostApi *api) { + int freed = 0; + int buffer_noop = 0; + int external_noop = 0; + for (TensorLease &lease : runtime->tensor_leases_) { + if (lease.dev_ptr == nullptr) { + continue; + } + switch (lease.release_kind) { + case TensorReleaseKind::Free: + api->device_free(lease.dev_ptr); + ++freed; + break; + case TensorReleaseKind::BufferNoop: + ++buffer_noop; + break; + case TensorReleaseKind::ExternalNoop: + ++external_noop; + break; + } + } + LOG_DEBUG("Released tensor leases: freed=%d buffer_noop=%d external_noop=%d", freed, buffer_noop, external_noop); + runtime->tensor_leases_.clear(); +} + +static void end_temporary_buffer_run_if_active(const HostApi *api, bool &active) { + if (!active) { 
+ return; + } + if (api->end_temporary_buffer_run == nullptr) { + LOG_ERROR("Temporary buffer run is active but end_temporary_buffer_run is not wired"); + } else { + api->end_temporary_buffer_run(); + } + active = false; +} + /** * Stage the per-callable resources (kernel binaries + orchestration SO) into * the supplied runtime so a subsequent bind_callable_to_runtime_impl can use @@ -334,7 +373,7 @@ struct ArenaStaticSizes { }; // Device pointers to the per-Worker static pools that DeviceRunner keeps alive -// across runs (freed in DeviceRunner::finalize(), never in tensor_pairs_). +// across runs (freed in DeviceRunner::finalize(), never in tensor_leases_). struct StaticArenaPtrs { void *gm_heap; void *gm_sm; @@ -412,15 +451,29 @@ static bool derive_arena_static_sizes(const ArenaSizingConfig &sizing, ArenaStat return true; } +static void +build_temporary_buffer_plan(const ChipStorageTaskArgs *orch_args, std::vector *out) { + out->clear(); + int tensor_count = orch_args->tensor_count(); + out->reserve(tensor_count); + for (int i = 0; i < tensor_count; i++) { + Tensor t = orch_args->tensor(i); + if (t.is_child_memory() || t.nbytes() == 0) { + continue; + } + out->push_back({static_cast(t.nbytes()), TemporaryVariableBuffer::kTemporaryBufferAlignment}); + } +} + // per-run: the only signature-aware step. Copy the orch args, replacing each // host tensor pointer with a freshly staged device pointer (H2D copy-in, or an // on-device zero for pure-OUTPUT buffers), and record the host/device pair for // copy-back. Read-only INPUT tensors skip copy-back. On failure the partially -// staged device_args / tensor_pairs_ stay owned by the caller's Runtime, which +// staged device_args / tensor_leases_ stay owned by the caller's Runtime, which // frees them in validate_runtime_impl. 
static bool stage_device_args( Runtime *runtime, const HostApi *api, const ChipStorageTaskArgs *orch_args, const ArgDirection *signature, - int sig_count, ChipStorageTaskArgs *out + int sig_count, bool use_temporary_buffer, ChipStorageTaskArgs *out ) { int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); @@ -438,8 +491,24 @@ static bool stage_device_args( void *host_ptr = reinterpret_cast(static_cast(t.buffer.addr)); size_t size = static_cast(t.nbytes()); + if (size == 0) { + t.buffer.addr = 0; + out->add_tensor(t); + continue; + } - void *dev_ptr = api->device_malloc(size); + void *dev_ptr = nullptr; + TensorReleaseKind release_kind = TensorReleaseKind::Free; + if (use_temporary_buffer) { + dev_ptr = api->acquire_temporary_buffer_slice(size, TemporaryVariableBuffer::kTemporaryBufferAlignment); + release_kind = TensorReleaseKind::BufferNoop; + if (dev_ptr == nullptr) { + LOG_ERROR("AUTO temporary buffer acquire failed for tensor %d: tensor bytes=%zu", i, size); + return false; + } + } else { + dev_ptr = api->device_malloc(size); + } if (dev_ptr == nullptr) { LOG_ERROR("Failed to allocate device memory for tensor %d", i); return false; @@ -460,7 +529,9 @@ static bool stage_device_args( } if (rc != 0) { LOG_ERROR("Failed to stage tensor %d to device", i); - api->device_free(dev_ptr); + if (release_kind == TensorReleaseKind::Free) { + api->device_free(dev_ptr); + } return false; } // Read-only INPUT tensors are never written by the kernel, so there is @@ -470,7 +541,7 @@ static bool stage_device_args( // tensor entries). Anything not provably IN keeps the safe default of // copying back. 
bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN); - runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back}); + runtime->tensor_leases_.push_back({host_ptr, dev_ptr, size, needs_copy_back, release_kind}); LOG_INFO_V0(" Tensor %d: %zu bytes at %p", i, size, dev_ptr); t.buffer.addr = reinterpret_cast(dev_ptr); @@ -694,6 +765,8 @@ extern "C" int bind_callable_to_runtime_impl( int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + runtime->tensor_leases_.clear(); + runtime->temporary_buffer_run_active_ = false; int64_t t_total_start = _now_ms(); @@ -702,8 +775,36 @@ extern "C" int bind_callable_to_runtime_impl( return -1; } + bool use_temporary_buffer = api->temporary_buffer_enabled != nullptr && api->temporary_buffer_enabled(); + if (use_temporary_buffer && + (api->begin_temporary_buffer_run == nullptr || api->acquire_temporary_buffer_slice == nullptr || + api->end_temporary_buffer_run == nullptr)) { + LOG_ERROR("AUTO temporary buffer is enabled but HostApi temporary-buffer callbacks are not wired"); + return -1; + } + + std::vector temporary_buffer_plan; + bool temp_run_active = false; + if (use_temporary_buffer) { + build_temporary_buffer_plan(orch_args, &temporary_buffer_plan); + const TemporaryBufferPlanItem *plan_data = + temporary_buffer_plan.empty() ? 
nullptr : temporary_buffer_plan.data(); + if (!api->begin_temporary_buffer_run(plan_data, temporary_buffer_plan.size())) { + LOG_ERROR("Failed to begin temporary buffer run"); + return -1; + } + temp_run_active = true; + runtime->temporary_buffer_run_active_ = true; + } + + auto bind_cleanup = RAIIScopeGuard([&]() { + release_tensor_leases(runtime, api); + end_temporary_buffer_run_if_active(api, temp_run_active); + runtime->temporary_buffer_run_active_ = false; + }); + ChipStorageTaskArgs device_args; - if (!stage_device_args(runtime, api, orch_args, signature, sig_count, &device_args)) { + if (!stage_device_args(runtime, api, orch_args, signature, sig_count, use_temporary_buffer, &device_args)) { return -1; } @@ -748,6 +849,8 @@ extern "C" int bind_callable_to_runtime_impl( LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); + runtime->temporary_buffer_run_active_ = temp_run_active; + bind_cleanup.dismiss(); return 0; } @@ -756,8 +859,8 @@ extern "C" int bind_callable_to_runtime_impl( * * This function: * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 2. Releases recorded tensor leases + * 3. 
Clears tensor lease state * * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -777,10 +880,10 @@ extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) { LOG_INFO_V0("=== Copying Results Back to Host ==="); // Copy all recorded tensors from device back to host - TensorPair *tensor_pairs = runtime->tensor_pairs_.data(); - int tensor_pair_count = static_cast(runtime->tensor_pairs_.size()); + TensorLease *tensor_leases = runtime->tensor_leases_.data(); + int tensor_lease_count = static_cast(runtime->tensor_leases_.size()); - LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count); + LOG_INFO_V0("Tensor leases to process: %d", tensor_lease_count); // PTO2 (device orchestration): graph output may be in packed buffer uint64_t graph_out_ptr = 0; @@ -829,31 +932,31 @@ extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) { LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status"); } else { bool first_output_tensor = true; - for (int i = 0; i < tensor_pair_count; i++) { - const TensorPair &pair = tensor_pairs[i]; + for (int i = 0; i < tensor_lease_count; i++) { + const TensorLease &lease = tensor_leases[i]; // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { + if (lease.dev_ptr == nullptr) { LOG_WARN("Tensor %d has null device pointer, skipping", i); continue; } // If host pointer is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { + if (lease.host_ptr == nullptr) { LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i); continue; } // Read-only INPUT tensors were uploaded H2D but the kernel never // wrote them — copying them back (potentially ~GB) is pure waste. - // They are still device_free'd in the cleanup loop below. - if (!pair.needs_copy_back) { + // They are still released through release_kind below. 
+ if (!lease.needs_copy_back) { LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i); continue; } - void *src_ptr = pair.dev_ptr; - size_t copy_size = pair.size; + void *src_ptr = lease.dev_ptr; + size_t copy_size = lease.size; // Use graph_output_ptr for the first output tensor if available if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { @@ -863,27 +966,20 @@ extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) { first_output_tensor = false; } - int copy_rc = api->copy_from_device(pair.host_ptr, src_ptr, copy_size); + int copy_rc = api->copy_from_device(lease.host_ptr, src_ptr, copy_size); if (copy_rc != 0) { LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); rc = copy_rc; } else { - LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, pair.size); + LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, lease.size); } } } // Cleanup device tensors LOG_INFO_V0("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - api->device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO_V0("Freed %d device allocations", tensor_pair_count); - - // Clear tensor pairs - runtime->tensor_pairs_.clear(); + release_tensor_leases(runtime, api); + end_temporary_buffer_run_if_active(api, runtime->temporary_buffer_run_active_); LOG_INFO_V0("=== Finalize Complete ==="); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index 036d69b22..97442cff3 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -19,7 +19,7 @@ * defined in tensor.h. * * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h - * without type conflicts (Handshake, TensorPair, HostApi). + * without type conflicts (Handshake, TensorLease, HostApi). 
*/ #ifndef SRC_A2A3_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 6772aa5e2..22b2ca222 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -103,11 +103,16 @@ struct Handshake { volatile uint32_t aicore_regs_ready; // AICore ID reported: 0=pending, 1=done } __attribute__((aligned(64))); +enum class TensorReleaseKind { + Free, + BufferNoop, + ExternalNoop, +}; + /** - * Tensor pair for tracking host-device memory mappings. - * Used for copy-back during finalize. + * Tensor lease for tracking host-device memory mappings and release ownership. */ -struct TensorPair { +struct TensorLease { void *host_ptr; void *dev_ptr; size_t size; @@ -115,6 +120,7 @@ struct TensorPair { // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown // keep the safe default of copying back. bool needs_copy_back = true; + TensorReleaseKind release_kind = TensorReleaseKind::Free; }; /** @@ -301,7 +307,8 @@ class Runtime { // Host-side tensor ledger for D2H copy-back at finalize. Populated by // runtime_maker.cpp from orch_args at bind time, then iterated in // validate_runtime_impl. Host-only (after `dev`): never uploaded. - std::vector tensor_pairs_; + std::vector tensor_leases_; + bool temporary_buffer_run_active_ = false; }; // `dev` must be the first member so the narrowed H2D copy starts at offset 0. 
diff --git a/src/a5/platform/sim/host/device_runner.cpp b/src/a5/platform/sim/host/device_runner.cpp index 81ea8ea14..be935397b 100644 --- a/src/a5/platform/sim/host/device_runner.cpp +++ b/src/a5/platform/sim/host/device_runner.cpp @@ -652,6 +652,7 @@ int DeviceRunner::finalize() { gm_heap_arena_.release(); gm_sm_arena_.release(); runtime_arena_pool_.release(); + clear_temporary_buffer(); cached_gm_heap_size_ = 0; cached_gm_sm_size_ = 0; cached_runtime_arena_size_ = 0; diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index b22cfcfaa..27b8c1972 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -51,6 +51,8 @@ #include "common/strace.h" #include "common/unified_log.h" #include "host/platform_compile_info.h" +#include "host/raii_scope_guard.h" +#include "host/temporary_variable_buffer.h" #include "utils/device_arena.h" #include "prepare_callable_common.h" @@ -267,6 +269,43 @@ static int32_t pto2_read_runtime_status(Runtime *runtime, const HostApi *api, PT return runtime_status_from_error_codes(orch_error_code, sched_error_code); } +static void release_tensor_leases(Runtime *runtime, const HostApi *api) { + int freed = 0; + int buffer_noop = 0; + int external_noop = 0; + for (TensorLease &lease : runtime->tensor_leases_) { + if (lease.dev_ptr == nullptr) { + continue; + } + switch (lease.release_kind) { + case TensorReleaseKind::Free: + api->device_free(lease.dev_ptr); + ++freed; + break; + case TensorReleaseKind::BufferNoop: + ++buffer_noop; + break; + case TensorReleaseKind::ExternalNoop: + ++external_noop; + break; + } + } + LOG_DEBUG("Released tensor leases: freed=%d buffer_noop=%d external_noop=%d", freed, buffer_noop, external_noop); + runtime->tensor_leases_.clear(); +} + +static void end_temporary_buffer_run_if_active(const HostApi *api, bool &active) { + if (!active) { + 
return; + } + if (api->end_temporary_buffer_run == nullptr) { + LOG_ERROR("Temporary buffer run is active but end_temporary_buffer_run is not wired"); + } else { + api->end_temporary_buffer_run(); + } + active = false; +} + /** * Stage the per-callable resources (kernel binaries + orchestration SO) into * the supplied runtime so a subsequent bind_callable_to_runtime_impl can use @@ -334,7 +373,7 @@ struct ArenaStaticSizes { }; // Device pointers to the per-Worker static pools that DeviceRunner keeps alive -// across runs (freed in DeviceRunner::finalize(), never in tensor_pairs_). +// across runs (freed in DeviceRunner::finalize(), never in tensor_leases_). struct StaticArenaPtrs { void *gm_heap; void *gm_sm; @@ -412,15 +451,29 @@ static bool derive_arena_static_sizes(const ArenaSizingConfig &sizing, ArenaStat return true; } +static void +build_temporary_buffer_plan(const ChipStorageTaskArgs *orch_args, std::vector *out) { + out->clear(); + int tensor_count = orch_args->tensor_count(); + out->reserve(tensor_count); + for (int i = 0; i < tensor_count; i++) { + Tensor t = orch_args->tensor(i); + if (t.is_child_memory() || t.nbytes() == 0) { + continue; + } + out->push_back({static_cast(t.nbytes()), TemporaryVariableBuffer::kTemporaryBufferAlignment}); + } +} + // per-run: the only signature-aware step. Copy the orch args, replacing each // host tensor pointer with a freshly staged device pointer (H2D copy-in, or an // on-device zero for pure-OUTPUT buffers), and record the host/device pair for // copy-back. Read-only INPUT tensors skip copy-back. On failure the partially -// staged device_args / tensor_pairs_ stay owned by the caller's Runtime, which +// staged device_args / tensor_leases_ stay owned by the caller's Runtime, which // frees them in validate_runtime_impl. 
static bool stage_device_args( Runtime *runtime, const HostApi *api, const ChipStorageTaskArgs *orch_args, const ArgDirection *signature, - int sig_count, ChipStorageTaskArgs *out + int sig_count, bool use_temporary_buffer, ChipStorageTaskArgs *out ) { int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); @@ -438,8 +491,24 @@ static bool stage_device_args( void *host_ptr = reinterpret_cast(static_cast(t.buffer.addr)); size_t size = static_cast(t.nbytes()); + if (size == 0) { + t.buffer.addr = 0; + out->add_tensor(t); + continue; + } - void *dev_ptr = api->device_malloc(size); + void *dev_ptr = nullptr; + TensorReleaseKind release_kind = TensorReleaseKind::Free; + if (use_temporary_buffer) { + dev_ptr = api->acquire_temporary_buffer_slice(size, TemporaryVariableBuffer::kTemporaryBufferAlignment); + release_kind = TensorReleaseKind::BufferNoop; + if (dev_ptr == nullptr) { + LOG_ERROR("AUTO temporary buffer acquire failed for tensor %d: tensor bytes=%zu", i, size); + return false; + } + } else { + dev_ptr = api->device_malloc(size); + } if (dev_ptr == nullptr) { LOG_ERROR("Failed to allocate device memory for tensor %d", i); return false; @@ -460,7 +529,9 @@ static bool stage_device_args( } if (rc != 0) { LOG_ERROR("Failed to stage tensor %d to device", i); - api->device_free(dev_ptr); + if (release_kind == TensorReleaseKind::Free) { + api->device_free(dev_ptr); + } return false; } // Read-only INPUT tensors are never written by the kernel, so there is @@ -470,7 +541,7 @@ static bool stage_device_args( // tensor entries). Anything not provably IN keeps the safe default of // copying back. 
bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN); - runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back}); + runtime->tensor_leases_.push_back({host_ptr, dev_ptr, size, needs_copy_back, release_kind}); LOG_INFO_V0(" Tensor %d: %zu bytes at %p", i, size, dev_ptr); t.buffer.addr = reinterpret_cast(dev_ptr); @@ -694,6 +765,8 @@ extern "C" int bind_callable_to_runtime_impl( int tensor_count = orch_args->tensor_count(); int scalar_count = orch_args->scalar_count(); LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count); + runtime->tensor_leases_.clear(); + runtime->temporary_buffer_run_active_ = false; int64_t t_total_start = _now_ms(); @@ -702,8 +775,36 @@ extern "C" int bind_callable_to_runtime_impl( return -1; } + bool use_temporary_buffer = api->temporary_buffer_enabled != nullptr && api->temporary_buffer_enabled(); + if (use_temporary_buffer && + (api->begin_temporary_buffer_run == nullptr || api->acquire_temporary_buffer_slice == nullptr || + api->end_temporary_buffer_run == nullptr)) { + LOG_ERROR("AUTO temporary buffer is enabled but HostApi temporary-buffer callbacks are not wired"); + return -1; + } + + std::vector temporary_buffer_plan; + bool temp_run_active = false; + if (use_temporary_buffer) { + build_temporary_buffer_plan(orch_args, &temporary_buffer_plan); + const TemporaryBufferPlanItem *plan_data = + temporary_buffer_plan.empty() ? 
nullptr : temporary_buffer_plan.data(); + if (!api->begin_temporary_buffer_run(plan_data, temporary_buffer_plan.size())) { + LOG_ERROR("Failed to begin temporary buffer run"); + return -1; + } + temp_run_active = true; + runtime->temporary_buffer_run_active_ = true; + } + + auto bind_cleanup = RAIIScopeGuard([&]() { + release_tensor_leases(runtime, api); + end_temporary_buffer_run_if_active(api, temp_run_active); + runtime->temporary_buffer_run_active_ = false; + }); + ChipStorageTaskArgs device_args; - if (!stage_device_args(runtime, api, orch_args, signature, sig_count, &device_args)) { + if (!stage_device_args(runtime, api, orch_args, signature, sig_count, use_temporary_buffer, &device_args)) { return -1; } @@ -748,6 +849,8 @@ extern "C" int bind_callable_to_runtime_impl( LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start); LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start); + runtime->temporary_buffer_run_active_ = temp_run_active; + bind_cleanup.dismiss(); return 0; } @@ -756,8 +859,8 @@ extern "C" int bind_callable_to_runtime_impl( * * This function: * 1. Copies recorded tensors from device back to host - * 2. Frees device memory for recorded tensors - * 3. Clears tensor pair state + * 2. Releases recorded tensor leases + * 3. 
Clears tensor lease state * * @param runtime Pointer to Runtime * @return 0 on success, -1 on failure @@ -777,10 +880,10 @@ extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) { LOG_INFO_V0("=== Copying Results Back to Host ==="); // Copy all recorded tensors from device back to host - TensorPair *tensor_pairs = runtime->tensor_pairs_.data(); - int tensor_pair_count = static_cast(runtime->tensor_pairs_.size()); + TensorLease *tensor_leases = runtime->tensor_leases_.data(); + int tensor_lease_count = static_cast(runtime->tensor_leases_.size()); - LOG_INFO_V0("Tensor pairs to process: %d", tensor_pair_count); + LOG_INFO_V0("Tensor leases to process: %d", tensor_lease_count); // PTO2 (device orchestration): graph output may be in packed buffer uint64_t graph_out_ptr = 0; @@ -829,31 +932,31 @@ extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) { LOG_WARN("Skipping tensor copy-back because PTO2 runtime reported fatal status"); } else { bool first_output_tensor = true; - for (int i = 0; i < tensor_pair_count; i++) { - const TensorPair &pair = tensor_pairs[i]; + for (int i = 0; i < tensor_lease_count; i++) { + const TensorLease &lease = tensor_leases[i]; // Skip if device pointer is null - if (pair.dev_ptr == nullptr) { + if (lease.dev_ptr == nullptr) { LOG_WARN("Tensor %d has null device pointer, skipping", i); continue; } // If host pointer is null, this is a device-only allocation (no copy-back) - if (pair.host_ptr == nullptr) { + if (lease.host_ptr == nullptr) { LOG_INFO_V0("Tensor %d: device-only allocation (no copy-back)", i); continue; } // Read-only INPUT tensors were uploaded H2D but the kernel never // wrote them — copying them back (potentially ~GB) is pure waste. - // They are still device_free'd in the cleanup loop below. - if (!pair.needs_copy_back) { + // They are still released through release_kind below. 
+ if (!lease.needs_copy_back) { LOG_INFO_V0("Tensor %d: read-only input, skipping copy-back", i); continue; } - void *src_ptr = pair.dev_ptr; - size_t copy_size = pair.size; + void *src_ptr = lease.dev_ptr; + size_t copy_size = lease.size; // Use graph_output_ptr for the first output tensor if available if (first_output_tensor && graph_out_ptr != 0 && graph_out_size > 0) { @@ -863,27 +966,20 @@ extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api) { first_output_tensor = false; } - int copy_rc = api->copy_from_device(pair.host_ptr, src_ptr, copy_size); + int copy_rc = api->copy_from_device(lease.host_ptr, src_ptr, copy_size); if (copy_rc != 0) { LOG_ERROR("Failed to copy tensor %d from device: %d", i, copy_rc); rc = copy_rc; } else { - LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, pair.size); + LOG_INFO_V0("Tensor %d: %zu bytes copied to host", i, lease.size); } } } // Cleanup device tensors LOG_INFO_V0("=== Cleaning Up ==="); - for (int i = 0; i < tensor_pair_count; i++) { - if (tensor_pairs[i].dev_ptr != nullptr) { - api->device_free(tensor_pairs[i].dev_ptr); - } - } - LOG_INFO_V0("Freed %d device allocations", tensor_pair_count); - - // Clear tensor pairs - runtime->tensor_pairs_.clear(); + release_tensor_leases(runtime, api); + end_temporary_buffer_run_if_active(api, runtime->temporary_buffer_run_active_); LOG_INFO_V0("=== Finalize Complete ==="); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h index 821d6ce3a..391d6c96b 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_types.h @@ -19,7 +19,7 @@ * defined in tensor.h. * * This header is independent of orch_build_graph_runtime.h to allow inclusion from runtime.h - * without type conflicts (Handshake, TensorPair, HostApi). + * without type conflicts (Handshake, TensorLease, HostApi). 
*/ #ifndef SRC_A5_RUNTIME_TENSORMAP_AND_RINGBUFFER_RUNTIME_PTO_TYPES_H_ diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index fd16f3d49..0363622b5 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -111,11 +111,16 @@ struct Handshake { volatile uint32_t aicore_regs_ready; // AICore ID reported: 0=pending, 1=done } __attribute__((aligned(64))); +enum class TensorReleaseKind { + Free, + BufferNoop, + ExternalNoop, +}; + /** - * Tensor pair for tracking host-device memory mappings. - * Used for copy-back during finalize. + * Tensor lease for tracking host-device memory mappings and release ownership. */ -struct TensorPair { +struct TensorLease { void *host_ptr; void *dev_ptr; size_t size; @@ -123,6 +128,7 @@ struct TensorPair { // so the end-of-run D2H copy-back is skipped. OUTPUT/INOUT/unknown // keep the safe default of copying back. bool needs_copy_back = true; + TensorReleaseKind release_kind = TensorReleaseKind::Free; }; /** @@ -315,7 +321,8 @@ class Runtime { // Host-side tensor ledger for D2H copy-back at finalize. Populated by // runtime_maker.cpp from orch_args at bind time, then iterated in // validate_runtime_impl. Host-only (after `dev`): never uploaded. - std::vector tensor_pairs_; + std::vector tensor_leases_; + bool temporary_buffer_run_active_ = false; }; // `dev` must be the first member so the narrowed H2D copy starts at offset 0. diff --git a/src/common/platform/include/common/host_api.h b/src/common/platform/include/common/host_api.h index 89c482d13..68434b1ec 100644 --- a/src/common/platform/include/common/host_api.h +++ b/src/common/platform/include/common/host_api.h @@ -14,6 +14,11 @@ #include #include +struct TemporaryBufferPlanItem { + size_t bytes; + size_t alignment; +}; + /** * Host API function pointers for device memory operations. 
* Allows a runtime to use pluggable device-memory backends. @@ -34,6 +39,13 @@ struct HostApi { // null on backends that don't wire it; callers must fall back to // copy_to_device. int (*device_memset)(void *dev_ptr, int value, size_t size); + // Runner-scoped AUTO temporary variable buffer. Only trb bind consumes + // these callbacks; public device_malloc/device_free keep real allocation + // semantics. + bool (*temporary_buffer_enabled)(); + bool (*begin_temporary_buffer_run)(const TemporaryBufferPlanItem *items, size_t item_count); + void *(*acquire_temporary_buffer_slice)(size_t size, size_t alignment); + void (*end_temporary_buffer_run)(); // Commit the three per-Worker pooled regions (PTO2 GM heap, PTO2 shared // memory, trb prebuilt runtime arena) as three independent device // allocations. `runtime_arena_size == 0` skips the third region (hbg @@ -44,7 +56,7 @@ struct HostApi { // memory / prebuilt runtime arena. setup_static_arena must have already // committed the relevant region; the returned pointer is owned by the // DeviceRunner and freed in `DeviceRunner::finalize()` — do NOT pass it - // to device_free or record it in `tensor_pairs_`. + // to device_free or record it as an owned tensor lease. // // acquire_pooled_runtime_arena is trb-only — the runtime-arena region is // only committed when setup_static_arena was invoked with diff --git a/src/common/platform/include/host/temporary_variable_buffer.h b/src/common/platform/include/host/temporary_variable_buffer.h new file mode 100644 index 000000000..34eeedd04 --- /dev/null +++ b/src/common/platform/include/host/temporary_variable_buffer.h @@ -0,0 +1,326 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +#ifndef SRC_COMMON_PLATFORM_INCLUDE_HOST_TEMPORARY_VARIABLE_BUFFER_H_ +#define SRC_COMMON_PLATFORM_INCLUDE_HOST_TEMPORARY_VARIABLE_BUFFER_H_ + +#include +#include +#include +#include +#include +#include + +#include "common/host_api.h" + +class TemporaryVariableBuffer { +public: + using AllocFn = void *(*)(void *ctx, size_t size); + using FreeFn = void (*)(void *ctx, void *ptr); + + static constexpr size_t kTemporaryBufferAlignment = 1024; + static constexpr size_t kDefaultAlignment = kTemporaryBufferAlignment; + + struct Stats { + bool enabled{false}; + size_t retained_bytes{0}; + size_t current_run_used_bytes{0}; + size_t high_water_used_bytes{0}; + size_t buffer_backed_allocation_count{0}; + size_t realloc_count{0}; + size_t realloc_failed_count{0}; + bool active{false}; + }; + + TemporaryVariableBuffer(AllocFn alloc, FreeFn free_fn, void *ctx) : + alloc_(alloc), + free_(free_fn), + ctx_(ctx) {} + + ~TemporaryVariableBuffer() { clear(); } + + TemporaryVariableBuffer(const TemporaryVariableBuffer &) = delete; + TemporaryVariableBuffer &operator=(const TemporaryVariableBuffer &) = delete; + + bool configure_auto(bool enabled); + bool begin_run(const TemporaryBufferPlanItem *items, size_t item_count); + void *acquire(size_t bytes, size_t alignment); + void end_run(); + void clear(); + + bool enabled() const { return enabled_; } + bool active() const { return active_; } + Stats stats() const; + const std::string &last_error() const { return last_error_; } + +private: + struct Buffer { + void *raw_base{nullptr}; + void *base{nullptr}; + size_t 
capacity{0}; + size_t raw_size{0}; + size_t offset{0}; + }; + + static bool is_power_of_two(size_t value) { return value != 0 && (value & (value - 1)) == 0; } + + static bool align_up_checked(size_t value, size_t alignment, size_t *out); + static bool align_ptr_checked(void *ptr, size_t alignment, void **out); + + bool validate_plan_item(const TemporaryBufferPlanItem &item); + bool packed_plan_size(const TemporaryBufferPlanItem *items, size_t item_count, size_t *out); + bool allocate_buffer(size_t required_bytes); + void release_buffer(); + void reset_run_state(); + void set_error(std::string msg) { last_error_ = std::move(msg); } + + AllocFn alloc_{nullptr}; + FreeFn free_{nullptr}; + void *ctx_{nullptr}; + + Buffer buffer_; + size_t current_run_used_bytes_{0}; + size_t high_water_used_bytes_{0}; + size_t buffer_backed_allocation_count_{0}; + size_t realloc_count_{0}; + size_t realloc_failed_count_{0}; + bool enabled_{false}; + bool active_{false}; + std::string last_error_; +}; + +inline bool TemporaryVariableBuffer::align_up_checked(size_t value, size_t alignment, size_t *out) { + if (out == nullptr || !is_power_of_two(alignment)) { + return false; + } + const size_t padding = alignment - 1; + if (value > std::numeric_limits::max() - padding) { + return false; + } + *out = (value + padding) & ~padding; + return true; +} + +inline bool TemporaryVariableBuffer::align_ptr_checked(void *ptr, size_t alignment, void **out) { + if (out == nullptr || ptr == nullptr || !is_power_of_two(alignment)) { + return false; + } + uintptr_t raw = reinterpret_cast(ptr); + const uintptr_t padding = static_cast(alignment - 1); + if (raw > std::numeric_limits::max() - padding) { + return false; + } + uintptr_t aligned = (raw + padding) & ~padding; + *out = reinterpret_cast(aligned); + return true; +} + +inline bool TemporaryVariableBuffer::configure_auto(bool enabled) { + if (active_) { + set_error("cannot reconfigure temporary buffer while a run is active"); + return false; + } + if 
(enabled == enabled_) { + last_error_.clear(); + return true; + } + + enabled_ = enabled; + if (!enabled_) { + clear(); + } + last_error_.clear(); + return true; +} + +inline bool TemporaryVariableBuffer::begin_run(const TemporaryBufferPlanItem *items, size_t item_count) { + if (active_) { + set_error("temporary buffer run is already active"); + return false; + } + if (!enabled_) { + set_error("temporary buffer is disabled"); + return false; + } + if (items == nullptr && item_count != 0) { + set_error("temporary buffer plan items pointer is null"); + return false; + } + + size_t required = 0; + if (!packed_plan_size(items, item_count, &required)) { + return false; + } + + if (buffer_.capacity < required) { + release_buffer(); + if (required != 0 && !allocate_buffer(required)) { + ++realloc_failed_count_; + set_error("temporary buffer AUTO realloc failed: required bytes " + std::to_string(required)); + return false; + } + } + + reset_run_state(); + active_ = true; + last_error_.clear(); + return true; +} + +inline void *TemporaryVariableBuffer::acquire(size_t bytes, size_t alignment) { + if (!active_) { + set_error("temporary buffer acquire requested outside an active run"); + return nullptr; + } + if (alignment == 0) { + alignment = kTemporaryBufferAlignment; + } + if (!is_power_of_two(alignment)) { + set_error("temporary buffer alignment must be a power of two"); + return nullptr; + } + alignment = std::max(alignment, kTemporaryBufferAlignment); + + if (buffer_.base == nullptr) { + set_error("temporary buffer acquire requested with no retained buffer"); + return nullptr; + } + + size_t aligned_offset = 0; + if (!align_up_checked(buffer_.offset, alignment, &aligned_offset)) { + set_error("temporary buffer acquire alignment overflow"); + return nullptr; + } + if (bytes > buffer_.capacity || aligned_offset > buffer_.capacity - bytes) { + set_error( + "temporary buffer acquire missed after successful plan: tensor bytes " + std::to_string(bytes) + + ", retained bytes " 
+ std::to_string(buffer_.capacity) + ); + return nullptr; + } + + void *ptr = static_cast(buffer_.base) + aligned_offset; + current_run_used_bytes_ += (aligned_offset - buffer_.offset) + bytes; + buffer_.offset = aligned_offset + bytes; + ++buffer_backed_allocation_count_; + last_error_.clear(); + return ptr; +} + +inline void TemporaryVariableBuffer::end_run() { + if (!active_) { + return; + } + if (current_run_used_bytes_ > high_water_used_bytes_) { + high_water_used_bytes_ = current_run_used_bytes_; + } + active_ = false; +} + +inline void TemporaryVariableBuffer::clear() { + release_buffer(); + enabled_ = false; + current_run_used_bytes_ = 0; + high_water_used_bytes_ = 0; + buffer_backed_allocation_count_ = 0; + realloc_count_ = 0; + realloc_failed_count_ = 0; + active_ = false; + last_error_.clear(); +} + +inline TemporaryVariableBuffer::Stats TemporaryVariableBuffer::stats() const { + return Stats{ + enabled_, + buffer_.capacity, + current_run_used_bytes_, + high_water_used_bytes_, + buffer_backed_allocation_count_, + realloc_count_, + realloc_failed_count_, + active_, + }; +} + +inline bool TemporaryVariableBuffer::validate_plan_item(const TemporaryBufferPlanItem &item) { + if (item.alignment == 0 || !is_power_of_two(item.alignment)) { + set_error("temporary buffer plan alignment must be a power of two"); + return false; + } + return true; +} + +inline bool +TemporaryVariableBuffer::packed_plan_size(const TemporaryBufferPlanItem *items, size_t item_count, size_t *out) { + if (out == nullptr) { + set_error("temporary buffer packed size received invalid arguments"); + return false; + } + size_t offset = 0; + for (size_t i = 0; i < item_count; ++i) { + if (!validate_plan_item(items[i])) { + return false; + } + const size_t alignment = std::max(items[i].alignment, kTemporaryBufferAlignment); + size_t aligned = 0; + if (!align_up_checked(offset, alignment, &aligned) || + items[i].bytes > std::numeric_limits::max() - aligned) { + set_error("temporary buffer plan 
size overflow"); + return false; + } + offset = aligned + items[i].bytes; + } + *out = offset; + return true; +} + +inline bool TemporaryVariableBuffer::allocate_buffer(size_t required_bytes) { + if (alloc_ == nullptr || free_ == nullptr) { + set_error("temporary buffer allocator callbacks are not configured"); + return false; + } + size_t capacity = 0; + if (!align_up_checked(required_bytes, kTemporaryBufferAlignment, &capacity)) { + set_error("temporary buffer capacity overflow"); + return false; + } + if (capacity > std::numeric_limits::max() - (kTemporaryBufferAlignment - 1)) { + set_error("temporary buffer raw allocation size overflow"); + return false; + } + const size_t raw_size = capacity + (kTemporaryBufferAlignment - 1); + void *raw = alloc_(ctx_, raw_size); + if (raw == nullptr) { + return false; + } + void *base = nullptr; + if (!align_ptr_checked(raw, kTemporaryBufferAlignment, &base)) { + free_(ctx_, raw); + set_error("temporary buffer base alignment overflow"); + return false; + } + buffer_ = Buffer{raw, base, capacity, raw_size, 0}; + ++realloc_count_; + return true; +} + +inline void TemporaryVariableBuffer::release_buffer() { + if (buffer_.raw_base != nullptr && free_ != nullptr) { + free_(ctx_, buffer_.raw_base); + } + buffer_ = Buffer{}; +} + +inline void TemporaryVariableBuffer::reset_run_state() { + buffer_.offset = 0; + current_run_used_bytes_ = 0; +} + +#endif // SRC_COMMON_PLATFORM_INCLUDE_HOST_TEMPORARY_VARIABLE_BUFFER_H_ diff --git a/src/common/platform/onboard/host/c_api_shared.cpp b/src/common/platform/onboard/host/c_api_shared.cpp index 5924c4184..d90ddb4b2 100644 --- a/src/common/platform/onboard/host/c_api_shared.cpp +++ b/src/common/platform/onboard/host/c_api_shared.cpp @@ -120,6 +120,36 @@ static int device_memset(void *dev_ptr, int value, size_t size) { } } +static bool temporary_buffer_enabled() { + try { + return current_runner()->temporary_buffer_enabled(); + } catch (...) 
{ + return false; + } +} + +static bool begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, size_t item_count) { + try { + return current_runner()->begin_temporary_buffer_run(items, item_count); + } catch (...) { + return false; + } +} + +static void *acquire_temporary_buffer_slice(size_t size, size_t alignment) { + try { + return current_runner()->acquire_temporary_buffer_slice(size, alignment); + } catch (...) { + return nullptr; + } +} + +static void end_temporary_buffer_run() { + try { + current_runner()->end_temporary_buffer_run(); + } catch (...) {} +} + static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { try { return current_runner()->upload_chip_callable_buffer(static_cast(callable)); @@ -230,6 +260,15 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } +int configure_temporary_buffer_auto_ctx(DeviceContextHandle ctx, int enabled) { + if (ctx == NULL) return -1; + try { + return static_cast(ctx)->configure_temporary_buffer_auto(enabled != 0) ? 0 : -1; + } catch (...) 
{ + return -1; + } +} + int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { @@ -505,6 +544,10 @@ int simpler_run( api.copy_to_device = copy_to_device; api.copy_from_device = copy_from_device; api.device_memset = device_memset; + api.temporary_buffer_enabled = temporary_buffer_enabled; + api.begin_temporary_buffer_run = begin_temporary_buffer_run; + api.acquire_temporary_buffer_slice = acquire_temporary_buffer_slice; + api.end_temporary_buffer_run = end_temporary_buffer_run; api.setup_static_arena = setup_static_arena_wrapper; api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; diff --git a/src/common/platform/onboard/host/device_runner_base.cpp b/src/common/platform/onboard/host/device_runner_base.cpp index a48a2eab4..d686fc602 100644 --- a/src/common/platform/onboard/host/device_runner_base.cpp +++ b/src/common/platform/onboard/host/device_runner_base.cpp @@ -115,6 +115,7 @@ HostRuntimeTimeoutConfig resolve_onboard_timeout_config() { } // namespace DeviceRunnerBase::DeviceRunnerBase() : + temporary_buffer_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} @@ -139,6 +140,54 @@ int DeviceRunnerBase::device_memset(void *dev_ptr, int value, std::size_t bytes) return aclrtMemset(dev_ptr, bytes, value, bytes); } +bool DeviceRunnerBase::configure_temporary_buffer_auto(bool enabled) { + if (!temporary_buffer_.configure_auto(enabled)) { + LOG_ERROR("configure_temporary_buffer_auto(%d) failed: %s", enabled, temporary_buffer_.last_error().c_str()); + return false; + } + auto stats = temporary_buffer_.stats(); + LOG_DEBUG("Temporary buffer AUTO configured: enabled=%d retained_bytes=%zu", stats.enabled, stats.retained_bytes); + return 
true; +} + +bool DeviceRunnerBase::temporary_buffer_enabled() const { return temporary_buffer_.enabled(); } + +bool DeviceRunnerBase::begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, std::size_t item_count) { + if (!temporary_buffer_.begin_run(items, item_count)) { + LOG_ERROR("begin_temporary_buffer_run failed: %s", temporary_buffer_.last_error().c_str()); + return false; + } + return true; +} + +void *DeviceRunnerBase::acquire_temporary_buffer_slice(std::size_t bytes, std::size_t alignment) { + void *ptr = temporary_buffer_.acquire(bytes, alignment); + if (ptr == nullptr) { + LOG_ERROR( + "acquire_temporary_buffer_slice failed: bytes=%zu retained_bytes=%zu: %s", bytes, + temporary_buffer_.stats().retained_bytes, temporary_buffer_.last_error().c_str() + ); + } + return ptr; +} + +void DeviceRunnerBase::end_temporary_buffer_run() { + temporary_buffer_.end_run(); + auto stats = temporary_buffer_.stats(); + LOG_DEBUG( + "Temporary buffer run ended: used=%zu high_water=%zu allocations=%zu reallocs=%zu realloc_failed=%zu", + stats.current_run_used_bytes, stats.high_water_used_bytes, stats.buffer_backed_allocation_count, + stats.realloc_count, stats.realloc_failed_count + ); +} + +void DeviceRunnerBase::clear_temporary_buffer() { + if (temporary_buffer_.active()) { + LOG_ERROR("clear_temporary_buffer called while a temporary-buffer run is active"); + } + temporary_buffer_.clear(); +} + int DeviceRunnerBase::l3_l2_orch_comm_init(void *control_block, size_t control_block_size) { if (!l3_l2_orch_comm_supported()) { return PTO_RUNTIME_ERR_UNSUPPORTED; @@ -1035,6 +1084,8 @@ int DeviceRunnerBase::finalize_common() { prebuilt_runtime_arena_cache_runtime_arena_base_ = nullptr; prebuilt_runtime_arena_cache_image_.clear(); + clear_temporary_buffer(); + // Free the 8-byte device_wall buffer (allocated lazily in run()) while // mem_alloc_ and the device context are still live. 
free_tensor() routes // through mem_alloc_.free(), so it must run before mem_alloc_.finalize() diff --git a/src/common/platform/onboard/host/device_runner_base.h b/src/common/platform/onboard/host/device_runner_base.h index d313ec3dd..744f9a5c4 100644 --- a/src/common/platform/onboard/host/device_runner_base.h +++ b/src/common/platform/onboard/host/device_runner_base.h @@ -62,6 +62,7 @@ #include "host/runtime_timeout_config.h" #include "host/scope_stats_collector.h" #include "host/tensor_dump_collector.h" +#include "host/temporary_variable_buffer.h" #include "prepare_callable_common.h" /** @@ -90,6 +91,12 @@ class DeviceRunnerBase : public L3L2OrchCommBackend { int copy_to_device(void *dev_ptr, const void *host_ptr, std::size_t bytes); int copy_from_device(void *host_ptr, const void *dev_ptr, std::size_t bytes); int device_memset(void *dev_ptr, int value, std::size_t bytes); + bool configure_temporary_buffer_auto(bool enabled); + bool temporary_buffer_enabled() const; + bool begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, std::size_t item_count); + void *acquire_temporary_buffer_slice(std::size_t bytes, std::size_t alignment); + void end_temporary_buffer_run(); + void clear_temporary_buffer(); int l3_l2_orch_comm_init(void *control_block, size_t control_block_size); int l3_l2_orch_comm_shutdown(); @@ -801,6 +808,7 @@ class DeviceRunnerBase : public L3L2OrchCommBackend { host::LoadAicpuOp load_aicpu_op_; MemoryAllocator mem_alloc_; + TemporaryVariableBuffer temporary_buffer_; DeviceArena gm_heap_arena_; DeviceArena gm_sm_arena_; DeviceArena runtime_arena_pool_; diff --git a/src/common/platform/sim/host/c_api_shared.cpp b/src/common/platform/sim/host/c_api_shared.cpp index c43b06f8f..6e7604d73 100644 --- a/src/common/platform/sim/host/c_api_shared.cpp +++ b/src/common/platform/sim/host/c_api_shared.cpp @@ -117,6 +117,36 @@ static int device_memset(void *dev_ptr, int value, size_t size) { } } +static bool temporary_buffer_enabled() { + try { + return 
current_runner()->temporary_buffer_enabled(); + } catch (...) { + return false; + } +} + +static bool begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, size_t item_count) { + try { + return current_runner()->begin_temporary_buffer_run(items, item_count); + } catch (...) { + return false; + } +} + +static void *acquire_temporary_buffer_slice(size_t size, size_t alignment) { + try { + return current_runner()->acquire_temporary_buffer_slice(size, alignment); + } catch (...) { + return nullptr; + } +} + +static void end_temporary_buffer_run() { + try { + current_runner()->end_temporary_buffer_run(); + } catch (...) {} +} + static uint64_t upload_chip_callable_buffer_wrapper(const void *callable) { try { return current_runner()->upload_chip_callable_buffer(static_cast(callable)); @@ -223,6 +253,15 @@ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *de } } +int configure_temporary_buffer_auto_ctx(DeviceContextHandle ctx, int enabled) { + if (ctx == NULL) return -1; + try { + return static_cast(ctx)->configure_temporary_buffer_auto(enabled != 0) ? 0 : -1; + } catch (...) 
{ + return -1; + } +} + int finalize_device(DeviceContextHandle ctx) { if (ctx == NULL) return -1; try { @@ -456,6 +495,10 @@ int simpler_run( api.copy_to_device = copy_to_device; api.copy_from_device = copy_from_device; api.device_memset = device_memset; + api.temporary_buffer_enabled = temporary_buffer_enabled; + api.begin_temporary_buffer_run = begin_temporary_buffer_run; + api.acquire_temporary_buffer_slice = acquire_temporary_buffer_slice; + api.end_temporary_buffer_run = end_temporary_buffer_run; api.setup_static_arena = setup_static_arena_wrapper; api.acquire_pooled_gm_heap = acquire_pooled_gm_heap_wrapper; api.acquire_pooled_gm_sm = acquire_pooled_gm_sm_wrapper; diff --git a/src/common/platform/sim/host/device_runner_base.cpp b/src/common/platform/sim/host/device_runner_base.cpp index 547fe58b8..32b35777a 100644 --- a/src/common/platform/sim/host/device_runner_base.cpp +++ b/src/common/platform/sim/host/device_runner_base.cpp @@ -262,6 +262,54 @@ int SimDeviceRunnerBase::device_memset(void *dev_ptr, int value, size_t bytes) { return 0; } +bool SimDeviceRunnerBase::configure_temporary_buffer_auto(bool enabled) { + if (!temporary_buffer_.configure_auto(enabled)) { + LOG_ERROR("configure_temporary_buffer_auto(%d) failed: %s", enabled, temporary_buffer_.last_error().c_str()); + return false; + } + auto stats = temporary_buffer_.stats(); + LOG_DEBUG("Temporary buffer AUTO configured: enabled=%d retained_bytes=%zu", stats.enabled, stats.retained_bytes); + return true; +} + +bool SimDeviceRunnerBase::temporary_buffer_enabled() const { return temporary_buffer_.enabled(); } + +bool SimDeviceRunnerBase::begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, size_t item_count) { + if (!temporary_buffer_.begin_run(items, item_count)) { + LOG_ERROR("begin_temporary_buffer_run failed: %s", temporary_buffer_.last_error().c_str()); + return false; + } + return true; +} + +void *SimDeviceRunnerBase::acquire_temporary_buffer_slice(size_t bytes, size_t alignment) { 
+ void *ptr = temporary_buffer_.acquire(bytes, alignment); + if (ptr == nullptr) { + LOG_ERROR( + "acquire_temporary_buffer_slice failed: bytes=%zu retained_bytes=%zu: %s", bytes, + temporary_buffer_.stats().retained_bytes, temporary_buffer_.last_error().c_str() + ); + } + return ptr; +} + +void SimDeviceRunnerBase::end_temporary_buffer_run() { + temporary_buffer_.end_run(); + auto stats = temporary_buffer_.stats(); + LOG_DEBUG( + "Temporary buffer run ended: used=%zu high_water=%zu allocations=%zu reallocs=%zu realloc_failed=%zu", + stats.current_run_used_bytes, stats.high_water_used_bytes, stats.buffer_backed_allocation_count, + stats.realloc_count, stats.realloc_failed_count + ); +} + +void SimDeviceRunnerBase::clear_temporary_buffer() { + if (temporary_buffer_.active()) { + LOG_ERROR("clear_temporary_buffer called while a temporary-buffer run is active"); + } + temporary_buffer_.clear(); +} + int SimDeviceRunnerBase::l3_l2_orch_comm_init(void *control_block, size_t control_block_size) { return l3_l2_orch_comm_service_.start(this, control_block, control_block_size); } diff --git a/src/common/platform/sim/host/device_runner_base.h b/src/common/platform/sim/host/device_runner_base.h index a147bc015..9acad2f8f 100644 --- a/src/common/platform/sim/host/device_runner_base.h +++ b/src/common/platform/sim/host/device_runner_base.h @@ -51,11 +51,13 @@ #include "host/tensor_dump_collector.h" #include "host/pmu_collector.h" #include "host/scope_stats_collector.h" +#include "host/temporary_variable_buffer.h" #include "runtime.h" class SimDeviceRunnerBase : public L3L2OrchCommBackend { public: SimDeviceRunnerBase() : + temporary_buffer_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), gm_heap_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), gm_sm_arena_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_), runtime_arena_pool_(&arena_alloc_trampoline, &arena_free_trampoline, &mem_alloc_) {} @@ -94,6 +96,12 @@ class 
SimDeviceRunnerBase : public L3L2OrchCommBackend { int copy_to_device(void *dev_ptr, const void *host_ptr, size_t bytes); int copy_from_device(void *host_ptr, const void *dev_ptr, size_t bytes); int device_memset(void *dev_ptr, int value, size_t bytes); + bool configure_temporary_buffer_auto(bool enabled); + bool temporary_buffer_enabled() const; + bool begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, size_t item_count); + void *acquire_temporary_buffer_slice(size_t bytes, size_t alignment); + void end_temporary_buffer_run(); + void clear_temporary_buffer(); int l3_l2_orch_comm_init(void *control_block, size_t control_block_size); int l3_l2_orch_comm_shutdown(); @@ -185,6 +193,7 @@ class SimDeviceRunnerBase : public L3L2OrchCommBackend { std::vector aicore_kernel_binary_; MemoryAllocator mem_alloc_; + TemporaryVariableBuffer temporary_buffer_; // Three independent per-Worker arenas, each backing a single pooled // region (PTO2 GM heap / PTO2 shared memory / trb prebuilt runtime diff --git a/src/common/worker/chip_worker.cpp b/src/common/worker/chip_worker.cpp index 5a51d5a48..9b518a1d8 100644 --- a/src/common/worker/chip_worker.cpp +++ b/src/common/worker/chip_worker.cpp @@ -101,6 +101,8 @@ void ChipWorker::init( device_free_ctx_fn_ = load_symbol(handle, "device_free_ctx"); copy_to_device_ctx_fn_ = load_symbol(handle, "copy_to_device_ctx"); copy_from_device_ctx_fn_ = load_symbol(handle, "copy_from_device_ctx"); + configure_temporary_buffer_auto_ctx_fn_ = + load_symbol(handle, "configure_temporary_buffer_auto_ctx"); get_runtime_size_fn_ = load_symbol(handle, "get_runtime_size"); simpler_init_fn_ = load_symbol(handle, "simpler_init"); register_callable_fn_ = load_symbol(handle, "simpler_register_callable"); @@ -184,6 +186,7 @@ void ChipWorker::init( device_free_ctx_fn_ = nullptr; copy_to_device_ctx_fn_ = nullptr; copy_from_device_ctx_fn_ = nullptr; + configure_temporary_buffer_auto_ctx_fn_ = nullptr; get_runtime_size_fn_ = nullptr; simpler_init_fn_ = 
nullptr; register_callable_fn_ = nullptr; @@ -224,6 +227,7 @@ void ChipWorker::init( device_free_ctx_fn_ = nullptr; copy_to_device_ctx_fn_ = nullptr; copy_from_device_ctx_fn_ = nullptr; + configure_temporary_buffer_auto_ctx_fn_ = nullptr; get_runtime_size_fn_ = nullptr; simpler_init_fn_ = nullptr; register_callable_fn_ = nullptr; @@ -279,6 +283,7 @@ void ChipWorker::finalize() { device_free_ctx_fn_ = nullptr; copy_to_device_ctx_fn_ = nullptr; copy_from_device_ctx_fn_ = nullptr; + configure_temporary_buffer_auto_ctx_fn_ = nullptr; get_runtime_size_fn_ = nullptr; register_callable_fn_ = nullptr; run_fn_ = nullptr; @@ -368,6 +373,16 @@ size_t ChipWorker::host_dlopen_count() const { return get_host_dlopen_count_fn_(device_ctx_); } +void ChipWorker::configure_temporary_buffer_auto(bool enabled) { + if (!initialized_) { + throw std::runtime_error("ChipWorker not initialized; call init() first"); + } + int rc = configure_temporary_buffer_auto_ctx_fn_(device_ctx_, enabled ? 1 : 0); + if (rc != 0) { + throw std::runtime_error("configure_temporary_buffer_auto failed with code " + std::to_string(rc)); + } +} + void *ChipWorker::create_comm_stream_checked(const char *op_name) { int rc = ensure_acl_ready_fn_(device_ctx_, device_id_); if (rc != 0) { diff --git a/src/common/worker/chip_worker.h b/src/common/worker/chip_worker.h index 086e5b6b6..d2be14389 100644 --- a/src/common/worker/chip_worker.h +++ b/src/common/worker/chip_worker.h @@ -82,6 +82,7 @@ class ChipWorker { void free(uint64_t ptr); void copy_to(uint64_t dst, uint64_t src, size_t size); void copy_from(uint64_t dst, uint64_t src, size_t size); + void configure_temporary_buffer_auto(bool enabled); void l3_l2_orch_comm_init(uint64_t control_block_addr, size_t control_block_size); void l3_l2_orch_comm_shutdown(); @@ -138,6 +139,7 @@ class ChipWorker { using DeviceFreeCtxFn = void (*)(void *, void *); using CopyToDeviceCtxFn = int (*)(void *, void *, const void *, size_t); using CopyFromDeviceCtxFn = int (*)(void *, void 
*, const void *, size_t); + using ConfigureTemporaryBufferAutoCtxFn = int (*)(void *, int); using GetRuntimeSizeFn = size_t (*)(); // From host_runtime.so. Single platform-side init that does (a) thread // attach + device-id record, (b) executor binary takeover, (c) onboard @@ -192,6 +194,7 @@ class ChipWorker { DeviceFreeCtxFn device_free_ctx_fn_ = nullptr; CopyToDeviceCtxFn copy_to_device_ctx_fn_ = nullptr; CopyFromDeviceCtxFn copy_from_device_ctx_fn_ = nullptr; + ConfigureTemporaryBufferAutoCtxFn configure_temporary_buffer_auto_ctx_fn_ = nullptr; GetRuntimeSizeFn get_runtime_size_fn_ = nullptr; SimplerInitFn simpler_init_fn_ = nullptr; SimplerRegisterCallableFn register_callable_fn_ = nullptr; diff --git a/src/common/worker/pto_runtime_c_api.h b/src/common/worker/pto_runtime_c_api.h index 380827084..133e5e8f7 100644 --- a/src/common/worker/pto_runtime_c_api.h +++ b/src/common/worker/pto_runtime_c_api.h @@ -24,6 +24,7 @@ * - sizing: get_runtime_size * - device-mem: device_malloc_ctx, device_free_ctx, * copy_to_device_ctx, copy_from_device_ctx + * - temp-buffer: configure_temporary_buffer_auto_ctx * - prepared run: simpler_register_callable, simpler_run, unregister_callable, * get_aicpu_dlopen_count, get_host_dlopen_count * - L3-L2 orch: l3_l2_orch_comm_init_ctx, @@ -91,6 +92,9 @@ int copy_to_device_ctx(DeviceContextHandle ctx, void *dev_ptr, const void *host_ /** Copy device memory to a host pointer within the given device context. */ int copy_from_device_ctx(DeviceContextHandle ctx, void *host_ptr, const void *dev_ptr, size_t size); +/** Enable or disable the runner-scoped AUTO temporary variable buffer. */ +int configure_temporary_buffer_auto_ctx(DeviceContextHandle ctx, int enabled); + /** * One-shot platform-side init. Called once by ChipWorker::init() right * after dlopen, before any other entry. 
Three responsibilities, in order: diff --git a/tests/ut/cpp/CMakeLists.txt b/tests/ut/cpp/CMakeLists.txt index 794673731..72dd96191 100644 --- a/tests/ut/cpp/CMakeLists.txt +++ b/tests/ut/cpp/CMakeLists.txt @@ -378,7 +378,37 @@ target_include_directories(test_runtime_orch_so PRIVATE ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include ) add_common_utils_test(test_device_arena common/test_device_arena.cpp) +add_common_utils_test(test_temporary_variable_buffer common/test_temporary_variable_buffer.cpp) add_common_utils_test(test_l3_l2_orch_comm common/test_l3_l2_orch_comm.cpp) + +add_executable(test_trb_runtime_temp_buffer + common/test_trb_runtime_temp_buffer.cpp + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp + ${CMAKE_SOURCE_DIR}/../../../src/common/platform/shared/host/platform_compile_info.cpp +) +target_compile_definitions(test_trb_runtime_temp_buffer PRIVATE SIMPLER_PLATFORM_NAME="a2a3sim") +target_include_directories(test_trb_runtime_temp_buffer PRIVATE + ${GTEST_INCLUDE_DIRS} + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/host + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/orchestration + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/runtime + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/runtime/tensormap_and_ringbuffer/common + ${CMAKE_SOURCE_DIR}/../../../src/a2a3/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/platform/include + ${CMAKE_SOURCE_DIR}/../../../src/common/task_interface + ${CMAKE_SOURCE_DIR}/../../../src/common/log/include + ${CMAKE_SOURCE_DIR}/../../../src/common +) +target_link_libraries(test_trb_runtime_temp_buffer PRIVATE + a2a3_rt_objs + ${GTEST_MAIN_LIB} + ${GTEST_LIB} + pthread +) +add_test(NAME test_trb_runtime_temp_buffer COMMAND test_trb_runtime_temp_buffer) 
+set_tests_properties(test_trb_runtime_temp_buffer PROPERTIES LABELS "no_hardware") + add_executable(test_l3_l2_orch_endpoint common/test_l3_l2_orch_endpoint.cpp stubs/test_stubs.cpp diff --git a/tests/ut/cpp/common/test_temporary_variable_buffer.cpp b/tests/ut/cpp/common/test_temporary_variable_buffer.cpp new file mode 100644 index 000000000..026b4ae43 --- /dev/null +++ b/tests/ut/cpp/common/test_temporary_variable_buffer.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) PyPTO Contributors. + * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Unit tests for host/temporary_variable_buffer.h. 
+ +#include +#include +#include +#include +#include + +#include + +#include "host/temporary_variable_buffer.h" + +namespace { + +struct MockBackend { + int alloc_count = 0; + int free_count = 0; + size_t max_alloc_size = 0; + size_t total_alloc_bytes = 0; + std::unordered_set live; + + void *alloc(size_t size) { + if (max_alloc_size != 0 && size > max_alloc_size) { + return nullptr; + } + void *ptr = nullptr; + if (posix_memalign(&ptr, TemporaryVariableBuffer::kTemporaryBufferAlignment, size) != 0) { + return nullptr; + } + ++alloc_count; + total_alloc_bytes += size; + live.insert(ptr); + return ptr; + } + + void free(void *ptr) { + ++free_count; + EXPECT_EQ(live.count(ptr), 1u) << "free called on a pointer that is not live"; + live.erase(ptr); + std::free(ptr); + } +}; + +void *mock_alloc(void *ctx, size_t size) { return static_cast(ctx)->alloc(size); } +void mock_free(void *ctx, void *ptr) { static_cast(ctx)->free(ptr); } + +bool is_aligned(const void *ptr, size_t alignment) { return (reinterpret_cast(ptr) & (alignment - 1)) == 0; } + +} // namespace + +TEST(TemporaryVariableBufferTest, AutoAllocatesSingleAlignedBufferForPlan) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem plan[] = { + {512, TemporaryVariableBuffer::kTemporaryBufferAlignment}, + {256, TemporaryVariableBuffer::kTemporaryBufferAlignment}, + }; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(plan, 2)) << buffer.last_error(); + EXPECT_EQ(backend.alloc_count, 1); + EXPECT_EQ(buffer.stats().retained_bytes, 2048u); + EXPECT_EQ(buffer.stats().realloc_count, 1u); + + void *first = buffer.acquire(512, TemporaryVariableBuffer::kTemporaryBufferAlignment); + ASSERT_NE(first, nullptr) << buffer.last_error(); + EXPECT_TRUE(is_aligned(first, TemporaryVariableBuffer::kTemporaryBufferAlignment)); + void *second = buffer.acquire(256, TemporaryVariableBuffer::kTemporaryBufferAlignment); + 
ASSERT_NE(second, nullptr) << buffer.last_error(); + EXPECT_TRUE(is_aligned(second, TemporaryVariableBuffer::kTemporaryBufferAlignment)); + EXPECT_EQ(static_cast(second) - static_cast(first), 1024); + EXPECT_EQ(buffer.stats().current_run_used_bytes, 1280u); + buffer.end_run(); + EXPECT_EQ(buffer.stats().high_water_used_bytes, 1280u); +} + +TEST(TemporaryVariableBufferTest, RepeatedSamePlanReusesRetainedBuffer) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem plan[] = {{512, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(plan, 1)) << buffer.last_error(); + void *first = buffer.acquire(512, TemporaryVariableBuffer::kTemporaryBufferAlignment); + ASSERT_NE(first, nullptr) << buffer.last_error(); + buffer.end_run(); + + ASSERT_TRUE(buffer.begin_run(plan, 1)) << buffer.last_error(); + void *again = buffer.acquire(512, TemporaryVariableBuffer::kTemporaryBufferAlignment); + EXPECT_EQ(again, first); + buffer.end_run(); + + EXPECT_EQ(backend.alloc_count, 1); + EXPECT_EQ(backend.free_count, 0); + EXPECT_EQ(buffer.stats().realloc_count, 1u); +} + +TEST(TemporaryVariableBufferTest, LargerPlanFreesOldBufferBeforeAllocatingNewOne) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem small_plan[] = {{512, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + TemporaryBufferPlanItem large_plan[] = { + {2048, TemporaryVariableBuffer::kTemporaryBufferAlignment}, + {2048, TemporaryVariableBuffer::kTemporaryBufferAlignment}, + }; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(small_plan, 1)) << buffer.last_error(); + ASSERT_NE(buffer.acquire(512, TemporaryVariableBuffer::kTemporaryBufferAlignment), nullptr) << buffer.last_error(); + buffer.end_run(); + EXPECT_EQ(buffer.stats().retained_bytes, 
1024u); + + ASSERT_TRUE(buffer.begin_run(large_plan, 2)) << buffer.last_error(); + EXPECT_EQ(backend.alloc_count, 2); + EXPECT_EQ(backend.free_count, 1); + EXPECT_EQ(buffer.stats().retained_bytes, 4096u); + EXPECT_EQ(buffer.stats().realloc_count, 2u); + ASSERT_NE(buffer.acquire(2048, TemporaryVariableBuffer::kTemporaryBufferAlignment), nullptr) << buffer.last_error(); + ASSERT_NE(buffer.acquire(2048, TemporaryVariableBuffer::kTemporaryBufferAlignment), nullptr) << buffer.last_error(); + buffer.end_run(); +} + +TEST(TemporaryVariableBufferTest, SmallerPlanDoesNotShrinkRetainedBuffer) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem large_plan[] = {{4096, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + TemporaryBufferPlanItem small_plan[] = {{512, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(large_plan, 1)) << buffer.last_error(); + buffer.end_run(); + const size_t retained_after_large = buffer.stats().retained_bytes; + + ASSERT_TRUE(buffer.begin_run(small_plan, 1)) << buffer.last_error(); + buffer.end_run(); + EXPECT_EQ(buffer.stats().retained_bytes, retained_after_large); + EXPECT_EQ(backend.alloc_count, 1); + EXPECT_EQ(backend.free_count, 0); +} + +TEST(TemporaryVariableBufferTest, ReallocFailureLeavesOldBufferReleased) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem small_plan[] = {{512, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + TemporaryBufferPlanItem too_large_plan[] = {{4096, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(small_plan, 1)) << buffer.last_error(); + buffer.end_run(); + ASSERT_EQ(buffer.stats().retained_bytes, 1024u); + + backend.max_alloc_size = 1024; + 
EXPECT_FALSE(buffer.begin_run(too_large_plan, 1)); + EXPECT_NE(buffer.last_error().find("AUTO realloc failed"), std::string::npos); + EXPECT_FALSE(buffer.stats().active); + EXPECT_EQ(buffer.stats().retained_bytes, 0u); + EXPECT_EQ(backend.free_count, 1); + EXPECT_TRUE(backend.live.empty()); + EXPECT_EQ(buffer.stats().realloc_failed_count, 1u); +} + +TEST(TemporaryVariableBufferTest, ClearFreesRetainedBufferExactlyOnce) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem plan[] = {{1024, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(plan, 1)) << buffer.last_error(); + buffer.end_run(); + EXPECT_EQ(backend.alloc_count, 1); + buffer.clear(); + EXPECT_EQ(backend.free_count, 1); + EXPECT_TRUE(backend.live.empty()); + EXPECT_FALSE(buffer.enabled()); + EXPECT_EQ(buffer.stats().retained_bytes, 0u); + + buffer.clear(); + EXPECT_EQ(backend.free_count, 1); +} + +TEST(TemporaryVariableBufferTest, ActiveReconfigurationFailsClearly) { + MockBackend backend; + TemporaryVariableBuffer buffer(mock_alloc, mock_free, &backend); + TemporaryBufferPlanItem plan[] = {{1024, TemporaryVariableBuffer::kTemporaryBufferAlignment}}; + + ASSERT_TRUE(buffer.configure_auto(true)) << buffer.last_error(); + ASSERT_TRUE(buffer.begin_run(plan, 1)) << buffer.last_error(); + EXPECT_FALSE(buffer.configure_auto(false)); + EXPECT_NE(buffer.last_error().find("cannot reconfigure"), std::string::npos); + buffer.end_run(); +} diff --git a/tests/ut/cpp/common/test_trb_runtime_temp_buffer.cpp b/tests/ut/cpp/common/test_trb_runtime_temp_buffer.cpp new file mode 100644 index 000000000..be6b0a769 --- /dev/null +++ b/tests/ut/cpp/common/test_trb_runtime_temp_buffer.cpp @@ -0,0 +1,374 @@ +/* + * Copyright (c) PyPTO Contributors. 
+ * This program is free software, you can redistribute it and/or modify it under the terms and conditions of + * CANN Open Software License Agreement Version 2.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + * ----------------------------------------------------------------------------------------------------------- + */ +// Host-side fake HostApi tests for a2a3 TRB bind/validate tensor leases. + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "arg_direction.h" +#include "pto_runtime2_types.h" +#include "runtime.h" +#include "task_args.h" +#include "host/temporary_variable_buffer.h" + +extern "C" int bind_callable_to_runtime_impl( + Runtime *runtime, const HostApi *api, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, + const ArgDirection *signature, int sig_count, const uint64_t *ring_task_window, const uint64_t *ring_heap, + const uint64_t *ring_dep_pool +); +extern "C" int validate_runtime_impl(Runtime *runtime, const HostApi *api); + +namespace { + +size_t align_up(size_t value, size_t alignment) { return (value + alignment - 1) & ~(alignment - 1); } + +struct FakeHostApi { + int device_malloc_count = 0; + int device_free_count = 0; + int copy_to_count = 0; + int copy_from_count = 0; + int device_memset_count = 0; + int setup_static_arena_count = 0; + int temp_begin_count = 0; + int temp_end_count = 0; + int temp_acquire_attempts = 0; + int temp_acquire_successes = 0; + int fail_copy_to_on_call = 0; + size_t temp_capacity = 0; + size_t temp_offset = 0; + size_t temp_plan_count = 0; + size_t temp_plan_required_bytes = 0; 
+ bool temp_enabled = false; + bool temp_active = false; + void *temp_pool = nullptr; + std::unordered_set live_mallocs; + std::vector gm_heap; + std::vector gm_sm; + std::vector runtime_arena; + + ~FakeHostApi() { release_all(); } + + void release_all() { + for (void *ptr : live_mallocs) { + std::free(ptr); + } + live_mallocs.clear(); + if (temp_pool != nullptr) { + std::free(temp_pool); + temp_pool = nullptr; + } + } + + void reset(size_t capacity = 0) { + release_all(); + *this = FakeHostApi(); + temp_enabled = capacity > 0; + temp_capacity = capacity; + if (capacity > 0) { + ASSERT_EQ(posix_memalign(&temp_pool, TemporaryVariableBuffer::kTemporaryBufferAlignment, capacity), 0); + std::memset(temp_pool, 0, capacity); + } + } +}; + +FakeHostApi *g_fake = nullptr; + +void *fake_device_malloc(size_t size) { + void *ptr = std::malloc(std::max(size, 1)); + if (ptr == nullptr) { + return nullptr; + } + ++g_fake->device_malloc_count; + g_fake->live_mallocs.insert(ptr); + return ptr; +} + +void fake_device_free(void *ptr) { + if (ptr == nullptr) { + return; + } + ++g_fake->device_free_count; + EXPECT_EQ(g_fake->live_mallocs.count(ptr), 1u); + g_fake->live_mallocs.erase(ptr); + std::free(ptr); +} + +int fake_copy_to_device(void *dev_ptr, const void *host_ptr, size_t size) { + ++g_fake->copy_to_count; + if (g_fake->fail_copy_to_on_call != 0 && g_fake->copy_to_count == g_fake->fail_copy_to_on_call) { + return -7; + } + std::memcpy(dev_ptr, host_ptr, size); + return 0; +} + +int fake_copy_from_device(void *host_ptr, const void *dev_ptr, size_t size) { + ++g_fake->copy_from_count; + std::memcpy(host_ptr, dev_ptr, size); + return 0; +} + +int fake_device_memset(void *dev_ptr, int value, size_t size) { + ++g_fake->device_memset_count; + std::memset(dev_ptr, value, size); + return 0; +} + +bool fake_temporary_buffer_enabled() { return g_fake->temp_enabled; } + +bool fake_begin_temporary_buffer_run(const TemporaryBufferPlanItem *items, size_t item_count) { + 
++g_fake->temp_begin_count; + if (!g_fake->temp_enabled || g_fake->temp_capacity == 0 || g_fake->temp_pool == nullptr || g_fake->temp_active || + (items == nullptr && item_count != 0)) { + return false; + } + size_t offset = 0; + for (size_t i = 0; i < item_count; ++i) { + const size_t alignment = std::max(items[i].alignment, TemporaryVariableBuffer::kTemporaryBufferAlignment); + offset = align_up(offset, alignment); + if (items[i].bytes > g_fake->temp_capacity || offset > g_fake->temp_capacity - items[i].bytes) { + return false; + } + offset += items[i].bytes; + } + g_fake->temp_plan_count = item_count; + g_fake->temp_plan_required_bytes = offset; + g_fake->temp_offset = 0; + g_fake->temp_active = true; + return true; +} + +void *fake_acquire_temporary_buffer_slice(size_t size, size_t alignment) { + ++g_fake->temp_acquire_attempts; + const size_t effective_alignment = + std::max(alignment == 0 ? size_t{1} : alignment, TemporaryVariableBuffer::kTemporaryBufferAlignment); + const size_t offset = align_up(g_fake->temp_offset, effective_alignment); + if (!g_fake->temp_active || offset > g_fake->temp_capacity || size > g_fake->temp_capacity - offset) { + return nullptr; + } + void *ptr = static_cast(g_fake->temp_pool) + offset; + g_fake->temp_offset = offset + size; + ++g_fake->temp_acquire_successes; + return ptr; +} + +void fake_end_temporary_buffer_run() { + EXPECT_TRUE(g_fake->temp_active); + ++g_fake->temp_end_count; + g_fake->temp_active = false; +} + +int fake_setup_static_arena(size_t gm_heap_size, size_t gm_sm_size, size_t runtime_arena_size) { + ++g_fake->setup_static_arena_count; + g_fake->gm_heap.assign(gm_heap_size, 0); + g_fake->gm_sm.assign(gm_sm_size, 0); + g_fake->runtime_arena.assign(runtime_arena_size, 0); + return 0; +} + +void *fake_acquire_pooled_gm_heap() { return g_fake->gm_heap.empty() ? nullptr : g_fake->gm_heap.data(); } +void *fake_acquire_pooled_gm_sm() { return g_fake->gm_sm.empty() ? 
nullptr : g_fake->gm_sm.data(); } +void *fake_acquire_pooled_runtime_arena() { + return g_fake->runtime_arena.empty() ? nullptr : g_fake->runtime_arena.data(); +} +bool fake_lookup_prebuilt_runtime_arena_cache( + uint64_t /* hash */, const void * /* key_data */, size_t /* key_size */, void ** /* gm_heap_base */, + void ** /* sm_base */, void ** /* runtime_arena_base */, size_t * /* runtime_off */, const void ** /* image_data */, + size_t * /* image_size */ +) { + return false; +} +void fake_mark_prebuilt_runtime_arena_cached( + uint64_t /* hash */, const void * /* key_data */, size_t /* key_size */, void * /* gm_heap_base */, + void * /* sm_base */, void * /* runtime_arena_base */, size_t /* runtime_off */, const void * /* image_data */, + size_t /* image_size */ +) {} +uint64_t fake_upload_chip_callable_buffer(const void * /* callable */) { return 0; } + +HostApi make_host_api() { + return HostApi{ + fake_device_malloc, + fake_device_free, + fake_copy_to_device, + fake_copy_from_device, + fake_device_memset, + fake_temporary_buffer_enabled, + fake_begin_temporary_buffer_run, + fake_acquire_temporary_buffer_slice, + fake_end_temporary_buffer_run, + fake_setup_static_arena, + fake_acquire_pooled_gm_heap, + fake_acquire_pooled_gm_sm, + fake_acquire_pooled_runtime_arena, + fake_lookup_prebuilt_runtime_arena_cache, + fake_mark_prebuilt_runtime_arena_cached, + fake_upload_chip_callable_buffer, + }; +} + +Tensor make_tensor(std::vector &storage, bool child_memory = false) { + Tensor tensor; + uint32_t shape[1] = {static_cast(storage.size())}; + tensor.init_external(storage.data(), storage.size(), shape, 1, DataType::UINT8, 0, false, child_memory ? 
1 : 0); + return tensor; +} + +ChipStorageTaskArgs make_args(std::vector &input, std::vector &output) { + ChipStorageTaskArgs args; + args.add_tensor(make_tensor(input)); + args.add_tensor(make_tensor(output)); + return args; +} + +int bind_runtime( + Runtime &runtime, const HostApi &api, const ChipStorageTaskArgs &args, const ArgDirection *signature, int sig_count +) { + uint64_t ring_task_window[PTO2_MAX_RING_DEPTH] = {4, 4, 4, 4}; + uint64_t ring_heap[PTO2_MAX_RING_DEPTH] = {1024, 1024, 1024, 1024}; + uint64_t ring_dep_pool[PTO2_MAX_RING_DEPTH] = {4, 4, 4, 4}; + return bind_callable_to_runtime_impl( + &runtime, &api, &args, nullptr, signature, sig_count, ring_task_window, ring_heap, ring_dep_pool + ); +} + +class TrbRuntimeTempBufferTest : public ::testing::Test { +protected: + void SetUp() override { g_fake = &fake_; } + void TearDown() override { + fake_.release_all(); + g_fake = nullptr; + } + + Runtime make_runtime() { return Runtime{}; } + + FakeHostApi fake_; + HostApi api_ = make_host_api(); +}; + +} // namespace + +TEST_F(TrbRuntimeTempBufferTest, AutoEnabledUsesTemporarySlicesWithoutChangingCopies) { + std::vector input(64, 7); + std::vector output(64, 0); + ChipStorageTaskArgs args = make_args(input, output); + ArgDirection signature[2] = {ArgDirection::IN, ArgDirection::OUT}; + + fake_.reset(0); + Runtime malloc_runtime = make_runtime(); + ASSERT_EQ(bind_runtime(malloc_runtime, api_, args, signature, 2), 0); + EXPECT_EQ(fake_.device_malloc_count, 2); + EXPECT_EQ(fake_.copy_to_count, 2); + EXPECT_EQ(fake_.device_memset_count, 1); + ASSERT_EQ(validate_runtime_impl(&malloc_runtime, &api_), 0); + EXPECT_EQ(fake_.device_free_count, 2); + EXPECT_EQ(fake_.copy_from_count, 2); + EXPECT_EQ(fake_.temp_begin_count, 0); + + fake_.reset(4096); + Runtime buffer_runtime = make_runtime(); + ASSERT_EQ(bind_runtime(buffer_runtime, api_, args, signature, 2), 0); + EXPECT_EQ(fake_.device_malloc_count, 0); + EXPECT_EQ(fake_.temp_begin_count, 1); + 
EXPECT_EQ(fake_.temp_plan_count, 2u); + EXPECT_EQ(fake_.temp_plan_required_bytes, 1088u); + EXPECT_EQ(fake_.temp_acquire_successes, 2); + EXPECT_EQ(fake_.copy_to_count, 2); + EXPECT_EQ(fake_.device_memset_count, 1); + ASSERT_EQ(validate_runtime_impl(&buffer_runtime, &api_), 0); + EXPECT_EQ(fake_.device_free_count, 0); + EXPECT_EQ(fake_.temp_end_count, 1); + EXPECT_EQ(fake_.copy_from_count, 2); +} + +TEST_F(TrbRuntimeTempBufferTest, ChildMemoryIsPassThroughAndPureOutStillMemsets) { + fake_.reset(4096); + Runtime runtime = make_runtime(); + std::vector child(64, 3); + std::vector output(64, 0); + ChipStorageTaskArgs args; + args.add_tensor(make_tensor(child, true)); + args.add_tensor(make_tensor(output)); + ArgDirection signature[2] = {ArgDirection::IN, ArgDirection::OUT}; + + ASSERT_EQ(bind_runtime(runtime, api_, args, signature, 2), 0); + EXPECT_EQ(fake_.temp_plan_count, 1u); + EXPECT_EQ(fake_.temp_plan_required_bytes, 64u); + EXPECT_EQ(fake_.temp_acquire_successes, 1); + EXPECT_EQ(fake_.device_malloc_count, 0); + EXPECT_EQ(fake_.copy_to_count, 1); + EXPECT_EQ(fake_.device_memset_count, 1); + ASSERT_EQ(validate_runtime_impl(&runtime, &api_), 0); + EXPECT_EQ(fake_.device_free_count, 0); + EXPECT_EQ(fake_.temp_end_count, 1); +} + +TEST_F(TrbRuntimeTempBufferTest, TemporaryPlanFailureFailsWithoutMallocFallback) { + fake_.reset(1024); + Runtime runtime = make_runtime(); + std::vector input(768, 1); + std::vector output(768, 0); + ChipStorageTaskArgs args = make_args(input, output); + ArgDirection signature[2] = {ArgDirection::IN, ArgDirection::OUT}; + + EXPECT_EQ(bind_runtime(runtime, api_, args, signature, 2), -1); + EXPECT_EQ(fake_.device_malloc_count, 0); + EXPECT_EQ(fake_.temp_begin_count, 1); + EXPECT_EQ(fake_.temp_acquire_attempts, 0); + EXPECT_EQ(fake_.temp_acquire_successes, 0); + EXPECT_EQ(fake_.temp_end_count, 0); + EXPECT_FALSE(runtime.temporary_buffer_run_active_); + EXPECT_TRUE(runtime.tensor_leases_.empty()); +} + +TEST_F(TrbRuntimeTempBufferTest, 
FailedCopyReleasesRecordedFreeLease) { + fake_.reset(0); + fake_.fail_copy_to_on_call = 1; + Runtime runtime = make_runtime(); + std::vector input(64, 9); + ChipStorageTaskArgs args; + args.add_tensor(make_tensor(input)); + ArgDirection signature[1] = {ArgDirection::IN}; + + EXPECT_EQ(bind_runtime(runtime, api_, args, signature, 1), -1); + EXPECT_EQ(fake_.device_malloc_count, 1); + EXPECT_EQ(fake_.device_free_count, 1); + EXPECT_TRUE(runtime.tensor_leases_.empty()); + EXPECT_FALSE(runtime.temporary_buffer_run_active_); +} + +TEST_F(TrbRuntimeTempBufferTest, FailedCopyAfterTemporaryRunBeginsEndsRunOnce) { + fake_.reset(4096); + fake_.fail_copy_to_on_call = 1; + Runtime runtime = make_runtime(); + std::vector input(64, 9); + ChipStorageTaskArgs args; + args.add_tensor(make_tensor(input)); + ArgDirection signature[1] = {ArgDirection::IN}; + + EXPECT_EQ(bind_runtime(runtime, api_, args, signature, 1), -1); + EXPECT_EQ(fake_.device_malloc_count, 0); + EXPECT_EQ(fake_.device_free_count, 0); + EXPECT_EQ(fake_.temp_begin_count, 1); + EXPECT_EQ(fake_.temp_acquire_attempts, 1); + EXPECT_EQ(fake_.temp_acquire_successes, 1); + EXPECT_EQ(fake_.copy_to_count, 1); + EXPECT_EQ(fake_.temp_end_count, 1); + EXPECT_FALSE(fake_.temp_active); + EXPECT_TRUE(runtime.tensor_leases_.empty()); + EXPECT_FALSE(runtime.temporary_buffer_run_active_); +} diff --git a/tests/ut/py/test_chip_worker.py b/tests/ut/py/test_chip_worker.py index fe6efe4e5..83f33bf0b 100644 --- a/tests/ut/py/test_chip_worker.py +++ b/tests/ut/py/test_chip_worker.py @@ -239,6 +239,11 @@ def test_l3_l2_orch_comm_shutdown_before_init_raises(self): with pytest.raises(RuntimeError, match="not initialized"): worker.l3_l2_orch_comm_shutdown() + def test_configure_temporary_buffer_before_init_raises(self): + worker = _ChipWorker() + with pytest.raises(RuntimeError, match="not initialized"): + worker.configure_temporary_buffer_auto(True) + # ============================================================================ # 
Python-level ChipWorker wrapper tests @@ -271,6 +276,7 @@ def __init__(self): self.unregistered = [] self.aicpu_dlopen_count = 0 self.host_dlopen_count = 0 + self.configured_temporary_buffer_auto = [] def register_callable(self, slot, callable_obj): self.prepared.append((slot, callable_obj)) @@ -281,6 +287,9 @@ def run(self, slot, args, config): def unregister_callable(self, slot): self.unregistered.append(slot) + def configure_temporary_buffer_auto(self, enabled): + self.configured_temporary_buffer_auto.append(enabled) + worker = ChipWorker() fake = FakeImpl() worker._impl = fake @@ -304,6 +313,10 @@ def unregister_callable(self, slot): worker.unregister_callable(second) assert fake.unregistered == [0] + worker.configure_temporary_buffer_auto(True) + worker.configure_temporary_buffer_auto(False) + assert fake.configured_temporary_buffer_auto == [True, False] + def test_public_wrapper_rejects_raw_slot_run(self): from _task_interface import ChipStorageTaskArgs # noqa: PLC0415 from simpler.task_interface import ChipWorker # noqa: PLC0415 # pyright: ignore[reportAttributeAccessIssue] diff --git a/tests/ut/py/test_worker/test_host_worker.py b/tests/ut/py/test_worker/test_host_worker.py index e385e1143..7f911452a 100644 --- a/tests/ut/py/test_worker/test_host_worker.py +++ b/tests/ut/py/test_worker/test_host_worker.py @@ -18,6 +18,7 @@ from multiprocessing.shared_memory import SharedMemory import pytest +import simpler.worker as worker_mod from _task_interface import MAX_REGISTERED_CALLABLE_IDS # pyright: ignore[reportMissingImports] from simpler.callable_identity import ( CallableHandle, @@ -106,6 +107,14 @@ def _slot_for(worker: Worker, handle: CallableHandle) -> int: return worker._identity_registry[handle.digest].slot_id +class _FakeChipWorker: + def __init__(self) -> None: + self.configured_temporary_buffer_auto: list[bool] = [] + + def configure_temporary_buffer_auto(self, enabled: bool) -> None: + self.configured_temporary_buffer_auto.append(enabled) + + class 
_FakeControlResult: def __init__(self, worker_type: str, worker_id: int = 0, ok: bool = True, error_message: str = ""): self.worker_type = worker_type @@ -122,6 +131,83 @@ def _chip_payload_shm(callable_obj: ChipCallable) -> SharedMemory: return shm +def test_l2_worker_configure_temporary_buffer_auto_records_and_forwards(): + worker = Worker(level=2, platform="a2a3sim", runtime="tensormap_and_ringbuffer") + + assert worker.temporary_buffer_mode == "off" + worker.configure_temporary_buffer_auto(True) + assert worker._config["temporary_buffer_mode"] == "auto" + assert worker.temporary_buffer_mode == "auto" + + fake_chip = _FakeChipWorker() + worker._chip_worker = fake_chip + worker.configure_temporary_buffer_auto(False) + assert fake_chip.configured_temporary_buffer_auto == [False] + assert worker.temporary_buffer_mode == "off" + + +def test_temporary_buffer_configuration_records_for_l3_children(): + worker = Worker(level=3, num_sub_workers=0) + + worker.configure_temporary_buffer_auto(True) + assert worker._config["temporary_buffer_mode"] == "auto" + assert worker.temporary_buffer_mode == "auto" + + +def test_temporary_buffer_rejects_removed_budget_config(): + with pytest.raises(ValueError, match="max_temporary_buffer_bytes"): + Worker(level=2, max_temporary_buffer_bytes=1024) + + with pytest.raises(ValueError, match="temporary_buffer_mode"): + Worker(level=2, temporary_buffer_mode="bad") + + +def test_chip_process_loop_configures_temporary_buffer(monkeypatch): + events: list[tuple] = [] + + class FakeChipWorker: + def init(self, device_id, bins, *, log_level, log_info_v): + events.append(("init", device_id, bins, log_level, log_info_v)) + + def configure_temporary_buffer_auto(self, enabled: bool) -> None: + events.append(("configure_temporary_buffer_auto", enabled)) + + def finalize(self) -> None: + events.append(("finalize",)) + + def fake_run_chip_main_loop(cw, *_args, chip_platform, chip_runtime): + events.append(("main_loop", cw, chip_platform, chip_runtime)) + 
+ monkeypatch.setattr(worker_mod, "ChipWorker", FakeChipWorker) + monkeypatch.setattr(worker_mod, "_run_chip_main_loop", fake_run_chip_main_loop) + + shm = SharedMemory(create=True, size=MAILBOX_SIZE) + try: + assert shm.buf is not None + worker_mod._chip_process_loop( + shm.buf, + "bins", + 7, + {}, + {}, + {}, + worker_mod._ChipProcessConfig( + platform="a2a3", + runtime="tensormap_and_ringbuffer", + temporary_buffer_mode="auto", + ), + ) + finally: + shm.close() + shm.unlink() + + assert events[0] == ("init", 7, "bins", 1, 5) + assert events[1] == ("configure_temporary_buffer_auto", True) + assert events[2][0] == "main_loop" + assert events[2][2:] == ("a2a3", "tensormap_and_ringbuffer") + assert events[3] == ("finalize",) + + def _chip_digest(callable_obj: ChipCallable, *, platform: str = "", runtime: str = "") -> bytes: descriptor = build_chip_callable_descriptor(target=callable_obj, platform=platform, runtime=runtime) return hashid_to_digest(compute_callable_hashid(descriptor))