From 7526853aaeb02fa8098ab55d02f22ed06721cec4 Mon Sep 17 00:00:00 2001
From: Chao Wang <26245345+ChaoWao@users.noreply.github.com>
Date: Wed, 1 Jul 2026 09:15:33 +0800
Subject: [PATCH] Refactor: remove dead gm_heap_ptr_ and slot_states_ptr_ from trb Runtime
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Follow-up cleanup to #1216. Two runtime fields were carried but never
read:

- `slot_states_ptr_` (in the device-copied `dev` descriptor): only ever
  written as nullptr (ctor + one aicpu_executor site) and never read
  anywhere. Removed the field, its setter, and the nullptr-write call
  site. Since `dev` is alignas(64), the sizeof%64==0 static_assert still
  holds.
- `gm_heap_ptr_` (host-only tail): written via set_gm_heap in
  runtime_maker but never read — the GM heap buffer is owned by
  DeviceRunnerBase's gm_heap_arena_ and released in finalize(),
  independent of this field (runtime_maker keeps the local
  StaticArenaPtrs::gm_heap that feeds init_data_from_layout). Removed
  the field, its getter/setter, and the set_gm_heap call site; the
  acquire_pooled_gm_heap() acquisition stays.

Both arches (a2a3 + a5) are updated in lockstep. hbg is untouched (it
has its own Runtime and never referenced these trb accessors).

Verified: a2a3+a5 sim+onboard builds (trb+hbg); a2a3sim
prepared_callable + dummy_task + orch_so_cache; a2a3 onboard
prepared_callable + dummy_task + mixed_example.

Co-Authored-By: Claude Opus 4.8 (1M context) --- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 3 --- .../runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp | 1 - src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h | 6 ------ .../tensormap_and_ringbuffer/runtime/shared/runtime.cpp | 5 ----- .../tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp | 3 --- .../runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp | 1 - src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h | 6 ------ .../tensormap_and_ringbuffer/runtime/shared/runtime.cpp | 5 ----- 8 files changed, 30 deletions(-) diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 2d5613ba8..6e5d79e80 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -560,9 +560,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #endif - // With multi-ring, slot_states are per-ring inside the scheduler. - runtime->set_slot_states_ptr(nullptr); - // Wire scheduler context to the newly created PTO2Runtime before // releasing scheduler threads from runtime_init_ready_. 
sched_ctx_.bind_runtime(rt); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc0ddefb8..99902ccfe 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -514,7 +514,6 @@ static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizi LOG_ERROR("Failed to acquire pooled GM heap"); return false; } - runtime->set_gm_heap(out->gm_heap); int64_t t_sm_start = _now_ms(); out->gm_sm = runtime->host_api.acquire_pooled_gm_sm(); diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h index ae6a2446e..e75f45f36 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -236,7 +236,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { bool serial_orch_sched; void *gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) - void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) ChipStorageTaskArgs orch_args_storage_; // Copy of args for device // Prebuilt-arena fast path (trb only). 
Set by the host before rtMemcpy'ing @@ -280,8 +279,6 @@ class Runtime { int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; int registered_kernel_count_; - void *gm_heap_ptr_; // GM heap for orchestrator output buffers (device); host-only bookkeeping - public: /** * Constructor - zero-initialize all arrays @@ -319,11 +316,8 @@ class Runtime { // ========================================================================= void *get_gm_sm_ptr() const; - void *get_gm_heap_ptr() const; const ChipStorageTaskArgs &get_orch_args() const; void set_gm_sm_ptr(void *p); - void set_gm_heap(void *p); - void set_slot_states_ptr(void *p); void set_orch_args(const ChipStorageTaskArgs &args); // Prebuilt-arena fast path (trb only). Set by host's diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8b28e620e..1fb2e4513 100644 --- a/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a2a3/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -40,7 +40,6 @@ Runtime::Runtime() { dev.orch_to_sched = false; dev.serial_orch_sched = false; dev.gm_sm_ptr_ = nullptr; - dev.slot_states_ptr_ = nullptr; dev.orch_args_storage_.clear(); dev.prebuilt_arena_base_ = nullptr; dev.prebuilt_runtime_offset_ = 0; @@ -50,7 +49,6 @@ Runtime::Runtime() { } // Initialize host-only tail. 
- gm_heap_ptr_ = nullptr; registered_kernel_count_ = 0; } @@ -59,11 +57,8 @@ Runtime::Runtime() { // ============================================================================= void *Runtime::get_gm_sm_ptr() const { return dev.gm_sm_ptr_; } -void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; } const ChipStorageTaskArgs &Runtime::get_orch_args() const { return dev.orch_args_storage_; } void Runtime::set_gm_sm_ptr(void *p) { dev.gm_sm_ptr_ = p; } -void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } -void Runtime::set_slot_states_ptr(void *p) { dev.slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { dev.orch_args_storage_ = args; } void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) { diff --git a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp index 41a97ace8..c030da69c 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp @@ -558,9 +558,6 @@ int32_t AicpuExecutor::run(Runtime *runtime) { } #endif - // With multi-ring, slot_states are per-ring inside the scheduler. - runtime->set_slot_states_ptr(nullptr); - // Wire scheduler context to the newly created PTO2Runtime before // releasing scheduler threads from runtime_init_ready_. 
sched_ctx_.bind_runtime(rt); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp index dc0ddefb8..99902ccfe 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp @@ -514,7 +514,6 @@ static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizi LOG_ERROR("Failed to acquire pooled GM heap"); return false; } - runtime->set_gm_heap(out->gm_heap); int64_t t_sm_start = _now_ms(); out->gm_sm = runtime->host_api.acquire_pooled_gm_sm(); diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h index 6d2bc08a0..be5126576 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/runtime.h @@ -250,7 +250,6 @@ struct alignas(64) DeviceRuntimeLaunchDesc { bool serial_orch_sched; void *gm_sm_ptr_; // GM pointer to PTO2 shared memory (device) - void *slot_states_ptr_; // Pointer to PTO2TaskSlotState array (scheduler-private, for profiling) ChipStorageTaskArgs orch_args_storage_; // Copy of args for device // Prebuilt-arena fast path (trb only). 
Set by the host before rtMemcpy'ing @@ -294,8 +293,6 @@ class Runtime { int registered_kernel_func_ids_[RUNTIME_MAX_FUNC_ID]; int registered_kernel_count_; - void *gm_heap_ptr_; // GM heap for orchestrator output buffers (device); host-only bookkeeping - public: /** * Constructor - zero-initialize all arrays @@ -333,11 +330,8 @@ class Runtime { // ========================================================================= void *get_gm_sm_ptr() const; - void *get_gm_heap_ptr() const; const ChipStorageTaskArgs &get_orch_args() const; void set_gm_sm_ptr(void *p); - void set_gm_heap(void *p); - void set_slot_states_ptr(void *p); void set_orch_args(const ChipStorageTaskArgs &args); // Prebuilt-arena fast path (trb only). Set by host's diff --git a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp index 8b28e620e..1fb2e4513 100644 --- a/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp +++ b/src/a5/runtime/tensormap_and_ringbuffer/runtime/shared/runtime.cpp @@ -40,7 +40,6 @@ Runtime::Runtime() { dev.orch_to_sched = false; dev.serial_orch_sched = false; dev.gm_sm_ptr_ = nullptr; - dev.slot_states_ptr_ = nullptr; dev.orch_args_storage_.clear(); dev.prebuilt_arena_base_ = nullptr; dev.prebuilt_runtime_offset_ = 0; @@ -50,7 +49,6 @@ Runtime::Runtime() { } // Initialize host-only tail. 
- gm_heap_ptr_ = nullptr; registered_kernel_count_ = 0; } @@ -59,11 +57,8 @@ Runtime::Runtime() { // ============================================================================= void *Runtime::get_gm_sm_ptr() const { return dev.gm_sm_ptr_; } -void *Runtime::get_gm_heap_ptr() const { return gm_heap_ptr_; } const ChipStorageTaskArgs &Runtime::get_orch_args() const { return dev.orch_args_storage_; } void Runtime::set_gm_sm_ptr(void *p) { dev.gm_sm_ptr_ = p; } -void Runtime::set_gm_heap(void *p) { gm_heap_ptr_ = p; } -void Runtime::set_slot_states_ptr(void *p) { dev.slot_states_ptr_ = p; } void Runtime::set_orch_args(const ChipStorageTaskArgs &args) { dev.orch_args_storage_ = args; } void Runtime::set_prebuilt_arena(void *arena_base, size_t runtime_off) {