From f4ba6c92fc331f7e10516586bfbe2bbe520dcea2 Mon Sep 17 00:00:00 2001
From: Chao Wang <26245345+ChaoWao@users.noreply.github.com>
Date: Tue, 30 Jun 2026 19:43:00 +0800
Subject: [PATCH] Refactor: split trb bind_callable into lifecycle helpers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The ~200-line bind_callable_to_runtime_impl folded three distinct
lifecycles into one function. Split it into named steps so the entry
point reads as the lifecycles it orchestrates:

- resolve_arena_sizing  (per-config): ring sizing + derived heap/SM
  sizes + scheduler timeout — the layout input half, host arithmetic
- stage_device_args     (per-run): the only signature-aware step —
  H2D copy / pure-OUTPUT zeroing / copy-back recording
- apply_orch_sched_env_flags (per-run): latch the orch->sched env gates
- ensure_static_arenas  (per-config): reserve + acquire the static pools
- build_runtime_image   (per-config): pure host image build, no device
  touch — the hook a later image-cache stage can memoize
- bind_launch_state     (per-run): publish args + rtMemcpy + record base

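Results are unchanged on the success path: the same TIMING lines, the
simpler_run.bind.{args,prebuilt} STRACE spans, and the same error
messages and return codes. A few orderings do shift with the split:

- TIMING args_malloc_copy / static_arena_setup / gm_heap_acquire /
  shared_mem_acquire are now logged by the helper that measures them,
  i.e. before "Device orchestration ready" rather than after it
- the heap-size overflow check and resolve_scheduler_timeout_ms() now
  run before tensor staging; host_arena.commit() now runs after the
  static pools are acquired, and set_orch_args() after the image is
  built
- PTO2_ORCH_TO_SCHED is now assigned unconditionally (previously the
  env var could only set the flag, never clear it)

The host DeviceArena stays a caller-owned local handed to the helpers
by pointer/reference (it is non-copyable/non-movable), so the image
outlives the call until upload.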

Also re-syncs the drifted a2a3/a5 runtime_maker copies: a5 adopts the
STRACE markers, common/strace.h include, and pto2_-prefixed naming that
were pure drift, leaving the two files byte-identical.

Verified on sim (behavior unchanged): a2a3sim trb ST 30 passed/1
skipped, a5sim trb ST 20 passed.
---
 .../host/runtime_maker.cpp                    | 454 +++++++++++-------
 .../host/runtime_maker.cpp                    | 387 +++++++++------
 2 files changed, 517 insertions(+), 324 deletions(-)

diff --git a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index 25dfe2d3a..dc0ddefb8 100644
--- a/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a2a3/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -345,6 +345,257 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
     return 0;
 }
 
+// Effective ring sizing for one (callable_id, config): the input half of the
+// arena description. Resolved once per config from per-task overrides + env +
+// compile-time defaults; depends on nothing that varies per run. `total_heap`
+// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms`
+// is the resolved per-platform scheduler no-progress budget.
+struct ArenaSizingConfig {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    int32_t scheduler_timeout_ms;
+    uint64_t total_heap;
+    uint64_t sm_size;
+};
+
+// Device pointers to the per-Worker static pools that DeviceRunner keeps alive
+// across runs (freed in DeviceRunner::finalize(), never in tensor_pairs_).
+struct StaticArenaPtrs {
+    void *gm_heap;
+    void *gm_sm;
+    void *runtime_arena_dev;
+};
+
+// per-(cid,config): resolve the arena sizing. Pure host arithmetic over
+// per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the
+// total heap (with overflow check) and SM sizes and the scheduler timeout.
+// Returns false on an invalid ring config or a heap-size overflow.
+static bool resolve_arena_sizing(
+    const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out
+) {
+    if (!resolve_ring_config(
+            ring_task_window, ring_heap, ring_dep_pool, out->task_window_sizes, out->heap_sizes,
+            out->dep_pool_capacities
+        )) {
+        return false;
+    }
+    const std::string task_window_log = format_ring_array(out->task_window_sizes);
+    const std::string heap_log = format_ring_array(out->heap_sizes);
+    const std::string dep_pool_log = format_ring_array(out->dep_pool_capacities);
+    LOG_INFO_V0(
+        "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
+        dep_pool_log.c_str()
+    );
+
+    out->total_heap = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (out->heap_sizes[r] > std::numeric_limits<uint64_t>::max() - out->total_heap) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return false;
+        }
+        out->total_heap += out->heap_sizes[r];
+    }
+    out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes);
+    out->scheduler_timeout_ms = resolve_scheduler_timeout_ms();
+    return true;
+}
+
+// per-run: the only signature-aware step. Copy the orch args, replacing each
+// host tensor pointer with a freshly staged device pointer (H2D copy-in, or an
+// on-device zero for pure-OUTPUT buffers), and record the host/device pair for
+// copy-back. Read-only INPUT tensors skip copy-back. On failure the partially
+// staged device_args / tensor_pairs_ stay owned by the caller's Runtime, which
+// frees them in validate_runtime_impl.
+static bool stage_device_args(
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, const ArgDirection *signature, int sig_count,
+    ChipStorageTaskArgs *out
+) {
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
+
+    int64_t t_args_start = _now_ms();
+    STRACE_A("simpler_run.bind.args", "");
+    for (int i = 0; i < tensor_count; i++) {
+        Tensor t = orch_args->tensor(i);
+
+        if (t.is_child_memory()) {
+            LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
+            out->add_tensor(t);
+            continue;
+        }
+
+        void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.buffer.addr));
+        size_t size = static_cast<size_t>(t.nbytes());
+
+        void *dev_ptr = runtime->host_api.device_malloc(size);
+        if (dev_ptr == nullptr) {
+            LOG_ERROR("Failed to allocate device memory for tensor %d", i);
+            return false;
+        }
+
+        // Pure write-only OUTPUT buffers carry no meaningful host content, so
+        // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
+        // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
+        // rather than pooled-allocator garbage. INOUT (read-before-write)
+        // and IN keep the H2D copy. Falls back to copy_to_device if a backend
+        // did not wire device_memset.
+        bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
+        int rc;
+        if (is_pure_output && runtime->host_api.device_memset != nullptr) {
+            rc = runtime->host_api.device_memset(dev_ptr, 0, size);
+        } else {
+            rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
+        }
+        if (rc != 0) {
+            LOG_ERROR("Failed to stage tensor %d to device", i);
+            runtime->host_api.device_free(dev_ptr);
+            return false;
+        }
+        // Read-only INPUT tensors are never written by the kernel, so there is
+        // no point copying them back D2H at the end. Index the signature
+        // by the orch tensor index `i` (child_memory tensors are skipped above
+        // but do not consume a separate signature slot — scalars follow the
+        // tensor entries). Anything not provably IN keeps the safe default of
+        // copying back.
+        bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
+        runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
+        LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
+
+        t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
+        out->add_tensor(t);
+    }
+    for (int i = 0; i < scalar_count; i++) {
+        out->add_scalar(orch_args->scalar(i));
+    }
+    int64_t t_args_end = _now_ms();
+    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
+    return true;
+}
+
+// per-run: latch the env-driven orchestrator/scheduler hand-off flags onto the
+// runtime. Behavior-only env reads (no new gates); kept here so the args and
+// image steps stay free of unrelated state.
+static void apply_orch_sched_env_flags(Runtime *runtime) {
+    const char *orch_env = std::getenv("PTO2_ORCH_TO_SCHED");
+    runtime->dev.orch_to_sched = orch_env && (orch_env[0] == '1' || orch_env[0] == 't' || orch_env[0] == 'T');
+    LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
+
+    const char *serial_env = std::getenv("PTO2_SERIAL_ORCH_SCHED");
+    runtime->dev.serial_orch_sched =
+        serial_env && (serial_env[0] == '1' || serial_env[0] == 't' || serial_env[0] == 'T');
+    LOG_INFO_V0(
+        "Serial orchestrator-to-scheduler start gate: %s", runtime->dev.serial_orch_sched ? "enabled" : "disabled"
+    );
+}
+
+// per-(cid,config): reserve and acquire the static device pools. GM heap, PTO2
+// shared memory, and the prebuilt runtime arena all live in one backing
+// allocation; setup_static_arena reserves the three regions and commits in one
+// shot. The runtime-arena size is recovered by replaying the (pure, cheap)
+// reserve sequence on a throwaway host arena. Idempotent across runs — the
+// pools are owned by DeviceRunner and freed in DeviceRunner::finalize().
+static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizing, StaticArenaPtrs *out) {
+    DeviceArena sizing_arena;  // discarded; only its computed arena_size is read
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(sizing_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
+
+    int64_t t_setup_start = _now_ms();
+    if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.arena_size) != 0) {
+        LOG_ERROR("Failed to setup pooled static arena");
+        return false;
+    }
+    int64_t t_setup_end = _now_ms();
+
+    int64_t t_heap_start = _now_ms();
+    out->gm_heap = runtime->host_api.acquire_pooled_gm_heap();
+    int64_t t_heap_end = _now_ms();
+    if (out->gm_heap == nullptr) {
+        LOG_ERROR("Failed to acquire pooled GM heap");
+        return false;
+    }
+    runtime->set_gm_heap(out->gm_heap);
+
+    int64_t t_sm_start = _now_ms();
+    out->gm_sm = runtime->host_api.acquire_pooled_gm_sm();
+    int64_t t_sm_end = _now_ms();
+    if (out->gm_sm == nullptr) {
+        LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
+        return false;
+    }
+    runtime->set_gm_sm_ptr(out->gm_sm);
+
+    out->runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (out->runtime_arena_dev == nullptr) {
+        LOG_ERROR("Failed to acquire pooled runtime arena");
+        return false;
+    }
+
+    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
+    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
+    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    return true;
+}
+
+// per-(cid,config): build the prebuilt runtime-arena image on host. Pure host
+// work — touches no device memory, only `host_arena` (owned by the caller so
+// the image outlives this call until the upload) and the device *addresses* in
+// `ptrs` (stored, never dereferenced).
+//
+// We pre-compute every byte the AICPU's runtime arena would otherwise have to
+// write at boot: layout offsets, sub-structure init data, and pointers back to
+// the SM / GM heap. AICPU boot then becomes attach + wire (cheap pointer fixup)
+// + sm_handle->init (SM reset) + a handful of device-only field fixups.
+//
+// The layout is stashed inside the image (rt->prebuilt_layout) so the AICPU can
+// recover every arena-internal offset after the rtMemcpy. Returns the layout
+// via `out_layout`; the runtime-arena device base travels separately on the
+// host Runtime (bind_launch_state), since the AICPU needs that pointer *before*
+// it can dereference the image.
+static bool build_runtime_image(
+    const ArenaSizingConfig &sizing, const StaticArenaPtrs &ptrs, DeviceArena *host_arena,
+    PTO2RuntimeArenaLayout *out_layout
+) {
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
+    layout.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
+    if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return false;
+    }
+
+    PTO2Runtime *rt = runtime_init_data_from_layout(
+        *host_arena, layout, PTO2_MODE_EXECUTE, ptrs.gm_sm, sizing.sm_size, ptrs.gm_heap, sizing.heap_sizes
+    );
+    if (rt == nullptr) {
+        LOG_ERROR("runtime_init_data_from_layout failed");
+        return false;
+    }
+    runtime_wire_arena_pointers(*host_arena, layout, rt);
+    rt->prebuilt_layout = layout;
+
+    *out_layout = layout;
+    return true;
+}
+
+// per-run: publish the launch state. Copy the staged args onto the runtime,
+// rtMemcpy the host image into the pooled runtime-arena region, and record the
+// device base + runtime offset the AICPU reads before dereferencing the image.
+static bool bind_launch_state(
+    Runtime *runtime, const StaticArenaPtrs &ptrs, const DeviceArena &host_arena, const PTO2RuntimeArenaLayout &layout,
+    const ChipStorageTaskArgs &device_args
+) {
+    runtime->set_orch_args(device_args);
+
+    int rc_upload = runtime->host_api.copy_to_device(ptrs.runtime_arena_dev, host_arena.base(), layout.arena_size);
+    if (rc_upload != 0) {
+        LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return false;
+    }
+    runtime->set_prebuilt_arena(ptrs.runtime_arena_dev, layout.off_runtime);
+    return true;
+}
+
 /**
  * Per-run binding: build device-side argument storage (tensor copy-out, GM
  * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
@@ -355,6 +606,11 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
  * design: register/run invokes this every call, while the prep
  * half runs only once per callable_id.
  *
+ * Orchestrates the three lifecycles behind the bind: per-config arena sizing
+ * (resolve_arena_sizing) + static pools (ensure_static_arenas) + host image
+ * (build_runtime_image), and per-run args (stage_device_args) + launch publish
+ * (bind_launch_state).
+ *
  * @param runtime    Pointer to pre-constructed Runtime (host_api populated)
  * @param orch_args  Separated tensor/scalar arguments for this run
  * @return 0 on success, -1 on failure
@@ -385,207 +641,43 @@ extern "C" int bind_callable_to_runtime_impl(
 
     int64_t t_total_start = _now_ms();
 
-    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH];
-    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH];
-    if (!resolve_ring_config(
-            ring_task_window, ring_heap, ring_dep_pool, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities
-        )) {
+    ArenaSizingConfig sizing;
+    if (!resolve_arena_sizing(ring_task_window, ring_heap, ring_dep_pool, &sizing)) {
         return -1;
     }
-    const std::string task_window_log = format_ring_array(eff_task_window_sizes);
-    const std::string heap_log = format_ring_array(eff_heap_sizes);
-    const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities);
-    LOG_INFO_V0(
-        "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
-        dep_pool_log.c_str()
-    );
 
-    // Build device args: copy from input, replace host tensor pointers with device pointers
     ChipStorageTaskArgs device_args;
-
-    int64_t t_args_start = _now_ms();
-    {
-        STRACE_A("simpler_run.bind.args", "");
-        for (int i = 0; i < tensor_count; i++) {
-            Tensor t = orch_args->tensor(i);
-
-            if (t.is_child_memory()) {
-                LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
-                device_args.add_tensor(t);
-                continue;
-            }
-
-            void *host_ptr = reinterpret_cast<void *>(static_cast<uintptr_t>(t.buffer.addr));
-            size_t size = static_cast<size_t>(t.nbytes());
-
-            void *dev_ptr = runtime->host_api.device_malloc(size);
-            if (dev_ptr == nullptr) {
-                LOG_ERROR("Failed to allocate device memory for tensor %d", i);
-                return -1;
-            }
-
-            // Pure write-only OUTPUT buffers carry no meaningful host content, so
-            // the H2D copy-in is wasted. Zero them on-device instead (cheap HBM
-            // memset, no PCIe) so any region the kernel leaves unwritten reads as 0
-            // rather than pooled-allocator garbage. INOUT (read-before-write)
-            // and IN keep the H2D copy. Falls back to copy_to_device if a backend
-            // did not wire device_memset.
-            bool is_pure_output = (signature != nullptr && i < sig_count && signature[i] == ArgDirection::OUT);
-            int rc;
-            if (is_pure_output && runtime->host_api.device_memset != nullptr) {
-                rc = runtime->host_api.device_memset(dev_ptr, 0, size);
-            } else {
-                rc = runtime->host_api.copy_to_device(dev_ptr, host_ptr, size);
-            }
-            if (rc != 0) {
-                LOG_ERROR("Failed to stage tensor %d to device", i);
-                runtime->host_api.device_free(dev_ptr);
-                return -1;
-            }
-            // Read-only INPUT tensors are never written by the kernel, so there is
-            // no point copying them back D2H at the end. Index the signature
-            // by the orch tensor index `i` (child_memory tensors are skipped above
-            // but do not consume a separate signature slot — scalars follow the
-            // tensor entries). Anything not provably IN keeps the safe default of
-            // copying back.
-            bool needs_copy_back = !(signature != nullptr && i < sig_count && signature[i] == ArgDirection::IN);
-            runtime->tensor_pairs_.push_back({host_ptr, dev_ptr, size, needs_copy_back});
-            LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
-
-            t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
-            device_args.add_tensor(t);
-        }
-        for (int i = 0; i < scalar_count; i++) {
-            device_args.add_scalar(orch_args->scalar(i));
-        }
-    }
-    int64_t t_args_end = _now_ms();
-
-    // Read orchestrator-to-scheduler transition flag from environment
-    {
-        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
-        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
-            runtime->dev.orch_to_sched = true;
-        }
-        LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
+    if (!stage_device_args(runtime, orch_args, signature, sig_count, &device_args)) {
+        return -1;
     }
 
-    // Read serial orchestrator -> scheduler start gate from environment.
-    {
-        const char *env_val = std::getenv("PTO2_SERIAL_ORCH_SCHED");
-        runtime->dev.serial_orch_sched = env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T');
-        LOG_INFO_V0(
-            "Serial orchestrator-to-scheduler start gate: %s", runtime->dev.serial_orch_sched ? "enabled" : "disabled"
-        );
-    }
-
-    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
-    // and the prebuilt runtime arena all live in a single backing allocation;
-    // setup_static_arena reserves the three regions and commits in one shot.
-    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
-    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
-    // determined by replaying the reserve sequence on a host-side arena.
-    uint64_t total_heap_size = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (eff_heap_sizes[r] > std::numeric_limits<uint64_t>::max() - total_heap_size) {
-            LOG_ERROR("Total ring heap size overflows uint64_t");
-            return -1;
-        }
-        total_heap_size += eff_heap_sizes[r];
-    }
-    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes);
+    apply_orch_sched_env_flags(runtime);
 
     int64_t t_prebuilt_start = _now_ms();
     {
         STRACE("simpler_run.bind.prebuilt");
-        DeviceArena host_arena;  // libc malloc backend by default
-        PTO2RuntimeArenaLayout layout =
-            runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
-        layout.scheduler_timeout_ms = resolve_scheduler_timeout_ms();
-        if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-            LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
-            return -1;
-        }
-
-        int64_t t_setup_start = _now_ms();
-        if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
-            LOG_ERROR("Failed to setup pooled static arena");
+        StaticArenaPtrs ptrs;
+        if (!ensure_static_arenas(runtime, sizing, &ptrs)) {
             return -1;
         }
-        int64_t t_setup_end = _now_ms();
 
-        int64_t t_heap_start = _now_ms();
-        void *gm_heap = runtime->host_api.acquire_pooled_gm_heap();
-        int64_t t_heap_end = _now_ms();
-        if (gm_heap == nullptr) {
-            LOG_ERROR("Failed to acquire pooled GM heap");
+        DeviceArena host_arena;  // libc malloc backend; owns the image until upload
+        PTO2RuntimeArenaLayout layout;
+        if (!build_runtime_image(sizing, ptrs, &host_arena, &layout)) {
             return -1;
         }
-        runtime->set_gm_heap(gm_heap);
 
-        int64_t t_sm_start = _now_ms();
-        void *sm_ptr = runtime->host_api.acquire_pooled_gm_sm();
-        int64_t t_sm_end = _now_ms();
-        if (sm_ptr == nullptr) {
-            LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
+        if (!bind_launch_state(runtime, ptrs, host_arena, layout, device_args)) {
             return -1;
         }
-        runtime->set_gm_sm_ptr(sm_ptr);
-
-        void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
-        if (runtime_arena_dev == nullptr) {
-            LOG_ERROR("Failed to acquire pooled runtime arena");
-            return -1;
-        }
-
-        // Set up device orchestration state
-        runtime->set_orch_args(device_args);
-
-        // -------------------------------------------------------------------------
-        // Build the prebuilt runtime-arena image on host.
-        //
-        // We pre-compute every byte the AICPU's runtime arena would otherwise have
-        // to write at boot: layout offsets, sub-structure init data, and pointers
-        // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
-        // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
-        // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
-        // reset) + a handful of device-only field fixups.
-        // -------------------------------------------------------------------------
-        PTO2Runtime *rt = runtime_init_data_from_layout(
-            host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes
-        );
-        if (rt == nullptr) {
-            LOG_ERROR("runtime_init_data_from_layout failed");
-            return -1;
-        }
-        runtime_wire_arena_pointers(host_arena, layout, rt);
-
-        // Stash the layout inside the PTO2Runtime image so the AICPU can recover
-        // every arena-internal offset after rtMemcpy. The runtime arena's device
-        // base does NOT travel in this image — it's on the host Runtime
-        // (set_prebuilt_arena below), since the AICPU needs that pointer
-        // *before* it can dereference the image.
-        rt->prebuilt_layout = layout;
-
-        int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
-        if (rc_upload != 0) {
-            LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
-            return -1;
-        }
-        runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
-        int64_t t_prebuilt_end = _now_ms();
+    }
+    int64_t t_prebuilt_end = _now_ms();
 
-        LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
+    LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
 
-        int64_t t_total_end = _now_ms();
-        LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
-        LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
-        LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
-        LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
-        LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
-        LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
-    }
+    int64_t t_total_end = _now_ms();
+    LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
+    LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
     return 0;
 }
diff --git a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
index e83edc8bf..dc0ddefb8 100644
--- a/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
+++ b/src/a5/runtime/tensormap_and_ringbuffer/host/runtime_maker.cpp
@@ -45,12 +45,13 @@
 #include "../runtime/pto_shared_memory.h"
 #include "../runtime/runtime.h"
 #include "../../../../common/task_interface/call_config.h"
-#include "utils/device_arena.h"
 #include "callable.h"
 #include "common/platform_config.h"
+#include "common/strace.h"
 #include "common/unified_log.h"
 #include "host/platform_compile_info.h"
 #include "host/runtime_timeout_config.h"
+#include "utils/device_arena.h"
 #include "prepare_callable_common.h"
 
 static_assert(
@@ -271,17 +272,17 @@ static int32_t resolve_scheduler_timeout_ms() {
     return cfg.scheduler_timeout_ms;
 }
 
-static int32_t read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
+static int32_t pto2_read_runtime_status(Runtime *runtime, PTO2SharedMemoryHeader *host_header) {
     if (runtime == nullptr || host_header == nullptr) {
         return 0;
     }
 
-    void *sm_ptr = runtime->get_gm_sm_ptr();
-    if (sm_ptr == nullptr) {
+    void *pto2_sm = runtime->get_gm_sm_ptr();
+    if (pto2_sm == nullptr) {
         return 0;
     }
 
-    int hdr_rc = runtime->host_api.copy_from_device(host_header, sm_ptr, sizeof(PTO2SharedMemoryHeader));
+    int hdr_rc = runtime->host_api.copy_from_device(host_header, pto2_sm, sizeof(PTO2SharedMemoryHeader));
     if (hdr_rc != 0) {
         LOG_WARN("Failed to copy PTO2 header from device");
         return 0;
@@ -344,72 +345,83 @@ register_callable_impl(const ChipCallable *callable, uint64_t (*upload_fn)(const
     return 0;
 }
 
-/**
- * Per-run binding: build device-side argument storage (tensor copy-out, GM
- * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
- * callable-side state (kernel binaries, orch SO bytes, func/config names)
- * is already populated by register_callable_impl.
- *
- * Splitting this from register_callable_impl matches the per-callable_id
- * design: register/run invokes this every call, while the prep
- * half runs only once per callable_id.
- *
- * @param runtime    Pointer to pre-constructed Runtime (host_api populated)
- * @param orch_args  Separated tensor/scalar arguments for this run
- * @return 0 on success, -1 on failure
- */
-extern "C" int bind_callable_to_runtime_impl(
-    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
-    int sig_count, const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool
+// Effective ring sizing for one (callable_id, config): the input half of the
+// arena description. Resolved once per config from per-task overrides + env +
+// compile-time defaults; depends on nothing that varies per run. `total_heap`
+// and `sm_size` are the derived backing-allocation sizes; `scheduler_timeout_ms`
+// is the resolved per-platform scheduler no-progress budget.
+struct ArenaSizingConfig {
+    uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH];
+    uint64_t heap_sizes[PTO2_MAX_RING_DEPTH];
+    int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH];
+    int32_t scheduler_timeout_ms;
+    uint64_t total_heap;
+    uint64_t sm_size;
+};
+
+// Device pointers to the per-Worker static pools that DeviceRunner keeps alive
+// across runs (freed in DeviceRunner::finalize(), never in tensor_pairs_).
+struct StaticArenaPtrs {
+    void *gm_heap;
+    void *gm_sm;
+    void *runtime_arena_dev;
+};
+
+// per-(cid,config): resolve the arena sizing. Pure host arithmetic over
+// per-task overrides, PTO2_RING_* env, and compile-time defaults; derives the
+// total heap (with overflow check) and SM sizes and the scheduler timeout.
+// Returns false on an invalid ring config or a heap-size overflow.
+static bool resolve_arena_sizing(
+    const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool, ArenaSizingConfig *out
 ) {
-    if (runtime == nullptr) {
-        LOG_ERROR("Runtime pointer is null");
-        return -1;
-    }
-    if (orch_args == nullptr) {
-        LOG_ERROR("orch_args pointer is null");
-        return -1;
-    }
-    // trb runs orchestration on the device — there is no host-side orch
-    // function pointer to invoke. The c_api signature accepts one for
-    // symmetry with hbg; assert the trb-side invariant here.
-    if (host_orch_func_ptr != nullptr) {
-        LOG_ERROR("bind_callable_to_runtime_impl: trb does not accept a host_orch_func_ptr");
-        return -1;
-    }
-
-    int tensor_count = orch_args->tensor_count();
-    int scalar_count = orch_args->scalar_count();
-    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
-
-    int64_t t_total_start = _now_ms();
-
-    uint64_t eff_task_window_sizes[PTO2_MAX_RING_DEPTH];
-    uint64_t eff_heap_sizes[PTO2_MAX_RING_DEPTH];
-    int32_t eff_dep_pool_capacities[PTO2_MAX_RING_DEPTH];
     if (!resolve_ring_config(
-            ring_task_window, ring_heap, ring_dep_pool, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities
+            ring_task_window, ring_heap, ring_dep_pool, out->task_window_sizes, out->heap_sizes,
+            out->dep_pool_capacities
         )) {
-        return -1;
+        return false;
     }
-    const std::string task_window_log = format_ring_array(eff_task_window_sizes);
-    const std::string heap_log = format_ring_array(eff_heap_sizes);
-    const std::string dep_pool_log = format_ring_array(eff_dep_pool_capacities);
+    const std::string task_window_log = format_ring_array(out->task_window_sizes);
+    const std::string heap_log = format_ring_array(out->heap_sizes);
+    const std::string dep_pool_log = format_ring_array(out->dep_pool_capacities);
     LOG_INFO_V0(
         "Ring buffer sizes: task_window=%s heap=%s dep_pool=%s", task_window_log.c_str(), heap_log.c_str(),
         dep_pool_log.c_str()
     );
 
-    // Build device args: copy from input, replace host tensor pointers with device pointers
-    ChipStorageTaskArgs device_args;
+    out->total_heap = 0;
+    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
+        if (out->heap_sizes[r] > std::numeric_limits<uint64_t>::max() - out->total_heap) {
+            LOG_ERROR("Total ring heap size overflows uint64_t");
+            return false;
+        }
+        out->total_heap += out->heap_sizes[r];
+    }
+    out->sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(out->task_window_sizes);
+    out->scheduler_timeout_ms = resolve_scheduler_timeout_ms();
+    return true;
+}
+
+// per-run: the only signature-aware step. Copy the orch args, replacing each
+// host tensor pointer with a freshly staged device pointer (H2D copy-in, or an
+// on-device zero for pure-OUTPUT buffers), and record the host/device pair for
+// copy-back. Read-only INPUT tensors skip copy-back. On failure the partially
+// staged device_args / tensor_pairs_ stay owned by the caller's Runtime, which
+// frees them in validate_runtime_impl.
+static bool stage_device_args(
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, const ArgDirection *signature, int sig_count,
+    ChipStorageTaskArgs *out
+) {
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
 
     int64_t t_args_start = _now_ms();
+    STRACE_A("simpler_run.bind.args", "");
     for (int i = 0; i < tensor_count; i++) {
         Tensor t = orch_args->tensor(i);
 
         if (t.is_child_memory()) {
             LOG_INFO_V0("  Tensor %d: child memory, pass-through (0x%" PRIx64 ")", i, t.buffer.addr);
-            device_args.add_tensor(t);
+            out->add_tensor(t);
             continue;
         }
 
@@ -419,7 +431,7 @@ extern "C" int bind_callable_to_runtime_impl(
         void *dev_ptr = runtime->host_api.device_malloc(size);
         if (dev_ptr == nullptr) {
             LOG_ERROR("Failed to allocate device memory for tensor %d", i);
-            return -1;
+            return false;
         }
 
         // Pure write-only OUTPUT buffers carry no meaningful host content, so
@@ -438,7 +450,7 @@ extern "C" int bind_callable_to_runtime_impl(
         if (rc != 0) {
             LOG_ERROR("Failed to stage tensor %d to device", i);
             runtime->host_api.device_free(dev_ptr);
-            return -1;
+            return false;
         }
         // Read-only INPUT tensors are never written by the kernel, so there is
         // no point copying them back D2H at the end. Index the signature
@@ -451,131 +463,219 @@ extern "C" int bind_callable_to_runtime_impl(
         LOG_INFO_V0("  Tensor %d: %zu bytes at %p", i, size, dev_ptr);
 
         t.buffer.addr = reinterpret_cast<uint64_t>(dev_ptr);
-        device_args.add_tensor(t);
+        out->add_tensor(t);
     }
     for (int i = 0; i < scalar_count; i++) {
-        device_args.add_scalar(orch_args->scalar(i));
+        out->add_scalar(orch_args->scalar(i));
     }
     int64_t t_args_end = _now_ms();
+    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
+    return true;
+}
 
-    // Read orchestrator-to-scheduler transition flag from environment
-    {
-        const char *env_val = std::getenv("PTO2_ORCH_TO_SCHED");
-        if (env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T')) {
-            runtime->dev.orch_to_sched = true;
-        }
-        LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
-    }
-
-    // Read serial orchestrator -> scheduler start gate from environment.
-    {
-        const char *env_val = std::getenv("PTO2_SERIAL_ORCH_SCHED");
-        runtime->dev.serial_orch_sched = env_val && (env_val[0] == '1' || env_val[0] == 't' || env_val[0] == 'T');
-        LOG_INFO_V0(
-            "Serial orchestrator-to-scheduler start gate: %s", runtime->dev.serial_orch_sched ? "enabled" : "disabled"
-        );
-    }
-
-    // Lay out the per-Worker static device arena. GM heap, PTO2 shared memory,
-    // and the prebuilt runtime arena all live in a single backing allocation;
-    // setup_static_arena reserves the three regions and commits in one shot.
-    // Owned by DeviceRunner across runs — do NOT record in tensor_pairs_; the
-    // free is deferred to DeviceRunner::finalize(). The runtime-arena size is
-    // determined by replaying the reserve sequence on a host-side arena.
-    uint64_t total_heap_size = 0;
-    for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
-        if (eff_heap_sizes[r] > std::numeric_limits<uint64_t>::max() - total_heap_size) {
-            LOG_ERROR("Total ring heap size overflows uint64_t");
-            return -1;
-        }
-        total_heap_size += eff_heap_sizes[r];
-    }
-    uint64_t sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(eff_task_window_sizes);
+// per-run: latch the env-driven orch/sched hand-off flags onto the runtime (a
+// value starting with '1', 't' or 'T' enables). PTO2_ORCH_TO_SCHED only switches
+// its flag on, never clearing one already set; PTO2_SERIAL_ORCH_SCHED overwrites.
+static void apply_orch_sched_env_flags(Runtime *runtime) {
+    const auto env_truthy = [](const char *v) { return v && (v[0] == '1' || v[0] == 't' || v[0] == 'T'); };
+    if (env_truthy(std::getenv("PTO2_ORCH_TO_SCHED"))) {
+        runtime->dev.orch_to_sched = true;
+    }
+    LOG_INFO_V0("Orchestrator-to-scheduler transition: %s", runtime->dev.orch_to_sched ? "enabled" : "disabled");
+
+    runtime->dev.serial_orch_sched = env_truthy(std::getenv("PTO2_SERIAL_ORCH_SCHED"));
+    LOG_INFO_V0(
+        "Serial orchestrator-to-scheduler start gate: %s", runtime->dev.serial_orch_sched ? "enabled" : "disabled"
+    );
+}
 
-    int64_t t_prebuilt_start = _now_ms();
-    DeviceArena host_arena;  // libc malloc backend by default
+// per-(cid,config): reserve and acquire the static device pools. GM heap, PTO2
+// shared memory, and the prebuilt runtime arena all live in one backing
+// allocation that setup_static_arena reserves and commits in one shot. The
+// runtime-arena size is recovered by replaying the (pure, cheap) reserve
+// sequence on a throwaway host arena. Idempotent across runs: DeviceRunner owns
+// the pools and frees them in finalize() — do NOT record them in tensor_pairs_.
+static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizing, StaticArenaPtrs *out) {
+    DeviceArena sizing_arena;  // discarded; only its computed arena_size is read
     PTO2RuntimeArenaLayout layout =
-        runtime_reserve_layout(host_arena, eff_task_window_sizes, eff_heap_sizes, eff_dep_pool_capacities);
-    layout.scheduler_timeout_ms = resolve_scheduler_timeout_ms();
-    if (host_arena.commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
-        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
-        return -1;
-    }
+        runtime_reserve_layout(sizing_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
 
     int64_t t_setup_start = _now_ms();
-    if (runtime->host_api.setup_static_arena(total_heap_size, sm_size, layout.arena_size) != 0) {
+    if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.arena_size) != 0) {
         LOG_ERROR("Failed to setup pooled static arena");
-        return -1;
+        return false;
     }
     int64_t t_setup_end = _now_ms();
 
     int64_t t_heap_start = _now_ms();
-    void *gm_heap = runtime->host_api.acquire_pooled_gm_heap();
+    out->gm_heap = runtime->host_api.acquire_pooled_gm_heap();
     int64_t t_heap_end = _now_ms();
-    if (gm_heap == nullptr) {
+    if (out->gm_heap == nullptr) {
         LOG_ERROR("Failed to acquire pooled GM heap");
-        return -1;
+        return false;
     }
-    runtime->set_gm_heap(gm_heap);
+    runtime->set_gm_heap(out->gm_heap);
 
     int64_t t_sm_start = _now_ms();
-    void *sm_ptr = runtime->host_api.acquire_pooled_gm_sm();
+    out->gm_sm = runtime->host_api.acquire_pooled_gm_sm();
     int64_t t_sm_end = _now_ms();
-    if (sm_ptr == nullptr) {
+    if (out->gm_sm == nullptr) {
         LOG_ERROR("Failed to acquire pooled PTO2 shared memory");
-        return -1;
+        return false;
     }
-    runtime->set_gm_sm_ptr(sm_ptr);
+    runtime->set_gm_sm_ptr(out->gm_sm);
 
-    void *runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
-    if (runtime_arena_dev == nullptr) {
+    out->runtime_arena_dev = runtime->host_api.acquire_pooled_runtime_arena();
+    if (out->runtime_arena_dev == nullptr) {
         LOG_ERROR("Failed to acquire pooled runtime arena");
-        return -1;
+        return false;
     }
 
-    // Set up device orchestration state
-    runtime->set_orch_args(device_args);
+    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
+    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
+    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
+    return true;
+}
+
+// per-(cid,config): build the prebuilt runtime-arena image on host. Pure host
+// work — touches no device memory, only `host_arena` (owned by the caller so
+// the image outlives this call until the upload) and the device *addresses* in
+// `ptrs` (stored, never dereferenced).
+//
+// We pre-compute every byte the AICPU's runtime arena would otherwise have to
+// write at boot: layout offsets, sub-structure init data, and pointers back to
+// the SM / GM heap. AICPU boot then becomes attach + wire (cheap pointer fixup)
+// + sm_handle->init (SM reset) + a handful of device-only field fixups.
+//
+// The layout is stashed inside the image (rt->prebuilt_layout) so the AICPU can
+// recover every arena-internal offset after the rtMemcpy. Returns the layout
+// via `out_layout`; the runtime-arena device base travels separately on the
+// host Runtime (bind_launch_state), since the AICPU needs that pointer *before*
+// it can dereference the image.
+static bool build_runtime_image(
+    const ArenaSizingConfig &sizing, const StaticArenaPtrs &ptrs, DeviceArena *host_arena,
+    PTO2RuntimeArenaLayout *out_layout
+) {
+    PTO2RuntimeArenaLayout layout =
+        runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
+    layout.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
+    if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
+        LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
+        return false;
+    }
 
-    // -------------------------------------------------------------------------
-    // Build the prebuilt runtime-arena image on host.
-    //
-    // We pre-compute every byte the AICPU's runtime arena would otherwise have
-    // to write at boot: layout offsets, sub-structure init data, and pointers
-    // back to the SM / GM heap. Then we rtMemcpy the image into the pooled
-    // runtime-arena region that DeviceRunner keeps alive across runs. AICPU
-    // boot becomes attach + wire (cheap pointer fixup) + sm_handle->init (SM
-    // reset) + a handful of device-only field fixups.
-    // -------------------------------------------------------------------------
-    PTO2Runtime *rt =
-        runtime_init_data_from_layout(host_arena, layout, PTO2_MODE_EXECUTE, sm_ptr, sm_size, gm_heap, eff_heap_sizes);
+    PTO2Runtime *rt = runtime_init_data_from_layout(
+        *host_arena, layout, PTO2_MODE_EXECUTE, ptrs.gm_sm, sizing.sm_size, ptrs.gm_heap, sizing.heap_sizes
+    );
     if (rt == nullptr) {
         LOG_ERROR("runtime_init_data_from_layout failed");
-        return -1;
+        return false;
     }
-    runtime_wire_arena_pointers(host_arena, layout, rt);
-
-    // Stash the layout inside the PTO2Runtime image so the AICPU can recover
-    // every arena-internal offset after rtMemcpy. The runtime arena's device
-    // base does NOT travel in this image — it's on the host Runtime
-    // (set_prebuilt_arena below), since the AICPU needs that pointer
-    // *before* it can dereference the image.
+    runtime_wire_arena_pointers(*host_arena, layout, rt);
     rt->prebuilt_layout = layout;
 
-    int rc_upload = runtime->host_api.copy_to_device(runtime_arena_dev, host_arena.base(), layout.arena_size);
+    *out_layout = layout;  // only written on success; untouched on the failure returns above
+    return true;
+}
+
+// per-run: publish the launch state. Copy the staged args onto the runtime,
+// rtMemcpy the host image into the pooled runtime-arena region, and record the
+// device base + runtime offset the AICPU reads before dereferencing the image.
+static bool bind_launch_state(
+    Runtime *runtime, const StaticArenaPtrs &ptrs, const DeviceArena &host_arena, const PTO2RuntimeArenaLayout &layout,
+    const ChipStorageTaskArgs &device_args
+) {
+    runtime->set_orch_args(device_args);  // published before the upload; not rolled back if the copy fails
+
+    int rc_upload = runtime->host_api.copy_to_device(ptrs.runtime_arena_dev, host_arena.base(), layout.arena_size);
     if (rc_upload != 0) {
         LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
+        return false;
+    }
+    runtime->set_prebuilt_arena(ptrs.runtime_arena_dev, layout.off_runtime);
+    return true;
+}
+
+/**
+ * Per-run binding: build device-side argument storage (tensor copy-out, GM
+ * heap, PTO2 shared memory) and publish it to the runtime. Assumes the
+ * callable-side state (kernel binaries, orch SO bytes, func/config names)
+ * is already populated by register_callable_impl.
+ *
+ * Splitting this from register_callable_impl matches the per-callable_id
+ * design: register/run invokes this every call, while the prep
+ * half runs only once per callable_id.
+ *
+ * Orchestrates the three lifecycles behind the bind: per-config arena sizing
+ * (resolve_arena_sizing) + static pools (ensure_static_arenas) + host image
+ * (build_runtime_image), and per-run args (stage_device_args) + launch publish
+ * (bind_launch_state).
+ *
+ * @param runtime    Pointer to pre-constructed Runtime (host_api populated)
+ * @param orch_args  Separated tensor/scalar arguments for this run
+ * @return 0 on success, -1 on failure
+ */
+extern "C" int bind_callable_to_runtime_impl(
+    Runtime *runtime, const ChipStorageTaskArgs *orch_args, void *host_orch_func_ptr, const ArgDirection *signature,
+    int sig_count, const uint64_t *ring_task_window, const uint64_t *ring_heap, const uint64_t *ring_dep_pool
+) {
+    if (runtime == nullptr) {
+        LOG_ERROR("Runtime pointer is null");
+        return -1;
+    }
+    if (orch_args == nullptr) {
+        LOG_ERROR("orch_args pointer is null");
+        return -1;
+    }
+    // trb runs orchestration on the device — there is no host-side orch
+    // function pointer to invoke. The c_api signature accepts one for
+    // symmetry with hbg; assert the trb-side invariant here.
+    if (host_orch_func_ptr != nullptr) {
+        LOG_ERROR("bind_callable_to_runtime_impl: trb does not accept a host_orch_func_ptr");
+        return -1;
+    }
+
+    int tensor_count = orch_args->tensor_count();
+    int scalar_count = orch_args->scalar_count();
+    LOG_INFO_V0("RT2 bind: %d tensors + %d scalars, device orchestration mode", tensor_count, scalar_count);
+
+    int64_t t_total_start = _now_ms();
+
+    ArenaSizingConfig sizing;
+    if (!resolve_arena_sizing(ring_task_window, ring_heap, ring_dep_pool, &sizing)) {
+        return -1;
+    }
+
+    ChipStorageTaskArgs device_args;
+    if (!stage_device_args(runtime, orch_args, signature, sig_count, &device_args)) {
         return -1;
     }
-    runtime->set_prebuilt_arena(runtime_arena_dev, layout.off_runtime);
+
+    apply_orch_sched_env_flags(runtime);
+
+    int64_t t_prebuilt_start = _now_ms();
+    {
+        STRACE("simpler_run.bind.prebuilt");
+        StaticArenaPtrs ptrs;
+        if (!ensure_static_arenas(runtime, sizing, &ptrs)) {
+            return -1;
+        }
+
+        DeviceArena host_arena;  // libc malloc backend; owns the image until upload
+        PTO2RuntimeArenaLayout layout;
+        if (!build_runtime_image(sizing, ptrs, &host_arena, &layout)) {
+            return -1;
+        }
+
+        if (!bind_launch_state(runtime, ptrs, host_arena, layout, device_args)) {
+            return -1;
+        }
+    }
     int64_t t_prebuilt_end = _now_ms();
 
     LOG_INFO_V0("Device orchestration ready: %d tensors + %d scalars", tensor_count, scalar_count);
 
     int64_t t_total_end = _now_ms();
-    LOG_INFO_V0("TIMING: args_malloc_copy = %" PRId64 "ms", t_args_end - t_args_start);
-    LOG_INFO_V0("TIMING: static_arena_setup = %" PRId64 "ms", t_setup_end - t_setup_start);
-    LOG_INFO_V0("TIMING: gm_heap_acquire = %" PRId64 "ms", t_heap_end - t_heap_start);
-    LOG_INFO_V0("TIMING: shared_mem_acquire = %" PRId64 "ms", t_sm_end - t_sm_start);
     LOG_INFO_V0("TIMING: prebuilt_runtime_arena = %" PRId64 "ms", t_prebuilt_end - t_prebuilt_start);
     LOG_INFO_V0("TIMING: total_init_runtime_impl = %" PRId64 "ms", t_total_end - t_total_start);
 
@@ -617,7 +717,7 @@ extern "C" int validate_runtime_impl(Runtime *runtime) {
     PTO2SharedMemoryHeader host_header;
     memset(&host_header, 0, sizeof(host_header));
 
-    runtime_status = read_runtime_status(runtime, &host_header);
+    runtime_status = pto2_read_runtime_status(runtime, &host_header);
     if (runtime_status != 0) {
         int32_t orch_error_code = host_header.orch_error_code.load(std::memory_order_relaxed);
         int32_t sched_error_code = host_header.sched_error_code.load(std::memory_order_relaxed);
@@ -712,7 +812,8 @@ extern "C" int validate_runtime_impl(Runtime *runtime) {
     // Clear the per-run dispatch-table entries staged by register_callable_impl.
     // The underlying chip-callable device buffer is pool-managed by
     // DeviceRunner (keyed by content hash) and bulk-freed in
-    // DeviceRunner::finalize().
+    // DeviceRunner::finalize(); re-running the same callable repeatedly
+    // should not re-upload.
     int kernel_count = runtime->get_registered_kernel_count();
     for (int i = 0; i < kernel_count; i++) {
         int func_id = runtime->get_registered_kernel_func_id(i);