Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -503,12 +503,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
// Wire every arena-internal pointer field (host wrote host-mirror
// addresses; we overwrite them with device addresses).
runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes);
sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.sizing.task_window_sizes);
for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
LOG_INFO_V0(
"Thread %d: Ring %d sizes: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%d", thread_idx, r,
rt->prebuilt_layout.task_window_sizes[r], rt->prebuilt_layout.heap_sizes[r],
rt->prebuilt_layout.dep_pool_capacities[r]
rt->prebuilt_layout.sizing.task_window_sizes[r], rt->prebuilt_layout.sizing.heap_sizes[r],
rt->prebuilt_layout.sizing.dep_pool_capacities[r]
);
}
}
Expand All @@ -522,7 +522,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
AicpuPhaseScope sm_reset(AicpuPhase::SmReset);
memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
if (!rt->sm_handle->init_per_ring(
sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes, rt->prebuilt_layout.heap_sizes
sm_ptr, sm_size, rt->prebuilt_layout.sizing.task_window_sizes,
rt->prebuilt_layout.sizing.heap_sizes
)) {
LOG_ERROR("Thread %d: sm_handle->init_per_ring failed", thread_idx);
rt = nullptr;
Expand Down Expand Up @@ -553,7 +554,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
auto &alloc = orch.rings[r].task_allocator;
scope_stats_set_ring_capacity(
r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacities[r]
r, alloc.window_size(), alloc.heap_capacity(),
rt->prebuilt_layout.sizing.dep_pool_capacities[r]
);
}
scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizi
runtime_reserve_layout(sizing_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);

int64_t t_setup_start = _now_ms();
if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.arena_size) != 0) {
if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.offsets.arena_size) != 0) {
LOG_ERROR("Failed to setup pooled static arena");
return false;
}
Expand Down Expand Up @@ -558,7 +558,7 @@ static bool build_runtime_image(
) {
PTO2RuntimeArenaLayout layout =
runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
layout.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
return false;
Expand Down Expand Up @@ -587,12 +587,13 @@ static bool bind_launch_state(
) {
runtime->set_orch_args(device_args);

int rc_upload = runtime->host_api.copy_to_device(ptrs.runtime_arena_dev, host_arena.base(), layout.arena_size);
int rc_upload =
runtime->host_api.copy_to_device(ptrs.runtime_arena_dev, host_arena.base(), layout.offsets.arena_size);
if (rc_upload != 0) {
LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
return false;
}
runtime->set_prebuilt_arena(ptrs.runtime_arena_dev, layout.off_runtime);
runtime->set_prebuilt_arena(ptrs.runtime_arena_dev, layout.offsets.off_runtime);
return true;
}

Expand Down
44 changes: 32 additions & 12 deletions src/a2a3/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,30 +96,50 @@ struct PTO2RuntimeOps {
};

/**
* Layout descriptor for the prebuilt runtime arena. Holds all sub-region
* offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
* AICore mailbox) plus the layout-defining capacities. Produced once on the
* host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
* and runtime_wire_arena_pointers.
* Sizing half of the runtime-arena layout: the capacities that *define* the
* layout (the input to runtime_reserve_layout) plus the scheduler timeout.
* Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct
* ring/heap/dep-pool capacities and the scheduler no-progress budget.
*/
struct PTO2RuntimeArenaLayout {
struct ArenaSizingKey {
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
int32_t scheduler_timeout_ms{0};
};

/**
* Offset half of the runtime-arena layout: every sub-region offset
* (sm_handle wrapper / orchestrator / scheduler / runtime header / AICore
* mailbox) plus the committed arena byte size. The *output* of
* runtime_reserve_layout; consumed by runtime_init_data_from_layout and
* runtime_wire_arena_pointers (the AICPU re-wires arena-internal pointers
* from these after rtMemcpy).
*/
struct ArenaOffsets {
size_t off_sm_handle{0};
PTO2OrchestratorLayout orch;
PTO2SchedulerLayout sched;
size_t off_runtime{0};
size_t off_mailbox{0};

// Cached parameters (re-used by init_data + wire stages).
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
int32_t scheduler_timeout_ms{0};

// Total arena byte size post-commit, recorded from arena.total_size() by
// runtime_reserve_layout. Used by host to size the prebuilt image buffer
// and as the rtMemcpy length.
size_t arena_size{0};
};

/**
* Layout descriptor for the prebuilt runtime arena. Two named halves with
* distinct lifetimes/semantics: `sizing` is the layout-defining input
* (capacities + scheduler timeout), `offsets` is the computed sub-region
* offsets + arena size. Produced once on the host by runtime_reserve_layout();
* build_runtime_image then assigns `sizing.scheduler_timeout_ms` on the
* returned layout. Consumed by runtime_init_data_from_layout and
* runtime_wire_arena_pointers.
*/
struct PTO2RuntimeArenaLayout {
ArenaSizingKey sizing;
ArenaOffsets offsets;
};

/**
* PTO Runtime2 context
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -885,9 +885,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();
uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
if (rt_ != nullptr && rt_->prebuilt_layout.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles =
static_cast<uint64_t>(rt_->prebuilt_layout.scheduler_timeout_ms) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles = static_cast<uint64_t>(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) *
(PLATFORM_PROF_SYS_CNT_FREQ / 1000);
}

while (true) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -399,22 +399,22 @@ PTO2RuntimeArenaLayout runtime_reserve_layout(
PTO2RuntimeArenaLayout layout{};

for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
layout.task_window_sizes[r] = task_window_sizes[r];
layout.heap_sizes[r] = heap_sizes[r];
layout.dep_pool_capacities[r] = dep_pool_capacities[r];
layout.sizing.task_window_sizes[r] = task_window_sizes[r];
layout.sizing.heap_sizes[r] = heap_sizes[r];
layout.sizing.dep_pool_capacities[r] = dep_pool_capacities[r];
}

layout.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
layout.offsets.off_sm_handle = arena.reserve(sizeof(PTO2SharedMemoryHandle), alignof(PTO2SharedMemoryHandle));
int32_t task_window_sizes_i32[PTO2_MAX_RING_DEPTH];
for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
task_window_sizes_i32[r] = static_cast<int32_t>(task_window_sizes[r]);
}
Comment thread
ChaoWao marked this conversation as resolved.
layout.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities);
layout.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities);
layout.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
layout.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));
layout.offsets.orch = PTO2OrchestratorState::reserve_layout(arena, task_window_sizes_i32, dep_pool_capacities);
layout.offsets.sched = PTO2SchedulerState::reserve_layout(arena, dep_pool_capacities);
layout.offsets.off_runtime = arena.reserve(sizeof(PTO2Runtime), PTO2_ALIGN_SIZE);
layout.offsets.off_mailbox = arena.reserve(sizeof(AICoreCompletionMailbox), alignof(AICoreCompletionMailbox));

layout.arena_size = arena.total_size();
layout.offsets.arena_size = arena.total_size();
return layout;
}

Expand All @@ -433,10 +433,10 @@ PTO2Runtime *runtime_init_data_from_layout(
DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2RuntimeMode mode, void *sm_dev_base,
uint64_t /*sm_size*/, void *gm_heap_dev_base, const uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]
) {
PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.off_runtime));
PTO2Runtime *rt = static_cast<PTO2Runtime *>(arena.region_ptr(layout.offsets.off_runtime));
memset(rt, 0, sizeof(*rt));

auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
auto *sm_wrap = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.offsets.off_sm_handle));
memset(sm_wrap, 0, sizeof(*sm_wrap));

// rt->ops is filled by the AICPU at boot.
Expand All @@ -451,25 +451,25 @@ PTO2Runtime *runtime_init_data_from_layout(
rt->total_cycles = 0;

if (!rt->orchestrator.init_data_from_layout(
layout.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.task_window_sizes
layout.offsets.orch, arena, sm_dev_base, gm_heap_dev_base, heap_sizes, layout.sizing.task_window_sizes
)) {
return nullptr;
}
if (!rt->scheduler.init_data_from_layout(layout.sched, arena, sm_dev_base)) {
if (!rt->scheduler.init_data_from_layout(layout.offsets.sched, arena, sm_dev_base)) {
return nullptr;
}

auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
auto *mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.offsets.off_mailbox));
memset(mailbox, 0, sizeof(*mailbox));

return rt;
}

void runtime_wire_arena_pointers(DeviceArena &arena, const PTO2RuntimeArenaLayout &layout, PTO2Runtime *rt) {
rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.off_sm_handle));
rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.off_mailbox));
rt->orchestrator.wire_arena_pointers(layout.orch, arena, &rt->scheduler);
rt->scheduler.wire_arena_pointers(layout.sched, arena);
rt->sm_handle = static_cast<PTO2SharedMemoryHandle *>(arena.region_ptr(layout.offsets.off_sm_handle));
rt->aicore_mailbox = static_cast<AICoreCompletionMailbox *>(arena.region_ptr(layout.offsets.off_mailbox));
rt->orchestrator.wire_arena_pointers(layout.offsets.orch, arena, &rt->scheduler);
rt->scheduler.wire_arena_pointers(layout.offsets.sched, arena);
}

void runtime_destroy(PTO2Runtime *rt, DeviceArena & /*arena*/) {
Expand Down
12 changes: 7 additions & 5 deletions src/a5/runtime/tensormap_and_ringbuffer/aicpu/aicpu_executor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -500,12 +500,12 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
// Wire every arena-internal pointer field (host wrote host-mirror
// addresses; we overwrite them with device addresses).
runtime_wire_arena_pointers(runtime_arena_, rt->prebuilt_layout, rt);
sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.task_window_sizes);
sm_size = PTO2SharedMemoryHandle::calculate_size_per_ring(rt->prebuilt_layout.sizing.task_window_sizes);
for (int r = 0; r < PTO2_MAX_RING_DEPTH; ++r) {
LOG_INFO_V0(
"Thread %d: Ring %d sizes: task_window=%" PRIu64 " heap=%" PRIu64 " dep_pool=%d", thread_idx, r,
rt->prebuilt_layout.task_window_sizes[r], rt->prebuilt_layout.heap_sizes[r],
rt->prebuilt_layout.dep_pool_capacities[r]
rt->prebuilt_layout.sizing.task_window_sizes[r], rt->prebuilt_layout.sizing.heap_sizes[r],
rt->prebuilt_layout.sizing.dep_pool_capacities[r]
);
}
}
Expand All @@ -519,7 +519,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
AicpuPhaseScope sm_reset(AicpuPhase::SmReset);
memset(rt->sm_handle, 0, sizeof(*rt->sm_handle));
if (!rt->sm_handle->init_per_ring(
sm_ptr, sm_size, rt->prebuilt_layout.task_window_sizes, rt->prebuilt_layout.heap_sizes
sm_ptr, sm_size, rt->prebuilt_layout.sizing.task_window_sizes,
rt->prebuilt_layout.sizing.heap_sizes
)) {
LOG_ERROR("Thread %d: sm_handle->init_per_ring failed", thread_idx);
rt = nullptr;
Expand Down Expand Up @@ -551,7 +552,8 @@ int32_t AicpuExecutor::run(Runtime *runtime) {
for (int r = 0; r < PTO2_MAX_RING_DEPTH; r++) {
auto &alloc = orch.rings[r].task_allocator;
scope_stats_set_ring_capacity(
r, alloc.window_size(), alloc.heap_capacity(), rt->prebuilt_layout.dep_pool_capacities[r]
r, alloc.window_size(), alloc.heap_capacity(),
rt->prebuilt_layout.sizing.dep_pool_capacities[r]
);
}
scope_stats_set_tensormap_capacity(orch.tensor_map.pool_capacity());
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -501,7 +501,7 @@ static bool ensure_static_arenas(Runtime *runtime, const ArenaSizingConfig &sizi
runtime_reserve_layout(sizing_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);

int64_t t_setup_start = _now_ms();
if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.arena_size) != 0) {
if (runtime->host_api.setup_static_arena(sizing.total_heap, sizing.sm_size, layout.offsets.arena_size) != 0) {
LOG_ERROR("Failed to setup pooled static arena");
return false;
}
Expand Down Expand Up @@ -558,7 +558,7 @@ static bool build_runtime_image(
) {
PTO2RuntimeArenaLayout layout =
runtime_reserve_layout(*host_arena, sizing.task_window_sizes, sizing.heap_sizes, sizing.dep_pool_capacities);
layout.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
layout.sizing.scheduler_timeout_ms = sizing.scheduler_timeout_ms;
if (host_arena->commit(DeviceArena::kDefaultBaseAlign) == nullptr) {
LOG_ERROR("Failed to commit host arena for prebuilt runtime image");
return false;
Expand Down Expand Up @@ -587,12 +587,13 @@ static bool bind_launch_state(
) {
runtime->set_orch_args(device_args);

int rc_upload = runtime->host_api.copy_to_device(ptrs.runtime_arena_dev, host_arena.base(), layout.arena_size);
int rc_upload =
runtime->host_api.copy_to_device(ptrs.runtime_arena_dev, host_arena.base(), layout.offsets.arena_size);
if (rc_upload != 0) {
LOG_ERROR("Failed to rtMemcpy prebuilt runtime arena to device (rc=%d)", rc_upload);
return false;
}
runtime->set_prebuilt_arena(ptrs.runtime_arena_dev, layout.off_runtime);
runtime->set_prebuilt_arena(ptrs.runtime_arena_dev, layout.offsets.off_runtime);
return true;
}

Expand Down
44 changes: 32 additions & 12 deletions src/a5/runtime/tensormap_and_ringbuffer/runtime/pto_runtime2.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,30 +97,50 @@ struct PTO2RuntimeOps {
};

/**
* Layout descriptor for the prebuilt runtime arena. Holds all sub-region
* offsets (orchestrator / scheduler / sm_handle wrapper / runtime header /
* AICore mailbox) plus the layout-defining capacities. Produced once on the
* host by runtime_reserve_layout(); consumed by runtime_init_data_from_layout
* and runtime_wire_arena_pointers.
* Sizing half of the runtime-arena layout: the capacities that *define* the
* layout (the input to runtime_reserve_layout) plus the scheduler timeout.
* Stable per (callable_id, ring config); re-read at AICPU boot to reconstruct
* ring/heap/dep-pool capacities and the scheduler no-progress budget.
*/
struct PTO2RuntimeArenaLayout {
struct ArenaSizingKey {
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
int32_t scheduler_timeout_ms{0};
};

/**
* Offset half of the runtime-arena layout: every sub-region offset
* (sm_handle wrapper / orchestrator / scheduler / runtime header / AICore
* mailbox) plus the committed arena byte size. The *output* of
* runtime_reserve_layout; consumed by runtime_init_data_from_layout and
* runtime_wire_arena_pointers (the AICPU re-wires arena-internal pointers
* from these after rtMemcpy).
*/
struct ArenaOffsets {
size_t off_sm_handle{0};
PTO2OrchestratorLayout orch;
PTO2SchedulerLayout sched;
size_t off_runtime{0};
size_t off_mailbox{0};

// Cached parameters (re-used by init_data + wire stages).
uint64_t task_window_sizes[PTO2_MAX_RING_DEPTH]{};
uint64_t heap_sizes[PTO2_MAX_RING_DEPTH]{};
int32_t dep_pool_capacities[PTO2_MAX_RING_DEPTH]{};
int32_t scheduler_timeout_ms{0};

// Total arena byte size post-commit. Used by host to size the prebuilt
// image buffer and as the rtMemcpy length.
size_t arena_size{0};
};

/**
* Layout descriptor for the prebuilt runtime arena. Two named halves with
* distinct lifetimes/semantics: `sizing` is the layout-defining input
* (capacities + scheduler timeout), `offsets` is the computed sub-region
* offsets + arena size. Produced once on the host by runtime_reserve_layout();
* build_runtime_image then assigns `sizing.scheduler_timeout_ms` on the
* returned layout. Consumed by runtime_init_data_from_layout and
* runtime_wire_arena_pointers.
*/
struct PTO2RuntimeArenaLayout {
ArenaSizingKey sizing;
ArenaOffsets offsets;
};

/**
* PTO Runtime2 context
*
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -603,9 +603,9 @@ int32_t SchedulerContext::resolve_and_dispatch(Runtime *runtime, int32_t thread_
// an undefined value.
uint64_t last_progress_ts = get_sys_cnt_aicpu();
uint64_t scheduler_timeout_cycles = SCHEDULER_TIMEOUT_CYCLES;
if (rt_ != nullptr && rt_->prebuilt_layout.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles =
static_cast<uint64_t>(rt_->prebuilt_layout.scheduler_timeout_ms) * (PLATFORM_PROF_SYS_CNT_FREQ / 1000);
if (rt_ != nullptr && rt_->prebuilt_layout.sizing.scheduler_timeout_ms > 0) {
scheduler_timeout_cycles = static_cast<uint64_t>(rt_->prebuilt_layout.sizing.scheduler_timeout_ms) *
(PLATFORM_PROF_SYS_CNT_FREQ / 1000);
}

while (true) {
Expand Down
Loading
Loading